diff --git a/README.md b/README.md index f15386a..4144d02 100644 --- a/README.md +++ b/README.md @@ -410,17 +410,25 @@ brightdata scraper create https://example.com/product/1 \ ### `scraper run` -Run a scraper (built with `scraper create` or in the web UI) against a URL and get the extracted data. +Run a scraper (built with `scraper create` or in the web UI) against one or more URLs and get the extracted data. ```bash -brightdata scraper run <collector_id> <url> [options] +brightdata scraper run <collector_id> [url] [options] ``` +Provide URLs in exactly one of three ways: + +- Positional `<url>` — single URL (legacy form, unchanged). +- `--urls <list>` — comma-separated list. +- `--input-file <path>` — file with one URL per line, **or** a JSON array of URL strings, **or** a JSON array of `{"url": "..."}` objects. + | Flag | Description | |---|---| -| `--sync` | Use the synchronous `/dca/crawl` endpoint (server-side cap of 25–50s) | +| `--urls <list>` | Comma-separated list of URLs (multi-URL batch path) | +| `--input-file <path>` | File with URLs (txt one-per-line, or JSON array) | +| `--sync` | Use the synchronous `/dca/crawl` endpoint (single-URL only, server-side cap of 25–50s) | | `--sync-timeout <seconds>` | Sync-mode server timeout, `25`–`50` (default: `50`) | -| `--timeout <seconds>` | Async polling timeout (default: `600`) | +| `--timeout <seconds>` | Polling timeout (default: `600` single-URL, `3600` batch) | | `--name <name>` | Human-readable job name | | `--version <version>` | Scraper version (e.g. `dev`) | | `-o, --output <path>` | Write output to file | @@ -428,9 +436,12 @@ brightdata scraper run <collector_id> <url> [options] | `--timing` | Show request timing | | `-k, --api-key <key>` | Override API key | -By default the command uses the async flow: it triggers `/dca/trigger_immediate`, gets back a `response_id`, and polls `/dca/get_result` until the data is ready. Use `--sync` for one-shot scrapes that you expect to finish within ~50 seconds; on a sync server-side timeout the command exits with the `response_id` so you can re-run without `--sync` to poll for the result. 
+**Routing** + +- **Single URL** (positional, or one entry via `--urls` / `--input-file`) → async flow: `/dca/trigger_immediate` → poll `/dca/get_result`. Use `--sync` for `/dca/crawl` (one-shot, 25–50s). +- **Multiple URLs** (`--urls` / `--input-file` with 2+ entries) → single POST to `/dca/trigger` with an array body, one `collection_id`, polled via `/dca/dataset`. This mirrors the canonical batch shape used by the reference SDKs ([`triggerWithUrls`](https://github.com/brightdata/bright-data-scraper-studio-nodejs-project) / [`trigger_with_urls`](https://github.com/brightdata/bright-data-scraper-studio-python-project)). `--sync` is incompatible with multi-URL — `/dca/crawl` accepts only a single URL. -If a URL expands to more pages than the realtime job limit allows (e.g. paginated listings, infinite scroll), the CLI automatically falls back to the batch endpoint (`/dca/trigger` → poll `/dca/dataset`). The fallback prints a one-line notice and adjusts the poll interval and timeout for the longer batch wait. No flag required. +If a single URL expands to more pages than the realtime job limit allows (paginated listings, infinite scroll), the CLI automatically falls back to the batch endpoint and prints a one-line notice. No flag required. 
**Examples** @@ -448,6 +459,18 @@ brightdata scraper run c_mp3tuab31lswoxvpws https://example.com/p/1 --sync # Sync with a shorter server timeout and a job name brightdata scraper run c_mp3tuab31lswoxvpws https://example.com/p/1 \ --sync --sync-timeout 30 --name first-test + +# Multi-URL batch — one API call, one snapshot, one merged result array +brightdata scraper run c_mp3tuab31lswoxvpws \ + --urls "https://example.com/p/1,https://example.com/p/2,https://example.com/p/3" \ + --pretty -o products.json + +# Multi-URL from a file (one URL per line; # comments and blank lines skipped) +brightdata scraper run c_mp3tuab31lswoxvpws --input-file urls.txt -o products.json + +# Multi-URL from a JSON array +echo '["https://example.com/p/1","https://example.com/p/2"]' > urls.json +brightdata scraper run c_mp3tuab31lswoxvpws --input-file urls.json ``` --- diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index b07b14f..db93225 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -1,3 +1,6 @@ +import {writeFileSync, mkdtempSync, rmSync} from 'node:fs'; +import {tmpdir} from 'node:os'; +import {join} from 'node:path'; import {describe, it, expect, beforeEach, afterEach, vi} from 'vitest'; import {Command} from 'commander'; import type {Scraper_create_opts} from '../../types/scraper'; @@ -68,6 +71,10 @@ import { AI_TRIGGER_DEFAULT_RETRIES, AI_TRIGGER_RETRY_BASE_MS, AI_TRIGGER_RETRY_MAX_MS, + parse_urls_arg, + read_input_file, + resolve_run_inputs, + is_valid_url, } from '../../commands/scraper'; describe('commands/scraper', ()=>{ @@ -1163,4 +1170,285 @@ describe('commands/scraper', ()=>{ error.mockRestore(); }); }); + + describe('is_valid_url', ()=>{ + it('accepts http/https URLs', ()=>{ + expect(is_valid_url('https://example.com')).toBe(true); + expect(is_valid_url('http://example.com/a/b?c=1')).toBe(true); + }); + + it('rejects garbage', ()=>{ + expect(is_valid_url('not a 
url')).toBe(false); + expect(is_valid_url('')).toBe(false); + expect(is_valid_url(' ')).toBe(false); + }); + }); + + describe('parse_urls_arg', ()=>{ + it('splits, trims, and drops empties', ()=>{ + expect(parse_urls_arg( + ' https://a.com , https://b.com ,, https://c.com')) + .toEqual(['https://a.com', 'https://b.com', 'https://c.com']); + }); + + it('returns single URL for a non-comma input', ()=>{ + expect(parse_urls_arg('https://only.example.com')) + .toEqual(['https://only.example.com']); + }); + + it('returns empty array for blank input', ()=>{ + expect(parse_urls_arg('')).toEqual([]); + expect(parse_urls_arg(' , , ')).toEqual([]); + }); + }); + + describe('read_input_file', ()=>{ + let tmp_dir: string; + + beforeEach(()=>{ + tmp_dir = mkdtempSync(join(tmpdir(), 'bdata-test-')); + }); + + afterEach(()=>{ + rmSync(tmp_dir, {recursive: true, force: true}); + }); + + const write = (name: string, content: string): string=>{ + const p = join(tmp_dir, name); + writeFileSync(p, content, 'utf8'); + return p; + }; + + it('reads newline-separated URLs', ()=>{ + const p = write('urls.txt', + 'https://a.com\nhttps://b.com\nhttps://c.com'); + expect(read_input_file(p)).toEqual([ + 'https://a.com', 'https://b.com', 'https://c.com']); + }); + + it('skips blank lines and # comments', ()=>{ + const p = write('urls.txt', + '# top comment\n' + +'https://a.com\n' + +'\n' + +' \n' + +'# section\n' + +'https://b.com # inline comment ok\n' + +'https://c.com'); + expect(read_input_file(p)).toEqual([ + 'https://a.com', 'https://b.com', 'https://c.com']); + }); + + it('reads JSON array of strings', ()=>{ + const p = write('urls.json', + JSON.stringify(['https://a.com', 'https://b.com'])); + expect(read_input_file(p)).toEqual([ + 'https://a.com', 'https://b.com']); + }); + + it('reads JSON array of {url} objects', ()=>{ + const p = write('urls.json', JSON.stringify([ + {url: 'https://a.com'}, + {url: 'https://b.com', extra: 'ignored'}, + ])); + expect(read_input_file(p)).toEqual([ + 
'https://a.com', 'https://b.com']); + }); + + it('throws on missing file', ()=>{ + expect(()=>read_input_file(join(tmp_dir, 'missing.txt'))) + .toThrow(/Cannot read --input-file/); + }); + + it('throws on malformed JSON', ()=>{ + const p = write('bad.json', '[{not valid'); + expect(()=>read_input_file(p)) + .toThrow(/failed to parse/); + }); + + it('throws on non-array JSON', ()=>{ + const p = write('obj.json', '{"url": "https://a.com"}'); + expect(()=>read_input_file(p)) + .toThrow(/must be an array/); + }); + + it('throws on JSON entry with neither string nor {url}', ()=>{ + const p = write('mixed.json', + JSON.stringify(['https://a.com', {wrong: 'field'}])); + expect(()=>read_input_file(p)) + .toThrow(/must be a string or an object with a "url"/); + }); + + it('returns empty array for an empty file', ()=>{ + const p = write('empty.txt', ' \n\n '); + expect(read_input_file(p)).toEqual([]); + }); + }); + + describe('resolve_run_inputs', ()=>{ + let tmp_dir: string; + + beforeEach(()=>{ + tmp_dir = mkdtempSync(join(tmpdir(), 'bdata-test-')); + }); + + afterEach(()=>{ + rmSync(tmp_dir, {recursive: true, force: true}); + }); + + it('returns the positional URL as a single-element list', ()=>{ + expect(resolve_run_inputs('https://a.com', {})) + .toEqual(['https://a.com']); + }); + + it('parses --urls', ()=>{ + expect(resolve_run_inputs(undefined, + {urls: 'https://a.com,https://b.com'})) + .toEqual(['https://a.com', 'https://b.com']); + }); + + it('reads --input-file', ()=>{ + const p = join(tmp_dir, 'urls.txt'); + writeFileSync(p, 'https://a.com\nhttps://b.com', 'utf8'); + expect(resolve_run_inputs(undefined, {inputFile: p})) + .toEqual(['https://a.com', 'https://b.com']); + }); + + it('rejects when no source is provided', ()=>{ + expect(()=>resolve_run_inputs(undefined, {})) + .toThrow(/requires one of: positional, --urls/); + }); + + it('rejects when multiple sources are provided', ()=>{ + expect(()=>resolve_run_inputs('https://a.com', + {urls: 'https://b.com'})) + 
.toThrow(/only one input source/); + expect(()=>resolve_run_inputs(undefined, + {urls: 'https://a.com', inputFile: '/tmp/x'})) + .toThrow(/only one input source/); + }); + + it('rejects when parsed list is empty', ()=>{ + expect(()=>resolve_run_inputs(undefined, {urls: ' , , '})) + .toThrow(/No URLs to scrape/); + }); + + it('rejects invalid URLs and names them', ()=>{ + expect(()=>resolve_run_inputs(undefined, + {urls: 'https://a.com,not-a-url,also bad'})) + .toThrow(/Invalid URL\(s\):.*not-a-url/); + }); + }); + + describe('handle_run_scraper multi-URL', ()=>{ + let fetch_spy: ReturnType; + let tmp_dir: string; + + beforeEach(()=>{ + fetch_spy = vi.spyOn(global, 'fetch') as never; + tmp_dir = mkdtempSync(join(tmpdir(), 'bdata-test-')); + }); + + afterEach(()=>{ + fetch_spy.mockRestore(); + rmSync(tmp_dir, {recursive: true, force: true}); + }); + + it('--urls posts an array body to /dca/trigger and polls /dca/dataset', + async()=>{ + mocks.post.mockResolvedValueOnce({collection_id: 'd_batch'}); + fetch_spy.mockImplementation(()=>Promise.resolve({ + status: 200, + text: ()=>Promise.resolve( + '[{"title":"A"},{"title":"B"},{"title":"C"}]'), + } as unknown as Response)); + mocks.poll_until.mockImplementationOnce(async(o: never)=>{ + const cfg = o as {fetch_once: ()=>Promise}; + const r = await cfg.fetch_once(); + return {result: r, attempts: 1, last_status: '__ready__'}; + }); + await handle_run_scraper('c_abc', undefined, { + urls: 'https://a.com,https://b.com,https://c.com', + }); + expect(mocks.post).toHaveBeenCalledTimes(1); + const call = mocks.post.mock.calls[0]; + expect(String(call[1])).toMatch(/\/dca\/trigger\?collector=c_abc/); + expect(call[2]).toEqual([ + {url: 'https://a.com'}, + {url: 'https://b.com'}, + {url: 'https://c.com'}, + ]); + expect(mocks.print).toHaveBeenCalledWith( + [{title: 'A'}, {title: 'B'}, {title: 'C'}], + {json: undefined, pretty: undefined, output: undefined} + ); + }); + + it('--input-file routes to the same batch path', async()=>{ 
+ const p = join(tmp_dir, 'urls.txt'); + writeFileSync(p, 'https://a.com\nhttps://b.com', 'utf8'); + mocks.post.mockResolvedValueOnce({collection_id: 'd_batch'}); + fetch_spy.mockImplementation(()=>Promise.resolve({ + status: 200, + text: ()=>Promise.resolve('[{"ok":1},{"ok":2}]'), + } as unknown as Response)); + mocks.poll_until.mockImplementationOnce(async(o: never)=>{ + const cfg = o as {fetch_once: ()=>Promise}; + const r = await cfg.fetch_once(); + return {result: r, attempts: 1, last_status: '__ready__'}; + }); + await handle_run_scraper('c_abc', undefined, {inputFile: p}); + expect(mocks.post.mock.calls[0][2]).toEqual([ + {url: 'https://a.com'}, + {url: 'https://b.com'}, + ]); + }); + + it('rejects --sync combined with --urls', async()=>{ + await expect( + handle_run_scraper('c_abc', undefined, { + sync: true, + urls: 'https://a.com,https://b.com', + }) + ).rejects.toThrow(/--sync cannot be combined with --urls/); + expect(mocks.fail).toHaveBeenCalledWith( + expect.stringContaining( + '--sync cannot be combined with --urls')); + expect(mocks.post).not.toHaveBeenCalled(); + }); + + it('rejects when no URL source is provided', async()=>{ + await expect( + handle_run_scraper('c_abc', undefined, {}) + ).rejects.toThrow( + /requires one of: positional, --urls, or --input-file/); + }); + + it('rejects when positional and --urls are both set', async()=>{ + await expect( + handle_run_scraper('c_abc', 'https://a.com', + {urls: 'https://b.com'}) + ).rejects.toThrow(/only one input source/); + }); + + it('single URL via --urls still takes the legacy single path', + async()=>{ + mocks.post.mockResolvedValueOnce({response_id: 'r_xyz'}); + fetch_spy.mockImplementation(()=>Promise.resolve({ + status: 200, + text: ()=>Promise.resolve('{"title":"only"}'), + } as unknown as Response)); + mocks.poll_until.mockImplementationOnce(async(o: never)=>{ + const cfg = o as {fetch_once: ()=>Promise}; + const r = await cfg.fetch_once(); + return {result: r, attempts: 1, last_status: 
/**
 * Report whether `s` is an absolute http(s) URL.
 *
 * Parsing alone is not enough: the WHATWG URL parser accepts any
 * `scheme:rest` string, so inputs such as `localhost:3000` or
 * `mailto:a@b.c` parse successfully. Only web URLs are accepted here.
 *
 * @param s Candidate URL string.
 * @returns `true` when `s` parses and its protocol is `http:` or `https:`.
 */
const is_valid_url = (s: string): boolean=>{
    let parsed: URL;
    try {
        parsed = new URL(s);
    } catch {
        return false;
    }
    return parsed.protocol == 'http:' || parsed.protocol == 'https:';
};

/**
 * Split a comma-separated `--urls` value into individual entries.
 *
 * Entries are trimmed and empty entries are dropped. The split is purely
 * textual, so a URL that itself contains a comma is cut into pieces.
 *
 * @param raw Raw option value.
 * @returns The non-empty, trimmed entries in their original order.
 */
const parse_urls_arg = (raw: string): string[]=>{
    return raw.split(',')
        .map(u=>u.trim())
        .filter(u=>u.length > 0);
};

/**
 * Read the URL list from an `--input-file`.
 *
 * Content whose first non-blank character is `[` or `{` is treated as JSON
 * and must be an array whose entries are strings or objects carrying a
 * string `url` field. Anything else is treated as text with one URL per
 * line, where blank lines, `#` comment lines and trailing ` # ...`
 * comments are removed.
 *
 * @param path Path of the file to read.
 * @returns The URLs found, trimmed; an empty array for a blank file.
 * @throws Error when the file cannot be read, when JSON-looking content
 *   fails to parse or is not an array, or when an array entry has an
 *   unsupported shape.
 */
const read_input_file = (path: string): string[]=>{
    let raw: string;
    try {
        raw = readFileSync(path, 'utf8');
    } catch(e) {
        const reason = e instanceof Error ? e.message : String(e);
        throw new Error(
            `Cannot read --input-file "${path}": ${reason}`);
    }
    const trimmed = raw.trim();
    if (!trimmed)
        return [];
    if (trimmed.startsWith('[') || trimmed.startsWith('{'))
    {
        let parsed: unknown;
        try {
            parsed = JSON.parse(trimmed);
        } catch(e) {
            const reason = e instanceof Error ? e.message : String(e);
            throw new Error(
                `--input-file "${path}" looks like JSON but failed to parse: `
                +`${reason}`);
        }
        if (!Array.isArray(parsed))
            throw new Error(
                `--input-file "${path}" JSON must be an array of URL `
                +`strings or {url} objects, got ${typeof parsed}.`);
        const urls: string[] = [];
        for (const [i, item] of (parsed as unknown[]).entries())
        {
            // An entry is either the URL itself or an object wrapping it.
            let candidate: unknown = item;
            if (item !== null && typeof item == 'object')
                candidate = (item as {url?: unknown}).url;
            if (typeof candidate != 'string')
                throw new Error(
                    `--input-file "${path}" entry ${i} must be a string or `
                    +'an object with a "url" string field.');
            // Trim so JSON entries match the text-file and --urls paths.
            urls.push(candidate.trim());
        }
        return urls;
    }
    return trimmed.split(/\r?\n/)
        .map(line=>line.replace(/\s+#.*$/, '').trim())
        .filter(line=>line.length > 0 && !line.startsWith('#'));
};

/**
 * Resolve the URL list for `scraper run` from its three input sources.
 *
 * Exactly one of the positional URL, `--urls` and `--input-file` may be
 * supplied. Empty strings count as "not supplied".
 *
 * @param positional Positional URL argument, if any.
 * @param opts Parsed options; only `urls` and `inputFile` are read. The
 *   type is spelled structurally, so any options object that carries these
 *   two optional fields is accepted.
 * @returns A non-empty list of http(s) URLs.
 * @throws Error when no source or more than one source is given, when the
 *   chosen source yields no URLs, or when any URL is invalid (the first
 *   three offenders are named in the message).
 */
const resolve_run_inputs = (
    positional: string|undefined,
    opts: {urls?: string; inputFile?: string}
): string[]=>{
    const urls_arg = opts.urls;
    const input_file = opts.inputFile;
    const sources: string[] = [];
    if (positional)
        sources.push('<url>');
    if (urls_arg)
        sources.push('--urls');
    if (input_file)
        sources.push('--input-file');
    if (sources.length > 1)
        throw new Error(
            `scraper run accepts only one input source; got: `
            +`${sources.join(', ')}. Pick one.`);
    let urls: string[];
    if (positional)
        urls = [positional];
    else if (urls_arg)
        urls = parse_urls_arg(urls_arg);
    else if (input_file)
        urls = read_input_file(input_file);
    else
        throw new Error(
            'scraper run requires one of: positional, --urls, '
            +'or --input-file.');
    if (urls.length == 0)
        throw new Error('No URLs to scrape after parsing inputs.');
    const invalid = urls.filter(u=>!is_valid_url(u));
    if (invalid.length > 0)
        throw new Error(
            `Invalid URL(s): ${invalid.slice(0, 3).join(', ')}`
            +(invalid.length > 3 ? ` (+${invalid.length - 3} more)` : ''));
    return urls;
};
String(BATCH_TIMEOUT_DEFAULT); let timeout = BATCH_TIMEOUT_DEFAULT; @@ -502,8 +606,17 @@ const run_batch = async( fail((e as Error).message); return; } - console.error(dim( - 'Realtime page limit exceeded — switching to batch mode...')); + if (reason == 'page_limit_fallback') + { + console.error(dim( + 'Realtime page limit exceeded — switching to batch mode...')); + } + else + { + console.error(dim( + `Running batch for ${urls.length} URLs ` + +'via /dca/trigger...')); + } const trigger_spinner = start_spinner('Submitting batch job...'); let collection_id = ''; try { @@ -511,7 +624,7 @@ const run_batch = async( const trigger = await post( api_key, `${BATCH_TRIGGER_ENDPOINT}?${query}`, - [build_run_request(url)], + urls.map(build_run_request), {timing: opts.timing, hints: SCRAPER_BODY_HINTS} ); trigger_spinner.stop(); @@ -570,10 +683,31 @@ const run_batch = async( const handle_run_scraper = async( collector_id: string, - url: string, + url: string|undefined, opts: Scraper_run_opts )=>{ const api_key = ensure_authenticated(opts.apiKey); + let urls: string[]; + try { + urls = resolve_run_inputs(url, opts); + } catch(e) { + fail((e as Error).message); + return; + } + if (urls.length > 1) + { + if (opts.sync) + { + fail( + '--sync cannot be combined with --urls / --input-file. ' + +'The /dca/crawl endpoint accepts only a single URL. 
' + +'Drop --sync to use the batch endpoint (/dca/trigger).'); + return; + } + await run_batch(api_key, collector_id, urls, opts, 'multi_url'); + return; + } + const single_url = urls[0]; if (opts.sync) { let sync_timeout = SYNC_TIMEOUT_DEFAULT; @@ -590,7 +724,7 @@ const handle_run_scraper = async( const res = await fetch_raw(api_key, `${SYNC_CRAWL_ENDPOINT}?${query}`, { method: 'POST', - body: JSON.stringify(build_run_request(url)), + body: JSON.stringify(build_run_request(single_url)), }); spinner.stop(); if (res.status == 202) @@ -618,7 +752,7 @@ const handle_run_scraper = async( const data = parse_result_body(res.body); if (is_realtime_page_limit_error(data)) { - await run_batch(api_key, collector_id, url, opts); + await run_batch(api_key, collector_id, [single_url], opts); return; } print(data, {json: opts.json, pretty: opts.pretty, @@ -647,7 +781,7 @@ const handle_run_scraper = async( const trigger = await post( api_key, `${TRIGGER_IMMEDIATE_ENDPOINT}?${trigger_query}`, - build_run_request(url), + build_run_request(single_url), {timing: opts.timing, hints: SCRAPER_BODY_HINTS} ); trigger_spinner.stop(); @@ -689,7 +823,7 @@ const handle_run_scraper = async( const data = parse_result_body(poll_result.result.body); if (is_realtime_page_limit_error(data)) { - await run_batch(api_key, collector_id, url, opts); + await run_batch(api_key, collector_id, [single_url], opts); return; } print(data, {json: opts.json, pretty: opts.pretty, @@ -737,17 +871,28 @@ const create_subcommand = new Command('create') .action(handle_create_scraper); const run_subcommand = new Command('run') - .description('Run a Bright Data scraper on a URL and return the data') + .description( + 'Run a Bright Data scraper on one or more URLs and return the data') .argument('', 'Collector ID of the scraper (returned by `scraper create`)') - .argument('', 'URL to scrape') + .argument('[url]', + 'URL to scrape. 
Omit when using --urls or --input-file.') + .option('--urls ', + 'Comma-separated list of URLs. Mirror of triggerWithUrls / ' + +'trigger_with_urls from the Bright Data Scraper Studio ' + +'reference SDKs. Routes via /dca/trigger as a single batch.') + .option('--input-file ', + 'Path to a file with URLs: one per line (# comments and ' + +'blank lines skipped), OR a JSON array of strings, OR a ' + +'JSON array of {"url": "..."} objects.') .option('--sync', - 'Use the synchronous /dca/crawl endpoint (server-side 25-50s cap)') + 'Use the synchronous /dca/crawl endpoint (server-side 25-50s cap). ' + +'Single-URL only.') .option('--sync-timeout ', `Sync-mode server timeout (${SYNC_TIMEOUT_MIN}-${SYNC_TIMEOUT_MAX}, ` +`default ${SYNC_TIMEOUT_DEFAULT})`) .option('--timeout ', - 'Polling timeout in async mode (default: 600)') + 'Polling timeout in async mode (default: 600; batch mode: 3600)') .option('--name ', 'Human-readable job name') .option('--version ', 'Scraper version (e.g. "dev")') .option('-o, --output ', 'Write output to file') @@ -786,4 +931,8 @@ export { AI_TRIGGER_DEFAULT_RETRIES, AI_TRIGGER_RETRY_BASE_MS, AI_TRIGGER_RETRY_MAX_MS, + parse_urls_arg, + read_input_file, + resolve_run_inputs, + is_valid_url, }; diff --git a/src/types/scraper.ts b/src/types/scraper.ts index 29998be..7e1ab32 100644 --- a/src/types/scraper.ts +++ b/src/types/scraper.ts @@ -85,6 +85,8 @@ type Scraper_run_opts = { pretty?: boolean; timing?: boolean; apiKey?: string; + urls?: string; + inputFile?: string; }; type Batch_trigger_response = {