Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 29 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -410,27 +410,38 @@ brightdata scraper create https://example.com/product/1 \

### `scraper run`

Run a scraper (built with `scraper create` or in the web UI) against a URL and get the extracted data.
Run a scraper (built with `scraper create` or in the web UI) against one or more URLs and get the extracted data.

```bash
brightdata scraper run <collector_id> <url> [options]
brightdata scraper run <collector_id> [url] [options]
```

Provide URLs in exactly one of three ways:

- Positional `<url>` — single URL (legacy form, unchanged).
- `--urls <u1,u2,...>` — comma-separated list.
- `--input-file <path>` — file with one URL per line (blank lines and `#` comments are skipped), **or** a JSON array of URL strings, **or** a JSON array of `{"url": "..."}` objects.

| Flag | Description |
|---|---|
| `--sync` | Use the synchronous `/dca/crawl` endpoint (server-side cap of 25–50s) |
| `--urls <list>` | Comma-separated list of URLs (multi-URL batch path) |
| `--input-file <path>` | File with URLs (txt one-per-line, or JSON array) |
| `--sync` | Use the synchronous `/dca/crawl` endpoint (single-URL only, server-side cap of 25–50s) |
| `--sync-timeout <seconds>` | Sync-mode server timeout, `25`–`50` (default: `50`) |
| `--timeout <seconds>` | Async polling timeout (default: `600`) |
| `--timeout <seconds>` | Polling timeout (default: `600` single-URL, `3600` batch) |
| `--name <name>` | Human-readable job name |
| `--version <version>` | Scraper version (e.g. `dev`) |
| `-o, --output <path>` | Write output to file |
| `--json` / `--pretty` | JSON output (raw / indented) |
| `--timing` | Show request timing |
| `-k, --api-key <key>` | Override API key |

By default the command uses the async flow: it triggers `/dca/trigger_immediate`, gets back a `response_id`, and polls `/dca/get_result` until the data is ready. Use `--sync` for one-shot scrapes that you expect to finish within ~50 seconds; on a sync server-side timeout the command exits with the `response_id` so you can re-run without `--sync` to poll for the result.
**Routing**

- **Single URL** (positional, or one entry via `--urls` / `--input-file`) → async flow: `/dca/trigger_immediate` → poll `/dca/get_result`. Use `--sync` for `/dca/crawl` (one-shot, 25–50s).
- **Multiple URLs** (`--urls` / `--input-file` with 2+ entries) → single POST to `/dca/trigger` with an array body, one `collection_id`, polled via `/dca/dataset`. This mirrors the canonical batch shape used by the reference SDKs ([`triggerWithUrls`](https://github.com/brightdata/bright-data-scraper-studio-nodejs-project) / [`trigger_with_urls`](https://github.com/brightdata/bright-data-scraper-studio-python-project)). `--sync` is incompatible with multi-URL — `/dca/crawl` accepts only a single URL.

If a URL expands to more pages than the realtime job limit allows (e.g. paginated listings, infinite scroll), the CLI automatically falls back to the batch endpoint (`/dca/trigger` → poll `/dca/dataset`). The fallback prints a one-line notice and adjusts the poll interval and timeout for the longer batch wait. No flag required.
If a single URL expands to more pages than the realtime job limit allows (paginated listings, infinite scroll), the CLI automatically falls back to the batch endpoint and prints a one-line notice. No flag required.

**Examples**

Expand All @@ -448,6 +459,18 @@ brightdata scraper run c_mp3tuab31lswoxvpws https://example.com/p/1 --sync
# Sync with a shorter server timeout and a job name
brightdata scraper run c_mp3tuab31lswoxvpws https://example.com/p/1 \
--sync --sync-timeout 30 --name first-test

# Multi-URL batch — one API call, one snapshot, one merged result array
brightdata scraper run c_mp3tuab31lswoxvpws \
--urls "https://example.com/p/1,https://example.com/p/2,https://example.com/p/3" \
--pretty -o products.json

# Multi-URL from a file (one URL per line; # comments and blank lines skipped)
brightdata scraper run c_mp3tuab31lswoxvpws --input-file urls.txt -o products.json

# Multi-URL from a JSON array
echo '["https://example.com/p/1","https://example.com/p/2"]' > urls.json
brightdata scraper run c_mp3tuab31lswoxvpws --input-file urls.json
```

---
Expand Down
288 changes: 288 additions & 0 deletions src/__tests__/commands/scraper.test.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import {writeFileSync, mkdtempSync, rmSync} from 'node:fs';
import {tmpdir} from 'node:os';
import {join} from 'node:path';
import {describe, it, expect, beforeEach, afterEach, vi} from 'vitest';
import {Command} from 'commander';
import type {Scraper_create_opts} from '../../types/scraper';
Expand Down Expand Up @@ -68,6 +71,10 @@ import {
AI_TRIGGER_DEFAULT_RETRIES,
AI_TRIGGER_RETRY_BASE_MS,
AI_TRIGGER_RETRY_MAX_MS,
parse_urls_arg,
read_input_file,
resolve_run_inputs,
is_valid_url,
} from '../../commands/scraper';

describe('commands/scraper', ()=>{
Expand Down Expand Up @@ -1163,4 +1170,285 @@ describe('commands/scraper', ()=>{
error.mockRestore();
});
});

// is_valid_url: http/https URLs are accepted; free text, the empty string
// and a whitespace-only string are rejected.
describe('is_valid_url', ()=>{
    it('accepts http/https URLs', ()=>{
        const well_formed = [
            'https://example.com',
            'http://example.com/a/b?c=1',
        ];
        for (const candidate of well_formed)
            expect(is_valid_url(candidate)).toBe(true);
    });

    it('rejects garbage', ()=>{
        const malformed = ['not a url', '', ' '];
        for (const candidate of malformed)
            expect(is_valid_url(candidate)).toBe(false);
    });
});

// parse_urls_arg: turns the comma-separated --urls value into a list,
// trimming each entry and discarding entries that end up empty.
describe('parse_urls_arg', ()=>{
    it('splits, trims, and drops empties', ()=>{
        const raw = ' https://a.com , https://b.com ,, https://c.com';
        const parsed = parse_urls_arg(raw);
        expect(parsed).toEqual(
            ['https://a.com', 'https://b.com', 'https://c.com']);
    });

    it('returns single URL for a non-comma input', ()=>{
        const parsed = parse_urls_arg('https://only.example.com');
        expect(parsed).toEqual(['https://only.example.com']);
    });

    it('returns empty array for blank input', ()=>{
        // Both a truly empty value and one holding only separators
        // must yield no URLs.
        const blank_inputs = ['', ' , , '];
        for (const raw of blank_inputs)
            expect(parse_urls_arg(raw)).toEqual([]);
    });
});

// read_input_file: covers the plain-text format (one URL per line, with
// blank lines and # comments skipped), both accepted JSON array shapes,
// and each error path. Every test works inside a fresh scratch directory
// that is removed afterwards.
describe('read_input_file', ()=>{
    let tmp_dir: string;

    beforeEach(()=>{
        tmp_dir = mkdtempSync(join(tmpdir(), 'bdata-test-'));
    });

    afterEach(()=>{
        rmSync(tmp_dir, {recursive: true, force: true});
    });

    // Writes `content` to `name` inside the scratch dir; returns the path.
    const make_fixture = (name: string, content: string): string=>{
        const fixture_path = join(tmp_dir, name);
        writeFileSync(fixture_path, content, 'utf8');
        return fixture_path;
    };

    it('reads newline-separated URLs', ()=>{
        const fixture = make_fixture('urls.txt',
            'https://a.com\nhttps://b.com\nhttps://c.com');
        const urls = read_input_file(fixture);
        expect(urls).toEqual([
            'https://a.com', 'https://b.com', 'https://c.com']);
    });

    it('skips blank lines and # comments', ()=>{
        // Mixes full-line comments, an empty line, a whitespace-only
        // line and a trailing inline comment.
        const content = '# top comment\n'
            +'https://a.com\n'
            +'\n'
            +' \n'
            +'# section\n'
            +'https://b.com # inline comment ok\n'
            +'https://c.com';
        const fixture = make_fixture('urls.txt', content);
        const urls = read_input_file(fixture);
        expect(urls).toEqual([
            'https://a.com', 'https://b.com', 'https://c.com']);
    });

    it('reads JSON array of strings', ()=>{
        const payload = JSON.stringify(['https://a.com', 'https://b.com']);
        const fixture = make_fixture('urls.json', payload);
        expect(read_input_file(fixture)).toEqual([
            'https://a.com', 'https://b.com']);
    });

    it('reads JSON array of {url} objects', ()=>{
        const payload = JSON.stringify([
            {url: 'https://a.com'},
            {url: 'https://b.com', extra: 'ignored'},
        ]);
        const fixture = make_fixture('urls.json', payload);
        expect(read_input_file(fixture)).toEqual([
            'https://a.com', 'https://b.com']);
    });

    it('throws on missing file', ()=>{
        const absent = join(tmp_dir, 'missing.txt');
        const attempt = ()=>read_input_file(absent);
        expect(attempt).toThrow(/Cannot read --input-file/);
    });

    it('throws on malformed JSON', ()=>{
        const fixture = make_fixture('bad.json', '[{not valid');
        const attempt = ()=>read_input_file(fixture);
        expect(attempt).toThrow(/failed to parse/);
    });

    it('throws on non-array JSON', ()=>{
        const fixture = make_fixture('obj.json', '{"url": "https://a.com"}');
        const attempt = ()=>read_input_file(fixture);
        expect(attempt).toThrow(/must be an array/);
    });

    it('throws on JSON entry with neither string nor {url}', ()=>{
        const payload = JSON.stringify(['https://a.com', {wrong: 'field'}]);
        const fixture = make_fixture('mixed.json', payload);
        const attempt = ()=>read_input_file(fixture);
        expect(attempt).toThrow(
            /must be a string or an object with a "url"/);
    });

    it('returns empty array for an empty file', ()=>{
        const fixture = make_fixture('empty.txt', ' \n\n ');
        expect(read_input_file(fixture)).toEqual([]);
    });
});

// resolve_run_inputs: takes the positional URL plus the run options and
// returns the URL list from exactly one source (positional, --urls or
// --input-file); it throws when the source count is not one, when the
// list is empty, or when any entry is not a valid URL.
describe('resolve_run_inputs', ()=>{
    let tmp_dir: string;

    beforeEach(()=>{
        tmp_dir = mkdtempSync(join(tmpdir(), 'bdata-test-'));
    });

    afterEach(()=>{
        rmSync(tmp_dir, {recursive: true, force: true});
    });

    it('returns the positional URL as a single-element list', ()=>{
        const resolved = resolve_run_inputs('https://a.com', {});
        expect(resolved).toEqual(['https://a.com']);
    });

    it('parses --urls', ()=>{
        const opts = {urls: 'https://a.com,https://b.com'};
        const resolved = resolve_run_inputs(undefined, opts);
        expect(resolved).toEqual(['https://a.com', 'https://b.com']);
    });

    it('reads --input-file', ()=>{
        const list_path = join(tmp_dir, 'urls.txt');
        writeFileSync(list_path, 'https://a.com\nhttps://b.com', 'utf8');
        const resolved = resolve_run_inputs(undefined,
            {inputFile: list_path});
        expect(resolved).toEqual(['https://a.com', 'https://b.com']);
    });

    it('rejects when no source is provided', ()=>{
        const attempt = ()=>resolve_run_inputs(undefined, {});
        expect(attempt).toThrow(
            /requires one of: <url> positional, --urls/);
    });

    it('rejects when multiple sources are provided', ()=>{
        // positional + --urls
        const positional_and_urls = ()=>resolve_run_inputs(
            'https://a.com', {urls: 'https://b.com'});
        expect(positional_and_urls).toThrow(/only one input source/);
        // --urls + --input-file
        const urls_and_file = ()=>resolve_run_inputs(undefined,
            {urls: 'https://a.com', inputFile: '/tmp/x'});
        expect(urls_and_file).toThrow(/only one input source/);
    });

    it('rejects when parsed list is empty', ()=>{
        const attempt = ()=>resolve_run_inputs(undefined, {urls: ' , , '});
        expect(attempt).toThrow(/No URLs to scrape/);
    });

    it('rejects invalid URLs and names them', ()=>{
        const attempt = ()=>resolve_run_inputs(undefined,
            {urls: 'https://a.com,not-a-url,also bad'});
        expect(attempt).toThrow(/Invalid URL\(s\):.*not-a-url/);
    });
});

// Routing tests for handle_run_scraper when URLs come from --urls or
// --input-file. Two or more URLs must be sent as one array body to
// /dca/trigger; a single URL must still go to /dca/trigger_immediate.
// Global fetch is spied on per test and a scratch directory is created
// for input files; both are cleaned up in afterEach.
describe('handle_run_scraper multi-URL', ()=>{
    let fetch_spy: ReturnType<typeof vi.spyOn>;
    let tmp_dir: string;

    beforeEach(()=>{
        fetch_spy = vi.spyOn(global, 'fetch') as never;
        tmp_dir = mkdtempSync(join(tmpdir(), 'bdata-test-'));
    });

    afterEach(()=>{
        fetch_spy.mockRestore();
        rmSync(tmp_dir, {recursive: true, force: true});
    });

    // Makes every fetch() call resolve with status 200 and `body` as the
    // raw response text.
    const stub_fetch_body = (body: string): void=>{
        fetch_spy.mockImplementation(()=>Promise.resolve({
            status: 200,
            text: ()=>Promise.resolve(body),
        } as unknown as Response));
    };

    // Makes the next poll_until call invoke the caller-supplied
    // fetch_once exactly once and return its value as the result, so the
    // test never actually waits or retries.
    const stub_single_poll = (): void=>{
        mocks.poll_until.mockImplementationOnce(async(o: never)=>{
            const cfg = o as {fetch_once: ()=>Promise<unknown>};
            const r = await cfg.fetch_once();
            return {result: r, attempts: 1, last_status: '__ready__'};
        });
    };

    it('--urls posts an array body to /dca/trigger and polls /dca/dataset',
        async()=>{
            mocks.post.mockResolvedValueOnce({collection_id: 'd_batch'});
            stub_fetch_body('[{"title":"A"},{"title":"B"},{"title":"C"}]');
            stub_single_poll();
            await handle_run_scraper('c_abc', undefined, {
                urls: 'https://a.com,https://b.com,https://c.com',
            });
            // One POST for the whole batch, not one per URL.
            expect(mocks.post).toHaveBeenCalledTimes(1);
            const call = mocks.post.mock.calls[0];
            expect(String(call[1])).toMatch(
                /\/dca\/trigger\?collector=c_abc/);
            expect(call[2]).toEqual([
                {url: 'https://a.com'},
                {url: 'https://b.com'},
                {url: 'https://c.com'},
            ]);
            // The parsed fetch body is what gets printed.
            expect(mocks.print).toHaveBeenCalledWith(
                [{title: 'A'}, {title: 'B'}, {title: 'C'}],
                {json: undefined, pretty: undefined, output: undefined}
            );
        });

    it('--input-file routes to the same batch path', async()=>{
        const p = join(tmp_dir, 'urls.txt');
        writeFileSync(p, 'https://a.com\nhttps://b.com', 'utf8');
        mocks.post.mockResolvedValueOnce({collection_id: 'd_batch'});
        stub_fetch_body('[{"ok":1},{"ok":2}]');
        stub_single_poll();
        await handle_run_scraper('c_abc', undefined, {inputFile: p});
        expect(mocks.post.mock.calls[0][2]).toEqual([
            {url: 'https://a.com'},
            {url: 'https://b.com'},
        ]);
    });

    it('rejects --sync combined with --urls', async()=>{
        await expect(
            handle_run_scraper('c_abc', undefined, {
                sync: true,
                urls: 'https://a.com,https://b.com',
            })
        ).rejects.toThrow(/--sync cannot be combined with --urls/);
        expect(mocks.fail).toHaveBeenCalledWith(
            expect.stringContaining(
                '--sync cannot be combined with --urls'));
        // The rejection must happen before any request is sent.
        expect(mocks.post).not.toHaveBeenCalled();
    });

    it('rejects when no URL source is provided', async()=>{
        await expect(
            handle_run_scraper('c_abc', undefined, {})
        ).rejects.toThrow(
            /requires one of: <url> positional, --urls, or --input-file/);
    });

    it('rejects when positional and --urls are both set', async()=>{
        await expect(
            handle_run_scraper('c_abc', 'https://a.com',
                {urls: 'https://b.com'})
        ).rejects.toThrow(/only one input source/);
    });

    it('single URL via --urls still takes the legacy single path',
        async()=>{
            mocks.post.mockResolvedValueOnce({response_id: 'r_xyz'});
            stub_fetch_body('{"title":"only"}');
            stub_single_poll();
            await handle_run_scraper('c_abc', undefined,
                {urls: 'https://only.com'});
            // A one-entry list is posted as a single object, not an array.
            expect(String(mocks.post.mock.calls[0][1])).toMatch(
                /\/dca\/trigger_immediate\?collector=c_abc/);
            expect(mocks.post.mock.calls[0][2]).toEqual(
                {url: 'https://only.com'});
        });
});
});
Loading