Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 78 additions & 3 deletions src/__tests__/commands/scraper.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ import {
parse_sync_timeout,
is_realtime_page_limit_error,
classify_dataset,
SCRAPER_BODY_HINTS,
} from '../../commands/scraper';

describe('commands/scraper', ()=>{
Expand All @@ -68,6 +69,80 @@ describe('commands/scraper', ()=>{
mocks.start.mockReturnValue({stop: mocks.stop});
});

// Scraper hints live with this command, not the shared client, and
// must travel to the HTTP layer on every API call the command makes.
describe('SCRAPER_BODY_HINTS', ()=>{
it('contains a hint for the stub-collector 403 response body',
()=>{
const stub_hint = SCRAPER_BODY_HINTS.find(h=>
h.pattern.test('Collector does not have a template'));
expect(stub_hint).toBeDefined();
expect(stub_hint!.hint).toMatch(/AI generation has not completed/);
expect(stub_hint!.hint).toMatch(/bdata scraper create/);
// Must NOT echo the misleading generic 403 hint.
expect(stub_hint!.hint).not.toMatch(/zone permissions/);
});

it('contains a hint for the parallel-job cap 429 response body',
()=>{
const cap_hint = SCRAPER_BODY_HINTS.find(h=>
h.pattern.test('Cannot run more than 3 jobs in parallel'));
expect(cap_hint).toBeDefined();
expect(cap_hint!.hint).toMatch(/concurrent-job cap/);
expect(cap_hint!.hint).toMatch(/serialise/);
});

it('cap pattern is case-insensitive and number-agnostic '
+'(future-proof if the cap changes)', ()=>{
const cap_pattern = SCRAPER_BODY_HINTS[1].pattern;
expect(cap_pattern.test('cannot run more than 5 jobs '
+'in parallel')).toBe(true);
expect(cap_pattern.test('CANNOT RUN MORE THAN 100 JOBS '
+'IN PARALLEL')).toBe(true);
});

it('is passed via `hints` opt to client.post on every '
+'AI-Flow call in handle_create_scraper', async()=>{
mocks.post
.mockResolvedValueOnce({id: 'c_abc', name: 'cli-scraper-1'})
.mockResolvedValueOnce({id: 'ia_xyz', queued: false});
mocks.poll_until.mockResolvedValue({
result: {status: 'done', completed_steps: []},
attempts: 1,
});
await handle_create_scraper(
'https://example.com/p/1',
'extract title',
{}
);
for (const call of mocks.post.mock.calls)
{
const opts = call[3] as {hints?: unknown};
expect(opts).toBeDefined();
expect(opts.hints).toBe(SCRAPER_BODY_HINTS);
}
});

it('is passed via `hints` opt to client.post on the '
+'trigger_immediate call in handle_run_scraper', async()=>{
mocks.post.mockResolvedValueOnce({response_id: 'r_xyz'});
const fetch_spy = vi.spyOn(global, 'fetch')
.mockImplementation(()=>Promise.resolve({
status: 200,
text: ()=>Promise.resolve('{"title":"ok"}'),
} as unknown as Response));
mocks.poll_until.mockImplementation(async(o: never)=>{
const cfg = o as {fetch_once: ()=>Promise<unknown>};
const r = await cfg.fetch_once();
return {result: r, attempts: 1};
});
await handle_run_scraper('c_abc', 'https://x.com/p/1', {});
const opts = mocks.post.mock.calls[0][3] as {hints?: unknown};
expect(opts.hints).toBe(SCRAPER_BODY_HINTS);
fetch_spy.mockRestore();
});
});

describe('build_template_request', ()=>{
it('uses auto-generated name and stub webhook by default', ()=>{
const before = Math.floor(Date.now()/1000);
Expand Down Expand Up @@ -176,15 +251,15 @@ describe('commands/scraper', ()=>{
expect.objectContaining({
deliver: expect.objectContaining({type: 'webhook'}),
}),
{timing: undefined}
expect.objectContaining({timing: undefined})
);
expect(mocks.post).toHaveBeenNthCalledWith(
2,
'api_key',
'/dca/collectors/c_abc/automate_template',
{description: 'extract title',
urls: ['https://example.com/p/1']},
{timing: undefined}
expect.objectContaining({timing: undefined})
);
expect(mocks.poll_until).toHaveBeenCalledTimes(1);
expect(mocks.poll_until).toHaveBeenCalledWith(
Expand Down Expand Up @@ -404,7 +479,7 @@ describe('commands/scraper', ()=>{
expect.stringMatching(
/\/dca\/trigger_immediate\?collector=c_abc/),
{url: 'https://x.com/p/1'},
{timing: undefined}
expect.objectContaining({timing: undefined})
);
expect(mocks.print).toHaveBeenCalledWith(
{title: 'hello'},
Expand Down
77 changes: 77 additions & 0 deletions src/__tests__/utils/client.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import {describe, it, expect} from 'vitest';
import {pick_hint, ERROR_HINTS, type Body_hint} from '../../utils/client';

// Generic mock patterns only — the client ships no command-specific
// patterns of its own, so the pick_hint contract stays clear.
const MOCK_HINTS: Body_hint[] = [
{pattern: /quota.*exceeded/i,
hint: 'Your monthly quota is exhausted. Top up in the dashboard.'},
{pattern: /not allowed in region/i,
hint: 'This API is geo-restricted. Use --country to override.'},
];

describe('utils/client.pick_hint', ()=>{
it('returns the status-based hint when no extra hints are passed', ()=>{
expect(pick_hint(401, 'anything')).toBe(ERROR_HINTS[401]);
expect(pick_hint(403, 'anything')).toBe(ERROR_HINTS[403]);
expect(pick_hint(404, 'anything')).toBe(ERROR_HINTS[404]);
expect(pick_hint(429, 'anything')).toBe(ERROR_HINTS[429]);
});

it('returns the status-based hint when extra hints are passed '
+'but nothing matches the body', ()=>{
expect(pick_hint(403, 'plain unauthorized message', MOCK_HINTS))
.toBe(ERROR_HINTS[403]);
});

it('overrides with the first matching extra hint', ()=>{
expect(pick_hint(429, 'monthly quota exceeded', MOCK_HINTS))
.toMatch(/Top up in the dashboard/);
expect(pick_hint(403, 'not allowed in region: CN', MOCK_HINTS))
.toMatch(/geo-restricted/);
});

it('is case-insensitive on the caller\'s pattern', ()=>{
expect(pick_hint(429, 'MONTHLY QUOTA EXCEEDED', MOCK_HINTS))
.toMatch(/Top up in the dashboard/);
});

it('respects first-match-wins order in the caller\'s list', ()=>{
const ordered: Body_hint[] = [
{pattern: /generic/, hint: 'first wins'},
{pattern: /generic/, hint: 'second loses'},
];
expect(pick_hint(500, 'generic error', ordered))
.toBe('first wins');
});

it('returns undefined for unknown statuses with no body match', ()=>{
expect(pick_hint(418, 'teapot')).toBeUndefined();
expect(pick_hint(599, 'mystery')).toBeUndefined();
});

it('does not consult ERROR_HINTS when an extra-hint pattern '
+'matches', ()=>{
// The override should win even when the generic status hint
// would also be applicable.
const matches_403: Body_hint[] = [
{pattern: /custom 403 case/i,
hint: 'override hint, not the zone-permissions one'},
];
expect(pick_hint(403, 'this is a custom 403 case', matches_403))
.toBe('override hint, not the zone-permissions one');
});

it('does not leak any scraper-studio vocabulary in the default '
+'hints', ()=>{
// The shared client must stay generic. Catches accidental
// regressions where someone adds a domain-specific hint here.
for (const hint of Object.values(ERROR_HINTS))
{
expect(hint).not.toMatch(/scraper create/i);
expect(hint).not.toMatch(/collector/i);
expect(hint).not.toMatch(/AI[- ]Flow/i);
expect(hint).not.toMatch(/automate_template/i);
}
});
});
32 changes: 26 additions & 6 deletions src/commands/scraper.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import {Command} from 'commander';
import {post, get} from '../utils/client';
import {post, get, type Body_hint} from '../utils/client';
import {load as load_config} from '../utils/config';
import {ensure_authenticated} from '../utils/auth';
import {start as start_spinner} from '../utils/spinner';
Expand All @@ -18,6 +18,25 @@ import type {
Batch_trigger_response,
} from '../types/scraper';

// Scraper-studio body-pattern hints. Kept here, not in client.ts, so
// the DCA vocabulary doesn't leak into other commands. First match wins.
const SCRAPER_BODY_HINTS: Body_hint[] = [
{
pattern: /collector does not have a template/i,
hint: 'AI generation has not completed for this collector. '
+'Re-run `bdata scraper create` (the previous attempt may '
+'have hit the AI-Flow parallel-job cap), or open '
+'https://brightdata.com/cp/scrapers to inspect / finish '
+'the half-built collector in the web UI.',
},
{
pattern: /cannot run more than \d+ jobs in parallel/i,
hint: 'You hit the AI-Flow concurrent-job cap. Wait for an '
+'in-flight `scraper create` to finish, or serialise '
+'your launches.',
},
];

const CREATE_TEMPLATE_ENDPOINT = '/dca/collector';
const AI_TRIGGER_PATH = 'automate_template';
const AI_PROGRESS_PATH = 'automate_template/progress';
Expand Down Expand Up @@ -105,7 +124,7 @@ const handle_create_scraper = async(
api_key,
CREATE_TEMPLATE_ENDPOINT,
template_body,
{timing: opts.timing}
{timing: opts.timing, hints: SCRAPER_BODY_HINTS}
);
create_spinner.stop();
if (!template.id)
Expand All @@ -129,7 +148,7 @@ const handle_create_scraper = async(
api_key,
`/dca/collectors/${collector_id}/${AI_TRIGGER_PATH}`,
build_ai_request(url, description),
{timing: opts.timing}
{timing: opts.timing, hints: SCRAPER_BODY_HINTS}
);
trigger_spinner.stop();
} catch(e) {
Expand All @@ -148,7 +167,7 @@ const handle_create_scraper = async(
fetch_once: ()=>get<Ai_progress_response>(
api_key,
`/dca/collectors/${collector_id}/${AI_PROGRESS_PATH}`,
{timing: opts.timing}
{timing: opts.timing, hints: SCRAPER_BODY_HINTS}
),
get_status: extract_progress_status,
running_statuses: [RUNNING_SENTINEL],
Expand Down Expand Up @@ -338,7 +357,7 @@ const run_batch = async(
api_key,
`${BATCH_TRIGGER_ENDPOINT}?${query}`,
[build_run_request(url)],
{timing: opts.timing}
{timing: opts.timing, hints: SCRAPER_BODY_HINTS}
);
trigger_spinner.stop();
if (!trigger.collection_id)
Expand Down Expand Up @@ -474,7 +493,7 @@ const handle_run_scraper = async(
api_key,
`${TRIGGER_IMMEDIATE_ENDPOINT}?${trigger_query}`,
build_run_request(url),
{timing: opts.timing}
{timing: opts.timing, hints: SCRAPER_BODY_HINTS}
);
trigger_spinner.stop();
if (!trigger.response_id)
Expand Down Expand Up @@ -592,4 +611,5 @@ export {
parse_sync_timeout,
is_realtime_page_limit_error,
classify_dataset,
SCRAPER_BODY_HINTS,
};
35 changes: 30 additions & 5 deletions src/utils/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,33 @@ const ERROR_HINTS: Record<number, string> = {
429: 'Rate limit exceeded. Wait a moment and try again.',
};

// Commands pass body-pattern overrides via Request_opts.hints; the
// client stays generic, consulting them before the ERROR_HINTS map.
type Body_hint = {pattern: RegExp; hint: string};

const pick_hint = (
status: number,
body: string,
extra_hints?: Body_hint[]
): string|undefined=>{
if (extra_hints)
{
for (const {pattern, hint} of extra_hints)
{
if (pattern.test(body))
return hint;
}
}
return ERROR_HINTS[status];
};

type Request_opts = {
method?: string;
body?: unknown;
headers?: Record<string, string>;
timing?: boolean;
raw_buffer?: boolean;
hints?: Body_hint[];
};

type Api_error = {
Expand All @@ -27,10 +48,14 @@ type Api_error = {

const sleep = (ms: number)=>new Promise(resolve=>setTimeout(resolve, ms));

const format_error = (status: number, detail: string): Api_error=>({
const format_error = (
status: number,
detail: string,
extra_hints?: Body_hint[]
): Api_error=>({
status,
message: detail,
hint: ERROR_HINTS[status],
hint: pick_hint(status, detail, extra_hints),
});

const request = async<T = unknown>(
Expand Down Expand Up @@ -93,7 +118,7 @@ const request = async<T = unknown>(
if (err_body)
detail = err_body;
} catch(_e) {}
const api_err = format_error(res.status, detail);
const api_err = format_error(res.status, detail, opts.hints);
const msg = [
`Error: ${api_err.message}`,
` Status: ${api_err.status}`,
Expand Down Expand Up @@ -133,5 +158,5 @@ const get = <T = unknown>(
opts: Omit<Request_opts, 'method'> = {}
): Promise<T>=>request<T>(api_key, endpoint, {method: 'GET', ...opts});

export {request, post, get};
export type {Request_opts, Api_error};
export {request, post, get, pick_hint, ERROR_HINTS};
export type {Request_opts, Api_error, Body_hint};