diff --git a/src/__tests__/commands/scraper.test.ts b/src/__tests__/commands/scraper.test.ts index 39e7e0c..8fb4e12 100644 --- a/src/__tests__/commands/scraper.test.ts +++ b/src/__tests__/commands/scraper.test.ts @@ -58,6 +58,7 @@ import { parse_sync_timeout, is_realtime_page_limit_error, classify_dataset, + SCRAPER_BODY_HINTS, } from '../../commands/scraper'; describe('commands/scraper', ()=>{ @@ -68,6 +69,80 @@ describe('commands/scraper', ()=>{ mocks.start.mockReturnValue({stop: mocks.stop}); }); + // Scraper hints live with this command, not the shared client, and + // must travel to the HTTP layer on every API call the command makes. + describe('SCRAPER_BODY_HINTS', ()=>{ + it('contains a hint for the stub-collector 403 response body', + ()=>{ + const stub_hint = SCRAPER_BODY_HINTS.find(h=> + h.pattern.test('Collector does not have a template')); + expect(stub_hint).toBeDefined(); + expect(stub_hint!.hint).toMatch(/AI generation has not completed/); + expect(stub_hint!.hint).toMatch(/bdata scraper create/); + // Must NOT echo the misleading generic 403 hint. + expect(stub_hint!.hint).not.toMatch(/zone permissions/); + }); + + it('contains a hint for the parallel-job cap 429 response body', + ()=>{ + const cap_hint = SCRAPER_BODY_HINTS.find(h=> + h.pattern.test('Cannot run more than 3 jobs in parallel')); + expect(cap_hint).toBeDefined(); + expect(cap_hint!.hint).toMatch(/concurrent-job cap/); + expect(cap_hint!.hint).toMatch(/serialise/); + }); + + it('cap pattern is case-insensitive and number-agnostic ' + +'(future-proof if the cap changes)', ()=>{ + const cap_pattern = SCRAPER_BODY_HINTS[1].pattern; + expect(cap_pattern.test('cannot run more than 5 jobs ' + +'in parallel')).toBe(true); + expect(cap_pattern.test('CANNOT RUN MORE THAN 100 JOBS ' + +'IN PARALLEL')).toBe(true); + }); + + it('is passed via `hints` opt to client.post on every ' + +'AI-Flow call in handle_create_scraper', async()=>{ + mocks.post + .mockResolvedValueOnce({id: 'c_abc', name: 'cli-scraper-1'}) + .mockResolvedValueOnce({id: 'ia_xyz', queued: false}); + mocks.poll_until.mockResolvedValue({ + result: {status: 'done', completed_steps: []}, + attempts: 1, + }); + await handle_create_scraper( + 'https://example.com/p/1', + 'extract title', + {} + ); + for (const call of mocks.post.mock.calls) + { + const opts = call[3] as {hints?: unknown}; + expect(opts).toBeDefined(); + expect(opts.hints).toBe(SCRAPER_BODY_HINTS); + } + }); + + it('is passed via `hints` opt to client.post on the ' + +'trigger_immediate call in handle_run_scraper', async()=>{ + mocks.post.mockResolvedValueOnce({response_id: 'r_xyz'}); + const fetch_spy = vi.spyOn(global, 'fetch') + .mockImplementation(()=>Promise.resolve({ + status: 200, + text: ()=>Promise.resolve('{"title":"ok"}'), + } as unknown as Response)); + mocks.poll_until.mockImplementation(async(o: never)=>{ + const cfg = o as {fetch_once: ()=>Promise}; + const r = await cfg.fetch_once(); + return {result: r, attempts: 1}; + }); + await handle_run_scraper('c_abc', 'https://x.com/p/1', {}); + const opts = mocks.post.mock.calls[0][3] as {hints?: unknown}; + expect(opts.hints).toBe(SCRAPER_BODY_HINTS); + fetch_spy.mockRestore(); + }); + }); + describe('build_template_request', ()=>{ it('uses auto-generated name and stub webhook by default', ()=>{ const before = Math.floor(Date.now()/1000); @@ -176,7 +251,7 @@ describe('commands/scraper', ()=>{ expect.objectContaining({ deliver: expect.objectContaining({type: 'webhook'}), }), - {timing: undefined} + expect.objectContaining({timing: undefined}) ); expect(mocks.post).toHaveBeenNthCalledWith( 2, @@ -184,7 +259,7 @@ describe('commands/scraper', ()=>{ '/dca/collectors/c_abc/automate_template', {description: 'extract title', urls: ['https://example.com/p/1']}, - {timing: undefined} + expect.objectContaining({timing: undefined}) ); expect(mocks.poll_until).toHaveBeenCalledTimes(1); expect(mocks.poll_until).toHaveBeenCalledWith( @@ -404,7 +479,7 @@ describe('commands/scraper', ()=>{ expect.stringMatching( /\/dca\/trigger_immediate\?collector=c_abc/), {url: 'https://x.com/p/1'}, - {timing: undefined} + expect.objectContaining({timing: undefined}) ); expect(mocks.print).toHaveBeenCalledWith( {title: 'hello'}, diff --git a/src/__tests__/utils/client.test.ts b/src/__tests__/utils/client.test.ts new file mode 100644 index 0000000..1325370 --- /dev/null +++ b/src/__tests__/utils/client.test.ts @@ -0,0 +1,77 @@ +import {describe, it, expect} from 'vitest'; +import {pick_hint, ERROR_HINTS, type Body_hint} from '../../utils/client'; + +// Generic mock patterns only — the client ships no command-specific +// patterns of its own, so the pick_hint contract stays clear. +const MOCK_HINTS: Body_hint[] = [ + {pattern: /quota.*exceeded/i, + hint: 'Your monthly quota is exhausted. Top up in the dashboard.'}, + {pattern: /not allowed in region/i, + hint: 'This API is geo-restricted. Use --country to override.'}, +]; + +describe('utils/client.pick_hint', ()=>{ + it('returns the status-based hint when no extra hints are passed', ()=>{ + expect(pick_hint(401, 'anything')).toBe(ERROR_HINTS[401]); + expect(pick_hint(403, 'anything')).toBe(ERROR_HINTS[403]); + expect(pick_hint(404, 'anything')).toBe(ERROR_HINTS[404]); + expect(pick_hint(429, 'anything')).toBe(ERROR_HINTS[429]); + }); + + it('returns the status-based hint when extra hints are passed ' + +'but nothing matches the body', ()=>{ + expect(pick_hint(403, 'plain unauthorized message', MOCK_HINTS)) + .toBe(ERROR_HINTS[403]); + }); + + it('overrides with the first matching extra hint', ()=>{ + expect(pick_hint(429, 'monthly quota exceeded', MOCK_HINTS)) + .toMatch(/Top up in the dashboard/); + expect(pick_hint(403, 'not allowed in region: CN', MOCK_HINTS)) + .toMatch(/geo-restricted/); + }); + + it('is case-insensitive on the caller\'s pattern', ()=>{ + expect(pick_hint(429, 'MONTHLY QUOTA EXCEEDED', MOCK_HINTS)) + .toMatch(/Top up in the dashboard/); + }); + + it('respects first-match-wins order in the caller\'s list', ()=>{ + const ordered: Body_hint[] = [ + {pattern: /generic/, hint: 'first wins'}, + {pattern: /generic/, hint: 'second loses'}, + ]; + expect(pick_hint(500, 'generic error', ordered)) + .toBe('first wins'); + }); + + it('returns undefined for unknown statuses with no body match', ()=>{ + expect(pick_hint(418, 'teapot')).toBeUndefined(); + expect(pick_hint(599, 'mystery')).toBeUndefined(); + }); + + it('does not consult ERROR_HINTS when an extra-hint pattern ' + +'matches', ()=>{ + // The override should win even when the generic status hint + // would also be applicable. + const matches_403: Body_hint[] = [ + {pattern: /custom 403 case/i, + hint: 'override hint, not the zone-permissions one'}, + ]; + expect(pick_hint(403, 'this is a custom 403 case', matches_403)) + .toBe('override hint, not the zone-permissions one'); + }); + + it('does not leak any scraper-studio vocabulary in the default ' + +'hints', ()=>{ + // The shared client must stay generic. Catches accidental + // regressions where someone adds a domain-specific hint here. + for (const hint of Object.values(ERROR_HINTS)) + { + expect(hint).not.toMatch(/scraper create/i); + expect(hint).not.toMatch(/collector/i); + expect(hint).not.toMatch(/AI[- ]Flow/i); + expect(hint).not.toMatch(/automate_template/i); + } + }); +}); diff --git a/src/commands/scraper.ts b/src/commands/scraper.ts index 9ab4400..6416b83 100644 --- a/src/commands/scraper.ts +++ b/src/commands/scraper.ts @@ -1,5 +1,5 @@ import {Command} from 'commander'; -import {post, get} from '../utils/client'; +import {post, get, type Body_hint} from '../utils/client'; import {load as load_config} from '../utils/config'; import {ensure_authenticated} from '../utils/auth'; import {start as start_spinner} from '../utils/spinner'; @@ -18,6 +18,25 @@ import type { Batch_trigger_response, } from '../types/scraper'; +// Scraper-studio body-pattern hints. Kept here, not in client.ts, so +// the DCA vocabulary doesn't leak into other commands. First match wins. +const SCRAPER_BODY_HINTS: Body_hint[] = [ + { + pattern: /collector does not have a template/i, + hint: 'AI generation has not completed for this collector. ' + +'Re-run `bdata scraper create` (the previous attempt may ' + +'have hit the AI-Flow parallel-job cap), or open ' + +'https://brightdata.com/cp/scrapers to inspect / finish ' + +'the half-built collector in the web UI.', + }, + { + pattern: /cannot run more than \d+ jobs in parallel/i, + hint: 'You hit the AI-Flow concurrent-job cap. Wait for an ' + +'in-flight `scraper create` to finish, or serialise ' + +'your launches.', + }, +]; + const CREATE_TEMPLATE_ENDPOINT = '/dca/collector'; const AI_TRIGGER_PATH = 'automate_template'; const AI_PROGRESS_PATH = 'automate_template/progress'; @@ -105,7 +124,7 @@ const handle_create_scraper = async( api_key, CREATE_TEMPLATE_ENDPOINT, template_body, - {timing: opts.timing} + {timing: opts.timing, hints: SCRAPER_BODY_HINTS} ); create_spinner.stop(); if (!template.id) @@ -129,7 +148,7 @@ const handle_create_scraper = async( api_key, `/dca/collectors/${collector_id}/${AI_TRIGGER_PATH}`, build_ai_request(url, description), - {timing: opts.timing} + {timing: opts.timing, hints: SCRAPER_BODY_HINTS} ); trigger_spinner.stop(); } catch(e) { @@ -148,7 +167,7 @@ const handle_create_scraper = async( fetch_once: ()=>get( api_key, `/dca/collectors/${collector_id}/${AI_PROGRESS_PATH}`, - {timing: opts.timing} + {timing: opts.timing, hints: SCRAPER_BODY_HINTS} ), get_status: extract_progress_status, running_statuses: [RUNNING_SENTINEL], @@ -338,7 +357,7 @@ const run_batch = async( api_key, `${BATCH_TRIGGER_ENDPOINT}?${query}`, [build_run_request(url)], - {timing: opts.timing} + {timing: opts.timing, hints: SCRAPER_BODY_HINTS} ); trigger_spinner.stop(); if (!trigger.collection_id) @@ -474,7 +493,7 @@ const handle_run_scraper = async( api_key, `${TRIGGER_IMMEDIATE_ENDPOINT}?${trigger_query}`, build_run_request(url), - {timing: opts.timing} + {timing: opts.timing, hints: SCRAPER_BODY_HINTS} ); trigger_spinner.stop(); if (!trigger.response_id) @@ -592,4 +611,5 @@ export { parse_sync_timeout, is_realtime_page_limit_error, classify_dataset, + SCRAPER_BODY_HINTS, }; diff --git a/src/utils/client.ts b/src/utils/client.ts index 6930232..2e119a9 100644 --- a/src/utils/client.ts +++ b/src/utils/client.ts @@ -11,12 +11,33 @@ const ERROR_HINTS: Record = { 429: 'Rate limit exceeded. Wait a moment and try again.', }; +// Commands pass body-pattern overrides via Request_opts.hints; the +// client stays generic, consulting them before the ERROR_HINTS map. +type Body_hint = {pattern: RegExp; hint: string}; + +const pick_hint = ( + status: number, + body: string, + extra_hints?: Body_hint[] +): string|undefined=>{ + if (extra_hints) + { + for (const {pattern, hint} of extra_hints) + { + if (pattern.test(body)) + return hint; + } + } + return ERROR_HINTS[status]; +}; + type Request_opts = { method?: string; body?: unknown; headers?: Record; timing?: boolean; raw_buffer?: boolean; + hints?: Body_hint[]; }; type Api_error = { @@ -27,10 +48,14 @@ type Api_error = { const sleep = (ms: number)=>new Promise(resolve=>setTimeout(resolve, ms)); -const format_error = (status: number, detail: string): Api_error=>({ +const format_error = ( + status: number, + detail: string, + extra_hints?: Body_hint[] +): Api_error=>({ status, message: detail, - hint: ERROR_HINTS[status], + hint: pick_hint(status, detail, extra_hints), }); const request = async( @@ -93,7 +118,7 @@ const request = async( if (err_body) detail = err_body; } catch(_e) {} - const api_err = format_error(res.status, detail); + const api_err = format_error(res.status, detail, opts.hints); const msg = [ `Error: ${api_err.message}`, ` Status: ${api_err.status}`, @@ -133,5 +158,5 @@ const get = ( opts: Omit = {} ): Promise=>request(api_key, endpoint, {method: 'GET', ...opts}); -export {request, post, get}; -export type {Request_opts, Api_error}; +export {request, post, get, pick_hint, ERROR_HINTS}; +export type {Request_opts, Api_error, Body_hint};