diff --git a/src/analyst/analyst.test.ts b/src/analyst/analyst.test.ts index ff71a24b..e0da4a0f 100644 --- a/src/analyst/analyst.test.ts +++ b/src/analyst/analyst.test.ts @@ -754,7 +754,15 @@ describe('ChatClient signal racing', () => { // documents the limit: races live in wrapLlmClient, mock passes // through. Either resolves (slow path) or rejects (when bound). // We just assert it eventually completes without hanging. - await Promise.race([p.catch(() => undefined), new Promise((r) => setTimeout(r, 200))]) - expect(true).toBe(true) + const settled = await Promise.race([ + p.then( + () => 'settled', + () => 'settled', + ), + new Promise((r) => setTimeout(() => r('hung'), 200)), + ]) + // The mock transport passes the signal through; the real contract here is + // only that the call SETTLES (resolves or rejects) and never hangs. + expect(settled).toBe('settled') }) }) diff --git a/src/analyst/registry.budget.test.ts b/src/analyst/registry.budget.test.ts new file mode 100644 index 00000000..22ce9916 --- /dev/null +++ b/src/analyst/registry.budget.test.ts @@ -0,0 +1,33 @@ +import { describe, expect, it } from 'vitest' +import { AnalystRegistry } from './registry' +import type { Analyst, AnalystContext, AnalystFinding } from './types' + +function recorder(id: string, inputKind: Analyst['inputKind'], sink: { budget?: number }): Analyst { + return { + id, + description: id, + inputKind, + cost: { kind: 'deterministic' }, + version: '1.0.0', + async analyze(_input: unknown, ctx: AnalystContext): Promise { + sink.budget = ctx.budgetUsd + return [] + }, + } +} + +describe('AnalystRegistry budget allocation', () => { + it('splits budget across analysts that actually run, not ones skipped for missing input', async () => { + const sink: { budget?: number } = {} + const reg = new AnalystRegistry() + reg.register(recorder('runs', 'trace-store', sink)) + // artifact-dir input is NOT supplied → this analyst is skipped, so it must + // not dilute the budget of the one that runs. + reg.register(recorder('skipped', 'artifact-dir', {})) + + await reg.run('t', { traceStore: {} as never }, { budget: { totalUsd: 10 } }) + + // runnableCount = 1 → full budget. The pre-fix code divided by 2 → 5. + expect(sink.budget).toBe(10) + }) +}) diff --git a/src/analyst/registry.ts b/src/analyst/registry.ts index c26c65dd..9f8f7614 100644 --- a/src/analyst/registry.ts +++ b/src/analyst/registry.ts @@ -196,6 +196,13 @@ export class AnalystRegistry { let totalCost = 0 let remainingUsd = budget?.totalUsd + // Budget is split only across analysts that actually run. Analysts skipped + // for missing input never spend, so counting them would under-budget the + // ones that do. routeInput is pure, so the pre-count is safe. + const runnableCount = selected.filter( + (a) => this.routeInput(a, inputs).kind !== 'missing', + ).length + for (const analyst of selected) { const t0 = Date.now() const input = this.routeInput(analyst, inputs) @@ -218,7 +225,7 @@ export class AnalystRegistry { const perBudget = allocateBudget(budget, { analyst, remainingUsd, - runningCount: selected.length, + runningCount: runnableCount, }) const ctx: AnalystContext = { diff --git a/src/client.test.ts b/src/client.test.ts new file mode 100644 index 00000000..0bcf8d6a --- /dev/null +++ b/src/client.test.ts @@ -0,0 +1,124 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest' +import { ProductClient } from './client' + +function mkOk(body: unknown): Response { + return new Response(JSON.stringify(body), { + status: 200, + headers: { 'content-type': 'application/json' }, + }) +} + +function mkErr(status: number, body: string): Response { + return new Response(body, { status }) +} + +const ROUTES = { + tasks: '/api/tasks', + events: '/api/events', + approvals: '/api/approvals', + vault: '/api/vault', + generations: '/api/generations', +} + +function newClient() { + return new ProductClient({ baseUrl: 'https://app.test', routes: ROUTES }) +} + +describe('ProductClient — fail-loud on non-ok responses', () => { + const realFetch = globalThis.fetch + afterEach(() => { + globalThis.fetch = realFetch + }) + + it('getTasks throws on a 500 instead of masking it as zero results', async () => { + // OLD behavior: res.json() was called without checking res.ok, then the + // error body's missing `tasks` key was masked by `?? []`, so a 500 with an + // error JSON body returned [] — indistinguishable from a healthy empty set. + globalThis.fetch = vi.fn(async () => + mkErr(500, JSON.stringify({ error: 'internal server error' })), + ) as unknown as typeof fetch + + await expect(newClient().getTasks('ws-1')).rejects.toThrow(/HTTP 500/) + }) + + it('getApprovals throws on a 401 (auth) rather than returning [])', async () => { + globalThis.fetch = vi.fn(async () => + mkErr(401, JSON.stringify({ error: 'unauthorized' })), + ) as unknown as typeof fetch + + await expect(newClient().getApprovals('ws-1')).rejects.toThrow(/HTTP 401/) + }) + + it('getEvents fails loud when a 200 body omits the required array field', async () => { + // A wrong route or a contract drift can return 200 `{}`. The old `?? []` + // turned that into "zero events"; now it must surface as a defect. + globalThis.fetch = vi.fn(async () => mkOk({ unexpected: true })) as unknown as typeof fetch + + await expect(newClient().getEvents('ws-1')).rejects.toThrow(/missing array field "events"/) + }) + + it('getVaultTree fails loud when a 200 body omits the tree array', async () => { + globalThis.fetch = vi.fn(async () => mkOk({})) as unknown as typeof fetch + await expect(newClient().getVaultTree('ws-1')).rejects.toThrow(/missing array field "tree"/) + }) + + it('returns the parsed array on a healthy 200', async () => { + globalThis.fetch = vi.fn(async () => + mkOk({ tasks: [{ id: 't1', title: 'a', status: 'open', priority: 'high' }] }), + ) as unknown as typeof fetch + + const tasks = await newClient().getTasks('ws-1') + expect(tasks).toHaveLength(1) + expect(tasks[0]!.id).toBe('t1') + }) + + it('a genuinely empty result set is still allowed (empty array, not masked error)', async () => { + globalThis.fetch = vi.fn(async () => mkOk({ tasks: [] })) as unknown as typeof fetch + const tasks = await newClient().getTasks('ws-1') + expect(tasks).toEqual([]) + }) + + it('generic post throws with status + body on a 4xx', async () => { + globalThis.fetch = vi.fn(async () => + mkErr(422, 'validation failed: name required'), + ) as unknown as typeof fetch + + await expect(newClient().post('/api/x', { a: 1 })).rejects.toThrow( + /HTTP 422.*validation failed/, + ) + }) +}) + +describe('ProductClient — request timeout', () => { + const realFetch = globalThis.fetch + beforeEach(() => { + vi.useFakeTimers() + }) + afterEach(() => { + vi.useRealTimers() + globalThis.fetch = realFetch + }) + + it('aborts a hung request after the configured timeout', async () => { + // fetch resolves only if its signal aborts (mirrors how undici/fetch + // rejects an aborted request). A never-resolving server must not hang the + // harness forever. + globalThis.fetch = ((_url: string, init: RequestInit) => + new Promise((_resolve, reject) => { + const signal = init.signal + if (signal) { + signal.addEventListener('abort', () => { + const err = new Error('The operation was aborted') + err.name = 'AbortError' + reject(err) + }) + } + })) as unknown as typeof fetch + + const client = new ProductClient({ baseUrl: 'https://app.test', routes: ROUTES, timeoutMs: 50 }) + const p = client.get('/api/slow') + const assertion = expect(p).rejects.toThrow(/aborted/i) + await vi.advanceTimersByTimeAsync(60) + await assertion + }) +}) diff --git a/src/client.ts b/src/client.ts index fd85303d..f01007bc 100644 --- a/src/client.ts +++ b/src/client.ts @@ -5,14 +5,18 @@ import type { CheckResult, ProductClientConfig, RouteMap, TestResult } from './t * * Routes are config, not hardcoded. Each agent provides its own RouteMap. */ +const DEFAULT_REQUEST_TIMEOUT_MS = 30_000 + export class ProductClient { private baseUrl: string private routes: RouteMap private cookies: string = '' + private timeoutMs: number constructor(config: ProductClientConfig) { this.baseUrl = config.baseUrl.replace(/\/+$/, '') this.routes = config.routes + this.timeoutMs = config.timeoutMs ?? DEFAULT_REQUEST_TIMEOUT_MS } private route(name: keyof RouteMap): string { @@ -21,6 +25,57 @@ export class ProductClient { return path } + /** + * Single HTTP boundary for every JSON request. Aborts after `timeoutMs`, + * checks `res.ok` BEFORE parsing, and throws `${method} ${path} failed: + * HTTP ${status} — ${body}` on a non-ok response so a 4xx/5xx error body + * can never be parsed as a success shape. Callers inspect the resolved + * value only after a successful return. + */ + private async request( + method: string, + path: string, + body?: Record, + ): Promise> { + const controller = new AbortController() + const timeout = setTimeout(() => controller.abort(), this.timeoutMs) + let res: Response + try { + res = await fetch(`${this.baseUrl}${path}`, { + method, + headers: { + ...(body ? { 'Content-Type': 'application/json' } : {}), + Origin: this.baseUrl, + Cookie: this.cookies, + }, + ...(body ? { body: JSON.stringify(body) } : {}), + signal: controller.signal, + }) + } finally { + clearTimeout(timeout) + } + if (!res.ok) { + const text = await res.text().catch(() => '') + throw new Error(`${method} ${path} failed: HTTP ${res.status} — ${text.slice(0, 500)}`) + } + return res.json() as Promise> + } + + /** + * Read a required collection field. A missing/non-array field is a contract + * violation (wrong route, drift, partial body) — surface it loud instead of + * masking it as a healthy empty set. A genuinely empty `[]` passes through. + */ + private requireArray(res: Record, field: string): T[] { + const value = res[field] + if (!Array.isArray(value)) { + throw new Error( + `Response missing array field "${field}" (got ${typeof value}: ${JSON.stringify(value)?.slice(0, 200)})`, + ) + } + return value as T[] + } + async signup(name: string, email: string, password: string): Promise<{ userId: string }> { const res = await this.post(this.route('signup'), { name, email, password }) const user = res.user as Record | undefined @@ -121,19 +176,25 @@ export class ProductClient { workspaceId: string, ): Promise<{ id: string; title: string; status: string; priority: string }[]> { const res = await this.get(`${this.route('tasks')}?workspaceId=${workspaceId}`) - return (res.tasks ?? []) as { id: string; title: string; status: string; priority: string }[] + return this.requireArray<{ id: string; title: string; status: string; priority: string }>( + res, + 'tasks', + ) } async getEvents(workspaceId: string): Promise<{ id: string; title: string; type: string }[]> { const res = await this.get(`${this.route('events')}?workspaceId=${workspaceId}`) - return (res.events ?? []) as { id: string; title: string; type: string }[] + return this.requireArray<{ id: string; title: string; type: string }>(res, 'events') } async getApprovals( workspaceId: string, ): Promise<{ id: string; title: string; status: string; type: string }[]> { const res = await this.get(`${this.route('approvals')}?workspaceId=${workspaceId}`) - return (res.actions ?? []) as { id: string; title: string; status: string; type: string }[] + return this.requireArray<{ id: string; title: string; status: string; type: string }>( + res, + 'actions', + ) } async getVaultTree(workspaceId: string): Promise { @@ -146,7 +207,7 @@ export class ProductClient { if (node.children) extract(node.children) } } - extract((res.tree ?? []) as unknown[]) + extract(this.requireArray(res, 'tree')) return paths } @@ -162,43 +223,22 @@ export class ProductClient { workspaceId: string, ): Promise<{ id: string; type: string; prompt: string }[]> { const res = await this.get(`${this.route('generations')}?workspaceId=${workspaceId}`) - return (res.generations ?? []) as { id: string; type: string; prompt: string }[] + return this.requireArray<{ id: string; type: string; prompt: string }>(res, 'generations') } /** Generic GET for custom routes */ async get(path: string): Promise> { - const res = await fetch(`${this.baseUrl}${path}`, { - headers: { Cookie: this.cookies }, - }) - return res.json() as Promise> + return this.request('GET', path) } /** Generic POST for custom routes */ async post(path: string, body: Record): Promise> { - const res = await fetch(`${this.baseUrl}${path}`, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - Origin: this.baseUrl, - Cookie: this.cookies, - }, - body: JSON.stringify(body), - }) - return res.json() as Promise> + return this.request('POST', path, body) } /** Generic PATCH for custom routes */ async patch(path: string, body: Record): Promise> { - const res = await fetch(`${this.baseUrl}${path}`, { - method: 'PATCH', - headers: { - 'Content-Type': 'application/json', - Origin: this.baseUrl, - Cookie: this.cookies, - }, - body: JSON.stringify(body), - }) - return res.json() as Promise> + return this.request('PATCH', path, body) } } diff --git a/src/command-runner.test.ts b/src/command-runner.test.ts index 9310e0d9..2cd3bea9 100644 --- a/src/command-runner.test.ts +++ b/src/command-runner.test.ts @@ -1,4 +1,4 @@ -import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs' +import { chmodSync, mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs' import { tmpdir } from 'node:os' import { join } from 'node:path' import { describe, expect, it } from 'vitest' @@ -98,4 +98,66 @@ describe('localCommandRunner', () => { }) expect(r.stdout).toBe('1') }) + + it('hasBin resolves a real binary without relying on `which`', async () => { + // `node` is on PATH in any environment that runs this suite. The + // resolution must not depend on a `which`/`command -v` subprocess + // (absent on Windows / minimal containers). + expect(await localCommandRunner.hasBin('node')).toBe(true) + }) + + it('hasBin resolves an absolute executable path directly', async () => { + expect(await localCommandRunner.hasBin(process.execPath)).toBe(true) + }) + + it('readFile surfaces a non-ENOENT error instead of collapsing it to null', async () => { + const wd = mkdtempSync(join(tmpdir(), 'cmd-runner-errno-')) + try { + // Reading a directory as a file yields EISDIR, not ENOENT. The old + // bare `catch { return null }` reported this real IO error as a + // missing file; the fix must surface (throw) it. + await expect(localCommandRunner.readFile(wd)).rejects.toThrow() + } finally { + rmSync(wd, { recursive: true, force: true }) + } + }) + + it('readFile returns null only for genuinely-absent (ENOENT) files', async () => { + expect( + await localCommandRunner.readFile('/tmp/cmd-runner-definitely-absent-xyz-123'), + ).toBeNull() + }) + + it('readDir surfaces a non-ENOENT error instead of reporting an empty dir', async () => { + const wd = mkdtempSync(join(tmpdir(), 'cmd-runner-errno-dir-')) + const file = join(wd, 'not-a-dir.txt') + try { + writeFileSync(file, 'x') + // readdir on a regular file yields ENOTDIR, not ENOENT. The old + // bare `catch { return [] }` masked this as "empty directory." + await expect(localCommandRunner.readDir(file)).rejects.toThrow() + } finally { + rmSync(wd, { recursive: true, force: true }) + } + }) + + it('readDir still returns [] for a genuinely-absent (ENOENT) directory', async () => { + expect(await localCommandRunner.readDir('/tmp/cmd-runner-no-such-dir-xyz-123')).toEqual([]) + }) + + it.skipIf(process.platform === 'win32' || process.getuid?.() === 0)( + 'readFile surfaces EACCES (permission) rather than masking it as absent', + async () => { + const wd = mkdtempSync(join(tmpdir(), 'cmd-runner-eacces-')) + const file = join(wd, 'secret.txt') + try { + writeFileSync(file, 'classified') + chmodSync(file, 0o000) + await expect(localCommandRunner.readFile(file)).rejects.toThrow() + } finally { + chmodSync(file, 0o600) + rmSync(wd, { recursive: true, force: true }) + } + }, + ) }) diff --git a/src/command-runner.ts b/src/command-runner.ts index d35be29f..36c3cf7c 100644 --- a/src/command-runner.ts +++ b/src/command-runner.ts @@ -17,8 +17,15 @@ */ import { spawnSync } from 'node:child_process' -import { existsSync, readdirSync, readFileSync, statSync } from 'node:fs' -import { join } from 'node:path' +import { + accessSync, + existsSync, + constants as fsConstants, + readdirSync, + readFileSync, + statSync, +} from 'node:fs' +import { delimiter, join } from 'node:path' // ─── Types ────────────────────────────────────────────────────────────── @@ -110,8 +117,7 @@ export const localCommandRunner: CommandRunner = { } }, async hasBin(name: string): Promise { - const r = spawnSync('which', [name], { encoding: 'utf8', timeout: 2000 }) - return r.status === 0 && (r.stdout ?? '').trim().length > 0 + return resolveBinOnPath(name) !== null }, async fileExists(path: string): Promise { return existsSync(path) @@ -119,16 +125,25 @@ export const localCommandRunner: CommandRunner = { async readFile(path: string): Promise { try { return readFileSync(path, 'utf8') - } catch { - return null + } catch (err) { + // ENOENT is the only "legitimately absent" signal → null. Any other + // errno (EACCES, EISDIR, EIO, …) is a real failure that must NOT + // masquerade as "file not present" — surface it so callers don't + // treat a permission/IO error as a missing artifact. + if (isErrnoCode(err, 'ENOENT')) return null + throw err } }, async readDir(path: string): Promise { let entries: string[] try { entries = readdirSync(path) - } catch { - return [] + } catch (err) { + // Same ENOENT-vs-real-error split as readFile: a missing directory + // is legitimately empty (→ []); EACCES/ENOTDIR/EIO must surface so + // an unreadable directory isn't silently reported as having no files. + if (isErrnoCode(err, 'ENOENT')) return [] + throw err } const out: DirEntry[] = [] for (const name of entries) { @@ -140,10 +155,63 @@ export const localCommandRunner: CommandRunner = { isFile: st.isFile(), sizeBytes: st.isFile() ? st.size : null, }) - } catch { - // skip unreadable + } catch (err) { + // An entry that vanished between readdir and stat (TOCTOU) is a + // benign race → skip. Any other errno surfaces. + if (isErrnoCode(err, 'ENOENT')) continue + throw err } } return out }, } + +function isErrnoCode(err: unknown, code: NodeJS.ErrnoException['code']): boolean { + return typeof err === 'object' && err !== null && (err as NodeJS.ErrnoException).code === code +} + +/** + * Resolve `name` to an executable absolute path using the runner's PATH, + * or `null` if not found. Cross-platform: does not depend on `which` + * (absent on Windows) or a `command -v` shell (absent on minimal + * containers). On Windows, candidate extensions come from PATHEXT. + * + * Throws only on a non-ENOENT/EACCES filesystem error while probing a + * candidate — those indicate a real IO fault, not "binary absent." + */ +function resolveBinOnPath(name: string): string | null { + // An explicit path component means PATH lookup doesn't apply: probe directly. + if (name.includes('/') || name.includes('\\')) { + return isExecutable(name) ? name : null + } + const pathEnv = process.env.PATH ?? '' + const dirs = pathEnv.split(delimiter).filter((d) => d.length > 0) + const exts = + process.platform === 'win32' + ? (process.env.PATHEXT ?? '.COM;.EXE;.BAT;.CMD').split(';').filter((e) => e.length > 0) + : [''] + for (const dir of dirs) { + for (const ext of exts) { + const candidate = join(dir, name + ext) + if (isExecutable(candidate)) return candidate + } + } + return null +} + +function isExecutable(path: string): boolean { + try { + // On win32 X_OK is a no-op; presence of the file (with a PATHEXT + // extension) is the executability signal there. + accessSync(path, process.platform === 'win32' ? fsConstants.F_OK : fsConstants.X_OK) + return true + } catch (err) { + // Not found / not executable / not permitted → "not a usable binary + // here," keep scanning. A different errno (EIO, ELOOP, …) is a real + // fault and must surface rather than silently yield a false negative. + if (isErrnoCode(err, 'ENOENT') || isErrnoCode(err, 'EACCES') || isErrnoCode(err, 'ENOTDIR')) { + return false + } + throw err + } +} diff --git a/src/detectors/edge.test.ts b/src/detectors/edge.test.ts new file mode 100644 index 00000000..4dbcde04 --- /dev/null +++ b/src/detectors/edge.test.ts @@ -0,0 +1,55 @@ +import { describe, expect, it } from 'vitest' +import { errorStreakDetector, noProgressDetector } from './index' + +describe('errorStreakDetector edge cases', () => { + it('is inert when status is undefined (does not advance or reset the streak)', () => { + const d = errorStreakDetector({ maxErrors: 2 }) + expect(d.observe({ status: 'error' })).toBeNull() // streak 1 + // a step with no status field must NOT touch the streak + expect(d.observe({})).toBeNull() + expect(d.streak).toBe(1) + // the next real error completes the streak across the inert step + expect(d.observe({ status: 'error' })).toMatchObject({ detector: 'error-streak', streak: 2 }) + }) + + it('maxErrors <= 0 disables signalling without crashing', () => { + const d = errorStreakDetector({ maxErrors: 0 }) + for (let i = 0; i < 5; i++) { + expect(d.observe({ status: 'error' })).toBeNull() + } + // streak still tracked for telemetry even though no signal fires + expect(d.streak).toBe(5) + }) + + it('negative maxErrors also disables without crashing', () => { + const d = errorStreakDetector({ maxErrors: -3 }) + expect(d.observe({ status: 'error' })).toBeNull() + expect(d.observe({ status: 'error' })).toBeNull() + expect(d.streak).toBe(2) + }) +}) + +describe('noProgressDetector edge cases', () => { + it('does not trip on step 1 without a primed baseline', () => { + // maxNoProgress 1 is the most aggressive setting; even so the FIRST observed + // step has no prior state to compare against, so stateUnchanged is false and + // the streak stays at 0 -> no signal. + const d = noProgressDetector({ maxNoProgress: 1 }) + expect(d.observe({ stateFingerprint: 's0', score: 0 })).toBeNull() + expect(d.streak).toBe(0) + // the SECOND identical step now has a baseline -> trips at streak 1 + expect(d.observe({ stateFingerprint: 's0', score: 0 })).toMatchObject({ + detector: 'no-progress', + streak: 1, + }) + }) + + it('maxNoProgress <= 0 disables signalling without crashing', () => { + const d = noProgressDetector({ maxNoProgress: 0 }) + d.observe({ stateFingerprint: 's0', score: 0 }) // baseline + for (let i = 0; i < 5; i++) { + expect(d.observe({ stateFingerprint: 's0', score: 0 })).toBeNull() + } + expect(d.streak).toBe(5) + }) +}) diff --git a/src/eval-campaign.test.ts b/src/eval-campaign.test.ts new file mode 100644 index 00000000..b9d49365 --- /dev/null +++ b/src/eval-campaign.test.ts @@ -0,0 +1,241 @@ +import { describe, expect, it } from 'vitest' +import type { CampaignRunContext, CampaignRunOutcome, EvalCampaignOptions } from './eval-campaign' +import { finalizeAbort, runEvalCampaign } from './eval-campaign' +import { assertLlmRoute, type LlmClientOptions } from './llm-client' +import { TraceEmitter } from './trace/emitter' +import { NoopRawProviderSink } from './trace/raw-provider-sink' +import { InMemoryTraceStore } from './trace/store' + +// A minimally-valid LLM config. routeRequirements is set to `{}` in every +// campaign below so assertLlmRoute does not gate on baseUrl/auth. +const LLM_OPTS: LlmClientOptions = { + baseUrl: 'https://api.example.test/v1', + apiKey: 'test-key', + provider: 'test', +} + +const TOKENS = { input: 1, output: 1 } + +function passingOutcome(model = 'm@1', promptHash = 'p1'): CampaignRunOutcome { + return { + pass: true, + score: 1, + costUsd: 0, + tokenUsage: TOKENS, + model, + promptHash, + configHash: 'c1', + } +} + +/** A runner that starts and ends the run so the run is captured + completed. */ +async function captureAndEnd(ctx: CampaignRunContext): Promise { + await ctx.emitter.startRun({ scenarioId: ctx.scenarioId, variantId: ctx.variantId }) + await ctx.emitter.endRun({ pass: true, score: 1 }) + return passingOutcome() +} + +function baseOpts( + overrides: Partial> = {}, +): EvalCampaignOptions { + return { + campaignId: 'c', + variants: [{ id: 'v0', payload: {} }], + scenarios: [{ scenarioId: 's0' }], + seeds: [0], + commitSha: 'sha', + llmOpts: LLM_OPTS, + routeRequirements: {}, + storeFactory: () => new InMemoryTraceStore(), + rawSinkFactory: () => new NoopRawProviderSink(), + integrity: { + llmSpansMin: 0, + requireRawCoverageOfLlmSpans: false, + rawProviderEventsMin: 0, + }, + runner: captureAndEnd, + ...overrides, + } +} + +describe('finalizeAbort', () => { + it('aborts a run still in `running` status', async () => { + const store = new InMemoryTraceStore() + const emitter = new TraceEmitter(store, { runId: 'r1', now: () => 1 }) + await emitter.startRun({ scenarioId: 's' }) + + await finalizeAbort(emitter, 'r1', 'pool aborted') + + const run = await store.getRun('r1') + expect(run?.status).toBe('aborted') + expect(run?.outcome).toEqual({ pass: false, notes: 'pool aborted' }) + }) + + // Regression: the guard. Before the `status !== 'running'` check was added, + // finalizeAbort unconditionally called abortRun and would overwrite a + // successfully-completed run — flipping status `completed`→`aborted` and the + // real outcome `{ pass: true }`→`{ pass: false, notes: reason }`. This is the + // exact corruption the post-settle cleanup could cause if the open-run + // bookkeeping invariant is ever broken in a refactor. + it('does NOT overwrite a run that is already completed', async () => { + const store = new InMemoryTraceStore() + const emitter = new TraceEmitter(store, { runId: 'r2', now: () => 1 }) + await emitter.startRun({ scenarioId: 's' }) + await emitter.endRun({ pass: true, score: 0.9, notes: 'real outcome' }) + + const before = await store.getRun('r2') + expect(before?.status).toBe('completed') + + await finalizeAbort(emitter, 'r2', 'late cleanup must not corrupt this') + + const after = await store.getRun('r2') + expect(after?.status).toBe('completed') + expect(after?.outcome).toEqual({ pass: true, score: 0.9, notes: 'real outcome' }) + }) + + it('does NOT overwrite a run that already failed', async () => { + const store = new InMemoryTraceStore() + const emitter = new TraceEmitter(store, { runId: 'r3', now: () => 1 }) + await emitter.startRun({ scenarioId: 's' }) + await emitter.endRun({ pass: false, score: 0, notes: 'genuine failure' }) + + await finalizeAbort(emitter, 'r3', 'should not touch a failed run') + + const after = await store.getRun('r3') + expect(after?.status).toBe('failed') + expect(after?.outcome).toEqual({ pass: false, score: 0, notes: 'genuine failure' }) + }) + + it('is a no-op (no throw) when the run was never started', async () => { + const store = new InMemoryTraceStore() + const emitter = new TraceEmitter(store, { runId: 'never', now: () => 1 }) + + await expect(finalizeAbort(emitter, 'never', 'nothing to abort')).resolves.toBeUndefined() + expect(await store.getRun('never')).toBeUndefined() + }) + + it('propagates a genuine store-read error rather than swallowing it', async () => { + class ReadFailingStore extends InMemoryTraceStore { + async getRun(): Promise { + throw new Error('disk fault on getRun') + } + } + const store = new ReadFailingStore() + const emitter = new TraceEmitter(store, { runId: 'r4', now: () => 1 }) + + await expect(finalizeAbort(emitter, 'r4', 'x')).rejects.toThrow('disk fault on getRun') + }) + + it('propagates a genuine abort-write error on a running run', async () => { + class WriteFailingStore extends InMemoryTraceStore { + async updateRun(): Promise { + throw new Error('disk full on updateRun') + } + } + const store = new WriteFailingStore() + const emitter = new TraceEmitter(store, { runId: 'r5', now: () => 1 }) + await emitter.startRun({ scenarioId: 's' }) + + await expect(finalizeAbort(emitter, 'r5', 'x')).rejects.toThrow('disk full on updateRun') + }) +}) + +describe('runEvalCampaign — genuine-error orphan handling', () => { + // A genuine (non-CellExecutionError) error from a worker — here a profile/model + // contradiction surfaced AFTER the runner already completed and finalized its + // run — must NOT reject the pool mid-flight and orphan peers. The campaign uses + // allSettled, finalizes every still-open run in the post-settle sweep, then + // re-throws the aggregated genuine error. + it('re-throws the genuine error and finalizes the failing run without corrupting it', async () => { + const stores = new Map() + const opts = baseOpts({ + variants: [{ id: 'v0', payload: {} }], + scenarios: [{ scenarioId: 's0' }], + seeds: [0], + storeFactory: (p) => { + const s = new InMemoryTraceStore() + stores.set(p.runId, s) + return s + }, + runner: async (ctx) => { + await ctx.emitter.startRun({ scenarioId: ctx.scenarioId }) + await ctx.emitter.endRun({ pass: true, score: 1 }) + // Force a genuine error AFTER the run is completed: the declared profile + // contradicts outcome.model. This is thrown from runOneCell, not the + // runner, so it is a genuine error, not a CellExecutionError. + return { + ...passingOutcome('actual-model@1'), + agentProfile: { + profileId: 'prof-1', + sourceProfile: { kind: 'test-profile', hash: 'a'.repeat(64) }, + model: 'different-model@1', + promptHash: 'p1', + }, + } + }, + }) + + await expect(runEvalCampaign(opts)).rejects.toThrow(/agentProfile\.model/) + + // The completed run must NOT have been flipped to aborted by post-settle + // cleanup — its terminal status is preserved. + expect(stores.size).toBe(1) + const [, store] = [...stores][0]! + const runs = await store.listRuns() + expect(runs).toHaveLength(1) + expect(runs[0]!.status).toBe('completed') + expect(runs[0]!.outcome).toEqual({ pass: true, score: 1 }) + }) + + it('aggregates multiple genuine errors into an AggregateError', async () => { + const opts = baseOpts({ + variants: [{ id: 'v0', payload: {} }], + scenarios: [{ scenarioId: 's0' }, { scenarioId: 's1' }], + seeds: [0], + concurrency: 2, + runner: async (ctx) => { + await ctx.emitter.startRun({ scenarioId: ctx.scenarioId }) + await ctx.emitter.endRun({ pass: true, score: 1 }) + return { + ...passingOutcome('actual@1'), + agentProfile: { + profileId: 'prof-1', + sourceProfile: { kind: 'test-profile', hash: 'a'.repeat(64) }, + model: 'mismatch@1', + promptHash: 'p1', + }, + } + }, + }) + + await expect(runEvalCampaign(opts)).rejects.toBeInstanceOf(AggregateError) + }) + + it('records a runner-throw as a failed run and still completes the campaign', async () => { + const opts = baseOpts({ + runner: async (ctx) => { + await ctx.emitter.startRun({ scenarioId: ctx.scenarioId }) + throw new Error('runner blew up') + }, + }) + + const result = await runEvalCampaign(opts) + expect(result.failedRuns).toHaveLength(1) + expect(result.failedRuns[0]!.reason).toBe('runner_threw') + expect(result.failedRuns[0]!.error).toBe('runner blew up') + expect(result.runs).toHaveLength(0) + }) + + it('happy path: a captured + completed run lands in result.runs', async () => { + const result = await runEvalCampaign(baseOpts()) + expect(result.runs).toHaveLength(1) + expect(result.failedRuns).toHaveLength(0) + expect(result.runs[0]!.outcome.holdoutScore).toBe(1) + }) +}) + +describe('preflight sanity', () => { + it('assertLlmRoute is exercised by the campaign (smoke)', () => { + expect(() => assertLlmRoute(LLM_OPTS, {})).not.toThrow() + }) +}) diff --git a/src/eval-campaign.ts b/src/eval-campaign.ts index c8052e39..6489cb4c 100644 --- a/src/eval-campaign.ts +++ b/src/eval-campaign.ts @@ -379,9 +379,23 @@ export async function runEvalCampaign( const failedRuns: FailedRun[] = [] // ── Execute (bounded-concurrency worker pool) ────────────────────── + // A genuine (non-CellExecutionError) error from any worker is a bug, not a + // run-level failure. We must NOT let `Promise.all` reject mid-flight and + // orphan the other workers' in-progress runs (emitters never finalized, + // sinks/handles leak, partial work silently discarded). Instead: capture the + // first genuine error, stop dispatching new cells so in-flight workers wind + // down, finalize every still-open run, then re-throw the aggregated error. let cursor = 0 + let aborting = false + const genuineErrors: unknown[] = [] + // Emitters for runs that are currently mid-flight, keyed by runId. A worker + // registers its emitter before invoking the runner and removes it once the + // run is finalized (via endRun/abortRun, success or failure). Anything left + // here after the pool settles is an orphan we must abort. + const openRuns = new Map() + async function worker(): Promise { - while (true) { + while (!aborting) { const i = cursor++ if (i >= cells.length) return const cell = cells[i]! @@ -395,8 +409,12 @@ export async function runEvalCampaign( if (err.integrity) integrityReports.push(err.integrity) } else { // Genuine bug — not a runner failure, not an integrity failure. - // Surface it; don't silently mask. - throw err + // Capture it and stop dispatching so peers wind down gracefully; + // re-thrown after the pool settles. Do not surface here — that would + // reject Promise.all and orphan the other workers' runs. + genuineErrors.push(err) + aborting = true + return } } } @@ -427,6 +445,9 @@ export async function runEvalCampaign( now: opts.now, onRunComplete: opts.onRunComplete, }) + // Track this run as open so a genuine error elsewhere in the pool can + // finalize it instead of orphaning it. Removed in the finally below. + openRuns.set(runId, emitter) const llmOpts: LlmClientOptions = { ...opts.llmOpts, @@ -449,101 +470,129 @@ export async function runEvalCampaign( llmOpts, } - const wallStart = now() - let outcome: CampaignRunOutcome try { - outcome = await opts.runner(ctx) - } catch (err) { - const message = err instanceof Error ? err.message : String(err) - // The runner threw mid-execution; give it a chance to have aborted. + const wallStart = now() + let outcome: CampaignRunOutcome try { - await emitter.abortRun(message) - } catch { - // Already aborted/ended; ignore. + outcome = await opts.runner(ctx) + } catch (err) { + const message = err instanceof Error ? err.message : String(err) + // The runner threw mid-execution. Abort the run so the emitter + // finalizes. The only benign abortRun failure is "the runner never + // started the run" (nothing to finalize). A store-write failure (disk + // full, FS error) is a genuine diagnostic — surface it rather than + // masking it as a plain runner failure. + await finalizeAbort(emitter, runId, message) + throw new CellExecutionError({ + runId, + variantId: cell.variant.id, + scenarioId: cell.scenario.scenarioId, + seed: cell.seed, + reason: 'runner_threw', + error: message, + }) + } + const wallMs = now() - wallStart + + const integrityReport = await assertRunCaptured(store, runId, { ...integrity, rawSink }) + if (!integrityReport.ok) { + switch (onIntegrityFailure) { + case 'throw': + throw new RunIntegrityError(integrityReport) + case 'mark_failed': + throw new CellExecutionError( + { + runId, + variantId: cell.variant.id, + scenarioId: cell.scenario.scenarioId, + seed: cell.seed, + reason: 'integrity_failed', + error: integrityReport.issues.map((i) => i.code).join(', '), + }, + integrityReport, + ) + case 'log': + // Caller wants the run admitted with a flagged report; fall through. + break + } } - throw new CellExecutionError({ + + const recordOutcome: RunOutcome = { + raw: outcome.raw ?? {}, + } + if (splitTag === 'holdout') recordOutcome.holdoutScore = outcome.score + else recordOutcome.searchScore = outcome.score + if (outcome.judgeScores !== undefined) recordOutcome.judgeScores = outcome.judgeScores + + const record: RunRecord = { runId, - variantId: cell.variant.id, - scenarioId: cell.scenario.scenarioId, + experimentId: opts.campaignId, + candidateId: cell.variant.id, seed: cell.seed, - reason: 'runner_threw', - error: message, - }) - } - const wallMs = now() - wallStart - - const integrityReport = await assertRunCaptured(store, runId, { ...integrity, rawSink }) - if (!integrityReport.ok) { - switch (onIntegrityFailure) { - case 'throw': - throw new RunIntegrityError(integrityReport) - case 'mark_failed': - throw new CellExecutionError( - { + model: outcome.model, + promptHash: outcome.promptHash, + configHash: outcome.configHash, + commitSha: opts.commitSha, + wallMs, + costUsd: outcome.costUsd, + tokenUsage: outcome.tokenUsage, + judgeMetadata: outcome.judgeMetadata, + outcome: recordOutcome, + ...(outcome.failureClass ? { failureClass: outcome.failureClass } : {}), + failureMode: outcome.failureMode, + splitTag, + scenarioId: cell.scenario.scenarioId, + } + const profileSource = + outcome.agentProfile ?? + (typeof opts.agentProfile === 'function' + ? await opts.agentProfile({ + campaignId: opts.campaignId, runId, variantId: cell.variant.id, scenarioId: cell.scenario.scenarioId, seed: cell.seed, - reason: 'integrity_failed', - error: integrityReport.issues.map((i) => i.code).join(', '), - }, - integrityReport, - ) - case 'log': - // Caller wants the run admitted with a flagged report; fall through. - break + variant: cell.variant.payload, + scenarioTags: cell.scenario.tags ?? {}, + }) + : opts.agentProfile) + if (profileSource !== undefined) { + const agentProfile = await resolveAgentProfileCell(profileSource) + assertAgentProfileMatchesRun(agentProfile, outcome.model, outcome.promptHash) + record.agentProfile = agentProfile } + return { record: validateRunRecord(record), integrity: integrityReport } + } finally { + // This run's worker has finished with it (success, run-level failure, or + // genuine error). It is no longer the pool's job to finalize — drop it so + // the post-settle sweep doesn't double-abort a finalized run. + openRuns.delete(runId) } - - const recordOutcome: RunOutcome = { - raw: outcome.raw ?? {}, - } - if (splitTag === 'holdout') recordOutcome.holdoutScore = outcome.score - else recordOutcome.searchScore = outcome.score - if (outcome.judgeScores !== undefined) recordOutcome.judgeScores = outcome.judgeScores - - const record: RunRecord = { - runId, - experimentId: opts.campaignId, - candidateId: cell.variant.id, - seed: cell.seed, - model: outcome.model, - promptHash: outcome.promptHash, - configHash: outcome.configHash, - commitSha: opts.commitSha, - wallMs, - costUsd: outcome.costUsd, - tokenUsage: outcome.tokenUsage, - judgeMetadata: outcome.judgeMetadata, - outcome: recordOutcome, - ...(outcome.failureClass ? { failureClass: outcome.failureClass } : {}), - failureMode: outcome.failureMode, - splitTag, - scenarioId: cell.scenario.scenarioId, - } - const profileSource = - outcome.agentProfile ?? - (typeof opts.agentProfile === 'function' - ? await opts.agentProfile({ - campaignId: opts.campaignId, - runId, - variantId: cell.variant.id, - scenarioId: cell.scenario.scenarioId, - seed: cell.seed, - variant: cell.variant.payload, - scenarioTags: cell.scenario.tags ?? {}, - }) - : opts.agentProfile) - if (profileSource !== undefined) { - const agentProfile = await resolveAgentProfileCell(profileSource) - assertAgentProfileMatchesRun(agentProfile, outcome.model, outcome.promptHash) - record.agentProfile = agentProfile - } - return { record: validateRunRecord(record), integrity: integrityReport } } const workers = Array.from({ length: Math.min(concurrency, cells.length) }, () => worker()) - await Promise.all(workers) + // allSettled (not all): a genuine error in one worker must not reject the + // pool mid-flight and orphan the others. Each worker captures its own + // genuine error into `genuineErrors` and returns; we re-throw below. + await Promise.allSettled(workers) + + // Finalize any run still open after the pool wound down. With the + // stop-dispatch flag these are the runs that were mid-flight in peer workers + // when the first genuine error fired — abort them so their emitters finalize + // (hooks fire, store records a terminal status) instead of leaking. + for (const [runId, emitter] of openRuns) { + await finalizeAbort(emitter, runId, 'campaign aborted: genuine error in a sibling run') + } + openRuns.clear() + + if (genuineErrors.length > 0) { + throw genuineErrors.length === 1 + ? genuineErrors[0] + : new AggregateError( + genuineErrors, + `runEvalCampaign: ${genuineErrors.length} runs failed with genuine (non-run-level) errors`, + ) + } // ── Optional research report ─────────────────────────────────────── let report: ResearchReport | undefined @@ -585,6 +634,32 @@ class CellExecutionError extends Error { } } +/** + * Abort a run whose owning work threw or was orphaned by a sibling's genuine + * error, finalizing the emitter so hooks fire and the store records a terminal + * status. Safe to call unconditionally: it only aborts runs that are still + * `running`. Two no-op cases are intentional and benign: + * + * - the run was never started (absent from the store) — nothing to finalize + * - the run is already terminal (`completed` / `failed` / `aborted`) — + * finalizing again would overwrite the real outcome (e.g. flip a passed run + * to `aborted` with `{ pass: false, notes: reason }`), so we leave it alone + * + * Any OTHER failure of the store read or the abort write (disk full, FS fault, + * backend down) is a genuine diagnostic and propagates rather than being + * swallowed. + */ +export async function finalizeAbort( + emitter: TraceEmitter, + runId: string, + reason: string, +): Promise { + const existing = await emitter.traceStore.getRun(runId) + if (existing === undefined) return // run never started; nothing to abort + if (existing.status !== 'running') return // already finalized; never overwrite a real outcome + await emitter.abortRun(reason) +} + function defaultRawSinkFactory(workDir: string | undefined) { return (params: CampaignFactoryParams): RawProviderSink => { if (!workDir) { diff --git a/src/executor.test.ts b/src/executor.test.ts new file mode 100644 index 00000000..6b6f0ebd --- /dev/null +++ b/src/executor.test.ts @@ -0,0 +1,179 @@ +/** + * Capture-integrity contract for the scenario executor. Two failure classes + * that a sloppy fallback erases must stay loud: + * 1. a malformed/hung chat response must NOT be recorded as a real empty turn + * (capture defect, indistinguishable from a model that said nothing); + * 2. a judge that errors must record WHY, retry only transient faults, and + * fail deterministic faults immediately. + */ + +import type { TCloud } from '@tangle-network/tcloud' +import { describe, expect, it, vi } from 'vitest' + +import { CaptureIntegrityError } from './errors' +import { type ExecutorConfig, executeScenario, type JudgeFailure } from './executor' +import { JudgeParseError } from './judges' +import type { JudgeFn, Scenario, ScenarioResult } from './types' + +function scenario(overrides: Partial = {}): Scenario { + return { + id: 'cap-1', + persona: 'analyst', + label: 'capture', + thesis: 'executor preserves capture integrity', + dimensions: ['accuracy'], + turns: [{ user: 'hello', expectedBehaviors: [] }], + artifactChecks: [], + ...overrides, + } +} + +/** A TCloud whose chat() returns whatever shape the test supplies. */ +function chatStub(resp: unknown): TCloud { + return { chat: vi.fn(async () => resp) } as unknown as TCloud +} + +/** No-op sleep so the retry policy runs without real backoff. */ +const noSleep = (_ms: number) => Promise.resolve() + +function config(overrides: Partial = {}): ExecutorConfig { + return { + systemPrompt: 'you are a test agent', + judges: [], + sleep: noSleep, + ...overrides, + } +} + +describe('executeScenario — malformed chat response is a loud capture defect', () => { + it('throws CaptureIntegrityError when choices[0].message is absent', async () => { + const tc = chatStub({ choices: [{}] }) + await expect(executeScenario(tc, scenario(), config())).rejects.toBeInstanceOf( + CaptureIntegrityError, + ) + }) + + it('throws when content is missing entirely (collapsed to "" under the old code)', async () => { + const tc = chatStub({ choices: [{ message: { role: 'assistant' } }] }) + await expect(executeScenario(tc, scenario(), config())).rejects.toBeInstanceOf( + CaptureIntegrityError, + ) + }) + + it('throws when content is a non-string (e.g. null) instead of recording an empty turn', async () => { + const tc = chatStub({ choices: [{ message: { content: null } }] }) + await expect(executeScenario(tc, scenario(), config())).rejects.toBeInstanceOf( + CaptureIntegrityError, + ) + }) + + it('PRESERVES a legitimately-empty string — a model that chose to say nothing is real signal', async () => { + const tc = chatStub({ choices: [{ message: { content: '' } }] }) + const result = await executeScenario(tc, scenario(), config()) + expect(result.turns).toHaveLength(1) + expect(result.turns[0]!.agentResponse).toBe('') + }) +}) + +describe('executeScenario — judge retry policy records the reason and gates on transience', () => { + const goodResp = { choices: [{ message: { content: 'hi there' } }] } + + it('does NOT retry a deterministic judge error — calls the judge exactly once', async () => { + const judge = vi.fn(async () => { + throw new Error('rubric validation failed') + }) as unknown as JudgeFn + const tc = chatStub(goodResp) + + const result = await executeScenario(tc, scenario(), config({ judges: [judge] })) + + expect((judge as unknown as ReturnType).mock.calls).toHaveLength(1) + expect(result.judgeErrors).toBe(1) + }) + + it('records WHY a judge failed in additive judgeFailures with the actual error message', async () => { + const judge = vi.fn(async () => { + throw new Error('rubric validation failed') + }) as unknown as JudgeFn + const tc = chatStub(goodResp) + + const result = (await executeScenario( + tc, + scenario(), + config({ judges: [judge] }), + )) as ScenarioResult & { + judgeFailures?: JudgeFailure[] + } + + expect(result.judgeFailures).toBeDefined() + expect(result.judgeFailures).toHaveLength(1) + expect(result.judgeFailures![0]!.reason).toContain('rubric validation failed') + expect(result.judgeFailures![0]!.attempts).toBe(1) + }) + + it('retries a transient judge error and succeeds on the next attempt', async () => { + let calls = 0 + const judge = vi.fn(async () => { + calls++ + if (calls === 1) { + const err = new Error('socket hang up') as Error & { status?: number } + err.status = 503 + throw err + } + return [{ judgeName: 'j', dimension: 'accuracy', score: 0.8, reasoning: 'ok' }] + }) as unknown as JudgeFn + const tc = chatStub(goodResp) + + const result = (await executeScenario( + tc, + scenario(), + config({ judges: [judge] }), + )) as ScenarioResult & { + judgeFailures?: JudgeFailure[] + } + + expect(calls).toBe(2) + expect(result.judgeErrors).toBe(0) + expect(result.judgeFailures).toBeUndefined() + expect(result.judgeScores).toHaveLength(1) + }) + + it('exhausts retries on a persistently-transient judge and reports exactly 3 attempts', async () => { + const judge = vi.fn(async () => { + const err = new Error('ECONNRESET') as Error & { status?: number } + err.status = 503 + throw err + }) as unknown as JudgeFn + const tc = chatStub(goodResp) + + const result = (await executeScenario( + tc, + scenario(), + config({ judges: [judge] }), + )) as ScenarioResult & { + judgeFailures?: JudgeFailure[] + } + + expect((judge as unknown as ReturnType).mock.calls).toHaveLength(3) + expect(result.judgeFailures![0]!.attempts).toBe(3) + expect(result.judgeErrors).toBe(1) + }) + + it('does not retry a JudgeParseError but still records it as a failed judge', async () => { + const judge = vi.fn(async () => { + throw new JudgeParseError('domain', 'not json at all') + }) as unknown as JudgeFn + const tc = chatStub(goodResp) + + const result = (await executeScenario( + tc, + scenario(), + config({ judges: [judge] }), + )) as ScenarioResult & { + judgeFailures?: JudgeFailure[] + } + + expect((judge as unknown as ReturnType).mock.calls).toHaveLength(1) + expect(result.judgeErrors).toBe(1) + expect(result.judgeFailures![0]!.reason).toContain('unparseable') + }) +}) diff --git a/src/executor.ts b/src/executor.ts index 73b76bae..e900d406 100644 --- a/src/executor.ts +++ b/src/executor.ts @@ -1,5 +1,7 @@ import type { TCloud } from '@tangle-network/tcloud' +import { CaptureIntegrityError } from './errors' import { JudgeParseError } from './judges' +import { isTransientLlmError } from './llm-client' import { normalizeScores, weightedMean } from './statistics' import type { CollectedArtifacts, @@ -31,6 +33,29 @@ export interface ExecutorConfig { check: Scenario['artifactChecks'][0], artifacts: CollectedArtifacts, ) => { passed: boolean; detail: string } | null + /** + * Sleep used between judge retries and after a judge succeeds. Defaults to + * real `setTimeout`. Injectable so tests exercise the retry policy without + * the real multi-second backoff. + */ + sleep?: (ms: number) => Promise +} + +function describeShape(content: unknown, message: unknown): string { + if (message === undefined || message === null) return 'no choices[0].message' + return `message.content of type ${content === null ? 'null' : typeof content}` +} + +function errMessage(err: unknown): string { + if (err instanceof Error) return `${err.name}: ${err.message}` + return String(err) +} + +/** Per-judge failure diagnostic, recorded additively on `ScenarioResult`. */ +export interface JudgeFailure { + judge: string + attempts: number + reason: string } /** @@ -72,9 +97,21 @@ export async function executeScenario( maxTokens: 3000, }) - const content = - (resp as { choices?: { message?: { content?: string } }[] }).choices?.[0]?.message?.content ?? - '' + const message = (resp as { choices?: { message?: { content?: unknown } }[] }).choices?.[0] + ?.message + const rawContent = message?.content + // A legitimately-empty turn ('' from a model that chose to say nothing) is + // real signal and must survive. A missing choices[0].message, or a content + // that is not a string, is a capture defect — the chat call hung or returned + // a malformed shape — and is indistinguishable from a real empty turn once + // collapsed to ''. Fail loud so it is never scored as a real response. + if (message === undefined || message === null || typeof rawContent !== 'string') { + throw new CaptureIntegrityError( + `chat response for scenario "${scenario.id}" turn ${i} is malformed: ` + + `expected choices[0].message.content to be a string, got ${describeShape(rawContent, message)}`, + ) + } + const content = rawContent messages.push({ role: 'assistant', content }) @@ -173,29 +210,50 @@ export async function executeScenario( const judgeInput = { scenario, turns, artifacts } const judgeResults: JudgeScore[][] = [] let failedJudges = 0 + const judgeFailures: JudgeFailure[] = [] + const sleep = config.sleep ?? ((ms: number) => new Promise((r) => setTimeout(r, ms))) - for (const judge of config.judges) { + for (let j = 0; j < config.judges.length; j++) { + const judge = config.judges[j]! + const judgeName = judge.name || `judge[${j}]` + let lastError: unknown + let madeAttempts = 0 for (let attempt = 0; attempt < 3; attempt++) { + madeAttempts = attempt + 1 try { if (attempt > 0) { const wait = attempt * 10_000 - console.log(` judge retry ${attempt}/2 (waiting ${wait / 1000}s)`) - await new Promise((r) => setTimeout(r, wait)) + console.log(` ${judgeName} retry ${attempt}/2 (waiting ${wait / 1000}s)`) + await sleep(wait) } const scores = await judge(tc, judgeInput) judgeResults.push(scores) - await new Promise((r) => setTimeout(r, 3000)) + await sleep(3000) + lastError = undefined break } catch (err) { + lastError = err if (err instanceof JudgeParseError) { // The model answered but unparseably — another identical prompt // won't fix it. Record the failure and move to the next judge. - failedJudges++ + console.error(` ${judgeName} unparseable (no retry): ${errMessage(err)}`) break } - if (attempt === 2) failedJudges++ + console.error(` ${judgeName} attempt ${attempt} failed: ${errMessage(err)}`) + // Deterministic errors (bad rubric, auth, 4xx) won't change on retry — + // only retry connection/rate-limit class faults. Stop the loop now so a + // permanent error is not masked by two pointless waits. + if (!isTransientLlmError(err)) break } } + if (lastError !== undefined) { + failedJudges++ + judgeFailures.push({ + judge: judgeName, + attempts: madeAttempts, + reason: errMessage(lastError), + }) + } } const allScores = judgeResults.flat() @@ -221,7 +279,10 @@ export async function executeScenario( })), ) - return { + // judgeFailures is additive diagnostic context (WHY each judge errored), + // recorded alongside the existing judgeErrors count without widening the + // shared ScenarioResult contract. Absent when every judge succeeded. + const result: ScenarioResult & { judgeFailures?: JudgeFailure[] } = { scenarioId: scenario.id, persona: scenario.persona, turns, @@ -232,4 +293,6 @@ export async function executeScenario( totalDurationMs: Date.now() - startTime, artifacts, } + if (judgeFailures.length > 0) result.judgeFailures = judgeFailures + return result } diff --git a/src/judge-runner.test.ts b/src/judge-runner.test.ts new file mode 100644 index 00000000..539914f5 --- /dev/null +++ b/src/judge-runner.test.ts @@ -0,0 +1,61 @@ +import { describe, expect, it } from 'vitest' +import { compilerJudge, runJudgeFleet } from './judge-runner' +import type { HarnessConfig, SandboxDriver, SandboxResult } from './sandbox-harness' + +/** + * Driver that records peak concurrent `exec` calls. Each exec yields to the + * event loop a few times so the pool's bound is observable: with unbounded + * fan-out every spec's subprocess is in-flight at once. + */ +function makeConcurrencyTrackingDriver(): SandboxDriver & { peak: number; active: number } { + const state = { id: 'tracking', peak: 0, active: 0 } as SandboxDriver & { + peak: number + active: number + } + state.exec = async ( + phase: SandboxResult['phase'], + _command: string, + _config: HarnessConfig, + ): Promise => { + state.active++ + if (state.active > state.peak) state.peak = state.active + // Yield repeatedly so other workers can start while we're "running". + for (let i = 0; i < 5; i++) await Promise.resolve() + state.active-- + return { phase, exitCode: 0, stdout: '', stderr: '', wallMs: 1 } + } + return state +} + +describe('runJudgeFleet bounded concurrency', () => { + it('caps concurrent subprocesses at the configured concurrency', async () => { + const driver = makeConcurrencyTrackingDriver() + const specs = Array.from({ length: 12 }, (_, i) => + compilerJudge(`judge-${i}`, { runCommand: 'true' } as HarnessConfig), + ) + + const results = await runJudgeFleet(specs, { driver, concurrency: 3 }) + + expect(results).toHaveLength(12) + // OLD behavior (Promise.all over all specs) would peak at 12. The bounded + // pool must never exceed the configured concurrency. + expect(driver.peak).toBeLessThanOrEqual(3) + expect(driver.peak).toBeGreaterThan(0) + }) + + it('preserves result order matching input spec order', async () => { + const driver = makeConcurrencyTrackingDriver() + const specs = Array.from({ length: 8 }, (_, i) => + compilerJudge(`spec-${i}`, { runCommand: 'true' } as HarnessConfig), + ) + const results = await runJudgeFleet(specs, { driver, concurrency: 2 }) + expect(results.map((r) => r.id)).toEqual(specs.map((s) => s.id)) + }) + + it('returns [] for an empty spec list without spawning workers', async () => { + const driver = makeConcurrencyTrackingDriver() + const results = await runJudgeFleet([], { driver }) + expect(results).toEqual([]) + expect(driver.peak).toBe(0) + }) +}) diff --git a/src/judge-runner.ts b/src/judge-runner.ts index 2c28289b..23d4bbe7 100644 --- a/src/judge-runner.ts +++ b/src/judge-runner.ts @@ -1,3 +1,4 @@ +import { cpus } from 'node:os' import { type HarnessConfig, type SandboxDriver, @@ -28,6 +29,12 @@ export interface SandboxJudgeResult { export interface JudgeFleetOptions { driver?: SandboxDriver parallel?: boolean + /** + * Max concurrent judge subprocesses. Each spec spawns its own subprocess + * via the driver, so unbounded fan-out exhausts file descriptors / PIDs. + * Defaults to the host CPU count. Ignored when `parallel === false`. + */ + concurrency?: number } export class JudgeRunner { @@ -69,7 +76,18 @@ export async function runJudgeFleet( for (const spec of specs) results.push(await runner.run(spec)) return results } - return await Promise.all(specs.map((spec) => runner.run(spec))) + const concurrency = Math.max(1, options.concurrency ?? cpus().length) + const results: SandboxJudgeResult[] = new Array(specs.length) + let cursor = 0 + async function worker(): Promise { + while (true) { + const i = cursor++ + if (i >= specs.length) return + results[i] = await runner.run(specs[i]!) + } + } + await Promise.all(Array.from({ length: Math.min(concurrency, specs.length) }, () => worker())) + return results } export function compilerJudge(id: string, config: HarnessConfig): SandboxJudgeSpec { diff --git a/src/llm-client.test.ts b/src/llm-client.test.ts index 4883c4ae..7316ae76 100644 --- a/src/llm-client.test.ts +++ b/src/llm-client.test.ts @@ -257,6 +257,114 @@ describe('llm-client — retry semantics', () => { }) }) +describe('llm-client — caller AbortSignal + cross-attempt deadline', () => { + it('an already-aborted caller signal fails loud without firing fetch', async () => { + const fetch = vi.fn(async () => + mkOkResponse({ choices: [{ message: { content: 'x' } }], usage: {} }), + ) as unknown as typeof globalThis.fetch + const controller = new AbortController() + controller.abort() + await expect( + callLlm({ model: 'm', messages: [] }, { fetch, signal: controller.signal }), + ).rejects.toThrow(/abort/i) + expect(fetch as unknown as ReturnType).not.toHaveBeenCalled() + }) + + it('a caller abort mid-call is NOT retried — it surfaces immediately', async () => { + // Regression: callLlm took no external signal, so a campaign cancel could + // not stop an in-flight call, AND an AbortError matches the transient + // patterns — so a naive wiring would retry the cancelled call. + const controller = new AbortController() + let calls = 0 + const fetch: typeof globalThis.fetch = (async (_url: string, init: RequestInit) => { + calls++ + controller.abort() + // Mirror fetch's behavior: a request whose linked signal aborts rejects. + const err = new Error('The operation was aborted') + err.name = 'AbortError' + void init + throw err + }) as unknown as typeof globalThis.fetch + + await expect( + callLlm({ model: 'm', messages: [] }, { fetch, signal: controller.signal, maxRetries: 3 }), + ).rejects.toThrow(/abort/i) + expect(calls).toBe(1) + }) + + it('a caller abort cancels the in-flight fetch via the linked signal', async () => { + const controller = new AbortController() + const fetch: typeof globalThis.fetch = ((_url: string, init: RequestInit) => + new Promise((_resolve, reject) => { + init.signal?.addEventListener('abort', () => { + const err = new Error('The operation was aborted') + err.name = 'AbortError' + reject(err) + }) + })) as unknown as typeof globalThis.fetch + + const p = callLlm({ model: 'm', messages: [] }, { fetch, signal: controller.signal }) + controller.abort() + await expect(p).rejects.toThrow(/abort/i) + }) + + it('stops retrying once the cross-attempt deadline is exhausted', async () => { + // Per-attempt timeout still bounds each call, but a tight wall-clock budget + // must cut the retry loop short rather than waiting full timeout × retries. + let calls = 0 + const fetch: typeof globalThis.fetch = (async () => { + calls++ + // Burn the entire deadline on the first attempt's "work". + await new Promise((r) => setTimeout(r, 30)) + return mkErrResponse(503, 'still unavailable') + }) as unknown as typeof globalThis.fetch + + await expect( + callLlm({ model: 'm', messages: [] }, { fetch, maxRetries: 5, deadlineMs: 10 }), + ).rejects.toBeInstanceOf(LlmCallError) + // Without the deadline this would retry up to 5 times; the budget caps it at 1. + expect(calls).toBe(1) + }) +}) + +describe('llm-client — empty-content + finishReason signals', () => { + it('flags empty content and surfaces finish_reason=length (truncation)', async () => { + const fetch = mockFetch([ + async () => + mkOkResponse({ + choices: [{ message: { content: '' }, finish_reason: 'length' }], + usage: {}, + }), + ]) + const r = await callLlm({ model: 'm', messages: [] }, { fetch }) + expect(r.content).toBe('') + expect(r.contentEmpty).toBe(true) + expect(r.finishReason).toBe('length') + }) + + it('non-empty content reports contentEmpty=false and finish_reason=stop', async () => { + const fetch = mockFetch([ + async () => + mkOkResponse({ + choices: [{ message: { content: 'real answer' }, finish_reason: 'stop' }], + usage: {}, + }), + ]) + const r = await callLlm({ model: 'm', messages: [] }, { fetch }) + expect(r.contentEmpty).toBe(false) + expect(r.finishReason).toBe('stop') + }) + + it('whitespace-only content counts as empty; missing finish_reason is null', async () => { + const fetch = mockFetch([ + async () => mkOkResponse({ choices: [{ message: { content: ' \n ' } }], usage: {} }), + ]) + const r = await callLlm({ model: 'm', messages: [] }, { fetch }) + expect(r.contentEmpty).toBe(true) + expect(r.finishReason).toBeNull() + }) +}) + describe('llm-client — isTransientLlmError classification', () => { it('classifies HTTP/2 + undici transport faults as transient', () => { expect(isTransientLlmError(new Error('terminated'))).toBe(true) diff --git a/src/llm-client.ts b/src/llm-client.ts index 048c725b..1e5e06ff 100644 --- a/src/llm-client.ts +++ b/src/llm-client.ts @@ -79,6 +79,22 @@ export interface LlmCallResult { model: string /** Wall-clock duration of the HTTP call (last attempt, if retried). */ durationMs: number + /** + * `finish_reason` echoed from the first choice (`stop`, `length`, + * `content_filter`, `tool_calls`, ...). `null` when the provider omits it. + * Exposed so a free-form `callLlm` caller CAN detect a truncated answer + * (`length`) instead of treating a cut-off completion as complete. Note: + * `callLlm` does not itself reject on it — acting on this signal is the + * caller's responsibility (in-repo free-form drivers do not yet enforce it). + */ + finishReason?: string | null + /** + * True when `content.trim()` is empty. An empty completion is a silent zero + * for free-form `callLlm` callers; this flag is the signal a caller can + * inspect to fail loud rather than proceed on an empty string. `callLlm` + * surfaces it but does not throw on it. + */ + contentEmpty?: boolean /** Raw response body. */ raw: Record } @@ -104,6 +120,21 @@ export interface LlmClientOptions { authHeader?: { name: string; value: string } /** Default timeout in ms. Per-call can override. */ defaultTimeoutMs?: number + /** + * Caller-supplied abort signal — e.g. a campaign-wide cancel. Linked to + * each attempt's per-attempt timeout controller, so aborting it cancels + * the in-flight fetch. A caller abort is FATAL: it is not retried even + * though an AbortError otherwise matches the transient patterns. + */ + signal?: AbortSignal + /** + * Cross-attempt wall-clock budget in ms, measured from the first attempt. + * Before launching each attempt the loop checks the remaining budget and + * stops retrying once it is exhausted, rather than waiting the full + * per-attempt timeout on every retry. Bounds total time independent of + * `maxRetries` × `timeoutMs`. + */ + deadlineMs?: number /** Max retry attempts on retriable errors. Default 3 (1 initial + 2 retries). */ maxRetries?: number /** Fetch implementation — defaults to global `fetch`. Override for custom transport (e.g. tests). */ @@ -271,6 +302,31 @@ async function sleep(ms: number): Promise { return new Promise((resolve) => setTimeout(resolve, ms)) } +/** + * Combine the per-attempt timeout signal with an optional caller signal into + * one signal the fetch listens on. Prefers the native `AbortSignal.any`; falls + * back to manual wiring on runtimes that predate it. The caller signal is also + * propagated to the timeout controller so aborting it cancels the in-flight + * fetch immediately. + */ +function linkSignals(timeoutController: AbortController, caller?: AbortSignal): AbortSignal { + if (!caller) return timeoutController.signal + if (typeof (AbortSignal as { any?: unknown }).any === 'function') { + return AbortSignal.any([timeoutController.signal, caller]) + } + if (caller.aborted) { + timeoutController.abort() + } else { + caller.addEventListener('abort', () => timeoutController.abort(), { once: true }) + } + return timeoutController.signal +} + +/** True once the cross-attempt wall-clock budget (if any) is exhausted. */ +function deadlineExceeded(start: number, deadlineMs: number | undefined): boolean { + return deadlineMs != null && Date.now() - start >= deadlineMs +} + // ─── Public API ───────────────────────────────────────────────────────── /** @@ -366,10 +422,24 @@ export async function callLlm( const sink = opts.rawSink const redactor = opts.redactor ?? defaultProviderRedactor const traceContext = opts.traceContext + const callerSignal = opts.signal + const deadlineMs = opts.deadlineMs + const deadlineStart = Date.now() let lastErr: unknown for (let attempt = 0; attempt < maxRetries; attempt++) { + // A caller cancel is fatal — never retried. Checking before each attempt + // means an already-aborted signal short-circuits without firing fetch. + if (callerSignal?.aborted) { + throw new DOMException('callLlm aborted by caller signal', 'AbortError') + } + // Stop retrying once the cross-attempt budget is spent rather than burning + // a full per-attempt timeout on each remaining retry. + if (attempt > 0 && deadlineExceeded(deadlineStart, deadlineMs)) { + throw lastErr instanceof Error ? lastErr : new Error(String(lastErr)) + } const controller = new AbortController() + const attemptSignal = linkSignals(controller, callerSignal) const timeoutHandle = setTimeout(() => controller.abort(), timeoutMs) const started = Date.now() const requestBody = buildBody(req, false) @@ -397,7 +467,7 @@ export async function callLlm( method: 'POST', headers, body: JSON.stringify(requestBody), - signal: controller.signal, + signal: attemptSignal, }) clearTimeout(timeoutHandle) const responseHeaders = sink ? headersToObject(res.headers) : undefined @@ -431,7 +501,11 @@ export async function callLlm( body, req.model, ) - if (RETRYABLE_STATUS.has(res.status) && attempt < maxRetries - 1) { + if ( + RETRYABLE_STATUS.has(res.status) && + attempt < maxRetries - 1 && + !deadlineExceeded(deadlineStart, deadlineMs) + ) { lastErr = err const retryAfter = parseRetryAfter(res.headers) await sleep(retryAfter ?? backoffMs(attempt)) @@ -487,12 +561,19 @@ export async function callLlm( redactedFields: [], }) } - const choice = (json.choices as Array<{ message?: { content?: string } }> | undefined)?.[0] + const choice = ( + json.choices as + | Array<{ message?: { content?: string }; finish_reason?: string | null }> + | undefined + )?.[0] const usageRaw = (json.usage as Record | undefined) ?? {} const costFromProxy = (json._response_cost ?? json.cost_usd) as number | undefined + const content = choice?.message?.content ?? '' return { - content: choice?.message?.content ?? '', + content, + finishReason: choice?.finish_reason ?? null, + contentEmpty: content.trim().length === 0, usage: { promptTokens: Number(usageRaw.prompt_tokens ?? 0), completionTokens: Number(usageRaw.completion_tokens ?? 0), @@ -512,6 +593,29 @@ export async function callLlm( } catch (err) { clearTimeout(timeoutHandle) lastErr = err + // A caller cancel is fatal even though an AbortError matches the + // transient patterns — a cancelled call must surface immediately, not + // be retried against the same dead intent. + if (callerSignal?.aborted) { + if (sink && !attemptErrorRecorded) { + await recordRaw(sink, redactor, { + eventId: cryptoEventId(), + runId: traceContext?.runId, + spanId: traceContext?.spanId, + provider, + model: req.model, + endpoint, + baseUrl, + attemptIndex: attempt, + direction: 'error', + timestamp: Date.now(), + durationMs: Date.now() - started, + errorMessage: err instanceof Error ? err.message : String(err), + redactedFields: [], + }) + } + throw err + } if (sink && !attemptErrorRecorded) { // Record only if neither the !res.ok branch nor the JSON.parse catch // already produced an error event for this attempt. Covers network @@ -532,7 +636,11 @@ export async function callLlm( redactedFields: [], }) } - if (attempt < maxRetries - 1 && isTransientLlmError(err)) { + if ( + attempt < maxRetries - 1 && + isTransientLlmError(err) && + !deadlineExceeded(deadlineStart, deadlineMs) + ) { await sleep(backoffMs(attempt)) continue } diff --git a/src/pipelines/budget-breach.test.ts b/src/pipelines/budget-breach.test.ts new file mode 100644 index 00000000..988dcd39 --- /dev/null +++ b/src/pipelines/budget-breach.test.ts @@ -0,0 +1,107 @@ +import { describe, expect, it } from 'vitest' +import type { BudgetLedgerEntry, BudgetSpec, Run } from '../trace/schema' +import { InMemoryTraceStore } from '../trace/store' +import { budgetBreachView } from './budget-breach' + +function run(runId: string, scenarioId: string, variantId?: string): Run { + return { runId, scenarioId, variantId, startedAt: 1, status: 'completed' } +} + +function entry( + runId: string, + dimension: keyof BudgetSpec, + limit: number, + consumed: number, + breached: boolean, +): BudgetLedgerEntry { + return { + runId, + dimension, + limit, + consumed, + remaining: limit - consumed, + timestamp: 100, + breached, + } +} + +async function storeWith(runs: Run[], entries: BudgetLedgerEntry[]): Promise { + const store = new InMemoryTraceStore() + for (const r of runs) await store.appendRun(r) + for (const e of entries) await store.appendBudgetEntry(e) + return store +} + +describe('budgetBreachView', () => { + it('counts only breached:true entries, ignoring non-breached ledger rows', async () => { + const store = await storeWith( + [run('r1', 's1')], + [ + entry('r1', 'tokens', 100, 50, false), // not breached -> ignored + entry('r1', 'tokens', 100, 150, true), // breached -> counted + ], + ) + const report = await budgetBreachView(store) + expect(report.findings).toHaveLength(1) + expect(report.findings[0]!.consumed).toBe(150) + expect(report.byDimension).toEqual({ tokens: 1 }) + expect(report.byScenario).toEqual({ s1: 1 }) + }) + + it('excessRatio is Infinity (not NaN/crash) when limit === 0', async () => { + const store = await storeWith([run('r1', 's1')], [entry('r1', 'calls', 0, 5, true)]) + const report = await budgetBreachView(store) + expect(report.findings).toHaveLength(1) + const ratio = report.findings[0]!.excessRatio + expect(ratio).toBe(Infinity) + expect(Number.isNaN(ratio)).toBe(false) + }) + + it('computes excessRatio as consumed/limit when limit > 0', async () => { + const store = await storeWith([run('r1', 's1')], [entry('r1', 'usd', 2, 5, true)]) + const report = await budgetBreachView(store) + expect(report.findings[0]!.excessRatio).toBe(2.5) + }) + + it('tallies byDimension, byScenario, byVariant and breachedRunRatio across runs', async () => { + const store = await storeWith( + [run('r1', 's1', 'v1'), run('r2', 's1', 'v2'), run('r3', 's2', 'v1')], + [ + entry('r1', 'tokens', 100, 200, true), + entry('r1', 'wallMs', 1000, 2000, true), // two breaches, same run + entry('r2', 'tokens', 100, 200, true), + // r3 has only a non-breach entry -> contributes to totalRuns, not breaches + entry('r3', 'tokens', 100, 10, false), + ], + ) + const report = await budgetBreachView(store) + expect(report.totalRuns).toBe(3) + expect(report.byDimension).toEqual({ tokens: 2, wallMs: 1 }) + expect(report.byScenario).toEqual({ s1: 3 }) + expect(report.byVariant).toEqual({ v1: 2, v2: 1 }) + // 2 of 3 runs breached at least once + expect(report.breachedRunRatio).toBeCloseTo(2 / 3, 10) + }) + + it('breachedRunRatio is 0 (not NaN) when there are no runs', async () => { + const store = new InMemoryTraceStore() + const report = await budgetBreachView(store) + expect(report.breachedRunRatio).toBe(0) + expect(Number.isNaN(report.breachedRunRatio)).toBe(false) + expect(report.findings).toHaveLength(0) + }) + + it('counts a run breaching multiple dimensions once toward breachedRunRatio', async () => { + const store = await storeWith( + [run('r1', 's1')], + [ + entry('r1', 'tokens', 100, 200, true), + entry('r1', 'usd', 1, 2, true), + entry('r1', 'calls', 5, 9, true), + ], + ) + const report = await budgetBreachView(store) + expect(report.findings).toHaveLength(3) + expect(report.breachedRunRatio).toBe(1) + }) +}) diff --git a/src/pipelines/failure-cluster.test.ts b/src/pipelines/failure-cluster.test.ts new file mode 100644 index 00000000..c7bc1b4b --- /dev/null +++ b/src/pipelines/failure-cluster.test.ts @@ -0,0 +1,142 @@ +import { describe, expect, it } from 'vitest' +import type { Run, ToolSpan } from '../trace/schema' +import { InMemoryTraceStore } from '../trace/store' +import { failureClusterView } from './failure-cluster' + +function failedRun(runId: string, scenarioId: string): Run { + return { + runId, + scenarioId, + startedAt: 1000, + endedAt: 2000, + status: 'completed', + outcome: { pass: false, failureClass: 'tool_recovery_failure' }, + } +} + +function erroredTool(runId: string, name: string, args: unknown): ToolSpan { + return { + spanId: `${runId}-t`, + runId, + kind: 'tool', + name: `tool.${name}`, + startedAt: 1500, + status: 'error', + error: `${name} blew up`, + toolName: name, + args, + } +} + +async function storeWith( + entries: Array<{ run: Run; spans: ToolSpan[] }>, +): Promise { + const store = new InMemoryTraceStore() + for (const { run, spans } of entries) { + await store.appendRun(run) + for (const s of spans) await store.appendSpan(s) + } + return store +} + +describe('failureClusterView', () => { + it('collapses identical failureClass+toolName+argPrefix into one cluster with summed runCount', async () => { + const store = await storeWith([ + { + run: failedRun('r1', 's1'), + spans: [erroredTool('r1', 'bash', { cmd: 'ls -la /etc/passwd' })], + }, + { + run: failedRun('r2', 's2'), + spans: [erroredTool('r2', 'bash', { cmd: 'ls -la /etc/passwd' })], + }, + { + run: failedRun('r3', 's1'), + spans: [erroredTool('r3', 'bash', { cmd: 'ls -la /etc/passwd' })], + }, + ]) + const report = await failureClusterView(store) + expect(report.clusters).toHaveLength(1) + const c = report.clusters[0]! + expect(c.runCount).toBe(3) + expect(c.toolName).toBe('bash') + expect(c.failureClass).toBe('tool_recovery_failure') + // distinct scenarios are tracked, runs are not double-counted as scenarios + expect([...c.scenarioIds].sort()).toEqual(['s1', 's2']) + expect(report.totalFailures).toBe(3) + expect(report.totalRuns).toBe(3) + }) + + it('splits clusters that differ on any of the three dimensions', async () => { + const store = await storeWith([ + // same class+tool, DIFFERENT args -> argPrefix differs -> separate cluster + { run: failedRun('a1', 's'), spans: [erroredTool('a1', 'bash', { cmd: 'one' })] }, + { run: failedRun('a2', 's'), spans: [erroredTool('a2', 'bash', { cmd: 'two' })] }, + // same class+args-shape but DIFFERENT tool -> separate cluster + { run: failedRun('b1', 's'), spans: [erroredTool('b1', 'curl', { cmd: 'one' })] }, + ]) + const report = await failureClusterView(store) + // three distinct (class, tool, argPrefix) tuples + expect(report.clusters).toHaveLength(3) + expect(report.clusters.every((c) => c.runCount === 1)).toBe(true) + }) + + it('does NOT merge a tool whose argHash differs only by a 17th+ char (argPrefix is first 16)', async () => { + // argHash is stableStringify of args; pick two arg objects whose stringify + // shares the first 16 chars but diverges later -> SAME prefix -> SAME cluster. + const shared = 'aaaaaaaaaaaaaa' // padding inside a shared key + const store = await storeWith([ + { run: failedRun('p1', 's'), spans: [erroredTool('p1', 'bash', { k: `${shared}1` })] }, + { run: failedRun('p2', 's'), spans: [erroredTool('p2', 'bash', { k: `${shared}2` })] }, + ]) + const report = await failureClusterView(store) + // {"k":"aaaaaaaaaaaaaa... -> first 16 chars identical across both + expect(report.clusters).toHaveLength(1) + expect(report.clusters[0]!.runCount).toBe(2) + expect(report.clusters[0]!.argPrefix).toHaveLength(16) + }) + + it('minClusterSize filters out clusters below the threshold', async () => { + const store = await storeWith([ + { run: failedRun('big1', 's'), spans: [erroredTool('big1', 'bash', { cmd: 'x' })] }, + { run: failedRun('big2', 's'), spans: [erroredTool('big2', 'bash', { cmd: 'x' })] }, + { run: failedRun('small', 's'), spans: [erroredTool('small', 'curl', { cmd: 'y' })] }, + ]) + const filtered = await failureClusterView(store, { minClusterSize: 2 }) + expect(filtered.clusters).toHaveLength(1) + expect(filtered.clusters[0]!.toolName).toBe('bash') + expect(filtered.clusters[0]!.runCount).toBe(2) + // but totals still count every failure regardless of filtering + expect(filtered.totalFailures).toBe(3) + }) + + it('sorts clusters by descending runCount', async () => { + const store = await storeWith([ + { run: failedRun('s1', 's'), spans: [erroredTool('s1', 'rare', { cmd: 'r' })] }, + { run: failedRun('m1', 's'), spans: [erroredTool('m1', 'mid', { cmd: 'm' })] }, + { run: failedRun('m2', 's'), spans: [erroredTool('m2', 'mid', { cmd: 'm' })] }, + { run: failedRun('h1', 's'), spans: [erroredTool('h1', 'hot', { cmd: 'h' })] }, + { run: failedRun('h2', 's'), spans: [erroredTool('h2', 'hot', { cmd: 'h' })] }, + { run: failedRun('h3', 's'), spans: [erroredTool('h3', 'hot', { cmd: 'h' })] }, + ]) + const report = await failureClusterView(store) + const counts = report.clusters.map((c) => c.runCount) + expect(counts).toEqual([3, 2, 1]) + expect(report.clusters[0]!.toolName).toBe('hot') + }) + + it('does not count passing/completed runs as failures', async () => { + const store = new InMemoryTraceStore() + await store.appendRun({ + runId: 'ok', + scenarioId: 's', + startedAt: 1, + status: 'completed', + outcome: { pass: true, score: 1 }, + }) + const report = await failureClusterView(store) + expect(report.totalFailures).toBe(0) + expect(report.clusters).toHaveLength(0) + expect(report.totalRuns).toBe(1) + }) +}) diff --git a/src/pipelines/first-divergence.test.ts b/src/pipelines/first-divergence.test.ts new file mode 100644 index 00000000..9ebc6330 --- /dev/null +++ b/src/pipelines/first-divergence.test.ts @@ -0,0 +1,62 @@ +import { describe, expect, it } from 'vitest' +import type { Span } from '../trace/schema' +import { InMemoryTraceStore } from '../trace/store' +import { firstDivergenceView } from './first-divergence' + +function llm(runId: string, n: number, model: string): Span { + return { + spanId: `${runId}-llm-${n}`, + runId, + kind: 'llm', + name: `llm.step.${n}`, + startedAt: n * 1000, + endedAt: n * 1000 + 100, + model, + messages: [], + } +} + +async function seed(store: InMemoryTraceStore, runId: string, spans: Span[]): Promise { + for (const s of spans) await store.appendSpan(s) + // buildTrajectory only reads spans + events; a run row isn't required for it. + void runId +} + +describe('firstDivergenceView', () => { + it('reports the first differing step on equal-length, diverging trajectories', async () => { + const store = new InMemoryTraceStore() + await seed(store, 'A', [llm('A', 1, 'm1'), llm('A', 2, 'm1')]) + await seed(store, 'B', [llm('B', 1, 'm1'), llm('B', 2, 'm2')]) + const r = await firstDivergenceView(store, 'A', 'B') + expect(r.firstDivergenceIndex).toBe(1) + expect(r.aStep).toBeDefined() + expect(r.bStep).toBeDefined() + expect(r.reason).toContain('model m1 vs m2') + }) + + it('handles one empty trajectory without leaking "index -1" or an undefined step', async () => { + const store = new InMemoryTraceStore() + // Run A is empty (no spans); run B has steps. + await seed(store, 'A', []) + await seed(store, 'B', [llm('B', 1, 'm1'), llm('B', 2, 'm1')]) + const r = await firstDivergenceView(store, 'A', 'B') + // Divergence is the first step itself, not minLen-1 === -1. + expect(r.firstDivergenceIndex).toBe(0) + expect(r.commonPrefixLen).toBe(0) + expect(r.reason).not.toContain('index -1') + expect(r.reason).toContain('index 0') + // The empty side has no step at index 0; the present side does. + expect(r.aStep).toBeUndefined() + expect(r.bStep).toBeDefined() + expect(r.bStep!.index).toBe(0) + }) + + it('reports identical trajectories as no divergence', async () => { + const store = new InMemoryTraceStore() + await seed(store, 'A', [llm('A', 1, 'm1')]) + await seed(store, 'B', [llm('B', 1, 'm1')]) + const r = await firstDivergenceView(store, 'A', 'B') + expect(r.firstDivergenceIndex).toBeNull() + expect(r.commonPrefixLen).toBe(1) + }) +}) diff --git a/src/pipelines/first-divergence.ts b/src/pipelines/first-divergence.ts index 84d72c58..1bd2cc8a 100644 --- a/src/pipelines/first-divergence.ts +++ b/src/pipelines/first-divergence.ts @@ -54,13 +54,20 @@ export async function firstDivergenceView( return { runA, runB, firstDivergenceIndex: null, commonPrefixLen: minLen } } const longer: Trajectory = a.steps.length > b.steps.length ? a : b + const extra = longer.steps.length - minLen + // minLen === 0 means one trajectory is empty: divergence is the first step + // itself (index 0), and the absent side has no step to surface. + const reason = + minLen === 0 + ? `one trajectory is empty; the other has ${extra} step(s) starting at index 0` + : `one trajectory has ${extra} more step(s) after index ${minLen - 1}` return { runA, runB, firstDivergenceIndex: minLen, aStep: a.steps[minLen], bStep: b.steps[minLen], - reason: `one trajectory has ${longer.steps.length - minLen} more step(s) after index ${minLen - 1}`, + reason, commonPrefixLen: minLen, } } diff --git a/src/pipelines/judge-agreement.test.ts b/src/pipelines/judge-agreement.test.ts new file mode 100644 index 00000000..b6e36715 --- /dev/null +++ b/src/pipelines/judge-agreement.test.ts @@ -0,0 +1,108 @@ +import { describe, expect, it } from 'vitest' +import type { JudgeSpan } from '../trace/schema' +import { InMemoryTraceStore } from '../trace/store' +import { judgeAgreementView } from './judge-agreement' + +let seq = 0 +function judge(judgeId: string, targetSpanId: string, dimension: string, score: number): JudgeSpan { + return { + spanId: `j${seq++}`, + runId: 'r', + kind: 'judge', + name: 'judge.score', + startedAt: 1000, + judgeId, + targetSpanId, + dimension, + score, + } +} + +async function storeWith(spans: JudgeSpan[]): Promise { + seq = 0 + const store = new InMemoryTraceStore() + await store.appendRun({ runId: 'r', scenarioId: 's', startedAt: 1, status: 'completed' }) + for (const s of spans) await store.appendSpan(s) + return store +} + +describe('judgeAgreementView', () => { + it('skips a pair with fewer than 2 common items', async () => { + // A and B only co-score ONE target (t1); t2 is A-only, t3 is B-only. + const store = await storeWith([ + judge('A', 't1', 'quality', 0.5), + judge('A', 't2', 'quality', 0.9), + judge('B', 't1', 'quality', 0.5), + judge('B', 't3', 'quality', 0.1), + ]) + const report = await judgeAgreementView(store) + expect(report.pairs).toHaveLength(0) + expect(report.judgeIds).toEqual(['A', 'B']) + }) + + it('identical varying scores across common targets => pearson 1', async () => { + const store = await storeWith([ + judge('A', 't1', 'quality', 0.3), + judge('A', 't2', 'quality', 0.7), + judge('B', 't1', 'quality', 0.3), + judge('B', 't2', 'quality', 0.7), + ]) + const report = await judgeAgreementView(store) + expect(report.pairs).toHaveLength(1) + expect(report.pairs[0]!.commonItems).toBe(2) + expect(report.pairs[0]!.pearson).toBe(1) + }) + + it('pearson is 1 when BOTH judges have zero variance (constant scores)', async () => { + const store = await storeWith([ + judge('A', 't1', 'quality', 0.5), + judge('A', 't2', 'quality', 0.5), + judge('B', 't1', 'quality', 0.5), + judge('B', 't2', 'quality', 0.5), + ]) + const report = await judgeAgreementView(store) + expect(report.pairs).toHaveLength(1) + expect(report.pairs[0]!.pearson).toBe(1) + }) + + it('pearson is 0 when EXACTLY one judge has zero variance', async () => { + const store = await storeWith([ + judge('A', 't1', 'quality', 0.5), // A constant -> variance 0 + judge('A', 't2', 'quality', 0.5), + judge('B', 't1', 'quality', 0.2), // B varies -> variance > 0 + judge('B', 't2', 'quality', 0.8), + ]) + const report = await judgeAgreementView(store) + expect(report.pairs).toHaveLength(1) + expect(report.pairs[0]!.pearson).toBe(0) + }) + + it('sorts pairs by descending commonItems', async () => { + const store = await storeWith([ + // A/B share 3 targets + judge('A', 't1', 'd', 0.1), + judge('A', 't2', 'd', 0.2), + judge('A', 't3', 'd', 0.3), + judge('B', 't1', 'd', 0.1), + judge('B', 't2', 'd', 0.2), + judge('B', 't3', 'd', 0.3), + // C shares 2 targets with A and B on a different dimension + judge('A', 'u1', 'e', 0.1), + judge('A', 'u2', 'e', 0.2), + judge('C', 'u1', 'e', 0.1), + judge('C', 'u2', 'e', 0.2), + ]) + const report = await judgeAgreementView(store) + const items = report.pairs.map((p) => p.commonItems) + // descending order + expect(items).toEqual([...items].sort((a, b) => b - a)) + expect(items[0]).toBe(3) + expect(report.dimensions).toEqual(['d', 'e']) + }) + + it('returns empty report when there are no judge spans', async () => { + const store = await storeWith([]) + const report = await judgeAgreementView(store) + expect(report).toEqual({ pairs: [], dimensions: [], judgeIds: [] }) + }) +}) diff --git a/src/pipelines/regression.test.ts b/src/pipelines/regression.test.ts new file mode 100644 index 00000000..08bd3bc1 --- /dev/null +++ b/src/pipelines/regression.test.ts @@ -0,0 +1,153 @@ +import { describe, expect, it } from 'vitest' +import type { LlmSpan, Run } from '../trace/schema' +import { InMemoryTraceStore } from '../trace/store' +import { regressionView } from './regression' + +function run(runId: string, patch: Partial = {}): Run { + return { runId, scenarioId: 's', startedAt: 1000, status: 'completed', ...patch } +} + +function llm(runId: string, patch: Partial): LlmSpan { + return { + spanId: `${runId}-llm`, + runId, + kind: 'llm', + name: 'llm.call', + startedAt: 1000, + model: 'm', + messages: [], + ...patch, + } +} + +function metricMean(report: Awaited>, metric: string) { + const m = report.metrics.find((x) => x.metric === metric) + if (!m) throw new Error(`metric ${metric} not in report`) + return { baselineMean: m.baselineMean, candidateMean: m.candidateMean } +} + +describe('regressionView defaultExtract metric mapping', () => { + it('extracts the correct scalar for each metric name (score/pass/durationMs)', async () => { + const store = new InMemoryTraceStore() + const tag = (v: string) => ({ tags: { slice: v } }) + await store.appendRun( + run('b1', { ...tag('base'), outcome: { score: 0.4, pass: false }, endedAt: 1500 }), + ) + await store.appendRun( + run('b2', { ...tag('base'), outcome: { score: 0.6, pass: false }, endedAt: 1700 }), + ) + await store.appendRun( + run('c1', { ...tag('cand'), outcome: { score: 0.8, pass: true }, endedAt: 1900 }), + ) + await store.appendRun( + run('c2', { ...tag('cand'), outcome: { score: 1.0, pass: true }, endedAt: 2100 }), + ) + + const report = await regressionView( + store, + [ + { metric: 'score', higherIsBetter: true }, + { metric: 'pass', higherIsBetter: true }, + { metric: 'durationMs', higherIsBetter: false }, + ], + { + baseline: { tag: { key: 'slice', value: 'base' } }, + candidate: { tag: { key: 'slice', value: 'cand' } }, + }, + ) + + expect(metricMean(report, 'score')).toEqual({ baselineMean: 0.5, candidateMean: 0.9 }) + // pass maps true->1, false->0 + expect(metricMean(report, 'pass')).toEqual({ baselineMean: 0, candidateMean: 1 }) + // durationMs = endedAt - startedAt (startedAt 1000) + expect(metricMean(report, 'durationMs')).toEqual({ baselineMean: 600, candidateMean: 1000 }) + }) + + it('maps costUsd and inputTokens from aggregated llm spans', async () => { + const store = new InMemoryTraceStore() + await store.appendRun(run('b1', { tags: { slice: 'base' } })) + await store.appendRun(run('b2', { tags: { slice: 'base' } })) + await store.appendRun(run('c1', { tags: { slice: 'cand' } })) + await store.appendRun(run('c2', { tags: { slice: 'cand' } })) + await store.appendSpan(llm('b1', { costUsd: 0.1, inputTokens: 100 })) + await store.appendSpan(llm('b2', { costUsd: 0.3, inputTokens: 300 })) + await store.appendSpan(llm('c1', { costUsd: 0.5, inputTokens: 500 })) + await store.appendSpan(llm('c2', { costUsd: 0.7, inputTokens: 700 })) + + const report = await regressionView( + store, + [ + { metric: 'costUsd', higherIsBetter: false }, + { metric: 'inputTokens', higherIsBetter: false }, + ], + { + baseline: { tag: { key: 'slice', value: 'base' } }, + candidate: { tag: { key: 'slice', value: 'cand' } }, + }, + ) + expect(metricMean(report, 'costUsd')).toEqual({ baselineMean: 0.2, candidateMean: 0.6 }) + expect(metricMean(report, 'inputTokens')).toEqual({ baselineMean: 200, candidateMean: 600 }) + }) + + it('maps failureClass to 1 for success runs and 0 otherwise', async () => { + const store = new InMemoryTraceStore() + // success: completed with pass != false + await store.appendRun(run('b1', { tags: { slice: 'base' }, outcome: { pass: true } })) + await store.appendRun(run('b2', { tags: { slice: 'base' }, outcome: { pass: true } })) + // non-success: explicit failureClass + await store.appendRun( + run('c1', { + tags: { slice: 'cand' }, + outcome: { pass: false, failureClass: 'reasoning_error' }, + }), + ) + await store.appendRun( + run('c2', { + tags: { slice: 'cand' }, + outcome: { pass: false, failureClass: 'reasoning_error' }, + }), + ) + const report = await regressionView(store, [{ metric: 'failureClass', higherIsBetter: true }], { + baseline: { tag: { key: 'slice', value: 'base' } }, + candidate: { tag: { key: 'slice', value: 'cand' } }, + }) + expect(metricMean(report, 'failureClass')).toEqual({ baselineMean: 1, candidateMean: 0 }) + }) + + it('drops null samples (unknown metric / missing scalar) instead of counting them as 0', async () => { + const store = new InMemoryTraceStore() + // Two runs WITH a score, plus runs that produce null for `score` + // (no outcome.score). The null runs must be DROPPED, so the mean is + // computed over only the present scores — NOT diluted toward 0. + await store.appendRun(run('b1', { tags: { slice: 'base' }, outcome: { score: 0.8 } })) + await store.appendRun(run('b2', { tags: { slice: 'base' }, outcome: { score: 0.8 } })) + await store.appendRun(run('b3', { tags: { slice: 'base' } })) // null score -> dropped + await store.appendRun(run('c1', { tags: { slice: 'cand' }, outcome: { score: 0.9 } })) + await store.appendRun(run('c2', { tags: { slice: 'cand' }, outcome: { score: 0.9 } })) + await store.appendRun(run('c3', { tags: { slice: 'cand' } })) // null score -> dropped + + const report = await regressionView(store, [{ metric: 'score', higherIsBetter: true }], { + baseline: { tag: { key: 'slice', value: 'base' } }, + candidate: { tag: { key: 'slice', value: 'cand' } }, + }) + // If the null run were counted as 0, baselineMean would be (0.8+0.8+0)/3 = 0.533. + // Correct (dropped) mean is 0.8. + const { baselineMean, candidateMean } = metricMean(report, 'score') + expect(baselineMean).toBeCloseTo(0.8, 10) + expect(candidateMean).toBeCloseTo(0.9, 10) + }) + + it('throws for an unknown metric because all samples extract to null and get dropped', async () => { + const store = new InMemoryTraceStore() + await store.appendRun(run('b1', { tags: { slice: 'base' }, outcome: { score: 1 } })) + await store.appendRun(run('c1', { tags: { slice: 'cand' }, outcome: { score: 1 } })) + // 'banana' is not a known metric -> defaultExtract returns null for every + // run -> extractAll yields [] -> compareToBaseline needs >=2 samples. + await expect( + regressionView(store, [{ metric: 'banana', higherIsBetter: true }], { + baseline: { tag: { key: 'slice', value: 'base' } }, + candidate: { tag: { key: 'slice', value: 'cand' } }, + }), + ).rejects.toThrow(/banana/) + }) +}) diff --git a/src/pipelines/stuck-loop.test.ts b/src/pipelines/stuck-loop.test.ts new file mode 100644 index 00000000..abdacecb --- /dev/null +++ b/src/pipelines/stuck-loop.test.ts @@ -0,0 +1,73 @@ +import { describe, expect, it } from 'vitest' +import type { Run, ToolSpan } from '../trace/schema' +import { InMemoryTraceStore } from '../trace/store' +import { stuckLoopView } from './stuck-loop' + +function tool(i: number, name: string, args: unknown): ToolSpan { + return { + spanId: `t${i}`, + runId: 'r', + kind: 'tool', + name: `tool.${name}`, + startedAt: 1000 + i * 1000, + toolName: name, + args, + } +} + +async function storeWith(spans: ToolSpan[]): Promise { + const store = new InMemoryTraceStore() + await store.appendRun({ runId: 'r', scenarioId: 's' } as Run) + for (const s of spans) await store.appendSpan(s) + return store +} + +describe('stuckLoopView', () => { + it('flags a loop at exactly minOccurrences, windowMs spans first→last', async () => { + const store = await storeWith([ + tool(0, 'bash', { cmd: 'x' }), + tool(1, 'bash', { cmd: 'x' }), + tool(2, 'bash', { cmd: 'x' }), + ]) + const r = await stuckLoopView(store, { minOccurrences: 3 }) + expect(r.findings).toHaveLength(1) + expect(r.findings[0]!.occurrences).toBe(3) + expect(r.findings[0]!.windowMs).toBe(2000) + expect(r.affectedRunRatio).toBe(1) + }) + + it('preserves a tool name containing the delimiter (regression: split("|") mislabel)', async () => { + const store = await storeWith([ + tool(0, 'shell|grep', { q: 'x' }), + tool(1, 'shell|grep', { q: 'x' }), + tool(2, 'shell|grep', { q: 'x' }), + ]) + const r = await stuckLoopView(store, { minOccurrences: 3 }) + expect(r.findings[0]!.toolName).toBe('shell|grep') + }) + + it('does not collapse two distinct tools whose names+hashes alias across the delimiter', async () => { + // "a|b" with hash "c" vs "a" with hash "b|c" used to share a key. + const store = await storeWith([ + tool(0, 'a|b', { k: 'c' }), + tool(1, 'a|b', { k: 'c' }), + tool(2, 'a|b', { k: 'c' }), + tool(3, 'a', { k: 'b|c' }), + tool(4, 'a', { k: 'b|c' }), + tool(5, 'a', { k: 'b|c' }), + ]) + const r = await stuckLoopView(store, { minOccurrences: 3 }) + const names = r.findings.map((f) => f.toolName).sort() + expect(names).toEqual(['a', 'a|b']) + }) + + it('does not flag distinct args as a loop', async () => { + const store = await storeWith([ + tool(0, 'bash', { cmd: 'a' }), + tool(1, 'bash', { cmd: 'b' }), + tool(2, 'bash', { cmd: 'c' }), + ]) + const r = await stuckLoopView(store, { minOccurrences: 3 }) + expect(r.findings).toHaveLength(0) + }) +}) diff --git a/src/pipelines/stuck-loop.ts b/src/pipelines/stuck-loop.ts index c5180832..6d5e7c93 100644 --- a/src/pipelines/stuck-loop.ts +++ b/src/pipelines/stuck-loop.ts @@ -46,22 +46,26 @@ export async function stuckLoopView( const findings: StuckLoopFinding[] = [] for (const { runId } of runs) { const tools = await toolSpans(store, runId) - const byKey = new Map() + const byKey = new Map() for (const t of tools) { const h = argHash(t.args) - const key = `${t.toolName}|${h}` - const bucket = byKey.get(key) ?? { spans: [], argHash: h } + // NUL delimiter never appears in a tool name or a JSON argHash, so the + // key cannot collide; toolName is carried in the bucket rather than + // re-derived by splitting the key (which mislabeled any tool whose name + // contained the delimiter, e.g. "shell|grep"). + const key = `${t.toolName}\u0000${h}` + const bucket = byKey.get(key) ?? { spans: [], argHash: h, toolName: t.toolName } bucket.spans.push(t) byKey.set(key, bucket) } - for (const [key, { spans, argHash: h }] of byKey) { + for (const { spans, argHash: h, toolName } of byKey.values()) { if (spans.length < minOccurrences) continue const sorted = [...spans].sort((a, b) => a.startedAt - b.startedAt) const first = sorted[0]!.startedAt const last = sorted[sorted.length - 1]!.startedAt findings.push({ runId, - toolName: key.split('|')[0]!, + toolName, argHash: h, occurrences: sorted.length, spanIds: sorted.map((s) => s.spanId), diff --git a/src/pipelines/tool-waste.test.ts b/src/pipelines/tool-waste.test.ts new file mode 100644 index 00000000..672bab9a --- /dev/null +++ b/src/pipelines/tool-waste.test.ts @@ -0,0 +1,150 @@ +import { describe, expect, it } from 'vitest' +import type { LlmSpan, Message, Run, Span, ToolSpan } from '../trace/schema' +import type { SpanFilter, TraceStore } from '../trace/store' +import { toolWasteView } from './tool-waste' + +// Minimal in-file store implementing only the TraceStore surface toolWasteView +// touches (listRuns + spans). Avoids coupling this slice to the full store. +class StubStore implements TraceStore { + constructor( + private readonly runs: Run[], + private readonly allSpans: Span[], + ) {} + async listRuns(): Promise { + return this.runs + } + async spans(filter: SpanFilter = {}): Promise { + return this.allSpans.filter((s) => { + if (filter.runId && s.runId !== filter.runId) return false + if (filter.kind && s.kind !== filter.kind) return false + if (filter.toolName && (s.kind !== 'tool' || s.toolName !== filter.toolName)) return false + return true + }) + } + async appendRun(): Promise {} + async updateRun(): Promise {} + async appendSpan(): Promise {} + async updateSpan(): Promise {} + async appendEvent(): Promise {} + async appendArtifact(): Promise {} + async appendBudgetEntry(): Promise {} + async getRun(): Promise { + return undefined + } + async events(): Promise { + return [] + } + async budget(): Promise { + return [] + } + async artifacts(): Promise { + return [] + } +} + +function tool(i: number, opts: Partial & { result?: unknown }): ToolSpan { + return { + spanId: `t${i}`, + runId: 'r', + kind: 'tool', + name: `tool.${i}`, + startedAt: 1000 + i * 1000, + toolName: `tool${i}`, + args: {}, + ...opts, + } +} + +function llm(i: number, contents: string[]): LlmSpan { + const messages: Message[] = contents.map((c) => ({ role: 'user', content: c })) + return { + spanId: `l${i}`, + runId: 'r', + kind: 'llm', + name: `llm.${i}`, + startedAt: 1500 + i * 1000, + model: 'm', + messages, + } +} + +function storeWith(tools: ToolSpan[], llms: LlmSpan[]): StubStore { + return new StubStore([{ runId: 'r', scenarioId: 's' } as Run], [...tools, ...llms]) +} + +describe('toolWasteView', () => { + it('counts an error tool call as wasted', async () => { + const store = storeWith([tool(0, { status: 'error', result: 'boom' })], [llm(0, ['hi'])]) + const r = await toolWasteView(store, { runId: 'r' }) + expect(r.byRun[0]!.wastedCalls).toBe(1) + expect(r.byRun[0]!.wasteRate).toBe(1) + }) + + it('does not count a tool whose result appears in a later LLM message', async () => { + const store = storeWith( + [tool(0, { result: 'IMPORTANT_PAYLOAD_42' })], + [llm(0, ['here is the data: IMPORTANT_PAYLOAD_42 thanks'])], + ) + const r = await toolWasteView(store, { runId: 'r' }) + expect(r.byRun[0]!.wastedCalls).toBe(0) + expect(r.byRun[0]!.wasteRate).toBe(0) + }) + + it('counts a tool whose non-empty result never appears downstream as wasted', async () => { + const store = storeWith( + [tool(0, { result: 'UNUSED_PAYLOAD' })], + [llm(0, ['something unrelated'])], + ) + const r = await toolWasteView(store, { runId: 'r' }) + expect(r.byRun[0]!.wastedCalls).toBe(1) + }) + + // Regression: an empty/null tool result stringifies to '' — there is no + // payload to propagate, so it must NOT be counted as waste. The old code + // (`resultStr &&` → falsy → used=false) wrongly counted it. + it('does not count an empty-result tool call as wasted', async () => { + const store = storeWith( + [tool(0, { result: '' }), tool(1, { result: null }), tool(2, { result: undefined })], + [llm(0, ['no payloads here'])], + ) + const r = await toolWasteView(store, { runId: 'r' }) + expect(r.byRun[0]!.wastedCalls).toBe(0) + expect(r.byRun[0]!.wasteRate).toBe(0) + }) + + it('only matches LLM spans that started strictly after the tool', async () => { + // An LLM span that ran BEFORE the tool mentions the payload, but a later + // span must be the one that propagates it. Earlier mention != usage. + const earlier = llm(-2, ['LATE_PAYLOAD already mentioned early']) // startedAt 500 + earlier.startedAt = 500 + const store = storeWith([tool(0, { result: 'LATE_PAYLOAD' })], [earlier]) + const r = await toolWasteView(store, { runId: 'r' }) + expect(r.byRun[0]!.wastedCalls).toBe(1) + }) + + it('honors a usageOracle override and passes only later LLM spans', async () => { + const before = llm(-1, ['before']) + before.startedAt = 500 + const after = llm(0, ['after']) + const store = storeWith([tool(0, { result: 'x' })], [before, after]) + const seen: number[] = [] + const r = await toolWasteView(store, { + runId: 'r', + usageOracle: (_t, later) => { + seen.push(later.llm.length) + return true // oracle says always used + }, + }) + expect(r.byRun[0]!.wastedCalls).toBe(0) + // Only the span started after the tool (startedAt 1000) is later. + expect(seen).toEqual([1]) + }) + + it('reports wasteRate 0 with no NaN for an empty run', async () => { + const store = new StubStore([{ runId: 'r', scenarioId: 's' } as Run], []) + const r = await toolWasteView(store, { runId: 'r' }) + expect(r.byRun[0]!).toEqual({ runId: 'r', wastedCalls: 0, totalCalls: 0, wasteRate: 0 }) + expect(Number.isNaN(r.overallWasteRate)).toBe(false) + expect(r.overallWasteRate).toBe(0) + }) +}) diff --git a/src/pipelines/tool-waste.ts b/src/pipelines/tool-waste.ts index 0788766c..9e622bf2 100644 --- a/src/pipelines/tool-waste.ts +++ b/src/pipelines/tool-waste.ts @@ -12,7 +12,7 @@ import { computeToolUseMetrics } from '../tool-use-metrics' import { llmSpans, toolSpans } from '../trace/query' -import type { ToolSpan } from '../trace/schema' +import type { LlmSpan, ToolSpan } from '../trace/schema' import type { TraceStore } from '../trace/store' export interface ToolWasteFinding { @@ -48,27 +48,34 @@ export async function toolWasteView( continue } const llms = await llmSpans(store, runId) + // Sort LLM spans once by start time, then build a suffix index of the + // concatenated message text. `suffixText[i]` is the haystack of every + // string message content in spans[i..]. Per tool we binary-search the + // first span started strictly after the tool, then test that one suffix + // — turning the per-tool O(llms × messages × content) scan into a single + // O(log llms) lookup over precomputed text. + const sortedLlm = [...llms].sort((a, b) => a.startedAt - b.startedAt) + const startTimes = sortedLlm.map((l) => l.startedAt) + const suffixText = buildSuffixText(sortedLlm) let wasted = 0 for (const t of tools) { if (t.status === 'error') { wasted++ continue } - const laterLlm = llms.filter((l) => l.startedAt > t.startedAt) + // First LLM span started strictly after this tool (upper-bound search). + const cutoff = upperBound(startTimes, t.startedAt) if (options.usageOracle) { - if (!options.usageOracle(t, { llm: laterLlm })) wasted++ + if (!options.usageOracle(t, { llm: sortedLlm.slice(cutoff) })) wasted++ } else { // Default heuristic: a tool whose result is NOT mentioned in any - // later LLM input message is likely wasted. + // later LLM input message is likely wasted. An empty/null result has + // no payload to propagate downstream — there is nothing to find in a + // later message, so it is not evidence of waste; skip it. const resultStr = stringify(t.result) - const used = laterLlm.some((l) => - l.messages.some( - (m) => - typeof m.content === 'string' && - resultStr && - m.content.includes(resultStr.slice(0, 120)), - ), - ) + if (resultStr === '') continue + const haystack = suffixText[cutoff] ?? '' + const used = haystack.includes(resultStr.slice(0, 120)) if (!used) wasted++ } } @@ -80,6 +87,35 @@ export async function toolWasteView( return { byRun, overallWasteRate: totalCalls > 0 ? totalWasted / totalCalls : 0 } } +/** + * Build per-position suffix haystacks: result[i] is the concatenation of every + * string message content in spans[i..end]. Built back-to-front so each entry + * reuses the next one — O(total message text) rather than O(spans²). + */ +function buildSuffixText(spans: LlmSpan[]): string[] { + const result = new Array(spans.length + 1) + result[spans.length] = '' + for (let i = spans.length - 1; i >= 0; i--) { + const own = spans[i]!.messages.map((m) => + typeof m.content === 'string' ? m.content : '', + ).join('\n') + result[i] = `${own}\n${result[i + 1]}` + } + return result +} + +/** Index of the first element strictly greater than `target` in a sorted array. */ +function upperBound(sorted: number[], target: number): number { + let lo = 0 + let hi = sorted.length + while (lo < hi) { + const mid = (lo + hi) >>> 1 + if (sorted[mid]! <= target) lo = mid + 1 + else hi = mid + } + return lo +} + function stringify(v: unknown): string { if (v === null || v === undefined) return '' if (typeof v === 'string') return v diff --git a/src/prm/inference.test.ts b/src/prm/inference.test.ts new file mode 100644 index 00000000..ac1e045e --- /dev/null +++ b/src/prm/inference.test.ts @@ -0,0 +1,113 @@ +import { describe, expect, it } from 'vitest' +import type { TraceStore } from '../trace/store' +import { prmBestOfN, prmEnsembleBestOfN } from './inference' +import type { PrmGradedTrace, PrmGrader } from './rubric' + +const fakeStore = {} as TraceStore + +function tracedTrace(runId: string, score: number): PrmGradedTrace { + return { runId, steps: [], aggregateScore: score, gradedCount: 0, ungradedCount: 0 } +} + +interface ConcurrencyMeter { + peak: number + active: number +} + +/** + * Grader stub that records peak concurrent `grade` calls against a shared + * meter and assigns a score per runId. A shared meter lets multiple graders + * report the true simultaneous in-flight count across the whole pool. Each + * call yields a few times so concurrency is observable. + */ +function makeTrackingGrader( + scores: Record, + meter: ConcurrencyMeter = { peak: 0, active: 0 }, +): PrmGrader & { meter: ConcurrencyMeter } { + const state = { meter } as PrmGrader & { meter: ConcurrencyMeter } + state.grade = async (_store: TraceStore, runId: string): Promise => { + meter.active++ + if (meter.active > meter.peak) meter.peak = meter.active + for (let i = 0; i < 5; i++) await Promise.resolve() + meter.active-- + return tracedTrace(runId, scores[runId] ?? 0) + } + return state +} + +/** Grader whose `grade` always rejects — models an unrecoverable LLM failure. */ +function makeFailingGrader(message: string): PrmGrader { + const state = {} as PrmGrader + state.grade = async (): Promise => { + throw new Error(message) + } + return state +} + +describe('prmBestOfN bounded concurrency', () => { + it('caps concurrent grade calls at the configured concurrency', async () => { + const grader = makeTrackingGrader({}) + const runIds = Array.from({ length: 10 }, (_, i) => `run-${i}`) + + const result = await prmBestOfN(fakeStore, grader, runIds, { concurrency: 2 }) + + expect(result.ranked).toHaveLength(10) + // OLD behavior (Promise.all over all runIds) peaks at 10. + expect(grader.meter.peak).toBeLessThanOrEqual(2) + expect(grader.meter.peak).toBeGreaterThan(0) + }) + + it('still picks the highest-scoring candidate as winner', async () => { + const grader = makeTrackingGrader({ 'run-0': 0.1, 'run-1': 0.9, 'run-2': 0.5 }) + const result = await prmBestOfN(fakeStore, grader, ['run-0', 'run-1', 'run-2'], { + concurrency: 2, + }) + expect(result.winner.runId).toBe('run-1') + }) +}) + +describe('prmEnsembleBestOfN bounded concurrency + isolation', () => { + it('caps concurrent grade calls across the flattened graderxrunId product', async () => { + const scores = { 'run-0': 0.2, 'run-1': 0.8, 'run-2': 0.5 } + const meter: ConcurrencyMeter = { peak: 0, active: 0 } + const g1 = makeTrackingGrader(scores, meter) + const g2 = makeTrackingGrader(scores, meter) + const g3 = makeTrackingGrader(scores, meter) + const runIds = ['run-0', 'run-1', 'run-2'] + + const result = await prmEnsembleBestOfN(fakeStore, [g1, g2, g3], runIds, { concurrency: 2 }) + + expect(result.ranked).toHaveLength(3) + // OLD nested Promise.all fired 3 graders x 3 runIds = 9 calls at once. + // The shared meter measures the true simultaneous in-flight count across + // the whole flattened pool; it must never exceed the configured bound. + expect(meter.peak).toBeLessThanOrEqual(2) + expect(meter.peak).toBeGreaterThan(0) + }) + + it('one grader failing on a candidate does not void the ensemble (allSettled)', async () => { + const scores = { 'run-0': 0.2, 'run-1': 0.9, 'run-2': 0.4 } + const good = makeTrackingGrader(scores) + // Second grader rejects on every candidate; the ensemble must survive. + const flaky = makeFailingGrader('rate limited') + const runIds = ['run-0', 'run-1', 'run-2'] + + const result = await prmEnsembleBestOfN(fakeStore, [good, flaky], runIds, { concurrency: 4 }) + + // OLD behavior: nested Promise.all rejects the whole call on the first + // grader failure. New behavior: the surviving grader still produces a vote. + expect(result.ranked).toHaveLength(3) + expect(result.winner.runId).toBe('run-1') + }) + + it('throws only when every grader fails on every candidate', async () => { + const runIds = ['run-0', 'run-1'] + await expect( + prmEnsembleBestOfN( + fakeStore, + [makeFailingGrader('boom-a'), makeFailingGrader('boom-b')], + runIds, + ), + ).rejects.toThrow(/every grader failed/) + }) +}) diff --git a/src/prm/inference.ts b/src/prm/inference.ts index 31ccd54f..eeb85601 100644 --- a/src/prm/inference.ts +++ b/src/prm/inference.ts @@ -17,13 +17,33 @@ export interface BestOfNResult { stdDev: number } +/** Default max concurrent grader calls. Bounds LLM fan-out so a wide ensemble + * doesn't trigger a provider rate-limit storm. */ +const DEFAULT_PRM_CONCURRENCY = 4 + +export interface PrmBestOfNOptions { + /** Max concurrent `grader.grade` calls. Default 4. */ + concurrency?: number +} + export async function prmBestOfN( store: TraceStore, grader: PrmGrader, runIds: string[], + options: PrmBestOfNOptions = {}, ): Promise { if (runIds.length === 0) throw new Error('prmBestOfN: at least 1 candidate required') - const graded = await Promise.all(runIds.map((id) => grader.grade(store, id))) + const concurrency = Math.max(1, options.concurrency ?? DEFAULT_PRM_CONCURRENCY) + const graded: PrmGradedTrace[] = new Array(runIds.length) + let cursor = 0 + async function worker(): Promise { + while (true) { + const i = cursor++ + if (i >= runIds.length) return + graded[i] = await grader.grade(store, runIds[i]!) + } + } + await Promise.all(Array.from({ length: Math.min(concurrency, runIds.length) }, () => worker())) const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore) const mean = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length const variance = graded.reduce((a, g) => a + (g.aggregateScore - mean) ** 2, 0) / graded.length @@ -40,25 +60,73 @@ export async function prmEnsembleBestOfN( store: TraceStore, graders: PrmGrader[], runIds: string[], + options: PrmBestOfNOptions = {}, ): Promise { if (graders.length === 0) throw new Error('prmEnsembleBestOfN: at least 1 grader') - const perGrader = await Promise.all( - graders.map(async (g) => { - const graded = await Promise.all(runIds.map((id) => g.grade(store, id))) - return graded.sort((a, b) => b.aggregateScore - a.aggregateScore) - }), - ) - // Borda: rank-sum across graders. + if (runIds.length === 0) throw new Error('prmEnsembleBestOfN: at least 1 candidate required') + const concurrency = Math.max(1, options.concurrency ?? DEFAULT_PRM_CONCURRENCY) + + // Flatten the (grader, runId) product and run it through a single bounded + // pool. Nested unbounded fan-out (graders × runIds) would launch every LLM + // call at once. allSettled isolates failures: one grader (or one runId) + // failing doesn't void the whole ensemble. + type Job = { graderIdx: number; runId: string } + const jobs: Job[] = [] + for (let gi = 0; gi < graders.length; gi++) { + for (const runId of runIds) jobs.push({ graderIdx: gi, runId }) + } + const settled: PromiseSettledResult[] = new Array(jobs.length) + let cursor = 0 + async function worker(): Promise { + while (true) { + const i = cursor++ + if (i >= jobs.length) return + const job = jobs[i]! + settled[i] = await graders[job.graderIdx]!.grade(store, job.runId) + .then((value): PromiseSettledResult => ({ status: 'fulfilled', value })) + .catch((reason): PromiseSettledResult => ({ status: 'rejected', reason })) + } + } + await Promise.all(Array.from({ length: Math.min(concurrency, jobs.length) }, () => worker())) + + // Regroup fulfilled results into per-grader rankings. A grader contributes + // only the candidates it successfully graded; a grader that graded nothing + // is dropped from the vote rather than skewing it with phantom zeros. + const perGrader: PrmGradedTrace[][] = graders.map(() => []) + const failures: { graderIdx: number; runId: string; error: string }[] = [] + for (let i = 0; i < jobs.length; i++) { + const job = jobs[i]! + const result = settled[i]! + if (result.status === 'fulfilled') perGrader[job.graderIdx]!.push(result.value) + else { + const error = result.reason instanceof Error ? result.reason.message : String(result.reason) + failures.push({ graderIdx: job.graderIdx, runId: job.runId, error }) + } + } + const survivingGraders = perGrader.filter((ranking) => ranking.length > 0) + if (survivingGraders.length === 0) { + throw new Error( + `prmEnsembleBestOfN: every grader failed on every candidate (${failures.length} call(s)). First error: ${failures[0]?.error ?? 'unknown'}`, + ) + } + for (const ranking of survivingGraders) + ranking.sort((a, b) => b.aggregateScore - a.aggregateScore) + + // Borda: rank-sum across surviving graders. const bordaScores = new Map() - for (const ranking of perGrader) { + for (const ranking of survivingGraders) { ranking.forEach((g, rank) => { bordaScores.set(g.runId, (bordaScores.get(g.runId) ?? 0) + (ranking.length - rank)) }) } - // Return a synthesized ranking using the first grader's graded traces - // ordered by Borda score. aggregateScore field kept for UX. - const canonical = perGrader[0]! - const byRun = new Map(canonical.map((g) => [g.runId, g])) + // Synthesize a ranking from the union of every successfully-graded trace, + // ordered by Borda score. aggregateScore field kept for UX. Using the union + // (not just the first grader) keeps a candidate that one grader dropped but + // another graded. + const byRun = new Map() + for (const ranking of survivingGraders) { + for (const g of ranking) if (!byRun.has(g.runId)) byRun.set(g.runId, g) + } const ranked = [...byRun.values()].sort( (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0), ) diff --git a/src/sandbox-harness.test.ts b/src/sandbox-harness.test.ts new file mode 100644 index 00000000..807372bf --- /dev/null +++ b/src/sandbox-harness.test.ts @@ -0,0 +1,127 @@ +import { describe, expect, it } from 'vitest' +import { + type HarnessConfig, + type SandboxDriver, + SandboxHarness, + type SandboxResult, + SubprocessSandboxDriver, + vitestTestParser, +} from './sandbox-harness' +import type { TraceEmitter } from './trace/emitter' + +// Minimal trace emitter stub — the harness only needs `sandbox()` and the +// returned handle's `end`/`fail`. None of the regression behaviour depends +// on the trace payload, so we record nothing. +function stubEmitter(): TraceEmitter { + const handle = { + async end() {}, + async fail() {}, + } + return { + async sandbox() { + return handle as never + }, + } as unknown as TraceEmitter +} + +describe('SubprocessSandboxDriver timeout handling', () => { + it('flags killedByTimeout and forces non-zero exit when a phase overruns', async () => { + const driver = new SubprocessSandboxDriver() + // A process that outlives the timeout. The driver must mark the + // result as timed out — NOT resolve it as a clean close. + const res = await driver.exec('test', 'sleep 5', { + timeoutMs: 150, + } as HarnessConfig) + expect(res.killedByTimeout).toBe(true) + expect(res.exitCode).not.toBe(0) + }) + + it('does not flag killedByTimeout for a fast clean exit', async () => { + const driver = new SubprocessSandboxDriver() + const res = await driver.exec('test', 'true', { timeoutMs: 5000 } as HarnessConfig) + expect(res.killedByTimeout).toBeFalsy() + expect(res.exitCode).toBe(0) + }) + + it('caps captured output for a runaway process (outputTruncated)', async () => { + const driver = new SubprocessSandboxDriver() + // Emit far more than the cap; the buffer must stay bounded. + const res = await driver.exec( + 'run', + `node -e "const b='x'.repeat(1024); for(let i=0;i<5000;i++) process.stdout.write(b)"`, + { maxOutputBytes: 4096 } as HarnessConfig, + ) + expect(res.outputTruncated).toBe(true) + expect(res.stdout.length).toBeLessThanOrEqual(4096) + }) +}) + +describe('SandboxHarness treats a timed-out test phase as a hard fail', () => { + // A driver whose test phase was killed by timeout but whose SIGKILLed + // child reported exit 0 AND whose stdout would parse as "all passed." + // This is the exact masquerade: before the fix the harness scored it a + // pass. + const masqueradeDriver: SandboxDriver = { + id: 'masquerade', + async exec(phase): Promise { + if (phase === 'test') { + return { + phase, + exitCode: 0, + stdout: 'Tests 3 passed', + stderr: '', + wallMs: 1, + killedByTimeout: true, + } + } + return { phase, exitCode: 0, stdout: '', stderr: '', wallMs: 1 } + }, + } + + it('a timed-out test phase fails the harness regardless of exit code', async () => { + const harness = new SandboxHarness(masqueradeDriver) + const config: HarnessConfig = { testCommand: 'pnpm test', testParser: vitestTestParser } + const result = await harness.run(config, stubEmitter()) + expect(result.passed).toBe(false) + expect(result.score).toBe(0) + }) + + it('a clean test phase with a real parse still passes (no false negatives)', async () => { + const cleanDriver: SandboxDriver = { + id: 'clean', + async exec(phase): Promise { + return { + phase, + exitCode: 0, + stdout: 'Tests 4 passed', + stderr: '', + wallMs: 1, + } + }, + } + const harness = new SandboxHarness(cleanDriver) + const config: HarnessConfig = { testCommand: 'pnpm test', testParser: vitestTestParser } + const result = await harness.run(config, stubEmitter()) + expect(result.passed).toBe(true) + expect(result.score).toBe(1) + }) + + it('a timed-out setup phase fails fast without running test', async () => { + let testRan = false + const setupTimeoutDriver: SandboxDriver = { + id: 'setup-timeout', + async exec(phase): Promise { + if (phase === 'test') testRan = true + if (phase === 'setup') { + return { phase, exitCode: 0, stdout: '', stderr: '', wallMs: 1, killedByTimeout: true } + } + return { phase, exitCode: 0, stdout: '', stderr: '', wallMs: 1 } + }, + } + const harness = new SandboxHarness(setupTimeoutDriver) + const config: HarnessConfig = { setupCommand: 'install', testCommand: 'test' } + const result = await harness.run(config, stubEmitter()) + expect(result.passed).toBe(false) + expect(testRan).toBe(false) + }) +}) diff --git a/src/sandbox-harness.ts b/src/sandbox-harness.ts index 86471230..b3226ede 100644 --- a/src/sandbox-harness.ts +++ b/src/sandbox-harness.ts @@ -27,6 +27,12 @@ export interface HarnessConfig { cwd?: string /** Max wall-clock per phase in ms. Default 10 minutes. */ timeoutMs?: number + /** + * Cap on captured stdout+stderr bytes per phase. A runaway process can + * otherwise grow the in-memory buffer without bound. Once hit, further + * output is dropped and `outputTruncated` is set. Default 16 MiB. + */ + maxOutputBytes?: number /** Image for the docker driver. */ image?: string /** Extra env vars (validated; shell-escaped). */ @@ -52,6 +58,19 @@ export interface SandboxResult { wallMs: number testsTotal?: number testsPassed?: number + /** + * True when the process was killed because it exceeded `timeoutMs`. A + * SIGKILLed child can still close with exit code 0; callers MUST treat + * a timed-out phase as a hard failure regardless of `exitCode`, never + * as a pass. `undefined`/`false` means the process completed on its own. + */ + killedByTimeout?: boolean + /** + * True when captured stdout/stderr hit `maxOutputBytes` and further + * output was dropped. The result is still returned (the process was + * not killed for this), but downstream parsers see truncated text. + */ + outputTruncated?: boolean } export interface SandboxDriver { @@ -162,51 +181,126 @@ export class SubprocessSandboxDriver implements SandboxDriver { // cwd when only the constructor arg is supplied. const effectiveCwd = config.cwd ?? this.defaultCwd const effectiveEnv = { ...process.env, ...(this.defaultEnv ?? {}), ...(config.env ?? {}) } + const maxOutputBytes = config.maxOutputBytes ?? 16 * 1024 * 1024 return await new Promise((resolve) => { const child = spawn(command, { shell: true, cwd: effectiveCwd, env: effectiveEnv, + // Own process group so a timeout can SIGKILL the whole tree, not + // just the shell — otherwise a runaway grandchild keeps the pipes + // open and `close` never fires (the timeout silently does nothing). + detached: process.platform !== 'win32', }) let stdout = '' let stderr = '' - child.stdout?.on('data', (d) => { - stdout += String(d) - }) - child.stderr?.on('data', (d) => { - stderr += String(d) - }) - const timeout = setTimeout( - () => { - try { - child.kill('SIGKILL') - } catch (err) { - console.warn('[sandbox-harness] SIGKILL on timeout failed:', err) - } - }, - config.timeoutMs ?? 10 * 60_000, - ) - child.on('close', (code) => { + let outputBytes = 0 + let outputTruncated = false + let killedByTimeout = false + let settled = false + + const onStdout = (d: unknown) => { + const chunk = capture(String(d)) + if (chunk) stdout += chunk + } + const onStderr = (d: unknown) => { + const chunk = capture(String(d)) + if (chunk) stderr += chunk + } + // Bound the in-memory buffer so a runaway process can't OOM the + // harness. Once the cap is hit we keep draining the streams (to let + // the child reach `close`) but discard the bytes. + function capture(s: string): string { + if (outputBytes >= maxOutputBytes) { + outputTruncated = true + return '' + } + const room = maxOutputBytes - outputBytes + if (s.length > room) { + outputBytes = maxOutputBytes + outputTruncated = true + return s.slice(0, room) + } + outputBytes += s.length + return s + } + + child.stdout?.on('data', onStdout) + child.stderr?.on('data', onStderr) + + const cleanup = () => { clearTimeout(timeout) + if (forceResolve) clearTimeout(forceResolve) + child.stdout?.off('data', onStdout) + child.stderr?.off('data', onStderr) + child.removeAllListeners('close') + child.removeAllListeners('error') + } + + // Resolve exactly once. `code`/`signal` come from `close`; on the + // timeout path we force a non-zero exit so a SIGKILLed child that + // reports exit 0 can never read as a clean pass downstream. + const finish = (outcome: { code: number | null; runnerError?: string }) => { + if (settled) return + settled = true + cleanup() const wallMs = Date.now() - start + const exitCode = killedByTimeout + ? outcome.code && outcome.code !== 0 + ? outcome.code + : 124 + : (outcome.code ?? 1) const parsed = - phase === 'test' && config.testParser - ? config.testParser.parse(stdout, stderr, code ?? 1) + !killedByTimeout && phase === 'test' && config.testParser + ? config.testParser.parse(stdout, stderr, exitCode) : undefined resolve({ phase, - exitCode: code ?? 1, + exitCode, stdout, - stderr, + stderr: outcome.runnerError ? stderr + outcome.runnerError : stderr, wallMs, testsTotal: parsed?.testsTotal, testsPassed: parsed?.testsPassed, + killedByTimeout: killedByTimeout || undefined, + outputTruncated: outputTruncated || undefined, }) + } + + // Kill the whole process group on win32-less platforms (we spawned + // detached). On a clean close this is a no-op. + const killTree = () => { + try { + if (process.platform !== 'win32' && typeof child.pid === 'number') { + process.kill(-child.pid, 'SIGKILL') + } else { + child.kill('SIGKILL') + } + } catch (err) { + console.warn('[sandbox-harness] SIGKILL on timeout failed:', err) + } + } + + let forceResolve: ReturnType | undefined + const timeout = setTimeout( + () => { + killedByTimeout = true + killTree() + // Give `close` a brief window to fire (flushing any final output) + // after the group dies; if an orphaned grandchild keeps the pipes + // open, force-resolve so the harness can't hang on a runaway. + forceResolve = setTimeout(() => finish({ code: null }), 2000) + }, + config.timeoutMs ?? 10 * 60_000, + ) + + child.on('close', (code) => { + if (forceResolve) clearTimeout(forceResolve) + finish({ code }) }) child.on('error', (err) => { - clearTimeout(timeout) - const wallMs = Date.now() - start - resolve({ phase, exitCode: 127, stdout, stderr: stderr + String(err), wallMs }) + if (forceResolve) clearTimeout(forceResolve) + finish({ code: 127, runnerError: String(err) }) }) }) } @@ -266,8 +360,11 @@ export class SandboxHarness { if (config.setupCommand) { result.setup = await this.driver.exec('setup', config.setupCommand, config) result.totalWallMs += result.setup.wallMs - if (result.setup.exitCode !== 0) { - await handle.fail(`setup failed (exit ${result.setup.exitCode})`, { + if (result.setup.killedByTimeout || result.setup.exitCode !== 0) { + const reason = result.setup.killedByTimeout + ? `setup timed out after ${result.setup.wallMs}ms` + : `setup failed (exit ${result.setup.exitCode})` + await handle.fail(reason, { exitCode: result.setup.exitCode, wallMs: result.totalWallMs, } as Partial) @@ -277,8 +374,11 @@ export class SandboxHarness { if (config.runCommand) { result.run = await this.driver.exec('run', config.runCommand, config) result.totalWallMs += result.run.wallMs - if (result.run.exitCode !== 0) { - await handle.fail(`run failed (exit ${result.run.exitCode})`, { + if (result.run.killedByTimeout || result.run.exitCode !== 0) { + const reason = result.run.killedByTimeout + ? `run timed out after ${result.run.wallMs}ms` + : `run failed (exit ${result.run.exitCode})` + await handle.fail(reason, { exitCode: result.run.exitCode, wallMs: result.totalWallMs, } as Partial) @@ -288,7 +388,9 @@ export class SandboxHarness { if (config.testCommand) { result.test = await this.driver.exec('test', config.testCommand, config) result.totalWallMs += result.test.wallMs - const passed = result.test.exitCode === 0 + // A timed-out test phase is a hard fail regardless of the exit code + // a SIGKILLed child happens to report — never let it score as a pass. + const passed = !result.test.killedByTimeout && result.test.exitCode === 0 result.passed = passed if (result.test.testsTotal !== undefined && result.test.testsTotal > 0) { result.score = (result.test.testsPassed ?? 0) / result.test.testsTotal diff --git a/src/sandbox-pool.test.ts b/src/sandbox-pool.test.ts new file mode 100644 index 00000000..a43e3d7b --- /dev/null +++ b/src/sandbox-pool.test.ts @@ -0,0 +1,196 @@ +import { describe, expect, it, vi } from 'vitest' +import { createSandboxPool, type SlotFactory } from './sandbox-pool' + +// A tick helper: let queued microtasks (the async release IIFE) drain. +const flush = () => new Promise((r) => setTimeout(r, 0)) + +describe('createSandboxPool reset-failure handling', () => { + it('destroys and replaces a slot whose reset failed — never recycles it dirty', async () => { + const created: string[] = [] + const destroyed: string[] = [] + let resetCalls = 0 + + const factory: SlotFactory = { + async create(slotId) { + created.push(slotId) + return `res-${slotId}` + }, + async reset() { + resetCalls++ + // Fail the reset on the FIRST release. A dirty slot must not be + // handed to the next waiter; it must be destroyed and replaced. + if (resetCalls === 1) throw new Error('reset blew up') + }, + async destroy(slot) { + destroyed.push(slot.id) + }, + } + + const warn = vi.spyOn(console, 'warn').mockImplementation(() => {}) + try { + const pool = createSandboxPool({ size: 1, factory }) + + const a = await pool.checkout() + const firstId = a.slot.id + a.release() + await flush() + + // The dirty slot must have been destroyed, not recycled. + expect(destroyed).toContain(firstId) + + // The next checkout must hand back a FRESH slot (new id), proving the + // pool did not recycle the dirty one. + const b = await pool.checkout() + expect(b.slot.id).not.toBe(firstId) + expect(created.length).toBe(2) + b.release() + await flush() + } finally { + warn.mockRestore() + } + }) + + it('a waiter queued behind a failing-reset slot is satisfied by a fresh slot', async () => { + let resetCalls = 0 + const factory: SlotFactory = { + async create() { + return resetCalls + }, + async reset() { + resetCalls++ + if (resetCalls === 1) throw new Error('reset failed') + }, + async destroy() {}, + } + + const warn = vi.spyOn(console, 'warn').mockImplementation(() => {}) + try { + const pool = createSandboxPool({ size: 1, factory }) + + const a = await pool.checkout() + // Queue a second checkout while the only slot is busy. + const waiterP = pool.checkout() + // Release the first; its reset fails → slot destroyed → waiter must + // be served by a freshly minted slot rather than hang forever. + a.release() + + const b = await waiterP + expect(b.slot.id).toBeDefined() + b.release() + await flush() + } finally { + warn.mockRestore() + } + }) + + it('a successful reset recycles the same slot (no needless churn)', async () => { + const destroyed: string[] = [] + const factory: SlotFactory = { + async create(slotId) { + return slotId + }, + async reset() {}, + async destroy(slot) { + destroyed.push(slot.id) + }, + } + const pool = createSandboxPool({ size: 1, factory }) + + const a = await pool.checkout() + const id = a.slot.id + a.release() + await flush() + + const b = await pool.checkout() + expect(b.slot.id).toBe(id) + expect(destroyed).not.toContain(id) + b.release() + await flush() + }) + + it('drain rejects a pending waiter instead of hanging forever', async () => { + const factory: SlotFactory = { + async create(slotId) { + return slotId + }, + async reset() {}, + async destroy() {}, + } + const pool = createSandboxPool({ size: 1, factory }) + + const a = await pool.checkout() + const waiterP = pool.checkout() + // Drain while a is still held and a waiter is queued. + await pool.drain() + await expect(waiterP).rejects.toThrow(/drained/) + a.release() + await flush() + }) + + it('does not raise an unhandled rejection when destroy also fails', async () => { + let resetCalls = 0 + const factory: SlotFactory = { + async create(slotId) { + return slotId + }, + async reset() { + resetCalls++ + if (resetCalls === 1) throw new Error('reset failed') + }, + async destroy() { + throw new Error('destroy failed too') + }, + } + const warn = vi.spyOn(console, 'warn').mockImplementation(() => {}) + const unhandled: unknown[] = [] + const onUnhandled = (e: unknown) => unhandled.push(e) + process.on('unhandledRejection', onUnhandled) + try { + const pool = createSandboxPool({ size: 1, factory }) + const a = await pool.checkout() + a.release() + await flush() + await flush() + expect(unhandled).toHaveLength(0) + } finally { + process.off('unhandledRejection', onUnhandled) + warn.mockRestore() + } + }) + + it('does not over-mint under concurrent first checkouts (capacity reservation)', async () => { + let created = 0 + const factory: SlotFactory = { + async create(slotId) { + created++ + await Promise.resolve() + return `res-${slotId}` + }, + async reset() {}, + async destroy() {}, + } + const pool = createSandboxPool({ size: 1, factory }) + + // Two checkouts race before the first slot is push()ed. The mint gate reads + // slots.length, still 0 for both — without a reservation counter both mint + // and the size-1 pool provisions 2 sandboxes. + const aP = pool.checkout() + const bP = pool.checkout() + const a = await aP + + // The second checkout must be QUEUED (pending), not a second mint. Race it + // against a macrotask: pre-fix it resolves immediately on its own slot. + const state = await Promise.race([bP.then(() => 'resolved'), flush().then(() => 'pending')]) + expect(state).toBe('pending') + expect(created).toBe(1) + expect(pool.poolSize()).toBe(1) + + // Releasing the one slot hands it to the queued waiter — still no 2nd mint. + a.release() + const b = await bP + expect(created).toBe(1) + expect(pool.poolSize()).toBe(1) + b.release() + await flush() + }) +}) diff --git a/src/sandbox-pool.ts b/src/sandbox-pool.ts index 3d24ab93..38d09199 100644 --- a/src/sandbox-pool.ts +++ b/src/sandbox-pool.ts @@ -79,11 +79,22 @@ export function createSandboxPool(opts: CreateSandboxPoolOpts): SandboxPoo if (opts.size < 1) throw new Error(`sandbox pool size must be >= 1 (got ${opts.size})`) const slots: SlotState[] = [] - const waiters: Array<(s: SlotState) => void> = [] + interface Waiter { + resolve: (s: SlotState) => void + reject: (err: unknown) => void + } + const waiters: Waiter[] = [] const mutex = new Mutex() let nextSlotId = 0 + // In-flight mints: a slot is decided under the mutex but only push()ed into + // `slots` AFTER the un-locked factory.create(). Counting reservations against + // capacity in the SAME critical section as the gate stops two concurrent + // acquires from both minting past `size` (slots.length alone is still 0 for + // both until the first push lands). + let reserving = 0 let totalCheckouts = 0 let busyMs = 0 + let drained = false const startedAt = Date.now() /** @@ -100,52 +111,113 @@ export function createSandboxPool(opts: CreateSandboxPoolOpts): SandboxPoo idle.busy = true return idle } - if (slots.length < opts.size) { - // Reserve a slot ID synchronously so concurrent acquireSlot - // calls don't all decide to mint past the cap. + if (slots.length + reserving < opts.size) { + // Reserve capacity synchronously (live slots + in-flight mints) so + // concurrent acquireSlot calls can't all decide to mint past the cap. mintId = `slot_${nextSlotId++}` + reserving++ return null } return null }) if (ready) return ready if (mintId !== undefined) { - const resource = await opts.factory.create(mintId) + let resource: T + try { + resource = await opts.factory.create(mintId) + } catch (err) { + // Release the reservation and hand the freed capacity to a waiter, + // otherwise a failed create leaks a capacity slot forever. + await mutex.runExclusive(() => { + reserving-- + }) + if (!drained) wakeWaiterToMint() + throw err + } const state: SlotState = { slot: { id: mintId, resource }, busy: true, } await mutex.runExclusive(() => { slots.push(state) + reserving-- }) return state } // All slots busy + at cap: queue. - return new Promise>((resolve) => { - waiters.push((s) => { - s.busy = true - resolve(s) - }) + return new Promise>((resolve, reject) => { + waiters.push({ resolve, reject }) }) } + /** + * Hand `state` (clean + idle) to the next queued waiter, or leave it + * idle if none are waiting. Caller must have already marked the slot + * not-busy. Runs under no lock; waiters are FIFO. + */ + function handOffCleanSlot(state: SlotState): void { + const next = waiters.shift() + if (next) { + state.busy = true + next.resolve(state) + } + } + + /** + * Wake a queued waiter when a slot was destroyed (so capacity freed up + * but no clean slot is on hand). The waiter re-enters acquisition and + * mints a fresh slot. Any acquisition failure is routed to that waiter + * — never dropped on the floor. + */ + function wakeWaiterToMint(): void { + const next = waiters.shift() + if (!next) return + acquireSlot().then(next.resolve, next.reject) + } + + /** + * Release a checked-out slot. Reset is async; we kick it off and the + * slot only becomes available to the next waiter once reset lands + * CLEAN. If reset fails, the slot is dirty — destroy and remove it + * rather than recycle a corrupted workspace to the next waiter, then + * free the capacity so a fresh slot can be minted. The whole flow is + * awaited inside one IIFE whose rejection is caught, so a failed + * destroy can't surface as an unhandled rejection. + */ function releaseSlot(state: SlotState): void { - // Non-async release — runs synchronously inside the user's finally. - // Reset is async; we kick it off and let the next waiter see a - // freshly-reset slot once it lands. void (async () => { try { - if (opts.factory.reset) await opts.factory.reset(state.slot) - } catch (err) { - // A failing reset is the consumer's bug; we still release - // (otherwise the pool deadlocks). Surface via console.warn so - // it doesn't get lost. - console.warn(`[sandbox-pool] reset failed for slot ${state.slot.id}:`, err) + if (opts.factory.reset) { + await opts.factory.reset(state.slot) + } + state.busy = false + if (!drained) handOffCleanSlot(state) + } catch (resetErr) { + // Dirty slot: do NOT recycle it. Remove from the pool and tear it + // down, then wake a waiter to mint a clean replacement. + await mutex.runExclusive(() => { + const i = slots.indexOf(state) + if (i !== -1) slots.splice(i, 1) + }) + try { + await opts.factory.destroy(state.slot) + } catch (destroyErr) { + console.warn( + `[sandbox-pool] destroy of dirty slot ${state.slot.id} failed after reset error:`, + destroyErr, + ) + } + console.warn( + `[sandbox-pool] reset failed for slot ${state.slot.id} — slot destroyed, not recycled:`, + resetErr, + ) + if (!drained) wakeWaiterToMint() } - state.busy = false - const next = waiters.shift() - if (next) next(state) - })() + })().catch((err) => { + // Defense in depth: nothing above should throw out here, but if it + // does, surface it instead of producing an unhandled rejection. + console.warn(`[sandbox-pool] release of slot ${state.slot.id} faulted:`, err) + }) } async function checkout(): Promise<{ slot: PoolSlot; release: () => void }> { @@ -171,20 +243,20 @@ export function createSandboxPool(opts: CreateSandboxPoolOpts): SandboxPoo } async function drain(): Promise { + drained = true // Snapshot under lock; destroy outside lock so a slow teardown // doesn't block a concurrent drain caller. - const snapshot = await mutex.runExclusive(() => { + const { snapshot, pending } = await mutex.runExclusive(() => { const taken = slots.splice(0, slots.length) - // Reject any pending waiters — pool is going away. - for (const w of waiters.splice(0, waiters.length)) { - // Best-effort rejection: the waiter is still pending; we - // can't reject a Promise we already resolved. Surface as a - // warning. In practice, drain() is called when nothing's in - // flight (end of run), so this is a defensive no-op. - void w - } - return taken + const queued = waiters.splice(0, waiters.length) + return { snapshot: taken, pending: queued } }) + // Reject any pending waiters — the pool is going away, so their + // checkout promise can never be satisfied. Failing loud beats leaving + // a caller hung on an awaited checkout() forever. + for (const w of pending) { + w.reject(new Error('sandbox pool drained while a checkout was pending')) + } await Promise.allSettled(snapshot.map((s) => opts.factory.destroy(s.slot))) } diff --git a/src/trace-analyst/behavioral-metrics.test.ts b/src/trace-analyst/behavioral-metrics.test.ts index 8b26119e..002bc89d 100644 --- a/src/trace-analyst/behavioral-metrics.test.ts +++ b/src/trace-analyst/behavioral-metrics.test.ts @@ -98,6 +98,29 @@ describe('computeTraceMetrics — deterministic behavioral signals (no LLM)', () expect(m.hasSelfVerification).toBe(true) expect(m.signals.map((s) => s.code)).not.toContain('no-self-verification') }) + + it('FIRES monotonic-input-growth on a 0→huge blowup (first call reported 0 input tokens)', () => { + // First LLM call reports 0 input tokens, then context explodes. Ratio is + // unbounded — the old `first > 0 ? last/first : 0` forced growth to 0 and + // silently dropped the most severe possible blowup. + const spans = [llmSpan(1, 0, 100), llmSpan(2, 4000, 90), llmSpan(3, 9000, 80)] + const sig = computeTraceMetrics(spans).signals.find((s) => s.code === 'monotonic-input-growth') + expect(sig).toBeDefined() + expect(sig!.severity).toBe('high') + // Displayed ratio must stay sane (no NaN/Infinity literal leaking into the message). + expect(sig!.detail).not.toContain('Infinity') + expect(sig!.detail).not.toContain('NaN') + expect(sig!.detail).toContain('0→9000') + expect(sig!.evidence.first).toBe(0) + expect(sig!.evidence.last).toBe(9000) + expect(sig!.evidence.growth_x).toBe('unbounded') + }) + + it('does NOT fire monotonic-input-growth when input stays at 0 (no blowup)', () => { + const spans = [llmSpan(1, 0, 100), llmSpan(2, 0, 90), llmSpan(3, 0, 80)] + const codes = computeTraceMetrics(spans).signals.map((s) => s.code) + expect(codes).not.toContain('monotonic-input-growth') + }) }) describe('deriveEfficiencyFindings — the 0→4, any-model flip', () => { diff --git a/src/trace-analyst/behavioral-metrics.ts b/src/trace-analyst/behavioral-metrics.ts index ef34f939..e15c31b0 100644 --- a/src/trace-analyst/behavioral-metrics.ts +++ b/src/trace-analyst/behavioral-metrics.ts @@ -105,16 +105,21 @@ export function computeTraceMetrics(spans: readonly TraceAnalystSpan[]): Behavio if (inputTokenTrajectory.length >= 3) { const first = inputTokenTrajectory[0]! const last = inputTokenTrajectory[inputTokenTrajectory.length - 1]! - const growth = first > 0 ? last / first : 0 + // first === 0 with later growth is an unbounded ratio (0→huge context blowup); + // treat it as infinite so the signal fires, and report it as such instead of + // dividing by zero for the displayed factor. + const growthFromZero = first === 0 && last > 0 + const growth = growthFromZero ? Infinity : first > 0 ? last / first : 0 if (last > first && growth >= INPUT_GROWTH_FACTOR) { + const growthLabel = growthFromZero ? '0→nonzero (unbounded)' : `${growth.toFixed(1)}x` signals.push({ code: 'monotonic-input-growth', severity: 'high', - detail: `LLM input tokens grew ${growth.toFixed(1)}x (${first}→${last}) across ${inputTokenTrajectory.length} calls — full history re-sent each step with no compression.`, + detail: `LLM input tokens grew ${growthLabel} (${first}→${last}) across ${inputTokenTrajectory.length} calls — full history re-sent each step with no compression.`, evidence: { first, last, - growth_x: Number(growth.toFixed(2)), + growth_x: growthFromZero ? 'unbounded' : Number(growth.toFixed(2)), calls: inputTokenTrajectory.length, }, }) diff --git a/src/trace-analyst/otlp-span.test.ts b/src/trace-analyst/otlp-span.test.ts new file mode 100644 index 00000000..9696664c --- /dev/null +++ b/src/trace-analyst/otlp-span.test.ts @@ -0,0 +1,96 @@ +import { describe, expect, it } from 'vitest' +import { projectOtlpFlatLine } from './otlp-span' + +describe('projectOtlpFlatLine', () => { + it('returns null when trace_id is missing', () => { + expect(projectOtlpFlatLine({ span_id: 's1' })).toBeNull() + }) + + it('returns null when span_id is missing', () => { + expect(projectOtlpFlatLine({ trace_id: 't1' })).toBeNull() + }) + + it('projects a line that has both trace_id and span_id', () => { + const out = projectOtlpFlatLine({ trace_id: 't1', span_id: 's1' }) + expect(out).not.toBeNull() + expect(out!.trace_id).toBe('t1') + expect(out!.span_id).toBe('s1') + }) + + it('resolves camelCase pivots (traceId/spanId/parentSpanId/startTime/endTime)', () => { + const out = projectOtlpFlatLine({ + traceId: 'tc', + spanId: 'sc', + parentSpanId: 'pc', + startTime: '2024-01-01T00:00:00.000Z', + endTime: '2024-01-01T00:00:01.000Z', + }) + expect(out).not.toBeNull() + expect(out!.trace_id).toBe('tc') + expect(out!.span_id).toBe('sc') + expect(out!.parent_span_id).toBe('pc') + expect(out!.duration_ms).toBe(1000) + }) + + it('resolves snake_case pivots (trace_id/span_id/parent_span_id/start_time/end_time)', () => { + const out = projectOtlpFlatLine({ + trace_id: 'ts', + span_id: 'ss', + parent_span_id: 'ps', + start_time: '2024-01-01T00:00:00.000Z', + end_time: '2024-01-01T00:00:02.500Z', + }) + expect(out).not.toBeNull() + expect(out!.parent_span_id).toBe('ps') + expect(out!.duration_ms).toBe(2500) + }) + + it('span attributes override resource attributes on key overlap', () => { + const out = projectOtlpFlatLine({ + trace_id: 't', + span_id: 's', + resource: { attributes: { 'service.name': 'from-resource', 'only.resource': 'r' } }, + attributes: { 'service.name': 'from-span', 'only.span': 'sp' }, + }) + expect(out).not.toBeNull() + // span attribute wins + expect(out!.attributes['service.name']).toBe('from-span') + expect(out!.service_name).toBe('from-span') + // non-overlapping keys from both layers survive + expect(out!.attributes['only.resource']).toBe('r') + expect(out!.attributes['only.span']).toBe('sp') + }) + + it('clamps duration to 0 when timestamps are unparseable', () => { + const out = projectOtlpFlatLine({ + trace_id: 't', + span_id: 's', + start_time: 'not-a-date', + end_time: 'also-not-a-date', + }) + expect(out).not.toBeNull() + expect(out!.duration_ms).toBe(0) + }) + + it('clamps duration to 0 when end precedes start (never negative)', () => { + const out = projectOtlpFlatLine({ + trace_id: 't', + span_id: 's', + start_time: '2024-01-01T00:00:05.000Z', + end_time: '2024-01-01T00:00:01.000Z', + }) + expect(out).not.toBeNull() + expect(out!.duration_ms).toBe(0) + }) + + it('defaults end_time to start_time when end is absent (duration 0)', () => { + const out = projectOtlpFlatLine({ + trace_id: 't', + span_id: 's', + start_time: '2024-01-01T00:00:05.000Z', + }) + expect(out).not.toBeNull() + expect(out!.duration_ms).toBe(0) + expect(out!.end_time).toBe('2024-01-01T00:00:05.000Z') + }) +}) diff --git a/src/trace-analyst/otlp-span.ts b/src/trace-analyst/otlp-span.ts index 8697a829..6a5d86d2 100644 --- a/src/trace-analyst/otlp-span.ts +++ b/src/trace-analyst/otlp-span.ts @@ -216,3 +216,25 @@ export function firstStringAttr( } return null } + +/** + * Parse a span timestamp to epoch millis, or null when empty/unparseable. The + * OTLP readers accept BOTH ISO-8601 and epoch-millis-string dialects, so raw + * string comparison (`<`, `localeCompare`) mis-orders across dialects and + * `Date.parse` returns NaN for a bare epoch-millis string. + */ +export function spanEpochMillis(ts: string | undefined | null): number | null { + if (!ts) return null + if (/^\d+$/.test(ts)) return Number(ts) + const n = Date.parse(ts) + return Number.isNaN(n) ? null : n +} + +/** + * Order comparator for span timestamps across mixed ISO/epoch dialects. + * Unparseable timestamps sort as epoch 0 (earliest), never NaN (which would + * make the sort non-deterministic). + */ +export function compareSpanTime(a: string, b: string): number { + return (spanEpochMillis(a) ?? 0) - (spanEpochMillis(b) ?? 0) +} diff --git a/src/trace-analyst/otlp-to-run-records.timestamps.test.ts b/src/trace-analyst/otlp-to-run-records.timestamps.test.ts new file mode 100644 index 00000000..7571384e --- /dev/null +++ b/src/trace-analyst/otlp-to-run-records.timestamps.test.ts @@ -0,0 +1,54 @@ +import { describe, expect, it } from 'vitest' +import { otlpToRunRecords } from './otlp-to-run-records' + +// One OTLP-flat span line with sane defaults. +function line(o: Record): string { + return JSON.stringify({ status: { code: 'OK' }, attributes: {}, ...o }) +} +const opts = { experimentId: 'e', candidateId: 'c' } as const + +describe('otlpToRunRecords timestamp handling (epoch + ISO dialects)', () => { + it('computes wallMs from epoch-millis-string timestamps — Date.parse would NaN→0', () => { + const jsonl = line({ + trace_id: 't1', + span_id: 'a', + name: 'llm', + start_time: '1700000000000', + end_time: '1700000005000', + attributes: { + 'openinference.span.kind': 'LLM', + 'llm.input_tokens': 10, + 'llm.output_tokens': 2, + }, + }) + const recs = otlpToRunRecords(jsonl, opts) + expect(recs).toHaveLength(1) + // Pre-fix: Date.parse('1700000000000') is NaN → wallMs silently 0. + expect(recs[0]!.wallMs).toBe(5000) + }) + + it('orders trace bounds by real time across mixed ISO + epoch dialects', () => { + const jsonl = [ + line({ + trace_id: 't2', + span_id: 'older', + name: 'llm', + start_time: '1700000000000', + end_time: '1700000001000', + attributes: { 'openinference.span.kind': 'LLM' }, + }), + line({ + trace_id: 't2', + span_id: 'newer', + name: 'llm', + start_time: '2024-01-01T00:00:00.000Z', + end_time: '2024-01-01T00:00:10.000Z', + attributes: { 'openinference.span.kind': 'LLM' }, + }), + ].join('\n') + const recs = otlpToRunRecords(jsonl, opts) + expect(recs).toHaveLength(1) + const expected = Date.parse('2024-01-01T00:00:10.000Z') - 1_700_000_000_000 + expect(recs[0]!.wallMs).toBe(expected) + }) +}) diff --git a/src/trace-analyst/otlp-to-run-records.ts b/src/trace-analyst/otlp-to-run-records.ts index c5d354c9..420173f4 100644 --- a/src/trace-analyst/otlp-to-run-records.ts +++ b/src/trace-analyst/otlp-to-run-records.ts @@ -40,10 +40,12 @@ import { validateRunRecord, } from '../run-record' import { + compareSpanTime, firstNumberAttr, firstStringAttr, type ProjectedOtlpSpan, projectOtlpFlatLine, + spanEpochMillis, } from './otlp-span' /** Candidate attribute keys for LLM input (prompt) tokens, both dialects. */ @@ -277,9 +279,10 @@ function aggregateTrace( spans: ProjectedOtlpSpan[], fallbackModel: string, ): TraceAggregate { - // Span order within a trace is by start time, then a stable tiebreak. + // Span order within a trace is by start time (epoch-aware across ISO/epoch + // dialects), then a stable tiebreak. const ordered = [...spans].sort( - (a, b) => a.start_time.localeCompare(b.start_time) || a.span_id.localeCompare(b.span_id), + (a, b) => compareSpanTime(a.start_time, b.start_time) || a.span_id.localeCompare(b.span_id), ) let input = 0 @@ -296,8 +299,9 @@ function aggregateTrace( let latest = ordered[0]?.end_time ?? '' for (const s of ordered) { - if (s.start_time && (!earliest || s.start_time < earliest)) earliest = s.start_time - if (s.end_time && s.end_time > latest) latest = s.end_time + if (s.start_time && (!earliest || compareSpanTime(s.start_time, earliest) < 0)) + earliest = s.start_time + if (s.end_time && (!latest || compareSpanTime(s.end_time, latest) > 0)) latest = s.end_time if (s.kind === 'LLM') { llmSpanCount += 1 @@ -331,11 +335,11 @@ function aggregateTrace( const model = topVote(modelVotes) ?? firstModelAttr(ordered) ?? fallbackModel let wallMs = 0 - if (earliest && latest) { - const a = Date.parse(earliest) - const b = Date.parse(latest) - if (!Number.isNaN(a) && !Number.isNaN(b)) wallMs = Math.max(0, b - a) - } + // epoch-aware: Date.parse returns NaN for a bare epoch-millis string, which + // silently zeroed wallMs for traces emitted with epoch timestamps. + const a = spanEpochMillis(earliest) + const b = spanEpochMillis(latest) + if (a !== null && b !== null) wallMs = Math.max(0, b - a) const tokenUsage: RunTokenUsage = sawCached ? { input, output, cached } : { input, output } @@ -403,7 +407,9 @@ function extractPromptCompletion(spans: ProjectedOtlpSpan[]): { } { const llm = spans .filter((s) => s.kind === 'LLM') - .sort((a, b) => a.start_time.localeCompare(b.start_time) || a.span_id.localeCompare(b.span_id)) + .sort( + (a, b) => compareSpanTime(a.start_time, b.start_time) || a.span_id.localeCompare(b.span_id), + ) if (llm.length === 0) return {} const promptText = firstStringAttr(llm[0]!.attributes, ['input.value', 'llm.input_messages', 'gen_ai.prompt']) ?? diff --git a/src/trace-analyst/store-otlp.test.ts b/src/trace-analyst/store-otlp.test.ts index f94ffadc..b2500c58 100644 --- a/src/trace-analyst/store-otlp.test.ts +++ b/src/trace-analyst/store-otlp.test.ts @@ -11,7 +11,12 @@ import { join } from 'node:path' import { describe, expect, it } from 'vitest' import { compileSearchRegex } from './store' -import { OtlpFileTraceStore, TraceFileMissingError, TraceNotFoundError } from './store-otlp' +import { + OtlpFileTraceStore, + TraceFileMissingError, + TraceFileTooLargeError, + TraceNotFoundError, +} from './store-otlp' const TINY_FIXTURE = new URL('../../tests/fixtures/trace-analyst/tiny-trace.jsonl', import.meta.url) .pathname @@ -177,7 +182,9 @@ describe('OtlpFileTraceStore', () => { max_matches: 2, }) expect(result.hits.length).toBe(2) - expect(result.total_matches).toBeGreaterThan(result.hits.length) + // Capped: total_matches mirrors the hit count (no fabricated +1); the + // "more exist" signal lives in has_more, not in an invented total. + expect(result.total_matches).toBe(result.hits.length) expect(result.has_more).toBe(true) }) @@ -195,7 +202,8 @@ describe('OtlpFileTraceStore', () => { }) expect(result.hits).toHaveLength(3) expect(result.has_more).toBe(true) - expect(result.total_matches).toBeGreaterThan(3) + // Capped: no fabricated total — mirrors hits, has_more carries the signal. + expect(result.total_matches).toBe(3) }) it('searchSpan returns hits scoped to one span only', async () => { @@ -310,3 +318,182 @@ describe('OtlpFileTraceStore — error_clusters (deterministic failure coverage) expect(Array.isArray(overview.error_clusters)).toBe(true) }) }) + +describe('OtlpFileTraceStore — concurrent reads (per-call truncation counter)', () => { + // Each span carries a single oversized string attribute so a tiny + // per-attribute budget forces exactly ONE truncation per projected span. + const bigSpan = (trace: string, span: string) => + JSON.stringify({ + trace_id: trace, + span_id: span, + parent_span_id: '', + name: 'op', + kind: 'SPAN_KIND_INTERNAL', + start_time: '2026-05-29T00:00:00Z', + end_time: '2026-05-29T00:00:01Z', + status: { code: 'STATUS_CODE_OK' }, + resource: {}, + attributes: { 'openinference.span.kind': 'TOOL', 'input.value': 'Z'.repeat(5000) }, + }) + + it('reports each call its own truncation count under interleaved reads — bug class: store-keyed delta cross-contaminates concurrent calls', async () => { + const tmp = await mkdtemp(join(tmpdir(), 'trace-concurrency-')) + const path = join(tmp, 'trace.jsonl') + try { + // Trace A: 1 span (→ 1 truncation). Trace B: 3 spans (→ 3 truncations). + const lines = [ + bigSpan('tA', 'a1'), + bigSpan('tB', 'b1'), + bigSpan('tB', 'b2'), + bigSpan('tB', 'b3'), + ] + await writeFile(path, `${lines.join('\n')}\n`) + const store = new OtlpFileTraceStore({ path, perAttributeSpanBudget: 100 }) + + // Fire both reads on the SAME store concurrently. With the old + // before/after delta on a store-keyed WeakMap the awaits interleave + // and each call absorbs the other's increments. + const [a, b] = await Promise.all([ + store.viewSpans({ trace_id: 'tA', span_ids: ['a1'] }), + store.viewSpans({ trace_id: 'tB', span_ids: ['b1', 'b2', 'b3'] }), + ]) + expect(a.truncated_attribute_count).toBe(1) + expect(b.truncated_attribute_count).toBe(3) + + // Repeated interleavings stay stable — no accumulation, no bleed. + for (let i = 0; i < 5; i += 1) { + const [x, y] = await Promise.all([ + store.viewSpans({ trace_id: 'tA', span_ids: ['a1'] }), + store.viewSpans({ trace_id: 'tB', span_ids: ['b1', 'b2', 'b3'] }), + ]) + expect(x.truncated_attribute_count).toBe(1) + expect(y.truncated_attribute_count).toBe(3) + } + } finally { + await rm(tmp, { recursive: true, force: true }) + } + }) +}) + +describe('OtlpFileTraceStore — searchTrace total_matches honesty', () => { + it('reports the EXACT total when uncapped — total equals hit count, has_more is false', async () => { + const store = new OtlpFileTraceStore({ path: TINY_FIXTURE }) + // 'STATUS_CODE_ERROR' occurs once in t000000000001; uncapped scan. + const r = await store.searchTrace({ + trace_id: 't000000000001', + regex_pattern: 'STATUS_CODE_ERROR', + max_matches: 50, + }) + expect(r.total_matches).toBe(r.hits.length) + expect(r.total_matches).toBe(1) + expect(r.has_more).toBe(false) + }) + + it('does NOT fabricate total_matches when capped — bug class: total = max(total, hits+1) invents a number', async () => { + const store = new OtlpFileTraceStore({ path: TINY_FIXTURE }) + // 'span_id' appears in every span line — easily exceeds the cap. + const r = await store.searchTrace({ + trace_id: 't000000000001', + regex_pattern: 'span_id', + max_matches: 2, + }) + expect(r.hits.length).toBe(2) + // Old behavior: total_matches = hits.length + 1 = 3 (a fabricated + // count). New behavior: mirror the hit count and lean on has_more. + expect(r.total_matches).toBe(2) + expect(r.has_more).toBe(true) + }) +}) + +describe('OtlpFileTraceStore — bestAttributePathForOffset escaped quotes', () => { + it('resolves the real attribute_path past a \\"-escaped quote inside the value — bug class: backward scan stops at an escaped quote and returns garbage', async () => { + const tmp = await mkdtemp(join(tmpdir(), 'trace-escq-')) + const path = join(tmp, 'trace.jsonl') + try { + // output.value is itself a JSON string: its escaped inner quotes + // AND inner ':' derail a non-escape-aware backward scan, making it + // report the inner "phase" key instead of the real attribute. + await writeFile( + path, + `${JSON.stringify({ + trace_id: 'tq', + span_id: 's1', + parent_span_id: '', + name: 'op', + kind: 'SPAN_KIND_INTERNAL', + start_time: '2026-05-29T00:00:00Z', + end_time: '2026-05-29T00:00:01Z', + status: { code: 'STATUS_CODE_OK' }, + resource: {}, + attributes: { + 'openinference.span.kind': 'TOOL', + 'output.value': '{"phase":"NEEDLE"}', + }, + })}\n`, + ) + const store = new OtlpFileTraceStore({ path }) + const r = await store.searchSpan({ + trace_id: 'tq', + span_id: 's1', + regex_pattern: 'NEEDLE', + }) + expect(r.hits.length).toBe(1) + expect(r.hits[0]!.matched_text).toBe('NEEDLE') + // The match lives inside output.value; the path must name that key, + // not a fragment of the value or a different field. + expect(r.hits[0]!.attribute_path).toBe('output.value') + } finally { + await rm(tmp, { recursive: true, force: true }) + } + }) +}) + +describe('OtlpFileTraceStore — max-file-size guard (fail loud, no OOM)', () => { + it('throws TraceFileTooLargeError above maxFileBytes instead of reading the whole file', async () => { + const tmp = await mkdtemp(join(tmpdir(), 'trace-toobig-')) + const path = join(tmp, 'trace.jsonl') + try { + const line = JSON.stringify({ + trace_id: 't', + span_id: 's', + parent_span_id: '', + name: 'op', + start_time: '2026-05-29T00:00:00Z', + end_time: '2026-05-29T00:00:01Z', + status: { code: 'STATUS_CODE_OK' }, + resource: {}, + attributes: { 'openinference.span.kind': 'TOOL' }, + }) + await writeFile(path, `${line}\n`) + // File is a few hundred bytes; set the ceiling below it. + const store = new OtlpFileTraceStore({ path, maxFileBytes: 10 }) + await expect(store.ensureIndexed()).rejects.toBeInstanceOf(TraceFileTooLargeError) + } finally { + await rm(tmp, { recursive: true, force: true }) + } + }) + + it('indexes normally when the file is within the configured ceiling', async () => { + const tmp = await mkdtemp(join(tmpdir(), 'trace-okbig-')) + const path = join(tmp, 'trace.jsonl') + try { + const line = JSON.stringify({ + trace_id: 't', + span_id: 's', + parent_span_id: '', + name: 'op', + start_time: '2026-05-29T00:00:00Z', + end_time: '2026-05-29T00:00:01Z', + status: { code: 'STATUS_CODE_OK' }, + resource: {}, + attributes: { 'openinference.span.kind': 'TOOL' }, + }) + await writeFile(path, `${line}\n`) + const store = new OtlpFileTraceStore({ path, maxFileBytes: 1024 * 1024 }) + const overview = await store.getOverview() + expect(overview.total_traces).toBe(1) + } finally { + await rm(tmp, { recursive: true, force: true }) + } + }) +}) diff --git a/src/trace-analyst/store-otlp.timestamps.test.ts b/src/trace-analyst/store-otlp.timestamps.test.ts new file mode 100644 index 00000000..8ad562f2 --- /dev/null +++ b/src/trace-analyst/store-otlp.timestamps.test.ts @@ -0,0 +1,71 @@ +import { mkdtempSync, writeFileSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { describe, expect, it } from 'vitest' +import { OtlpFileTraceStore } from './store-otlp' + +const dir = mkdtempSync(join(tmpdir(), 'otlp-ts-')) + +interface Line { + trace_id: string + span_id: string + name?: string + start_time?: string + end_time?: string + status?: { code: string } + attributes?: Record +} + +function writeOtlp(name: string, lines: Line[]): string { + const p = join(dir, name) + writeFileSync( + p, + `${lines.map((l) => JSON.stringify({ name: 's', status: { code: 'OK' }, attributes: {}, ...l })).join('\n')}\n`, + ) + return p +} + +describe('OtlpFileTraceStore timestamp handling', () => { + it('yields a finite duration (0), never NaN/null, when timestamps are empty', async () => { + const path = writeOtlp('empty.jsonl', [ + { trace_id: 't1', span_id: 'a', start_time: '', end_time: '' }, + { trace_id: 't1', span_id: 'b', start_time: '', end_time: '' }, + ]) + const store = new OtlpFileTraceStore({ path }) + const page = await store.queryTraces({ limit: 10 }) + expect(page.traces[0]!.duration_ms).toBe(0) + expect(Number.isNaN(page.traces[0]!.duration_ms)).toBe(false) + }) + + it('orders the time range by real time across mixed ISO + epoch-millis dialects', async () => { + // epoch-ms 1000 (1970) is earlier than the ISO 2024 timestamp; lexical + // string compare would order them backwards. + const path = writeOtlp('mixed.jsonl', [ + { + trace_id: 't2', + span_id: 'a', + start_time: '2024-01-01T00:00:00.000Z', + end_time: '2024-01-01T00:00:01.000Z', + }, + { trace_id: 't2', span_id: 'b', start_time: '1000', end_time: '2000' }, + ]) + const store = new OtlpFileTraceStore({ path }) + const ov = await store.getOverview() + expect(ov.time_range?.earliest).toBe('1000') + expect(ov.time_range?.latest).toBe('2024-01-01T00:00:01.000Z') + }) + + it('computes a correct positive duration for a normal ISO trace', async () => { + const path = writeOtlp('normal.jsonl', [ + { + trace_id: 't3', + span_id: 'a', + start_time: '2024-01-01T00:00:00.000Z', + end_time: '2024-01-01T00:00:05.000Z', + }, + ]) + const store = new OtlpFileTraceStore({ path }) + const page = await store.queryTraces({ limit: 10 }) + expect(page.traces[0]!.duration_ms).toBe(5000) + }) +}) diff --git a/src/trace-analyst/store-otlp.ts b/src/trace-analyst/store-otlp.ts index 24183c1b..fb146802 100644 --- a/src/trace-analyst/store-otlp.ts +++ b/src/trace-analyst/store-otlp.ts @@ -47,6 +47,34 @@ import { type ViewTraceResult, } from './types' +/** + * Parse a span timestamp to epoch millis, or null when empty/unparseable. The + * OTLP readers accept BOTH ISO-8601 and epoch-millis-string dialects, so raw + * string comparison (`<`, localeCompare) is wrong across dialects, and + * `new Date(str)` is NaN for an epoch-millis string. + */ +function epochOrNull(ts: string): number | null { + if (!ts) return null + if (/^\d+$/.test(ts)) return Number(ts) + const n = Date.parse(ts) + return Number.isNaN(n) ? null : n +} + +/** Ordering key: unparseable timestamps sort as epoch 0 (earliest), never NaN. */ +function epochMs(ts: string): number { + return epochOrNull(ts) ?? 0 +} + +/** Lines indexed between event-loop yields. Bounded synchronous work per + * tick keeps the index build from starving other tasks on large files + * while staying coarse enough that the yields are cheap. */ +const INDEX_YIELD_LINES = 5000 + +/** Hand control back to the event loop without busy-waiting. */ +function yieldToEventLoop(): Promise { + return new Promise((resolve) => setImmediate(resolve)) +} + interface SpanIndexEntry { span_id: string parent_span_id: string | null @@ -101,14 +129,26 @@ export interface OtlpFileTraceStoreOptions { perCallByteCeiling?: number /** Override the per-match text budget. */ perMatchTextBudget?: number + /** + * Hard ceiling on the trace file size in bytes. The store reads the + * whole file into one Buffer and indexes it in memory, so an + * unbounded file OOMs the process. Above this size the store fails + * loud with `TraceFileTooLargeError` instead of degrading silently. + * Default 256 MiB. + */ + maxFileBytes?: number } +/** Default ceiling for {@link OtlpFileTraceStoreOptions.maxFileBytes}. */ +export const DEFAULT_MAX_TRACE_FILE_BYTES = 256 * 1024 * 1024 + export class OtlpFileTraceStore implements TraceAnalysisStore { private readonly path: string private readonly perAttributeViewBudget: number private readonly perAttributeSpanBudget: number private readonly perCallByteCeiling: number private readonly perMatchTextBudget: number + private readonly maxFileBytes: number private indexPromise?: Promise /** Cached UTF-8 buffer of the file. We pin it once because every * read needs slice access and re-reading on each call balloons the @@ -125,6 +165,7 @@ export class OtlpFileTraceStore implements TraceAnalysisStore { opts.perCallByteCeiling ?? DEFAULT_TRACE_ANALYST_BUDGETS.perCallByteCeiling this.perMatchTextBudget = opts.perMatchTextBudget ?? DEFAULT_TRACE_ANALYST_BUDGETS.perMatchTextBudget + this.maxFileBytes = opts.maxFileBytes ?? DEFAULT_MAX_TRACE_FILE_BYTES } // ─── Public API ──────────────────────────────────────────────────── @@ -150,8 +191,8 @@ export class OtlpFileTraceStore implements TraceAnalysisStore { for (const m of t.models) models.add(m) for (const tn of t.tools) tools.add(tn) rawBytes += t.raw_jsonl_bytes - if (!earliest || t.start_time < earliest) earliest = t.start_time - if (!latest || t.end_time > latest) latest = t.end_time + if (!earliest || epochMs(t.start_time) < epochMs(earliest)) earliest = t.start_time + if (!latest || epochMs(t.end_time) > epochMs(latest)) latest = t.end_time if (t.has_errors) { errorTraceCount += 1 for (const s of t.spans) { @@ -224,8 +265,9 @@ export class OtlpFileTraceStore implements TraceAnalysisStore { const spans: TraceAnalystSpan[] = [] let runningBytes = 0 let span_response_bytes_max = 0 + const counter: TruncationCounter = { value: 0 } for (const s of trace.spans) { - const projected = await this.projectSpan(buf, trace.trace_id, s, cap) + const projected = this.projectSpan(buf, trace.trace_id, s, cap, counter) const bytes = Buffer.byteLength(JSON.stringify(projected), 'utf8') span_response_bytes_max = Math.max(span_response_bytes_max, bytes) runningBytes += bytes @@ -267,12 +309,10 @@ export class OtlpFileTraceStore implements TraceAnalysisStore { const buf = await this.buffer() const spans: TraceAnalystSpan[] = [] - let truncated = 0 + const counter: TruncationCounter = { value: 0 } let runningBytes = 0 for (const s of found) { - const before = truncationCounter(this) - const projected = await this.projectSpan(buf, trace.trace_id, s, cap) - truncated += before.delta() + const projected = this.projectSpan(buf, trace.trace_id, s, cap, counter) const bytes = Buffer.byteLength(JSON.stringify(projected), 'utf8') runningBytes += bytes if (runningBytes > this.perCallByteCeiling) { @@ -286,7 +326,7 @@ export class OtlpFileTraceStore implements TraceAnalysisStore { trace_id: trace.trace_id, spans, missing_span_ids: missing, - truncated_attribute_count: truncated, + truncated_attribute_count: counter.value, } } @@ -324,15 +364,19 @@ export class OtlpFileTraceStore implements TraceAnalysisStore { hits.push(h) } if (hits.length >= max_matches) { + // Capped: we stopped scanning, so `total` is a lower bound on the + // real match count — never report it as exact. has_more signals + // "more exist"; total_matches mirrors hits so callers don't read a + // fabricated number. capped = true - total = Math.max(total, hits.length + 1) break } } return { trace_id: trace.trace_id, hits, - total_matches: total, + // Uncapped: every span scanned to completion, so `total` is exact. + total_matches: capped ? hits.length : total, has_more: capped || total > hits.length, } } @@ -383,11 +427,30 @@ export class OtlpFileTraceStore implements TraceAnalysisStore { private async buffer(): Promise { if (!this.bufferPromise) { - this.bufferPromise = readFile(this.path) + this.bufferPromise = this.readGuarded() } return this.bufferPromise } + /** Stat-then-read so an oversized file fails loud BEFORE we allocate a + * multi-hundred-MB Buffer and OOM the process. A missing file surfaces + * as TraceFileMissingError; any other stat/read error propagates. */ + private async readGuarded(): Promise { + let stats: Awaited> + try { + stats = await stat(this.path) + } catch (err) { + if ((err as NodeJS.ErrnoException)?.code === 'ENOENT') { + throw new TraceFileMissingError(this.path) + } + throw err + } + if (stats.size > this.maxFileBytes) { + throw new TraceFileTooLargeError(this.path, stats.size, this.maxFileBytes) + } + return readFile(this.path) + } + private async index(): Promise { if (!this.indexPromise) { this.indexPromise = this.buildIndex() @@ -396,20 +459,19 @@ export class OtlpFileTraceStore implements TraceAnalysisStore { } private async buildIndex(): Promise { - let buf: Buffer - try { - buf = await this.buffer() - } catch (err) { - const stats = await stat(this.path).catch(() => null) - if (!stats) { - throw new TraceFileMissingError(this.path) - } - throw err - } + // readGuarded surfaces missing/oversized files as typed errors. + const buf = await this.buffer() const byTrace = new Map() let cursor = 0 + let sinceYield = 0 while (cursor < buf.length) { + // Yield to the event loop every INDEX_YIELD_LINES lines so a huge + // file doesn't monopolise the thread for the whole index build. + if (++sinceYield >= INDEX_YIELD_LINES) { + sinceYield = 0 + await yieldToEventLoop() + } const newlineIndex = buf.indexOf(0x0a, cursor) // \n const lineEnd = newlineIndex === -1 ? buf.length : newlineIndex const lineLength = lineEnd - cursor @@ -478,8 +540,8 @@ export class OtlpFileTraceStore implements TraceAnalysisStore { entry.span_count += 1 entry.raw_jsonl_bytes += lineLength + 1 // +1 newline byte if (span.status === 'ERROR') entry.has_errors = true - if (span.start_time < entry.start_time) entry.start_time = span.start_time - if (span.end_time > entry.end_time) entry.end_time = span.end_time + if (epochMs(span.start_time) < epochMs(entry.start_time)) entry.start_time = span.start_time + if (epochMs(span.end_time) > epochMs(entry.end_time)) entry.end_time = span.end_time if (span.model_name) entry.models.add(span.model_name) if (span.tool_name) entry.tools.add(span.tool_name) } @@ -491,9 +553,13 @@ export class OtlpFileTraceStore implements TraceAnalysisStore { totalRawBytes += t.raw_jsonl_bytes t.spans.sort( (a, b) => - a.start_time.localeCompare(b.start_time) || a.line_byte_offset - b.line_byte_offset, + epochMs(a.start_time) - epochMs(b.start_time) || a.line_byte_offset - b.line_byte_offset, ) - t.duration_ms = Math.max(0, new Date(t.end_time).getTime() - new Date(t.start_time).getTime()) + // Duration is 0 unless BOTH bounds parse — a missing/garbage timestamp + // yields 0, never a NaN (→ null in JSON) or a bogus epoch-from-zero span. + const startMs = epochOrNull(t.start_time) + const endMs = epochOrNull(t.end_time) + t.duration_ms = startMs === null || endMs === null ? 0 : Math.max(0, endMs - startMs) } const sortedTraceIds = [...byTrace.keys()].sort() @@ -569,12 +635,13 @@ export class OtlpFileTraceStore implements TraceAnalysisStore { // ─── Span projection (lazy attribute reads) ──────────────────────── - private async projectSpan( + private projectSpan( buf: Buffer, trace_id: string, s: SpanIndexEntry, perAttrCap: number, - ): Promise { + counter: TruncationCounter, + ): TraceAnalystSpan { const slice = buf .subarray(s.line_byte_offset, s.line_byte_offset + s.line_byte_length) .toString('utf8') @@ -590,13 +657,13 @@ export class OtlpFileTraceStore implements TraceAnalysisStore { for (const [k, v] of Object.entries(attrs)) { if (typeof v === 'string') { const trunc = truncateForBudget(v, perAttrCap) - if (trunc !== v) trackTruncation(this) + if (trunc !== v) counter.value += 1 projected[k] = trunc } else if (Array.isArray(v) || (v && typeof v === 'object')) { const json = JSON.stringify(v) const trunc = truncateForBudget(json, perAttrCap) if (trunc !== json) { - trackTruncation(this) + counter.value += 1 projected[k] = trunc } else { projected[k] = v @@ -699,6 +766,20 @@ export class TraceFileMissingError extends NotFoundError { super(`trace file not found: ${path}`) } } +export class TraceFileTooLargeError extends Error { + readonly path: string + readonly size_bytes: number + readonly max_bytes: number + constructor(path: string, size_bytes: number, max_bytes: number) { + super( + `trace file ${path} is ${size_bytes} bytes, over the ${max_bytes}-byte limit; ` + + 'raise OtlpFileTraceStoreOptions.maxFileBytes or pre-split the file', + ) + this.path = path + this.size_bytes = size_bytes + this.max_bytes = max_bytes + } +} export class TraceNotFoundError extends NotFoundError { readonly trace_id: string constructor(trace_id: string) { @@ -725,55 +806,59 @@ function isPresent(v: T | undefined): v is T { return v !== undefined } -// Truncation counter — module-private, lets viewSpans report the -// number of attribute fields it had to truncate without threading -// a counter through every call. -const truncationCounters = new WeakMap() +// Per-call truncation counter. Each public read that projects spans +// owns one of these and threads it through projectSpan; a store-keyed +// counter would let two concurrent reads on the same store report each +// other's truncation counts. +interface TruncationCounter { + value: number +} -function trackTruncation(store: OtlpFileTraceStore): void { - let c = truncationCounters.get(store) - if (!c) { - c = { value: 0 } - truncationCounters.set(store, c) +/** A `"` at `idx` is a real JSON delimiter only when the run of `\` + * immediately preceding it is even-length; an odd run means the quote + * is escaped (`\"`) and is part of a string value, not a boundary. */ +function isUnescapedQuote(slice: string, idx: number): boolean { + if (slice[idx] !== '"') return false + let backslashes = 0 + let b = idx - 1 + while (b >= 0 && slice[b] === '\\') { + backslashes += 1 + b -= 1 } - c.value += 1 + return backslashes % 2 === 0 } -function truncationCounter(store: OtlpFileTraceStore): { delta(): number } { - const before = truncationCounters.get(store)?.value ?? 0 - return { - delta() { - const after = truncationCounters.get(store)?.value ?? 0 - return after - before - }, +/** Scan backwards from `from` (inclusive) for the nearest UNescaped `"`. + * Returns its index, or -1 when none is found. */ +function prevUnescapedQuote(slice: string, from: number): number { + for (let i = from; i >= 0; i -= 1) { + if (slice[i] === '"' && isUnescapedQuote(slice, i)) return i } + return -1 } /** * Best-effort: locate the JSON path for the substring at `offset` in - * a single span's JSONL slice. We walk the parsed JSON structurally - * and return the dotted path when we find a string field whose - * serialised form contains `offset`. Returns `null` if the offset - * doesn't fall inside a recognisable string field. + * a single span's JSONL slice. The slice is '...,"key":"value..."' — we + * walk back from `offset` to the value-opening quote, past the `:`, to + * the key's closing then opening quote, skipping `\"`-escaped quotes that + * live inside string values. Returns `null` when the offset doesn't fall + * inside a recognisable string field. */ function bestAttributePathForOffset(slice: string, offset: number): string | null { - // The slice contains '"key":"value..."' — find the nearest '"' - // wrapping `offset` and walk back to a key. This is heuristic but - // bounded by the span line length, not the whole file. - let i = offset - while (i > 0 && slice[i] !== '"') i -= 1 - if (i <= 0) return null - // Scan backwards for the preceding '"': pattern is "key":"value" - let j = i - 1 - while (j > 0 && slice[j] !== ':') j -= 1 - if (j <= 0) return null - // Find the key: walk back from `:` to the matching closing '"' then to opening '"'. - let k = j - 1 - while (k > 0 && slice[k] !== '"') k -= 1 - let l = k - 1 - while (l > 0 && slice[l] !== '"') l -= 1 - if (l <= 0) return null - return slice.slice(l + 1, k) + // Value-opening quote: nearest unescaped '"' at or before the offset. + const valueQuote = prevUnescapedQuote(slice, Math.min(offset, slice.length - 1)) + if (valueQuote < 1) return null + // The ':' separating key and value sits before the value quote. + let j = valueQuote - 1 + while (j >= 0 && slice[j] !== ':') j -= 1 + if (j < 1) return null + // Key closing quote, then key opening quote — both unescaped. + const keyClose = prevUnescapedQuote(slice, j - 1) + if (keyClose < 1) return null + const keyOpen = prevUnescapedQuote(slice, keyClose - 1) + if (keyOpen < 0) return null + return slice.slice(keyOpen + 1, keyClose) } // ─── Error-cluster extraction ──────────────────────────────────────── diff --git a/src/trace/query.test.ts b/src/trace/query.test.ts new file mode 100644 index 00000000..d312a138 --- /dev/null +++ b/src/trace/query.test.ts @@ -0,0 +1,47 @@ +import { describe, expect, it } from 'vitest' +import { aggregateLlm, argHash } from './query' +import type { LlmSpan } from './schema' + +function llm(extra: Partial): LlmSpan { + // `extra` is a Partial spread last, which widens `kind`; cast back to LlmSpan. + return { spanId: 's', runId: 'r', kind: 'llm', name: 'llm', startedAt: 0, ...extra } as LlmSpan +} + +describe('aggregateLlm', () => { + it('sums reasoningTokens (was silently omitted, hiding reasoning usage)', () => { + const agg = aggregateLlm([ + llm({ inputTokens: 100, outputTokens: 50, reasoningTokens: 1000 }), + llm({ inputTokens: 10, outputTokens: 5, reasoningTokens: 200 }), + ]) + expect(agg.inputTokens).toBe(110) + expect(agg.outputTokens).toBe(55) + expect(agg.reasoningTokens).toBe(1200) + }) +}) + +describe('argHash', () => { + it('always returns a string — even for undefined / functions', () => { + // Regression: JSON.stringify(undefined) returns the value `undefined`, + // which made argHash non-string and broke de-dup keys downstream. + expect(typeof argHash(undefined)).toBe('string') + expect(typeof argHash(null)).toBe('string') + expect(typeof argHash(() => 1)).toBe('string') + }) + + it('is stable across object key insertion order', () => { + expect(argHash({ a: 1, b: 2 })).toBe(argHash({ b: 2, a: 1 })) + }) + + it('treats an undefined-valued key like an absent key (JSON semantics)', () => { + expect(argHash({ a: 1 })).toBe(argHash({ a: 1, b: undefined })) + }) + + it('distinguishes null, the string "null", and distinct args', () => { + expect(argHash(null)).not.toBe(argHash('null')) + expect(argHash({ cmd: 'a' })).not.toBe(argHash({ cmd: 'b' })) + }) + + it('recurses into nested structures stably', () => { + expect(argHash({ x: [{ a: 1, b: 2 }] })).toBe(argHash({ x: [{ b: 2, a: 1 }] })) + }) +}) diff --git a/src/trace/query.ts b/src/trace/query.ts index f0c895a0..2f643c55 100644 --- a/src/trace/query.ts +++ b/src/trace/query.ts @@ -55,12 +55,20 @@ export function argHash(args: unknown): string { } function stableStringify(value: unknown): string { - if (value === null || typeof value !== 'object') return JSON.stringify(value) + // Must ALWAYS return a string: JSON.stringify(undefined) — and stringify of + // functions/symbols — yields the JS value `undefined`, which would make + // argHash return a non-string and silently break the de-dup keys behind + // stuck-loop and failure-cluster. + if (value === undefined) return 'undefined' + if (value === null || typeof value !== 'object') return JSON.stringify(value) ?? 'undefined' if (Array.isArray(value)) return `[${value.map(stableStringify).join(',')}]` - const keys = Object.keys(value as Record).sort() - const parts = keys.map( - (k) => `${JSON.stringify(k)}:${stableStringify((value as Record)[k])}`, - ) + // Drop undefined-valued keys to match JSON.stringify semantics, so + // `{a:1}` and `{a:1,b:undefined}` hash identically. + const obj = value as Record + const keys = Object.keys(obj) + .filter((k) => obj[k] !== undefined) + .sort() + const parts = keys.map((k) => `${JSON.stringify(k)}:${stableStringify(obj[k])}`) return `{${parts.join(',')}}` } @@ -69,6 +77,7 @@ export function aggregateLlm(spans: LlmSpan[]): { inputTokens: number outputTokens: number cachedTokens: number + reasoningTokens: number costUsd: number } { return spans.reduce( @@ -76,9 +85,12 @@ export function aggregateLlm(spans: LlmSpan[]): { inputTokens: acc.inputTokens + (s.inputTokens ?? 0), outputTokens: acc.outputTokens + (s.outputTokens ?? 0), cachedTokens: acc.cachedTokens + (s.cachedTokens ?? 0), + // reasoningTokens is on LlmSpan but was omitted here — reasoning usage was + // invisible to any cost/perf analysis reading this aggregate. + reasoningTokens: acc.reasoningTokens + (s.reasoningTokens ?? 0), costUsd: acc.costUsd + (s.costUsd ?? 0), }), - { inputTokens: 0, outputTokens: 0, cachedTokens: 0, costUsd: 0 }, + { inputTokens: 0, outputTokens: 0, cachedTokens: 0, reasoningTokens: 0, costUsd: 0 }, ) } diff --git a/src/trace/store.test.ts b/src/trace/store.test.ts new file mode 100644 index 00000000..b6e1204b --- /dev/null +++ b/src/trace/store.test.ts @@ -0,0 +1,224 @@ +import { mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' + +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest' + +import type { GenericSpan, Run } from './schema' +import { FileSystemTraceStore } from './store' + +// Wrap node:fs/promises so the loader's readdir order is controllable per-test: +// real FS, but readdir entries are reordered to whatever `readdirOrder` sets. +// This forces the patch-before-base case (otherwise OS-dependent) and lets stat +// yield a macrotask so concurrent appends interleave deterministically. +let readdirOrder: ((files: string[]) => string[]) | undefined +let statDelay = false +vi.mock('node:fs/promises', async (importActual) => { + const actual = await importActual() + return { + ...actual, + default: actual, + async readdir(...args: Parameters) { + const out = (await (actual.readdir as (...a: unknown[]) => Promise)( + ...(args as unknown[]), + )) as string[] + return readdirOrder ? readdirOrder(out) : out + }, + stat(...args: Parameters) { + if (!statDelay) return actual.stat(...(args as Parameters)) + return new Promise((resolve, reject) => { + setTimeout( + () => actual.stat(...(args as Parameters)).then(resolve, reject), + 0, + ) + }) + }, + } +}) + +function run(id: string, extra: Partial = {}): Run { + return { + runId: id, + scenarioId: 'scn', + startedAt: 1_700_000_000_000, + status: 'running', + ...extra, + } +} + +function span(id: string, runId: string, extra: Partial = {}): GenericSpan { + return { + spanId: id, + runId, + kind: 'agent', + name: 'work', + startedAt: 1_700_000_000_000, + ...extra, + } +} + +describe('FileSystemTraceStore', () => { + let dir: string + beforeEach(() => { + dir = mkdtempSync(join(tmpdir(), 'fs-trace-store-')) + }) + afterEach(() => { + readdirOrder = undefined + statDelay = false + rmSync(dir, { recursive: true, force: true }) + }) + + function diskLines(name: string): string[] { + const out: string[] = [] + for (const f of readdirSync(dir)) { + if (!f.endsWith('.ndjson')) continue + if (f === `${name}.ndjson` || f.startsWith(`${name}.`)) { + out.push( + ...readFileSync(join(dir, f), 'utf8') + .split('\n') + .filter((l) => l.trim()), + ) + } + } + return out + } + + it('serializes concurrent appends so rollover holds the size bound (mutex)', async () => { + // Seed the active file already at the bound, then fire a concurrent burst + // with stat delayed a macrotask so every append observes the same + // over-size state. Without per-name serialization the stat→rename→append + // windows interleave: each append renames once then writes to the same + // fresh active without re-checking, so the active file balloons far past + // maxBytes — the rollover bound the option exists to enforce is silently + // violated. Serialized, each append re-stats after the prior one's write, + // so the active file never holds more than one record over the bound. + const maxBytes = 200 + const fs = await import('node:fs/promises') + await fs.mkdir(dir, { recursive: true }) + await fs.writeFile(join(dir, 'spans.ndjson'), `${'x'.repeat(maxBytes)}\n`, 'utf8') + statDelay = true + + const store = new FileSystemTraceStore({ dir, maxBytes }) + const n = 20 + const pad = 'y'.repeat(80) + await Promise.all( + Array.from({ length: n }, (_, i) => + store.appendSpan(span(`s${i}`, 'r1', { attributes: { pad } })), + ), + ) + + // Active file must not exceed the bound by more than the single record that + // tipped it over; a no-lock race lets every record pile into one file. + const recordBytes = Buffer.byteLength( + `${JSON.stringify(span('s0', 'r1', { attributes: { pad } }))}\n`, + ) + const activeSize = (await fs.stat(join(dir, 'spans.ndjson'))).size + expect(activeSize).toBeLessThanOrEqual(maxBytes + recordBytes) + + // And no row is lost across the rolled corpus. + const ids = new Set( + diskLines('spans') + .map((l) => { + try { + return (JSON.parse(l) as { spanId?: string }).spanId + } catch { + return undefined + } + }) + .filter((id): id is string => Boolean(id)), + ) + expect(ids.size).toBe(n) + }) + + it('surfaces an index-mirror failure to the caller (awaited, not fire-and-forget)', async () => { + // Populate the lazy index so the next append mirrors into it. A duplicate + // runId makes the index reject. A floating `void this.insertInto(...)` + // swallows that as an unhandled rejection and reports success while disk + + // index diverge; awaiting it surfaces the error to the caller. + const store = new FileSystemTraceStore({ dir }) + await store.appendRun(run('r1')) + expect(await store.getRun('r1')).toBeDefined() + + await expect(store.appendRun(run('r1'))).rejects.toThrow(/already exists/) + }) + + it('collapses an _update patch onto its base when reloaded cross-instance', async () => { + const store = new FileSystemTraceStore({ dir }) + await store.appendSpan(span('span-1', 'r1', { status: 'ok' })) + await store.updateSpan('span-1', { endedAt: 1_700_000_002_000, status: 'error' }) + + const reopened = new FileSystemTraceStore({ dir }) + const spans = await reopened.spans({ runId: 'r1' }) + expect(spans).toHaveLength(1) + expect(spans[0]!.spanId).toBe('span-1') + expect(spans[0]!.runId).toBe('r1') + expect(spans[0]!.status).toBe('error') + expect(spans[0]!.endedAt).toBe(1_700_000_002_000) + }) + + it('collapses a patch read before its base — no runId-less fragment (two-pass)', async () => { + // Rollover splits a span's stream across files: the base full span in one + // file, its `_update` patch in another. Force readdir to return the patch + // file first. A one-pass loader applies the patch before the base exists, + // hits the catch, and appends a runId-less fragment → two spans, corrupt + // counts. Two-pass load defers patches until every base is indexed. + writeFileSync( + join(dir, 'spans.base.ndjson'), + `${JSON.stringify(span('span-1', 'r1', { status: 'ok' }))}\n`, + ) + writeFileSync( + join(dir, 'spans.patch.ndjson'), + `${JSON.stringify({ spanId: 'span-1', status: 'error', endedAt: 1_700_000_002_000, _update: true })}\n`, + ) + readdirOrder = () => ['spans.patch.ndjson', 'spans.base.ndjson'] + + const store = new FileSystemTraceStore({ dir }) + const spans = await store.spans() + expect(spans).toHaveLength(1) + expect(spans[0]!.spanId).toBe('span-1') + expect(spans[0]!.runId).toBe('r1') + expect(spans[0]!.status).toBe('error') + expect(spans.every((s) => typeof s.runId === 'string' && s.runId.length > 0)).toBe(true) + }) + + it('collapses a run patch read before its base — no scenarioId-less fragment (two-pass)', async () => { + writeFileSync( + join(dir, 'runs.base.ndjson'), + `${JSON.stringify(run('r1', { status: 'running' }))}\n`, + ) + writeFileSync( + join(dir, 'runs.patch.ndjson'), + `${JSON.stringify({ runId: 'r1', status: 'completed', _update: true })}\n`, + ) + readdirOrder = () => ['runs.patch.ndjson', 'runs.base.ndjson'] + + const store = new FileSystemTraceStore({ dir }) + const runs = await store.listRuns() + expect(runs).toHaveLength(1) + expect(runs[0]!.runId).toBe('r1') + expect(runs[0]!.status).toBe('completed') + expect(runs[0]!.scenarioId).toBe('scn') + }) + + it('applies patches in write order across rolled files (newest wins), not readdir order', async () => { + // r1 was running, completed (both in the older rolled file), then failed + // (in the newer file). Replaying in readdir order with the newer file first + // lets the OLDER 'completed' patch win and silently restore a stale status; + // the chronological file sort makes 'failed' (the newest write) win. + writeFileSync( + join(dir, 'runs.900.ndjson'), + `${JSON.stringify(run('r1', { status: 'running' }))}\n${JSON.stringify({ runId: 'r1', status: 'completed', _update: true })}\n`, + ) + writeFileSync( + join(dir, 'runs.1000.ndjson'), + `${JSON.stringify({ runId: 'r1', status: 'failed', _update: true })}\n`, + ) + // Adversarial: newer rolled file returned first by readdir. + readdirOrder = () => ['runs.1000.ndjson', 'runs.900.ndjson'] + + const store = new FileSystemTraceStore({ dir }) + const runs = await store.listRuns() + expect(runs).toHaveLength(1) + expect(runs[0]!.status).toBe('failed') + }) +}) diff --git a/src/trace/store.ts b/src/trace/store.ts index 3c3888b3..b48ac50f 100644 --- a/src/trace/store.ts +++ b/src/trace/store.ts @@ -187,6 +187,14 @@ export class FileSystemTraceStore implements TraceStore { /** Lazy in-memory index for queries — populated on first read. */ private index?: InMemoryTraceStore private loaded = false + /** + * Per-file append serialization. stat → conditional-rename → appendFile is a + * read-modify-write on the active file; without a lock two concurrent appends + * to the same `name` can both pass the size check (exceeding maxBytes) or one + * can append to a file the other just renamed away. Each entry chains the + * next append behind the prior one for that file. + */ + private appendLocks = new Map>() constructor(options: FileSystemTraceStoreOptions) { this.dir = options.dir @@ -199,6 +207,26 @@ export class FileSystemTraceStore implements TraceStore { } private async append(name: string, record: unknown): Promise { + // Chain this append behind any in-flight append for the same file. The tail + // promise never rejects (errors are isolated to each caller's `result`), so + // one failed append can't break the lock chain for the next one. + const prior = this.appendLocks.get(name) ?? Promise.resolve() + const result = prior.then(() => this.appendLocked(name, record)) + const tail = result.then( + () => {}, + () => {}, + ) + this.appendLocks.set(name, tail) + try { + await result + } finally { + // Drop the slot once this append is the current tail, so the map doesn't + // grow without bound across many appends to the same file. + if (this.appendLocks.get(name) === tail) this.appendLocks.delete(name) + } + } + + private async appendLocked(name: string, record: unknown): Promise { await this.ensureDir() const fs = await import('node:fs/promises') const path = await import('node:path') @@ -217,9 +245,10 @@ export class FileSystemTraceStore implements TraceStore { // with `_update: true` by updateRun/updateSpan) are applied by those // methods directly via the index's update* APIs — re-inserting them // here triggers a duplicate-id error once the first read populates - // the index. + // the index. Await the mirror: a fire-and-forget would let an index + // rejection go unhandled and let disk + index diverge silently. if (this.index && !(record as { _update?: boolean })?._update) { - void this.insertInto(name, record) + await this.insertInto(name, record) } } @@ -249,8 +278,47 @@ export class FileSystemTraceStore implements TraceStore { const fs = await import('node:fs/promises') const path = await import('node:path') const store = new InMemoryTraceStore() + let entries: string[] try { - const entries = await fs.readdir(this.dir) + entries = await fs.readdir(this.dir) + } catch { + // No dir yet (first run / empty corpus). Anything past this point — parse + // errors, missing patch bases — is real corruption and must fail loud. + this.index = store + this.loaded = true + return store + } + // Replay files in write order. For one entity, rolled files + // (`name..ndjson`) are older than the active bare file (`name.ndjson`), + // and readdir order is not chronological — so without this sort a patch in + // an older rolled file could be applied AFTER (overwrite) a newer patch in + // the active file, silently restoring a stale field. Sort: per entity, + // rolled ascending by timestamp, then the bare active file last. + entries.sort((a, b) => { + // Group by the same entity base the loop uses (`split('.')[0]`); within an + // entity, real rollover files (`..ndjson`) sort by timestamp + // ascending and the bare active file (no numeric segment) sorts last. + // Non-rollover names tie (Infinity) and a stable sort preserves their + // readdir order. + const baseA = a.split('.')[0]! + const baseB = b.split('.')[0]! + if (baseA !== baseB) return baseA < baseB ? -1 : 1 + const ta = a.match(/^[^.]+\.(\d+)\.ndjson$/) + const tb = b.match(/^[^.]+\.(\d+)\.ndjson$/) + const tsA = ta ? Number(ta[1]) : Number.POSITIVE_INFINITY + const tsB = tb ? Number(tb[1]) : Number.POSITIVE_INFINITY + return tsA - tsB + }) + { + // Two-pass load. Pass 1 indexes all full (base) rows; pass 2 replays the + // `_update` patch rows that updateRun/updateSpan append. readdir order is + // not deterministic and rollover splits a stream across files, so a patch + // can be read before its base. Applying patches in a separate pass — after + // every base row is indexed — means a patch always finds its target. + // Doing it in one pass let a patch hit the catch and append a runId-less + // fragment, corrupting span/run counts cross-instance. + const runUpdates: Array> = [] + const spanUpdates: Array> = [] for (const file of entries) { if (!file.endsWith('.ndjson')) continue const full = path.join(this.dir, file) @@ -260,7 +328,12 @@ export class FileSystemTraceStore implements TraceStore { if (!line.trim()) continue const record = JSON.parse(line) if (base === 'runs') { - // Allow re-loading without duplicate error + if (record?._update) { + runUpdates.push(record) + continue + } + // A duplicate full row (same dir loaded twice into one stream) is + // collapsed last-write-wins rather than thrown. try { await store.appendRun(record) } catch { @@ -268,24 +341,17 @@ export class FileSystemTraceStore implements TraceStore { } } else if (base === 'spans') { // `updateSpan` appends an `_update: true` patch row instead of - // rewriting the original span. On reload we must collapse those - // patches onto the original span — otherwise a fresh - // FileSystemTraceStore reading the same dir reports duplicate - // spans (one full, one fragment with no runId/kind/name), which - // breaks any downstream consumer that re-opens the store - // cross-process (e.g. the OTLP converter). + // rewriting the original span. Deferred to pass 2 so it collapses + // onto the original span — otherwise a fresh FileSystemTraceStore + // reading the same dir reports duplicate spans (one full, one + // fragment with no runId/kind/name), which breaks any downstream + // consumer that re-opens the store cross-process (e.g. the OTLP + // converter). if (record?._update) { - try { - await store.updateSpan(record.spanId, record) - } catch { - // Patch row arrived before the original — should not happen - // with locked append order, but fall through to append so we - // don't lose data. - await store.appendSpan(record) - } - } else { - await store.appendSpan(record) + spanUpdates.push(record) + continue } + await store.appendSpan(record) } else if (base === 'events') { await store.appendEvent(record) } else if (base === 'artifacts') { @@ -295,8 +361,16 @@ export class FileSystemTraceStore implements TraceStore { } } } - } catch { - /* empty dir, first run */ + // Pass 2: every base row is now indexed. A patch whose base is genuinely + // absent (truncated/partial corpus) throws inside update* — we let it + // propagate rather than silently appending a fragment, so a corrupt + // corpus fails loud instead of polluting counts. + for (const record of runUpdates) { + await store.updateRun(record.runId as string, record as Partial) + } + for (const record of spanUpdates) { + await store.updateSpan(record.spanId as string, record as Partial) + } } this.index = store this.loaded = true diff --git a/src/types.ts b/src/types.ts index eedccd68..2f3b4256 100644 --- a/src/types.ts +++ b/src/types.ts @@ -138,6 +138,8 @@ export interface RouteMap { export interface ProductClientConfig { baseUrl: string routes: RouteMap + /** Per-request timeout in ms before the request is aborted. Default 30s. */ + timeoutMs?: number } // ── Scenario Registry ── diff --git a/tests/eval-campaign.test.ts b/tests/eval-campaign.test.ts index fa0b6d01..69eaaadc 100644 --- a/tests/eval-campaign.test.ts +++ b/tests/eval-campaign.test.ts @@ -547,3 +547,193 @@ describe('runEvalCampaign — concurrency', () => { expect(inFlight.max).toBeLessThanOrEqual(4) }) }) + +describe('runEvalCampaign — genuine-error containment (no orphaned workers)', () => { + // A genuine (non-CellExecutionError) bug in one cell must not reject the pool + // mid-flight and orphan the other in-flight workers. The campaign must wait + // for every sibling worker to finish (its run finalized) before re-throwing. + it('does not throw until every in-flight sibling run has finalized', async () => { + // Two cells, concurrency 2. Cell A is the genuine-error cell. Cell B parks + // mid-run on a barrier we control. With Promise.all (old behaviour) the + // campaign rejects the instant A throws, while B is still parked — orphaned. + // With Promise.allSettled the campaign cannot settle until B finalizes. + let releaseB: () => void = () => {} + const bParked = new Promise((resolve) => { + releaseB = resolve + }) + let bFinalized = false + + const runner: CampaignRunner = async (ctx) => { + if (ctx.variantId === 'fail') { + // Genuine error in post-runner assembly: the runner finalizes its OWN + // run, then hands back an agent profile that contradicts the model. + // assertAgentProfileMatchesRun throws a plain Error (not a Cell error). + const base = await defaultRunner(ctx) + return { + ...base, + agentProfile: { + profileId: 'mismatch', + sourceProfile: { kind: 'sandbox-agent-profile', profile: { name: 'a' } }, + harness: { id: 'h', version: '1' }, + model: 'WRONG-MODEL@2026-05-08', + promptHash: 'p'.repeat(64), + }, + } + } + // Cell B: start the run, park until released, then finalize itself. + await ctx.emitter.startRun({ scenarioId: ctx.scenarioId, layer: 'app-runtime' }) + await bParked + const handle = await ctx.emitter.llm({ + name: 'judge', + model: 'test-model@2026-05-08', + messages: [{ role: 'user', content: ctx.variant.prompt }], + output: 'ok', + }) + await ctx.rawSink.record({ + eventId: `evt-${ctx.runId}-0`, + runId: ctx.runId, + spanId: handle.span.spanId, + provider: 'test', + model: 'test-model@2026-05-08', + endpoint: '/chat/completions', + baseUrl: ctx.llmOpts.baseUrl ?? '', + attemptIndex: 0, + direction: 'request', + timestamp: 1_000, + redactedFields: [], + }) + await handle.end() + await ctx.emitter.endRun({ pass: true, score: 0.6 }) + bFinalized = true + return { + pass: true, + score: 0.6, + costUsd: 0.001, + tokenUsage: { input: 10, output: 5 }, + model: 'test-model@2026-05-08', + promptHash: 'p'.repeat(64), + configHash: 'c'.repeat(64), + } + } + + const stores = new Map() + const opts: EvalCampaignOptions = { + campaignId: 'orphan-campaign', + variants: [ + { id: 'fail', payload: { prompt: 'x' } }, + { id: 'park', payload: { prompt: 'y' } }, + ], + scenarios: [{ scenarioId: 's1' }], + seeds: [0], + commitSha: 'cafebabe', + llmOpts: { baseUrl: 'https://api.test.local/v1', apiKey: 'sk-test' }, + storeFactory: ({ runId }) => { + const s = new InMemoryTraceStore() + stores.set(runId, s) + return s + }, + rawSinkFactory: () => new InMemoryRawProviderSink(), + concurrency: 2, + runner, + } + + const campaign = runEvalCampaign(opts) + let settled = false + void campaign.then( + () => { + settled = true + }, + () => { + settled = true + }, + ) + + // Flush microtasks + a macrotask. Cell A has thrown its genuine error by + // now, but cell B is still parked. Old Promise.all would have rejected the + // campaign already; allSettled must keep it pending. + await new Promise((r) => setTimeout(r, 20)) + expect(settled).toBe(false) + expect(bFinalized).toBe(false) + + // Release B; only now may the campaign settle (re-throwing the genuine error). + releaseB() + await expect(campaign).rejects.toThrow(/does not match outcome.model/) + expect(bFinalized).toBe(true) + + // B's run was finalized as a real completed run — not left dangling. + const bRunId = [...stores.keys()].find((id) => id.length > 0) + expect(bRunId).toBeDefined() + // Every store that recorded a run must have a terminal status (no 'running'). + for (const store of stores.values()) { + const runs = await store.listRuns() + for (const r of runs) { + expect(r.status).not.toBe('running') + } + } + }) +}) + +describe('runEvalCampaign — abort failure is surfaced, not swallowed', () => { + // When the runner throws, the campaign aborts the run to finalize the + // emitter. A store-write failure during that abort is a genuine diagnostic + // (disk full, FS fault, backend down) and MUST surface — the old + // `try { abortRun } catch {}` swallowed it, hiding real corruption. + it('re-throws a store-write failure that occurs while aborting a thrown run', async () => { + class AbortFailingStore extends InMemoryTraceStore { + override async updateRun(runId: string, patch: Partial) { + if (patch.status === 'aborted') { + throw new Error('disk full: cannot persist aborted run') + } + return super.updateRun(runId, patch) + } + } + + const runner: CampaignRunner = async (ctx) => { + await ctx.emitter.startRun({ scenarioId: ctx.scenarioId, layer: 'app-runtime' }) + throw new Error('runner boom') + } + + const opts: EvalCampaignOptions = { + campaignId: 'abort-fail-campaign', + variants: [{ id: 'v1', payload: { prompt: 'x' } }], + scenarios: [{ scenarioId: 's1' }], + seeds: [0], + commitSha: 'cafebabe', + llmOpts: { baseUrl: 'https://api.test.local/v1', apiKey: 'sk-test' }, + storeFactory: () => new AbortFailingStore(), + rawSinkFactory: () => new InMemoryRawProviderSink(), + runner, + } + + // Old behaviour: the abort error is swallowed; the cell is recorded as a + // plain runner_threw failure and the campaign resolves. New behaviour: the + // store-write failure surfaces as a genuine error. + await expect(runEvalCampaign(opts)).rejects.toThrow(/disk full: cannot persist aborted run/) + }) + + it('still treats a never-started run as a benign abort (nothing to finalize)', async () => { + // If the runner throws BEFORE startRun, there is no run to abort — that is + // the only benign case the narrowed catch may pass over. It must NOT throw + // a store error; the cell is a normal runner_threw failure. + const runner: CampaignRunner = async () => { + throw new Error('threw before startRun') + } + + const opts: EvalCampaignOptions = { + campaignId: 'never-started-campaign', + variants: [{ id: 'v1', payload: { prompt: 'x' } }], + scenarios: [{ scenarioId: 's1' }], + seeds: [0], + commitSha: 'cafebabe', + llmOpts: { baseUrl: 'https://api.test.local/v1', apiKey: 'sk-test' }, + storeFactory: () => new InMemoryTraceStore(), + rawSinkFactory: () => new InMemoryRawProviderSink(), + runner, + } + + const result = await runEvalCampaign(opts) + expect(result.failedRuns).toHaveLength(1) + expect(result.failedRuns[0]?.reason).toBe('runner_threw') + expect(result.failedRuns[0]?.error).toBe('threw before startRun') + }) +})