diff --git a/src/analyst/analyst.test.ts b/src/analyst/analyst.test.ts
index ff71a24b..e0da4a0f 100644
--- a/src/analyst/analyst.test.ts
+++ b/src/analyst/analyst.test.ts
@@ -754,7 +754,15 @@ describe('ChatClient signal racing', () => {
     // documents the limit: races live in wrapLlmClient, mock passes
     // through. Either resolves (slow path) or rejects (when bound).
     // We just assert it eventually completes without hanging.
-    await Promise.race([p.catch(() => undefined), new Promise((r) => setTimeout(r, 200))])
-    expect(true).toBe(true)
+    const settled = await Promise.race([
+      p.then(
+        () => 'settled',
+        () => 'settled',
+      ),
+      new Promise<string>((r) => setTimeout(() => r('hung'), 200)),
+    ])
+    // The mock transport passes the signal through; the real contract here is
+    // only that the call SETTLES (resolves or rejects) and never hangs.
+    expect(settled).toBe('settled')
   })
 })
diff --git a/src/analyst/registry.budget.test.ts b/src/analyst/registry.budget.test.ts
new file mode 100644
index 00000000..22ce9916
--- /dev/null
+++ b/src/analyst/registry.budget.test.ts
@@ -0,0 +1,33 @@
+import { describe, expect, it } from 'vitest'
+import { AnalystRegistry } from './registry'
+import type { Analyst, AnalystContext, AnalystFinding } from './types'
+
+function recorder(id: string, inputKind: Analyst['inputKind'], sink: { budget?: number }): Analyst {
+  return {
+    id,
+    description: id,
+    inputKind,
+    cost: { kind: 'deterministic' },
+    version: '1.0.0',
+    async analyze(_input: unknown, ctx: AnalystContext): Promise<AnalystFinding[]> {
+      sink.budget = ctx.budgetUsd
+      return []
+    },
+  }
+}
+
+describe('AnalystRegistry budget allocation', () => {
+  it('splits budget across analysts that actually run, not ones skipped for missing input', async () => {
+    const sink: { budget?: number } = {}
+    const reg = new AnalystRegistry()
+    reg.register(recorder('runs', 'trace-store', sink))
+    // artifact-dir input is NOT supplied → this analyst is skipped, so it must
+    // not dilute the budget of the one that runs.
+    reg.register(recorder('skipped', 'artifact-dir', {}))
+
+    await reg.run('t', { traceStore: {} as never }, { budget: { totalUsd: 10 } })
+
+    // runnableCount = 1 → full budget. The pre-fix code divided by 2 → 5.
+    expect(sink.budget).toBe(10)
+  })
+})
diff --git a/src/analyst/registry.ts b/src/analyst/registry.ts
index c26c65dd..9f8f7614 100644
--- a/src/analyst/registry.ts
+++ b/src/analyst/registry.ts
@@ -196,6 +196,13 @@ export class AnalystRegistry {
     let totalCost = 0
     let remainingUsd = budget?.totalUsd
 
+    // Budget is split only across analysts that actually run. Analysts skipped
+    // for missing input never spend, so counting them would under-budget the
+    // ones that do. routeInput is pure, so the pre-count is safe.
+    const runnableCount = selected.filter(
+      (a) => this.routeInput(a, inputs).kind !== 'missing',
+    ).length
+
     for (const analyst of selected) {
       const t0 = Date.now()
       const input = this.routeInput(analyst, inputs)
@@ -218,7 +225,7 @@ export class AnalystRegistry {
       const perBudget = allocateBudget(budget, {
         analyst,
         remainingUsd,
-        runningCount: selected.length,
+        runningCount: runnableCount,
       })
 
       const ctx: AnalystContext = {
diff --git a/src/client.test.ts b/src/client.test.ts
new file mode 100644
index 00000000..0bcf8d6a
--- /dev/null
+++ b/src/client.test.ts
@@ -0,0 +1,124 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
+import { ProductClient } from './client'
+
+function mkOk(body: unknown): Response {
+  return new Response(JSON.stringify(body), {
+    status: 200,
+    headers: { 'content-type': 'application/json' },
+  })
+}
+
+function mkErr(status: number, body: string): Response {
+  return new Response(body, { status })
+}
+
+const ROUTES = {
+  tasks: '/api/tasks',
+  events: '/api/events',
+  approvals: '/api/approvals',
+  vault: '/api/vault',
+  generations: '/api/generations',
+}
+
+function newClient() {
+  return new ProductClient({ baseUrl: 'https://app.test', routes: ROUTES })
+}
+
+describe('ProductClient — fail-loud on non-ok responses', () => {
+  const realFetch = globalThis.fetch
+  afterEach(() => {
+    globalThis.fetch = realFetch
+  })
+
+  it('getTasks throws on a 500 instead of masking it as zero results', async () => {
+    // OLD behavior: res.json() was called without checking res.ok, then the
+    // error body's missing `tasks` key was masked by `?? []`, so a 500 with an
+    // error JSON body returned [] — indistinguishable from a healthy empty set.
+    globalThis.fetch = vi.fn(async () =>
+      mkErr(500, JSON.stringify({ error: 'internal server error' })),
+    ) as unknown as typeof fetch
+
+    await expect(newClient().getTasks('ws-1')).rejects.toThrow(/HTTP 500/)
+  })
+
+  it('getApprovals throws on a 401 (auth) rather than returning [])', async () => {
+    globalThis.fetch = vi.fn(async () =>
+      mkErr(401, JSON.stringify({ error: 'unauthorized' })),
+    ) as unknown as typeof fetch
+
+    await expect(newClient().getApprovals('ws-1')).rejects.toThrow(/HTTP 401/)
+  })
+
+  it('getEvents fails loud when a 200 body omits the required array field', async () => {
+    // A wrong route or a contract drift can return 200 `{}`. The old `?? []`
+    // turned that into "zero events"; now it must surface as a defect.
+    globalThis.fetch = vi.fn(async () => mkOk({ unexpected: true })) as unknown as typeof fetch
+
+    await expect(newClient().getEvents('ws-1')).rejects.toThrow(/missing array field "events"/)
+  })
+
+  it('getVaultTree fails loud when a 200 body omits the tree array', async () => {
+    globalThis.fetch = vi.fn(async () => mkOk({})) as unknown as typeof fetch
+    await expect(newClient().getVaultTree('ws-1')).rejects.toThrow(/missing array field "tree"/)
+  })
+
+  it('returns the parsed array on a healthy 200', async () => {
+    globalThis.fetch = vi.fn(async () =>
+      mkOk({ tasks: [{ id: 't1', title: 'a', status: 'open', priority: 'high' }] }),
+    ) as unknown as typeof fetch
+
+    const tasks = await newClient().getTasks('ws-1')
+    expect(tasks).toHaveLength(1)
+    expect(tasks[0]!.id).toBe('t1')
+  })
+
+  it('a genuinely empty result set is still allowed (empty array, not masked error)', async () => {
+    globalThis.fetch = vi.fn(async () => mkOk({ tasks: [] })) as unknown as typeof fetch
+    const tasks = await newClient().getTasks('ws-1')
+    expect(tasks).toEqual([])
+  })
+
+  it('generic post throws with status + body on a 4xx', async () => {
+    globalThis.fetch = vi.fn(async () =>
+      mkErr(422, 'validation failed: name required'),
+    ) as unknown as typeof fetch
+
+    await expect(newClient().post('/api/x', { a: 1 })).rejects.toThrow(
+      /HTTP 422.*validation failed/,
+    )
+  })
+})
+
+describe('ProductClient — request timeout', () => {
+  const realFetch = globalThis.fetch
+  beforeEach(() => {
+    vi.useFakeTimers()
+  })
+  afterEach(() => {
+    vi.useRealTimers()
+    globalThis.fetch = realFetch
+  })
+
+  it('aborts a hung request after the configured timeout', async () => {
+    // fetch resolves only if its signal aborts (mirrors how undici/fetch
+    // rejects an aborted request). A never-resolving server must not hang the
+    // harness forever.
+    globalThis.fetch = ((_url: string, init: RequestInit) =>
+      new Promise((_resolve, reject) => {
+        const signal = init.signal
+        if (signal) {
+          signal.addEventListener('abort', () => {
+            const err = new Error('The operation was aborted')
+            err.name = 'AbortError'
+            reject(err)
+          })
+        }
+      })) as unknown as typeof fetch
+
+    const client = new ProductClient({ baseUrl: 'https://app.test', routes: ROUTES, timeoutMs: 50 })
+    const p = client.get('/api/slow')
+    const assertion = expect(p).rejects.toThrow(/aborted/i)
+    await vi.advanceTimersByTimeAsync(60)
+    await assertion
+  })
+})
diff --git a/src/client.ts b/src/client.ts
index fd85303d..f01007bc 100644
--- a/src/client.ts
+++ b/src/client.ts
@@ -5,14 +5,18 @@ import type { CheckResult, ProductClientConfig, RouteMap, TestResult } from './t
  *
  * Routes are config, not hardcoded. Each agent provides its own RouteMap.
  */
+const DEFAULT_REQUEST_TIMEOUT_MS = 30_000
+
 export class ProductClient {
   private baseUrl: string
   private routes: RouteMap
   private cookies: string = ''
+  private timeoutMs: number
 
   constructor(config: ProductClientConfig) {
     this.baseUrl = config.baseUrl.replace(/\/+$/, '')
     this.routes = config.routes
+    this.timeoutMs = config.timeoutMs ?? DEFAULT_REQUEST_TIMEOUT_MS
   }
 
   private route(name: keyof RouteMap): string {
@@ -21,6 +25,57 @@ export class ProductClient {
     return path
   }
 
+  /**
+   * Single HTTP boundary for every JSON request. Aborts after `timeoutMs`,
+   * checks `res.ok` BEFORE parsing, and throws `${method} ${path} failed:
+   * HTTP ${status} — ${body}` on a non-ok response so a 4xx/5xx error body
+   * can never be parsed as a success shape. Callers inspect the resolved
+   * value only after a successful return.
+   */
+  private async request(
+    method: string,
+    path: string,
+    body?: Record<string, unknown>,
+  ): Promise<Record<string, unknown>> {
+    const controller = new AbortController()
+    const timeout = setTimeout(() => controller.abort(), this.timeoutMs)
+    let res: Response
+    try {
+      res = await fetch(`${this.baseUrl}${path}`, {
+        method,
+        headers: {
+          ...(body ? { 'Content-Type': 'application/json' } : {}),
+          Origin: this.baseUrl,
+          Cookie: this.cookies,
+        },
+        ...(body ? { body: JSON.stringify(body) } : {}),
+        signal: controller.signal,
+      })
+    } finally {
+      clearTimeout(timeout)
+    }
+    if (!res.ok) {
+      const text = await res.text().catch(() => '')
+      throw new Error(`${method} ${path} failed: HTTP ${res.status} — ${text.slice(0, 500)}`)
+    }
+    return res.json() as Promise<Record<string, unknown>>
+  }
+
+  /**
+   * Read a required collection field. A missing/non-array field is a contract
+   * violation (wrong route, drift, partial body) — surface it loud instead of
+   * masking it as a healthy empty set. A genuinely empty `[]` passes through.
+   */
+  private requireArray<T>(res: Record<string, unknown>, field: string): T[] {
+    const value = res[field]
+    if (!Array.isArray(value)) {
+      throw new Error(
+        `Response missing array field "${field}" (got ${typeof value}: ${JSON.stringify(value)?.slice(0, 200)})`,
+      )
+    }
+    return value as T[]
+  }
+
   async signup(name: string, email: string, password: string): Promise<{ userId: string }> {
     const res = await this.post(this.route('signup'), { name, email, password })
     const user = res.user as Record<string, unknown> | undefined
@@ -121,19 +176,25 @@ export class ProductClient {
     workspaceId: string,
   ): Promise<{ id: string; title: string; status: string; priority: string }[]> {
     const res = await this.get(`${this.route('tasks')}?workspaceId=${workspaceId}`)
-    return (res.tasks ?? []) as { id: string; title: string; status: string; priority: string }[]
+    return this.requireArray<{ id: string; title: string; status: string; priority: string }>(
+      res,
+      'tasks',
+    )
   }
 
   async getEvents(workspaceId: string): Promise<{ id: string; title: string; type: string }[]> {
     const res = await this.get(`${this.route('events')}?workspaceId=${workspaceId}`)
-    return (res.events ?? []) as { id: string; title: string; type: string }[]
+    return this.requireArray<{ id: string; title: string; type: string }>(res, 'events')
   }
 
   async getApprovals(
     workspaceId: string,
   ): Promise<{ id: string; title: string; status: string; type: string }[]> {
     const res = await this.get(`${this.route('approvals')}?workspaceId=${workspaceId}`)
-    return (res.actions ?? []) as { id: string; title: string; status: string; type: string }[]
+    return this.requireArray<{ id: string; title: string; status: string; type: string }>(
+      res,
+      'actions',
+    )
   }
 
   async getVaultTree(workspaceId: string): Promise<string[]> {
@@ -146,7 +207,7 @@ export class ProductClient {
         if (node.children) extract(node.children)
       }
     }
-    extract((res.tree ?? []) as unknown[])
+    extract(this.requireArray<unknown>(res, 'tree'))
     return paths
   }
 
@@ -162,43 +223,22 @@ export class ProductClient {
     workspaceId: string,
   ): Promise<{ id: string; type: string; prompt: string }[]> {
     const res = await this.get(`${this.route('generations')}?workspaceId=${workspaceId}`)
-    return (res.generations ?? []) as { id: string; type: string; prompt: string }[]
+    return this.requireArray<{ id: string; type: string; prompt: string }>(res, 'generations')
   }
 
   /** Generic GET for custom routes */
   async get(path: string): Promise<Record<string, unknown>> {
-    const res = await fetch(`${this.baseUrl}${path}`, {
-      headers: { Cookie: this.cookies },
-    })
-    return res.json() as Promise<Record<string, unknown>>
+    return this.request('GET', path)
   }
 
   /** Generic POST for custom routes */
   async post(path: string, body: Record<string, unknown>): Promise<Record<string, unknown>> {
-    const res = await fetch(`${this.baseUrl}${path}`, {
-      method: 'POST',
-      headers: {
-        'Content-Type': 'application/json',
-        Origin: this.baseUrl,
-        Cookie: this.cookies,
-      },
-      body: JSON.stringify(body),
-    })
-    return res.json() as Promise<Record<string, unknown>>
+    return this.request('POST', path, body)
   }
 
   /** Generic PATCH for custom routes */
   async patch(path: string, body: Record<string, unknown>): Promise<Record<string, unknown>> {
-    const res = await fetch(`${this.baseUrl}${path}`, {
-      method: 'PATCH',
-      headers: {
-        'Content-Type': 'application/json',
-        Origin: this.baseUrl,
-        Cookie: this.cookies,
-      },
-      body: JSON.stringify(body),
-    })
-    return res.json() as Promise<Record<string, unknown>>
+    return this.request('PATCH', path, body)
   }
 }
 
diff --git a/src/command-runner.test.ts b/src/command-runner.test.ts
index 9310e0d9..2cd3bea9 100644
--- a/src/command-runner.test.ts
+++ b/src/command-runner.test.ts
@@ -1,4 +1,4 @@
-import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'
+import { chmodSync, mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'
 import { tmpdir } from 'node:os'
 import { join } from 'node:path'
 import { describe, expect, it } from 'vitest'
@@ -98,4 +98,66 @@ describe('localCommandRunner', () => {
     })
     expect(r.stdout).toBe('1')
   })
+
+  it('hasBin resolves a real binary without relying on `which`', async () => {
+    // `node` is on PATH in any environment that runs this suite. The
+    // resolution must not depend on a `which`/`command -v` subprocess
+    // (absent on Windows / minimal containers).
+    expect(await localCommandRunner.hasBin('node')).toBe(true)
+  })
+
+  it('hasBin resolves an absolute executable path directly', async () => {
+    expect(await localCommandRunner.hasBin(process.execPath)).toBe(true)
+  })
+
+  it('readFile surfaces a non-ENOENT error instead of collapsing it to null', async () => {
+    const wd = mkdtempSync(join(tmpdir(), 'cmd-runner-errno-'))
+    try {
+      // Reading a directory as a file yields EISDIR, not ENOENT. The old
+      // bare `catch { return null }` reported this real IO error as a
+      // missing file; the fix must surface (throw) it.
+      await expect(localCommandRunner.readFile(wd)).rejects.toThrow()
+    } finally {
+      rmSync(wd, { recursive: true, force: true })
+    }
+  })
+
+  it('readFile returns null only for genuinely-absent (ENOENT) files', async () => {
+    expect(
+      await localCommandRunner.readFile('/tmp/cmd-runner-definitely-absent-xyz-123'),
+    ).toBeNull()
+  })
+
+  it('readDir surfaces a non-ENOENT error instead of reporting an empty dir', async () => {
+    const wd = mkdtempSync(join(tmpdir(), 'cmd-runner-errno-dir-'))
+    const file = join(wd, 'not-a-dir.txt')
+    try {
+      writeFileSync(file, 'x')
+      // readdir on a regular file yields ENOTDIR, not ENOENT. The old
+      // bare `catch { return [] }` masked this as "empty directory."
+      await expect(localCommandRunner.readDir(file)).rejects.toThrow()
+    } finally {
+      rmSync(wd, { recursive: true, force: true })
+    }
+  })
+
+  it('readDir still returns [] for a genuinely-absent (ENOENT) directory', async () => {
+    expect(await localCommandRunner.readDir('/tmp/cmd-runner-no-such-dir-xyz-123')).toEqual([])
+  })
+
+  it.skipIf(process.platform === 'win32' || process.getuid?.() === 0)(
+    'readFile surfaces EACCES (permission) rather than masking it as absent',
+    async () => {
+      const wd = mkdtempSync(join(tmpdir(), 'cmd-runner-eacces-'))
+      const file = join(wd, 'secret.txt')
+      try {
+        writeFileSync(file, 'classified')
+        chmodSync(file, 0o000)
+        await expect(localCommandRunner.readFile(file)).rejects.toThrow()
+      } finally {
+        chmodSync(file, 0o600)
+        rmSync(wd, { recursive: true, force: true })
+      }
+    },
+  )
 })
diff --git a/src/command-runner.ts b/src/command-runner.ts
index d35be29f..36c3cf7c 100644
--- a/src/command-runner.ts
+++ b/src/command-runner.ts
@@ -17,8 +17,15 @@
  */
 
 import { spawnSync } from 'node:child_process'
-import { existsSync, readdirSync, readFileSync, statSync } from 'node:fs'
-import { join } from 'node:path'
+import {
+  accessSync,
+  existsSync,
+  constants as fsConstants,
+  readdirSync,
+  readFileSync,
+  statSync,
+} from 'node:fs'
+import { delimiter, join } from 'node:path'
 
 // ─── Types ──────────────────────────────────────────────────────────────
 
@@ -110,8 +117,7 @@ export const localCommandRunner: CommandRunner = {
     }
   },
   async hasBin(name: string): Promise<boolean> {
-    const r = spawnSync('which', [name], { encoding: 'utf8', timeout: 2000 })
-    return r.status === 0 && (r.stdout ?? '').trim().length > 0
+    return resolveBinOnPath(name) !== null
   },
   async fileExists(path: string): Promise<boolean> {
     return existsSync(path)
@@ -119,16 +125,25 @@ export const localCommandRunner: CommandRunner = {
   async readFile(path: string): Promise<string | null> {
     try {
       return readFileSync(path, 'utf8')
-    } catch {
-      return null
+    } catch (err) {
+      // ENOENT is the only "legitimately absent" signal → null. Any other
+      // errno (EACCES, EISDIR, EIO, …) is a real failure that must NOT
+      // masquerade as "file not present" — surface it so callers don't
+      // treat a permission/IO error as a missing artifact.
+      if (isErrnoCode(err, 'ENOENT')) return null
+      throw err
     }
   },
   async readDir(path: string): Promise<DirEntry[]> {
     let entries: string[]
     try {
       entries = readdirSync(path)
-    } catch {
-      return []
+    } catch (err) {
+      // Same ENOENT-vs-real-error split as readFile: a missing directory
+      // is legitimately empty (→ []); EACCES/ENOTDIR/EIO must surface so
+      // an unreadable directory isn't silently reported as having no files.
+      if (isErrnoCode(err, 'ENOENT')) return []
+      throw err
     }
     const out: DirEntry[] = []
     for (const name of entries) {
@@ -140,10 +155,63 @@ export const localCommandRunner: CommandRunner = {
           isFile: st.isFile(),
           sizeBytes: st.isFile() ? st.size : null,
         })
-      } catch {
-        // skip unreadable
+      } catch (err) {
+        // An entry that vanished between readdir and stat (TOCTOU) is a
+        // benign race → skip. Any other errno surfaces.
+        if (isErrnoCode(err, 'ENOENT')) continue
+        throw err
       }
     }
     return out
   },
 }
+
+function isErrnoCode(err: unknown, code: NodeJS.ErrnoException['code']): boolean {
+  return typeof err === 'object' && err !== null && (err as NodeJS.ErrnoException).code === code
+}
+
+/**
+ * Resolve `name` to an executable absolute path using the runner's PATH,
+ * or `null` if not found. Cross-platform: does not depend on `which`
+ * (absent on Windows) or a `command -v` shell (absent on minimal
+ * containers). On Windows, candidate extensions come from PATHEXT.
+ *
+ * Throws only on a non-ENOENT/EACCES filesystem error while probing a
+ * candidate — those indicate a real IO fault, not "binary absent."
+ */
+function resolveBinOnPath(name: string): string | null {
+  // An explicit path component means PATH lookup doesn't apply: probe directly.
+  if (name.includes('/') || name.includes('\\')) {
+    return isExecutable(name) ? name : null
+  }
+  const pathEnv = process.env.PATH ?? ''
+  const dirs = pathEnv.split(delimiter).filter((d) => d.length > 0)
+  const exts =
+    process.platform === 'win32'
+      ? (process.env.PATHEXT ?? '.COM;.EXE;.BAT;.CMD').split(';').filter((e) => e.length > 0)
+      : ['']
+  for (const dir of dirs) {
+    for (const ext of exts) {
+      const candidate = join(dir, name + ext)
+      if (isExecutable(candidate)) return candidate
+    }
+  }
+  return null
+}
+
+function isExecutable(path: string): boolean {
+  try {
+    // On win32 X_OK is a no-op; presence of the file (with a PATHEXT
+    // extension) is the executability signal there.
+    accessSync(path, process.platform === 'win32' ? fsConstants.F_OK : fsConstants.X_OK)
+    return true
+  } catch (err) {
+    // Not found / not executable / not permitted → "not a usable binary
+    // here," keep scanning. A different errno (EIO, ELOOP, …) is a real
+    // fault and must surface rather than silently yield a false negative.
+    if (isErrnoCode(err, 'ENOENT') || isErrnoCode(err, 'EACCES') || isErrnoCode(err, 'ENOTDIR')) {
+      return false
+    }
+    throw err
+  }
+}
diff --git a/src/detectors/edge.test.ts b/src/detectors/edge.test.ts
new file mode 100644
index 00000000..4dbcde04
--- /dev/null
+++ b/src/detectors/edge.test.ts
@@ -0,0 +1,55 @@
+import { describe, expect, it } from 'vitest'
+import { errorStreakDetector, noProgressDetector } from './index'
+
+describe('errorStreakDetector edge cases', () => {
+  it('is inert when status is undefined (does not advance or reset the streak)', () => {
+    const d = errorStreakDetector({ maxErrors: 2 })
+    expect(d.observe({ status: 'error' })).toBeNull() // streak 1
+    // a step with no status field must NOT touch the streak
+    expect(d.observe({})).toBeNull()
+    expect(d.streak).toBe(1)
+    // the next real error completes the streak across the inert step
+    expect(d.observe({ status: 'error' })).toMatchObject({ detector: 'error-streak', streak: 2 })
+  })
+
+  it('maxErrors <= 0 disables signalling without crashing', () => {
+    const d = errorStreakDetector({ maxErrors: 0 })
+    for (let i = 0; i < 5; i++) {
+      expect(d.observe({ status: 'error' })).toBeNull()
+    }
+    // streak still tracked for telemetry even though no signal fires
+    expect(d.streak).toBe(5)
+  })
+
+  it('negative maxErrors also disables without crashing', () => {
+    const d = errorStreakDetector({ maxErrors: -3 })
+    expect(d.observe({ status: 'error' })).toBeNull()
+    expect(d.observe({ status: 'error' })).toBeNull()
+    expect(d.streak).toBe(2)
+  })
+})
+
+describe('noProgressDetector edge cases', () => {
+  it('does not trip on step 1 without a primed baseline', () => {
+    // maxNoProgress 1 is the most aggressive setting; even so the FIRST observed
+    // step has no prior state to compare against, so stateUnchanged is false and
+    // the streak stays at 0 -> no signal.
+    const d = noProgressDetector({ maxNoProgress: 1 })
+    expect(d.observe({ stateFingerprint: 's0', score: 0 })).toBeNull()
+    expect(d.streak).toBe(0)
+    // the SECOND identical step now has a baseline -> trips at streak 1
+    expect(d.observe({ stateFingerprint: 's0', score: 0 })).toMatchObject({
+      detector: 'no-progress',
+      streak: 1,
+    })
+  })
+
+  it('maxNoProgress <= 0 disables signalling without crashing', () => {
+    const d = noProgressDetector({ maxNoProgress: 0 })
+    d.observe({ stateFingerprint: 's0', score: 0 }) // baseline
+    for (let i = 0; i < 5; i++) {
+      expect(d.observe({ stateFingerprint: 's0', score: 0 })).toBeNull()
+    }
+    expect(d.streak).toBe(5)
+  })
+})
diff --git a/src/eval-campaign.test.ts b/src/eval-campaign.test.ts
new file mode 100644
index 00000000..b9d49365
--- /dev/null
+++ b/src/eval-campaign.test.ts
@@ -0,0 +1,241 @@
+import { describe, expect, it } from 'vitest'
+import type { CampaignRunContext, CampaignRunOutcome, EvalCampaignOptions } from './eval-campaign'
+import { finalizeAbort, runEvalCampaign } from './eval-campaign'
+import { assertLlmRoute, type LlmClientOptions } from './llm-client'
+import { TraceEmitter } from './trace/emitter'
+import { NoopRawProviderSink } from './trace/raw-provider-sink'
+import { InMemoryTraceStore } from './trace/store'
+
+// A minimally-valid LLM config. routeRequirements is set to `{}` in every
+// campaign below so assertLlmRoute does not gate on baseUrl/auth.
+const LLM_OPTS: LlmClientOptions = {
+  baseUrl: 'https://api.example.test/v1',
+  apiKey: 'test-key',
+  provider: 'test',
+}
+
+const TOKENS = { input: 1, output: 1 }
+
+function passingOutcome(model = 'm@1', promptHash = 'p1'): CampaignRunOutcome {
+  return {
+    pass: true,
+    score: 1,
+    costUsd: 0,
+    tokenUsage: TOKENS,
+    model,
+    promptHash,
+    configHash: 'c1',
+  }
+}
+
+/** A runner that starts and ends the run so the run is captured + completed. */
+async function captureAndEnd(ctx: CampaignRunContext<unknown>): Promise<CampaignRunOutcome> {
+  await ctx.emitter.startRun({ scenarioId: ctx.scenarioId, variantId: ctx.variantId })
+  await ctx.emitter.endRun({ pass: true, score: 1 })
+  return passingOutcome()
+}
+
+function baseOpts(
+  overrides: Partial<EvalCampaignOptions<unknown>> = {},
+): EvalCampaignOptions<unknown> {
+  return {
+    campaignId: 'c',
+    variants: [{ id: 'v0', payload: {} }],
+    scenarios: [{ scenarioId: 's0' }],
+    seeds: [0],
+    commitSha: 'sha',
+    llmOpts: LLM_OPTS,
+    routeRequirements: {},
+    storeFactory: () => new InMemoryTraceStore(),
+    rawSinkFactory: () => new NoopRawProviderSink(),
+    integrity: {
+      llmSpansMin: 0,
+      requireRawCoverageOfLlmSpans: false,
+      rawProviderEventsMin: 0,
+    },
+    runner: captureAndEnd,
+    ...overrides,
+  }
+}
+
+describe('finalizeAbort', () => {
+  it('aborts a run still in `running` status', async () => {
+    const store = new InMemoryTraceStore()
+    const emitter = new TraceEmitter(store, { runId: 'r1', now: () => 1 })
+    await emitter.startRun({ scenarioId: 's' })
+
+    await finalizeAbort(emitter, 'r1', 'pool aborted')
+
+    const run = await store.getRun('r1')
+    expect(run?.status).toBe('aborted')
+    expect(run?.outcome).toEqual({ pass: false, notes: 'pool aborted' })
+  })
+
+  // Regression: the guard. Before the `status !== 'running'` check was added,
+  // finalizeAbort unconditionally called abortRun and would overwrite a
+  // successfully-completed run — flipping status `completed`→`aborted` and the
+  // real outcome `{ pass: true }`→`{ pass: false, notes: reason }`. This is the
+  // exact corruption the post-settle cleanup could cause if the open-run
+  // bookkeeping invariant is ever broken in a refactor.
+  it('does NOT overwrite a run that is already completed', async () => {
+    const store = new InMemoryTraceStore()
+    const emitter = new TraceEmitter(store, { runId: 'r2', now: () => 1 })
+    await emitter.startRun({ scenarioId: 's' })
+    await emitter.endRun({ pass: true, score: 0.9, notes: 'real outcome' })
+
+    const before = await store.getRun('r2')
+    expect(before?.status).toBe('completed')
+
+    await finalizeAbort(emitter, 'r2', 'late cleanup must not corrupt this')
+
+    const after = await store.getRun('r2')
+    expect(after?.status).toBe('completed')
+    expect(after?.outcome).toEqual({ pass: true, score: 0.9, notes: 'real outcome' })
+  })
+
+  it('does NOT overwrite a run that already failed', async () => {
+    const store = new InMemoryTraceStore()
+    const emitter = new TraceEmitter(store, { runId: 'r3', now: () => 1 })
+    await emitter.startRun({ scenarioId: 's' })
+    await emitter.endRun({ pass: false, score: 0, notes: 'genuine failure' })
+
+    await finalizeAbort(emitter, 'r3', 'should not touch a failed run')
+
+    const after = await store.getRun('r3')
+    expect(after?.status).toBe('failed')
+    expect(after?.outcome).toEqual({ pass: false, score: 0, notes: 'genuine failure' })
+  })
+
+  it('is a no-op (no throw) when the run was never started', async () => {
+    const store = new InMemoryTraceStore()
+    const emitter = new TraceEmitter(store, { runId: 'never', now: () => 1 })
+
+    await expect(finalizeAbort(emitter, 'never', 'nothing to abort')).resolves.toBeUndefined()
+    expect(await store.getRun('never')).toBeUndefined()
+  })
+
+  it('propagates a genuine store-read error rather than swallowing it', async () => {
+    class ReadFailingStore extends InMemoryTraceStore {
+      async getRun(): Promise<never> {
+        throw new Error('disk fault on getRun')
+      }
+    }
+    const store = new ReadFailingStore()
+    const emitter = new TraceEmitter(store, { runId: 'r4', now: () => 1 })
+
+    await expect(finalizeAbort(emitter, 'r4', 'x')).rejects.toThrow('disk fault on getRun')
+  })
+
+  it('propagates a genuine abort-write error on a running run', async () => {
+    class WriteFailingStore extends InMemoryTraceStore {
+      async updateRun(): Promise<never> {
+        throw new Error('disk full on updateRun')
+      }
+    }
+    const store = new WriteFailingStore()
+    const emitter = new TraceEmitter(store, { runId: 'r5', now: () => 1 })
+    await emitter.startRun({ scenarioId: 's' })
+
+    await expect(finalizeAbort(emitter, 'r5', 'x')).rejects.toThrow('disk full on updateRun')
+  })
+})
+
+describe('runEvalCampaign — genuine-error orphan handling', () => {
+  // A genuine (non-CellExecutionError) error from a worker — here a profile/model
+  // contradiction surfaced AFTER the runner already completed and finalized its
+  // run — must NOT reject the pool mid-flight and orphan peers. The campaign uses
+  // allSettled, finalizes every still-open run in the post-settle sweep, then
+  // re-throws the aggregated genuine error.
+  it('re-throws the genuine error and finalizes the failing run without corrupting it', async () => {
+    const stores = new Map<string, InMemoryTraceStore>()
+    const opts = baseOpts({
+      variants: [{ id: 'v0', payload: {} }],
+      scenarios: [{ scenarioId: 's0' }],
+      seeds: [0],
+      storeFactory: (p) => {
+        const s = new InMemoryTraceStore()
+        stores.set(p.runId, s)
+        return s
+      },
+      runner: async (ctx) => {
+        await ctx.emitter.startRun({ scenarioId: ctx.scenarioId })
+        await ctx.emitter.endRun({ pass: true, score: 1 })
+        // Force a genuine error AFTER the run is completed: the declared profile
+        // contradicts outcome.model. This is thrown from runOneCell, not the
+        // runner, so it is a genuine error, not a CellExecutionError.
+        return {
+          ...passingOutcome('actual-model@1'),
+          agentProfile: {
+            profileId: 'prof-1',
+            sourceProfile: { kind: 'test-profile', hash: 'a'.repeat(64) },
+            model: 'different-model@1',
+            promptHash: 'p1',
+          },
+        }
+      },
+    })
+
+    await expect(runEvalCampaign(opts)).rejects.toThrow(/agentProfile\.model/)
+
+    // The completed run must NOT have been flipped to aborted by post-settle
+    // cleanup — its terminal status is preserved.
+    expect(stores.size).toBe(1)
+    const [, store] = [...stores][0]!
+    const runs = await store.listRuns()
+    expect(runs).toHaveLength(1)
+    expect(runs[0]!.status).toBe('completed')
+    expect(runs[0]!.outcome).toEqual({ pass: true, score: 1 })
+  })
+
+  it('aggregates multiple genuine errors into an AggregateError', async () => {
+    const opts = baseOpts({
+      variants: [{ id: 'v0', payload: {} }],
+      scenarios: [{ scenarioId: 's0' }, { scenarioId: 's1' }],
+      seeds: [0],
+      concurrency: 2,
+      runner: async (ctx) => {
+        await ctx.emitter.startRun({ scenarioId: ctx.scenarioId })
+        await ctx.emitter.endRun({ pass: true, score: 1 })
+        return {
+          ...passingOutcome('actual@1'),
+          agentProfile: {
+            profileId: 'prof-1',
+            sourceProfile: { kind: 'test-profile', hash: 'a'.repeat(64) },
+            model: 'mismatch@1',
+            promptHash: 'p1',
+          },
+        }
+      },
+    })
+
+    await expect(runEvalCampaign(opts)).rejects.toBeInstanceOf(AggregateError)
+  })
+
+  it('records a runner-throw as a failed run and still completes the campaign', async () => {
+    const opts = baseOpts({
+      runner: async (ctx) => {
+        await ctx.emitter.startRun({ scenarioId: ctx.scenarioId })
+        throw new Error('runner blew up')
+      },
+    })
+
+    const result = await runEvalCampaign(opts)
+    expect(result.failedRuns).toHaveLength(1)
+    expect(result.failedRuns[0]!.reason).toBe('runner_threw')
+    expect(result.failedRuns[0]!.error).toBe('runner blew up')
+    expect(result.runs).toHaveLength(0)
+  })
+
+  it('happy path: a captured + completed run lands in result.runs', async () => {
+    const result = await runEvalCampaign(baseOpts())
+    expect(result.runs).toHaveLength(1)
+    expect(result.failedRuns).toHaveLength(0)
+    expect(result.runs[0]!.outcome.holdoutScore).toBe(1)
+  })
+})
+
+describe('preflight sanity', () => {
+  it('assertLlmRoute is exercised by the campaign (smoke)', () => {
+    expect(() => assertLlmRoute(LLM_OPTS, {})).not.toThrow()
+  })
+})
diff --git a/src/eval-campaign.ts b/src/eval-campaign.ts
index c8052e39..6489cb4c 100644
--- a/src/eval-campaign.ts
+++ b/src/eval-campaign.ts
@@ -379,9 +379,23 @@ export async function runEvalCampaign<V>(
   const failedRuns: FailedRun[] = []
 
   // ── Execute (bounded-concurrency worker pool) ──────────────────────
+  // A genuine (non-CellExecutionError) error from any worker is a bug, not a
+  // run-level failure. We must NOT let `Promise.all` reject mid-flight and
+  // orphan the other workers' in-progress runs (emitters never finalized,
+  // sinks/handles leak, partial work silently discarded). Instead: capture the
+  // first genuine error, stop dispatching new cells so in-flight workers wind
+  // down, finalize every still-open run, then re-throw the aggregated error.
   let cursor = 0
+  let aborting = false
+  const genuineErrors: unknown[] = []
+  // Emitters for runs that are currently mid-flight, keyed by runId. A worker
+  // registers its emitter before invoking the runner and removes it once the
+  // run is finalized (via endRun/abortRun, success or failure). Anything left
+  // here after the pool settles is an orphan we must abort.
+  const openRuns = new Map<string, TraceEmitter>()
+
   async function worker(): Promise<void> {
-    while (true) {
+    while (!aborting) {
       const i = cursor++
       if (i >= cells.length) return
       const cell = cells[i]!
@@ -395,8 +409,12 @@ export async function runEvalCampaign<V>(
           if (err.integrity) integrityReports.push(err.integrity)
         } else {
           // Genuine bug — not a runner failure, not an integrity failure.
-          // Surface it; don't silently mask.
-          throw err
+          // Capture it and stop dispatching so peers wind down gracefully;
+          // re-thrown after the pool settles. Do not surface here — that would
+          // reject Promise.all and orphan the other workers' runs.
+          genuineErrors.push(err)
+          aborting = true
+          return
         }
       }
     }
@@ -427,6 +445,9 @@ export async function runEvalCampaign<V>(
       now: opts.now,
       onRunComplete: opts.onRunComplete,
     })
+    // Track this run as open so a genuine error elsewhere in the pool can
+    // finalize it instead of orphaning it. Removed in the finally below.
+    openRuns.set(runId, emitter)
 
     const llmOpts: LlmClientOptions = {
       ...opts.llmOpts,
@@ -449,101 +470,129 @@ export async function runEvalCampaign<V>(
       llmOpts,
     }
 
-    const wallStart = now()
-    let outcome: CampaignRunOutcome
     try {
-      outcome = await opts.runner(ctx)
-    } catch (err) {
-      const message = err instanceof Error ? err.message : String(err)
-      // The runner threw mid-execution; give it a chance to have aborted.
+      const wallStart = now()
+      let outcome: CampaignRunOutcome
       try {
-        await emitter.abortRun(message)
-      } catch {
-        // Already aborted/ended; ignore.
+        outcome = await opts.runner(ctx)
+      } catch (err) {
+        const message = err instanceof Error ? err.message : String(err)
+        // The runner threw mid-execution. Abort the run so the emitter
+        // finalizes. The only benign abortRun failure is "the runner never
+        // started the run" (nothing to finalize). A store-write failure (disk
+        // full, FS error) is a genuine diagnostic — surface it rather than
+        // masking it as a plain runner failure.
+        await finalizeAbort(emitter, runId, message)
+        throw new CellExecutionError({
+          runId,
+          variantId: cell.variant.id,
+          scenarioId: cell.scenario.scenarioId,
+          seed: cell.seed,
+          reason: 'runner_threw',
+          error: message,
+        })
+      }
+      const wallMs = now() - wallStart
+
+      const integrityReport = await assertRunCaptured(store, runId, { ...integrity, rawSink })
+      if (!integrityReport.ok) {
+        switch (onIntegrityFailure) {
+          case 'throw':
+            throw new RunIntegrityError(integrityReport)
+          case 'mark_failed':
+            throw new CellExecutionError(
+              {
+                runId,
+                variantId: cell.variant.id,
+                scenarioId: cell.scenario.scenarioId,
+                seed: cell.seed,
+                reason: 'integrity_failed',
+                error: integrityReport.issues.map((i) => i.code).join(', '),
+              },
+              integrityReport,
+            )
+          case 'log':
+            // Caller wants the run admitted with a flagged report; fall through.
+            break
+        }
       }
-      throw new CellExecutionError({
+
+      const recordOutcome: RunOutcome = {
+        raw: outcome.raw ?? {},
+      }
+      if (splitTag === 'holdout') recordOutcome.holdoutScore = outcome.score
+      else recordOutcome.searchScore = outcome.score
+      if (outcome.judgeScores !== undefined) recordOutcome.judgeScores = outcome.judgeScores
+
+      const record: RunRecord = {
         runId,
-        variantId: cell.variant.id,
-        scenarioId: cell.scenario.scenarioId,
+        experimentId: opts.campaignId,
+        candidateId: cell.variant.id,
         seed: cell.seed,
-        reason: 'runner_threw',
-        error: message,
-      })
-    }
-    const wallMs = now() - wallStart
-
-    const integrityReport = await assertRunCaptured(store, runId, { ...integrity, rawSink })
-    if (!integrityReport.ok) {
-      switch (onIntegrityFailure) {
-        case 'throw':
-          throw new RunIntegrityError(integrityReport)
-        case 'mark_failed':
-          throw new CellExecutionError(
-            {
+        model: outcome.model,
+        promptHash: outcome.promptHash,
+        configHash: outcome.configHash,
+        commitSha: opts.commitSha,
+        wallMs,
+        costUsd: outcome.costUsd,
+        tokenUsage: outcome.tokenUsage,
+        judgeMetadata: outcome.judgeMetadata,
+        outcome: recordOutcome,
+        ...(outcome.failureClass ? { failureClass: outcome.failureClass } : {}),
+        failureMode: outcome.failureMode,
+        splitTag,
+        scenarioId: cell.scenario.scenarioId,
+      }
+      const profileSource =
+        outcome.agentProfile ??
+        (typeof opts.agentProfile === 'function'
+          ? await opts.agentProfile({
+              campaignId: opts.campaignId,
               runId,
               variantId: cell.variant.id,
               scenarioId: cell.scenario.scenarioId,
               seed: cell.seed,
-              reason: 'integrity_failed',
-              error: integrityReport.issues.map((i) => i.code).join(', '),
-            },
-            integrityReport,
-          )
-        case 'log':
-          // Caller wants the run admitted with a flagged report; fall through.
-          break
+              variant: cell.variant.payload,
+              scenarioTags: cell.scenario.tags ?? {},
+            })
+          : opts.agentProfile)
+      if (profileSource !== undefined) {
+        const agentProfile = await resolveAgentProfileCell(profileSource)
+        assertAgentProfileMatchesRun(agentProfile, outcome.model, outcome.promptHash)
+        record.agentProfile = agentProfile
       }
+      return { record: validateRunRecord(record), integrity: integrityReport }
+    } finally {
+      // This run's worker has finished with it (success, run-level failure, or
+      // genuine error). It is no longer the pool's job to finalize — drop it so
+      // the post-settle sweep doesn't double-abort a finalized run.
+      openRuns.delete(runId)
     }
-
-    const recordOutcome: RunOutcome = {
-      raw: outcome.raw ?? {},
-    }
-    if (splitTag === 'holdout') recordOutcome.holdoutScore = outcome.score
-    else recordOutcome.searchScore = outcome.score
-    if (outcome.judgeScores !== undefined) recordOutcome.judgeScores = outcome.judgeScores
-
-    const record: RunRecord = {
-      runId,
-      experimentId: opts.campaignId,
-      candidateId: cell.variant.id,
-      seed: cell.seed,
-      model: outcome.model,
-      promptHash: outcome.promptHash,
-      configHash: outcome.configHash,
-      commitSha: opts.commitSha,
-      wallMs,
-      costUsd: outcome.costUsd,
-      tokenUsage: outcome.tokenUsage,
-      judgeMetadata: outcome.judgeMetadata,
-      outcome: recordOutcome,
-      ...(outcome.failureClass ? { failureClass: outcome.failureClass } : {}),
-      failureMode: outcome.failureMode,
-      splitTag,
-      scenarioId: cell.scenario.scenarioId,
-    }
-    const profileSource =
-      outcome.agentProfile ??
-      (typeof opts.agentProfile === 'function'
-        ? await opts.agentProfile({
-            campaignId: opts.campaignId,
-            runId,
-            variantId: cell.variant.id,
-            scenarioId: cell.scenario.scenarioId,
-            seed: cell.seed,
-            variant: cell.variant.payload,
-            scenarioTags: cell.scenario.tags ?? {},
-          })
-        : opts.agentProfile)
-    if (profileSource !== undefined) {
-      const agentProfile = await resolveAgentProfileCell(profileSource)
-      assertAgentProfileMatchesRun(agentProfile, outcome.model, outcome.promptHash)
-      record.agentProfile = agentProfile
-    }
-    return { record: validateRunRecord(record), integrity: integrityReport }
   }
 
   const workers = Array.from({ length: Math.min(concurrency, cells.length) }, () => worker())
-  await Promise.all(workers)
+  // allSettled (not all): a genuine error in one worker must not reject the
+  // pool mid-flight and orphan the others. Each worker captures its own
+  // genuine error into `genuineErrors` and returns; we re-throw below.
+  await Promise.allSettled(workers)
+
+  // Finalize any run still open after the pool wound down. With the
+  // stop-dispatch flag these are the runs that were mid-flight in peer workers
+  // when the first genuine error fired — abort them so their emitters finalize
+  // (hooks fire, store records a terminal status) instead of leaking.
+  for (const [runId, emitter] of openRuns) {
+    await finalizeAbort(emitter, runId, 'campaign aborted: genuine error in a sibling run')
+  }
+  openRuns.clear()
+
+  if (genuineErrors.length > 0) {
+    throw genuineErrors.length === 1
+      ? genuineErrors[0]
+      : new AggregateError(
+          genuineErrors,
+          `runEvalCampaign: ${genuineErrors.length} runs failed with genuine (non-run-level) errors`,
+        )
+  }
 
   // ── Optional research report ───────────────────────────────────────
   let report: ResearchReport | undefined
@@ -585,6 +634,32 @@ class CellExecutionError extends Error {
   }
 }
 
+/**
+ * Abort a run whose owning work threw or was orphaned by a sibling's genuine
+ * error, finalizing the emitter so hooks fire and the store records a terminal
+ * status. Safe to call unconditionally: it only aborts runs that are still
+ * `running`. Two no-op cases are intentional and benign:
+ *
+ *   - the run was never started (absent from the store) — nothing to finalize
+ *   - the run is already terminal (`completed` / `failed` / `aborted`) —
+ *     finalizing again would overwrite the real outcome (e.g. flip a passed run
+ *     to `aborted` with `{ pass: false, notes: reason }`), so we leave it alone
+ *
+ * Any OTHER failure of the store read or the abort write (disk full, FS fault,
+ * backend down) is a genuine diagnostic and propagates rather than being
+ * swallowed.
+ */
+export async function finalizeAbort(
+  emitter: TraceEmitter,
+  runId: string,
+  reason: string,
+): Promise<void> {
+  const existing = await emitter.traceStore.getRun(runId)
+  if (existing === undefined) return // run never started; nothing to abort
+  if (existing.status !== 'running') return // already finalized; never overwrite a real outcome
+  await emitter.abortRun(reason)
+}
+
 function defaultRawSinkFactory(workDir: string | undefined) {
   return (params: CampaignFactoryParams): RawProviderSink => {
     if (!workDir) {
diff --git a/src/executor.test.ts b/src/executor.test.ts
new file mode 100644
index 00000000..6b6f0ebd
--- /dev/null
+++ b/src/executor.test.ts
@@ -0,0 +1,179 @@
+/**
+ * Capture-integrity contract for the scenario executor. Two failure classes
+ * that a sloppy fallback erases must stay loud:
+ *   1. a malformed/hung chat response must NOT be recorded as a real empty turn
+ *      (capture defect, indistinguishable from a model that said nothing);
+ *   2. a judge that errors must record WHY, retry only transient faults, and
+ *      fail deterministic faults immediately.
+ */
+
+import type { TCloud } from '@tangle-network/tcloud'
+import { describe, expect, it, vi } from 'vitest'
+
+import { CaptureIntegrityError } from './errors'
+import { type ExecutorConfig, executeScenario, type JudgeFailure } from './executor'
+import { JudgeParseError } from './judges'
+import type { JudgeFn, Scenario, ScenarioResult } from './types'
+
+function scenario(overrides: Partial<Scenario> = {}): Scenario {
+  return {
+    id: 'cap-1',
+    persona: 'analyst',
+    label: 'capture',
+    thesis: 'executor preserves capture integrity',
+    dimensions: ['accuracy'],
+    turns: [{ user: 'hello', expectedBehaviors: [] }],
+    artifactChecks: [],
+    ...overrides,
+  }
+}
+
+/** A TCloud whose chat() returns whatever shape the test supplies. */
+function chatStub(resp: unknown): TCloud {
+  return { chat: vi.fn(async () => resp) } as unknown as TCloud
+}
+
+/** No-op sleep so the retry policy runs without real backoff. */
+const noSleep = (_ms: number) => Promise.resolve()
+
+function config(overrides: Partial<ExecutorConfig> = {}): ExecutorConfig {
+  return {
+    systemPrompt: 'you are a test agent',
+    judges: [],
+    sleep: noSleep,
+    ...overrides,
+  }
+}
+
+describe('executeScenario — malformed chat response is a loud capture defect', () => {
+  it('throws CaptureIntegrityError when choices[0].message is absent', async () => {
+    const tc = chatStub({ choices: [{}] })
+    await expect(executeScenario(tc, scenario(), config())).rejects.toBeInstanceOf(
+      CaptureIntegrityError,
+    )
+  })
+
+  it('throws when content is missing entirely (collapsed to "" under the old code)', async () => {
+    const tc = chatStub({ choices: [{ message: { role: 'assistant' } }] })
+    await expect(executeScenario(tc, scenario(), config())).rejects.toBeInstanceOf(
+      CaptureIntegrityError,
+    )
+  })
+
+  it('throws when content is a non-string (e.g. null) instead of recording an empty turn', async () => {
+    const tc = chatStub({ choices: [{ message: { content: null } }] })
+    await expect(executeScenario(tc, scenario(), config())).rejects.toBeInstanceOf(
+      CaptureIntegrityError,
+    )
+  })
+
+  it('PRESERVES a legitimately-empty string — a model that chose to say nothing is real signal', async () => {
+    const tc = chatStub({ choices: [{ message: { content: '' } }] })
+    const result = await executeScenario(tc, scenario(), config())
+    expect(result.turns).toHaveLength(1)
+    expect(result.turns[0]!.agentResponse).toBe('')
+  })
+})
+
+describe('executeScenario — judge retry policy records the reason and gates on transience', () => {
+  const goodResp = { choices: [{ message: { content: 'hi there' } }] }
+
+  it('does NOT retry a deterministic judge error — calls the judge exactly once', async () => {
+    const judge = vi.fn(async () => {
+      throw new Error('rubric validation failed')
+    }) as unknown as JudgeFn
+    const tc = chatStub(goodResp)
+
+    const result = await executeScenario(tc, scenario(), config({ judges: [judge] }))
+
+    expect((judge as unknown as ReturnType<typeof vi.fn>).mock.calls).toHaveLength(1)
+    expect(result.judgeErrors).toBe(1)
+  })
+
+  it('records WHY a judge failed in additive judgeFailures with the actual error message', async () => {
+    const judge = vi.fn(async () => {
+      throw new Error('rubric validation failed')
+    }) as unknown as JudgeFn
+    const tc = chatStub(goodResp)
+
+    const result = (await executeScenario(
+      tc,
+      scenario(),
+      config({ judges: [judge] }),
+    )) as ScenarioResult & {
+      judgeFailures?: JudgeFailure[]
+    }
+
+    expect(result.judgeFailures).toBeDefined()
+    expect(result.judgeFailures).toHaveLength(1)
+    expect(result.judgeFailures![0]!.reason).toContain('rubric validation failed')
+    expect(result.judgeFailures![0]!.attempts).toBe(1)
+  })
+
+  it('retries a transient judge error and succeeds on the next attempt', async () => {
+    let calls = 0
+    const judge = vi.fn(async () => {
+      calls++
+      if (calls === 1) {
+        const err = new Error('socket hang up') as Error & { status?: number }
+        err.status = 503
+        throw err
+      }
+      return [{ judgeName: 'j', dimension: 'accuracy', score: 0.8, reasoning: 'ok' }]
+    }) as unknown as JudgeFn
+    const tc = chatStub(goodResp)
+
+    const result = (await executeScenario(
+      tc,
+      scenario(),
+      config({ judges: [judge] }),
+    )) as ScenarioResult & {
+      judgeFailures?: JudgeFailure[]
+    }
+
+    expect(calls).toBe(2)
+    expect(result.judgeErrors).toBe(0)
+    expect(result.judgeFailures).toBeUndefined()
+    expect(result.judgeScores).toHaveLength(1)
+  })
+
+  it('exhausts retries on a persistently-transient judge and reports exactly 3 attempts', async () => {
+    const judge = vi.fn(async () => {
+      const err = new Error('ECONNRESET') as Error & { status?: number }
+      err.status = 503
+      throw err
+    }) as unknown as JudgeFn
+    const tc = chatStub(goodResp)
+
+    const result = (await executeScenario(
+      tc,
+      scenario(),
+      config({ judges: [judge] }),
+    )) as ScenarioResult & {
+      judgeFailures?: JudgeFailure[]
+    }
+
+    expect((judge as unknown as ReturnType<typeof vi.fn>).mock.calls).toHaveLength(3)
+    expect(result.judgeFailures![0]!.attempts).toBe(3)
+    expect(result.judgeErrors).toBe(1)
+  })
+
+  it('does not retry a JudgeParseError but still records it as a failed judge', async () => {
+    const judge = vi.fn(async () => {
+      throw new JudgeParseError('domain', 'not json at all')
+    }) as unknown as JudgeFn
+    const tc = chatStub(goodResp)
+
+    const result = (await executeScenario(
+      tc,
+      scenario(),
+      config({ judges: [judge] }),
+    )) as ScenarioResult & {
+      judgeFailures?: JudgeFailure[]
+    }
+
+    expect((judge as unknown as ReturnType<typeof vi.fn>).mock.calls).toHaveLength(1)
+    expect(result.judgeErrors).toBe(1)
+    expect(result.judgeFailures![0]!.reason).toContain('unparseable')
+  })
+})
diff --git a/src/executor.ts b/src/executor.ts
index 73b76bae..e900d406 100644
--- a/src/executor.ts
+++ b/src/executor.ts
@@ -1,5 +1,7 @@
 import type { TCloud } from '@tangle-network/tcloud'
+import { CaptureIntegrityError } from './errors'
 import { JudgeParseError } from './judges'
+import { isTransientLlmError } from './llm-client'
 import { normalizeScores, weightedMean } from './statistics'
 import type {
   CollectedArtifacts,
@@ -31,6 +33,29 @@ export interface ExecutorConfig {
     check: Scenario['artifactChecks'][0],
     artifacts: CollectedArtifacts,
   ) => { passed: boolean; detail: string } | null
+  /**
+   * Sleep used between judge retries and after a judge succeeds. Defaults to
+   * real `setTimeout`. Injectable so tests exercise the retry policy without
+   * the real multi-second backoff.
+   */
+  sleep?: (ms: number) => Promise<void>
+}
+
+function describeShape(content: unknown, message: unknown): string {
+  if (message === undefined || message === null) return 'no choices[0].message'
+  return `message.content of type ${content === null ? 'null' : typeof content}`
+}
+
+function errMessage(err: unknown): string {
+  if (err instanceof Error) return `${err.name}: ${err.message}`
+  return String(err)
+}
+
+/** Per-judge failure diagnostic, recorded additively on `ScenarioResult`. */
+export interface JudgeFailure {
+  judge: string
+  attempts: number
+  reason: string
 }
 
 /**
@@ -72,9 +97,21 @@ export async function executeScenario(
       maxTokens: 3000,
     })
 
-    const content =
-      (resp as { choices?: { message?: { content?: string } }[] }).choices?.[0]?.message?.content ??
-      ''
+    const message = (resp as { choices?: { message?: { content?: unknown } }[] }).choices?.[0]
+      ?.message
+    const rawContent = message?.content
+    // A legitimately-empty turn ('' from a model that chose to say nothing) is
+    // real signal and must survive. A missing choices[0].message, or a content
+    // that is not a string, is a capture defect — the chat call hung or returned
+    // a malformed shape — and is indistinguishable from a real empty turn once
+    // collapsed to ''. Fail loud so it is never scored as a real response.
+    if (message === undefined || message === null || typeof rawContent !== 'string') {
+      throw new CaptureIntegrityError(
+        `chat response for scenario "${scenario.id}" turn ${i} is malformed: ` +
+          `expected choices[0].message.content to be a string, got ${describeShape(rawContent, message)}`,
+      )
+    }
+    const content = rawContent
 
     messages.push({ role: 'assistant', content })
 
@@ -173,29 +210,50 @@ export async function executeScenario(
   const judgeInput = { scenario, turns, artifacts }
   const judgeResults: JudgeScore[][] = []
   let failedJudges = 0
+  const judgeFailures: JudgeFailure[] = []
+  const sleep = config.sleep ?? ((ms: number) => new Promise<void>((r) => setTimeout(r, ms)))
 
-  for (const judge of config.judges) {
+  for (let j = 0; j < config.judges.length; j++) {
+    const judge = config.judges[j]!
+    const judgeName = judge.name || `judge[${j}]`
+    let lastError: unknown
+    let madeAttempts = 0
     for (let attempt = 0; attempt < 3; attempt++) {
+      madeAttempts = attempt + 1
       try {
         if (attempt > 0) {
           const wait = attempt * 10_000
-          console.log(`    judge retry ${attempt}/2 (waiting ${wait / 1000}s)`)
-          await new Promise((r) => setTimeout(r, wait))
+          console.log(`    ${judgeName} retry ${attempt}/2 (waiting ${wait / 1000}s)`)
+          await sleep(wait)
         }
         const scores = await judge(tc, judgeInput)
         judgeResults.push(scores)
-        await new Promise((r) => setTimeout(r, 3000))
+        await sleep(3000)
+        lastError = undefined
         break
       } catch (err) {
+        lastError = err
         if (err instanceof JudgeParseError) {
           // The model answered but unparseably — another identical prompt
           // won't fix it. Record the failure and move to the next judge.
-          failedJudges++
+          console.error(`    ${judgeName} unparseable (no retry): ${errMessage(err)}`)
           break
         }
-        if (attempt === 2) failedJudges++
+        console.error(`    ${judgeName} attempt ${attempt} failed: ${errMessage(err)}`)
+        // Deterministic errors (bad rubric, auth, 4xx) won't change on retry —
+        // only retry connection/rate-limit class faults. Stop the loop now so a
+        // permanent error is not masked by two pointless waits.
+        if (!isTransientLlmError(err)) break
       }
     }
+    if (lastError !== undefined) {
+      failedJudges++
+      judgeFailures.push({
+        judge: judgeName,
+        attempts: madeAttempts,
+        reason: errMessage(lastError),
+      })
+    }
   }
 
   const allScores = judgeResults.flat()
@@ -221,7 +279,10 @@ export async function executeScenario(
     })),
   )
 
-  return {
+  // judgeFailures is additive diagnostic context (WHY each judge errored),
+  // recorded alongside the existing judgeErrors count without widening the
+  // shared ScenarioResult contract. Absent when every judge succeeded.
+  const result: ScenarioResult & { judgeFailures?: JudgeFailure[] } = {
     scenarioId: scenario.id,
     persona: scenario.persona,
     turns,
@@ -232,4 +293,6 @@ export async function executeScenario(
     totalDurationMs: Date.now() - startTime,
     artifacts,
   }
+  if (judgeFailures.length > 0) result.judgeFailures = judgeFailures
+  return result
 }
diff --git a/src/judge-runner.test.ts b/src/judge-runner.test.ts
new file mode 100644
index 00000000..539914f5
--- /dev/null
+++ b/src/judge-runner.test.ts
@@ -0,0 +1,61 @@
+import { describe, expect, it } from 'vitest'
+import { compilerJudge, runJudgeFleet } from './judge-runner'
+import type { HarnessConfig, SandboxDriver, SandboxResult } from './sandbox-harness'
+
+/**
+ * Driver that records peak concurrent `exec` calls. Each exec yields to the
+ * event loop a few times so the pool's bound is observable: with unbounded
+ * fan-out every spec's subprocess is in-flight at once.
+ */
+function makeConcurrencyTrackingDriver(): SandboxDriver & { peak: number; active: number } {
+  const state = { id: 'tracking', peak: 0, active: 0 } as SandboxDriver & {
+    peak: number
+    active: number
+  }
+  state.exec = async (
+    phase: SandboxResult['phase'],
+    _command: string,
+    _config: HarnessConfig,
+  ): Promise<SandboxResult> => {
+    state.active++
+    if (state.active > state.peak) state.peak = state.active
+    // Yield repeatedly so other workers can start while we're "running".
+    for (let i = 0; i < 5; i++) await Promise.resolve()
+    state.active--
+    return { phase, exitCode: 0, stdout: '', stderr: '', wallMs: 1 }
+  }
+  return state
+}
+
+describe('runJudgeFleet bounded concurrency', () => {
+  it('caps concurrent subprocesses at the configured concurrency', async () => {
+    const driver = makeConcurrencyTrackingDriver()
+    const specs = Array.from({ length: 12 }, (_, i) =>
+      compilerJudge(`judge-${i}`, { runCommand: 'true' } as HarnessConfig),
+    )
+
+    const results = await runJudgeFleet(specs, { driver, concurrency: 3 })
+
+    expect(results).toHaveLength(12)
+    // OLD behavior (Promise.all over all specs) would peak at 12. The bounded
+    // pool must never exceed the configured concurrency.
+    expect(driver.peak).toBeLessThanOrEqual(3)
+    expect(driver.peak).toBeGreaterThan(0)
+  })
+
+  it('preserves result order matching input spec order', async () => {
+    const driver = makeConcurrencyTrackingDriver()
+    const specs = Array.from({ length: 8 }, (_, i) =>
+      compilerJudge(`spec-${i}`, { runCommand: 'true' } as HarnessConfig),
+    )
+    const results = await runJudgeFleet(specs, { driver, concurrency: 2 })
+    expect(results.map((r) => r.id)).toEqual(specs.map((s) => s.id))
+  })
+
+  it('returns [] for an empty spec list without spawning workers', async () => {
+    const driver = makeConcurrencyTrackingDriver()
+    const results = await runJudgeFleet([], { driver })
+    expect(results).toEqual([])
+    expect(driver.peak).toBe(0)
+  })
+})
diff --git a/src/judge-runner.ts b/src/judge-runner.ts
index 2c28289b..23d4bbe7 100644
--- a/src/judge-runner.ts
+++ b/src/judge-runner.ts
@@ -1,3 +1,4 @@
+import { cpus } from 'node:os'
 import {
   type HarnessConfig,
   type SandboxDriver,
@@ -28,6 +29,12 @@ export interface SandboxJudgeResult {
 export interface JudgeFleetOptions {
   driver?: SandboxDriver
   parallel?: boolean
+  /**
+   * Max concurrent judge subprocesses. Each spec spawns its own subprocess
+   * via the driver, so unbounded fan-out exhausts file descriptors / PIDs.
+   * Defaults to the host CPU count. Ignored when `parallel === false`.
+   */
+  concurrency?: number
 }
 
 export class JudgeRunner {
@@ -69,7 +76,18 @@ export async function runJudgeFleet(
     for (const spec of specs) results.push(await runner.run(spec))
     return results
   }
-  return await Promise.all(specs.map((spec) => runner.run(spec)))
+  const concurrency = Math.max(1, options.concurrency ?? cpus().length)
+  const results: SandboxJudgeResult[] = new Array(specs.length)
+  let cursor = 0
+  async function worker(): Promise<void> {
+    while (true) {
+      const i = cursor++
+      if (i >= specs.length) return
+      results[i] = await runner.run(specs[i]!)
+    }
+  }
+  await Promise.all(Array.from({ length: Math.min(concurrency, specs.length) }, () => worker()))
+  return results
 }
 
 export function compilerJudge(id: string, config: HarnessConfig): SandboxJudgeSpec {
diff --git a/src/llm-client.test.ts b/src/llm-client.test.ts
index 4883c4ae..7316ae76 100644
--- a/src/llm-client.test.ts
+++ b/src/llm-client.test.ts
@@ -257,6 +257,114 @@ describe('llm-client — retry semantics', () => {
   })
 })
 
+describe('llm-client — caller AbortSignal + cross-attempt deadline', () => {
+  it('an already-aborted caller signal fails loud without firing fetch', async () => {
+    const fetch = vi.fn(async () =>
+      mkOkResponse({ choices: [{ message: { content: 'x' } }], usage: {} }),
+    ) as unknown as typeof globalThis.fetch
+    const controller = new AbortController()
+    controller.abort()
+    await expect(
+      callLlm({ model: 'm', messages: [] }, { fetch, signal: controller.signal }),
+    ).rejects.toThrow(/abort/i)
+    expect(fetch as unknown as ReturnType<typeof vi.fn>).not.toHaveBeenCalled()
+  })
+
+  it('a caller abort mid-call is NOT retried — it surfaces immediately', async () => {
+    // Regression: callLlm took no external signal, so a campaign cancel could
+    // not stop an in-flight call, AND an AbortError matches the transient
+    // patterns — so a naive wiring would retry the cancelled call.
+    const controller = new AbortController()
+    let calls = 0
+    const fetch: typeof globalThis.fetch = (async (_url: string, init: RequestInit) => {
+      calls++
+      controller.abort()
+      // Mirror fetch's behavior: a request whose linked signal aborts rejects.
+      const err = new Error('The operation was aborted')
+      err.name = 'AbortError'
+      void init
+      throw err
+    }) as unknown as typeof globalThis.fetch
+
+    await expect(
+      callLlm({ model: 'm', messages: [] }, { fetch, signal: controller.signal, maxRetries: 3 }),
+    ).rejects.toThrow(/abort/i)
+    expect(calls).toBe(1)
+  })
+
+  it('a caller abort cancels the in-flight fetch via the linked signal', async () => {
+    const controller = new AbortController()
+    const fetch: typeof globalThis.fetch = ((_url: string, init: RequestInit) =>
+      new Promise((_resolve, reject) => {
+        init.signal?.addEventListener('abort', () => {
+          const err = new Error('The operation was aborted')
+          err.name = 'AbortError'
+          reject(err)
+        })
+      })) as unknown as typeof globalThis.fetch
+
+    const p = callLlm({ model: 'm', messages: [] }, { fetch, signal: controller.signal })
+    controller.abort()
+    await expect(p).rejects.toThrow(/abort/i)
+  })
+
+  it('stops retrying once the cross-attempt deadline is exhausted', async () => {
+    // Per-attempt timeout still bounds each call, but a tight wall-clock budget
+    // must cut the retry loop short rather than waiting full timeout × retries.
+    let calls = 0
+    const fetch: typeof globalThis.fetch = (async () => {
+      calls++
+      // Burn the entire deadline on the first attempt's "work".
+      await new Promise((r) => setTimeout(r, 30))
+      return mkErrResponse(503, 'still unavailable')
+    }) as unknown as typeof globalThis.fetch
+
+    await expect(
+      callLlm({ model: 'm', messages: [] }, { fetch, maxRetries: 5, deadlineMs: 10 }),
+    ).rejects.toBeInstanceOf(LlmCallError)
+    // Without the deadline this would retry up to 5 times; the budget caps it at 1.
+    expect(calls).toBe(1)
+  })
+})
+
+describe('llm-client — empty-content + finishReason signals', () => {
+  it('flags empty content and surfaces finish_reason=length (truncation)', async () => {
+    const fetch = mockFetch([
+      async () =>
+        mkOkResponse({
+          choices: [{ message: { content: '' }, finish_reason: 'length' }],
+          usage: {},
+        }),
+    ])
+    const r = await callLlm({ model: 'm', messages: [] }, { fetch })
+    expect(r.content).toBe('')
+    expect(r.contentEmpty).toBe(true)
+    expect(r.finishReason).toBe('length')
+  })
+
+  it('non-empty content reports contentEmpty=false and finish_reason=stop', async () => {
+    const fetch = mockFetch([
+      async () =>
+        mkOkResponse({
+          choices: [{ message: { content: 'real answer' }, finish_reason: 'stop' }],
+          usage: {},
+        }),
+    ])
+    const r = await callLlm({ model: 'm', messages: [] }, { fetch })
+    expect(r.contentEmpty).toBe(false)
+    expect(r.finishReason).toBe('stop')
+  })
+
+  it('whitespace-only content counts as empty; missing finish_reason is null', async () => {
+    const fetch = mockFetch([
+      async () => mkOkResponse({ choices: [{ message: { content: '   \n ' } }], usage: {} }),
+    ])
+    const r = await callLlm({ model: 'm', messages: [] }, { fetch })
+    expect(r.contentEmpty).toBe(true)
+    expect(r.finishReason).toBeNull()
+  })
+})
+
 describe('llm-client — isTransientLlmError classification', () => {
   it('classifies HTTP/2 + undici transport faults as transient', () => {
     expect(isTransientLlmError(new Error('terminated'))).toBe(true)
diff --git a/src/llm-client.ts b/src/llm-client.ts
index 048c725b..1e5e06ff 100644
--- a/src/llm-client.ts
+++ b/src/llm-client.ts
@@ -79,6 +79,22 @@ export interface LlmCallResult {
   model: string
   /** Wall-clock duration of the HTTP call (last attempt, if retried). */
   durationMs: number
+  /**
+   * `finish_reason` echoed from the first choice (`stop`, `length`,
+   * `content_filter`, `tool_calls`, ...). `null` when the provider omits it.
+   * Exposed so a free-form `callLlm` caller CAN detect a truncated answer
+   * (`length`) instead of treating a cut-off completion as complete. Note:
+   * `callLlm` does not itself reject on it — acting on this signal is the
+   * caller's responsibility (in-repo free-form drivers do not yet enforce it).
+   */
+  finishReason?: string | null
+  /**
+   * True when `content.trim()` is empty. An empty completion is a silent zero
+   * for free-form `callLlm` callers; this flag is the signal a caller can
+   * inspect to fail loud rather than proceed on an empty string. `callLlm`
+   * surfaces it but does not throw on it.
+   */
+  contentEmpty?: boolean
   /** Raw response body. */
   raw: Record<string, unknown>
 }
@@ -104,6 +120,21 @@ export interface LlmClientOptions {
   authHeader?: { name: string; value: string }
   /** Default timeout in ms. Per-call can override. */
   defaultTimeoutMs?: number
+  /**
+   * Caller-supplied abort signal — e.g. a campaign-wide cancel. Linked to
+   * each attempt's per-attempt timeout controller, so aborting it cancels
+   * the in-flight fetch. A caller abort is FATAL: it is not retried even
+   * though an AbortError otherwise matches the transient patterns.
+   */
+  signal?: AbortSignal
+  /**
+   * Cross-attempt wall-clock budget in ms, measured from the first attempt.
+   * Before launching each attempt the loop checks the remaining budget and
+   * stops retrying once it is exhausted, rather than waiting the full
+   * per-attempt timeout on every retry. Bounds total time independent of
+   * `maxRetries` × `timeoutMs`.
+   */
+  deadlineMs?: number
   /** Max retry attempts on retriable errors. Default 3 (1 initial + 2 retries). */
   maxRetries?: number
   /** Fetch implementation — defaults to global `fetch`. Override for custom transport (e.g. tests). */
@@ -271,6 +302,31 @@ async function sleep(ms: number): Promise<void> {
   return new Promise((resolve) => setTimeout(resolve, ms))
 }
 
+/**
+ * Combine the per-attempt timeout signal with an optional caller signal into
+ * one signal the fetch listens on. Prefers the native `AbortSignal.any`; falls
+ * back to manual wiring on runtimes that predate it. The caller signal is also
+ * propagated to the timeout controller so aborting it cancels the in-flight
+ * fetch immediately.
+ */
+function linkSignals(timeoutController: AbortController, caller?: AbortSignal): AbortSignal {
+  if (!caller) return timeoutController.signal
+  if (typeof (AbortSignal as { any?: unknown }).any === 'function') {
+    return AbortSignal.any([timeoutController.signal, caller])
+  }
+  if (caller.aborted) {
+    timeoutController.abort()
+  } else {
+    caller.addEventListener('abort', () => timeoutController.abort(), { once: true })
+  }
+  return timeoutController.signal
+}
+
+/** True once the cross-attempt wall-clock budget (if any) is exhausted. */
+function deadlineExceeded(start: number, deadlineMs: number | undefined): boolean {
+  return deadlineMs != null && Date.now() - start >= deadlineMs
+}
+
 // ─── Public API ─────────────────────────────────────────────────────────
 
 /**
@@ -366,10 +422,24 @@ export async function callLlm(
   const sink = opts.rawSink
   const redactor = opts.redactor ?? defaultProviderRedactor
   const traceContext = opts.traceContext
+  const callerSignal = opts.signal
+  const deadlineMs = opts.deadlineMs
+  const deadlineStart = Date.now()
 
   let lastErr: unknown
   for (let attempt = 0; attempt < maxRetries; attempt++) {
+    // A caller cancel is fatal — never retried. Checking before each attempt
+    // means an already-aborted signal short-circuits without firing fetch.
+    if (callerSignal?.aborted) {
+      throw new DOMException('callLlm aborted by caller signal', 'AbortError')
+    }
+    // Stop retrying once the cross-attempt budget is spent rather than burning
+    // a full per-attempt timeout on each remaining retry.
+    if (attempt > 0 && deadlineExceeded(deadlineStart, deadlineMs)) {
+      throw lastErr instanceof Error ? lastErr : new Error(String(lastErr))
+    }
     const controller = new AbortController()
+    const attemptSignal = linkSignals(controller, callerSignal)
     const timeoutHandle = setTimeout(() => controller.abort(), timeoutMs)
     const started = Date.now()
     const requestBody = buildBody(req, false)
@@ -397,7 +467,7 @@ export async function callLlm(
         method: 'POST',
         headers,
         body: JSON.stringify(requestBody),
-        signal: controller.signal,
+        signal: attemptSignal,
       })
       clearTimeout(timeoutHandle)
       const responseHeaders = sink ? headersToObject(res.headers) : undefined
@@ -431,7 +501,11 @@ export async function callLlm(
           body,
           req.model,
         )
-        if (RETRYABLE_STATUS.has(res.status) && attempt < maxRetries - 1) {
+        if (
+          RETRYABLE_STATUS.has(res.status) &&
+          attempt < maxRetries - 1 &&
+          !deadlineExceeded(deadlineStart, deadlineMs)
+        ) {
           lastErr = err
           const retryAfter = parseRetryAfter(res.headers)
           await sleep(retryAfter ?? backoffMs(attempt))
@@ -487,12 +561,19 @@ export async function callLlm(
           redactedFields: [],
         })
       }
-      const choice = (json.choices as Array<{ message?: { content?: string } }> | undefined)?.[0]
+      const choice = (
+        json.choices as
+          | Array<{ message?: { content?: string }; finish_reason?: string | null }>
+          | undefined
+      )?.[0]
       const usageRaw = (json.usage as Record<string, unknown> | undefined) ?? {}
       const costFromProxy = (json._response_cost ?? json.cost_usd) as number | undefined
+      const content = choice?.message?.content ?? ''
 
       return {
-        content: choice?.message?.content ?? '',
+        content,
+        finishReason: choice?.finish_reason ?? null,
+        contentEmpty: content.trim().length === 0,
         usage: {
           promptTokens: Number(usageRaw.prompt_tokens ?? 0),
           completionTokens: Number(usageRaw.completion_tokens ?? 0),
@@ -512,6 +593,29 @@ export async function callLlm(
     } catch (err) {
       clearTimeout(timeoutHandle)
       lastErr = err
+      // A caller cancel is fatal even though an AbortError matches the
+      // transient patterns — a cancelled call must surface immediately, not
+      // be retried against the same dead intent.
+      if (callerSignal?.aborted) {
+        if (sink && !attemptErrorRecorded) {
+          await recordRaw(sink, redactor, {
+            eventId: cryptoEventId(),
+            runId: traceContext?.runId,
+            spanId: traceContext?.spanId,
+            provider,
+            model: req.model,
+            endpoint,
+            baseUrl,
+            attemptIndex: attempt,
+            direction: 'error',
+            timestamp: Date.now(),
+            durationMs: Date.now() - started,
+            errorMessage: err instanceof Error ? err.message : String(err),
+            redactedFields: [],
+          })
+        }
+        throw err
+      }
       if (sink && !attemptErrorRecorded) {
         // Record only if neither the !res.ok branch nor the JSON.parse catch
         // already produced an error event for this attempt. Covers network
@@ -532,7 +636,11 @@ export async function callLlm(
           redactedFields: [],
         })
       }
-      if (attempt < maxRetries - 1 && isTransientLlmError(err)) {
+      if (
+        attempt < maxRetries - 1 &&
+        isTransientLlmError(err) &&
+        !deadlineExceeded(deadlineStart, deadlineMs)
+      ) {
         await sleep(backoffMs(attempt))
         continue
       }
diff --git a/src/pipelines/budget-breach.test.ts b/src/pipelines/budget-breach.test.ts
new file mode 100644
index 00000000..988dcd39
--- /dev/null
+++ b/src/pipelines/budget-breach.test.ts
@@ -0,0 +1,107 @@
+import { describe, expect, it } from 'vitest'
+import type { BudgetLedgerEntry, BudgetSpec, Run } from '../trace/schema'
+import { InMemoryTraceStore } from '../trace/store'
+import { budgetBreachView } from './budget-breach'
+
+function run(runId: string, scenarioId: string, variantId?: string): Run {
+  return { runId, scenarioId, variantId, startedAt: 1, status: 'completed' }
+}
+
+function entry(
+  runId: string,
+  dimension: keyof BudgetSpec,
+  limit: number,
+  consumed: number,
+  breached: boolean,
+): BudgetLedgerEntry {
+  return {
+    runId,
+    dimension,
+    limit,
+    consumed,
+    remaining: limit - consumed,
+    timestamp: 100,
+    breached,
+  }
+}
+
+async function storeWith(runs: Run[], entries: BudgetLedgerEntry[]): Promise<InMemoryTraceStore> {
+  const store = new InMemoryTraceStore()
+  for (const r of runs) await store.appendRun(r)
+  for (const e of entries) await store.appendBudgetEntry(e)
+  return store
+}
+
+describe('budgetBreachView', () => {
+  it('counts only breached:true entries, ignoring non-breached ledger rows', async () => {
+    const store = await storeWith(
+      [run('r1', 's1')],
+      [
+        entry('r1', 'tokens', 100, 50, false), // not breached -> ignored
+        entry('r1', 'tokens', 100, 150, true), // breached -> counted
+      ],
+    )
+    const report = await budgetBreachView(store)
+    expect(report.findings).toHaveLength(1)
+    expect(report.findings[0]!.consumed).toBe(150)
+    expect(report.byDimension).toEqual({ tokens: 1 })
+    expect(report.byScenario).toEqual({ s1: 1 })
+  })
+
+  it('excessRatio is Infinity (not NaN/crash) when limit === 0', async () => {
+    const store = await storeWith([run('r1', 's1')], [entry('r1', 'calls', 0, 5, true)])
+    const report = await budgetBreachView(store)
+    expect(report.findings).toHaveLength(1)
+    const ratio = report.findings[0]!.excessRatio
+    expect(ratio).toBe(Infinity)
+    expect(Number.isNaN(ratio)).toBe(false)
+  })
+
+  it('computes excessRatio as consumed/limit when limit > 0', async () => {
+    const store = await storeWith([run('r1', 's1')], [entry('r1', 'usd', 2, 5, true)])
+    const report = await budgetBreachView(store)
+    expect(report.findings[0]!.excessRatio).toBe(2.5)
+  })
+
+  it('tallies byDimension, byScenario, byVariant and breachedRunRatio across runs', async () => {
+    const store = await storeWith(
+      [run('r1', 's1', 'v1'), run('r2', 's1', 'v2'), run('r3', 's2', 'v1')],
+      [
+        entry('r1', 'tokens', 100, 200, true),
+        entry('r1', 'wallMs', 1000, 2000, true), // two breaches, same run
+        entry('r2', 'tokens', 100, 200, true),
+        // r3 has only a non-breach entry -> contributes to totalRuns, not breaches
+        entry('r3', 'tokens', 100, 10, false),
+      ],
+    )
+    const report = await budgetBreachView(store)
+    expect(report.totalRuns).toBe(3)
+    expect(report.byDimension).toEqual({ tokens: 2, wallMs: 1 })
+    expect(report.byScenario).toEqual({ s1: 3 })
+    expect(report.byVariant).toEqual({ v1: 2, v2: 1 })
+    // 2 of 3 runs breached at least once
+    expect(report.breachedRunRatio).toBeCloseTo(2 / 3, 10)
+  })
+
+  it('breachedRunRatio is 0 (not NaN) when there are no runs', async () => {
+    const store = new InMemoryTraceStore()
+    const report = await budgetBreachView(store)
+    expect(report.breachedRunRatio).toBe(0)
+    expect(Number.isNaN(report.breachedRunRatio)).toBe(false)
+    expect(report.findings).toHaveLength(0)
+  })
+
+  it('counts a run breaching multiple dimensions once toward breachedRunRatio', async () => {
+    const store = await storeWith(
+      [run('r1', 's1')],
+      [
+        entry('r1', 'tokens', 100, 200, true),
+        entry('r1', 'usd', 1, 2, true),
+        entry('r1', 'calls', 5, 9, true),
+      ],
+    )
+    const report = await budgetBreachView(store)
+    expect(report.findings).toHaveLength(3)
+    expect(report.breachedRunRatio).toBe(1)
+  })
+})
diff --git a/src/pipelines/failure-cluster.test.ts b/src/pipelines/failure-cluster.test.ts
new file mode 100644
index 00000000..c7bc1b4b
--- /dev/null
+++ b/src/pipelines/failure-cluster.test.ts
@@ -0,0 +1,142 @@
+import { describe, expect, it } from 'vitest'
+import type { Run, ToolSpan } from '../trace/schema'
+import { InMemoryTraceStore } from '../trace/store'
+import { failureClusterView } from './failure-cluster'
+
+function failedRun(runId: string, scenarioId: string): Run {
+  return {
+    runId,
+    scenarioId,
+    startedAt: 1000,
+    endedAt: 2000,
+    status: 'completed',
+    outcome: { pass: false, failureClass: 'tool_recovery_failure' },
+  }
+}
+
+function erroredTool(runId: string, name: string, args: unknown): ToolSpan {
+  return {
+    spanId: `${runId}-t`,
+    runId,
+    kind: 'tool',
+    name: `tool.${name}`,
+    startedAt: 1500,
+    status: 'error',
+    error: `${name} blew up`,
+    toolName: name,
+    args,
+  }
+}
+
+async function storeWith(
+  entries: Array<{ run: Run; spans: ToolSpan[] }>,
+): Promise<InMemoryTraceStore> {
+  const store = new InMemoryTraceStore()
+  for (const { run, spans } of entries) {
+    await store.appendRun(run)
+    for (const s of spans) await store.appendSpan(s)
+  }
+  return store
+}
+
+describe('failureClusterView', () => {
+  it('collapses identical failureClass+toolName+argPrefix into one cluster with summed runCount', async () => {
+    const store = await storeWith([
+      {
+        run: failedRun('r1', 's1'),
+        spans: [erroredTool('r1', 'bash', { cmd: 'ls -la /etc/passwd' })],
+      },
+      {
+        run: failedRun('r2', 's2'),
+        spans: [erroredTool('r2', 'bash', { cmd: 'ls -la /etc/passwd' })],
+      },
+      {
+        run: failedRun('r3', 's1'),
+        spans: [erroredTool('r3', 'bash', { cmd: 'ls -la /etc/passwd' })],
+      },
+    ])
+    const report = await failureClusterView(store)
+    expect(report.clusters).toHaveLength(1)
+    const c = report.clusters[0]!
+    expect(c.runCount).toBe(3)
+    expect(c.toolName).toBe('bash')
+    expect(c.failureClass).toBe('tool_recovery_failure')
+    // distinct scenarios are tracked, runs are not double-counted as scenarios
+    expect([...c.scenarioIds].sort()).toEqual(['s1', 's2'])
+    expect(report.totalFailures).toBe(3)
+    expect(report.totalRuns).toBe(3)
+  })
+
+  it('splits clusters that differ on any of the three dimensions', async () => {
+    const store = await storeWith([
+      // same class+tool, DIFFERENT args -> argPrefix differs -> separate cluster
+      { run: failedRun('a1', 's'), spans: [erroredTool('a1', 'bash', { cmd: 'one' })] },
+      { run: failedRun('a2', 's'), spans: [erroredTool('a2', 'bash', { cmd: 'two' })] },
+      // same class+args-shape but DIFFERENT tool -> separate cluster
+      { run: failedRun('b1', 's'), spans: [erroredTool('b1', 'curl', { cmd: 'one' })] },
+    ])
+    const report = await failureClusterView(store)
+    // three distinct (class, tool, argPrefix) tuples
+    expect(report.clusters).toHaveLength(3)
+    expect(report.clusters.every((c) => c.runCount === 1)).toBe(true)
+  })
+
+  it('does NOT merge a tool whose argHash differs only by a 17th+ char (argPrefix is first 16)', async () => {
+    // argHash is stableStringify of args; pick two arg objects whose stringify
+    // shares the first 16 chars but diverges later -> SAME prefix -> SAME cluster.
+    const shared = 'aaaaaaaaaaaaaa' // padding inside a shared key
+    const store = await storeWith([
+      { run: failedRun('p1', 's'), spans: [erroredTool('p1', 'bash', { k: `${shared}1` })] },
+      { run: failedRun('p2', 's'), spans: [erroredTool('p2', 'bash', { k: `${shared}2` })] },
+    ])
+    const report = await failureClusterView(store)
+    // {"k":"aaaaaaaaaaaaaa... -> first 16 chars identical across both
+    expect(report.clusters).toHaveLength(1)
+    expect(report.clusters[0]!.runCount).toBe(2)
+    expect(report.clusters[0]!.argPrefix).toHaveLength(16)
+  })
+
+  it('minClusterSize filters out clusters below the threshold', async () => {
+    const store = await storeWith([
+      { run: failedRun('big1', 's'), spans: [erroredTool('big1', 'bash', { cmd: 'x' })] },
+      { run: failedRun('big2', 's'), spans: [erroredTool('big2', 'bash', { cmd: 'x' })] },
+      { run: failedRun('small', 's'), spans: [erroredTool('small', 'curl', { cmd: 'y' })] },
+    ])
+    const filtered = await failureClusterView(store, { minClusterSize: 2 })
+    expect(filtered.clusters).toHaveLength(1)
+    expect(filtered.clusters[0]!.toolName).toBe('bash')
+    expect(filtered.clusters[0]!.runCount).toBe(2)
+    // but totals still count every failure regardless of filtering
+    expect(filtered.totalFailures).toBe(3)
+  })
+
+  it('sorts clusters by descending runCount', async () => {
+    const store = await storeWith([
+      { run: failedRun('s1', 's'), spans: [erroredTool('s1', 'rare', { cmd: 'r' })] },
+      { run: failedRun('m1', 's'), spans: [erroredTool('m1', 'mid', { cmd: 'm' })] },
+      { run: failedRun('m2', 's'), spans: [erroredTool('m2', 'mid', { cmd: 'm' })] },
+      { run: failedRun('h1', 's'), spans: [erroredTool('h1', 'hot', { cmd: 'h' })] },
+      { run: failedRun('h2', 's'), spans: [erroredTool('h2', 'hot', { cmd: 'h' })] },
+      { run: failedRun('h3', 's'), spans: [erroredTool('h3', 'hot', { cmd: 'h' })] },
+    ])
+    const report = await failureClusterView(store)
+    const counts = report.clusters.map((c) => c.runCount)
+    expect(counts).toEqual([3, 2, 1])
+    expect(report.clusters[0]!.toolName).toBe('hot')
+  })
+
+  it('does not count passing/completed runs as failures', async () => {
+    const store = new InMemoryTraceStore()
+    await store.appendRun({
+      runId: 'ok',
+      scenarioId: 's',
+      startedAt: 1,
+      status: 'completed',
+      outcome: { pass: true, score: 1 },
+    })
+    const report = await failureClusterView(store)
+    expect(report.totalFailures).toBe(0)
+    expect(report.clusters).toHaveLength(0)
+    expect(report.totalRuns).toBe(1)
+  })
+})
diff --git a/src/pipelines/first-divergence.test.ts b/src/pipelines/first-divergence.test.ts
new file mode 100644
index 00000000..9ebc6330
--- /dev/null
+++ b/src/pipelines/first-divergence.test.ts
@@ -0,0 +1,62 @@
+import { describe, expect, it } from 'vitest'
+import type { Span } from '../trace/schema'
+import { InMemoryTraceStore } from '../trace/store'
+import { firstDivergenceView } from './first-divergence'
+
+function llm(runId: string, n: number, model: string): Span {
+  return {
+    spanId: `${runId}-llm-${n}`,
+    runId,
+    kind: 'llm',
+    name: `llm.step.${n}`,
+    startedAt: n * 1000,
+    endedAt: n * 1000 + 100,
+    model,
+    messages: [],
+  }
+}
+
+async function seed(store: InMemoryTraceStore, runId: string, spans: Span[]): Promise<void> {
+  for (const s of spans) await store.appendSpan(s)
+  // buildTrajectory only reads spans + events; a run row isn't required for it.
+  void runId
+}
+
+describe('firstDivergenceView', () => {
+  it('reports the first differing step on equal-length, diverging trajectories', async () => {
+    const store = new InMemoryTraceStore()
+    await seed(store, 'A', [llm('A', 1, 'm1'), llm('A', 2, 'm1')])
+    await seed(store, 'B', [llm('B', 1, 'm1'), llm('B', 2, 'm2')])
+    const r = await firstDivergenceView(store, 'A', 'B')
+    expect(r.firstDivergenceIndex).toBe(1)
+    expect(r.aStep).toBeDefined()
+    expect(r.bStep).toBeDefined()
+    expect(r.reason).toContain('model m1 vs m2')
+  })
+
+  it('handles one empty trajectory without leaking "index -1" or an undefined step', async () => {
+    const store = new InMemoryTraceStore()
+    // Run A is empty (no spans); run B has steps.
+    await seed(store, 'A', [])
+    await seed(store, 'B', [llm('B', 1, 'm1'), llm('B', 2, 'm1')])
+    const r = await firstDivergenceView(store, 'A', 'B')
+    // Divergence is the first step itself, not minLen-1 === -1.
+    expect(r.firstDivergenceIndex).toBe(0)
+    expect(r.commonPrefixLen).toBe(0)
+    expect(r.reason).not.toContain('index -1')
+    expect(r.reason).toContain('index 0')
+    // The empty side has no step at index 0; the present side does.
+    expect(r.aStep).toBeUndefined()
+    expect(r.bStep).toBeDefined()
+    expect(r.bStep!.index).toBe(0)
+  })
+
+  it('reports identical trajectories as no divergence', async () => {
+    const store = new InMemoryTraceStore()
+    await seed(store, 'A', [llm('A', 1, 'm1')])
+    await seed(store, 'B', [llm('B', 1, 'm1')])
+    const r = await firstDivergenceView(store, 'A', 'B')
+    expect(r.firstDivergenceIndex).toBeNull()
+    expect(r.commonPrefixLen).toBe(1)
+  })
+})
diff --git a/src/pipelines/first-divergence.ts b/src/pipelines/first-divergence.ts
index 84d72c58..1bd2cc8a 100644
--- a/src/pipelines/first-divergence.ts
+++ b/src/pipelines/first-divergence.ts
@@ -54,13 +54,20 @@ export async function firstDivergenceView(
     return { runA, runB, firstDivergenceIndex: null, commonPrefixLen: minLen }
   }
   const longer: Trajectory = a.steps.length > b.steps.length ? a : b
+  const extra = longer.steps.length - minLen
+  // minLen === 0 means one trajectory is empty: divergence is the first step
+  // itself (index 0), and the absent side has no step to surface.
+  const reason =
+    minLen === 0
+      ? `one trajectory is empty; the other has ${extra} step(s) starting at index 0`
+      : `one trajectory has ${extra} more step(s) after index ${minLen - 1}`
   return {
     runA,
     runB,
     firstDivergenceIndex: minLen,
     aStep: a.steps[minLen],
     bStep: b.steps[minLen],
-    reason: `one trajectory has ${longer.steps.length - minLen} more step(s) after index ${minLen - 1}`,
+    reason,
     commonPrefixLen: minLen,
   }
 }
diff --git a/src/pipelines/judge-agreement.test.ts b/src/pipelines/judge-agreement.test.ts
new file mode 100644
index 00000000..b6e36715
--- /dev/null
+++ b/src/pipelines/judge-agreement.test.ts
@@ -0,0 +1,108 @@
+import { describe, expect, it } from 'vitest'
+import type { JudgeSpan } from '../trace/schema'
+import { InMemoryTraceStore } from '../trace/store'
+import { judgeAgreementView } from './judge-agreement'
+
+let seq = 0
+function judge(judgeId: string, targetSpanId: string, dimension: string, score: number): JudgeSpan {
+  return {
+    spanId: `j${seq++}`,
+    runId: 'r',
+    kind: 'judge',
+    name: 'judge.score',
+    startedAt: 1000,
+    judgeId,
+    targetSpanId,
+    dimension,
+    score,
+  }
+}
+
+async function storeWith(spans: JudgeSpan[]): Promise<InMemoryTraceStore> {
+  seq = 0
+  const store = new InMemoryTraceStore()
+  await store.appendRun({ runId: 'r', scenarioId: 's', startedAt: 1, status: 'completed' })
+  for (const s of spans) await store.appendSpan(s)
+  return store
+}
+
+describe('judgeAgreementView', () => {
+  it('skips a pair with fewer than 2 common items', async () => {
+    // A and B only co-score ONE target (t1); t2 is A-only, t3 is B-only.
+    const store = await storeWith([
+      judge('A', 't1', 'quality', 0.5),
+      judge('A', 't2', 'quality', 0.9),
+      judge('B', 't1', 'quality', 0.5),
+      judge('B', 't3', 'quality', 0.1),
+    ])
+    const report = await judgeAgreementView(store)
+    expect(report.pairs).toHaveLength(0)
+    expect(report.judgeIds).toEqual(['A', 'B'])
+  })
+
+  it('identical varying scores across common targets => pearson 1', async () => {
+    const store = await storeWith([
+      judge('A', 't1', 'quality', 0.3),
+      judge('A', 't2', 'quality', 0.7),
+      judge('B', 't1', 'quality', 0.3),
+      judge('B', 't2', 'quality', 0.7),
+    ])
+    const report = await judgeAgreementView(store)
+    expect(report.pairs).toHaveLength(1)
+    expect(report.pairs[0]!.commonItems).toBe(2)
+    expect(report.pairs[0]!.pearson).toBe(1)
+  })
+
+  it('pearson is 1 when BOTH judges have zero variance (constant scores)', async () => {
+    const store = await storeWith([
+      judge('A', 't1', 'quality', 0.5),
+      judge('A', 't2', 'quality', 0.5),
+      judge('B', 't1', 'quality', 0.5),
+      judge('B', 't2', 'quality', 0.5),
+    ])
+    const report = await judgeAgreementView(store)
+    expect(report.pairs).toHaveLength(1)
+    expect(report.pairs[0]!.pearson).toBe(1)
+  })
+
+  it('pearson is 0 when EXACTLY one judge has zero variance', async () => {
+    const store = await storeWith([
+      judge('A', 't1', 'quality', 0.5), // A constant -> variance 0
+      judge('A', 't2', 'quality', 0.5),
+      judge('B', 't1', 'quality', 0.2), // B varies -> variance > 0
+      judge('B', 't2', 'quality', 0.8),
+    ])
+    const report = await judgeAgreementView(store)
+    expect(report.pairs).toHaveLength(1)
+    expect(report.pairs[0]!.pearson).toBe(0)
+  })
+
+  it('sorts pairs by descending commonItems', async () => {
+    const store = await storeWith([
+      // A/B share 3 targets
+      judge('A', 't1', 'd', 0.1),
+      judge('A', 't2', 'd', 0.2),
+      judge('A', 't3', 'd', 0.3),
+      judge('B', 't1', 'd', 0.1),
+      judge('B', 't2', 'd', 0.2),
+      judge('B', 't3', 'd', 0.3),
+      // C shares 2 targets with A and B on a different dimension
+      judge('A', 'u1', 'e', 0.1),
+      judge('A', 'u2', 'e', 0.2),
+      judge('C', 'u1', 'e', 0.1),
+      judge('C', 'u2', 'e', 0.2),
+    ])
+    const report = await judgeAgreementView(store)
+    const items = report.pairs.map((p) => p.commonItems)
+    // descending order
+    expect(items).toEqual([...items].sort((a, b) => b - a))
+    expect(items[0]).toBe(3)
+    expect(report.dimensions).toEqual(['d', 'e'])
+  })
+
+  it('returns empty report when there are no judge spans', async () => {
+    const store = await storeWith([])
+    const report = await judgeAgreementView(store)
+    expect(report).toEqual({ pairs: [], dimensions: [], judgeIds: [] })
+  })
+})
diff --git a/src/pipelines/regression.test.ts b/src/pipelines/regression.test.ts
new file mode 100644
index 00000000..08bd3bc1
--- /dev/null
+++ b/src/pipelines/regression.test.ts
@@ -0,0 +1,153 @@
+import { describe, expect, it } from 'vitest'
+import type { LlmSpan, Run } from '../trace/schema'
+import { InMemoryTraceStore } from '../trace/store'
+import { regressionView } from './regression'
+
+function run(runId: string, patch: Partial<Run> = {}): Run {
+  return { runId, scenarioId: 's', startedAt: 1000, status: 'completed', ...patch }
+}
+
+function llm(runId: string, patch: Partial<LlmSpan>): LlmSpan {
+  return {
+    spanId: `${runId}-llm`,
+    runId,
+    kind: 'llm',
+    name: 'llm.call',
+    startedAt: 1000,
+    model: 'm',
+    messages: [],
+    ...patch,
+  }
+}
+
+function metricMean(report: Awaited<ReturnType<typeof regressionView>>, metric: string) {
+  const m = report.metrics.find((x) => x.metric === metric)
+  if (!m) throw new Error(`metric ${metric} not in report`)
+  return { baselineMean: m.baselineMean, candidateMean: m.candidateMean }
+}
+
+describe('regressionView defaultExtract metric mapping', () => {
+  it('extracts the correct scalar for each metric name (score/pass/durationMs)', async () => {
+    const store = new InMemoryTraceStore()
+    const tag = (v: string) => ({ tags: { slice: v } })
+    await store.appendRun(
+      run('b1', { ...tag('base'), outcome: { score: 0.4, pass: false }, endedAt: 1500 }),
+    )
+    await store.appendRun(
+      run('b2', { ...tag('base'), outcome: { score: 0.6, pass: false }, endedAt: 1700 }),
+    )
+    await store.appendRun(
+      run('c1', { ...tag('cand'), outcome: { score: 0.8, pass: true }, endedAt: 1900 }),
+    )
+    await store.appendRun(
+      run('c2', { ...tag('cand'), outcome: { score: 1.0, pass: true }, endedAt: 2100 }),
+    )
+
+    const report = await regressionView(
+      store,
+      [
+        { metric: 'score', higherIsBetter: true },
+        { metric: 'pass', higherIsBetter: true },
+        { metric: 'durationMs', higherIsBetter: false },
+      ],
+      {
+        baseline: { tag: { key: 'slice', value: 'base' } },
+        candidate: { tag: { key: 'slice', value: 'cand' } },
+      },
+    )
+
+    expect(metricMean(report, 'score')).toEqual({ baselineMean: 0.5, candidateMean: 0.9 })
+    // pass maps true->1, false->0
+    expect(metricMean(report, 'pass')).toEqual({ baselineMean: 0, candidateMean: 1 })
+    // durationMs = endedAt - startedAt (startedAt 1000)
+    expect(metricMean(report, 'durationMs')).toEqual({ baselineMean: 600, candidateMean: 1000 })
+  })
+
+  it('maps costUsd and inputTokens from aggregated llm spans', async () => {
+    const store = new InMemoryTraceStore()
+    await store.appendRun(run('b1', { tags: { slice: 'base' } }))
+    await store.appendRun(run('b2', { tags: { slice: 'base' } }))
+    await store.appendRun(run('c1', { tags: { slice: 'cand' } }))
+    await store.appendRun(run('c2', { tags: { slice: 'cand' } }))
+    await store.appendSpan(llm('b1', { costUsd: 0.1, inputTokens: 100 }))
+    await store.appendSpan(llm('b2', { costUsd: 0.3, inputTokens: 300 }))
+    await store.appendSpan(llm('c1', { costUsd: 0.5, inputTokens: 500 }))
+    await store.appendSpan(llm('c2', { costUsd: 0.7, inputTokens: 700 }))
+
+    const report = await regressionView(
+      store,
+      [
+        { metric: 'costUsd', higherIsBetter: false },
+        { metric: 'inputTokens', higherIsBetter: false },
+      ],
+      {
+        baseline: { tag: { key: 'slice', value: 'base' } },
+        candidate: { tag: { key: 'slice', value: 'cand' } },
+      },
+    )
+    expect(metricMean(report, 'costUsd')).toEqual({ baselineMean: 0.2, candidateMean: 0.6 })
+    expect(metricMean(report, 'inputTokens')).toEqual({ baselineMean: 200, candidateMean: 600 })
+  })
+
+  it('maps failureClass to 1 for success runs and 0 otherwise', async () => {
+    const store = new InMemoryTraceStore()
+    // success: completed with pass != false
+    await store.appendRun(run('b1', { tags: { slice: 'base' }, outcome: { pass: true } }))
+    await store.appendRun(run('b2', { tags: { slice: 'base' }, outcome: { pass: true } }))
+    // non-success: explicit failureClass
+    await store.appendRun(
+      run('c1', {
+        tags: { slice: 'cand' },
+        outcome: { pass: false, failureClass: 'reasoning_error' },
+      }),
+    )
+    await store.appendRun(
+      run('c2', {
+        tags: { slice: 'cand' },
+        outcome: { pass: false, failureClass: 'reasoning_error' },
+      }),
+    )
+    const report = await regressionView(store, [{ metric: 'failureClass', higherIsBetter: true }], {
+      baseline: { tag: { key: 'slice', value: 'base' } },
+      candidate: { tag: { key: 'slice', value: 'cand' } },
+    })
+    expect(metricMean(report, 'failureClass')).toEqual({ baselineMean: 1, candidateMean: 0 })
+  })
+
+  it('drops null samples (unknown metric / missing scalar) instead of counting them as 0', async () => {
+    const store = new InMemoryTraceStore()
+    // Two runs WITH a score, plus runs that produce null for `score`
+    // (no outcome.score). The null runs must be DROPPED, so the mean is
+    // computed over only the present scores — NOT diluted toward 0.
+    await store.appendRun(run('b1', { tags: { slice: 'base' }, outcome: { score: 0.8 } }))
+    await store.appendRun(run('b2', { tags: { slice: 'base' }, outcome: { score: 0.8 } }))
+    await store.appendRun(run('b3', { tags: { slice: 'base' } })) // null score -> dropped
+    await store.appendRun(run('c1', { tags: { slice: 'cand' }, outcome: { score: 0.9 } }))
+    await store.appendRun(run('c2', { tags: { slice: 'cand' }, outcome: { score: 0.9 } }))
+    await store.appendRun(run('c3', { tags: { slice: 'cand' } })) // null score -> dropped
+
+    const report = await regressionView(store, [{ metric: 'score', higherIsBetter: true }], {
+      baseline: { tag: { key: 'slice', value: 'base' } },
+      candidate: { tag: { key: 'slice', value: 'cand' } },
+    })
+    // If the null run were counted as 0, baselineMean would be (0.8+0.8+0)/3 = 0.533.
+    // Correct (dropped) mean is 0.8.
+    const { baselineMean, candidateMean } = metricMean(report, 'score')
+    expect(baselineMean).toBeCloseTo(0.8, 10)
+    expect(candidateMean).toBeCloseTo(0.9, 10)
+  })
+
+  it('throws for an unknown metric because all samples extract to null and get dropped', async () => {
+    const store = new InMemoryTraceStore()
+    await store.appendRun(run('b1', { tags: { slice: 'base' }, outcome: { score: 1 } }))
+    await store.appendRun(run('c1', { tags: { slice: 'cand' }, outcome: { score: 1 } }))
+    // 'banana' is not a known metric -> defaultExtract returns null for every
+    // run -> extractAll yields [] -> compareToBaseline needs >=2 samples.
+    await expect(
+      regressionView(store, [{ metric: 'banana', higherIsBetter: true }], {
+        baseline: { tag: { key: 'slice', value: 'base' } },
+        candidate: { tag: { key: 'slice', value: 'cand' } },
+      }),
+    ).rejects.toThrow(/banana/)
+  })
+})
diff --git a/src/pipelines/stuck-loop.test.ts b/src/pipelines/stuck-loop.test.ts
new file mode 100644
index 00000000..abdacecb
--- /dev/null
+++ b/src/pipelines/stuck-loop.test.ts
@@ -0,0 +1,73 @@
+import { describe, expect, it } from 'vitest'
+import type { Run, ToolSpan } from '../trace/schema'
+import { InMemoryTraceStore } from '../trace/store'
+import { stuckLoopView } from './stuck-loop'
+
+function tool(i: number, name: string, args: unknown): ToolSpan {
+  return {
+    spanId: `t${i}`,
+    runId: 'r',
+    kind: 'tool',
+    name: `tool.${name}`,
+    startedAt: 1000 + i * 1000,
+    toolName: name,
+    args,
+  }
+}
+
+async function storeWith(spans: ToolSpan[]): Promise<InMemoryTraceStore> {
+  const store = new InMemoryTraceStore()
+  await store.appendRun({ runId: 'r', scenarioId: 's' } as Run)
+  for (const s of spans) await store.appendSpan(s)
+  return store
+}
+
+describe('stuckLoopView', () => {
+  it('flags a loop at exactly minOccurrences, windowMs spans first→last', async () => {
+    const store = await storeWith([
+      tool(0, 'bash', { cmd: 'x' }),
+      tool(1, 'bash', { cmd: 'x' }),
+      tool(2, 'bash', { cmd: 'x' }),
+    ])
+    const r = await stuckLoopView(store, { minOccurrences: 3 })
+    expect(r.findings).toHaveLength(1)
+    expect(r.findings[0]!.occurrences).toBe(3)
+    expect(r.findings[0]!.windowMs).toBe(2000)
+    expect(r.affectedRunRatio).toBe(1)
+  })
+
+  it('preserves a tool name containing the delimiter (regression: split("|") mislabel)', async () => {
+    const store = await storeWith([
+      tool(0, 'shell|grep', { q: 'x' }),
+      tool(1, 'shell|grep', { q: 'x' }),
+      tool(2, 'shell|grep', { q: 'x' }),
+    ])
+    const r = await stuckLoopView(store, { minOccurrences: 3 })
+    expect(r.findings[0]!.toolName).toBe('shell|grep')
+  })
+
+  it('does not collapse two distinct tools whose names+hashes alias across the delimiter', async () => {
+    // "a|b" with hash "c" vs "a" with hash "b|c" used to share a key.
+    const store = await storeWith([
+      tool(0, 'a|b', { k: 'c' }),
+      tool(1, 'a|b', { k: 'c' }),
+      tool(2, 'a|b', { k: 'c' }),
+      tool(3, 'a', { k: 'b|c' }),
+      tool(4, 'a', { k: 'b|c' }),
+      tool(5, 'a', { k: 'b|c' }),
+    ])
+    const r = await stuckLoopView(store, { minOccurrences: 3 })
+    const names = r.findings.map((f) => f.toolName).sort()
+    expect(names).toEqual(['a', 'a|b'])
+  })
+
+  it('does not flag distinct args as a loop', async () => {
+    const store = await storeWith([
+      tool(0, 'bash', { cmd: 'a' }),
+      tool(1, 'bash', { cmd: 'b' }),
+      tool(2, 'bash', { cmd: 'c' }),
+    ])
+    const r = await stuckLoopView(store, { minOccurrences: 3 })
+    expect(r.findings).toHaveLength(0)
+  })
+})
diff --git a/src/pipelines/stuck-loop.ts b/src/pipelines/stuck-loop.ts
index c5180832..6d5e7c93 100644
--- a/src/pipelines/stuck-loop.ts
+++ b/src/pipelines/stuck-loop.ts
@@ -46,22 +46,26 @@ export async function stuckLoopView(
   const findings: StuckLoopFinding[] = []
   for (const { runId } of runs) {
     const tools = await toolSpans(store, runId)
-    const byKey = new Map<string, { spans: typeof tools; argHash: string }>()
+    const byKey = new Map<string, { spans: typeof tools; argHash: string; toolName: string }>()
     for (const t of tools) {
       const h = argHash(t.args)
-      const key = `${t.toolName}|${h}`
-      const bucket = byKey.get(key) ?? { spans: [], argHash: h }
+      // NUL delimiter never appears in a tool name or a JSON argHash, so the
+      // key cannot collide; toolName is carried in the bucket rather than
+      // re-derived by splitting the key (which mislabeled any tool whose name
+      // contained the delimiter, e.g. "shell|grep").
+      const key = `${t.toolName}\u0000${h}`
+      const bucket = byKey.get(key) ?? { spans: [], argHash: h, toolName: t.toolName }
       bucket.spans.push(t)
       byKey.set(key, bucket)
     }
-    for (const [key, { spans, argHash: h }] of byKey) {
+    for (const { spans, argHash: h, toolName } of byKey.values()) {
       if (spans.length < minOccurrences) continue
       const sorted = [...spans].sort((a, b) => a.startedAt - b.startedAt)
       const first = sorted[0]!.startedAt
       const last = sorted[sorted.length - 1]!.startedAt
       findings.push({
         runId,
-        toolName: key.split('|')[0]!,
+        toolName,
         argHash: h,
         occurrences: sorted.length,
         spanIds: sorted.map((s) => s.spanId),
diff --git a/src/pipelines/tool-waste.test.ts b/src/pipelines/tool-waste.test.ts
new file mode 100644
index 00000000..672bab9a
--- /dev/null
+++ b/src/pipelines/tool-waste.test.ts
@@ -0,0 +1,150 @@
+import { describe, expect, it } from 'vitest'
+import type { LlmSpan, Message, Run, Span, ToolSpan } from '../trace/schema'
+import type { SpanFilter, TraceStore } from '../trace/store'
+import { toolWasteView } from './tool-waste'
+
+// Minimal in-file store implementing only the TraceStore surface toolWasteView
+// touches (listRuns + spans). Avoids coupling this slice to the full store.
+class StubStore implements TraceStore {
+  constructor(
+    private readonly runs: Run[],
+    private readonly allSpans: Span[],
+  ) {}
+  async listRuns(): Promise<Run[]> {
+    return this.runs
+  }
+  async spans(filter: SpanFilter = {}): Promise<Span[]> {
+    return this.allSpans.filter((s) => {
+      if (filter.runId && s.runId !== filter.runId) return false
+      if (filter.kind && s.kind !== filter.kind) return false
+      if (filter.toolName && (s.kind !== 'tool' || s.toolName !== filter.toolName)) return false
+      return true
+    })
+  }
+  async appendRun(): Promise<void> {}
+  async updateRun(): Promise<void> {}
+  async appendSpan(): Promise<void> {}
+  async updateSpan(): Promise<void> {}
+  async appendEvent(): Promise<void> {}
+  async appendArtifact(): Promise<void> {}
+  async appendBudgetEntry(): Promise<void> {}
+  async getRun(): Promise<Run | undefined> {
+    return undefined
+  }
+  async events(): Promise<never[]> {
+    return []
+  }
+  async budget(): Promise<never[]> {
+    return []
+  }
+  async artifacts(): Promise<never[]> {
+    return []
+  }
+}
+
+function tool(i: number, opts: Partial<ToolSpan> & { result?: unknown }): ToolSpan {
+  return {
+    spanId: `t${i}`,
+    runId: 'r',
+    kind: 'tool',
+    name: `tool.${i}`,
+    startedAt: 1000 + i * 1000,
+    toolName: `tool${i}`,
+    args: {},
+    ...opts,
+  }
+}
+
+function llm(i: number, contents: string[]): LlmSpan {
+  const messages: Message[] = contents.map((c) => ({ role: 'user', content: c }))
+  return {
+    spanId: `l${i}`,
+    runId: 'r',
+    kind: 'llm',
+    name: `llm.${i}`,
+    startedAt: 1500 + i * 1000,
+    model: 'm',
+    messages,
+  }
+}
+
+function storeWith(tools: ToolSpan[], llms: LlmSpan[]): StubStore {
+  return new StubStore([{ runId: 'r', scenarioId: 's' } as Run], [...tools, ...llms])
+}
+
+describe('toolWasteView', () => {
+  it('counts an error tool call as wasted', async () => {
+    const store = storeWith([tool(0, { status: 'error', result: 'boom' })], [llm(0, ['hi'])])
+    const r = await toolWasteView(store, { runId: 'r' })
+    expect(r.byRun[0]!.wastedCalls).toBe(1)
+    expect(r.byRun[0]!.wasteRate).toBe(1)
+  })
+
+  it('does not count a tool whose result appears in a later LLM message', async () => {
+    const store = storeWith(
+      [tool(0, { result: 'IMPORTANT_PAYLOAD_42' })],
+      [llm(0, ['here is the data: IMPORTANT_PAYLOAD_42 thanks'])],
+    )
+    const r = await toolWasteView(store, { runId: 'r' })
+    expect(r.byRun[0]!.wastedCalls).toBe(0)
+    expect(r.byRun[0]!.wasteRate).toBe(0)
+  })
+
+  it('counts a tool whose non-empty result never appears downstream as wasted', async () => {
+    const store = storeWith(
+      [tool(0, { result: 'UNUSED_PAYLOAD' })],
+      [llm(0, ['something unrelated'])],
+    )
+    const r = await toolWasteView(store, { runId: 'r' })
+    expect(r.byRun[0]!.wastedCalls).toBe(1)
+  })
+
+  // Regression: an empty/null tool result stringifies to '' — there is no
+  // payload to propagate, so it must NOT be counted as waste. The old code
+  // (`resultStr &&` → falsy → used=false) wrongly counted it.
+  it('does not count an empty-result tool call as wasted', async () => {
+    const store = storeWith(
+      [tool(0, { result: '' }), tool(1, { result: null }), tool(2, { result: undefined })],
+      [llm(0, ['no payloads here'])],
+    )
+    const r = await toolWasteView(store, { runId: 'r' })
+    expect(r.byRun[0]!.wastedCalls).toBe(0)
+    expect(r.byRun[0]!.wasteRate).toBe(0)
+  })
+
+  it('only matches LLM spans that started strictly after the tool', async () => {
+    // An LLM span that ran BEFORE the tool mentions the payload, but a later
+    // span must be the one that propagates it. Earlier mention != usage.
+    const earlier = llm(-2, ['LATE_PAYLOAD already mentioned early']) // startedAt 500
+    earlier.startedAt = 500
+    const store = storeWith([tool(0, { result: 'LATE_PAYLOAD' })], [earlier])
+    const r = await toolWasteView(store, { runId: 'r' })
+    expect(r.byRun[0]!.wastedCalls).toBe(1)
+  })
+
+  it('honors a usageOracle override and passes only later LLM spans', async () => {
+    const before = llm(-1, ['before'])
+    before.startedAt = 500
+    const after = llm(0, ['after'])
+    const store = storeWith([tool(0, { result: 'x' })], [before, after])
+    const seen: number[] = []
+    const r = await toolWasteView(store, {
+      runId: 'r',
+      usageOracle: (_t, later) => {
+        seen.push(later.llm.length)
+        return true // oracle says always used
+      },
+    })
+    expect(r.byRun[0]!.wastedCalls).toBe(0)
+    // Only the span started after the tool (startedAt 1000) is later.
+    expect(seen).toEqual([1])
+  })
+
+  it('reports wasteRate 0 with no NaN for an empty run', async () => {
+    const store = new StubStore([{ runId: 'r', scenarioId: 's' } as Run], [])
+    const r = await toolWasteView(store, { runId: 'r' })
+    expect(r.byRun[0]!).toEqual({ runId: 'r', wastedCalls: 0, totalCalls: 0, wasteRate: 0 })
+    expect(Number.isNaN(r.overallWasteRate)).toBe(false)
+    expect(r.overallWasteRate).toBe(0)
+  })
+})
diff --git a/src/pipelines/tool-waste.ts b/src/pipelines/tool-waste.ts
index 0788766c..9e622bf2 100644
--- a/src/pipelines/tool-waste.ts
+++ b/src/pipelines/tool-waste.ts
@@ -12,7 +12,7 @@
 
 import { computeToolUseMetrics } from '../tool-use-metrics'
 import { llmSpans, toolSpans } from '../trace/query'
-import type { ToolSpan } from '../trace/schema'
+import type { LlmSpan, ToolSpan } from '../trace/schema'
 import type { TraceStore } from '../trace/store'
 
 export interface ToolWasteFinding {
@@ -48,27 +48,34 @@ export async function toolWasteView(
       continue
     }
     const llms = await llmSpans(store, runId)
+    // Sort LLM spans once by start time, then build a suffix index of the
+    // concatenated message text. `suffixText[i]` is the haystack of every
+    // string message content in spans[i..]. Per tool we binary-search the
+    // first span started strictly after the tool, then test that one suffix
+    // — turning the per-tool O(llms × messages × content) scan into a single
+    // O(log llms) lookup over precomputed text.
+    const sortedLlm = [...llms].sort((a, b) => a.startedAt - b.startedAt)
+    const startTimes = sortedLlm.map((l) => l.startedAt)
+    const suffixText = buildSuffixText(sortedLlm)
     let wasted = 0
     for (const t of tools) {
       if (t.status === 'error') {
         wasted++
         continue
       }
-      const laterLlm = llms.filter((l) => l.startedAt > t.startedAt)
+      // First LLM span started strictly after this tool (upper-bound search).
+      const cutoff = upperBound(startTimes, t.startedAt)
       if (options.usageOracle) {
-        if (!options.usageOracle(t, { llm: laterLlm })) wasted++
+        if (!options.usageOracle(t, { llm: sortedLlm.slice(cutoff) })) wasted++
       } else {
         // Default heuristic: a tool whose result is NOT mentioned in any
-        // later LLM input message is likely wasted.
+        // later LLM input message is likely wasted. An empty/null result has
+        // no payload to propagate downstream — there is nothing to find in a
+        // later message, so it is not evidence of waste; skip it.
         const resultStr = stringify(t.result)
-        const used = laterLlm.some((l) =>
-          l.messages.some(
-            (m) =>
-              typeof m.content === 'string' &&
-              resultStr &&
-              m.content.includes(resultStr.slice(0, 120)),
-          ),
-        )
+        if (resultStr === '') continue
+        const haystack = suffixText[cutoff] ?? ''
+        const used = haystack.includes(resultStr.slice(0, 120))
         if (!used) wasted++
       }
     }
@@ -80,6 +87,35 @@ export async function toolWasteView(
   return { byRun, overallWasteRate: totalCalls > 0 ? totalWasted / totalCalls : 0 }
 }
 
+/**
+ * Build per-position suffix haystacks: result[i] is the concatenation of every
+ * string message content in spans[i..end]. Built back-to-front so each entry
+ * reuses the next one — O(total message text) rather than O(spans²).
+ */
+function buildSuffixText(spans: LlmSpan[]): string[] {
+  const result = new Array<string>(spans.length + 1)
+  result[spans.length] = ''
+  for (let i = spans.length - 1; i >= 0; i--) {
+    const own = spans[i]!.messages.map((m) =>
+      typeof m.content === 'string' ? m.content : '',
+    ).join('\n')
+    result[i] = `${own}\n${result[i + 1]}`
+  }
+  return result
+}
+
+/** Index of the first element strictly greater than `target` in a sorted array. */
+function upperBound(sorted: number[], target: number): number {
+  let lo = 0
+  let hi = sorted.length
+  while (lo < hi) {
+    const mid = (lo + hi) >>> 1
+    if (sorted[mid]! <= target) lo = mid + 1
+    else hi = mid
+  }
+  return lo
+}
+
 function stringify(v: unknown): string {
   if (v === null || v === undefined) return ''
   if (typeof v === 'string') return v
diff --git a/src/prm/inference.test.ts b/src/prm/inference.test.ts
new file mode 100644
index 00000000..ac1e045e
--- /dev/null
+++ b/src/prm/inference.test.ts
@@ -0,0 +1,113 @@
+import { describe, expect, it } from 'vitest'
+import type { TraceStore } from '../trace/store'
+import { prmBestOfN, prmEnsembleBestOfN } from './inference'
+import type { PrmGradedTrace, PrmGrader } from './rubric'
+
+const fakeStore = {} as TraceStore
+
+function tracedTrace(runId: string, score: number): PrmGradedTrace {
+  return { runId, steps: [], aggregateScore: score, gradedCount: 0, ungradedCount: 0 }
+}
+
+interface ConcurrencyMeter {
+  peak: number
+  active: number
+}
+
+/**
+ * Grader stub that records peak concurrent `grade` calls against a shared
+ * meter and assigns a score per runId. A shared meter lets multiple graders
+ * report the true simultaneous in-flight count across the whole pool. Each
+ * call yields a few times so concurrency is observable.
+ */
+function makeTrackingGrader(
+  scores: Record<string, number>,
+  meter: ConcurrencyMeter = { peak: 0, active: 0 },
+): PrmGrader & { meter: ConcurrencyMeter } {
+  const state = { meter } as PrmGrader & { meter: ConcurrencyMeter }
+  state.grade = async (_store: TraceStore, runId: string): Promise<PrmGradedTrace> => {
+    meter.active++
+    if (meter.active > meter.peak) meter.peak = meter.active
+    for (let i = 0; i < 5; i++) await Promise.resolve()
+    meter.active--
+    return tracedTrace(runId, scores[runId] ?? 0)
+  }
+  return state
+}
+
+/** Grader whose `grade` always rejects — models an unrecoverable LLM failure. */
+function makeFailingGrader(message: string): PrmGrader {
+  const state = {} as PrmGrader
+  state.grade = async (): Promise<PrmGradedTrace> => {
+    throw new Error(message)
+  }
+  return state
+}
+
+describe('prmBestOfN bounded concurrency', () => {
+  it('caps concurrent grade calls at the configured concurrency', async () => {
+    const grader = makeTrackingGrader({})
+    const runIds = Array.from({ length: 10 }, (_, i) => `run-${i}`)
+
+    const result = await prmBestOfN(fakeStore, grader, runIds, { concurrency: 2 })
+
+    expect(result.ranked).toHaveLength(10)
+    // OLD behavior (Promise.all over all runIds) peaks at 10.
+    expect(grader.meter.peak).toBeLessThanOrEqual(2)
+    expect(grader.meter.peak).toBeGreaterThan(0)
+  })
+
+  it('still picks the highest-scoring candidate as winner', async () => {
+    const grader = makeTrackingGrader({ 'run-0': 0.1, 'run-1': 0.9, 'run-2': 0.5 })
+    const result = await prmBestOfN(fakeStore, grader, ['run-0', 'run-1', 'run-2'], {
+      concurrency: 2,
+    })
+    expect(result.winner.runId).toBe('run-1')
+  })
+})
+
+describe('prmEnsembleBestOfN bounded concurrency + isolation', () => {
+  it('caps concurrent grade calls across the flattened graderxrunId product', async () => {
+    const scores = { 'run-0': 0.2, 'run-1': 0.8, 'run-2': 0.5 }
+    const meter: ConcurrencyMeter = { peak: 0, active: 0 }
+    const g1 = makeTrackingGrader(scores, meter)
+    const g2 = makeTrackingGrader(scores, meter)
+    const g3 = makeTrackingGrader(scores, meter)
+    const runIds = ['run-0', 'run-1', 'run-2']
+
+    const result = await prmEnsembleBestOfN(fakeStore, [g1, g2, g3], runIds, { concurrency: 2 })
+
+    expect(result.ranked).toHaveLength(3)
+    // OLD nested Promise.all fired 3 graders x 3 runIds = 9 calls at once.
+    // The shared meter measures the true simultaneous in-flight count across
+    // the whole flattened pool; it must never exceed the configured bound.
+    expect(meter.peak).toBeLessThanOrEqual(2)
+    expect(meter.peak).toBeGreaterThan(0)
+  })
+
+  it('one grader failing on a candidate does not void the ensemble (allSettled)', async () => {
+    const scores = { 'run-0': 0.2, 'run-1': 0.9, 'run-2': 0.4 }
+    const good = makeTrackingGrader(scores)
+    // Second grader rejects on every candidate; the ensemble must survive.
+    const flaky = makeFailingGrader('rate limited')
+    const runIds = ['run-0', 'run-1', 'run-2']
+
+    const result = await prmEnsembleBestOfN(fakeStore, [good, flaky], runIds, { concurrency: 4 })
+
+    // OLD behavior: nested Promise.all rejects the whole call on the first
+    // grader failure. New behavior: the surviving grader still produces a vote.
+    expect(result.ranked).toHaveLength(3)
+    expect(result.winner.runId).toBe('run-1')
+  })
+
+  it('throws only when every grader fails on every candidate', async () => {
+    const runIds = ['run-0', 'run-1']
+    await expect(
+      prmEnsembleBestOfN(
+        fakeStore,
+        [makeFailingGrader('boom-a'), makeFailingGrader('boom-b')],
+        runIds,
+      ),
+    ).rejects.toThrow(/every grader failed/)
+  })
+})
diff --git a/src/prm/inference.ts b/src/prm/inference.ts
index 31ccd54f..eeb85601 100644
--- a/src/prm/inference.ts
+++ b/src/prm/inference.ts
@@ -17,13 +17,33 @@ export interface BestOfNResult {
   stdDev: number
 }
 
+/** Default max concurrent grader calls. Bounds LLM fan-out so a wide ensemble
+ *  doesn't trigger a provider rate-limit storm. */
+const DEFAULT_PRM_CONCURRENCY = 4
+
+export interface PrmBestOfNOptions {
+  /** Max concurrent `grader.grade` calls. Default 4. */
+  concurrency?: number
+}
+
 export async function prmBestOfN(
   store: TraceStore,
   grader: PrmGrader,
   runIds: string[],
+  options: PrmBestOfNOptions = {},
 ): Promise<BestOfNResult> {
   if (runIds.length === 0) throw new Error('prmBestOfN: at least 1 candidate required')
-  const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)))
+  const concurrency = Math.max(1, options.concurrency ?? DEFAULT_PRM_CONCURRENCY)
+  const graded: PrmGradedTrace[] = new Array(runIds.length)
+  let cursor = 0
+  async function worker(): Promise<void> {
+    while (true) {
+      const i = cursor++
+      if (i >= runIds.length) return
+      graded[i] = await grader.grade(store, runIds[i]!)
+    }
+  }
+  await Promise.all(Array.from({ length: Math.min(concurrency, runIds.length) }, () => worker()))
   const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore)
   const mean = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length
   const variance = graded.reduce((a, g) => a + (g.aggregateScore - mean) ** 2, 0) / graded.length
@@ -40,25 +60,73 @@ export async function prmEnsembleBestOfN(
   store: TraceStore,
   graders: PrmGrader[],
   runIds: string[],
+  options: PrmBestOfNOptions = {},
 ): Promise<BestOfNResult> {
   if (graders.length === 0) throw new Error('prmEnsembleBestOfN: at least 1 grader')
-  const perGrader = await Promise.all(
-    graders.map(async (g) => {
-      const graded = await Promise.all(runIds.map((id) => g.grade(store, id)))
-      return graded.sort((a, b) => b.aggregateScore - a.aggregateScore)
-    }),
-  )
-  // Borda: rank-sum across graders.
+  if (runIds.length === 0) throw new Error('prmEnsembleBestOfN: at least 1 candidate required')
+  const concurrency = Math.max(1, options.concurrency ?? DEFAULT_PRM_CONCURRENCY)
+
+  // Flatten the (grader, runId) product and run it through a single bounded
+  // pool. Nested unbounded fan-out (graders × runIds) would launch every LLM
+  // call at once. allSettled isolates failures: one grader (or one runId)
+  // failing doesn't void the whole ensemble.
+  type Job = { graderIdx: number; runId: string }
+  const jobs: Job[] = []
+  for (let gi = 0; gi < graders.length; gi++) {
+    for (const runId of runIds) jobs.push({ graderIdx: gi, runId })
+  }
+  const settled: PromiseSettledResult<PrmGradedTrace>[] = new Array(jobs.length)
+  let cursor = 0
+  async function worker(): Promise<void> {
+    while (true) {
+      const i = cursor++
+      if (i >= jobs.length) return
+      const job = jobs[i]!
+      settled[i] = await graders[job.graderIdx]!.grade(store, job.runId)
+        .then((value): PromiseSettledResult<PrmGradedTrace> => ({ status: 'fulfilled', value }))
+        .catch((reason): PromiseSettledResult<PrmGradedTrace> => ({ status: 'rejected', reason }))
+    }
+  }
+  await Promise.all(Array.from({ length: Math.min(concurrency, jobs.length) }, () => worker()))
+
+  // Regroup fulfilled results into per-grader rankings. A grader contributes
+  // only the candidates it successfully graded; a grader that graded nothing
+  // is dropped from the vote rather than skewing it with phantom zeros.
+  const perGrader: PrmGradedTrace[][] = graders.map(() => [])
+  const failures: { graderIdx: number; runId: string; error: string }[] = []
+  for (let i = 0; i < jobs.length; i++) {
+    const job = jobs[i]!
+    const result = settled[i]!
+    if (result.status === 'fulfilled') perGrader[job.graderIdx]!.push(result.value)
+    else {
+      const error = result.reason instanceof Error ? result.reason.message : String(result.reason)
+      failures.push({ graderIdx: job.graderIdx, runId: job.runId, error })
+    }
+  }
+  const survivingGraders = perGrader.filter((ranking) => ranking.length > 0)
+  if (survivingGraders.length === 0) {
+    throw new Error(
+      `prmEnsembleBestOfN: every grader failed on every candidate (${failures.length} call(s)). First error: ${failures[0]?.error ?? 'unknown'}`,
+    )
+  }
+  for (const ranking of survivingGraders)
+    ranking.sort((a, b) => b.aggregateScore - a.aggregateScore)
+
+  // Borda: rank-sum across surviving graders.
   const bordaScores = new Map<string, number>()
-  for (const ranking of perGrader) {
+  for (const ranking of survivingGraders) {
     ranking.forEach((g, rank) => {
       bordaScores.set(g.runId, (bordaScores.get(g.runId) ?? 0) + (ranking.length - rank))
     })
   }
-  // Return a synthesized ranking using the first grader's graded traces
-  // ordered by Borda score. aggregateScore field kept for UX.
-  const canonical = perGrader[0]!
-  const byRun = new Map(canonical.map((g) => [g.runId, g]))
+  // Synthesize a ranking from the union of every successfully-graded trace,
+  // ordered by Borda score. aggregateScore field kept for UX. Using the union
+  // (not just the first grader) keeps a candidate that one grader dropped but
+  // another graded.
+  const byRun = new Map<string, PrmGradedTrace>()
+  for (const ranking of survivingGraders) {
+    for (const g of ranking) if (!byRun.has(g.runId)) byRun.set(g.runId, g)
+  }
   const ranked = [...byRun.values()].sort(
     (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0),
   )
diff --git a/src/sandbox-harness.test.ts b/src/sandbox-harness.test.ts
new file mode 100644
index 00000000..807372bf
--- /dev/null
+++ b/src/sandbox-harness.test.ts
@@ -0,0 +1,127 @@
+import { describe, expect, it } from 'vitest'
+import {
+  type HarnessConfig,
+  type SandboxDriver,
+  SandboxHarness,
+  type SandboxResult,
+  SubprocessSandboxDriver,
+  vitestTestParser,
+} from './sandbox-harness'
+import type { TraceEmitter } from './trace/emitter'
+
+// Minimal trace emitter stub — the harness only needs `sandbox()` and the
+// returned handle's `end`/`fail`. None of the regression behaviour depends
+// on the trace payload, so we record nothing.
+function stubEmitter(): TraceEmitter {
+  const handle = {
+    async end() {},
+    async fail() {},
+  }
+  return {
+    async sandbox() {
+      return handle as never
+    },
+  } as unknown as TraceEmitter
+}
+
+describe('SubprocessSandboxDriver timeout handling', () => {
+  it('flags killedByTimeout and forces non-zero exit when a phase overruns', async () => {
+    const driver = new SubprocessSandboxDriver()
+    // A process that outlives the timeout. The driver must mark the
+    // result as timed out — NOT resolve it as a clean close.
+    const res = await driver.exec('test', 'sleep 5', {
+      timeoutMs: 150,
+    } as HarnessConfig)
+    expect(res.killedByTimeout).toBe(true)
+    expect(res.exitCode).not.toBe(0)
+  })
+
+  it('does not flag killedByTimeout for a fast clean exit', async () => {
+    const driver = new SubprocessSandboxDriver()
+    const res = await driver.exec('test', 'true', { timeoutMs: 5000 } as HarnessConfig)
+    expect(res.killedByTimeout).toBeFalsy()
+    expect(res.exitCode).toBe(0)
+  })
+
+  it('caps captured output for a runaway process (outputTruncated)', async () => {
+    const driver = new SubprocessSandboxDriver()
+    // Emit far more than the cap; the buffer must stay bounded.
+    const res = await driver.exec(
+      'run',
+      `node -e "const b='x'.repeat(1024); for(let i=0;i<5000;i++) process.stdout.write(b)"`,
+      { maxOutputBytes: 4096 } as HarnessConfig,
+    )
+    expect(res.outputTruncated).toBe(true)
+    expect(res.stdout.length).toBeLessThanOrEqual(4096)
+  })
+})
+
+describe('SandboxHarness treats a timed-out test phase as a hard fail', () => {
+  // A driver whose test phase was killed by timeout but whose SIGKILLed
+  // child reported exit 0 AND whose stdout would parse as "all passed."
+  // This is the exact masquerade: before the fix the harness scored it a
+  // pass.
+  const masqueradeDriver: SandboxDriver = {
+    id: 'masquerade',
+    async exec(phase): Promise<SandboxResult> {
+      if (phase === 'test') {
+        return {
+          phase,
+          exitCode: 0,
+          stdout: 'Tests  3 passed',
+          stderr: '',
+          wallMs: 1,
+          killedByTimeout: true,
+        }
+      }
+      return { phase, exitCode: 0, stdout: '', stderr: '', wallMs: 1 }
+    },
+  }
+
+  it('a timed-out test phase fails the harness regardless of exit code', async () => {
+    const harness = new SandboxHarness(masqueradeDriver)
+    const config: HarnessConfig = { testCommand: 'pnpm test', testParser: vitestTestParser }
+    const result = await harness.run(config, stubEmitter())
+    expect(result.passed).toBe(false)
+    expect(result.score).toBe(0)
+  })
+
+  it('a clean test phase with a real parse still passes (no false negatives)', async () => {
+    const cleanDriver: SandboxDriver = {
+      id: 'clean',
+      async exec(phase): Promise<SandboxResult> {
+        return {
+          phase,
+          exitCode: 0,
+          stdout: 'Tests  4 passed',
+          stderr: '',
+          wallMs: 1,
+        }
+      },
+    }
+    const harness = new SandboxHarness(cleanDriver)
+    const config: HarnessConfig = { testCommand: 'pnpm test', testParser: vitestTestParser }
+    const result = await harness.run(config, stubEmitter())
+    expect(result.passed).toBe(true)
+    expect(result.score).toBe(1)
+  })
+
+  it('a timed-out setup phase fails fast without running test', async () => {
+    let testRan = false
+    const setupTimeoutDriver: SandboxDriver = {
+      id: 'setup-timeout',
+      async exec(phase): Promise<SandboxResult> {
+        if (phase === 'test') testRan = true
+        if (phase === 'setup') {
+          return { phase, exitCode: 0, stdout: '', stderr: '', wallMs: 1, killedByTimeout: true }
+        }
+        return { phase, exitCode: 0, stdout: '', stderr: '', wallMs: 1 }
+      },
+    }
+    const harness = new SandboxHarness(setupTimeoutDriver)
+    const config: HarnessConfig = { setupCommand: 'install', testCommand: 'test' }
+    const result = await harness.run(config, stubEmitter())
+    expect(result.passed).toBe(false)
+    expect(testRan).toBe(false)
+  })
+})
diff --git a/src/sandbox-harness.ts b/src/sandbox-harness.ts
index 86471230..b3226ede 100644
--- a/src/sandbox-harness.ts
+++ b/src/sandbox-harness.ts
@@ -27,6 +27,12 @@ export interface HarnessConfig {
   cwd?: string
   /** Max wall-clock per phase in ms. Default 10 minutes. */
   timeoutMs?: number
+  /**
+   * Cap on captured stdout+stderr bytes per phase. A runaway process can
+   * otherwise grow the in-memory buffer without bound. Once hit, further
+   * output is dropped and `outputTruncated` is set. Default 16 MiB.
+   */
+  maxOutputBytes?: number
   /** Image for the docker driver. */
   image?: string
   /** Extra env vars (validated; shell-escaped). */
@@ -52,6 +58,19 @@ export interface SandboxResult {
   wallMs: number
   testsTotal?: number
   testsPassed?: number
+  /**
+   * True when the process was killed because it exceeded `timeoutMs`. A
+   * SIGKILLed child can still close with exit code 0; callers MUST treat
+   * a timed-out phase as a hard failure regardless of `exitCode`, never
+   * as a pass. `undefined`/`false` means the process completed on its own.
+   */
+  killedByTimeout?: boolean
+  /**
+   * True when captured stdout/stderr hit `maxOutputBytes` and further
+   * output was dropped. The result is still returned (the process was
+   * not killed for this), but downstream parsers see truncated text.
+   */
+  outputTruncated?: boolean
 }
 
 export interface SandboxDriver {
@@ -162,51 +181,126 @@ export class SubprocessSandboxDriver implements SandboxDriver {
     // cwd when only the constructor arg is supplied.
     const effectiveCwd = config.cwd ?? this.defaultCwd
     const effectiveEnv = { ...process.env, ...(this.defaultEnv ?? {}), ...(config.env ?? {}) }
+    const maxOutputBytes = config.maxOutputBytes ?? 16 * 1024 * 1024
     return await new Promise<SandboxResult>((resolve) => {
       const child = spawn(command, {
         shell: true,
         cwd: effectiveCwd,
         env: effectiveEnv,
+        // Own process group so a timeout can SIGKILL the whole tree, not
+        // just the shell — otherwise a runaway grandchild keeps the pipes
+        // open and `close` never fires (the timeout silently does nothing).
+        detached: process.platform !== 'win32',
       })
       let stdout = ''
       let stderr = ''
-      child.stdout?.on('data', (d) => {
-        stdout += String(d)
-      })
-      child.stderr?.on('data', (d) => {
-        stderr += String(d)
-      })
-      const timeout = setTimeout(
-        () => {
-          try {
-            child.kill('SIGKILL')
-          } catch (err) {
-            console.warn('[sandbox-harness] SIGKILL on timeout failed:', err)
-          }
-        },
-        config.timeoutMs ?? 10 * 60_000,
-      )
-      child.on('close', (code) => {
+      let outputBytes = 0
+      let outputTruncated = false
+      let killedByTimeout = false
+      let settled = false
+
+      const onStdout = (d: unknown) => {
+        const chunk = capture(String(d))
+        if (chunk) stdout += chunk
+      }
+      const onStderr = (d: unknown) => {
+        const chunk = capture(String(d))
+        if (chunk) stderr += chunk
+      }
+      // Bound the in-memory buffer so a runaway process can't OOM the
+      // harness. Once the cap is hit we keep draining the streams (to let
+      // the child reach `close`) but discard the bytes.
+      function capture(s: string): string {
+        if (outputBytes >= maxOutputBytes) {
+          outputTruncated = true
+          return ''
+        }
+        const room = maxOutputBytes - outputBytes
+        if (s.length > room) {
+          outputBytes = maxOutputBytes
+          outputTruncated = true
+          return s.slice(0, room)
+        }
+        outputBytes += s.length
+        return s
+      }
+
+      child.stdout?.on('data', onStdout)
+      child.stderr?.on('data', onStderr)
+
+      const cleanup = () => {
         clearTimeout(timeout)
+        if (forceResolve) clearTimeout(forceResolve)
+        child.stdout?.off('data', onStdout)
+        child.stderr?.off('data', onStderr)
+        child.removeAllListeners('close')
+        child.removeAllListeners('error')
+      }
+
+      // Resolve exactly once. `code`/`signal` come from `close`; on the
+      // timeout path we force a non-zero exit so a SIGKILLed child that
+      // reports exit 0 can never read as a clean pass downstream.
+      const finish = (outcome: { code: number | null; runnerError?: string }) => {
+        if (settled) return
+        settled = true
+        cleanup()
         const wallMs = Date.now() - start
+        const exitCode = killedByTimeout
+          ? outcome.code && outcome.code !== 0
+            ? outcome.code
+            : 124
+          : (outcome.code ?? 1)
         const parsed =
-          phase === 'test' && config.testParser
-            ? config.testParser.parse(stdout, stderr, code ?? 1)
+          !killedByTimeout && phase === 'test' && config.testParser
+            ? config.testParser.parse(stdout, stderr, exitCode)
             : undefined
         resolve({
           phase,
-          exitCode: code ?? 1,
+          exitCode,
           stdout,
-          stderr,
+          stderr: outcome.runnerError ? stderr + outcome.runnerError : stderr,
           wallMs,
           testsTotal: parsed?.testsTotal,
           testsPassed: parsed?.testsPassed,
+          killedByTimeout: killedByTimeout || undefined,
+          outputTruncated: outputTruncated || undefined,
         })
+      }
+
+      // Kill the whole process group on win32-less platforms (we spawned
+      // detached). On a clean close this is a no-op.
+      const killTree = () => {
+        try {
+          if (process.platform !== 'win32' && typeof child.pid === 'number') {
+            process.kill(-child.pid, 'SIGKILL')
+          } else {
+            child.kill('SIGKILL')
+          }
+        } catch (err) {
+          console.warn('[sandbox-harness] SIGKILL on timeout failed:', err)
+        }
+      }
+
+      let forceResolve: ReturnType<typeof setTimeout> | undefined
+      const timeout = setTimeout(
+        () => {
+          killedByTimeout = true
+          killTree()
+          // Give `close` a brief window to fire (flushing any final output)
+          // after the group dies; if an orphaned grandchild keeps the pipes
+          // open, force-resolve so the harness can't hang on a runaway.
+          forceResolve = setTimeout(() => finish({ code: null }), 2000)
+        },
+        config.timeoutMs ?? 10 * 60_000,
+      )
+
+      child.on('close', (code) => {
+        if (forceResolve) clearTimeout(forceResolve)
+        finish({ code })
       })
       child.on('error', (err) => {
-        clearTimeout(timeout)
-        const wallMs = Date.now() - start
-        resolve({ phase, exitCode: 127, stdout, stderr: stderr + String(err), wallMs })
+        if (forceResolve) clearTimeout(forceResolve)
+        finish({ code: 127, runnerError: String(err) })
       })
     })
   }
@@ -266,8 +360,11 @@ export class SandboxHarness {
       if (config.setupCommand) {
         result.setup = await this.driver.exec('setup', config.setupCommand, config)
         result.totalWallMs += result.setup.wallMs
-        if (result.setup.exitCode !== 0) {
-          await handle.fail(`setup failed (exit ${result.setup.exitCode})`, {
+        if (result.setup.killedByTimeout || result.setup.exitCode !== 0) {
+          const reason = result.setup.killedByTimeout
+            ? `setup timed out after ${result.setup.wallMs}ms`
+            : `setup failed (exit ${result.setup.exitCode})`
+          await handle.fail(reason, {
             exitCode: result.setup.exitCode,
             wallMs: result.totalWallMs,
           } as Partial<SandboxSpan>)
@@ -277,8 +374,11 @@ export class SandboxHarness {
       if (config.runCommand) {
         result.run = await this.driver.exec('run', config.runCommand, config)
         result.totalWallMs += result.run.wallMs
-        if (result.run.exitCode !== 0) {
-          await handle.fail(`run failed (exit ${result.run.exitCode})`, {
+        if (result.run.killedByTimeout || result.run.exitCode !== 0) {
+          const reason = result.run.killedByTimeout
+            ? `run timed out after ${result.run.wallMs}ms`
+            : `run failed (exit ${result.run.exitCode})`
+          await handle.fail(reason, {
             exitCode: result.run.exitCode,
             wallMs: result.totalWallMs,
           } as Partial<SandboxSpan>)
@@ -288,7 +388,9 @@ export class SandboxHarness {
       if (config.testCommand) {
         result.test = await this.driver.exec('test', config.testCommand, config)
         result.totalWallMs += result.test.wallMs
-        const passed = result.test.exitCode === 0
+        // A timed-out test phase is a hard fail regardless of the exit code
+        // a SIGKILLed child happens to report — never let it score as a pass.
+        const passed = !result.test.killedByTimeout && result.test.exitCode === 0
         result.passed = passed
         if (result.test.testsTotal !== undefined && result.test.testsTotal > 0) {
           result.score = (result.test.testsPassed ?? 0) / result.test.testsTotal
diff --git a/src/sandbox-pool.test.ts b/src/sandbox-pool.test.ts
new file mode 100644
index 00000000..a43e3d7b
--- /dev/null
+++ b/src/sandbox-pool.test.ts
@@ -0,0 +1,196 @@
+import { describe, expect, it, vi } from 'vitest'
+import { createSandboxPool, type SlotFactory } from './sandbox-pool'
+
+// A tick helper: let queued microtasks (the async release IIFE) drain.
+const flush = () => new Promise((r) => setTimeout(r, 0))
+
+describe('createSandboxPool reset-failure handling', () => {
+  it('destroys and replaces a slot whose reset failed — never recycles it dirty', async () => {
+    const created: string[] = []
+    const destroyed: string[] = []
+    let resetCalls = 0
+
+    const factory: SlotFactory<string> = {
+      async create(slotId) {
+        created.push(slotId)
+        return `res-${slotId}`
+      },
+      async reset() {
+        resetCalls++
+        // Fail the reset on the FIRST release. A dirty slot must not be
+        // handed to the next waiter; it must be destroyed and replaced.
+        if (resetCalls === 1) throw new Error('reset blew up')
+      },
+      async destroy(slot) {
+        destroyed.push(slot.id)
+      },
+    }
+
+    const warn = vi.spyOn(console, 'warn').mockImplementation(() => {})
+    try {
+      const pool = createSandboxPool({ size: 1, factory })
+
+      const a = await pool.checkout()
+      const firstId = a.slot.id
+      a.release()
+      await flush()
+
+      // The dirty slot must have been destroyed, not recycled.
+      expect(destroyed).toContain(firstId)
+
+      // The next checkout must hand back a FRESH slot (new id), proving the
+      // pool did not recycle the dirty one.
+      const b = await pool.checkout()
+      expect(b.slot.id).not.toBe(firstId)
+      expect(created.length).toBe(2)
+      b.release()
+      await flush()
+    } finally {
+      warn.mockRestore()
+    }
+  })
+
+  it('a waiter queued behind a failing-reset slot is satisfied by a fresh slot', async () => {
+    let resetCalls = 0
+    const factory: SlotFactory<number> = {
+      async create() {
+        return resetCalls
+      },
+      async reset() {
+        resetCalls++
+        if (resetCalls === 1) throw new Error('reset failed')
+      },
+      async destroy() {},
+    }
+
+    const warn = vi.spyOn(console, 'warn').mockImplementation(() => {})
+    try {
+      const pool = createSandboxPool({ size: 1, factory })
+
+      const a = await pool.checkout()
+      // Queue a second checkout while the only slot is busy.
+      const waiterP = pool.checkout()
+      // Release the first; its reset fails → slot destroyed → waiter must
+      // be served by a freshly minted slot rather than hang forever.
+      a.release()
+
+      const b = await waiterP
+      expect(b.slot.id).toBeDefined()
+      b.release()
+      await flush()
+    } finally {
+      warn.mockRestore()
+    }
+  })
+
+  it('a successful reset recycles the same slot (no needless churn)', async () => {
+    const destroyed: string[] = []
+    const factory: SlotFactory<string> = {
+      async create(slotId) {
+        return slotId
+      },
+      async reset() {},
+      async destroy(slot) {
+        destroyed.push(slot.id)
+      },
+    }
+    const pool = createSandboxPool({ size: 1, factory })
+
+    const a = await pool.checkout()
+    const id = a.slot.id
+    a.release()
+    await flush()
+
+    const b = await pool.checkout()
+    expect(b.slot.id).toBe(id)
+    expect(destroyed).not.toContain(id)
+    b.release()
+    await flush()
+  })
+
+  it('drain rejects a pending waiter instead of hanging forever', async () => {
+    const factory: SlotFactory<string> = {
+      async create(slotId) {
+        return slotId
+      },
+      async reset() {},
+      async destroy() {},
+    }
+    const pool = createSandboxPool({ size: 1, factory })
+
+    const a = await pool.checkout()
+    const waiterP = pool.checkout()
+    // Drain while a is still held and a waiter is queued.
+    await pool.drain()
+    await expect(waiterP).rejects.toThrow(/drained/)
+    a.release()
+    await flush()
+  })
+
+  it('does not raise an unhandled rejection when destroy also fails', async () => {
+    let resetCalls = 0
+    const factory: SlotFactory<string> = {
+      async create(slotId) {
+        return slotId
+      },
+      async reset() {
+        resetCalls++
+        if (resetCalls === 1) throw new Error('reset failed')
+      },
+      async destroy() {
+        throw new Error('destroy failed too')
+      },
+    }
+    const warn = vi.spyOn(console, 'warn').mockImplementation(() => {})
+    const unhandled: unknown[] = []
+    const onUnhandled = (e: unknown) => unhandled.push(e)
+    process.on('unhandledRejection', onUnhandled)
+    try {
+      const pool = createSandboxPool({ size: 1, factory })
+      const a = await pool.checkout()
+      a.release()
+      await flush()
+      await flush()
+      expect(unhandled).toHaveLength(0)
+    } finally {
+      process.off('unhandledRejection', onUnhandled)
+      warn.mockRestore()
+    }
+  })
+
+  it('does not over-mint under concurrent first checkouts (capacity reservation)', async () => {
+    let created = 0
+    const factory: SlotFactory<string> = {
+      async create(slotId) {
+        created++
+        await Promise.resolve()
+        return `res-${slotId}`
+      },
+      async reset() {},
+      async destroy() {},
+    }
+    const pool = createSandboxPool({ size: 1, factory })
+
+    // Two checkouts race before the first slot is push()ed. The mint gate reads
+    // slots.length, still 0 for both — without a reservation counter both mint
+    // and the size-1 pool provisions 2 sandboxes.
+    const aP = pool.checkout()
+    const bP = pool.checkout()
+    const a = await aP
+
+    // The second checkout must be QUEUED (pending), not a second mint. Race it
+    // against a macrotask: pre-fix it resolves immediately on its own slot.
+    const state = await Promise.race([bP.then(() => 'resolved'), flush().then(() => 'pending')])
+    expect(state).toBe('pending')
+    expect(created).toBe(1)
+    expect(pool.poolSize()).toBe(1)
+
+    // Releasing the one slot hands it to the queued waiter — still no 2nd mint.
+    a.release()
+    const b = await bP
+    expect(created).toBe(1)
+    expect(pool.poolSize()).toBe(1)
+    b.release()
+    await flush()
+  })
+})
diff --git a/src/sandbox-pool.ts b/src/sandbox-pool.ts
index 3d24ab93..38d09199 100644
--- a/src/sandbox-pool.ts
+++ b/src/sandbox-pool.ts
@@ -79,11 +79,22 @@ export function createSandboxPool<T>(opts: CreateSandboxPoolOpts<T>): SandboxPoo
   if (opts.size < 1) throw new Error(`sandbox pool size must be >= 1 (got ${opts.size})`)
 
   const slots: SlotState<T>[] = []
-  const waiters: Array<(s: SlotState<T>) => void> = []
+  interface Waiter {
+    resolve: (s: SlotState<T>) => void
+    reject: (err: unknown) => void
+  }
+  const waiters: Waiter[] = []
   const mutex = new Mutex()
   let nextSlotId = 0
+  // In-flight mints: a slot is decided under the mutex but only push()ed into
+  // `slots` AFTER the un-locked factory.create(). Counting reservations against
+  // capacity in the SAME critical section as the gate stops two concurrent
+  // acquires from both minting past `size` (slots.length alone is still 0 for
+  // both until the first push lands).
+  let reserving = 0
   let totalCheckouts = 0
   let busyMs = 0
+  let drained = false
   const startedAt = Date.now()
 
   /**
@@ -100,52 +111,113 @@ export function createSandboxPool<T>(opts: CreateSandboxPoolOpts<T>): SandboxPoo
         idle.busy = true
         return idle
       }
-      if (slots.length < opts.size) {
-        // Reserve a slot ID synchronously so concurrent acquireSlot
-        // calls don't all decide to mint past the cap.
+      if (slots.length + reserving < opts.size) {
+        // Reserve capacity synchronously (live slots + in-flight mints) so
+        // concurrent acquireSlot calls can't all decide to mint past the cap.
         mintId = `slot_${nextSlotId++}`
+        reserving++
         return null
       }
       return null
     })
     if (ready) return ready
     if (mintId !== undefined) {
-      const resource = await opts.factory.create(mintId)
+      let resource: T
+      try {
+        resource = await opts.factory.create(mintId)
+      } catch (err) {
+        // Release the reservation and hand the freed capacity to a waiter,
+        // otherwise a failed create leaks a capacity slot forever.
+        await mutex.runExclusive(() => {
+          reserving--
+        })
+        if (!drained) wakeWaiterToMint()
+        throw err
+      }
       const state: SlotState<T> = {
         slot: { id: mintId, resource },
         busy: true,
       }
       await mutex.runExclusive(() => {
         slots.push(state)
+        reserving--
       })
       return state
     }
     // All slots busy + at cap: queue.
-    return new Promise<SlotState<T>>((resolve) => {
-      waiters.push((s) => {
-        s.busy = true
-        resolve(s)
-      })
+    return new Promise<SlotState<T>>((resolve, reject) => {
+      waiters.push({ resolve, reject })
     })
   }
 
+  /**
+   * Hand `state` (clean + idle) to the next queued waiter, or leave it
+   * idle if none are waiting. Caller must have already marked the slot
+   * not-busy. Runs under no lock; waiters are FIFO.
+   */
+  function handOffCleanSlot(state: SlotState<T>): void {
+    const next = waiters.shift()
+    if (next) {
+      state.busy = true
+      next.resolve(state)
+    }
+  }
+
+  /**
+   * Wake a queued waiter when a slot was destroyed (so capacity freed up
+   * but no clean slot is on hand). The waiter re-enters acquisition and
+   * mints a fresh slot. Any acquisition failure is routed to that waiter
+   * — never dropped on the floor.
+   */
+  function wakeWaiterToMint(): void {
+    const next = waiters.shift()
+    if (!next) return
+    acquireSlot().then(next.resolve, next.reject)
+  }
+
+  /**
+   * Release a checked-out slot. Reset is async; we kick it off and the
+   * slot only becomes available to the next waiter once reset lands
+   * CLEAN. If reset fails, the slot is dirty — destroy and remove it
+   * rather than recycle a corrupted workspace to the next waiter, then
+   * free the capacity so a fresh slot can be minted. The whole flow is
+   * awaited inside one IIFE whose rejection is caught, so a failed
+   * destroy can't surface as an unhandled rejection.
+   */
   function releaseSlot(state: SlotState<T>): void {
-    // Non-async release — runs synchronously inside the user's finally.
-    // Reset is async; we kick it off and let the next waiter see a
-    // freshly-reset slot once it lands.
     void (async () => {
       try {
-        if (opts.factory.reset) await opts.factory.reset(state.slot)
-      } catch (err) {
-        // A failing reset is the consumer's bug; we still release
-        // (otherwise the pool deadlocks). Surface via console.warn so
-        // it doesn't get lost.
-        console.warn(`[sandbox-pool] reset failed for slot ${state.slot.id}:`, err)
+        if (opts.factory.reset) {
+          await opts.factory.reset(state.slot)
+        }
+        state.busy = false
+        if (!drained) handOffCleanSlot(state)
+      } catch (resetErr) {
+        // Dirty slot: do NOT recycle it. Remove from the pool and tear it
+        // down, then wake a waiter to mint a clean replacement.
+        await mutex.runExclusive(() => {
+          const i = slots.indexOf(state)
+          if (i !== -1) slots.splice(i, 1)
+        })
+        try {
+          await opts.factory.destroy(state.slot)
+        } catch (destroyErr) {
+          console.warn(
+            `[sandbox-pool] destroy of dirty slot ${state.slot.id} failed after reset error:`,
+            destroyErr,
+          )
+        }
+        console.warn(
+          `[sandbox-pool] reset failed for slot ${state.slot.id} — slot destroyed, not recycled:`,
+          resetErr,
+        )
+        if (!drained) wakeWaiterToMint()
       }
-      state.busy = false
-      const next = waiters.shift()
-      if (next) next(state)
-    })()
+    })().catch((err) => {
+      // Defense in depth: nothing above should throw out here, but if it
+      // does, surface it instead of producing an unhandled rejection.
+      console.warn(`[sandbox-pool] release of slot ${state.slot.id} faulted:`, err)
+    })
   }
 
   async function checkout(): Promise<{ slot: PoolSlot<T>; release: () => void }> {
@@ -171,20 +243,20 @@ export function createSandboxPool<T>(opts: CreateSandboxPoolOpts<T>): SandboxPoo
   }
 
   async function drain(): Promise<void> {
+    drained = true
     // Snapshot under lock; destroy outside lock so a slow teardown
     // doesn't block a concurrent drain caller.
-    const snapshot = await mutex.runExclusive(() => {
+    const { snapshot, pending } = await mutex.runExclusive(() => {
       const taken = slots.splice(0, slots.length)
-      // Reject any pending waiters — pool is going away.
-      for (const w of waiters.splice(0, waiters.length)) {
-        // Best-effort rejection: the waiter is still pending; we
-        // can't reject a Promise we already resolved. Surface as a
-        // warning. In practice, drain() is called when nothing's in
-        // flight (end of run), so this is a defensive no-op.
-        void w
-      }
-      return taken
+      const queued = waiters.splice(0, waiters.length)
+      return { snapshot: taken, pending: queued }
     })
+    // Reject any pending waiters — the pool is going away, so their
+    // checkout promise can never be satisfied. Failing loud beats leaving
+    // a caller hung on an awaited checkout() forever.
+    for (const w of pending) {
+      w.reject(new Error('sandbox pool drained while a checkout was pending'))
+    }
     await Promise.allSettled(snapshot.map((s) => opts.factory.destroy(s.slot)))
   }
 
diff --git a/src/trace-analyst/behavioral-metrics.test.ts b/src/trace-analyst/behavioral-metrics.test.ts
index 8b26119e..002bc89d 100644
--- a/src/trace-analyst/behavioral-metrics.test.ts
+++ b/src/trace-analyst/behavioral-metrics.test.ts
@@ -98,6 +98,29 @@ describe('computeTraceMetrics — deterministic behavioral signals (no LLM)', ()
     expect(m.hasSelfVerification).toBe(true)
     expect(m.signals.map((s) => s.code)).not.toContain('no-self-verification')
   })
+
+  it('FIRES monotonic-input-growth on a 0→huge blowup (first call reported 0 input tokens)', () => {
+    // First LLM call reports 0 input tokens, then context explodes. Ratio is
+    // unbounded — the old `first > 0 ? last/first : 0` forced growth to 0 and
+    // silently dropped the most severe possible blowup.
+    const spans = [llmSpan(1, 0, 100), llmSpan(2, 4000, 90), llmSpan(3, 9000, 80)]
+    const sig = computeTraceMetrics(spans).signals.find((s) => s.code === 'monotonic-input-growth')
+    expect(sig).toBeDefined()
+    expect(sig!.severity).toBe('high')
+    // Displayed ratio must stay sane (no NaN/Infinity literal leaking into the message).
+    expect(sig!.detail).not.toContain('Infinity')
+    expect(sig!.detail).not.toContain('NaN')
+    expect(sig!.detail).toContain('0→9000')
+    expect(sig!.evidence.first).toBe(0)
+    expect(sig!.evidence.last).toBe(9000)
+    expect(sig!.evidence.growth_x).toBe('unbounded')
+  })
+
+  it('does NOT fire monotonic-input-growth when input stays at 0 (no blowup)', () => {
+    const spans = [llmSpan(1, 0, 100), llmSpan(2, 0, 90), llmSpan(3, 0, 80)]
+    const codes = computeTraceMetrics(spans).signals.map((s) => s.code)
+    expect(codes).not.toContain('monotonic-input-growth')
+  })
 })
 
 describe('deriveEfficiencyFindings — the 0→4, any-model flip', () => {
diff --git a/src/trace-analyst/behavioral-metrics.ts b/src/trace-analyst/behavioral-metrics.ts
index ef34f939..e15c31b0 100644
--- a/src/trace-analyst/behavioral-metrics.ts
+++ b/src/trace-analyst/behavioral-metrics.ts
@@ -105,16 +105,21 @@ export function computeTraceMetrics(spans: readonly TraceAnalystSpan[]): Behavio
   if (inputTokenTrajectory.length >= 3) {
     const first = inputTokenTrajectory[0]!
     const last = inputTokenTrajectory[inputTokenTrajectory.length - 1]!
-    const growth = first > 0 ? last / first : 0
+    // first === 0 with later growth is an unbounded ratio (0→huge context blowup);
+    // treat it as infinite so the signal fires, and report it as such instead of
+    // dividing by zero for the displayed factor.
+    const growthFromZero = first === 0 && last > 0
+    const growth = growthFromZero ? Infinity : first > 0 ? last / first : 0
     if (last > first && growth >= INPUT_GROWTH_FACTOR) {
+      const growthLabel = growthFromZero ? '0→nonzero (unbounded)' : `${growth.toFixed(1)}x`
       signals.push({
         code: 'monotonic-input-growth',
         severity: 'high',
-        detail: `LLM input tokens grew ${growth.toFixed(1)}x (${first}→${last}) across ${inputTokenTrajectory.length} calls — full history re-sent each step with no compression.`,
+        detail: `LLM input tokens grew ${growthLabel} (${first}→${last}) across ${inputTokenTrajectory.length} calls — full history re-sent each step with no compression.`,
         evidence: {
           first,
           last,
-          growth_x: Number(growth.toFixed(2)),
+          growth_x: growthFromZero ? 'unbounded' : Number(growth.toFixed(2)),
           calls: inputTokenTrajectory.length,
         },
       })
diff --git a/src/trace-analyst/otlp-span.test.ts b/src/trace-analyst/otlp-span.test.ts
new file mode 100644
index 00000000..9696664c
--- /dev/null
+++ b/src/trace-analyst/otlp-span.test.ts
@@ -0,0 +1,96 @@
+import { describe, expect, it } from 'vitest'
+import { projectOtlpFlatLine } from './otlp-span'
+
+describe('projectOtlpFlatLine', () => {
+  it('returns null when trace_id is missing', () => {
+    expect(projectOtlpFlatLine({ span_id: 's1' })).toBeNull()
+  })
+
+  it('returns null when span_id is missing', () => {
+    expect(projectOtlpFlatLine({ trace_id: 't1' })).toBeNull()
+  })
+
+  it('projects a line that has both trace_id and span_id', () => {
+    const out = projectOtlpFlatLine({ trace_id: 't1', span_id: 's1' })
+    expect(out).not.toBeNull()
+    expect(out!.trace_id).toBe('t1')
+    expect(out!.span_id).toBe('s1')
+  })
+
+  it('resolves camelCase pivots (traceId/spanId/parentSpanId/startTime/endTime)', () => {
+    const out = projectOtlpFlatLine({
+      traceId: 'tc',
+      spanId: 'sc',
+      parentSpanId: 'pc',
+      startTime: '2024-01-01T00:00:00.000Z',
+      endTime: '2024-01-01T00:00:01.000Z',
+    })
+    expect(out).not.toBeNull()
+    expect(out!.trace_id).toBe('tc')
+    expect(out!.span_id).toBe('sc')
+    expect(out!.parent_span_id).toBe('pc')
+    expect(out!.duration_ms).toBe(1000)
+  })
+
+  it('resolves snake_case pivots (trace_id/span_id/parent_span_id/start_time/end_time)', () => {
+    const out = projectOtlpFlatLine({
+      trace_id: 'ts',
+      span_id: 'ss',
+      parent_span_id: 'ps',
+      start_time: '2024-01-01T00:00:00.000Z',
+      end_time: '2024-01-01T00:00:02.500Z',
+    })
+    expect(out).not.toBeNull()
+    expect(out!.parent_span_id).toBe('ps')
+    expect(out!.duration_ms).toBe(2500)
+  })
+
+  it('span attributes override resource attributes on key overlap', () => {
+    const out = projectOtlpFlatLine({
+      trace_id: 't',
+      span_id: 's',
+      resource: { attributes: { 'service.name': 'from-resource', 'only.resource': 'r' } },
+      attributes: { 'service.name': 'from-span', 'only.span': 'sp' },
+    })
+    expect(out).not.toBeNull()
+    // span attribute wins
+    expect(out!.attributes['service.name']).toBe('from-span')
+    expect(out!.service_name).toBe('from-span')
+    // non-overlapping keys from both layers survive
+    expect(out!.attributes['only.resource']).toBe('r')
+    expect(out!.attributes['only.span']).toBe('sp')
+  })
+
+  it('clamps duration to 0 when timestamps are unparseable', () => {
+    const out = projectOtlpFlatLine({
+      trace_id: 't',
+      span_id: 's',
+      start_time: 'not-a-date',
+      end_time: 'also-not-a-date',
+    })
+    expect(out).not.toBeNull()
+    expect(out!.duration_ms).toBe(0)
+  })
+
+  it('clamps duration to 0 when end precedes start (never negative)', () => {
+    const out = projectOtlpFlatLine({
+      trace_id: 't',
+      span_id: 's',
+      start_time: '2024-01-01T00:00:05.000Z',
+      end_time: '2024-01-01T00:00:01.000Z',
+    })
+    expect(out).not.toBeNull()
+    expect(out!.duration_ms).toBe(0)
+  })
+
+  it('defaults end_time to start_time when end is absent (duration 0)', () => {
+    const out = projectOtlpFlatLine({
+      trace_id: 't',
+      span_id: 's',
+      start_time: '2024-01-01T00:00:05.000Z',
+    })
+    expect(out).not.toBeNull()
+    expect(out!.duration_ms).toBe(0)
+    expect(out!.end_time).toBe('2024-01-01T00:00:05.000Z')
+  })
+})
diff --git a/src/trace-analyst/otlp-span.ts b/src/trace-analyst/otlp-span.ts
index 8697a829..6a5d86d2 100644
--- a/src/trace-analyst/otlp-span.ts
+++ b/src/trace-analyst/otlp-span.ts
@@ -216,3 +216,25 @@ export function firstStringAttr(
   }
   return null
 }
+
+/**
+ * Parse a span timestamp to epoch millis, or null when empty/unparseable. The
+ * OTLP readers accept BOTH ISO-8601 and epoch-millis-string dialects, so raw
+ * string comparison (`<`, `localeCompare`) mis-orders across dialects and
+ * `Date.parse` returns NaN for a bare epoch-millis string.
+ */
+export function spanEpochMillis(ts: string | undefined | null): number | null {
+  if (!ts) return null
+  if (/^\d+$/.test(ts)) return Number(ts)
+  const n = Date.parse(ts)
+  return Number.isNaN(n) ? null : n
+}
+
+/**
+ * Order comparator for span timestamps across mixed ISO/epoch dialects.
+ * Unparseable timestamps sort as epoch 0 (earliest), never NaN (which would
+ * make the sort non-deterministic).
+ */
+export function compareSpanTime(a: string, b: string): number {
+  return (spanEpochMillis(a) ?? 0) - (spanEpochMillis(b) ?? 0)
+}
diff --git a/src/trace-analyst/otlp-to-run-records.timestamps.test.ts b/src/trace-analyst/otlp-to-run-records.timestamps.test.ts
new file mode 100644
index 00000000..7571384e
--- /dev/null
+++ b/src/trace-analyst/otlp-to-run-records.timestamps.test.ts
@@ -0,0 +1,54 @@
+import { describe, expect, it } from 'vitest'
+import { otlpToRunRecords } from './otlp-to-run-records'
+
+// One OTLP-flat span line with sane defaults.
+function line(o: Record<string, unknown>): string {
+  return JSON.stringify({ status: { code: 'OK' }, attributes: {}, ...o })
+}
+const opts = { experimentId: 'e', candidateId: 'c' } as const
+
+describe('otlpToRunRecords timestamp handling (epoch + ISO dialects)', () => {
+  it('computes wallMs from epoch-millis-string timestamps — Date.parse would NaN→0', () => {
+    const jsonl = line({
+      trace_id: 't1',
+      span_id: 'a',
+      name: 'llm',
+      start_time: '1700000000000',
+      end_time: '1700000005000',
+      attributes: {
+        'openinference.span.kind': 'LLM',
+        'llm.input_tokens': 10,
+        'llm.output_tokens': 2,
+      },
+    })
+    const recs = otlpToRunRecords(jsonl, opts)
+    expect(recs).toHaveLength(1)
+    // Pre-fix: Date.parse('1700000000000') is NaN → wallMs silently 0.
+    expect(recs[0]!.wallMs).toBe(5000)
+  })
+
+  it('orders trace bounds by real time across mixed ISO + epoch dialects', () => {
+    const jsonl = [
+      line({
+        trace_id: 't2',
+        span_id: 'older',
+        name: 'llm',
+        start_time: '1700000000000',
+        end_time: '1700000001000',
+        attributes: { 'openinference.span.kind': 'LLM' },
+      }),
+      line({
+        trace_id: 't2',
+        span_id: 'newer',
+        name: 'llm',
+        start_time: '2024-01-01T00:00:00.000Z',
+        end_time: '2024-01-01T00:00:10.000Z',
+        attributes: { 'openinference.span.kind': 'LLM' },
+      }),
+    ].join('\n')
+    const recs = otlpToRunRecords(jsonl, opts)
+    expect(recs).toHaveLength(1)
+    const expected = Date.parse('2024-01-01T00:00:10.000Z') - 1_700_000_000_000
+    expect(recs[0]!.wallMs).toBe(expected)
+  })
+})
diff --git a/src/trace-analyst/otlp-to-run-records.ts b/src/trace-analyst/otlp-to-run-records.ts
index c5d354c9..420173f4 100644
--- a/src/trace-analyst/otlp-to-run-records.ts
+++ b/src/trace-analyst/otlp-to-run-records.ts
@@ -40,10 +40,12 @@ import {
   validateRunRecord,
 } from '../run-record'
 import {
+  compareSpanTime,
   firstNumberAttr,
   firstStringAttr,
   type ProjectedOtlpSpan,
   projectOtlpFlatLine,
+  spanEpochMillis,
 } from './otlp-span'
 
 /** Candidate attribute keys for LLM input (prompt) tokens, both dialects. */
@@ -277,9 +279,10 @@ function aggregateTrace(
   spans: ProjectedOtlpSpan[],
   fallbackModel: string,
 ): TraceAggregate {
-  // Span order within a trace is by start time, then a stable tiebreak.
+  // Span order within a trace is by start time (epoch-aware across ISO/epoch
+  // dialects), then a stable tiebreak.
   const ordered = [...spans].sort(
-    (a, b) => a.start_time.localeCompare(b.start_time) || a.span_id.localeCompare(b.span_id),
+    (a, b) => compareSpanTime(a.start_time, b.start_time) || a.span_id.localeCompare(b.span_id),
   )
 
   let input = 0
@@ -296,8 +299,9 @@ function aggregateTrace(
   let latest = ordered[0]?.end_time ?? ''
 
   for (const s of ordered) {
-    if (s.start_time && (!earliest || s.start_time < earliest)) earliest = s.start_time
-    if (s.end_time && s.end_time > latest) latest = s.end_time
+    if (s.start_time && (!earliest || compareSpanTime(s.start_time, earliest) < 0))
+      earliest = s.start_time
+    if (s.end_time && (!latest || compareSpanTime(s.end_time, latest) > 0)) latest = s.end_time
 
     if (s.kind === 'LLM') {
       llmSpanCount += 1
@@ -331,11 +335,11 @@ function aggregateTrace(
   const model = topVote(modelVotes) ?? firstModelAttr(ordered) ?? fallbackModel
 
   let wallMs = 0
-  if (earliest && latest) {
-    const a = Date.parse(earliest)
-    const b = Date.parse(latest)
-    if (!Number.isNaN(a) && !Number.isNaN(b)) wallMs = Math.max(0, b - a)
-  }
+  // epoch-aware: Date.parse returns NaN for a bare epoch-millis string, which
+  // silently zeroed wallMs for traces emitted with epoch timestamps.
+  const a = spanEpochMillis(earliest)
+  const b = spanEpochMillis(latest)
+  if (a !== null && b !== null) wallMs = Math.max(0, b - a)
 
   const tokenUsage: RunTokenUsage = sawCached ? { input, output, cached } : { input, output }
 
@@ -403,7 +407,9 @@ function extractPromptCompletion(spans: ProjectedOtlpSpan[]): {
 } {
   const llm = spans
     .filter((s) => s.kind === 'LLM')
-    .sort((a, b) => a.start_time.localeCompare(b.start_time) || a.span_id.localeCompare(b.span_id))
+    .sort(
+      (a, b) => compareSpanTime(a.start_time, b.start_time) || a.span_id.localeCompare(b.span_id),
+    )
   if (llm.length === 0) return {}
   const promptText =
     firstStringAttr(llm[0]!.attributes, ['input.value', 'llm.input_messages', 'gen_ai.prompt']) ??
diff --git a/src/trace-analyst/store-otlp.test.ts b/src/trace-analyst/store-otlp.test.ts
index f94ffadc..b2500c58 100644
--- a/src/trace-analyst/store-otlp.test.ts
+++ b/src/trace-analyst/store-otlp.test.ts
@@ -11,7 +11,12 @@ import { join } from 'node:path'
 
 import { describe, expect, it } from 'vitest'
 import { compileSearchRegex } from './store'
-import { OtlpFileTraceStore, TraceFileMissingError, TraceNotFoundError } from './store-otlp'
+import {
+  OtlpFileTraceStore,
+  TraceFileMissingError,
+  TraceFileTooLargeError,
+  TraceNotFoundError,
+} from './store-otlp'
 
 const TINY_FIXTURE = new URL('../../tests/fixtures/trace-analyst/tiny-trace.jsonl', import.meta.url)
   .pathname
@@ -177,7 +182,9 @@ describe('OtlpFileTraceStore', () => {
       max_matches: 2,
     })
     expect(result.hits.length).toBe(2)
-    expect(result.total_matches).toBeGreaterThan(result.hits.length)
+    // Capped: total_matches mirrors the hit count (no fabricated +1); the
+    // "more exist" signal lives in has_more, not in an invented total.
+    expect(result.total_matches).toBe(result.hits.length)
     expect(result.has_more).toBe(true)
   })
 
@@ -195,7 +202,8 @@ describe('OtlpFileTraceStore', () => {
     })
     expect(result.hits).toHaveLength(3)
     expect(result.has_more).toBe(true)
-    expect(result.total_matches).toBeGreaterThan(3)
+    // Capped: no fabricated total — mirrors hits, has_more carries the signal.
+    expect(result.total_matches).toBe(3)
   })
 
   it('searchSpan returns hits scoped to one span only', async () => {
@@ -310,3 +318,182 @@ describe('OtlpFileTraceStore — error_clusters (deterministic failure coverage)
     expect(Array.isArray(overview.error_clusters)).toBe(true)
   })
 })
+
+describe('OtlpFileTraceStore — concurrent reads (per-call truncation counter)', () => {
+  // Each span carries a single oversized string attribute so a tiny
+  // per-attribute budget forces exactly ONE truncation per projected span.
+  const bigSpan = (trace: string, span: string) =>
+    JSON.stringify({
+      trace_id: trace,
+      span_id: span,
+      parent_span_id: '',
+      name: 'op',
+      kind: 'SPAN_KIND_INTERNAL',
+      start_time: '2026-05-29T00:00:00Z',
+      end_time: '2026-05-29T00:00:01Z',
+      status: { code: 'STATUS_CODE_OK' },
+      resource: {},
+      attributes: { 'openinference.span.kind': 'TOOL', 'input.value': 'Z'.repeat(5000) },
+    })
+
+  it('reports each call its own truncation count under interleaved reads — bug class: store-keyed delta cross-contaminates concurrent calls', async () => {
+    const tmp = await mkdtemp(join(tmpdir(), 'trace-concurrency-'))
+    const path = join(tmp, 'trace.jsonl')
+    try {
+      // Trace A: 1 span (→ 1 truncation). Trace B: 3 spans (→ 3 truncations).
+      const lines = [
+        bigSpan('tA', 'a1'),
+        bigSpan('tB', 'b1'),
+        bigSpan('tB', 'b2'),
+        bigSpan('tB', 'b3'),
+      ]
+      await writeFile(path, `${lines.join('\n')}\n`)
+      const store = new OtlpFileTraceStore({ path, perAttributeSpanBudget: 100 })
+
+      // Fire both reads on the SAME store concurrently. With the old
+      // before/after delta on a store-keyed WeakMap the awaits interleave
+      // and each call absorbs the other's increments.
+      const [a, b] = await Promise.all([
+        store.viewSpans({ trace_id: 'tA', span_ids: ['a1'] }),
+        store.viewSpans({ trace_id: 'tB', span_ids: ['b1', 'b2', 'b3'] }),
+      ])
+      expect(a.truncated_attribute_count).toBe(1)
+      expect(b.truncated_attribute_count).toBe(3)
+
+      // Repeated interleavings stay stable — no accumulation, no bleed.
+      for (let i = 0; i < 5; i += 1) {
+        const [x, y] = await Promise.all([
+          store.viewSpans({ trace_id: 'tA', span_ids: ['a1'] }),
+          store.viewSpans({ trace_id: 'tB', span_ids: ['b1', 'b2', 'b3'] }),
+        ])
+        expect(x.truncated_attribute_count).toBe(1)
+        expect(y.truncated_attribute_count).toBe(3)
+      }
+    } finally {
+      await rm(tmp, { recursive: true, force: true })
+    }
+  })
+})
+
+describe('OtlpFileTraceStore — searchTrace total_matches honesty', () => {
+  it('reports the EXACT total when uncapped — total equals hit count, has_more is false', async () => {
+    const store = new OtlpFileTraceStore({ path: TINY_FIXTURE })
+    // 'STATUS_CODE_ERROR' occurs once in t000000000001; uncapped scan.
+    const r = await store.searchTrace({
+      trace_id: 't000000000001',
+      regex_pattern: 'STATUS_CODE_ERROR',
+      max_matches: 50,
+    })
+    expect(r.total_matches).toBe(r.hits.length)
+    expect(r.total_matches).toBe(1)
+    expect(r.has_more).toBe(false)
+  })
+
+  it('does NOT fabricate total_matches when capped — bug class: total = max(total, hits+1) invents a number', async () => {
+    const store = new OtlpFileTraceStore({ path: TINY_FIXTURE })
+    // 'span_id' appears in every span line — easily exceeds the cap.
+    const r = await store.searchTrace({
+      trace_id: 't000000000001',
+      regex_pattern: 'span_id',
+      max_matches: 2,
+    })
+    expect(r.hits.length).toBe(2)
+    // Old behavior: total_matches = hits.length + 1 = 3 (a fabricated
+    // count). New behavior: mirror the hit count and lean on has_more.
+    expect(r.total_matches).toBe(2)
+    expect(r.has_more).toBe(true)
+  })
+})
+
+describe('OtlpFileTraceStore — bestAttributePathForOffset escaped quotes', () => {
+  it('resolves the real attribute_path past a \\"-escaped quote inside the value — bug class: backward scan stops at an escaped quote and returns garbage', async () => {
+    const tmp = await mkdtemp(join(tmpdir(), 'trace-escq-'))
+    const path = join(tmp, 'trace.jsonl')
+    try {
+      // output.value is itself a JSON string: its escaped inner quotes
+      // AND inner ':' derail a non-escape-aware backward scan, making it
+      // report the inner "phase" key instead of the real attribute.
+      await writeFile(
+        path,
+        `${JSON.stringify({
+          trace_id: 'tq',
+          span_id: 's1',
+          parent_span_id: '',
+          name: 'op',
+          kind: 'SPAN_KIND_INTERNAL',
+          start_time: '2026-05-29T00:00:00Z',
+          end_time: '2026-05-29T00:00:01Z',
+          status: { code: 'STATUS_CODE_OK' },
+          resource: {},
+          attributes: {
+            'openinference.span.kind': 'TOOL',
+            'output.value': '{"phase":"NEEDLE"}',
+          },
+        })}\n`,
+      )
+      const store = new OtlpFileTraceStore({ path })
+      const r = await store.searchSpan({
+        trace_id: 'tq',
+        span_id: 's1',
+        regex_pattern: 'NEEDLE',
+      })
+      expect(r.hits.length).toBe(1)
+      expect(r.hits[0]!.matched_text).toBe('NEEDLE')
+      // The match lives inside output.value; the path must name that key,
+      // not a fragment of the value or a different field.
+      expect(r.hits[0]!.attribute_path).toBe('output.value')
+    } finally {
+      await rm(tmp, { recursive: true, force: true })
+    }
+  })
+})
+
+describe('OtlpFileTraceStore — max-file-size guard (fail loud, no OOM)', () => {
+  it('throws TraceFileTooLargeError above maxFileBytes instead of reading the whole file', async () => {
+    const tmp = await mkdtemp(join(tmpdir(), 'trace-toobig-'))
+    const path = join(tmp, 'trace.jsonl')
+    try {
+      const line = JSON.stringify({
+        trace_id: 't',
+        span_id: 's',
+        parent_span_id: '',
+        name: 'op',
+        start_time: '2026-05-29T00:00:00Z',
+        end_time: '2026-05-29T00:00:01Z',
+        status: { code: 'STATUS_CODE_OK' },
+        resource: {},
+        attributes: { 'openinference.span.kind': 'TOOL' },
+      })
+      await writeFile(path, `${line}\n`)
+      // File is a few hundred bytes; set the ceiling below it.
+      const store = new OtlpFileTraceStore({ path, maxFileBytes: 10 })
+      await expect(store.ensureIndexed()).rejects.toBeInstanceOf(TraceFileTooLargeError)
+    } finally {
+      await rm(tmp, { recursive: true, force: true })
+    }
+  })
+
+  it('indexes normally when the file is within the configured ceiling', async () => {
+    const tmp = await mkdtemp(join(tmpdir(), 'trace-okbig-'))
+    const path = join(tmp, 'trace.jsonl')
+    try {
+      const line = JSON.stringify({
+        trace_id: 't',
+        span_id: 's',
+        parent_span_id: '',
+        name: 'op',
+        start_time: '2026-05-29T00:00:00Z',
+        end_time: '2026-05-29T00:00:01Z',
+        status: { code: 'STATUS_CODE_OK' },
+        resource: {},
+        attributes: { 'openinference.span.kind': 'TOOL' },
+      })
+      await writeFile(path, `${line}\n`)
+      const store = new OtlpFileTraceStore({ path, maxFileBytes: 1024 * 1024 })
+      const overview = await store.getOverview()
+      expect(overview.total_traces).toBe(1)
+    } finally {
+      await rm(tmp, { recursive: true, force: true })
+    }
+  })
+})
diff --git a/src/trace-analyst/store-otlp.timestamps.test.ts b/src/trace-analyst/store-otlp.timestamps.test.ts
new file mode 100644
index 00000000..8ad562f2
--- /dev/null
+++ b/src/trace-analyst/store-otlp.timestamps.test.ts
@@ -0,0 +1,71 @@
+import { mkdtempSync, writeFileSync } from 'node:fs'
+import { tmpdir } from 'node:os'
+import { join } from 'node:path'
+import { describe, expect, it } from 'vitest'
+import { OtlpFileTraceStore } from './store-otlp'
+
+const dir = mkdtempSync(join(tmpdir(), 'otlp-ts-'))
+
+interface Line {
+  trace_id: string
+  span_id: string
+  name?: string
+  start_time?: string
+  end_time?: string
+  status?: { code: string }
+  attributes?: Record<string, unknown>
+}
+
+function writeOtlp(name: string, lines: Line[]): string {
+  const p = join(dir, name)
+  writeFileSync(
+    p,
+    `${lines.map((l) => JSON.stringify({ name: 's', status: { code: 'OK' }, attributes: {}, ...l })).join('\n')}\n`,
+  )
+  return p
+}
+
+describe('OtlpFileTraceStore timestamp handling', () => {
+  it('yields a finite duration (0), never NaN/null, when timestamps are empty', async () => {
+    const path = writeOtlp('empty.jsonl', [
+      { trace_id: 't1', span_id: 'a', start_time: '', end_time: '' },
+      { trace_id: 't1', span_id: 'b', start_time: '', end_time: '' },
+    ])
+    const store = new OtlpFileTraceStore({ path })
+    const page = await store.queryTraces({ limit: 10 })
+    expect(page.traces[0]!.duration_ms).toBe(0)
+    expect(Number.isNaN(page.traces[0]!.duration_ms)).toBe(false)
+  })
+
+  it('orders the time range by real time across mixed ISO + epoch-millis dialects', async () => {
+    // epoch-ms 1000 (1970) is earlier than the ISO 2024 timestamp; lexical
+    // string compare would order them backwards.
+    const path = writeOtlp('mixed.jsonl', [
+      {
+        trace_id: 't2',
+        span_id: 'a',
+        start_time: '2024-01-01T00:00:00.000Z',
+        end_time: '2024-01-01T00:00:01.000Z',
+      },
+      { trace_id: 't2', span_id: 'b', start_time: '1000', end_time: '2000' },
+    ])
+    const store = new OtlpFileTraceStore({ path })
+    const ov = await store.getOverview()
+    expect(ov.time_range?.earliest).toBe('1000')
+    expect(ov.time_range?.latest).toBe('2024-01-01T00:00:01.000Z')
+  })
+
+  it('computes a correct positive duration for a normal ISO trace', async () => {
+    const path = writeOtlp('normal.jsonl', [
+      {
+        trace_id: 't3',
+        span_id: 'a',
+        start_time: '2024-01-01T00:00:00.000Z',
+        end_time: '2024-01-01T00:00:05.000Z',
+      },
+    ])
+    const store = new OtlpFileTraceStore({ path })
+    const page = await store.queryTraces({ limit: 10 })
+    expect(page.traces[0]!.duration_ms).toBe(5000)
+  })
+})
diff --git a/src/trace-analyst/store-otlp.ts b/src/trace-analyst/store-otlp.ts
index 24183c1b..fb146802 100644
--- a/src/trace-analyst/store-otlp.ts
+++ b/src/trace-analyst/store-otlp.ts
@@ -47,6 +47,34 @@ import {
   type ViewTraceResult,
 } from './types'
 
+/**
+ * Parse a span timestamp to epoch millis, or null when empty/unparseable. The
+ * OTLP readers accept BOTH ISO-8601 and epoch-millis-string dialects, so raw
+ * string comparison (`<`, localeCompare) is wrong across dialects, and
+ * `new Date(str)` is NaN for an epoch-millis string.
+ */
+function epochOrNull(ts: string): number | null {
+  if (!ts) return null
+  if (/^\d+$/.test(ts)) return Number(ts)
+  const n = Date.parse(ts)
+  return Number.isNaN(n) ? null : n
+}
+
+/** Ordering key: unparseable timestamps sort as epoch 0 (earliest), never NaN. */
+function epochMs(ts: string): number {
+  return epochOrNull(ts) ?? 0
+}
+
+/** Lines indexed between event-loop yields. Bounded synchronous work per
+ *  tick keeps the index build from starving other tasks on large files
+ *  while staying coarse enough that the yields are cheap. */
+const INDEX_YIELD_LINES = 5000
+
+/** Hand control back to the event loop without busy-waiting. */
+function yieldToEventLoop(): Promise<void> {
+  return new Promise((resolve) => setImmediate(resolve))
+}
+
 interface SpanIndexEntry {
   span_id: string
   parent_span_id: string | null
@@ -101,14 +129,26 @@ export interface OtlpFileTraceStoreOptions {
   perCallByteCeiling?: number
   /** Override the per-match text budget. */
   perMatchTextBudget?: number
+  /**
+   * Hard ceiling on the trace file size in bytes. The store reads the
+   * whole file into one Buffer and indexes it in memory, so an
+   * unbounded file OOMs the process. Above this size the store fails
+   * loud with `TraceFileTooLargeError` instead of degrading silently.
+   * Default 256 MiB.
+   */
+  maxFileBytes?: number
 }
 
+/** Default ceiling for {@link OtlpFileTraceStoreOptions.maxFileBytes}. */
+export const DEFAULT_MAX_TRACE_FILE_BYTES = 256 * 1024 * 1024
+
 export class OtlpFileTraceStore implements TraceAnalysisStore {
   private readonly path: string
   private readonly perAttributeViewBudget: number
   private readonly perAttributeSpanBudget: number
   private readonly perCallByteCeiling: number
   private readonly perMatchTextBudget: number
+  private readonly maxFileBytes: number
   private indexPromise?: Promise<DatasetIndex>
   /** Cached UTF-8 buffer of the file. We pin it once because every
    *  read needs slice access and re-reading on each call balloons the
@@ -125,6 +165,7 @@ export class OtlpFileTraceStore implements TraceAnalysisStore {
       opts.perCallByteCeiling ?? DEFAULT_TRACE_ANALYST_BUDGETS.perCallByteCeiling
     this.perMatchTextBudget =
       opts.perMatchTextBudget ?? DEFAULT_TRACE_ANALYST_BUDGETS.perMatchTextBudget
+    this.maxFileBytes = opts.maxFileBytes ?? DEFAULT_MAX_TRACE_FILE_BYTES
   }
 
   // ─── Public API ────────────────────────────────────────────────────
@@ -150,8 +191,8 @@ export class OtlpFileTraceStore implements TraceAnalysisStore {
       for (const m of t.models) models.add(m)
       for (const tn of t.tools) tools.add(tn)
       rawBytes += t.raw_jsonl_bytes
-      if (!earliest || t.start_time < earliest) earliest = t.start_time
-      if (!latest || t.end_time > latest) latest = t.end_time
+      if (!earliest || epochMs(t.start_time) < epochMs(earliest)) earliest = t.start_time
+      if (!latest || epochMs(t.end_time) > epochMs(latest)) latest = t.end_time
       if (t.has_errors) {
         errorTraceCount += 1
         for (const s of t.spans) {
@@ -224,8 +265,9 @@ export class OtlpFileTraceStore implements TraceAnalysisStore {
     const spans: TraceAnalystSpan[] = []
     let runningBytes = 0
     let span_response_bytes_max = 0
+    const counter: TruncationCounter = { value: 0 }
     for (const s of trace.spans) {
-      const projected = await this.projectSpan(buf, trace.trace_id, s, cap)
+      const projected = this.projectSpan(buf, trace.trace_id, s, cap, counter)
       const bytes = Buffer.byteLength(JSON.stringify(projected), 'utf8')
       span_response_bytes_max = Math.max(span_response_bytes_max, bytes)
       runningBytes += bytes
@@ -267,12 +309,10 @@ export class OtlpFileTraceStore implements TraceAnalysisStore {
 
     const buf = await this.buffer()
     const spans: TraceAnalystSpan[] = []
-    let truncated = 0
+    const counter: TruncationCounter = { value: 0 }
     let runningBytes = 0
     for (const s of found) {
-      const before = truncationCounter(this)
-      const projected = await this.projectSpan(buf, trace.trace_id, s, cap)
-      truncated += before.delta()
+      const projected = this.projectSpan(buf, trace.trace_id, s, cap, counter)
       const bytes = Buffer.byteLength(JSON.stringify(projected), 'utf8')
       runningBytes += bytes
       if (runningBytes > this.perCallByteCeiling) {
@@ -286,7 +326,7 @@ export class OtlpFileTraceStore implements TraceAnalysisStore {
       trace_id: trace.trace_id,
       spans,
       missing_span_ids: missing,
-      truncated_attribute_count: truncated,
+      truncated_attribute_count: counter.value,
     }
   }
 
@@ -324,15 +364,19 @@ export class OtlpFileTraceStore implements TraceAnalysisStore {
         hits.push(h)
       }
       if (hits.length >= max_matches) {
+        // Capped: we stopped scanning, so `total` is a lower bound on the
+        // real match count — never report it as exact. has_more signals
+        // "more exist"; total_matches mirrors hits so callers don't read a
+        // fabricated number.
         capped = true
-        total = Math.max(total, hits.length + 1)
         break
       }
     }
     return {
       trace_id: trace.trace_id,
       hits,
-      total_matches: total,
+      // Uncapped: every span scanned to completion, so `total` is exact.
+      total_matches: capped ? hits.length : total,
       has_more: capped || total > hits.length,
     }
   }
@@ -383,11 +427,30 @@ export class OtlpFileTraceStore implements TraceAnalysisStore {
 
   private async buffer(): Promise<Buffer> {
     if (!this.bufferPromise) {
-      this.bufferPromise = readFile(this.path)
+      this.bufferPromise = this.readGuarded()
     }
     return this.bufferPromise
   }
 
+  /** Stat-then-read so an oversized file fails loud BEFORE we allocate a
+   *  multi-hundred-MB Buffer and OOM the process. A missing file surfaces
+   *  as TraceFileMissingError; any other stat/read error propagates. */
+  private async readGuarded(): Promise<Buffer> {
+    let stats: Awaited<ReturnType<typeof stat>>
+    try {
+      stats = await stat(this.path)
+    } catch (err) {
+      if ((err as NodeJS.ErrnoException)?.code === 'ENOENT') {
+        throw new TraceFileMissingError(this.path)
+      }
+      throw err
+    }
+    if (stats.size > this.maxFileBytes) {
+      throw new TraceFileTooLargeError(this.path, stats.size, this.maxFileBytes)
+    }
+    return readFile(this.path)
+  }
+
   private async index(): Promise<DatasetIndex> {
     if (!this.indexPromise) {
       this.indexPromise = this.buildIndex()
@@ -396,20 +459,19 @@ export class OtlpFileTraceStore implements TraceAnalysisStore {
   }
 
   private async buildIndex(): Promise<DatasetIndex> {
-    let buf: Buffer
-    try {
-      buf = await this.buffer()
-    } catch (err) {
-      const stats = await stat(this.path).catch(() => null)
-      if (!stats) {
-        throw new TraceFileMissingError(this.path)
-      }
-      throw err
-    }
+    // readGuarded surfaces missing/oversized files as typed errors.
+    const buf = await this.buffer()
 
     const byTrace = new Map<string, TraceIndexEntry>()
     let cursor = 0
+    let sinceYield = 0
     while (cursor < buf.length) {
+      // Yield to the event loop every INDEX_YIELD_LINES lines so a huge
+      // file doesn't monopolise the thread for the whole index build.
+      if (++sinceYield >= INDEX_YIELD_LINES) {
+        sinceYield = 0
+        await yieldToEventLoop()
+      }
       const newlineIndex = buf.indexOf(0x0a, cursor) // \n
       const lineEnd = newlineIndex === -1 ? buf.length : newlineIndex
       const lineLength = lineEnd - cursor
@@ -478,8 +540,8 @@ export class OtlpFileTraceStore implements TraceAnalysisStore {
       entry.span_count += 1
       entry.raw_jsonl_bytes += lineLength + 1 // +1 newline byte
       if (span.status === 'ERROR') entry.has_errors = true
-      if (span.start_time < entry.start_time) entry.start_time = span.start_time
-      if (span.end_time > entry.end_time) entry.end_time = span.end_time
+      if (epochMs(span.start_time) < epochMs(entry.start_time)) entry.start_time = span.start_time
+      if (epochMs(span.end_time) > epochMs(entry.end_time)) entry.end_time = span.end_time
       if (span.model_name) entry.models.add(span.model_name)
       if (span.tool_name) entry.tools.add(span.tool_name)
     }
@@ -491,9 +553,13 @@ export class OtlpFileTraceStore implements TraceAnalysisStore {
       totalRawBytes += t.raw_jsonl_bytes
       t.spans.sort(
         (a, b) =>
-          a.start_time.localeCompare(b.start_time) || a.line_byte_offset - b.line_byte_offset,
+          epochMs(a.start_time) - epochMs(b.start_time) || a.line_byte_offset - b.line_byte_offset,
       )
-      t.duration_ms = Math.max(0, new Date(t.end_time).getTime() - new Date(t.start_time).getTime())
+      // Duration is 0 unless BOTH bounds parse — a missing/garbage timestamp
+      // yields 0, never a NaN (→ null in JSON) or a bogus epoch-from-zero span.
+      const startMs = epochOrNull(t.start_time)
+      const endMs = epochOrNull(t.end_time)
+      t.duration_ms = startMs === null || endMs === null ? 0 : Math.max(0, endMs - startMs)
     }
     const sortedTraceIds = [...byTrace.keys()].sort()
 
@@ -569,12 +635,13 @@ export class OtlpFileTraceStore implements TraceAnalysisStore {
 
   // ─── Span projection (lazy attribute reads) ────────────────────────
 
-  private async projectSpan(
+  private projectSpan(
     buf: Buffer,
     trace_id: string,
     s: SpanIndexEntry,
     perAttrCap: number,
-  ): Promise<TraceAnalystSpan> {
+    counter: TruncationCounter,
+  ): TraceAnalystSpan {
     const slice = buf
       .subarray(s.line_byte_offset, s.line_byte_offset + s.line_byte_length)
       .toString('utf8')
@@ -590,13 +657,13 @@ export class OtlpFileTraceStore implements TraceAnalysisStore {
     for (const [k, v] of Object.entries(attrs)) {
       if (typeof v === 'string') {
         const trunc = truncateForBudget(v, perAttrCap)
-        if (trunc !== v) trackTruncation(this)
+        if (trunc !== v) counter.value += 1
         projected[k] = trunc
       } else if (Array.isArray(v) || (v && typeof v === 'object')) {
         const json = JSON.stringify(v)
         const trunc = truncateForBudget(json, perAttrCap)
         if (trunc !== json) {
-          trackTruncation(this)
+          counter.value += 1
           projected[k] = trunc
         } else {
           projected[k] = v
@@ -699,6 +766,20 @@ export class TraceFileMissingError extends NotFoundError {
     super(`trace file not found: ${path}`)
   }
 }
+export class TraceFileTooLargeError extends Error {
+  readonly path: string
+  readonly size_bytes: number
+  readonly max_bytes: number
+  constructor(path: string, size_bytes: number, max_bytes: number) {
+    super(
+      `trace file ${path} is ${size_bytes} bytes, over the ${max_bytes}-byte limit; ` +
+        'raise OtlpFileTraceStoreOptions.maxFileBytes or pre-split the file',
+    )
+    this.path = path
+    this.size_bytes = size_bytes
+    this.max_bytes = max_bytes
+  }
+}
 export class TraceNotFoundError extends NotFoundError {
   readonly trace_id: string
   constructor(trace_id: string) {
@@ -725,55 +806,59 @@ function isPresent<T>(v: T | undefined): v is T {
   return v !== undefined
 }
 
-// Truncation counter — module-private, lets viewSpans report the
-// number of attribute fields it had to truncate without threading
-// a counter through every call.
-const truncationCounters = new WeakMap<OtlpFileTraceStore, { value: number }>()
+// Per-call truncation counter. Each public read that projects spans
+// owns one of these and threads it through projectSpan; a store-keyed
+// counter would let two concurrent reads on the same store report each
+// other's truncation counts.
+interface TruncationCounter {
+  value: number
+}
 
-function trackTruncation(store: OtlpFileTraceStore): void {
-  let c = truncationCounters.get(store)
-  if (!c) {
-    c = { value: 0 }
-    truncationCounters.set(store, c)
+/** A `"` at `idx` is a real JSON delimiter only when the run of `\`
+ *  immediately preceding it is even-length; an odd run means the quote
+ *  is escaped (`\"`) and is part of a string value, not a boundary. */
+function isUnescapedQuote(slice: string, idx: number): boolean {
+  if (slice[idx] !== '"') return false
+  let backslashes = 0
+  let b = idx - 1
+  while (b >= 0 && slice[b] === '\\') {
+    backslashes += 1
+    b -= 1
   }
-  c.value += 1
+  return backslashes % 2 === 0
 }
 
-function truncationCounter(store: OtlpFileTraceStore): { delta(): number } {
-  const before = truncationCounters.get(store)?.value ?? 0
-  return {
-    delta() {
-      const after = truncationCounters.get(store)?.value ?? 0
-      return after - before
-    },
+/** Scan backwards from `from` (inclusive) for the nearest UNescaped `"`.
+ *  Returns its index, or -1 when none is found. */
+function prevUnescapedQuote(slice: string, from: number): number {
+  for (let i = from; i >= 0; i -= 1) {
+    if (slice[i] === '"' && isUnescapedQuote(slice, i)) return i
   }
+  return -1
 }
 
 /**
  * Best-effort: locate the JSON path for the substring at `offset` in
- * a single span's JSONL slice. We walk the parsed JSON structurally
- * and return the dotted path when we find a string field whose
- * serialised form contains `offset`. Returns `null` if the offset
- * doesn't fall inside a recognisable string field.
+ * a single span's JSONL slice. The slice is '...,"key":"value..."' — we
+ * walk back from `offset` to the value-opening quote, past the `:`, to
+ * the key's closing then opening quote, skipping `\"`-escaped quotes that
+ * live inside string values. Returns `null` when the offset doesn't fall
+ * inside a recognisable string field.
  */
 function bestAttributePathForOffset(slice: string, offset: number): string | null {
-  // The slice contains '"key":"value..."' — find the nearest '"'
-  // wrapping `offset` and walk back to a key. This is heuristic but
-  // bounded by the span line length, not the whole file.
-  let i = offset
-  while (i > 0 && slice[i] !== '"') i -= 1
-  if (i <= 0) return null
-  // Scan backwards for the preceding '"': pattern is "key":"value"
-  let j = i - 1
-  while (j > 0 && slice[j] !== ':') j -= 1
-  if (j <= 0) return null
-  // Find the key: walk back from `:` to the matching closing '"' then to opening '"'.
-  let k = j - 1
-  while (k > 0 && slice[k] !== '"') k -= 1
-  let l = k - 1
-  while (l > 0 && slice[l] !== '"') l -= 1
-  if (l <= 0) return null
-  return slice.slice(l + 1, k)
+  // Value-opening quote: nearest unescaped '"' at or before the offset.
+  const valueQuote = prevUnescapedQuote(slice, Math.min(offset, slice.length - 1))
+  if (valueQuote < 1) return null
+  // The ':' separating key and value sits before the value quote.
+  let j = valueQuote - 1
+  while (j >= 0 && slice[j] !== ':') j -= 1
+  if (j < 1) return null
+  // Key closing quote, then key opening quote — both unescaped.
+  const keyClose = prevUnescapedQuote(slice, j - 1)
+  if (keyClose < 1) return null
+  const keyOpen = prevUnescapedQuote(slice, keyClose - 1)
+  if (keyOpen < 0) return null
+  return slice.slice(keyOpen + 1, keyClose)
 }
 
 // ─── Error-cluster extraction ────────────────────────────────────────
diff --git a/src/trace/query.test.ts b/src/trace/query.test.ts
new file mode 100644
index 00000000..d312a138
--- /dev/null
+++ b/src/trace/query.test.ts
@@ -0,0 +1,47 @@
+import { describe, expect, it } from 'vitest'
+import { aggregateLlm, argHash } from './query'
+import type { LlmSpan } from './schema'
+
+function llm(extra: Partial<LlmSpan>): LlmSpan {
+  // `extra` is a Partial spread last, which widens `kind`; cast back to LlmSpan.
+  return { spanId: 's', runId: 'r', kind: 'llm', name: 'llm', startedAt: 0, ...extra } as LlmSpan
+}
+
+describe('aggregateLlm', () => {
+  it('sums reasoningTokens (was silently omitted, hiding reasoning usage)', () => {
+    const agg = aggregateLlm([
+      llm({ inputTokens: 100, outputTokens: 50, reasoningTokens: 1000 }),
+      llm({ inputTokens: 10, outputTokens: 5, reasoningTokens: 200 }),
+    ])
+    expect(agg.inputTokens).toBe(110)
+    expect(agg.outputTokens).toBe(55)
+    expect(agg.reasoningTokens).toBe(1200)
+  })
+})
+
+describe('argHash', () => {
+  it('always returns a string — even for undefined / functions', () => {
+    // Regression: JSON.stringify(undefined) returns the value `undefined`,
+    // which made argHash non-string and broke de-dup keys downstream.
+    expect(typeof argHash(undefined)).toBe('string')
+    expect(typeof argHash(null)).toBe('string')
+    expect(typeof argHash(() => 1)).toBe('string')
+  })
+
+  it('is stable across object key insertion order', () => {
+    expect(argHash({ a: 1, b: 2 })).toBe(argHash({ b: 2, a: 1 }))
+  })
+
+  it('treats an undefined-valued key like an absent key (JSON semantics)', () => {
+    expect(argHash({ a: 1 })).toBe(argHash({ a: 1, b: undefined }))
+  })
+
+  it('distinguishes null, the string "null", and distinct args', () => {
+    expect(argHash(null)).not.toBe(argHash('null'))
+    expect(argHash({ cmd: 'a' })).not.toBe(argHash({ cmd: 'b' }))
+  })
+
+  it('recurses into nested structures stably', () => {
+    expect(argHash({ x: [{ a: 1, b: 2 }] })).toBe(argHash({ x: [{ b: 2, a: 1 }] }))
+  })
+})
diff --git a/src/trace/query.ts b/src/trace/query.ts
index f0c895a0..2f643c55 100644
--- a/src/trace/query.ts
+++ b/src/trace/query.ts
@@ -55,12 +55,20 @@ export function argHash(args: unknown): string {
 }
 
 function stableStringify(value: unknown): string {
-  if (value === null || typeof value !== 'object') return JSON.stringify(value)
+  // Must ALWAYS return a string: JSON.stringify(undefined) — and stringify of
+  // functions/symbols — yields the JS value `undefined`, which would make
+  // argHash return a non-string and silently break the de-dup keys behind
+  // stuck-loop and failure-cluster.
+  if (value === undefined) return 'undefined'
+  if (value === null || typeof value !== 'object') return JSON.stringify(value) ?? 'undefined'
   if (Array.isArray(value)) return `[${value.map(stableStringify).join(',')}]`
-  const keys = Object.keys(value as Record<string, unknown>).sort()
-  const parts = keys.map(
-    (k) => `${JSON.stringify(k)}:${stableStringify((value as Record<string, unknown>)[k])}`,
-  )
+  // Drop undefined-valued keys to match JSON.stringify semantics, so
+  // `{a:1}` and `{a:1,b:undefined}` hash identically.
+  const obj = value as Record<string, unknown>
+  const keys = Object.keys(obj)
+    .filter((k) => obj[k] !== undefined)
+    .sort()
+  const parts = keys.map((k) => `${JSON.stringify(k)}:${stableStringify(obj[k])}`)
   return `{${parts.join(',')}}`
 }
 
@@ -69,6 +77,7 @@ export function aggregateLlm(spans: LlmSpan[]): {
   inputTokens: number
   outputTokens: number
   cachedTokens: number
+  reasoningTokens: number
   costUsd: number
 } {
   return spans.reduce(
@@ -76,9 +85,12 @@ export function aggregateLlm(spans: LlmSpan[]): {
       inputTokens: acc.inputTokens + (s.inputTokens ?? 0),
       outputTokens: acc.outputTokens + (s.outputTokens ?? 0),
       cachedTokens: acc.cachedTokens + (s.cachedTokens ?? 0),
+      // reasoningTokens is on LlmSpan but was omitted here — reasoning usage was
+      // invisible to any cost/perf analysis reading this aggregate.
+      reasoningTokens: acc.reasoningTokens + (s.reasoningTokens ?? 0),
       costUsd: acc.costUsd + (s.costUsd ?? 0),
     }),
-    { inputTokens: 0, outputTokens: 0, cachedTokens: 0, costUsd: 0 },
+    { inputTokens: 0, outputTokens: 0, cachedTokens: 0, reasoningTokens: 0, costUsd: 0 },
   )
 }
 
diff --git a/src/trace/store.test.ts b/src/trace/store.test.ts
new file mode 100644
index 00000000..b6e1204b
--- /dev/null
+++ b/src/trace/store.test.ts
@@ -0,0 +1,224 @@
+import { mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
+import { tmpdir } from 'node:os'
+import { join } from 'node:path'
+
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
+
+import type { GenericSpan, Run } from './schema'
+import { FileSystemTraceStore } from './store'
+
+// Wrap node:fs/promises so the loader's readdir order is controllable per-test:
+// real FS, but readdir entries are reordered to whatever `readdirOrder` sets.
+// This forces the patch-before-base case (otherwise OS-dependent) and lets stat
+// yield a macrotask so concurrent appends interleave deterministically.
+let readdirOrder: ((files: string[]) => string[]) | undefined
+let statDelay = false
+vi.mock('node:fs/promises', async (importActual) => {
+  const actual = await importActual<typeof import('node:fs/promises')>()
+  return {
+    ...actual,
+    default: actual,
+    async readdir(...args: Parameters<typeof actual.readdir>) {
+      const out = (await (actual.readdir as (...a: unknown[]) => Promise<unknown>)(
+        ...(args as unknown[]),
+      )) as string[]
+      return readdirOrder ? readdirOrder(out) : out
+    },
+    stat(...args: Parameters<typeof actual.stat>) {
+      if (!statDelay) return actual.stat(...(args as Parameters<typeof actual.stat>))
+      return new Promise((resolve, reject) => {
+        setTimeout(
+          () => actual.stat(...(args as Parameters<typeof actual.stat>)).then(resolve, reject),
+          0,
+        )
+      })
+    },
+  }
+})
+
+function run(id: string, extra: Partial<Run> = {}): Run {
+  return {
+    runId: id,
+    scenarioId: 'scn',
+    startedAt: 1_700_000_000_000,
+    status: 'running',
+    ...extra,
+  }
+}
+
+function span(id: string, runId: string, extra: Partial<GenericSpan> = {}): GenericSpan {
+  return {
+    spanId: id,
+    runId,
+    kind: 'agent',
+    name: 'work',
+    startedAt: 1_700_000_000_000,
+    ...extra,
+  }
+}
+
+describe('FileSystemTraceStore', () => {
+  let dir: string
+  beforeEach(() => {
+    dir = mkdtempSync(join(tmpdir(), 'fs-trace-store-'))
+  })
+  afterEach(() => {
+    readdirOrder = undefined
+    statDelay = false
+    rmSync(dir, { recursive: true, force: true })
+  })
+
+  function diskLines(name: string): string[] {
+    const out: string[] = []
+    for (const f of readdirSync(dir)) {
+      if (!f.endsWith('.ndjson')) continue
+      if (f === `${name}.ndjson` || f.startsWith(`${name}.`)) {
+        out.push(
+          ...readFileSync(join(dir, f), 'utf8')
+            .split('\n')
+            .filter((l) => l.trim()),
+        )
+      }
+    }
+    return out
+  }
+
+  it('serializes concurrent appends so rollover holds the size bound (mutex)', async () => {
+    // Seed the active file already at the bound, then fire a concurrent burst
+    // with stat delayed a macrotask so every append observes the same
+    // over-size state. Without per-name serialization the stat→rename→append
+    // windows interleave: each append renames once then writes to the same
+    // fresh active without re-checking, so the active file balloons far past
+    // maxBytes — the rollover bound the option exists to enforce is silently
+    // violated. Serialized, each append re-stats after the prior one's write,
+    // so the active file never holds more than one record over the bound.
+    const maxBytes = 200
+    const fs = await import('node:fs/promises')
+    await fs.mkdir(dir, { recursive: true })
+    await fs.writeFile(join(dir, 'spans.ndjson'), `${'x'.repeat(maxBytes)}\n`, 'utf8')
+    statDelay = true
+
+    const store = new FileSystemTraceStore({ dir, maxBytes })
+    const n = 20
+    const pad = 'y'.repeat(80)
+    await Promise.all(
+      Array.from({ length: n }, (_, i) =>
+        store.appendSpan(span(`s${i}`, 'r1', { attributes: { pad } })),
+      ),
+    )
+
+    // Active file must not exceed the bound by more than the single record that
+    // tipped it over; a no-lock race lets every record pile into one file.
+    const recordBytes = Buffer.byteLength(
+      `${JSON.stringify(span('s0', 'r1', { attributes: { pad } }))}\n`,
+    )
+    const activeSize = (await fs.stat(join(dir, 'spans.ndjson'))).size
+    expect(activeSize).toBeLessThanOrEqual(maxBytes + recordBytes)
+
+    // And no row is lost across the rolled corpus.
+    const ids = new Set(
+      diskLines('spans')
+        .map((l) => {
+          try {
+            return (JSON.parse(l) as { spanId?: string }).spanId
+          } catch {
+            return undefined
+          }
+        })
+        .filter((id): id is string => Boolean(id)),
+    )
+    expect(ids.size).toBe(n)
+  })
+
+  it('surfaces an index-mirror failure to the caller (awaited, not fire-and-forget)', async () => {
+    // Populate the lazy index so the next append mirrors into it. A duplicate
+    // runId makes the index reject. A floating `void this.insertInto(...)`
+    // swallows that as an unhandled rejection and reports success while disk +
+    // index diverge; awaiting it surfaces the error to the caller.
+    const store = new FileSystemTraceStore({ dir })
+    await store.appendRun(run('r1'))
+    expect(await store.getRun('r1')).toBeDefined()
+
+    await expect(store.appendRun(run('r1'))).rejects.toThrow(/already exists/)
+  })
+
+  it('collapses an _update patch onto its base when reloaded cross-instance', async () => {
+    const store = new FileSystemTraceStore({ dir })
+    await store.appendSpan(span('span-1', 'r1', { status: 'ok' }))
+    await store.updateSpan('span-1', { endedAt: 1_700_000_002_000, status: 'error' })
+
+    const reopened = new FileSystemTraceStore({ dir })
+    const spans = await reopened.spans({ runId: 'r1' })
+    expect(spans).toHaveLength(1)
+    expect(spans[0]!.spanId).toBe('span-1')
+    expect(spans[0]!.runId).toBe('r1')
+    expect(spans[0]!.status).toBe('error')
+    expect(spans[0]!.endedAt).toBe(1_700_000_002_000)
+  })
+
+  it('collapses a patch read before its base — no runId-less fragment (two-pass)', async () => {
+    // Rollover splits a span's stream across files: the base full span in one
+    // file, its `_update` patch in another. Force readdir to return the patch
+    // file first. A one-pass loader applies the patch before the base exists,
+    // hits the catch, and appends a runId-less fragment → two spans, corrupt
+    // counts. Two-pass load defers patches until every base is indexed.
+    writeFileSync(
+      join(dir, 'spans.base.ndjson'),
+      `${JSON.stringify(span('span-1', 'r1', { status: 'ok' }))}\n`,
+    )
+    writeFileSync(
+      join(dir, 'spans.patch.ndjson'),
+      `${JSON.stringify({ spanId: 'span-1', status: 'error', endedAt: 1_700_000_002_000, _update: true })}\n`,
+    )
+    readdirOrder = () => ['spans.patch.ndjson', 'spans.base.ndjson']
+
+    const store = new FileSystemTraceStore({ dir })
+    const spans = await store.spans()
+    expect(spans).toHaveLength(1)
+    expect(spans[0]!.spanId).toBe('span-1')
+    expect(spans[0]!.runId).toBe('r1')
+    expect(spans[0]!.status).toBe('error')
+    expect(spans.every((s) => typeof s.runId === 'string' && s.runId.length > 0)).toBe(true)
+  })
+
+  it('collapses a run patch read before its base — no scenarioId-less fragment (two-pass)', async () => {
+    writeFileSync(
+      join(dir, 'runs.base.ndjson'),
+      `${JSON.stringify(run('r1', { status: 'running' }))}\n`,
+    )
+    writeFileSync(
+      join(dir, 'runs.patch.ndjson'),
+      `${JSON.stringify({ runId: 'r1', status: 'completed', _update: true })}\n`,
+    )
+    readdirOrder = () => ['runs.patch.ndjson', 'runs.base.ndjson']
+
+    const store = new FileSystemTraceStore({ dir })
+    const runs = await store.listRuns()
+    expect(runs).toHaveLength(1)
+    expect(runs[0]!.runId).toBe('r1')
+    expect(runs[0]!.status).toBe('completed')
+    expect(runs[0]!.scenarioId).toBe('scn')
+  })
+
+  it('applies patches in write order across rolled files (newest wins), not readdir order', async () => {
+    // r1 was running, completed (both in the older rolled file), then failed
+    // (in the newer file). Replaying in readdir order with the newer file first
+    // lets the OLDER 'completed' patch win and silently restore a stale status;
+    // the chronological file sort makes 'failed' (the newest write) win.
+    writeFileSync(
+      join(dir, 'runs.900.ndjson'),
+      `${JSON.stringify(run('r1', { status: 'running' }))}\n${JSON.stringify({ runId: 'r1', status: 'completed', _update: true })}\n`,
+    )
+    writeFileSync(
+      join(dir, 'runs.1000.ndjson'),
+      `${JSON.stringify({ runId: 'r1', status: 'failed', _update: true })}\n`,
+    )
+    // Adversarial: newer rolled file returned first by readdir.
+    readdirOrder = () => ['runs.1000.ndjson', 'runs.900.ndjson']
+
+    const store = new FileSystemTraceStore({ dir })
+    const runs = await store.listRuns()
+    expect(runs).toHaveLength(1)
+    expect(runs[0]!.status).toBe('failed')
+  })
+})
diff --git a/src/trace/store.ts b/src/trace/store.ts
index 3c3888b3..b48ac50f 100644
--- a/src/trace/store.ts
+++ b/src/trace/store.ts
@@ -187,6 +187,14 @@ export class FileSystemTraceStore implements TraceStore {
   /** Lazy in-memory index for queries — populated on first read. */
   private index?: InMemoryTraceStore
   private loaded = false
+  /**
+   * Per-file append serialization. stat → conditional-rename → appendFile is a
+   * read-modify-write on the active file; without a lock two concurrent appends
+   * to the same `name` can both pass the size check (exceeding maxBytes) or one
+   * can append to a file the other just renamed away. Each entry chains the
+   * next append behind the prior one for that file.
+   */
+  private appendLocks = new Map<string, Promise<void>>()
 
   constructor(options: FileSystemTraceStoreOptions) {
     this.dir = options.dir
@@ -199,6 +207,26 @@ export class FileSystemTraceStore implements TraceStore {
   }
 
   private async append(name: string, record: unknown): Promise<void> {
+    // Chain this append behind any in-flight append for the same file. The tail
+    // promise never rejects (errors are isolated to each caller's `result`), so
+    // one failed append can't break the lock chain for the next one.
+    const prior = this.appendLocks.get(name) ?? Promise.resolve()
+    const result = prior.then(() => this.appendLocked(name, record))
+    const tail = result.then(
+      () => {},
+      () => {},
+    )
+    this.appendLocks.set(name, tail)
+    try {
+      await result
+    } finally {
+      // Drop the slot once this append is the current tail, so the map doesn't
+      // grow without bound across many appends to the same file.
+      if (this.appendLocks.get(name) === tail) this.appendLocks.delete(name)
+    }
+  }
+
+  private async appendLocked(name: string, record: unknown): Promise<void> {
     await this.ensureDir()
     const fs = await import('node:fs/promises')
     const path = await import('node:path')
@@ -217,9 +245,10 @@ export class FileSystemTraceStore implements TraceStore {
     // with `_update: true` by updateRun/updateSpan) are applied by those
     // methods directly via the index's update* APIs — re-inserting them
     // here triggers a duplicate-id error once the first read populates
-    // the index.
+    // the index. Await the mirror: a fire-and-forget would let an index
+    // rejection go unhandled and let disk + index diverge silently.
     if (this.index && !(record as { _update?: boolean })?._update) {
-      void this.insertInto(name, record)
+      await this.insertInto(name, record)
     }
   }
 
@@ -249,8 +278,47 @@ export class FileSystemTraceStore implements TraceStore {
     const fs = await import('node:fs/promises')
     const path = await import('node:path')
     const store = new InMemoryTraceStore()
+    let entries: string[]
     try {
-      const entries = await fs.readdir(this.dir)
+      entries = await fs.readdir(this.dir)
+    } catch {
+      // No dir yet (first run / empty corpus). Anything past this point — parse
+      // errors, missing patch bases — is real corruption and must fail loud.
+      this.index = store
+      this.loaded = true
+      return store
+    }
+    // Replay files in write order. For one entity, rolled files
+    // (`name.<ts>.ndjson`) are older than the active bare file (`name.ndjson`),
+    // and readdir order is not chronological — so without this sort a patch in
+    // an older rolled file could be applied AFTER (overwrite) a newer patch in
+    // the active file, silently restoring a stale field. Sort: per entity,
+    // rolled ascending by timestamp, then the bare active file last.
+    entries.sort((a, b) => {
+      // Group by the same entity base the loop uses (`split('.')[0]`); within an
+      // entity, real rollover files (`<base>.<ms>.ndjson`) sort by timestamp
+      // ascending and the bare active file (no numeric segment) sorts last.
+      // Non-rollover names tie (Infinity) and a stable sort preserves their
+      // readdir order.
+      const baseA = a.split('.')[0]!
+      const baseB = b.split('.')[0]!
+      if (baseA !== baseB) return baseA < baseB ? -1 : 1
+      const ta = a.match(/^[^.]+\.(\d+)\.ndjson$/)
+      const tb = b.match(/^[^.]+\.(\d+)\.ndjson$/)
+      const tsA = ta ? Number(ta[1]) : Number.POSITIVE_INFINITY
+      const tsB = tb ? Number(tb[1]) : Number.POSITIVE_INFINITY
+      return tsA - tsB
+    })
+    {
+      // Two-pass load. Pass 1 indexes all full (base) rows; pass 2 replays the
+      // `_update` patch rows that updateRun/updateSpan append. readdir order is
+      // not deterministic and rollover splits a stream across files, so a patch
+      // can be read before its base. Applying patches in a separate pass — after
+      // every base row is indexed — means a patch always finds its target.
+      // Doing it in one pass let a patch hit the catch and append a runId-less
+      // fragment, corrupting span/run counts cross-instance.
+      const runUpdates: Array<Record<string, unknown>> = []
+      const spanUpdates: Array<Record<string, unknown>> = []
       for (const file of entries) {
         if (!file.endsWith('.ndjson')) continue
         const full = path.join(this.dir, file)
@@ -260,7 +328,12 @@ export class FileSystemTraceStore implements TraceStore {
           if (!line.trim()) continue
           const record = JSON.parse(line)
           if (base === 'runs') {
-            // Allow re-loading without duplicate error
+            if (record?._update) {
+              runUpdates.push(record)
+              continue
+            }
+            // A duplicate full row (same dir loaded twice into one stream) is
+            // collapsed last-write-wins rather than thrown.
             try {
               await store.appendRun(record)
             } catch {
@@ -268,24 +341,17 @@ export class FileSystemTraceStore implements TraceStore {
             }
           } else if (base === 'spans') {
             // `updateSpan` appends an `_update: true` patch row instead of
-            // rewriting the original span. On reload we must collapse those
-            // patches onto the original span — otherwise a fresh
-            // FileSystemTraceStore reading the same dir reports duplicate
-            // spans (one full, one fragment with no runId/kind/name), which
-            // breaks any downstream consumer that re-opens the store
-            // cross-process (e.g. the OTLP converter).
+            // rewriting the original span. Deferred to pass 2 so it collapses
+            // onto the original span — otherwise a fresh FileSystemTraceStore
+            // reading the same dir reports duplicate spans (one full, one
+            // fragment with no runId/kind/name), which breaks any downstream
+            // consumer that re-opens the store cross-process (e.g. the OTLP
+            // converter).
             if (record?._update) {
-              try {
-                await store.updateSpan(record.spanId, record)
-              } catch {
-                // Patch row arrived before the original — should not happen
-                // with locked append order, but fall through to append so we
-                // don't lose data.
-                await store.appendSpan(record)
-              }
-            } else {
-              await store.appendSpan(record)
+              spanUpdates.push(record)
+              continue
             }
+            await store.appendSpan(record)
           } else if (base === 'events') {
             await store.appendEvent(record)
           } else if (base === 'artifacts') {
@@ -295,8 +361,16 @@ export class FileSystemTraceStore implements TraceStore {
           }
         }
       }
-    } catch {
-      /* empty dir, first run */
+      // Pass 2: every base row is now indexed. A patch whose base is genuinely
+      // absent (truncated/partial corpus) throws inside update* — we let it
+      // propagate rather than silently appending a fragment, so a corrupt
+      // corpus fails loud instead of polluting counts.
+      for (const record of runUpdates) {
+        await store.updateRun(record.runId as string, record as Partial<Run>)
+      }
+      for (const record of spanUpdates) {
+        await store.updateSpan(record.spanId as string, record as Partial<Span>)
+      }
     }
     this.index = store
     this.loaded = true
diff --git a/src/types.ts b/src/types.ts
index eedccd68..2f3b4256 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -138,6 +138,8 @@ export interface RouteMap {
 export interface ProductClientConfig {
   baseUrl: string
   routes: RouteMap
+  /** Per-request timeout in ms before the request is aborted. Default 30s. */
+  timeoutMs?: number
 }
 
 // ── Scenario Registry ──
diff --git a/tests/eval-campaign.test.ts b/tests/eval-campaign.test.ts
index fa0b6d01..69eaaadc 100644
--- a/tests/eval-campaign.test.ts
+++ b/tests/eval-campaign.test.ts
@@ -547,3 +547,193 @@ describe('runEvalCampaign — concurrency', () => {
     expect(inFlight.max).toBeLessThanOrEqual(4)
   })
 })
+
+describe('runEvalCampaign — genuine-error containment (no orphaned workers)', () => {
+  // A genuine (non-CellExecutionError) bug in one cell must not reject the pool
+  // mid-flight and orphan the other in-flight workers. The campaign must wait
+  // for every sibling worker to finish (its run finalized) before re-throwing.
+  it('does not throw until every in-flight sibling run has finalized', async () => {
+    // Two cells, concurrency 2. Cell A is the genuine-error cell. Cell B parks
+    // mid-run on a barrier we control. With Promise.all (old behaviour) the
+    // campaign rejects the instant A throws, while B is still parked — orphaned.
+    // With Promise.allSettled the campaign cannot settle until B finalizes.
+    let releaseB: () => void = () => {}
+    const bParked = new Promise<void>((resolve) => {
+      releaseB = resolve
+    })
+    let bFinalized = false
+
+    const runner: CampaignRunner<VariantPayload> = async (ctx) => {
+      if (ctx.variantId === 'fail') {
+        // Genuine error in post-runner assembly: the runner finalizes its OWN
+        // run, then hands back an agent profile that contradicts the model.
+        // assertAgentProfileMatchesRun throws a plain Error (not a Cell error).
+        const base = await defaultRunner(ctx)
+        return {
+          ...base,
+          agentProfile: {
+            profileId: 'mismatch',
+            sourceProfile: { kind: 'sandbox-agent-profile', profile: { name: 'a' } },
+            harness: { id: 'h', version: '1' },
+            model: 'WRONG-MODEL@2026-05-08',
+            promptHash: 'p'.repeat(64),
+          },
+        }
+      }
+      // Cell B: start the run, park until released, then finalize itself.
+      await ctx.emitter.startRun({ scenarioId: ctx.scenarioId, layer: 'app-runtime' })
+      await bParked
+      const handle = await ctx.emitter.llm({
+        name: 'judge',
+        model: 'test-model@2026-05-08',
+        messages: [{ role: 'user', content: ctx.variant.prompt }],
+        output: 'ok',
+      })
+      await ctx.rawSink.record({
+        eventId: `evt-${ctx.runId}-0`,
+        runId: ctx.runId,
+        spanId: handle.span.spanId,
+        provider: 'test',
+        model: 'test-model@2026-05-08',
+        endpoint: '/chat/completions',
+        baseUrl: ctx.llmOpts.baseUrl ?? '',
+        attemptIndex: 0,
+        direction: 'request',
+        timestamp: 1_000,
+        redactedFields: [],
+      })
+      await handle.end()
+      await ctx.emitter.endRun({ pass: true, score: 0.6 })
+      bFinalized = true
+      return {
+        pass: true,
+        score: 0.6,
+        costUsd: 0.001,
+        tokenUsage: { input: 10, output: 5 },
+        model: 'test-model@2026-05-08',
+        promptHash: 'p'.repeat(64),
+        configHash: 'c'.repeat(64),
+      }
+    }
+
+    const stores = new Map<string, InMemoryTraceStore>()
+    const opts: EvalCampaignOptions<VariantPayload> = {
+      campaignId: 'orphan-campaign',
+      variants: [
+        { id: 'fail', payload: { prompt: 'x' } },
+        { id: 'park', payload: { prompt: 'y' } },
+      ],
+      scenarios: [{ scenarioId: 's1' }],
+      seeds: [0],
+      commitSha: 'cafebabe',
+      llmOpts: { baseUrl: 'https://api.test.local/v1', apiKey: 'sk-test' },
+      storeFactory: ({ runId }) => {
+        const s = new InMemoryTraceStore()
+        stores.set(runId, s)
+        return s
+      },
+      rawSinkFactory: () => new InMemoryRawProviderSink(),
+      concurrency: 2,
+      runner,
+    }
+
+    const campaign = runEvalCampaign(opts)
+    let settled = false
+    void campaign.then(
+      () => {
+        settled = true
+      },
+      () => {
+        settled = true
+      },
+    )
+
+    // Flush microtasks + a macrotask. Cell A has thrown its genuine error by
+    // now, but cell B is still parked. Old Promise.all would have rejected the
+    // campaign already; allSettled must keep it pending.
+    await new Promise((r) => setTimeout(r, 20))
+    expect(settled).toBe(false)
+    expect(bFinalized).toBe(false)
+
+    // Release B; only now may the campaign settle (re-throwing the genuine error).
+    releaseB()
+    await expect(campaign).rejects.toThrow(/does not match outcome.model/)
+    expect(bFinalized).toBe(true)
+
+    // B's run was finalized as a real completed run — not left dangling.
+    const bRunId = [...stores.keys()].find((id) => id.length > 0)
+    expect(bRunId).toBeDefined()
+    // Every store that recorded a run must have a terminal status (no 'running').
+    for (const store of stores.values()) {
+      const runs = await store.listRuns()
+      for (const r of runs) {
+        expect(r.status).not.toBe('running')
+      }
+    }
+  })
+})
+
+describe('runEvalCampaign — abort failure is surfaced, not swallowed', () => {
+  // When the runner throws, the campaign aborts the run to finalize the
+  // emitter. A store-write failure during that abort is a genuine diagnostic
+  // (disk full, FS fault, backend down) and MUST surface — the old
+  // `try { abortRun } catch {}` swallowed it, hiding real corruption.
+  it('re-throws a store-write failure that occurs while aborting a thrown run', async () => {
+    class AbortFailingStore extends InMemoryTraceStore {
+      override async updateRun(runId: string, patch: Partial<import('../src/trace/schema').Run>) {
+        if (patch.status === 'aborted') {
+          throw new Error('disk full: cannot persist aborted run')
+        }
+        return super.updateRun(runId, patch)
+      }
+    }
+
+    const runner: CampaignRunner<VariantPayload> = async (ctx) => {
+      await ctx.emitter.startRun({ scenarioId: ctx.scenarioId, layer: 'app-runtime' })
+      throw new Error('runner boom')
+    }
+
+    const opts: EvalCampaignOptions<VariantPayload> = {
+      campaignId: 'abort-fail-campaign',
+      variants: [{ id: 'v1', payload: { prompt: 'x' } }],
+      scenarios: [{ scenarioId: 's1' }],
+      seeds: [0],
+      commitSha: 'cafebabe',
+      llmOpts: { baseUrl: 'https://api.test.local/v1', apiKey: 'sk-test' },
+      storeFactory: () => new AbortFailingStore(),
+      rawSinkFactory: () => new InMemoryRawProviderSink(),
+      runner,
+    }
+
+    // Old behaviour: the abort error is swallowed; the cell is recorded as a
+    // plain runner_threw failure and the campaign resolves. New behaviour: the
+    // store-write failure surfaces as a genuine error.
+    await expect(runEvalCampaign(opts)).rejects.toThrow(/disk full: cannot persist aborted run/)
+  })
+
+  it('still treats a never-started run as a benign abort (nothing to finalize)', async () => {
+    // If the runner throws BEFORE startRun, there is no run to abort — that is
+    // the only benign case the narrowed catch may pass over. It must NOT throw
+    // a store error; the cell is a normal runner_threw failure.
+    const runner: CampaignRunner<VariantPayload> = async () => {
+      throw new Error('threw before startRun')
+    }
+
+    const opts: EvalCampaignOptions<VariantPayload> = {
+      campaignId: 'never-started-campaign',
+      variants: [{ id: 'v1', payload: { prompt: 'x' } }],
+      scenarios: [{ scenarioId: 's1' }],
+      seeds: [0],
+      commitSha: 'cafebabe',
+      llmOpts: { baseUrl: 'https://api.test.local/v1', apiKey: 'sk-test' },
+      storeFactory: () => new InMemoryTraceStore(),
+      rawSinkFactory: () => new InMemoryRawProviderSink(),
+      runner,
+    }
+
+    const result = await runEvalCampaign(opts)
+    expect(result.failedRuns).toHaveLength(1)
+    expect(result.failedRuns[0]?.reason).toBe('runner_threw')
+    expect(result.failedRuns[0]?.error).toBe('threw before startRun')
+  })
+})