From 666c9a6f5571be21de4b14b1760dbf11d2f17222 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Wed, 24 Jun 2026 23:36:25 +0200 Subject: [PATCH 1/4] feat(agent): fail loud on missing harness capabilities + debug assertions The runner's capability handling degraded silently: a non-Pi run carrying tools whose harness probe reports mcpTools:false / toolCalls:false had its tools dropped without a trace, and a failed probe fell back to a static guess with no signal that it had. A wrong or missing flag changed behavior quietly instead of erroring. Make capability handling fail loud, mirroring the *_UNSUPPORTED_MESSAGE gates from #4831 and tools/code.ts CODE_TOOL_UNSUPPORTED_MESSAGE: - assertRequiredCapabilities (capabilities.ts) refuses a non-Pi run that carries tool specs when the probe reports it cannot receive tools, with a specific error naming the missing flags and the dropped tool count. Pi is exempt (its tools ride the native extension, not MCP); a no-tools run is unaffected. - probeCapabilities now returns ProbedCapabilities { capabilities, source } so callers know whether the flags were probed or a static guess; the gate logs the source so a failed probe is debuggable. - Debug assertions across the runner hot path (run-plan build, capability probe shape, capability gate, sandbox start) catch impossible states early with actionable [sandbox-agent invariant] messages. Docs: runner-to-harness inventory page updated to record the fail-loud gate and drop the stale 'a wrong flag silently changes behavior' claim. Tests: 182 vitest (was 168) + typecheck green in services/agent. Claude-Session: https://claude.ai/code/session_01GYo3UEfvsZpncagqb28Mbc --- .../cross-service/runner-to-harness.md | 18 +- services/agent/src/engines/sandbox_agent.ts | 49 +++- .../src/engines/sandbox_agent/capabilities.ts | 209 +++++++++++++++--- .../src/engines/sandbox_agent/run-plan.ts | 22 ++ .../unit/sandbox-agent-capabilities.test.ts | 196 +++++++++++++--- .../unit/sandbox-agent-orchestration.test.ts | 60 ++++- 6 files changed, 485 insertions(+), 69 deletions(-) diff --git a/docs/design/agent-workflows/interfaces/cross-service/runner-to-harness.md b/docs/design/agent-workflows/interfaces/cross-service/runner-to-harness.md index 81766f1dfe..d2d9be835c 100644 --- a/docs/design/agent-workflows/interfaces/cross-service/runner-to-harness.md +++ b/docs/design/agent-workflows/interfaces/cross-service/runner-to-harness.md @@ -29,7 +29,16 @@ returns `HarnessCapabilities`: ``` The probe gates real behavior. Tools only go over MCP when `mcpTools` is true, for example. -If the probe fails, the runner falls back to a static capability policy. +If the probe fails, the runner falls back to a static capability policy, and the result records +whether the flags were `probed` or a `static` guess. + +The runner no longer silently degrades when a run requires a capability the harness lacks. +`assertRequiredCapabilities` (in `capabilities.ts`) fails the run with a specific error — the +same fail-loud pattern as the `*_UNSUPPORTED_MESSAGE` gates in `run-plan.ts` and the +`CODE_TOOL_UNSUPPORTED_MESSAGE` gate in `tools/code.ts`. Today the asserted requirement is tool +delivery: a non-Pi run carrying tool specs whose probe reports `mcpTools:false` or +`toolCalls:false` errors instead of dropping the tools without a trace. Pi is exempt (its tools +ride the native extension, not MCP), and a run with no tools is unaffected. **The ACP event stream.** The harness emits ACP updates that the runner maps to neutral events: @@ -58,8 +67,11 @@ tears the sandbox down in the `finally` path. - **Harness selection and the `pi_core`/`pi_agenta` to `pi` remap.** New harnesses thread through here. -- **The capability probe.** It gates tool delivery, permissions, and streaming. A wrong flag - silently changes behavior rather than erroring. +- **The capability probe.** It gates tool delivery, permissions, and streaming. A run that + REQUIRES a capability the harness lacks now fails loud (`assertRequiredCapabilities`) rather + than silently dropping the behavior — today that covers tool delivery to a non-Pi harness. + Other flags (permissions, streaming) still degrade silently; assert the next requirement here + if a flag must be a hard gate. - **ACP event mapping.** A missed or mis-mapped update drops content from the stream. - **Pi versus Claude divergence.** Both run over ACP, but Pi takes tools natively and self-instruments traces, while Claude takes tools over MCP and the runner builds the spans. diff --git a/services/agent/src/engines/sandbox_agent.ts b/services/agent/src/engines/sandbox_agent.ts index fafc4cec19..1905f69c51 100644 --- a/services/agent/src/engines/sandbox_agent.ts +++ b/services/agent/src/engines/sandbox_agent.ts @@ -46,7 +46,11 @@ import { type ToolCallbackContext, resolveRunSessionId, } from "../protocol.ts"; -import { probeCapabilities } from "./sandbox_agent/capabilities.ts"; +import { + assert, + assertRequiredCapabilities, + probeCapabilities, +} from "./sandbox_agent/capabilities.ts"; import { buildDaemonEnv, resolveDaemonBinary } from "./sandbox_agent/daemon.ts"; import { createCookieFetch, @@ -81,7 +85,12 @@ function log(message: string): void { type Log = (message: string) => void; -const CLAUDE_STRICT_DEPLOYMENTS = new Set(["custom", "bedrock", "vertex", "vertex_ai"]); +const CLAUDE_STRICT_DEPLOYMENTS = new Set([ + "custom", + "bedrock", + "vertex", + "vertex_ai", +]); function applyClaudeConnectionEnv( env: Record, @@ -110,7 +119,10 @@ function applyClaudeConnectionEnv( env.CLAUDE_CODE_USE_VERTEX = "1"; } - if (selectedModel && (baseUrl || (deployment && CLAUDE_STRICT_DEPLOYMENTS.has(deployment)))) { + if ( + selectedModel && + (baseUrl || (deployment && CLAUDE_STRICT_DEPLOYMENTS.has(deployment))) + ) { env.ANTHROPIC_MODEL = selectedModel; env.ANTHROPIC_CUSTOM_MODEL_OPTION = selectedModel; return true; @@ -162,7 +174,12 @@ export async function runSandboxAgent( clearProviderEnv, }); Object.assign(env, plan.secrets); // apply only the resolved provider keys - const strictModel = applyClaudeConnectionEnv(env, request, plan.acpAgent, logger); + const strictModel = applyClaudeConnectionEnv( + env, + request, + plan.acpAgent, + logger, + ); // Pi self-instruments locally: propagate the trace context + public tool metadata into Pi // via the Agenta extension. Tool execution always relays back to this runner, which keeps // private specs, scoped env, callback endpoints, and callback auth in memory. @@ -233,14 +250,36 @@ export async function runSandboxAgent( log: logger, }); + // Sandbox-start invariant: `startSandboxAgent` must hand back a usable handle, or the + // probe/createSession below fail with an opaque "cannot read property of undefined". + assert( + sandbox && typeof sandbox.createSession === "function", + `sandbox provider '${plan.sandboxId}' returned no usable sandbox handle`, + ); + // Probe what this harness supports and branch on capabilities, not on the harness // name. Tool delivery: Pi loads our extension (native tools, set up above); any other // harness takes tools over MCP only when it advertises `mcpTools` (pi-acp does not // forward MCP, Claude/Codex do). - const capabilities = await (deps.probeCapabilities ?? probeCapabilities)( + const probed = await (deps.probeCapabilities ?? probeCapabilities)( sandbox, plan.acpAgent, ); + const capabilities = probed.capabilities; + + // Fail loud (A7): a run that REQUIRES a capability the harness lacks errors with a + // specific message instead of silently dropping the behavior, the way the + // `*_UNSUPPORTED_MESSAGE` gates in `run-plan.ts` do. Today: tool delivery to a non-Pi + // harness whose probe reports `mcpTools:false` / `toolCalls:false`. The throw is caught + // below and returned as `{ ok: false, error }`. + assertRequiredCapabilities({ + harness: plan.harness, + isPi: plan.isPi, + probed, + toolSpecs: plan.toolSpecs, + log: logger, + }); + const mcpServers = buildSessionMcpServers({ isPi: plan.isPi, capabilities, diff --git a/services/agent/src/engines/sandbox_agent/capabilities.ts b/services/agent/src/engines/sandbox_agent/capabilities.ts index 654fdb21bc..70cc446088 100644 --- a/services/agent/src/engines/sandbox_agent/capabilities.ts +++ b/services/agent/src/engines/sandbox_agent/capabilities.ts @@ -1,40 +1,118 @@ -import type { HarnessCapabilities } from "../../protocol.ts"; +import type { HarnessCapabilities, ResolvedToolSpec } from "../../protocol.ts"; /** - * Map a sandbox-agent `AgentInfo` to our capability flags. Falls back to a per-harness - * static guess when the probe is unavailable. + * Where a run's `HarnessCapabilities` came from. + * + * - `probed`: read from the daemon's `AgentInfo.capabilities` for this exact harness. + * - `static`: the daemon probe was unavailable (or returned no capabilities), so we fell + * back to a per-harness static guess. A guess is a weaker contract — it can disagree with + * what the harness actually does — so callers that assert a hard capability requirement use + * the source to phrase a clear error instead of trusting the guess. */ -export function mapCapabilities(harness: string, info: any): HarnessCapabilities { +export type CapabilitySource = "probed" | "static"; + +export interface ProbedCapabilities { + capabilities: HarnessCapabilities; + source: CapabilitySource; +} + +/** Every capability flag the runner branches on, for the debug-assertion shape check. */ +const CAPABILITY_FLAGS = [ + "textMessages", + "images", + "fileAttachments", + "mcpTools", + "toolCalls", + "reasoning", + "planMode", + "permissions", + "streamingDeltas", + "sessionLifecycle", + "usage", +] as const; + +/** + * Debug assertion: throw on an impossible runner state with an actionable message. These guard + * the runner hot path (run-plan build, capability probe, capability gate, sandbox start) so an + * impossible state surfaces at its origin with context instead of as a confusing downstream + * failure. Cheap (a boolean test) and always on — a tripped invariant is a runner bug, not a + * user error. + */ +export function assert(condition: unknown, message: string): asserts condition { + if (!condition) throw new Error(`[sandbox-agent invariant] ${message}`); +} + +/** + * A run asked for a capability the harness does not have. Mirrors the not-implemented gates + * (`tools/code.ts` `CODE_TOOL_UNSUPPORTED_MESSAGE`, #4831's `*_UNSUPPORTED_MESSAGE`): the run + * fails with one clear, specific line rather than silently dropping the behavior. Built by + * `assertRequiredCapabilities`. + */ +export function toolDeliveryUnsupportedMessage( + harness: string, + missing: string, + toolCount: number, +): string { + return ( + `harness '${harness}' cannot receive tools (probe reports ${missing}); ` + + `the run carries ${toolCount} tool(s) that would be silently dropped. Run on a ` + + `tool-capable harness, or remove the tools.` + ); +} + +/** + * Map a sandbox-agent `AgentInfo` to our capability flags. Falls back to a per-harness static + * guess when the probe is unavailable. Returns the flags AND where they came from, so a caller + * can refuse to silently degrade on a guess (see `assertRequiredCapabilities`). + */ +export function mapCapabilities( + harness: string, + info: any, +): ProbedCapabilities { + assert( + typeof harness === "string" && harness.length > 0, + "mapCapabilities requires a non-empty harness id", + ); const c = info?.capabilities; if (c) { + assert( + typeof c === "object", + `probed capabilities for '${harness}' is not an object (got ${typeof c})`, + ); return { - textMessages: c.textMessages ?? true, - images: !!c.images, - fileAttachments: !!c.fileAttachments, - mcpTools: !!c.mcpTools, - toolCalls: !!c.toolCalls, - reasoning: !!c.reasoning, - planMode: !!c.planMode, - permissions: !!c.permissions, - streamingDeltas: !!c.streamingDeltas, - sessionLifecycle: !!c.sessionLifecycle, - usage: true, + source: "probed", + capabilities: { + textMessages: c.textMessages ?? true, + images: !!c.images, + fileAttachments: !!c.fileAttachments, + mcpTools: !!c.mcpTools, + toolCalls: !!c.toolCalls, + reasoning: !!c.reasoning, + planMode: !!c.planMode, + permissions: !!c.permissions, + streamingDeltas: !!c.streamingDeltas, + sessionLifecycle: !!c.sessionLifecycle, + usage: true, + }, }; } // Static fallback by harness id: pi-acp does not forward MCP, Claude/Codex do. const isPiHarness = harness === "pi"; return { - textMessages: true, - images: false, - fileAttachments: false, - mcpTools: !isPiHarness, - toolCalls: true, - reasoning: true, - planMode: !isPiHarness, - permissions: !isPiHarness, - streamingDeltas: true, - sessionLifecycle: true, - usage: true, + source: "static", + capabilities: { + textMessages: true, + images: false, + fileAttachments: false, + mcpTools: !isPiHarness, + toolCalls: true, + reasoning: true, + planMode: !isPiHarness, + permissions: !isPiHarness, + streamingDeltas: true, + sessionLifecycle: true, + usage: true, + }, }; } @@ -42,11 +120,84 @@ export function mapCapabilities(harness: string, info: any): HarnessCapabilities export async function probeCapabilities( sandbox: any, harness: string, -): Promise { +): Promise { + assert( + sandbox && typeof sandbox.getAgent === "function", + "probeCapabilities requires a sandbox with getAgent()", + ); + let probed: ProbedCapabilities; try { const info = await sandbox.getAgent(harness, { config: true }); - return mapCapabilities(harness, info); + probed = mapCapabilities(harness, info); } catch { - return mapCapabilities(harness, undefined); + probed = mapCapabilities(harness, undefined); + } + // Debug assertion: the mapper must always return a complete, boolean-valued flag set so a + // downstream branch never reads an undefined flag (which would coerce to a silent `false`). + for (const flag of CAPABILITY_FLAGS) { + assert( + typeof probed.capabilities[flag] === "boolean", + `capability '${flag}' for '${harness}' is ${typeof probed.capabilities[flag]}, expected boolean`, + ); } + return probed; +} + +export interface AssertCapabilitiesInput { + harness: string; + isPi: boolean; + probed: ProbedCapabilities; + toolSpecs: ResolvedToolSpec[]; + /** Hook for the debug log; the engine passes its stderr logger. */ + log?: (message: string) => void; +} + +/** + * Fail loud when a run REQUIRES a capability the harness does not have, instead of silently + * dropping the behavior the way `buildSessionMcpServers` did before. The one requirement worth + * asserting today is tool delivery, the clearest case: + * + * - Pi delivers tools through its native extension (not MCP), so a Pi run is exempt — its tool + * delivery never depends on the probed `mcpTools`/`toolCalls` flags. + * - Any other harness takes tools over MCP only. If the run carries tool specs but the harness + * does not advertise BOTH `mcpTools` (it can receive an MCP server) AND `toolCalls` (it can + * actually call a tool), those tools would be dropped without a trace. Refuse the run with a + * specific error, mirroring the `*_UNSUPPORTED_MESSAGE` gates in `run-plan.ts`. + * + * Throws on violation; the engine catch turns the throw into `{ ok: false, error }`. + */ +export function assertRequiredCapabilities({ + harness, + isPi, + probed, + toolSpecs, + log = () => {}, +}: AssertCapabilitiesInput): void { + assert( + typeof harness === "string" && harness.length > 0, + "assertRequiredCapabilities requires a non-empty harness id", + ); + const { capabilities, source } = probed; + + // Pi's tools ride its native extension, not the probed MCP capability — nothing to assert. + if (isPi) return; + + const toolCount = toolSpecs.length; + if (toolCount === 0) return; + + const missing: string[] = []; + if (!capabilities.mcpTools) missing.push("mcpTools:false"); + if (!capabilities.toolCalls) missing.push("toolCalls:false"); + if (missing.length === 0) return; + + // A static guess that says a capability is missing is the weakest signal, but the run still + // cannot deliver the tools, so it must fail rather than proceed and drop them. Note the source + // in the log so a debugging operator knows the probe was unavailable. + log( + `capability gate: harness '${harness}' (${source}) lacks ${missing.join(", ")}; ` + + `refusing a run with ${toolCount} tool(s) rather than dropping them`, + ); + throw new Error( + toolDeliveryUnsupportedMessage(harness, missing.join(", "), toolCount), + ); } diff --git a/services/agent/src/engines/sandbox_agent/run-plan.ts b/services/agent/src/engines/sandbox_agent/run-plan.ts index 537fbcddec..8b146a7ee1 100644 --- a/services/agent/src/engines/sandbox_agent/run-plan.ts +++ b/services/agent/src/engines/sandbox_agent/run-plan.ts @@ -16,6 +16,7 @@ import { type MaterializedSkill, resolveSkillDirs as defaultResolveSkillDirs, } from "../skills.ts"; +import { assert } from "./capabilities.ts"; import { buildTurnText } from "./transcript.ts"; type Log = (message: string) => void; @@ -145,6 +146,14 @@ export function buildRunPlan( const acpAgent = harness === "pi_core" || harness === "pi_agenta" ? "pi" : harness; + // Debug assertion: every Pi identity must resolve to the `pi` ACP agent and nothing else may. + // Catches a future harness-id typo (e.g. a new `pi_*` value forgotten here) at plan-build time + // rather than as a daemon "unknown agent" error mid-run. + assert( + (harness === "pi_core" || harness === "pi_agenta") === (acpAgent === "pi"), + `harness '${harness}' resolved to ACP agent '${acpAgent}', but pi identity mapping disagrees`, + ); + const prompt = resolvePromptText(request); if (!prompt) { return { @@ -229,6 +238,19 @@ export function buildRunPlan( ? request.appendSystemPrompt?.trim() || undefined : undefined; + // Debug assertions: the derived run state must be self-consistent before the engine acts on + // it. A cwd that is empty, or a relay dir not nested under it, would only surface later as a + // confusing filesystem error inside the sandbox. + assert(!!cwd, `buildRunPlan produced an empty cwd for harness '${harness}'`); + assert( + relayDir.startsWith(cwd), + `relay dir '${relayDir}' is not under cwd '${cwd}'`, + ); + assert( + isPi === (acpAgent === "pi"), + `isPi (${isPi}) disagrees with acpAgent '${acpAgent}'`, + ); + return { ok: true, plan: { diff --git a/services/agent/tests/unit/sandbox-agent-capabilities.test.ts b/services/agent/tests/unit/sandbox-agent-capabilities.test.ts index 6d8f80c081..b96f4a4ece 100644 --- a/services/agent/tests/unit/sandbox-agent-capabilities.test.ts +++ b/services/agent/tests/unit/sandbox-agent-capabilities.test.ts @@ -1,34 +1,25 @@ /** - * Unit tests for sandbox-agent capability mapping. + * Unit tests for sandbox-agent capability mapping, the fail-loud capability gate, and the + * debug assertions. * * Run: pnpm test (or: pnpm exec vitest run tests/unit/sandbox-agent-capabilities.test.ts) */ import { describe, it } from "vitest"; import assert from "node:assert/strict"; +import type { ResolvedToolSpec } from "../../src/protocol.ts"; import { + assert as invariant, + assertRequiredCapabilities, mapCapabilities, probeCapabilities, + type ProbedCapabilities, } from "../../src/engines/sandbox_agent/capabilities.ts"; describe("mapCapabilities", () => { - it("maps probed sandbox-agent capabilities and always enables usage", () => { - assert.deepEqual( - mapCapabilities("claude", { - capabilities: { - textMessages: false, - images: true, - fileAttachments: true, - mcpTools: true, - toolCalls: true, - reasoning: true, - planMode: true, - permissions: true, - streamingDeltas: true, - sessionLifecycle: true, - }, - }), - { + it("maps probed sandbox-agent capabilities, always enables usage, marks the source probed", () => { + const out = mapCapabilities("claude", { + capabilities: { textMessages: false, images: true, fileAttachments: true, @@ -39,26 +30,58 @@ describe("mapCapabilities", () => { permissions: true, streamingDeltas: true, sessionLifecycle: true, - usage: true, }, - ); + }); + assert.equal(out.source, "probed"); + assert.deepEqual(out.capabilities, { + textMessages: false, + images: true, + fileAttachments: true, + mcpTools: true, + toolCalls: true, + reasoning: true, + planMode: true, + permissions: true, + streamingDeltas: true, + sessionLifecycle: true, + usage: true, + }); + }); + + it("falls back to no MCP for Pi and MCP for non-Pi harnesses, marked static", () => { + const pi = mapCapabilities("pi", undefined); + assert.equal(pi.source, "static"); + assert.equal(pi.capabilities.mcpTools, false); + + const claude = mapCapabilities("claude", undefined); + assert.equal(claude.source, "static"); + assert.equal(claude.capabilities.mcpTools, true); + }); + + it("invariant: rejects an empty harness id", () => { + assert.throws(() => mapCapabilities("", undefined), /non-empty harness id/); }); - it("falls back to no MCP for Pi and MCP for non-Pi harnesses", () => { - assert.equal(mapCapabilities("pi", undefined).mcpTools, false); - assert.equal(mapCapabilities("claude", undefined).mcpTools, true); + it("invariant: rejects a non-object probed capabilities payload", () => { + assert.throws( + () => mapCapabilities("claude", { capabilities: "yes" }), + /not an object/, + ); }); }); describe("probeCapabilities", () => { - it("uses the daemon probe when available", async () => { + it("uses the daemon probe when available and reports the source", async () => { const sandbox = { - getAgent: async () => ({ capabilities: { mcpTools: true, permissions: true } }), + getAgent: async () => ({ + capabilities: { mcpTools: true, permissions: true, toolCalls: true }, + }), }; const out = await probeCapabilities(sandbox, "claude"); - assert.equal(out.mcpTools, true); - assert.equal(out.permissions, true); + assert.equal(out.source, "probed"); + assert.equal(out.capabilities.mcpTools, true); + assert.equal(out.capabilities.permissions, true); }); it("uses static fallback when probing fails", async () => { @@ -68,6 +91,123 @@ describe("probeCapabilities", () => { }, }; - assert.equal((await probeCapabilities(sandbox, "pi")).mcpTools, false); + const out = await probeCapabilities(sandbox, "pi"); + assert.equal(out.source, "static"); + assert.equal(out.capabilities.mcpTools, false); + }); + + it("returns a complete boolean-valued flag set even when the probe omits flags", async () => { + const sandbox = { + // A partial probe payload: every absent flag must still come back a boolean. + getAgent: async () => ({ capabilities: { mcpTools: true } }), + }; + const out = await probeCapabilities(sandbox, "claude"); + for (const value of Object.values(out.capabilities)) { + assert.equal(typeof value, "boolean"); + } + }); + + it("invariant: rejects a sandbox without getAgent", async () => { + await assert.rejects( + () => probeCapabilities({}, "claude"), + /requires a sandbox with getAgent/, + ); + }); +}); + +function probed( + flags: Record, + source: "probed" | "static" = "probed", +): ProbedCapabilities { + return { source, capabilities: flags as any }; +} + +const tool: ResolvedToolSpec = { name: "server_tool", kind: "callback" }; + +describe("assertRequiredCapabilities (fail loud on tool delivery)", () => { + it("passes a non-Pi harness that advertises mcpTools + toolCalls", () => { + assert.doesNotThrow(() => + assertRequiredCapabilities({ + harness: "claude", + isPi: false, + probed: probed({ mcpTools: true, toolCalls: true }), + toolSpecs: [tool], + }), + ); + }); + + it("throws a specific error when the harness lacks mcpTools", () => { + assert.throws( + () => + assertRequiredCapabilities({ + harness: "claude", + isPi: false, + probed: probed({ mcpTools: false, toolCalls: true }), + toolSpecs: [tool], + }), + /harness 'claude' cannot receive tools.*mcpTools:false.*1 tool/s, + ); + }); + + it("throws naming both missing flags when neither is advertised", () => { + assert.throws( + () => + assertRequiredCapabilities({ + harness: "codex", + isPi: false, + probed: probed({ mcpTools: false, toolCalls: false }), + toolSpecs: [tool, { name: "second", kind: "callback" }], + }), + /mcpTools:false, toolCalls:false.*2 tool/s, + ); + }); + + it("exempts Pi (tools ride its native extension, not the probed MCP flags)", () => { + assert.doesNotThrow(() => + assertRequiredCapabilities({ + harness: "pi_core", + isPi: true, + probed: probed({ mcpTools: false, toolCalls: false }), + toolSpecs: [tool], + }), + ); + }); + + it("is a no-op when the run carries no tools", () => { + assert.doesNotThrow(() => + assertRequiredCapabilities({ + harness: "claude", + isPi: false, + probed: probed({ mcpTools: false, toolCalls: false }), + toolSpecs: [], + }), + ); + }); + + it("logs the source so a failed-probe static guess is debuggable", () => { + const logs: string[] = []; + assert.throws(() => + assertRequiredCapabilities({ + harness: "claude", + isPi: false, + probed: probed({ mcpTools: false, toolCalls: true }, "static"), + toolSpecs: [tool], + log: (m) => logs.push(m), + }), + ); + assert.ok(logs.some((m) => m.includes("(static)"))); + }); +}); + +describe("assert (debug invariant)", () => { + it("throws a prefixed message when the condition is false", () => { + assert.throws( + () => invariant(false, "boom"), + /\[sandbox-agent invariant\] boom/, + ); + }); + + it("does nothing when the condition holds", () => { + assert.doesNotThrow(() => invariant(true, "fine")); }); }); diff --git a/services/agent/tests/unit/sandbox-agent-orchestration.test.ts b/services/agent/tests/unit/sandbox-agent-orchestration.test.ts index 440b65c545..e8e228facd 100644 --- a/services/agent/tests/unit/sandbox-agent-orchestration.test.ts +++ b/services/agent/tests/unit/sandbox-agent-orchestration.test.ts @@ -162,10 +162,14 @@ function fakeHarness(options: FakeOptions = {}) { }) as any, probeCapabilities: async () => ({ - mcpTools: true, - usage: true, - streamingDeltas: true, - ...(options.capabilities ?? {}), + source: "probed", + capabilities: { + mcpTools: true, + toolCalls: true, + usage: true, + streamingDeltas: true, + ...(options.capabilities ?? {}), + }, }) as any, applyModel: async (_session, model, _log, options) => { calls.applyModelArgs.push({ model, options }); @@ -347,6 +351,54 @@ describe("runSandboxAgent orchestration", () => { }); }); + it("fails loud when a non-Pi harness probes mcpTools:false but the run carries tools", async () => { + // A7: the capability gate runs BEFORE the MCP bridge, so a harness whose probe reports it + // cannot receive tools fails with a SPECIFIC capability error (not the generic MCP-disabled + // line, and never a silent drop). This is the silent-degradation case the staff review + // flagged: a wrong/missing `mcpTools` flag must error, not change behavior quietly. + const { calls, deps } = fakeHarness({ capabilities: { mcpTools: false } }); + + const result = await runSandboxAgent( + { + harness: "claude", + prompt: "use the tool", + customTools: [{ name: "server_tool", kind: "callback" }], + } as AgentRunRequest, + undefined, + undefined, + deps, + ); + + assert.equal(result.ok, false); + if (result.ok) return; + assert.match(result.error ?? "", /harness 'claude' cannot receive tools/); + assert.match(result.error ?? "", /mcpTools:false/); + // The gate fires before the session is created, so no session/prompt happened. + assert.equal(calls.createSessionOptions, undefined); + // The acquired sandbox is still disposed in the finally. + assert.equal(calls.sandboxDestroyed, 1); + }); + + it("does not gate tool delivery for a Pi run even when mcpTools is false", async () => { + // Pi delivers tools through its native extension, not MCP, so a Pi run with tools and a + // `mcpTools:false` probe must proceed (the gate exempts Pi). + const { calls, deps } = fakeHarness({ capabilities: { mcpTools: false } }); + + const result = await runSandboxAgent( + { + harness: "pi_core", + prompt: "use the tool", + customTools: [{ name: "server_tool", kind: "callback" }], + } as AgentRunRequest, + undefined, + undefined, + deps, + ); + + assert.equal(result.ok, true); + assert.notEqual(calls.createSessionOptions, undefined); + }); + it("flushes a partial trace and cleans up on prompt errors", async () => { const { calls, deps } = fakeHarness({ promptError: new Error("boom") }); From 36693e975162a51ce692043ed8f3cfa2169d2f44 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Thu, 25 Jun 2026 13:30:05 +0200 Subject: [PATCH 2/4] fix(agent): surface a swallowed Pi model error instead of a silent empty turn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When Pi's provider call fails (out-of-quota, bad key, rate limit, unknown model), Pi's pi-acp bridge reports the turn over ACP as a plain end_turn with no content, so the runner returned an ok:true empty turn and the playground showed 'No response — the agent ended its turn without answering' with no clue why. The real error lived only in Pi's own session transcript. On the local Pi path, when a turn ends with no output and ran no tools, read the run's Pi session transcript (matched by its unique cwd) for the last assistant message's stopReason:error / errorMessage and fail the run loud with that message. The vercel stream then emits an error part the playground already renders, instead of a silent empty bubble. conciseError now also maps the OpenAI insufficient_quota wording to the shared insufficient-credit message. Daytona is out of scope (transcript lives in the remote sandbox). Adds findSwallowedPiError + unit tests; runner suite 190 green, typecheck clean. Claude-Session: https://claude.ai/code/session_01GYo3UEfvsZpncagqb28Mbc --- services/agent/src/engines/sandbox_agent.ts | 21 +++ .../agent/src/engines/sandbox_agent/errors.ts | 2 +- .../src/engines/sandbox_agent/pi-error.ts | 140 ++++++++++++++++++ .../tests/unit/sandbox-agent-errors.test.ts | 12 ++ .../tests/unit/sandbox-agent-pi-error.test.ts | 124 ++++++++++++++++ 5 files changed, 298 insertions(+), 1 deletion(-) create mode 100644 services/agent/src/engines/sandbox_agent/pi-error.ts create mode 100644 services/agent/tests/unit/sandbox-agent-pi-error.test.ts diff --git a/services/agent/src/engines/sandbox_agent.ts b/services/agent/src/engines/sandbox_agent.ts index 1905f69c51..7352b86453 100644 --- a/services/agent/src/engines/sandbox_agent.ts +++ b/services/agent/src/engines/sandbox_agent.ts @@ -59,6 +59,7 @@ import { import { conciseError } from "./sandbox_agent/errors.ts"; import { buildSessionMcpServers } from "./sandbox_agent/mcp.ts"; import { applyModel } from "./sandbox_agent/model.ts"; +import { findSwallowedPiError } from "./sandbox_agent/pi-error.ts"; import { buildPiExtensionEnv, prepareLocalPiAssets, @@ -393,6 +394,26 @@ export async function runSandboxAgent( const output = run.finish(); await run.flush(); + // Fail loud on a swallowed model error (A7 / "fail loud, not silent"). When Pi's provider + // call fails (out-of-quota, bad key, rate limit, unknown model, ...), Pi's pi-acp bridge + // reports the turn as a plain `end_turn` with NO content, so without this the run would + // return an `ok:true` empty turn and the user would see a silent "No response" instead of + // the real failure. On the LOCAL Pi path the error is recoverable from Pi's own session + // transcript; surface it as a run error. Only checked when the turn produced no output and + // ran no tools (a real tool-only turn legitimately has empty text), and never on Daytona + // (the transcript lives in the remote sandbox). + if ( + plan.isPi && + !plan.isDaytona && + !output.trim() && + !run.events().some((e) => e.type === "tool_call") + ) { + const piError = findSwallowedPiError(plan.sourcePiAgentDir, plan.cwd); + if (piError) { + return { ok: false, error: conciseError(new Error(piError), plan.harness) }; + } + } + return { ok: true, output, diff --git a/services/agent/src/engines/sandbox_agent/errors.ts b/services/agent/src/engines/sandbox_agent/errors.ts index dd89229209..dceaac0101 100644 --- a/services/agent/src/engines/sandbox_agent/errors.ts +++ b/services/agent/src/engines/sandbox_agent/errors.ts @@ -7,7 +7,7 @@ export function conciseError(err: unknown, harness: string): string { const msg = raw.split("\n")[0].trim(); const keyHint = harness === "claude" ? "the project's Anthropic key" : "the project's OpenAI key"; - if (/credit balance is too low/i.test(raw)) { + if (/credit balance is too low|exceeded your current quota|insufficient_quota/i.test(raw)) { return `${harness}: the model provider account has insufficient credit (check ${keyHint}).`; } if (/authentication required|invalid api key|401|unauthorized/i.test(raw)) { diff --git a/services/agent/src/engines/sandbox_agent/pi-error.ts b/services/agent/src/engines/sandbox_agent/pi-error.ts new file mode 100644 index 0000000000..129b54b982 --- /dev/null +++ b/services/agent/src/engines/sandbox_agent/pi-error.ts @@ -0,0 +1,140 @@ +/** + * Recover a model-call error that Pi swallows on the local sandbox-agent path. + * + * When Pi's provider call fails (out-of-quota, bad key, rate limit, unknown model, ...), + * Pi records the failed turn in its session transcript as an assistant message with + * `stopReason: "error"` and a human-readable `errorMessage`, but its pi-acp bridge reports + * the turn to the runner as a plain `{ stopReason: "end_turn" }` with NO content. The runner + * then returns an `ok: true` run with empty output, and the user sees a silent "No response" + * instead of the real failure. + * + * This reader closes that gap on the LOCAL path (Pi runs on the runner host, so its session + * dir is on this filesystem). After a Pi turn that produced no output, the engine asks this + * helper for the transcript's last assistant `errorMessage`; when present, the run is failed + * loud with that message instead of returning an empty turn. + * + * It is deliberately best-effort and side-effect free: any filesystem/parse problem returns + * `undefined` so a genuinely empty (but successful) turn is never turned into a false error. + * Daytona is out of scope (the transcript lives in the remote sandbox); the empty-turn there + * stays as-is until the harness surfaces the error over ACP. + */ +import { readFileSync, readdirSync, statSync } from "node:fs"; +import { join } from "node:path"; + +/** A Pi transcript `session` record (first line of the .jsonl). */ +interface PiSessionRecord { + type?: string; + id?: string; + cwd?: string; +} + +/** A Pi transcript `message` record. */ +interface PiMessageRecord { + type?: string; + message?: { + role?: string; + stopReason?: string; + errorMessage?: string; + content?: unknown[]; + }; +} + +/** The most recent assistant error in one transcript, or undefined if none / unreadable. */ +function lastAssistantError(jsonlPath: string): string | undefined { + let raw: string; + try { + raw = readFileSync(jsonlPath, "utf8"); + } catch { + return undefined; + } + let found: string | undefined; + for (const line of raw.split("\n")) { + const trimmed = line.trim(); + if (!trimmed) continue; + let record: PiMessageRecord; + try { + record = JSON.parse(trimmed) as PiMessageRecord; + } catch { + continue; + } + const msg = record.message; + if (record.type !== "message" || msg?.role !== "assistant") continue; + // Keep scanning so a later successful turn clears an earlier error; only an error that + // is the LAST assistant turn (and is what produced the empty output) is surfaced. + if (msg.stopReason === "error" && msg.errorMessage) { + found = msg.errorMessage.trim() || undefined; + } else { + found = undefined; + } + } + return found; +} + +/** The `session` record of a transcript file, or undefined if unreadable / not a session. */ +function readSessionRecord(jsonlPath: string): PiSessionRecord | undefined { + let raw: string; + try { + raw = readFileSync(jsonlPath, "utf8"); + } catch { + return undefined; + } + const firstLine = raw.split("\n", 1)[0]?.trim(); + if (!firstLine) return undefined; + try { + const record = JSON.parse(firstLine) as PiSessionRecord; + return record.type === "session" ? record : undefined; + } catch { + return undefined; + } +} + +/** + * Find the Pi transcript for a run (matched by its unique `cwd`) and return the last + * assistant turn's `errorMessage`, or undefined when there is none. + * + * Each run gets a unique cwd, and Pi stamps the cwd on every transcript's `session` record, + * so matching on cwd locates this run's transcript without depending on Pi's dir-name + * encoding. Among matches (a resumed session can have several), the newest file wins. + */ +export function findSwallowedPiError( + piAgentDir: string, + cwd: string, +): string | undefined { + const sessionsRoot = join(piAgentDir, "sessions"); + let dirs: string[]; + try { + dirs = readdirSync(sessionsRoot); + } catch { + return undefined; + } + + let newestPath: string | undefined; + let newestMtime = -1; + for (const dir of dirs) { + const dirPath = join(sessionsRoot, dir); + let files: string[]; + try { + files = readdirSync(dirPath); + } catch { + continue; + } + for (const file of files) { + if (!file.endsWith(".jsonl")) continue; + const filePath = join(dirPath, file); + const session = readSessionRecord(filePath); + if (session?.cwd !== cwd) continue; + let mtime: number; + try { + mtime = statSync(filePath).mtimeMs; + } catch { + continue; + } + if (mtime > newestMtime) { + newestMtime = mtime; + newestPath = filePath; + } + } + } + + return newestPath ? lastAssistantError(newestPath) : undefined; +} diff --git a/services/agent/tests/unit/sandbox-agent-errors.test.ts b/services/agent/tests/unit/sandbox-agent-errors.test.ts index f3d9c15e23..b205d743a4 100644 --- a/services/agent/tests/unit/sandbox-agent-errors.test.ts +++ b/services/agent/tests/unit/sandbox-agent-errors.test.ts @@ -16,6 +16,18 @@ describe("conciseError", () => { ); }); + it("formats OpenAI quota failures as insufficient credit", () => { + assert.equal( + conciseError( + new Error( + "You exceeded your current quota, please check your plan and billing details.", + ), + "pi", + ), + "pi: the model provider account has insufficient credit (check the project's OpenAI key).", + ); + }); + it("formats auth failures with the right provider hint", () => { assert.equal( conciseError(new Error("Authentication required"), "pi"), diff --git a/services/agent/tests/unit/sandbox-agent-pi-error.test.ts b/services/agent/tests/unit/sandbox-agent-pi-error.test.ts new file mode 100644 index 0000000000..d9cf4c9410 --- /dev/null +++ b/services/agent/tests/unit/sandbox-agent-pi-error.test.ts @@ -0,0 +1,124 @@ +/** + * Unit tests for recovering a model error Pi swallows on the local sandbox-agent path. + * + * When Pi's provider call fails, Pi records the failed turn in its session transcript with + * `stopReason: "error"` + `errorMessage`, but reports a plain `end_turn` with no content over + * ACP. `findSwallowedPiError` reads that transcript so the empty turn can fail loud. + * + * Run: pnpm test (or: pnpm exec vitest run tests/unit/sandbox-agent-pi-error.test.ts) + */ +import { afterEach, describe, it } from "vitest"; +import assert from "node:assert/strict"; +import { mkdtempSync, mkdirSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +import { findSwallowedPiError } from "../../src/engines/sandbox_agent/pi-error.ts"; + +const dirs: string[] = []; + +function tempDir(): string { + const dir = mkdtempSync(join(tmpdir(), "agenta-pi-error-test-")); + dirs.push(dir); + return dir; +} + +/** Write a Pi-style .jsonl transcript under `/sessions//` for `cwd`. */ +function writeTranscript( + piAgentDir: string, + name: string, + cwd: string, + messages: Array>, +): void { + const dir = join(piAgentDir, "sessions", name); + mkdirSync(dir, { recursive: true }); + const lines = [ + JSON.stringify({ type: "session", version: 3, id: name, cwd }), + ...messages.map((m) => JSON.stringify(m)), + ]; + writeFileSync(join(dir, `${name}.jsonl`), lines.join("\n") + "\n"); +} + +afterEach(() => { + for (const dir of dirs.splice(0)) rmSync(dir, { recursive: true, force: true }); +}); + +describe("findSwallowedPiError", () => { + it("returns the last assistant errorMessage for a matching cwd", () => { + const piAgentDir = tempDir(); + const cwd = "/tmp/agenta-sandbox-agent-abc123"; + writeTranscript(piAgentDir, "--tmp-agenta-sandbox-agent-abc123--", cwd, [ + { type: "message", message: { role: "user", content: [{ type: "text", text: "hi" }] } }, + { + type: "message", + message: { + role: "assistant", + content: [], + stopReason: "error", + errorMessage: "You exceeded your current quota, please check your plan.", + }, + }, + ]); + + const error = findSwallowedPiError(piAgentDir, cwd); + assert.equal(error, "You exceeded your current quota, please check your plan."); + }); + + it("returns undefined for a successful turn", () => { + const piAgentDir = tempDir(); + const cwd = "/tmp/agenta-sandbox-agent-ok"; + writeTranscript(piAgentDir, "--tmp-agenta-sandbox-agent-ok--", cwd, [ + { + type: "message", + message: { + role: "assistant", + content: [{ type: "text", text: "hello" }], + stopReason: "end_turn", + }, + }, + ]); + + assert.equal(findSwallowedPiError(piAgentDir, cwd), undefined); + }); + + it("does not surface an error cleared by a later successful turn", () => { + const piAgentDir = tempDir(); + const cwd = "/tmp/agenta-sandbox-agent-recovered"; + writeTranscript(piAgentDir, "--recovered--", cwd, [ + { + type: "message", + message: { role: "assistant", content: [], stopReason: "error", errorMessage: "transient" }, + }, + { + type: "message", + message: { + role: "assistant", + content: [{ type: "text", text: "recovered" }], + stopReason: "end_turn", + }, + }, + ]); + + assert.equal(findSwallowedPiError(piAgentDir, cwd), undefined); + }); + + it("ignores transcripts for a different cwd", () => { + const piAgentDir = tempDir(); + writeTranscript(piAgentDir, "--other--", "/tmp/some-other-cwd", [ + { + type: "message", + message: { role: "assistant", content: [], stopReason: "error", errorMessage: "nope" }, + }, + ]); + + assert.equal( + findSwallowedPiError(piAgentDir, "/tmp/agenta-sandbox-agent-missing"), + undefined, + ); + }); + + it("returns undefined when the sessions dir is absent", () => { + const piAgentDir = tempDir(); + assert.equal(findSwallowedPiError(piAgentDir, "/tmp/whatever"), undefined); + }); +}); From 6c2b323435df6f1eb461190c5d0c77da8d93d8eb Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Thu, 25 Jun 2026 14:29:56 +0200 Subject: [PATCH 3/4] fix(agent): name the resolved provider in the run error hint, not the harness default conciseError derived its 'check the project's key' hint from the harness name (claude -> Anthropic, everything else -> OpenAI). A Pi run against an Anthropic model that failed auth or ran out of credit therefore told the user to check their OpenAI key, even though they selected Anthropic. Live repro on the dev sidecar showed 'pi_core: model authentication failed - add the project's OpenAI key ...' for an Anthropic model. Thread the resolved provider (request.provider, the authoritative value from the resolved connection) into conciseError and derive the key label from it, mapping the known provider families to their vault-key names. Falls back to the old harness-derived default only when no provider is on the wire (un-migrated caller) or the provider is a custom slug with no known key label. Both conciseError call sites in the engine now pass request.provider. Builds on the swallowed-Pi-error surfacing already on this lane. Verified live on the deployed sidecar: an Anthropic run now says 'the project's Anthropic key' and an OpenAI run still says 'the project's OpenAI key'. Claude-Session: https://claude.ai/code/session_01GYo3UEfvsZpncagqb28Mbc --- services/agent/src/engines/sandbox_agent.ts | 7 ++- .../agent/src/engines/sandbox_agent/errors.ts | 43 +++++++++++++++++-- .../tests/unit/sandbox-agent-errors.test.ts | 42 ++++++++++++++++++ 3 files changed, 87 insertions(+), 5 deletions(-) diff --git a/services/agent/src/engines/sandbox_agent.ts b/services/agent/src/engines/sandbox_agent.ts index 7352b86453..51c1f266da 100644 --- a/services/agent/src/engines/sandbox_agent.ts +++ b/services/agent/src/engines/sandbox_agent.ts @@ -410,7 +410,10 @@ export async function runSandboxAgent( ) { const piError = findSwallowedPiError(plan.sourcePiAgentDir, plan.cwd); if (piError) { - return { ok: false, error: conciseError(new Error(piError), plan.harness) }; + return { + ok: false, + error: conciseError(new Error(piError), plan.harness, request.provider), + }; } } @@ -436,7 +439,7 @@ export async function runSandboxAgent( } catch (err) { otel?.finish(); await otel?.flush().catch(() => {}); - return { ok: false, error: conciseError(err, plan.harness) }; + return { ok: false, error: conciseError(err, plan.harness, request.provider) }; } finally { await toolRelay?.stop().catch(() => {}); await sandbox?.destroySandbox().catch(() => {}); diff --git a/services/agent/src/engines/sandbox_agent/errors.ts b/services/agent/src/engines/sandbox_agent/errors.ts index dceaac0101..1b9f8b9185 100644 --- a/services/agent/src/engines/sandbox_agent/errors.ts +++ b/services/agent/src/engines/sandbox_agent/errors.ts @@ -1,12 +1,49 @@ +/** Map a provider family to its human-facing vault key label, for the credit/auth hint. */ +const PROVIDER_KEY_LABELS: Record = { + openai: "OpenAI", + anthropic: "Anthropic", + gemini: "Gemini", + mistral: "Mistral", + mistralai: "Mistral", + minimax: "MiniMax", + groq: "Groq", + together_ai: "Together AI", + openrouter: "OpenRouter", +}; + +/** + * The vault-key hint phrase for an error, named after the RESOLVED provider rather than the + * harness. A Pi run against an Anthropic model must say "Anthropic key", not "OpenAI key" — the + * harness name (`pi_core`/`claude`) is not the provider, so deriving the hint from it mislabels + * every cross-provider run (e.g. Pi + Anthropic wrongly read "check the project's OpenAI key"). + * + * `provider` is the resolved provider the runner already knows (`request.provider`, from the + * resolved connection). When it is absent (un-migrated caller) fall back to the harness default + * — Claude is always Anthropic; every other harness defaults to OpenAI, matching the old + * behavior for that path only. + */ +function keyHintFor(provider: string | undefined, harness: string): string { + const label = provider ? PROVIDER_KEY_LABELS[provider.toLowerCase()] : undefined; + if (label) return `the project's ${label} key`; + if (harness === "claude") return "the project's Anthropic key"; + return "the project's OpenAI key"; +} + /** * Turn a harness/SDK error into one clear line for the caller instead of dumping a full * ACP/JS stack. Recognizes common harness auth failures. + * + * `provider` is the resolved provider for the run; pass it so the credit/auth hint names the + * actual provider the run targeted, not a provider guessed from the harness name. */ -export function conciseError(err: unknown, harness: string): string { +export function conciseError( + err: unknown, + harness: string, + provider?: string, +): string { const raw = err instanceof Error ? err.message : String(err); const msg = raw.split("\n")[0].trim(); - const keyHint = - harness === "claude" ? "the project's Anthropic key" : "the project's OpenAI key"; + const keyHint = keyHintFor(provider, harness); if (/credit balance is too low|exceeded your current quota|insufficient_quota/i.test(raw)) { return `${harness}: the model provider account has insufficient credit (check ${keyHint}).`; } diff --git a/services/agent/tests/unit/sandbox-agent-errors.test.ts b/services/agent/tests/unit/sandbox-agent-errors.test.ts index b205d743a4..ec303cdbf8 100644 --- a/services/agent/tests/unit/sandbox-agent-errors.test.ts +++ b/services/agent/tests/unit/sandbox-agent-errors.test.ts @@ -35,6 +35,48 @@ describe("conciseError", () => { ); }); + it("names the resolved provider, not the harness, for a Pi+Anthropic run", () => { + // The bug: a Pi run against an Anthropic model that fails auth must NOT say "OpenAI key". + assert.equal( + conciseError(new Error("Authentication required"), "pi_core", "anthropic"), + "pi_core: model authentication failed — add the project's Anthropic key to the project vault, or log in (OAuth).", + ); + }); + + it("names the resolved provider for a Pi+Anthropic credit failure", () => { + assert.equal( + conciseError(new Error("credit balance is too low"), "pi_core", "anthropic"), + "pi_core: the model provider account has insufficient credit (check the project's Anthropic key).", + ); + }); + + it("keeps the OpenAI hint when the resolved provider is openai on Pi", () => { + assert.equal( + conciseError(new Error("insufficient_quota"), "pi_core", "openai"), + "pi_core: the model provider account has insufficient credit (check the project's OpenAI key).", + ); + }); + + it("falls back to the harness default when no provider is resolved", () => { + // Un-migrated caller (no provider on the wire): keep the old harness-derived behavior. + assert.equal( + conciseError(new Error("401 unauthorized"), "claude"), + "claude: model authentication failed — add the project's Anthropic key to the project vault, or log in (OAuth).", + ); + assert.equal( + conciseError(new Error("401 unauthorized"), "pi_core"), + "pi_core: model authentication failed — add the project's OpenAI key to the project vault, or log in (OAuth).", + ); + }); + + it("falls back to the harness default for an unknown custom provider", () => { + // A custom router slug we have no key label for: do not invent one, use the harness default. + assert.equal( + conciseError(new Error("Authentication required"), "pi_core", "openai-codex"), + "pi_core: model authentication failed — add the project's OpenAI key to the project vault, or log in (OAuth).", + ); + }); + it("falls back to the first line", () => { assert.equal(conciseError(new Error("first line\nsecond line"), "pi"), "first line"); }); From 92c777d1ef100097c782e3a4951e354427b3ebd7 Mon Sep 17 00:00:00 2001 From: Mahmoud Mabrouk Date: Thu, 25 Jun 2026 14:40:22 +0200 Subject: [PATCH 4/4] fix(agent): strip the pi-acp startup banner from the streamed answer pi-acp emits its startup banner ("pi vX.Y.Z" / "## Context" / the AGENTS.md path / "New version available: ... Run: `npm i -g ...`") as the FIRST agent_message_chunk on session/new, ahead of the real answer. The tracer only stripped it from the coalesced text in finish(), but the playground reads the live stream: streamText(accumulated) flushed those deltas before finish() ran, so the banner reached the client as the assistant reply. Strip the banner on the streaming path too. splitLeadingBanner() holds the leading deltas until the banner region resolves (it can straddle chunk boundaries), then streams only the body past it; once a real line starts it latches and never strips again. isBannerLine() now matches every real variant (verified verbatim against a live pi-acp@0.0.29 session/new): the version line, both raw and markdown-rendered Context/Skills headings, list-prefixed or bare .md path items, the 'New version available: ... (installed ...)' notice, and the 'Run: npm i -g ...' upgrade line. finish() remains the safety net for a banner that never settled mid-stream. A genuine answer is never truncated (only a leading run of banner lines is removed). Adds startup-banner.test.ts (14 cases): isBannerLine variants, the coalesced stripper, the streaming splitter with cross-chunk boundary safety, and an end-to-end createSandboxAgentOtel streaming assertion. Claude-Session: https://claude.ai/code/session_01GYo3UEfvsZpncagqb28Mbc --- services/agent/src/tracing/otel.ts | 139 ++++++++++-- .../agent/tests/unit/startup-banner.test.ts | 212 ++++++++++++++++++ 2 files changed, 335 insertions(+), 16 deletions(-) create mode 100644 services/agent/tests/unit/startup-banner.test.ts diff --git a/services/agent/src/tracing/otel.ts b/services/agent/src/tracing/otel.ts index 5a8d98ce7f..eb4acbe012 100644 --- a/services/agent/src/tracing/otel.ts +++ b/services/agent/src/tracing/otel.ts @@ -611,30 +611,107 @@ function acpToolContentText(content: any): string { } /** - * Strip the pi-acp startup banner that some setups emit as the first agent message - * chunk (a "pi vX.Y.Z" / "## Context" / file list / "New version available" prelude, - * surfaced ahead of the real answer). Removes only a leading run of those marker lines - * so a genuine answer is never touched. + * Is this line part of the pi-acp startup banner that some setups emit as the first agent + * message chunk, ahead of the real answer? pi-acp's `buildStartupInfo` produces, in order: + * + * pi v0.79.4 + * --- + * (blank) + * ## Context + * - /tmp/agenta-sandbox-agent-XXXX/AGENTS.md + * (blank) + * ## Skills (when skills are installed) + * - /path/to/skill.md + * (blank) + * New version available: v0.80.2 (installed v0.79.4). Run: `npm i -g @earendil-works/pi-coding-agent` + * + * The markdown markers (`## `, `- `) are stripped when the playground renders the text, so the + * user sees a bare `Context` heading and an unprefixed absolute `.../AGENTS.md` path — but the + * raw chunk still carries the markdown, so we match BOTH the raw and the rendered shapes. The + * "New version available" notice is emitted even when `quietStartup` suppresses the rest, so it + * must be matched on its own. We only ever strip a LEADING run of these lines, so a genuine + * answer that happens to contain such words later is never touched. + */ +export function isBannerLine(line: string): boolean { + const t = line.trim(); + return ( + t === "" || + t === "---" || + /^pi v\d+\.\d+\.\d+\b/.test(t) || + // section heading, raw ("## Context") or rendered ("Context"); same for "Skills" + /^(?:#{1,6}\s*)?(?:Context|Skills)\s*$/.test(t) || + // an AGENTS.md / *.md path item, list-prefixed ("- /…/AGENTS.md") or bare ("/…/AGENTS.md") + /^(?:-\s+)?\/\S*\.md\s*$/.test(t) || + // upgrade notice: "New version available: vX (installed vY). Run: `npm i -g …`" + /^New version available:/.test(t) || + /^Run:\s*`?npm\s+i\b/.test(t) + ); +} + +/** + * Strip a leading run of pi-acp startup-banner lines from `text`. Returns the text past the + * banner (trimmed) when at least one non-blank banner line was seen, otherwise the original. */ -function stripStartupBanner(text: string): string { +export function stripStartupBanner(text: string): string { const lines = text.split("\n"); - const isBanner = (line: string) => - /^pi v\d+\.\d+\.\d+/.test(line) || - /^## Context\b/.test(line) || - /^-\s+\/.*AGENTS\.md\s*$/.test(line) || - /^New version available:/.test(line) || - /^Run: `npm/.test(line) || - line.trim() === "---" || - line.trim() === ""; let i = 0; let sawBanner = false; - while (i < lines.length && isBanner(lines[i])) { + while (i < lines.length && isBannerLine(lines[i])) { if (lines[i].trim() !== "") sawBanner = true; i++; } return sawBanner ? lines.slice(i).join("\n").trim() : text; } +/** + * Streaming-safe variant. Given the cumulative assistant text so far, return the portion that + * is safe to surface as a delta now, plus whether the leading banner region is fully resolved. + * + * The banner always arrives at the START of the stream and may straddle chunk boundaries, so we + * must not classify the LAST line until we know it is complete (a trailing newline, or a later + * chunk, settles it). While the text seen so far is entirely banner-or-blank we return + * `{ body: "", settled: false }` and the caller holds emission; once a non-banner line appears + * we return everything from it onward and never re-buffer again. `start` is the byte offset of + * the body within `text` (after leading whitespace), so the caller can slice later chunks from + * the same offset — the banner is a stable leading prefix of the cumulative stream. + */ +export function splitLeadingBanner(text: string): { + body: string; + settled: boolean; + start: number; +} { + const endsWithNewline = text.endsWith("\n"); + const lines = text.split("\n"); + // The final element is a partial line unless the text ended on a newline. + const lastIsPartial = !endsWithNewline; + let i = 0; + let offset = 0; // byte offset of line i within `text` + let sawBanner = false; + while (i < lines.length) { + const isLast = i === lines.length - 1; + if (isLast && lastIsPartial && sawBanner) { + // We are at the banner boundary with a still-arriving partial line. It could complete into + // either another banner line or the first line of the real answer — only a later chunk (or + // a newline) settles it, so hold. (When no banner has been seen, there is nothing to + // suppress: settle on the partial line and stream it without latency.) + return { body: "", settled: false, start: -1 }; + } + if (!isBannerLine(lines[i])) break; + sawBanner = true; + offset += lines[i].length + 1; // +1 for the consumed "\n" + i++; + } + if (i >= lines.length) { + // Consumed every (complete) line as banner — nothing real has started yet. + return { body: "", settled: false, start: -1 }; + } + const rest = text.slice(offset); + // Drop the blank line(s) that separate the banner from the answer, but never trim a + // banner-free stream — a genuine answer may legitimately begin with whitespace. + const ws = sawBanner ? rest.length - rest.replace(/^\s+/, "").length : 0; + return { body: rest.slice(ws), settled: true, start: offset + ws }; +} + /** Split a resolved model id ("openai-codex/gpt-5.5") into provider + id. */ function splitModel(model?: string): { provider?: string; id?: string } { if (!model) return {}; @@ -767,6 +844,16 @@ export function createSandboxAgentOtel(init: SandboxAgentOtelInit): SandboxAgent let textBlockId: string | undefined; let textEmitted = ""; let anyTextDelta = false; + // Streaming banner suppression: pi-acp emits its startup banner as the FIRST agent message + // chunk, ahead of the real answer. The one-shot `finish()` path strips it from the coalesced + // text, but the streaming path flushes deltas as they arrive, before finish() ever runs — so + // the banner would leak to the client. We hold the leading deltas until the banner region is + // resolved (it may straddle chunk boundaries), then stream only the body past it. Once a real + // line has started, `bannerSettled` latches true and we never strip again. + let bannerSettled = false; + // Once the banner region resolves, the real answer begins at this byte offset in `accumulated`; + // every later chunk is streamed as `accumulated.slice(bannerEnd)` so the banner never reappears. + let bannerEnd = 0; let reasoningBlockId: string | undefined; let reasoningEmitted = ""; let blockSeq = 0; @@ -800,6 +887,25 @@ export function createSandboxAgentOtel(init: SandboxAgentOtelInit): SandboxAgent anyTextDelta = true; } + /** + * Stream the assistant text for the cumulative `accumulated`, with the leading startup banner + * suppressed. Until the banner region resolves we emit nothing; afterwards we stream the body + * past it. `body` is a prefix-growing view of the real answer, so `streamText`'s delta logic + * stays correct. + */ + function streamAssistantText(): void { + if (bannerSettled) { + // The banner is a stable leading prefix; stream everything past it. + streamText(accumulated.slice(bannerEnd)); + return; + } + const { body, settled, start } = splitLeadingBanner(accumulated); + if (!settled) return; // banner region still arriving — hold emission + bannerSettled = true; + bannerEnd = start; + if (body) streamText(body); + } + /** Open (if needed) the reasoning block and emit the pure delta up to `target`. */ function streamReasoning(target: string): void { closeText(); // a reasoning chunk ends any open text run @@ -869,8 +975,9 @@ export function createSandboxAgentOtel(init: SandboxAgentOtelInit): SandboxAgent else accumulated += t; // Live deltas run independent of span emission (text, not a span), so they flow even // when the harness self-instruments (emitSpans=false). `accumulated` is the cumulative - // text, so the pure delta is its tail past what we already sent. - if (sink) streamText(accumulated); + // text, so the pure delta is its tail past what we already sent — minus the leading + // startup banner, which is held back until the body begins (see streamAssistantText). + if (sink) streamAssistantText(); return; } diff --git a/services/agent/tests/unit/startup-banner.test.ts b/services/agent/tests/unit/startup-banner.test.ts new file mode 100644 index 0000000000..4912373961 --- /dev/null +++ b/services/agent/tests/unit/startup-banner.test.ts @@ -0,0 +1,212 @@ +/** + * Unit tests for pi-acp startup-banner stripping. + * + * pi-acp emits its startup banner as the FIRST agent message chunk, ahead of the real answer + * (a "pi vX.Y.Z" / "## Context" / AGENTS.md path list / "New version available" prelude). The + * playground renders the markdown, so the user sees a bare "Context" heading and an unprefixed + * absolute path. These tests pin both the coalesced stripper (`stripStartupBanner`, used by the + * one-shot `finish()` path) and the streaming-safe splitter (`splitLeadingBanner`, used to hold + * leading deltas until the banner region resolves), plus an end-to-end streaming assertion + * through `createSandboxAgentOtel`. + * + * Run: pnpm test (or: pnpm exec vitest run tests/unit/startup-banner.test.ts) + */ +import { describe, it } from "vitest"; +import assert from "node:assert/strict"; + +import { + createSandboxAgentOtel, + isBannerLine, + splitLeadingBanner, + stripStartupBanner, +} from "../../src/tracing/otel.ts"; +import type { AgentEvent } from "../../src/protocol.ts"; + +// The exact banner the user saw, in its RAW (pre-render) markdown form: pi-acp's buildStartupInfo +// output followed by buildUpdateNotice. The playground strips "## " and "- " when rendering, so +// the user-visible text is the same minus those markers. +const RAW_BANNER = [ + "pi v0.79.4", + "---", + "", + "## Context", + "- /tmp/agenta-sandbox-agent-UxwfPW/AGENTS.md", + "", + "---", + "New version available: v0.80.2 (installed v0.79.4). Run: `npm i -g @earendil-works/pi-coding-agent`", +].join("\n"); +// ^ verbatim from a live `pi-acp@0.0.29` session/new (`_meta.piAcp.startupInfo`) on this stack: +// "pi v0.79.4\n---\n\n## Context\n- /tmp/.../AGENTS.md\n\n---\nNew version available: v0.80.2 ...". + +// The same banner as the user reported it (already markdown-rendered): bare "Context", unprefixed +// AGENTS.md path, no fenced npm command. +const RENDERED_BANNER = [ + "pi v0.79.4", + "Context", + "/tmp/agenta-sandbox-UxwfPW/AGENTS.md", + "New version available: v0.80.2 (installed v0.79.4).", + "Run: npm i -g @earendil-works/pi-coding-agent", +].join("\n"); + +describe("isBannerLine", () => { + it("matches every banner-line variant the user saw", () => { + for (const line of [ + "pi v0.79.4", + "pi v0.80.2 (something)", + "---", + "", + " ", + "## Context", + "Context", + "## Skills", + "Skills", + "- /tmp/agenta-sandbox-agent-UxwfPW/AGENTS.md", + "/tmp/agenta-sandbox-UxwfPW/AGENTS.md", + "- /pi-agent/skills/foo.md", + "New version available: v0.80.2 (installed v0.79.4). Run: `npm i -g @earendil-works/pi-coding-agent`", + "New version available: v0.80.2 (installed v0.79.4).", + "Run: `npm i -g @earendil-works/pi-coding-agent`", + "Run: npm i -g @earendil-works/pi-coding-agent", + ]) { + assert.equal(isBannerLine(line), true, `expected banner: ${JSON.stringify(line)}`); + } + }); + + it("does not match genuine answer lines", () => { + for (const line of [ + "4", + "The answer is 4.", + "Here is the context you asked for:", + "I will run: npm test", // not "npm i -g" + "Contextual information follows", // not a bare heading + "See /etc/hosts for details", // not a .md path + ]) { + assert.equal(isBannerLine(line), false, `expected non-banner: ${JSON.stringify(line)}`); + } + }); +}); + +describe("stripStartupBanner (coalesced / finish path)", () => { + it("strips the raw markdown banner the user saw", () => { + assert.equal(stripStartupBanner(RAW_BANNER + "\n4"), "4"); + }); + + it("strips the rendered banner the user saw", () => { + assert.equal(stripStartupBanner(RENDERED_BANNER + "\nThe answer is 4."), "The answer is 4."); + }); + + it("strips a multi-line answer's banner but keeps the whole answer", () => { + const answer = "Line one of the answer.\nLine two.\n- bullet point"; + assert.equal(stripStartupBanner(RAW_BANNER + "\n" + answer), answer); + }); + + it("leaves a clean answer untouched", () => { + assert.equal(stripStartupBanner("4"), "4"); + assert.equal( + stripStartupBanner("The context of this question is simple: 4."), + "The context of this question is simple: 4.", + ); + }); + + it("returns empty when the text is only the banner", () => { + assert.equal(stripStartupBanner(RAW_BANNER), ""); + }); +}); + +describe("splitLeadingBanner (streaming path)", () => { + it("holds emission while only banner-or-blank has arrived", () => { + for (const text of ["pi v0.79.4\n", RAW_BANNER, RAW_BANNER + "\n"]) { + const { body, settled } = splitLeadingBanner(text); + assert.deepEqual({ body, settled }, { body: "", settled: false }); + } + }); + + it("does not judge an incomplete trailing line", () => { + // "Run: `npm i" could become the upgrade line OR a real answer; wait for the newline. + const { body, settled } = splitLeadingBanner("pi v0.79.4\n---\nRun: `npm i"); + assert.deepEqual({ body, settled }, { body: "", settled: false }); + }); + + it("settles and returns the body once a real line completes", () => { + // Streaming preserves the answer faithfully, including a trailing newline; only the leading + // blank line(s) between banner and answer are dropped. + const { body, settled, start } = splitLeadingBanner(RAW_BANNER + "\n4\n"); + assert.equal(settled, true); + assert.equal(body, "4\n"); + // `start` points exactly at the body within the original text. + assert.equal((RAW_BANNER + "\n4\n").slice(start, start + body.length), body); + }); + + it("holds the first real line until its newline arrives (boundary safety)", () => { + // At the banner boundary a still-arriving partial line ("4", no newline) could complete into + // either a banner line or the answer, so we hold and let the finish path strip it instead. + const { settled } = splitLeadingBanner(RAW_BANNER + "\n4"); + assert.equal(settled, false); + }); + + it("settles immediately for a banner-free stream", () => { + const { body, settled, start } = splitLeadingBanner("4"); + assert.deepEqual({ body, settled, start }, { body: "4", settled: true, start: 0 }); + }); +}); + +describe("createSandboxAgentOtel streaming suppresses the banner", () => { + const textChunk = (text: string) => ({ + sessionUpdate: "agent_message_chunk", + content: { type: "text", text }, + }); + + it("never streams banner deltas, only the real answer", () => { + const emitted: AgentEvent[] = []; + const run = createSandboxAgentOtel({ + harness: "pi", + model: "openai/o4-mini", + emitSpans: false, + emit: (e) => emitted.push(e), + }); + run.start({ prompt: "What is 2+2?" }); + // pi streams pure deltas; the banner arrives first, then the answer, split across chunks. + run.handleUpdate(textChunk("pi v0.79.4\n---\n\n## Context\n- /tmp/x/AGENTS.md\n\n")); + run.handleUpdate( + textChunk("New version available: v0.80.2 (installed v0.79.4). Run: `npm i -g @earendil-works/pi-coding-agent`\n"), + ); + // First real line completes with a newline -> the splitter settles and streams the body + // INCREMENTALLY (before finish), exercising the live path, not just the finish fallback. + run.handleUpdate(textChunk("The answer is 4.\n")); // pure delta (pi-style) + const beforeFinish = emitted.filter((e) => e.type === "message_delta").length; + run.handleUpdate(textChunk("Thanks!")); // pure delta (pi-style) + run.finish(); + assert.ok(beforeFinish > 0, "body was not streamed incrementally before finish"); + + const streamed = emitted + .filter((e): e is Extract => e.type === "message_delta") + .map((e) => e.delta) + .join(""); + assert.equal(streamed, "The answer is 4.\nThanks!"); + assert.ok(!streamed.includes("pi v0.79.4"), "banner version leaked"); + assert.ok(!streamed.includes("New version available"), "upgrade notice leaked"); + assert.ok(!streamed.includes("AGENTS.md"), "context file path leaked"); + }); + + it("strips the banner on finish when the stream had no clean body line", () => { + // A short, banner-only-then-inline answer that never produced a newline before finish. + const emitted: AgentEvent[] = []; + const run = createSandboxAgentOtel({ + harness: "pi", + model: "openai/o4-mini", + emitSpans: false, + emit: (e) => emitted.push(e), + }); + run.start({ prompt: "hi" }); + // The whole banner plus the answer arrive in ONE chunk with no trailing newline. The + // streaming splitter holds (last line partial), so finish() must do the stripping. + run.handleUpdate(textChunk(RAW_BANNER + "\n4")); + run.finish(); + + const streamed = emitted + .filter((e): e is Extract => e.type === "message_delta") + .map((e) => e.delta) + .join(""); + assert.equal(streamed, "4"); + }); +});