diff --git a/docs/design/agent-workflows/interfaces/cross-service/runner-to-harness.md b/docs/design/agent-workflows/interfaces/cross-service/runner-to-harness.md index 81766f1dfe..d2d9be835c 100644 --- a/docs/design/agent-workflows/interfaces/cross-service/runner-to-harness.md +++ b/docs/design/agent-workflows/interfaces/cross-service/runner-to-harness.md @@ -29,7 +29,16 @@ returns `HarnessCapabilities`: ``` The probe gates real behavior. Tools only go over MCP when `mcpTools` is true, for example. -If the probe fails, the runner falls back to a static capability policy. +If the probe fails, the runner falls back to a static capability policy, and the result records +whether the flags were `probed` or a `static` guess. + +The runner no longer silently degrades when a run requires a capability the harness lacks. +`assertRequiredCapabilities` (in `capabilities.ts`) fails the run with a specific error — the +same fail-loud pattern as the `*_UNSUPPORTED_MESSAGE` gates in `run-plan.ts` and the +`CODE_TOOL_UNSUPPORTED_MESSAGE` gate in `tools/code.ts`. Today the asserted requirement is tool +delivery: a non-Pi run carrying tool specs whose probe reports `mcpTools:false` or +`toolCalls:false` errors instead of dropping the tools without a trace. Pi is exempt (its tools +ride the native extension, not MCP), and a run with no tools is unaffected. **The ACP event stream.** The harness emits ACP updates that the runner maps to neutral events: @@ -58,8 +67,11 @@ tears the sandbox down in the `finally` path. - **Harness selection and the `pi_core`/`pi_agenta` to `pi` remap.** New harnesses thread through here. -- **The capability probe.** It gates tool delivery, permissions, and streaming. A wrong flag - silently changes behavior rather than erroring. +- **The capability probe.** It gates tool delivery, permissions, and streaming. A run that + REQUIRES a capability the harness lacks now fails loud (`assertRequiredCapabilities`) rather + than silently dropping the behavior — today that covers tool delivery to a non-Pi harness. + Other flags (permissions, streaming) still degrade silently; assert the next requirement here + if a flag must be a hard gate. - **ACP event mapping.** A missed or mis-mapped update drops content from the stream. - **Pi versus Claude divergence.** Both run over ACP, but Pi takes tools natively and self-instruments traces, while Claude takes tools over MCP and the runner builds the spans. diff --git a/services/agent/src/engines/sandbox_agent.ts b/services/agent/src/engines/sandbox_agent.ts index fafc4cec19..51c1f266da 100644 --- a/services/agent/src/engines/sandbox_agent.ts +++ b/services/agent/src/engines/sandbox_agent.ts @@ -46,7 +46,11 @@ import { type ToolCallbackContext, resolveRunSessionId, } from "../protocol.ts"; -import { probeCapabilities } from "./sandbox_agent/capabilities.ts"; +import { + assert, + assertRequiredCapabilities, + probeCapabilities, +} from "./sandbox_agent/capabilities.ts"; import { buildDaemonEnv, resolveDaemonBinary } from "./sandbox_agent/daemon.ts"; import { createCookieFetch, @@ -55,6 +59,7 @@ import { import { conciseError } from "./sandbox_agent/errors.ts"; import { buildSessionMcpServers } from "./sandbox_agent/mcp.ts"; import { applyModel } from "./sandbox_agent/model.ts"; +import { findSwallowedPiError } from "./sandbox_agent/pi-error.ts"; import { buildPiExtensionEnv, prepareLocalPiAssets, @@ -81,7 +86,12 @@ function log(message: string): void { type Log = (message: string) => void; -const CLAUDE_STRICT_DEPLOYMENTS = new Set(["custom", "bedrock", "vertex", "vertex_ai"]); +const CLAUDE_STRICT_DEPLOYMENTS = new Set([ + "custom", + "bedrock", + "vertex", + "vertex_ai", +]); function applyClaudeConnectionEnv( env: Record, @@ -110,7 +120,10 @@ function applyClaudeConnectionEnv( env.CLAUDE_CODE_USE_VERTEX = "1"; } - if (selectedModel && (baseUrl || (deployment && CLAUDE_STRICT_DEPLOYMENTS.has(deployment)))) { + if ( + selectedModel && + (baseUrl || (deployment && CLAUDE_STRICT_DEPLOYMENTS.has(deployment))) + ) { env.ANTHROPIC_MODEL = selectedModel; env.ANTHROPIC_CUSTOM_MODEL_OPTION = selectedModel; return true; @@ -162,7 +175,12 @@ export async function runSandboxAgent( clearProviderEnv, }); Object.assign(env, plan.secrets); // apply only the resolved provider keys - const strictModel = applyClaudeConnectionEnv(env, request, plan.acpAgent, logger); + const strictModel = applyClaudeConnectionEnv( + env, + request, + plan.acpAgent, + logger, + ); // Pi self-instruments locally: propagate the trace context + public tool metadata into Pi // via the Agenta extension. Tool execution always relays back to this runner, which keeps // private specs, scoped env, callback endpoints, and callback auth in memory. @@ -233,14 +251,36 @@ export async function runSandboxAgent( log: logger, }); + // Sandbox-start invariant: `startSandboxAgent` must hand back a usable handle, or the + // probe/createSession below fail with an opaque "cannot read property of undefined". + assert( + sandbox && typeof sandbox.createSession === "function", + `sandbox provider '${plan.sandboxId}' returned no usable sandbox handle`, + ); + // Probe what this harness supports and branch on capabilities, not on the harness // name. Tool delivery: Pi loads our extension (native tools, set up above); any other // harness takes tools over MCP only when it advertises `mcpTools` (pi-acp does not // forward MCP, Claude/Codex do). - const capabilities = await (deps.probeCapabilities ?? probeCapabilities)( + const probed = await (deps.probeCapabilities ?? probeCapabilities)( sandbox, plan.acpAgent, ); + const capabilities = probed.capabilities; + + // Fail loud (A7): a run that REQUIRES a capability the harness lacks errors with a + // specific message instead of silently dropping the behavior, the way the + // `*_UNSUPPORTED_MESSAGE` gates in `run-plan.ts` do. Today: tool delivery to a non-Pi + // harness whose probe reports `mcpTools:false` / `toolCalls:false`. The throw is caught + // below and returned as `{ ok: false, error }`. + assertRequiredCapabilities({ + harness: plan.harness, + isPi: plan.isPi, + probed, + toolSpecs: plan.toolSpecs, + log: logger, + }); + const mcpServers = buildSessionMcpServers({ isPi: plan.isPi, capabilities, @@ -354,6 +394,29 @@ export async function runSandboxAgent( const output = run.finish(); await run.flush(); + // Fail loud on a swallowed model error (A7 / "fail loud, not silent"). When Pi's provider + // call fails (out-of-quota, bad key, rate limit, unknown model, ...), Pi's pi-acp bridge + // reports the turn as a plain `end_turn` with NO content, so without this the run would + // return an `ok:true` empty turn and the user would see a silent "No response" instead of + // the real failure. On the LOCAL Pi path the error is recoverable from Pi's own session + // transcript; surface it as a run error. Only checked when the turn produced no output and + // ran no tools (a real tool-only turn legitimately has empty text), and never on Daytona + // (the transcript lives in the remote sandbox). + if ( + plan.isPi && + !plan.isDaytona && + !output.trim() && + !run.events().some((e) => e.type === "tool_call") + ) { + const piError = findSwallowedPiError(plan.sourcePiAgentDir, plan.cwd); + if (piError) { + return { + ok: false, + error: conciseError(new Error(piError), plan.harness, request.provider), + }; + } + } + return { ok: true, output, @@ -376,7 +439,7 @@ export async function runSandboxAgent( } catch (err) { otel?.finish(); await otel?.flush().catch(() => {}); - return { ok: false, error: conciseError(err, plan.harness) }; + return { ok: false, error: conciseError(err, plan.harness, request.provider) }; } finally { await toolRelay?.stop().catch(() => {}); await sandbox?.destroySandbox().catch(() => {}); diff --git a/services/agent/src/engines/sandbox_agent/capabilities.ts b/services/agent/src/engines/sandbox_agent/capabilities.ts index 654fdb21bc..70cc446088 100644 --- a/services/agent/src/engines/sandbox_agent/capabilities.ts +++ b/services/agent/src/engines/sandbox_agent/capabilities.ts @@ -1,40 +1,118 @@ -import type { HarnessCapabilities } from "../../protocol.ts"; +import type { HarnessCapabilities, ResolvedToolSpec } from "../../protocol.ts"; /** - * Map a sandbox-agent `AgentInfo` to our capability flags. Falls back to a per-harness - * static guess when the probe is unavailable. + * Where a run's `HarnessCapabilities` came from. + * + * - `probed`: read from the daemon's `AgentInfo.capabilities` for this exact harness. + * - `static`: the daemon probe was unavailable (or returned no capabilities), so we fell + * back to a per-harness static guess. A guess is a weaker contract — it can disagree with + * what the harness actually does — so callers that assert a hard capability requirement use + * the source to phrase a clear error instead of trusting the guess. */ -export function mapCapabilities(harness: string, info: any): HarnessCapabilities { +export type CapabilitySource = "probed" | "static"; + +export interface ProbedCapabilities { + capabilities: HarnessCapabilities; + source: CapabilitySource; +} + +/** Every capability flag the runner branches on, for the debug-assertion shape check. */ +const CAPABILITY_FLAGS = [ + "textMessages", + "images", + "fileAttachments", + "mcpTools", + "toolCalls", + "reasoning", + "planMode", + "permissions", + "streamingDeltas", + "sessionLifecycle", + "usage", +] as const; + +/** + * Debug assertion: throw on an impossible runner state with an actionable message. These guard + * the runner hot path (run-plan build, capability probe, capability gate, sandbox start) so an + * impossible state surfaces at its origin with context instead of as a confusing downstream + * failure. Cheap (a boolean test) and always on — a tripped invariant is a runner bug, not a + * user error. + */ +export function assert(condition: unknown, message: string): asserts condition { + if (!condition) throw new Error(`[sandbox-agent invariant] ${message}`); +} + +/** + * A run asked for a capability the harness does not have. Mirrors the not-implemented gates + * (`tools/code.ts` `CODE_TOOL_UNSUPPORTED_MESSAGE`, #4831's `*_UNSUPPORTED_MESSAGE`): the run + * fails with one clear, specific line rather than silently dropping the behavior. Built by + * `assertRequiredCapabilities`. + */ +export function toolDeliveryUnsupportedMessage( + harness: string, + missing: string, + toolCount: number, +): string { + return ( + `harness '${harness}' cannot receive tools (probe reports ${missing}); ` + + `the run carries ${toolCount} tool(s) that would be silently dropped. Run on a ` + + `tool-capable harness, or remove the tools.` + ); +} + +/** + * Map a sandbox-agent `AgentInfo` to our capability flags. Falls back to a per-harness static + * guess when the probe is unavailable. Returns the flags AND where they came from, so a caller + * can refuse to silently degrade on a guess (see `assertRequiredCapabilities`). + */ +export function mapCapabilities( + harness: string, + info: any, +): ProbedCapabilities { + assert( + typeof harness === "string" && harness.length > 0, + "mapCapabilities requires a non-empty harness id", + ); const c = info?.capabilities; if (c) { + assert( + typeof c === "object", + `probed capabilities for '${harness}' is not an object (got ${typeof c})`, + ); return { - textMessages: c.textMessages ?? true, - images: !!c.images, - fileAttachments: !!c.fileAttachments, - mcpTools: !!c.mcpTools, - toolCalls: !!c.toolCalls, - reasoning: !!c.reasoning, - planMode: !!c.planMode, - permissions: !!c.permissions, - streamingDeltas: !!c.streamingDeltas, - sessionLifecycle: !!c.sessionLifecycle, - usage: true, + source: "probed", + capabilities: { + textMessages: c.textMessages ?? true, + images: !!c.images, + fileAttachments: !!c.fileAttachments, + mcpTools: !!c.mcpTools, + toolCalls: !!c.toolCalls, + reasoning: !!c.reasoning, + planMode: !!c.planMode, + permissions: !!c.permissions, + streamingDeltas: !!c.streamingDeltas, + sessionLifecycle: !!c.sessionLifecycle, + usage: true, + }, }; } // Static fallback by harness id: pi-acp does not forward MCP, Claude/Codex do. const isPiHarness = harness === "pi"; return { - textMessages: true, - images: false, - fileAttachments: false, - mcpTools: !isPiHarness, - toolCalls: true, - reasoning: true, - planMode: !isPiHarness, - permissions: !isPiHarness, - streamingDeltas: true, - sessionLifecycle: true, - usage: true, + source: "static", + capabilities: { + textMessages: true, + images: false, + fileAttachments: false, + mcpTools: !isPiHarness, + toolCalls: true, + reasoning: true, + planMode: !isPiHarness, + permissions: !isPiHarness, + streamingDeltas: true, + sessionLifecycle: true, + usage: true, + }, }; } @@ -42,11 +120,84 @@ export function mapCapabilities(harness: string, info: any): HarnessCapabilities export async function probeCapabilities( sandbox: any, harness: string, -): Promise { +): Promise { + assert( + sandbox && typeof sandbox.getAgent === "function", + "probeCapabilities requires a sandbox with getAgent()", + ); + let probed: ProbedCapabilities; try { const info = await sandbox.getAgent(harness, { config: true }); - return mapCapabilities(harness, info); + probed = mapCapabilities(harness, info); } catch { - return mapCapabilities(harness, undefined); + probed = mapCapabilities(harness, undefined); + } + // Debug assertion: the mapper must always return a complete, boolean-valued flag set so a + // downstream branch never reads an undefined flag (which would coerce to a silent `false`). + for (const flag of CAPABILITY_FLAGS) { + assert( + typeof probed.capabilities[flag] === "boolean", + `capability '${flag}' for '${harness}' is ${typeof probed.capabilities[flag]}, expected boolean`, + ); } + return probed; +} + +export interface AssertCapabilitiesInput { + harness: string; + isPi: boolean; + probed: ProbedCapabilities; + toolSpecs: ResolvedToolSpec[]; + /** Hook for the debug log; the engine passes its stderr logger. */ + log?: (message: string) => void; +} + +/** + * Fail loud when a run REQUIRES a capability the harness does not have, instead of silently + * dropping the behavior the way `buildSessionMcpServers` did before. The one requirement worth + * asserting today is tool delivery, the clearest case: + * + * - Pi delivers tools through its native extension (not MCP), so a Pi run is exempt — its tool + * delivery never depends on the probed `mcpTools`/`toolCalls` flags. + * - Any other harness takes tools over MCP only. If the run carries tool specs but the harness + * does not advertise BOTH `mcpTools` (it can receive an MCP server) AND `toolCalls` (it can + * actually call a tool), those tools would be dropped without a trace. Refuse the run with a + * specific error, mirroring the `*_UNSUPPORTED_MESSAGE` gates in `run-plan.ts`. + * + * Throws on violation; the engine catch turns the throw into `{ ok: false, error }`. + */ +export function assertRequiredCapabilities({ + harness, + isPi, + probed, + toolSpecs, + log = () => {}, +}: AssertCapabilitiesInput): void { + assert( + typeof harness === "string" && harness.length > 0, + "assertRequiredCapabilities requires a non-empty harness id", + ); + const { capabilities, source } = probed; + + // Pi's tools ride its native extension, not the probed MCP capability — nothing to assert. + if (isPi) return; + + const toolCount = toolSpecs.length; + if (toolCount === 0) return; + + const missing: string[] = []; + if (!capabilities.mcpTools) missing.push("mcpTools:false"); + if (!capabilities.toolCalls) missing.push("toolCalls:false"); + if (missing.length === 0) return; + + // A static guess that says a capability is missing is the weakest signal, but the run still + // cannot deliver the tools, so it must fail rather than proceed and drop them. Note the source + // in the log so a debugging operator knows the probe was unavailable. + log( + `capability gate: harness '${harness}' (${source}) lacks ${missing.join(", ")}; ` + + `refusing a run with ${toolCount} tool(s) rather than dropping them`, + ); + throw new Error( + toolDeliveryUnsupportedMessage(harness, missing.join(", "), toolCount), + ); } diff --git a/services/agent/src/engines/sandbox_agent/errors.ts b/services/agent/src/engines/sandbox_agent/errors.ts index dd89229209..1b9f8b9185 100644 --- a/services/agent/src/engines/sandbox_agent/errors.ts +++ b/services/agent/src/engines/sandbox_agent/errors.ts @@ -1,13 +1,50 @@ +/** Map a provider family to its human-facing vault key label, for the credit/auth hint. */ +const PROVIDER_KEY_LABELS: Record = { + openai: "OpenAI", + anthropic: "Anthropic", + gemini: "Gemini", + mistral: "Mistral", + mistralai: "Mistral", + minimax: "MiniMax", + groq: "Groq", + together_ai: "Together AI", + openrouter: "OpenRouter", +}; + +/** + * The vault-key hint phrase for an error, named after the RESOLVED provider rather than the + * harness. A Pi run against an Anthropic model must say "Anthropic key", not "OpenAI key" — the + * harness name (`pi_core`/`claude`) is not the provider, so deriving the hint from it mislabels + * every cross-provider run (e.g. Pi + Anthropic wrongly read "check the project's OpenAI key"). + * + * `provider` is the resolved provider the runner already knows (`request.provider`, from the + * resolved connection). When it is absent (un-migrated caller) fall back to the harness default + * — Claude is always Anthropic; every other harness defaults to OpenAI, matching the old + * behavior for that path only. + */ +function keyHintFor(provider: string | undefined, harness: string): string { + const label = provider ? PROVIDER_KEY_LABELS[provider.toLowerCase()] : undefined; + if (label) return `the project's ${label} key`; + if (harness === "claude") return "the project's Anthropic key"; + return "the project's OpenAI key"; +} + /** * Turn a harness/SDK error into one clear line for the caller instead of dumping a full * ACP/JS stack. Recognizes common harness auth failures. + * + * `provider` is the resolved provider for the run; pass it so the credit/auth hint names the + * actual provider the run targeted, not a provider guessed from the harness name. */ -export function conciseError(err: unknown, harness: string): string { +export function conciseError( + err: unknown, + harness: string, + provider?: string, +): string { const raw = err instanceof Error ? err.message : String(err); const msg = raw.split("\n")[0].trim(); - const keyHint = - harness === "claude" ? "the project's Anthropic key" : "the project's OpenAI key"; - if (/credit balance is too low/i.test(raw)) { + const keyHint = keyHintFor(provider, harness); + if (/credit balance is too low|exceeded your current quota|insufficient_quota/i.test(raw)) { return `${harness}: the model provider account has insufficient credit (check ${keyHint}).`; } if (/authentication required|invalid api key|401|unauthorized/i.test(raw)) { diff --git a/services/agent/src/engines/sandbox_agent/pi-error.ts b/services/agent/src/engines/sandbox_agent/pi-error.ts new file mode 100644 index 0000000000..129b54b982 --- /dev/null +++ b/services/agent/src/engines/sandbox_agent/pi-error.ts @@ -0,0 +1,140 @@ +/** + * Recover a model-call error that Pi swallows on the local sandbox-agent path. + * + * When Pi's provider call fails (out-of-quota, bad key, rate limit, unknown model, ...), + * Pi records the failed turn in its session transcript as an assistant message with + * `stopReason: "error"` and a human-readable `errorMessage`, but its pi-acp bridge reports + * the turn to the runner as a plain `{ stopReason: "end_turn" }` with NO content. The runner + * then returns an `ok: true` run with empty output, and the user sees a silent "No response" + * instead of the real failure. + * + * This reader closes that gap on the LOCAL path (Pi runs on the runner host, so its session + * dir is on this filesystem). After a Pi turn that produced no output, the engine asks this + * helper for the transcript's last assistant `errorMessage`; when present, the run is failed + * loud with that message instead of returning an empty turn. + * + * It is deliberately best-effort and side-effect free: any filesystem/parse problem returns + * `undefined` so a genuinely empty (but successful) turn is never turned into a false error. + * Daytona is out of scope (the transcript lives in the remote sandbox); the empty-turn there + * stays as-is until the harness surfaces the error over ACP. + */ +import { readFileSync, readdirSync, statSync } from "node:fs"; +import { join } from "node:path"; + +/** A Pi transcript `session` record (first line of the .jsonl). */ +interface PiSessionRecord { + type?: string; + id?: string; + cwd?: string; +} + +/** A Pi transcript `message` record. */ +interface PiMessageRecord { + type?: string; + message?: { + role?: string; + stopReason?: string; + errorMessage?: string; + content?: unknown[]; + }; +} + +/** The most recent assistant error in one transcript, or undefined if none / unreadable. */ +function lastAssistantError(jsonlPath: string): string | undefined { + let raw: string; + try { + raw = readFileSync(jsonlPath, "utf8"); + } catch { + return undefined; + } + let found: string | undefined; + for (const line of raw.split("\n")) { + const trimmed = line.trim(); + if (!trimmed) continue; + let record: PiMessageRecord; + try { + record = JSON.parse(trimmed) as PiMessageRecord; + } catch { + continue; + } + const msg = record.message; + if (record.type !== "message" || msg?.role !== "assistant") continue; + // Keep scanning so a later successful turn clears an earlier error; only an error that + // is the LAST assistant turn (and is what produced the empty output) is surfaced. + if (msg.stopReason === "error" && msg.errorMessage) { + found = msg.errorMessage.trim() || undefined; + } else { + found = undefined; + } + } + return found; +} + +/** The `session` record of a transcript file, or undefined if unreadable / not a session. */ +function readSessionRecord(jsonlPath: string): PiSessionRecord | undefined { + let raw: string; + try { + raw = readFileSync(jsonlPath, "utf8"); + } catch { + return undefined; + } + const firstLine = raw.split("\n", 1)[0]?.trim(); + if (!firstLine) return undefined; + try { + const record = JSON.parse(firstLine) as PiSessionRecord; + return record.type === "session" ? record : undefined; + } catch { + return undefined; + } +} + +/** + * Find the Pi transcript for a run (matched by its unique `cwd`) and return the last + * assistant turn's `errorMessage`, or undefined when there is none. + * + * Each run gets a unique cwd, and Pi stamps the cwd on every transcript's `session` record, + * so matching on cwd locates this run's transcript without depending on Pi's dir-name + * encoding. Among matches (a resumed session can have several), the newest file wins. + */ +export function findSwallowedPiError( + piAgentDir: string, + cwd: string, +): string | undefined { + const sessionsRoot = join(piAgentDir, "sessions"); + let dirs: string[]; + try { + dirs = readdirSync(sessionsRoot); + } catch { + return undefined; + } + + let newestPath: string | undefined; + let newestMtime = -1; + for (const dir of dirs) { + const dirPath = join(sessionsRoot, dir); + let files: string[]; + try { + files = readdirSync(dirPath); + } catch { + continue; + } + for (const file of files) { + if (!file.endsWith(".jsonl")) continue; + const filePath = join(dirPath, file); + const session = readSessionRecord(filePath); + if (session?.cwd !== cwd) continue; + let mtime: number; + try { + mtime = statSync(filePath).mtimeMs; + } catch { + continue; + } + if (mtime > newestMtime) { + newestMtime = mtime; + newestPath = filePath; + } + } + } + + return newestPath ? lastAssistantError(newestPath) : undefined; +} diff --git a/services/agent/src/engines/sandbox_agent/run-plan.ts b/services/agent/src/engines/sandbox_agent/run-plan.ts index 537fbcddec..8b146a7ee1 100644 --- a/services/agent/src/engines/sandbox_agent/run-plan.ts +++ b/services/agent/src/engines/sandbox_agent/run-plan.ts @@ -16,6 +16,7 @@ import { type MaterializedSkill, resolveSkillDirs as defaultResolveSkillDirs, } from "../skills.ts"; +import { assert } from "./capabilities.ts"; import { buildTurnText } from "./transcript.ts"; type Log = (message: string) => void; @@ -145,6 +146,14 @@ export function buildRunPlan( const acpAgent = harness === "pi_core" || harness === "pi_agenta" ? "pi" : harness; + // Debug assertion: every Pi identity must resolve to the `pi` ACP agent and nothing else may. + // Catches a future harness-id typo (e.g. a new `pi_*` value forgotten here) at plan-build time + // rather than as a daemon "unknown agent" error mid-run. + assert( + (harness === "pi_core" || harness === "pi_agenta") === (acpAgent === "pi"), + `harness '${harness}' resolved to ACP agent '${acpAgent}', but pi identity mapping disagrees`, + ); + const prompt = resolvePromptText(request); if (!prompt) { return { @@ -229,6 +238,19 @@ export function buildRunPlan( ? request.appendSystemPrompt?.trim() || undefined : undefined; + // Debug assertions: the derived run state must be self-consistent before the engine acts on + // it. A cwd that is empty, or a relay dir not nested under it, would only surface later as a + // confusing filesystem error inside the sandbox. + assert(!!cwd, `buildRunPlan produced an empty cwd for harness '${harness}'`); + assert( + relayDir.startsWith(cwd), + `relay dir '${relayDir}' is not under cwd '${cwd}'`, + ); + assert( + isPi === (acpAgent === "pi"), + `isPi (${isPi}) disagrees with acpAgent '${acpAgent}'`, + ); + return { ok: true, plan: { diff --git a/services/agent/src/tracing/otel.ts b/services/agent/src/tracing/otel.ts index 5a8d98ce7f..eb4acbe012 100644 --- a/services/agent/src/tracing/otel.ts +++ b/services/agent/src/tracing/otel.ts @@ -611,30 +611,107 @@ function acpToolContentText(content: any): string { } /** - * Strip the pi-acp startup banner that some setups emit as the first agent message - * chunk (a "pi vX.Y.Z" / "## Context" / file list / "New version available" prelude, - * surfaced ahead of the real answer). Removes only a leading run of those marker lines - * so a genuine answer is never touched. + * Is this line part of the pi-acp startup banner that some setups emit as the first agent + * message chunk, ahead of the real answer? pi-acp's `buildStartupInfo` produces, in order: + * + * pi v0.79.4 + * --- + * (blank) + * ## Context + * - /tmp/agenta-sandbox-agent-XXXX/AGENTS.md + * (blank) + * ## Skills (when skills are installed) + * - /path/to/skill.md + * (blank) + * New version available: v0.80.2 (installed v0.79.4). Run: `npm i -g @earendil-works/pi-coding-agent` + * + * The markdown markers (`## `, `- `) are stripped when the playground renders the text, so the + * user sees a bare `Context` heading and an unprefixed absolute `.../AGENTS.md` path — but the + * raw chunk still carries the markdown, so we match BOTH the raw and the rendered shapes. The + * "New version available" notice is emitted even when `quietStartup` suppresses the rest, so it + * must be matched on its own. We only ever strip a LEADING run of these lines, so a genuine + * answer that happens to contain such words later is never touched. + */ +export function isBannerLine(line: string): boolean { + const t = line.trim(); + return ( + t === "" || + t === "---" || + /^pi v\d+\.\d+\.\d+\b/.test(t) || + // section heading, raw ("## Context") or rendered ("Context"); same for "Skills" + /^(?:#{1,6}\s*)?(?:Context|Skills)\s*$/.test(t) || + // an AGENTS.md / *.md path item, list-prefixed ("- /…/AGENTS.md") or bare ("/…/AGENTS.md") + /^(?:-\s+)?\/\S*\.md\s*$/.test(t) || + // upgrade notice: "New version available: vX (installed vY). Run: `npm i -g …`" + /^New version available:/.test(t) || + /^Run:\s*`?npm\s+i\b/.test(t) + ); +} + +/** + * Strip a leading run of pi-acp startup-banner lines from `text`. Returns the text past the + * banner (trimmed) when at least one non-blank banner line was seen, otherwise the original. */ -function stripStartupBanner(text: string): string { +export function stripStartupBanner(text: string): string { const lines = text.split("\n"); - const isBanner = (line: string) => - /^pi v\d+\.\d+\.\d+/.test(line) || - /^## Context\b/.test(line) || - /^-\s+\/.*AGENTS\.md\s*$/.test(line) || - /^New version available:/.test(line) || - /^Run: `npm/.test(line) || - line.trim() === "---" || - line.trim() === ""; let i = 0; let sawBanner = false; - while (i < lines.length && isBanner(lines[i])) { + while (i < lines.length && isBannerLine(lines[i])) { if (lines[i].trim() !== "") sawBanner = true; i++; } return sawBanner ? lines.slice(i).join("\n").trim() : text; } +/** + * Streaming-safe variant. Given the cumulative assistant text so far, return the portion that + * is safe to surface as a delta now, plus whether the leading banner region is fully resolved. + * + * The banner always arrives at the START of the stream and may straddle chunk boundaries, so we + * must not classify the LAST line until we know it is complete (a trailing newline, or a later + * chunk, settles it). While the text seen so far is entirely banner-or-blank we return + * `{ body: "", settled: false }` and the caller holds emission; once a non-banner line appears + * we return everything from it onward and never re-buffer again. `start` is the byte offset of + * the body within `text` (after leading whitespace), so the caller can slice later chunks from + * the same offset — the banner is a stable leading prefix of the cumulative stream. + */ +export function splitLeadingBanner(text: string): { + body: string; + settled: boolean; + start: number; +} { + const endsWithNewline = text.endsWith("\n"); + const lines = text.split("\n"); + // The final element is a partial line unless the text ended on a newline. + const lastIsPartial = !endsWithNewline; + let i = 0; + let offset = 0; // byte offset of line i within `text` + let sawBanner = false; + while (i < lines.length) { + const isLast = i === lines.length - 1; + if (isLast && lastIsPartial && sawBanner) { + // We are at the banner boundary with a still-arriving partial line. It could complete into + // either another banner line or the first line of the real answer — only a later chunk (or + // a newline) settles it, so hold. (When no banner has been seen, there is nothing to + // suppress: settle on the partial line and stream it without latency.) + return { body: "", settled: false, start: -1 }; + } + if (!isBannerLine(lines[i])) break; + sawBanner = true; + offset += lines[i].length + 1; // +1 for the consumed "\n" + i++; + } + if (i >= lines.length) { + // Consumed every (complete) line as banner — nothing real has started yet. + return { body: "", settled: false, start: -1 }; + } + const rest = text.slice(offset); + // Drop the blank line(s) that separate the banner from the answer, but never trim a + // banner-free stream — a genuine answer may legitimately begin with whitespace. + const ws = sawBanner ? rest.length - rest.replace(/^\s+/, "").length : 0; + return { body: rest.slice(ws), settled: true, start: offset + ws }; +} + /** Split a resolved model id ("openai-codex/gpt-5.5") into provider + id. */ function splitModel(model?: string): { provider?: string; id?: string } { if (!model) return {}; @@ -767,6 +844,16 @@ export function createSandboxAgentOtel(init: SandboxAgentOtelInit): SandboxAgent let textBlockId: string | undefined; let textEmitted = ""; let anyTextDelta = false; + // Streaming banner suppression: pi-acp emits its startup banner as the FIRST agent message + // chunk, ahead of the real answer. The one-shot `finish()` path strips it from the coalesced + // text, but the streaming path flushes deltas as they arrive, before finish() ever runs — so + // the banner would leak to the client. We hold the leading deltas until the banner region is + // resolved (it may straddle chunk boundaries), then stream only the body past it. Once a real + // line has started, `bannerSettled` latches true and we never strip again. + let bannerSettled = false; + // Once the banner region resolves, the real answer begins at this byte offset in `accumulated`; + // every later chunk is streamed as `accumulated.slice(bannerEnd)` so the banner never reappears. + let bannerEnd = 0; let reasoningBlockId: string | undefined; let reasoningEmitted = ""; let blockSeq = 0; @@ -800,6 +887,25 @@ export function createSandboxAgentOtel(init: SandboxAgentOtelInit): SandboxAgent anyTextDelta = true; } + /** + * Stream the assistant text for the cumulative `accumulated`, with the leading startup banner + * suppressed. Until the banner region resolves we emit nothing; afterwards we stream the body + * past it. `body` is a prefix-growing view of the real answer, so `streamText`'s delta logic + * stays correct. + */ + function streamAssistantText(): void { + if (bannerSettled) { + // The banner is a stable leading prefix; stream everything past it. + streamText(accumulated.slice(bannerEnd)); + return; + } + const { body, settled, start } = splitLeadingBanner(accumulated); + if (!settled) return; // banner region still arriving — hold emission + bannerSettled = true; + bannerEnd = start; + if (body) streamText(body); + } + /** Open (if needed) the reasoning block and emit the pure delta up to `target`. */ function streamReasoning(target: string): void { closeText(); // a reasoning chunk ends any open text run @@ -869,8 +975,9 @@ export function createSandboxAgentOtel(init: SandboxAgentOtelInit): SandboxAgent else accumulated += t; // Live deltas run independent of span emission (text, not a span), so they flow even // when the harness self-instruments (emitSpans=false). `accumulated` is the cumulative - // text, so the pure delta is its tail past what we already sent. - if (sink) streamText(accumulated); + // text, so the pure delta is its tail past what we already sent — minus the leading + // startup banner, which is held back until the body begins (see streamAssistantText). + if (sink) streamAssistantText(); return; } diff --git a/services/agent/tests/unit/sandbox-agent-capabilities.test.ts b/services/agent/tests/unit/sandbox-agent-capabilities.test.ts index 6d8f80c081..b96f4a4ece 100644 --- a/services/agent/tests/unit/sandbox-agent-capabilities.test.ts +++ b/services/agent/tests/unit/sandbox-agent-capabilities.test.ts @@ -1,34 +1,25 @@ /** - * Unit tests for sandbox-agent capability mapping. + * Unit tests for sandbox-agent capability mapping, the fail-loud capability gate, and the + * debug assertions. * * Run: pnpm test (or: pnpm exec vitest run tests/unit/sandbox-agent-capabilities.test.ts) */ import { describe, it } from "vitest"; import assert from "node:assert/strict"; +import type { ResolvedToolSpec } from "../../src/protocol.ts"; import { + assert as invariant, + assertRequiredCapabilities, mapCapabilities, probeCapabilities, + type ProbedCapabilities, } from "../../src/engines/sandbox_agent/capabilities.ts"; describe("mapCapabilities", () => { - it("maps probed sandbox-agent capabilities and always enables usage", () => { - assert.deepEqual( - mapCapabilities("claude", { - capabilities: { - textMessages: false, - images: true, - fileAttachments: true, - mcpTools: true, - toolCalls: true, - reasoning: true, - planMode: true, - permissions: true, - streamingDeltas: true, - sessionLifecycle: true, - }, - }), - { + it("maps probed sandbox-agent capabilities, always enables usage, marks the source probed", () => { + const out = mapCapabilities("claude", { + capabilities: { textMessages: false, images: true, fileAttachments: true, @@ -39,26 +30,58 @@ describe("mapCapabilities", () => { permissions: true, streamingDeltas: true, sessionLifecycle: true, - usage: true, }, - ); + }); + assert.equal(out.source, "probed"); + assert.deepEqual(out.capabilities, { + textMessages: false, + images: true, + fileAttachments: true, + mcpTools: true, + toolCalls: true, + reasoning: true, + planMode: true, + permissions: true, + streamingDeltas: true, + sessionLifecycle: true, + usage: true, + }); + }); + + it("falls back to no MCP for Pi and MCP for non-Pi harnesses, marked static", () => { + const pi = mapCapabilities("pi", undefined); + assert.equal(pi.source, "static"); + assert.equal(pi.capabilities.mcpTools, false); + + const claude = mapCapabilities("claude", undefined); + assert.equal(claude.source, "static"); + assert.equal(claude.capabilities.mcpTools, true); + }); + + it("invariant: rejects an empty harness id", () => { + assert.throws(() => mapCapabilities("", undefined), /non-empty harness id/); }); - it("falls back to no MCP for Pi and MCP for non-Pi harnesses", () => { - assert.equal(mapCapabilities("pi", undefined).mcpTools, false); - assert.equal(mapCapabilities("claude", undefined).mcpTools, true); + it("invariant: rejects a non-object probed capabilities payload", () => { + assert.throws( + () => mapCapabilities("claude", { capabilities: "yes" }), + /not an object/, + ); }); }); describe("probeCapabilities", () => { - it("uses the daemon probe when available", async () => { + it("uses the daemon probe when available and reports the source", async () => { const sandbox = { - getAgent: async () => ({ capabilities: { mcpTools: true, permissions: true } }), + getAgent: async () => ({ + capabilities: { mcpTools: true, permissions: true, toolCalls: true }, + }), }; const out = await probeCapabilities(sandbox, "claude"); - assert.equal(out.mcpTools, true); - assert.equal(out.permissions, true); + assert.equal(out.source, "probed"); + assert.equal(out.capabilities.mcpTools, true); + assert.equal(out.capabilities.permissions, true); }); it("uses static fallback when probing fails", async () => { @@ -68,6 +91,123 @@ describe("probeCapabilities", () => { }, }; - assert.equal((await probeCapabilities(sandbox, "pi")).mcpTools, false); + const out = await probeCapabilities(sandbox, "pi"); + assert.equal(out.source, "static"); + assert.equal(out.capabilities.mcpTools, false); + }); + + it("returns a complete boolean-valued flag set even when the probe omits flags", async () => { + const sandbox = { + // A partial probe payload: every absent flag must still come back a boolean. + getAgent: async () => ({ capabilities: { mcpTools: true } }), + }; + const out = await probeCapabilities(sandbox, "claude"); + for (const value of Object.values(out.capabilities)) { + assert.equal(typeof value, "boolean"); + } + }); + + it("invariant: rejects a sandbox without getAgent", async () => { + await assert.rejects( + () => probeCapabilities({}, "claude"), + /requires a sandbox with getAgent/, + ); + }); +}); + +function probed( + flags: Record, + source: "probed" | "static" = "probed", +): ProbedCapabilities { + return { source, capabilities: flags as any }; +} + +const tool: ResolvedToolSpec = { name: "server_tool", kind: "callback" }; + +describe("assertRequiredCapabilities (fail loud on tool delivery)", () => { + it("passes a non-Pi harness that advertises mcpTools + toolCalls", () => { + assert.doesNotThrow(() => + assertRequiredCapabilities({ + harness: "claude", + isPi: false, + probed: probed({ mcpTools: true, toolCalls: true }), + toolSpecs: [tool], + }), + ); + }); + + it("throws a specific error when the harness lacks mcpTools", () => { + assert.throws( + () => + assertRequiredCapabilities({ + harness: "claude", + isPi: false, + probed: probed({ mcpTools: false, toolCalls: true }), + toolSpecs: [tool], + }), + /harness 'claude' cannot receive tools.*mcpTools:false.*1 tool/s, + ); + }); + + it("throws naming both missing flags when neither is advertised", () => { + assert.throws( + () => + assertRequiredCapabilities({ + harness: "codex", + isPi: false, + probed: probed({ mcpTools: false, toolCalls: false }), + toolSpecs: [tool, { name: "second", kind: "callback" }], + }), + /mcpTools:false, toolCalls:false.*2 tool/s, + ); + }); + + it("exempts Pi (tools ride its native extension, not the probed MCP flags)", () => { + assert.doesNotThrow(() => + assertRequiredCapabilities({ + harness: "pi_core", + isPi: true, + probed: probed({ mcpTools: false, toolCalls: false }), + toolSpecs: [tool], + }), + ); + }); + + it("is a no-op when the run carries no tools", () => { + assert.doesNotThrow(() => + assertRequiredCapabilities({ + harness: "claude", + isPi: false, + probed: probed({ mcpTools: false, toolCalls: false }), + toolSpecs: [], + }), + ); + }); + + it("logs the source so a failed-probe static guess is debuggable", () => { + const logs: string[] = []; + assert.throws(() => + assertRequiredCapabilities({ + harness: "claude", + isPi: false, + probed: probed({ mcpTools: false, toolCalls: true }, "static"), + toolSpecs: [tool], + log: (m) => logs.push(m), + }), + ); + assert.ok(logs.some((m) => m.includes("(static)"))); + }); +}); + +describe("assert (debug invariant)", () => { + it("throws a prefixed message when the condition is false", () => { + assert.throws( + () => invariant(false, "boom"), + /\[sandbox-agent invariant\] boom/, + ); + }); + + it("does nothing when the condition holds", () => { + assert.doesNotThrow(() => invariant(true, "fine")); }); }); diff --git a/services/agent/tests/unit/sandbox-agent-errors.test.ts b/services/agent/tests/unit/sandbox-agent-errors.test.ts index f3d9c15e23..ec303cdbf8 100644 --- a/services/agent/tests/unit/sandbox-agent-errors.test.ts +++ b/services/agent/tests/unit/sandbox-agent-errors.test.ts @@ -16,6 +16,18 @@ describe("conciseError", () => { ); }); + it("formats OpenAI quota failures as insufficient credit", () => { + assert.equal( + conciseError( + new Error( + "You exceeded your current quota, please check your plan and billing details.", + ), + "pi", + ), + "pi: the model provider account has insufficient credit (check the project's OpenAI key).", + ); + }); + it("formats auth failures with the right provider hint", () => { assert.equal( conciseError(new Error("Authentication required"), "pi"), @@ -23,6 +35,48 @@ describe("conciseError", () => { ); }); + it("names the resolved provider, not the harness, for a Pi+Anthropic run", () => { + // The bug: a Pi run against an Anthropic model that fails auth must NOT say "OpenAI key". + assert.equal( + conciseError(new Error("Authentication required"), "pi_core", "anthropic"), + "pi_core: model authentication failed — add the project's Anthropic key to the project vault, or log in (OAuth).", + ); + }); + + it("names the resolved provider for a Pi+Anthropic credit failure", () => { + assert.equal( + conciseError(new Error("credit balance is too low"), "pi_core", "anthropic"), + "pi_core: the model provider account has insufficient credit (check the project's Anthropic key).", + ); + }); + + it("keeps the OpenAI hint when the resolved provider is openai on Pi", () => { + assert.equal( + conciseError(new Error("insufficient_quota"), "pi_core", "openai"), + "pi_core: the model provider account has insufficient credit (check the project's OpenAI key).", + ); + }); + + it("falls back to the harness default when no provider is resolved", () => { + // Un-migrated caller (no provider on the wire): keep the old harness-derived behavior. + assert.equal( + conciseError(new Error("401 unauthorized"), "claude"), + "claude: model authentication failed — add the project's Anthropic key to the project vault, or log in (OAuth).", + ); + assert.equal( + conciseError(new Error("401 unauthorized"), "pi_core"), + "pi_core: model authentication failed — add the project's OpenAI key to the project vault, or log in (OAuth).", + ); + }); + + it("falls back to the harness default for an unknown custom provider", () => { + // A custom router slug we have no key label for: do not invent one, use the harness default. + assert.equal( + conciseError(new Error("Authentication required"), "pi_core", "openai-codex"), + "pi_core: model authentication failed — add the project's OpenAI key to the project vault, or log in (OAuth).", + ); + }); + it("falls back to the first line", () => { assert.equal(conciseError(new Error("first line\nsecond line"), "pi"), "first line"); }); diff --git a/services/agent/tests/unit/sandbox-agent-orchestration.test.ts b/services/agent/tests/unit/sandbox-agent-orchestration.test.ts index 440b65c545..e8e228facd 100644 --- a/services/agent/tests/unit/sandbox-agent-orchestration.test.ts +++ b/services/agent/tests/unit/sandbox-agent-orchestration.test.ts @@ -162,10 +162,14 @@ function fakeHarness(options: FakeOptions = {}) { }) as any, probeCapabilities: async () => ({ - mcpTools: true, - usage: true, - streamingDeltas: true, - ...(options.capabilities ?? {}), + source: "probed", + capabilities: { + mcpTools: true, + toolCalls: true, + usage: true, + streamingDeltas: true, + ...(options.capabilities ?? {}), + }, }) as any, applyModel: async (_session, model, _log, options) => { calls.applyModelArgs.push({ model, options }); @@ -347,6 +351,54 @@ describe("runSandboxAgent orchestration", () => { }); }); + it("fails loud when a non-Pi harness probes mcpTools:false but the run carries tools", async () => { + // A7: the capability gate runs BEFORE the MCP bridge, so a harness whose probe reports it + // cannot receive tools fails with a SPECIFIC capability error (not the generic MCP-disabled + // line, and never a silent drop). This is the silent-degradation case the staff review + // flagged: a wrong/missing `mcpTools` flag must error, not change behavior quietly. + const { calls, deps } = fakeHarness({ capabilities: { mcpTools: false } }); + + const result = await runSandboxAgent( + { + harness: "claude", + prompt: "use the tool", + customTools: [{ name: "server_tool", kind: "callback" }], + } as AgentRunRequest, + undefined, + undefined, + deps, + ); + + assert.equal(result.ok, false); + if (result.ok) return; + assert.match(result.error ?? "", /harness 'claude' cannot receive tools/); + assert.match(result.error ?? "", /mcpTools:false/); + // The gate fires before the session is created, so no session/prompt happened. + assert.equal(calls.createSessionOptions, undefined); + // The acquired sandbox is still disposed in the finally. + assert.equal(calls.sandboxDestroyed, 1); + }); + + it("does not gate tool delivery for a Pi run even when mcpTools is false", async () => { + // Pi delivers tools through its native extension, not MCP, so a Pi run with tools and a + // `mcpTools:false` probe must proceed (the gate exempts Pi). + const { calls, deps } = fakeHarness({ capabilities: { mcpTools: false } }); + + const result = await runSandboxAgent( + { + harness: "pi_core", + prompt: "use the tool", + customTools: [{ name: "server_tool", kind: "callback" }], + } as AgentRunRequest, + undefined, + undefined, + deps, + ); + + assert.equal(result.ok, true); + assert.notEqual(calls.createSessionOptions, undefined); + }); + it("flushes a partial trace and cleans up on prompt errors", async () => { const { calls, deps } = fakeHarness({ promptError: new Error("boom") }); diff --git a/services/agent/tests/unit/sandbox-agent-pi-error.test.ts b/services/agent/tests/unit/sandbox-agent-pi-error.test.ts new file mode 100644 index 0000000000..d9cf4c9410 --- /dev/null +++ b/services/agent/tests/unit/sandbox-agent-pi-error.test.ts @@ -0,0 +1,124 @@ +/** + * Unit tests for recovering a model error Pi swallows on the local sandbox-agent path. + * + * When Pi's provider call fails, Pi records the failed turn in its session transcript with + * `stopReason: "error"` + `errorMessage`, but reports a plain `end_turn` with no content over + * ACP. `findSwallowedPiError` reads that transcript so the empty turn can fail loud. + * + * Run: pnpm test (or: pnpm exec vitest run tests/unit/sandbox-agent-pi-error.test.ts) + */ +import { afterEach, describe, it } from "vitest"; +import assert from "node:assert/strict"; +import { mkdtempSync, mkdirSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +import { findSwallowedPiError } from "../../src/engines/sandbox_agent/pi-error.ts"; + +const dirs: string[] = []; + +function tempDir(): string { + const dir = mkdtempSync(join(tmpdir(), "agenta-pi-error-test-")); + dirs.push(dir); + return dir; +} + +/** Write a Pi-style .jsonl transcript under `/sessions//` for `cwd`. */ +function writeTranscript( + piAgentDir: string, + name: string, + cwd: string, + messages: Array>, +): void { + const dir = join(piAgentDir, "sessions", name); + mkdirSync(dir, { recursive: true }); + const lines = [ + JSON.stringify({ type: "session", version: 3, id: name, cwd }), + ...messages.map((m) => JSON.stringify(m)), + ]; + writeFileSync(join(dir, `${name}.jsonl`), lines.join("\n") + "\n"); +} + +afterEach(() => { + for (const dir of dirs.splice(0)) rmSync(dir, { recursive: true, force: true }); +}); + +describe("findSwallowedPiError", () => { + it("returns the last assistant errorMessage for a matching cwd", () => { + const piAgentDir = tempDir(); + const cwd = "/tmp/agenta-sandbox-agent-abc123"; + writeTranscript(piAgentDir, "--tmp-agenta-sandbox-agent-abc123--", cwd, [ + { type: "message", message: { role: "user", content: [{ type: "text", text: "hi" }] } }, + { + type: "message", + message: { + role: "assistant", + content: [], + stopReason: "error", + errorMessage: "You exceeded your current quota, please check your plan.", + }, + }, + ]); + + const error = findSwallowedPiError(piAgentDir, cwd); + assert.equal(error, "You exceeded your current quota, please check your plan."); + }); + + it("returns undefined for a successful turn", () => { + const piAgentDir = tempDir(); + const cwd = "/tmp/agenta-sandbox-agent-ok"; + writeTranscript(piAgentDir, "--tmp-agenta-sandbox-agent-ok--", cwd, [ + { + type: "message", + message: { + role: "assistant", + content: [{ type: "text", text: "hello" }], + stopReason: "end_turn", + }, + }, + ]); + + assert.equal(findSwallowedPiError(piAgentDir, cwd), undefined); + }); + + it("does not surface an error cleared by a later successful turn", () => { + const piAgentDir = tempDir(); + const cwd = "/tmp/agenta-sandbox-agent-recovered"; + writeTranscript(piAgentDir, "--recovered--", cwd, [ + { + type: "message", + message: { role: "assistant", content: [], stopReason: "error", errorMessage: "transient" }, + }, + { + type: "message", + message: { + role: "assistant", + content: [{ type: "text", text: "recovered" }], + stopReason: "end_turn", + }, + }, + ]); + + assert.equal(findSwallowedPiError(piAgentDir, cwd), undefined); + }); + + it("ignores transcripts for a different cwd", () => { + const piAgentDir = tempDir(); + writeTranscript(piAgentDir, "--other--", "/tmp/some-other-cwd", [ + { + type: "message", + message: { role: "assistant", content: [], stopReason: "error", errorMessage: "nope" }, + }, + ]); + + assert.equal( + findSwallowedPiError(piAgentDir, "/tmp/agenta-sandbox-agent-missing"), + undefined, + ); + }); + + it("returns undefined when the sessions dir is absent", () => { + const piAgentDir = tempDir(); + assert.equal(findSwallowedPiError(piAgentDir, "/tmp/whatever"), undefined); + }); +}); diff --git a/services/agent/tests/unit/startup-banner.test.ts b/services/agent/tests/unit/startup-banner.test.ts new file mode 100644 index 0000000000..4912373961 --- /dev/null +++ b/services/agent/tests/unit/startup-banner.test.ts @@ -0,0 +1,212 @@ +/** + * Unit tests for pi-acp startup-banner stripping. + * + * pi-acp emits its startup banner as the FIRST agent message chunk, ahead of the real answer + * (a "pi vX.Y.Z" / "## Context" / AGENTS.md path list / "New version available" prelude). The + * playground renders the markdown, so the user sees a bare "Context" heading and an unprefixed + * absolute path. These tests pin both the coalesced stripper (`stripStartupBanner`, used by the + * one-shot `finish()` path) and the streaming-safe splitter (`splitLeadingBanner`, used to hold + * leading deltas until the banner region resolves), plus an end-to-end streaming assertion + * through `createSandboxAgentOtel`. + * + * Run: pnpm test (or: pnpm exec vitest run tests/unit/startup-banner.test.ts) + */ +import { describe, it } from "vitest"; +import assert from "node:assert/strict"; + +import { + createSandboxAgentOtel, + isBannerLine, + splitLeadingBanner, + stripStartupBanner, +} from "../../src/tracing/otel.ts"; +import type { AgentEvent } from "../../src/protocol.ts"; + +// The exact banner the user saw, in its RAW (pre-render) markdown form: pi-acp's buildStartupInfo +// output followed by buildUpdateNotice. The playground strips "## " and "- " when rendering, so +// the user-visible text is the same minus those markers. +const RAW_BANNER = [ + "pi v0.79.4", + "---", + "", + "## Context", + "- /tmp/agenta-sandbox-agent-UxwfPW/AGENTS.md", + "", + "---", + "New version available: v0.80.2 (installed v0.79.4). Run: `npm i -g @earendil-works/pi-coding-agent`", +].join("\n"); +// ^ verbatim from a live `pi-acp@0.0.29` session/new (`_meta.piAcp.startupInfo`) on this stack: +// "pi v0.79.4\n---\n\n## Context\n- /tmp/.../AGENTS.md\n\n---\nNew version available: v0.80.2 ...". + +// The same banner as the user reported it (already markdown-rendered): bare "Context", unprefixed +// AGENTS.md path, no fenced npm command. +const RENDERED_BANNER = [ + "pi v0.79.4", + "Context", + "/tmp/agenta-sandbox-UxwfPW/AGENTS.md", + "New version available: v0.80.2 (installed v0.79.4).", + "Run: npm i -g @earendil-works/pi-coding-agent", +].join("\n"); + +describe("isBannerLine", () => { + it("matches every banner-line variant the user saw", () => { + for (const line of [ + "pi v0.79.4", + "pi v0.80.2 (something)", + "---", + "", + " ", + "## Context", + "Context", + "## Skills", + "Skills", + "- /tmp/agenta-sandbox-agent-UxwfPW/AGENTS.md", + "/tmp/agenta-sandbox-UxwfPW/AGENTS.md", + "- /pi-agent/skills/foo.md", + "New version available: v0.80.2 (installed v0.79.4). Run: `npm i -g @earendil-works/pi-coding-agent`", + "New version available: v0.80.2 (installed v0.79.4).", + "Run: `npm i -g @earendil-works/pi-coding-agent`", + "Run: npm i -g @earendil-works/pi-coding-agent", + ]) { + assert.equal(isBannerLine(line), true, `expected banner: ${JSON.stringify(line)}`); + } + }); + + it("does not match genuine answer lines", () => { + for (const line of [ + "4", + "The answer is 4.", + "Here is the context you asked for:", + "I will run: npm test", // not "npm i -g" + "Contextual information follows", // not a bare heading + "See /etc/hosts for details", // not a .md path + ]) { + assert.equal(isBannerLine(line), false, `expected non-banner: ${JSON.stringify(line)}`); + } + }); +}); + +describe("stripStartupBanner (coalesced / finish path)", () => { + it("strips the raw markdown banner the user saw", () => { + assert.equal(stripStartupBanner(RAW_BANNER + "\n4"), "4"); + }); + + it("strips the rendered banner the user saw", () => { + assert.equal(stripStartupBanner(RENDERED_BANNER + "\nThe answer is 4."), "The answer is 4."); + }); + + it("strips a multi-line answer's banner but keeps the whole answer", () => { + const answer = "Line one of the answer.\nLine two.\n- bullet point"; + assert.equal(stripStartupBanner(RAW_BANNER + "\n" + answer), answer); + }); + + it("leaves a clean answer untouched", () => { + assert.equal(stripStartupBanner("4"), "4"); + assert.equal( + stripStartupBanner("The context of this question is simple: 4."), + "The context of this question is simple: 4.", + ); + }); + + it("returns empty when the text is only the banner", () => { + assert.equal(stripStartupBanner(RAW_BANNER), ""); + }); +}); + +describe("splitLeadingBanner (streaming path)", () => { + it("holds emission while only banner-or-blank has arrived", () => { + for (const text of ["pi v0.79.4\n", RAW_BANNER, RAW_BANNER + "\n"]) { + const { body, settled } = splitLeadingBanner(text); + assert.deepEqual({ body, settled }, { body: "", settled: false }); + } + }); + + it("does not judge an incomplete trailing line", () => { + // "Run: `npm i" could become the upgrade line OR a real answer; wait for the newline. + const { body, settled } = splitLeadingBanner("pi v0.79.4\n---\nRun: `npm i"); + assert.deepEqual({ body, settled }, { body: "", settled: false }); + }); + + it("settles and returns the body once a real line completes", () => { + // Streaming preserves the answer faithfully, including a trailing newline; only the leading + // blank line(s) between banner and answer are dropped. + const { body, settled, start } = splitLeadingBanner(RAW_BANNER + "\n4\n"); + assert.equal(settled, true); + assert.equal(body, "4\n"); + // `start` points exactly at the body within the original text. + assert.equal((RAW_BANNER + "\n4\n").slice(start, start + body.length), body); + }); + + it("holds the first real line until its newline arrives (boundary safety)", () => { + // At the banner boundary a still-arriving partial line ("4", no newline) could complete into + // either a banner line or the answer, so we hold and let the finish path strip it instead. + const { settled } = splitLeadingBanner(RAW_BANNER + "\n4"); + assert.equal(settled, false); + }); + + it("settles immediately for a banner-free stream", () => { + const { body, settled, start } = splitLeadingBanner("4"); + assert.deepEqual({ body, settled, start }, { body: "4", settled: true, start: 0 }); + }); +}); + +describe("createSandboxAgentOtel streaming suppresses the banner", () => { + const textChunk = (text: string) => ({ + sessionUpdate: "agent_message_chunk", + content: { type: "text", text }, + }); + + it("never streams banner deltas, only the real answer", () => { + const emitted: AgentEvent[] = []; + const run = createSandboxAgentOtel({ + harness: "pi", + model: "openai/o4-mini", + emitSpans: false, + emit: (e) => emitted.push(e), + }); + run.start({ prompt: "What is 2+2?" }); + // pi streams pure deltas; the banner arrives first, then the answer, split across chunks. + run.handleUpdate(textChunk("pi v0.79.4\n---\n\n## Context\n- /tmp/x/AGENTS.md\n\n")); + run.handleUpdate( + textChunk("New version available: v0.80.2 (installed v0.79.4). Run: `npm i -g @earendil-works/pi-coding-agent`\n"), + ); + // First real line completes with a newline -> the splitter settles and streams the body + // INCREMENTALLY (before finish), exercising the live path, not just the finish fallback. + run.handleUpdate(textChunk("The answer is 4.\n")); // pure delta (pi-style) + const beforeFinish = emitted.filter((e) => e.type === "message_delta").length; + run.handleUpdate(textChunk("Thanks!")); // pure delta (pi-style) + run.finish(); + assert.ok(beforeFinish > 0, "body was not streamed incrementally before finish"); + + const streamed = emitted + .filter((e): e is Extract => e.type === "message_delta") + .map((e) => e.delta) + .join(""); + assert.equal(streamed, "The answer is 4.\nThanks!"); + assert.ok(!streamed.includes("pi v0.79.4"), "banner version leaked"); + assert.ok(!streamed.includes("New version available"), "upgrade notice leaked"); + assert.ok(!streamed.includes("AGENTS.md"), "context file path leaked"); + }); + + it("strips the banner on finish when the stream had no clean body line", () => { + // A short, banner-only-then-inline answer that never produced a newline before finish. + const emitted: AgentEvent[] = []; + const run = createSandboxAgentOtel({ + harness: "pi", + model: "openai/o4-mini", + emitSpans: false, + emit: (e) => emitted.push(e), + }); + run.start({ prompt: "hi" }); + // The whole banner plus the answer arrive in ONE chunk with no trailing newline. The + // streaming splitter holds (last line partial), so finish() must do the stripping. + run.handleUpdate(textChunk(RAW_BANNER + "\n4")); + run.finish(); + + const streamed = emitted + .filter((e): e is Extract => e.type === "message_delta") + .map((e) => e.delta) + .join(""); + assert.equal(streamed, "4"); + }); +});