diff --git a/docs/design/agent-workflows/interfaces/cross-service/runner-to-harness.md b/docs/design/agent-workflows/interfaces/cross-service/runner-to-harness.md
index 81766f1dfe..d2d9be835c 100644
--- a/docs/design/agent-workflows/interfaces/cross-service/runner-to-harness.md
+++ b/docs/design/agent-workflows/interfaces/cross-service/runner-to-harness.md
@@ -29,7 +29,16 @@ returns `HarnessCapabilities`:
 ```
 
 The probe gates real behavior. Tools only go over MCP when `mcpTools` is true, for example.
-If the probe fails, the runner falls back to a static capability policy.
+If the probe fails, the runner falls back to a static capability policy, and the result records
+whether the flags were `probed` or a `static` guess.
+
+The runner no longer silently degrades when a run requires a capability the harness lacks.
+`assertRequiredCapabilities` (in `capabilities.ts`) fails the run with a specific error — the
+same fail-loud pattern as the `*_UNSUPPORTED_MESSAGE` gates in `run-plan.ts` and the
+`CODE_TOOL_UNSUPPORTED_MESSAGE` gate in `tools/code.ts`. Today the asserted requirement is tool
+delivery: a non-Pi run carrying tool specs whose probe reports `mcpTools:false` or
+`toolCalls:false` errors instead of dropping the tools without a trace. Pi is exempt (its tools
+ride the native extension, not MCP), and a run with no tools is unaffected.
 
 **The ACP event stream.** The harness emits ACP updates that the runner maps to neutral
 events:
@@ -58,8 +67,11 @@ tears the sandbox down in the `finally` path.
 
 - **Harness selection and the `pi_core`/`pi_agenta` to `pi` remap.** New harnesses thread
   through here.
-- **The capability probe.** It gates tool delivery, permissions, and streaming. A wrong flag
-  silently changes behavior rather than erroring.
+- **The capability probe.** It gates tool delivery, permissions, and streaming. A run that
+  REQUIRES a capability the harness lacks now fails loud (`assertRequiredCapabilities`) rather
+  than silently dropping the behavior — today that covers tool delivery to a non-Pi harness.
+  Other flags (permissions, streaming) still degrade silently; assert the next requirement here
+  if a flag must be a hard gate.
 - **ACP event mapping.** A missed or mis-mapped update drops content from the stream.
 - **Pi versus Claude divergence.** Both run over ACP, but Pi takes tools natively and
   self-instruments traces, while Claude takes tools over MCP and the runner builds the spans.
diff --git a/services/agent/src/engines/sandbox_agent.ts b/services/agent/src/engines/sandbox_agent.ts
index fafc4cec19..51c1f266da 100644
--- a/services/agent/src/engines/sandbox_agent.ts
+++ b/services/agent/src/engines/sandbox_agent.ts
@@ -46,7 +46,11 @@ import {
   type ToolCallbackContext,
   resolveRunSessionId,
 } from "../protocol.ts";
-import { probeCapabilities } from "./sandbox_agent/capabilities.ts";
+import {
+  assert,
+  assertRequiredCapabilities,
+  probeCapabilities,
+} from "./sandbox_agent/capabilities.ts";
 import { buildDaemonEnv, resolveDaemonBinary } from "./sandbox_agent/daemon.ts";
 import {
   createCookieFetch,
@@ -55,6 +59,7 @@ import {
 import { conciseError } from "./sandbox_agent/errors.ts";
 import { buildSessionMcpServers } from "./sandbox_agent/mcp.ts";
 import { applyModel } from "./sandbox_agent/model.ts";
+import { findSwallowedPiError } from "./sandbox_agent/pi-error.ts";
 import {
   buildPiExtensionEnv,
   prepareLocalPiAssets,
@@ -81,7 +86,12 @@ function log(message: string): void {
 
 type Log = (message: string) => void;
 
-const CLAUDE_STRICT_DEPLOYMENTS = new Set(["custom", "bedrock", "vertex", "vertex_ai"]);
+const CLAUDE_STRICT_DEPLOYMENTS = new Set([
+  "custom",
+  "bedrock",
+  "vertex",
+  "vertex_ai",
+]);
 
 function applyClaudeConnectionEnv(
   env: Record<string, string>,
@@ -110,7 +120,10 @@ function applyClaudeConnectionEnv(
     env.CLAUDE_CODE_USE_VERTEX = "1";
   }
 
-  if (selectedModel && (baseUrl || (deployment && CLAUDE_STRICT_DEPLOYMENTS.has(deployment)))) {
+  if (
+    selectedModel &&
+    (baseUrl || (deployment && CLAUDE_STRICT_DEPLOYMENTS.has(deployment)))
+  ) {
     env.ANTHROPIC_MODEL = selectedModel;
     env.ANTHROPIC_CUSTOM_MODEL_OPTION = selectedModel;
     return true;
@@ -162,7 +175,12 @@ export async function runSandboxAgent(
     clearProviderEnv,
   });
   Object.assign(env, plan.secrets); // apply only the resolved provider keys
-  const strictModel = applyClaudeConnectionEnv(env, request, plan.acpAgent, logger);
+  const strictModel = applyClaudeConnectionEnv(
+    env,
+    request,
+    plan.acpAgent,
+    logger,
+  );
   // Pi self-instruments locally: propagate the trace context + public tool metadata into Pi
   // via the Agenta extension. Tool execution always relays back to this runner, which keeps
   // private specs, scoped env, callback endpoints, and callback auth in memory.
@@ -233,14 +251,36 @@ export async function runSandboxAgent(
       log: logger,
     });
 
+    // Sandbox-start invariant: `startSandboxAgent` must hand back a usable handle, or the
+    // probe/createSession below fail with an opaque "cannot read property of undefined".
+    assert(
+      sandbox && typeof sandbox.createSession === "function",
+      `sandbox provider '${plan.sandboxId}' returned no usable sandbox handle`,
+    );
+
     // Probe what this harness supports and branch on capabilities, not on the harness
     // name. Tool delivery: Pi loads our extension (native tools, set up above); any other
     // harness takes tools over MCP only when it advertises `mcpTools` (pi-acp does not
     // forward MCP, Claude/Codex do).
-    const capabilities = await (deps.probeCapabilities ?? probeCapabilities)(
+    const probed = await (deps.probeCapabilities ?? probeCapabilities)(
       sandbox,
       plan.acpAgent,
     );
+    const capabilities = probed.capabilities;
+
+    // Fail loud (A7): a run that REQUIRES a capability the harness lacks errors with a
+    // specific message instead of silently dropping the behavior, the way the
+    // `*_UNSUPPORTED_MESSAGE` gates in `run-plan.ts` do. Today: tool delivery to a non-Pi
+    // harness whose probe reports `mcpTools:false` / `toolCalls:false`. The throw is caught
+    // below and returned as `{ ok: false, error }`.
+    assertRequiredCapabilities({
+      harness: plan.harness,
+      isPi: plan.isPi,
+      probed,
+      toolSpecs: plan.toolSpecs,
+      log: logger,
+    });
+
     const mcpServers = buildSessionMcpServers({
       isPi: plan.isPi,
       capabilities,
@@ -354,6 +394,29 @@ export async function runSandboxAgent(
     const output = run.finish();
     await run.flush();
 
+    // Fail loud on a swallowed model error (A7 / "fail loud, not silent"). When Pi's provider
+    // call fails (out-of-quota, bad key, rate limit, unknown model, ...), Pi's pi-acp bridge
+    // reports the turn as a plain `end_turn` with NO content, so without this the run would
+    // return an `ok:true` empty turn and the user would see a silent "No response" instead of
+    // the real failure. On the LOCAL Pi path the error is recoverable from Pi's own session
+    // transcript; surface it as a run error. Only checked when the turn produced no output and
+    // ran no tools (a real tool-only turn legitimately has empty text), and never on Daytona
+    // (the transcript lives in the remote sandbox).
+    if (
+      plan.isPi &&
+      !plan.isDaytona &&
+      !output.trim() &&
+      !run.events().some((e) => e.type === "tool_call")
+    ) {
+      const piError = findSwallowedPiError(plan.sourcePiAgentDir, plan.cwd);
+      if (piError) {
+        return {
+          ok: false,
+          error: conciseError(new Error(piError), plan.harness, request.provider),
+        };
+      }
+    }
+
     return {
       ok: true,
       output,
@@ -376,7 +439,7 @@ export async function runSandboxAgent(
   } catch (err) {
     otel?.finish();
     await otel?.flush().catch(() => {});
-    return { ok: false, error: conciseError(err, plan.harness) };
+    return { ok: false, error: conciseError(err, plan.harness, request.provider) };
   } finally {
     await toolRelay?.stop().catch(() => {});
     await sandbox?.destroySandbox().catch(() => {});
diff --git a/services/agent/src/engines/sandbox_agent/capabilities.ts b/services/agent/src/engines/sandbox_agent/capabilities.ts
index 654fdb21bc..70cc446088 100644
--- a/services/agent/src/engines/sandbox_agent/capabilities.ts
+++ b/services/agent/src/engines/sandbox_agent/capabilities.ts
@@ -1,40 +1,118 @@
-import type { HarnessCapabilities } from "../../protocol.ts";
+import type { HarnessCapabilities, ResolvedToolSpec } from "../../protocol.ts";
 
 /**
- * Map a sandbox-agent `AgentInfo` to our capability flags. Falls back to a per-harness
- * static guess when the probe is unavailable.
+ * Where a run's `HarnessCapabilities` came from.
+ *
+ *  - `probed`: read from the daemon's `AgentInfo.capabilities` for this exact harness.
+ *  - `static`: the daemon probe was unavailable (or returned no capabilities), so we fell
+ *    back to a per-harness static guess. A guess is a weaker contract — it can disagree with
+ *    what the harness actually does — so callers that assert a hard capability requirement use
+ *    the source to phrase a clear error instead of trusting the guess.
  */
-export function mapCapabilities(harness: string, info: any): HarnessCapabilities {
+export type CapabilitySource = "probed" | "static";
+
+export interface ProbedCapabilities {
+  capabilities: HarnessCapabilities;
+  source: CapabilitySource;
+}
+
+/** Every capability flag the runner branches on, for the debug-assertion shape check. */
+const CAPABILITY_FLAGS = [
+  "textMessages",
+  "images",
+  "fileAttachments",
+  "mcpTools",
+  "toolCalls",
+  "reasoning",
+  "planMode",
+  "permissions",
+  "streamingDeltas",
+  "sessionLifecycle",
+  "usage",
+] as const;
+
+/**
+ * Debug assertion: throw on an impossible runner state with an actionable message. These guard
+ * the runner hot path (run-plan build, capability probe, capability gate, sandbox start) so an
+ * impossible state surfaces at its origin with context instead of as a confusing downstream
+ * failure. Cheap (a boolean test) and always on — a tripped invariant is a runner bug, not a
+ * user error.
+ */
+export function assert(condition: unknown, message: string): asserts condition {
+  if (!condition) throw new Error(`[sandbox-agent invariant] ${message}`);
+}
+
+/**
+ * A run asked for a capability the harness does not have. Mirrors the not-implemented gates
+ * (`tools/code.ts` `CODE_TOOL_UNSUPPORTED_MESSAGE`, #4831's `*_UNSUPPORTED_MESSAGE`): the run
+ * fails with one clear, specific line rather than silently dropping the behavior. Built by
+ * `assertRequiredCapabilities`.
+ */
+export function toolDeliveryUnsupportedMessage(
+  harness: string,
+  missing: string,
+  toolCount: number,
+): string {
+  return (
+    `harness '${harness}' cannot receive tools (probe reports ${missing}); ` +
+    `the run carries ${toolCount} tool(s) that would be silently dropped. Run on a ` +
+    `tool-capable harness, or remove the tools.`
+  );
+}
+
+/**
+ * Map a sandbox-agent `AgentInfo` to our capability flags. Falls back to a per-harness static
+ * guess when the probe is unavailable. Returns the flags AND where they came from, so a caller
+ * can refuse to silently degrade on a guess (see `assertRequiredCapabilities`).
+ */
+export function mapCapabilities(
+  harness: string,
+  info: any,
+): ProbedCapabilities {
+  assert(
+    typeof harness === "string" && harness.length > 0,
+    "mapCapabilities requires a non-empty harness id",
+  );
   const c = info?.capabilities;
   if (c) {
+    assert(
+      typeof c === "object",
+      `probed capabilities for '${harness}' is not an object (got ${typeof c})`,
+    );
     return {
-      textMessages: c.textMessages ?? true,
-      images: !!c.images,
-      fileAttachments: !!c.fileAttachments,
-      mcpTools: !!c.mcpTools,
-      toolCalls: !!c.toolCalls,
-      reasoning: !!c.reasoning,
-      planMode: !!c.planMode,
-      permissions: !!c.permissions,
-      streamingDeltas: !!c.streamingDeltas,
-      sessionLifecycle: !!c.sessionLifecycle,
-      usage: true,
+      source: "probed",
+      capabilities: {
+        textMessages: c.textMessages ?? true,
+        images: !!c.images,
+        fileAttachments: !!c.fileAttachments,
+        mcpTools: !!c.mcpTools,
+        toolCalls: !!c.toolCalls,
+        reasoning: !!c.reasoning,
+        planMode: !!c.planMode,
+        permissions: !!c.permissions,
+        streamingDeltas: !!c.streamingDeltas,
+        sessionLifecycle: !!c.sessionLifecycle,
+        usage: true,
+      },
     };
   }
   // Static fallback by harness id: pi-acp does not forward MCP, Claude/Codex do.
   const isPiHarness = harness === "pi";
   return {
-    textMessages: true,
-    images: false,
-    fileAttachments: false,
-    mcpTools: !isPiHarness,
-    toolCalls: true,
-    reasoning: true,
-    planMode: !isPiHarness,
-    permissions: !isPiHarness,
-    streamingDeltas: true,
-    sessionLifecycle: true,
-    usage: true,
+    source: "static",
+    capabilities: {
+      textMessages: true,
+      images: false,
+      fileAttachments: false,
+      mcpTools: !isPiHarness,
+      toolCalls: true,
+      reasoning: true,
+      planMode: !isPiHarness,
+      permissions: !isPiHarness,
+      streamingDeltas: true,
+      sessionLifecycle: true,
+      usage: true,
+    },
   };
 }
 
@@ -42,11 +120,84 @@ export function mapCapabilities(harness: string, info: any): HarnessCapabilities
 export async function probeCapabilities(
   sandbox: any,
   harness: string,
-): Promise<HarnessCapabilities> {
+): Promise<ProbedCapabilities> {
+  assert(
+    sandbox && typeof sandbox.getAgent === "function",
+    "probeCapabilities requires a sandbox with getAgent()",
+  );
+  let probed: ProbedCapabilities;
   try {
     const info = await sandbox.getAgent(harness, { config: true });
-    return mapCapabilities(harness, info);
+    probed = mapCapabilities(harness, info);
   } catch {
-    return mapCapabilities(harness, undefined);
+    probed = mapCapabilities(harness, undefined);
+  }
+  // Debug assertion: the mapper must always return a complete, boolean-valued flag set so a
+  // downstream branch never reads an undefined flag (which would coerce to a silent `false`).
+  for (const flag of CAPABILITY_FLAGS) {
+    assert(
+      typeof probed.capabilities[flag] === "boolean",
+      `capability '${flag}' for '${harness}' is ${typeof probed.capabilities[flag]}, expected boolean`,
+    );
   }
+  return probed;
+}
+
+export interface AssertCapabilitiesInput {
+  harness: string;
+  isPi: boolean;
+  probed: ProbedCapabilities;
+  toolSpecs: ResolvedToolSpec[];
+  /** Hook for the debug log; the engine passes its stderr logger. */
+  log?: (message: string) => void;
+}
+
+/**
+ * Fail loud when a run REQUIRES a capability the harness does not have, instead of silently
+ * dropping the behavior the way `buildSessionMcpServers` did before. The one requirement worth
+ * asserting today is tool delivery, the clearest case:
+ *
+ *  - Pi delivers tools through its native extension (not MCP), so a Pi run is exempt — its tool
+ *    delivery never depends on the probed `mcpTools`/`toolCalls` flags.
+ *  - Any other harness takes tools over MCP only. If the run carries tool specs but the harness
+ *    does not advertise BOTH `mcpTools` (it can receive an MCP server) AND `toolCalls` (it can
+ *    actually call a tool), those tools would be dropped without a trace. Refuse the run with a
+ *    specific error, mirroring the `*_UNSUPPORTED_MESSAGE` gates in `run-plan.ts`.
+ *
+ * Throws on violation; the engine catch turns the throw into `{ ok: false, error }`.
+ */
+export function assertRequiredCapabilities({
+  harness,
+  isPi,
+  probed,
+  toolSpecs,
+  log = () => {},
+}: AssertCapabilitiesInput): void {
+  assert(
+    typeof harness === "string" && harness.length > 0,
+    "assertRequiredCapabilities requires a non-empty harness id",
+  );
+  const { capabilities, source } = probed;
+
+  // Pi's tools ride its native extension, not the probed MCP capability — nothing to assert.
+  if (isPi) return;
+
+  const toolCount = toolSpecs.length;
+  if (toolCount === 0) return;
+
+  const missing: string[] = [];
+  if (!capabilities.mcpTools) missing.push("mcpTools:false");
+  if (!capabilities.toolCalls) missing.push("toolCalls:false");
+  if (missing.length === 0) return;
+
+  // A static guess that says a capability is missing is the weakest signal, but the run still
+  // cannot deliver the tools, so it must fail rather than proceed and drop them. Note the source
+  // in the log so a debugging operator knows the probe was unavailable.
+  log(
+    `capability gate: harness '${harness}' (${source}) lacks ${missing.join(", ")}; ` +
+      `refusing a run with ${toolCount} tool(s) rather than dropping them`,
+  );
+  throw new Error(
+    toolDeliveryUnsupportedMessage(harness, missing.join(", "), toolCount),
+  );
 }
diff --git a/services/agent/src/engines/sandbox_agent/errors.ts b/services/agent/src/engines/sandbox_agent/errors.ts
index dd89229209..1b9f8b9185 100644
--- a/services/agent/src/engines/sandbox_agent/errors.ts
+++ b/services/agent/src/engines/sandbox_agent/errors.ts
@@ -1,13 +1,50 @@
+/** Map a provider family to its human-facing vault key label, for the credit/auth hint. */
+const PROVIDER_KEY_LABELS: Record<string, string> = {
+  openai: "OpenAI",
+  anthropic: "Anthropic",
+  gemini: "Gemini",
+  mistral: "Mistral",
+  mistralai: "Mistral",
+  minimax: "MiniMax",
+  groq: "Groq",
+  together_ai: "Together AI",
+  openrouter: "OpenRouter",
+};
+
+/**
+ * The vault-key hint phrase for an error, named after the RESOLVED provider rather than the
+ * harness. A Pi run against an Anthropic model must say "Anthropic key", not "OpenAI key" — the
+ * harness name (`pi_core`/`claude`) is not the provider, so deriving the hint from it mislabels
+ * every cross-provider run (e.g. Pi + Anthropic wrongly read "check the project's OpenAI key").
+ *
+ * `provider` is the resolved provider the runner already knows (`request.provider`, from the
+ * resolved connection). When it is absent (un-migrated caller) fall back to the harness default
+ * — Claude is always Anthropic; every other harness defaults to OpenAI, matching the old
+ * behavior for that path only.
+ */
+function keyHintFor(provider: string | undefined, harness: string): string {
+  const label = provider ? PROVIDER_KEY_LABELS[provider.toLowerCase()] : undefined;
+  if (label) return `the project's ${label} key`;
+  if (harness === "claude") return "the project's Anthropic key";
+  return "the project's OpenAI key";
+}
+
 /**
  * Turn a harness/SDK error into one clear line for the caller instead of dumping a full
  * ACP/JS stack. Recognizes common harness auth failures.
+ *
+ * `provider` is the resolved provider for the run; pass it so the credit/auth hint names the
+ * actual provider the run targeted, not a provider guessed from the harness name.
  */
-export function conciseError(err: unknown, harness: string): string {
+export function conciseError(
+  err: unknown,
+  harness: string,
+  provider?: string,
+): string {
   const raw = err instanceof Error ? err.message : String(err);
   const msg = raw.split("\n")[0].trim();
-  const keyHint =
-    harness === "claude" ? "the project's Anthropic key" : "the project's OpenAI key";
-  if (/credit balance is too low/i.test(raw)) {
+  const keyHint = keyHintFor(provider, harness);
+  if (/credit balance is too low|exceeded your current quota|insufficient_quota/i.test(raw)) {
     return `${harness}: the model provider account has insufficient credit (check ${keyHint}).`;
   }
   if (/authentication required|invalid api key|401|unauthorized/i.test(raw)) {
diff --git a/services/agent/src/engines/sandbox_agent/pi-error.ts b/services/agent/src/engines/sandbox_agent/pi-error.ts
new file mode 100644
index 0000000000..129b54b982
--- /dev/null
+++ b/services/agent/src/engines/sandbox_agent/pi-error.ts
@@ -0,0 +1,140 @@
+/**
+ * Recover a model-call error that Pi swallows on the local sandbox-agent path.
+ *
+ * When Pi's provider call fails (out-of-quota, bad key, rate limit, unknown model, ...),
+ * Pi records the failed turn in its session transcript as an assistant message with
+ * `stopReason: "error"` and a human-readable `errorMessage`, but its pi-acp bridge reports
+ * the turn to the runner as a plain `{ stopReason: "end_turn" }` with NO content. The runner
+ * then returns an `ok: true` run with empty output, and the user sees a silent "No response"
+ * instead of the real failure.
+ *
+ * This reader closes that gap on the LOCAL path (Pi runs on the runner host, so its session
+ * dir is on this filesystem). After a Pi turn that produced no output, the engine asks this
+ * helper for the transcript's last assistant `errorMessage`; when present, the run is failed
+ * loud with that message instead of returning an empty turn.
+ *
+ * It is deliberately best-effort and side-effect free: any filesystem/parse problem returns
+ * `undefined` so a genuinely empty (but successful) turn is never turned into a false error.
+ * Daytona is out of scope (the transcript lives in the remote sandbox); the empty-turn there
+ * stays as-is until the harness surfaces the error over ACP.
+ */
+import { readFileSync, readdirSync, statSync } from "node:fs";
+import { join } from "node:path";
+
+/** A Pi transcript `session` record (first line of the .jsonl). */
+interface PiSessionRecord {
+  type?: string;
+  id?: string;
+  cwd?: string;
+}
+
+/** A Pi transcript `message` record. */
+interface PiMessageRecord {
+  type?: string;
+  message?: {
+    role?: string;
+    stopReason?: string;
+    errorMessage?: string;
+    content?: unknown[];
+  };
+}
+
+/** The most recent assistant error in one transcript, or undefined if none / unreadable. */
+function lastAssistantError(jsonlPath: string): string | undefined {
+  let raw: string;
+  try {
+    raw = readFileSync(jsonlPath, "utf8");
+  } catch {
+    return undefined;
+  }
+  let found: string | undefined;
+  for (const line of raw.split("\n")) {
+    const trimmed = line.trim();
+    if (!trimmed) continue;
+    let record: PiMessageRecord;
+    try {
+      record = JSON.parse(trimmed) as PiMessageRecord;
+    } catch {
+      continue;
+    }
+    const msg = record.message;
+    if (record.type !== "message" || msg?.role !== "assistant") continue;
+    // Keep scanning so a later successful turn clears an earlier error; only an error that
+    // is the LAST assistant turn (and is what produced the empty output) is surfaced.
+    if (msg.stopReason === "error" && msg.errorMessage) {
+      found = msg.errorMessage.trim() || undefined;
+    } else {
+      found = undefined;
+    }
+  }
+  return found;
+}
+
+/** The `session` record of a transcript file, or undefined if unreadable / not a session. */
+function readSessionRecord(jsonlPath: string): PiSessionRecord | undefined {
+  let raw: string;
+  try {
+    raw = readFileSync(jsonlPath, "utf8");
+  } catch {
+    return undefined;
+  }
+  const firstLine = raw.split("\n", 1)[0]?.trim();
+  if (!firstLine) return undefined;
+  try {
+    const record = JSON.parse(firstLine) as PiSessionRecord;
+    return record.type === "session" ? record : undefined;
+  } catch {
+    return undefined;
+  }
+}
+
+/**
+ * Find the Pi transcript for a run (matched by its unique `cwd`) and return the last
+ * assistant turn's `errorMessage`, or undefined when there is none.
+ *
+ * Each run gets a unique cwd, and Pi stamps the cwd on every transcript's `session` record,
+ * so matching on cwd locates this run's transcript without depending on Pi's dir-name
+ * encoding. Among matches (a resumed session can have several), the newest file wins.
+ */
+export function findSwallowedPiError(
+  piAgentDir: string,
+  cwd: string,
+): string | undefined {
+  const sessionsRoot = join(piAgentDir, "sessions");
+  let dirs: string[];
+  try {
+    dirs = readdirSync(sessionsRoot);
+  } catch {
+    return undefined;
+  }
+
+  let newestPath: string | undefined;
+  let newestMtime = -1;
+  for (const dir of dirs) {
+    const dirPath = join(sessionsRoot, dir);
+    let files: string[];
+    try {
+      files = readdirSync(dirPath);
+    } catch {
+      continue;
+    }
+    for (const file of files) {
+      if (!file.endsWith(".jsonl")) continue;
+      const filePath = join(dirPath, file);
+      const session = readSessionRecord(filePath);
+      if (session?.cwd !== cwd) continue;
+      let mtime: number;
+      try {
+        mtime = statSync(filePath).mtimeMs;
+      } catch {
+        continue;
+      }
+      if (mtime > newestMtime) {
+        newestMtime = mtime;
+        newestPath = filePath;
+      }
+    }
+  }
+
+  return newestPath ? lastAssistantError(newestPath) : undefined;
+}
diff --git a/services/agent/src/engines/sandbox_agent/run-plan.ts b/services/agent/src/engines/sandbox_agent/run-plan.ts
index 537fbcddec..8b146a7ee1 100644
--- a/services/agent/src/engines/sandbox_agent/run-plan.ts
+++ b/services/agent/src/engines/sandbox_agent/run-plan.ts
@@ -16,6 +16,7 @@ import {
   type MaterializedSkill,
   resolveSkillDirs as defaultResolveSkillDirs,
 } from "../skills.ts";
+import { assert } from "./capabilities.ts";
 import { buildTurnText } from "./transcript.ts";
 
 type Log = (message: string) => void;
@@ -145,6 +146,14 @@ export function buildRunPlan(
   const acpAgent =
     harness === "pi_core" || harness === "pi_agenta" ? "pi" : harness;
 
+  // Debug assertion: every Pi identity must resolve to the `pi` ACP agent and nothing else may.
+  // Catches a future harness-id typo (e.g. a new `pi_*` value forgotten here) at plan-build time
+  // rather than as a daemon "unknown agent" error mid-run.
+  assert(
+    (harness === "pi_core" || harness === "pi_agenta") === (acpAgent === "pi"),
+    `harness '${harness}' resolved to ACP agent '${acpAgent}', but pi identity mapping disagrees`,
+  );
+
   const prompt = resolvePromptText(request);
   if (!prompt) {
     return {
@@ -229,6 +238,19 @@ export function buildRunPlan(
     ? request.appendSystemPrompt?.trim() || undefined
     : undefined;
 
+  // Debug assertions: the derived run state must be self-consistent before the engine acts on
+  // it. A cwd that is empty, or a relay dir not nested under it, would only surface later as a
+  // confusing filesystem error inside the sandbox.
+  assert(!!cwd, `buildRunPlan produced an empty cwd for harness '${harness}'`);
+  assert(
+    relayDir.startsWith(cwd),
+    `relay dir '${relayDir}' is not under cwd '${cwd}'`,
+  );
+  assert(
+    isPi === (acpAgent === "pi"),
+    `isPi (${isPi}) disagrees with acpAgent '${acpAgent}'`,
+  );
+
   return {
     ok: true,
     plan: {
diff --git a/services/agent/src/tracing/otel.ts b/services/agent/src/tracing/otel.ts
index 5a8d98ce7f..eb4acbe012 100644
--- a/services/agent/src/tracing/otel.ts
+++ b/services/agent/src/tracing/otel.ts
@@ -611,30 +611,107 @@ function acpToolContentText(content: any): string {
 }
 
 /**
- * Strip the pi-acp startup banner that some setups emit as the first agent message
- * chunk (a "pi vX.Y.Z" / "## Context" / file list / "New version available" prelude,
- * surfaced ahead of the real answer). Removes only a leading run of those marker lines
- * so a genuine answer is never touched.
+ * Is this line part of the pi-acp startup banner that some setups emit as the first agent
+ * message chunk, ahead of the real answer? pi-acp's `buildStartupInfo` produces, in order:
+ *
+ *   pi v0.79.4
+ *   ---
+ *   (blank)
+ *   ## Context
+ *   - /tmp/agenta-sandbox-agent-XXXX/AGENTS.md
+ *   (blank)
+ *   ## Skills            (when skills are installed)
+ *   - /path/to/skill.md
+ *   (blank)
+ *   New version available: v0.80.2 (installed v0.79.4). Run: `npm i -g @earendil-works/pi-coding-agent`
+ *
+ * The markdown markers (`## `, `- `) are stripped when the playground renders the text, so the
+ * user sees a bare `Context` heading and an unprefixed absolute `.../AGENTS.md` path — but the
+ * raw chunk still carries the markdown, so we match BOTH the raw and the rendered shapes. The
+ * "New version available" notice is emitted even when `quietStartup` suppresses the rest, so it
+ * must be matched on its own. We only ever strip a LEADING run of these lines, so a genuine
+ * answer that happens to contain such words later is never touched.
+ */
+export function isBannerLine(line: string): boolean {
+  const t = line.trim();
+  return (
+    t === "" ||
+    t === "---" ||
+    /^pi v\d+\.\d+\.\d+\b/.test(t) ||
+    // section heading, raw ("## Context") or rendered ("Context"); same for "Skills"
+    /^(?:#{1,6}\s*)?(?:Context|Skills)\s*$/.test(t) ||
+    // an AGENTS.md / *.md path item, list-prefixed ("- /…/AGENTS.md") or bare ("/…/AGENTS.md")
+    /^(?:-\s+)?\/\S*\.md\s*$/.test(t) ||
+    // upgrade notice: "New version available: vX (installed vY). Run: `npm i -g …`"
+    /^New version available:/.test(t) ||
+    /^Run:\s*`?npm\s+i\b/.test(t)
+  );
+}
+
+/**
+ * Strip a leading run of pi-acp startup-banner lines from `text`. Returns the text past the
+ * banner (trimmed) when at least one non-blank banner line was seen, otherwise the original.
  */
-function stripStartupBanner(text: string): string {
+export function stripStartupBanner(text: string): string {
   const lines = text.split("\n");
-  const isBanner = (line: string) =>
-    /^pi v\d+\.\d+\.\d+/.test(line) ||
-    /^## Context\b/.test(line) ||
-    /^-\s+\/.*AGENTS\.md\s*$/.test(line) ||
-    /^New version available:/.test(line) ||
-    /^Run: `npm/.test(line) ||
-    line.trim() === "---" ||
-    line.trim() === "";
   let i = 0;
   let sawBanner = false;
-  while (i < lines.length && isBanner(lines[i])) {
+  while (i < lines.length && isBannerLine(lines[i])) {
     if (lines[i].trim() !== "") sawBanner = true;
     i++;
   }
   return sawBanner ? lines.slice(i).join("\n").trim() : text;
 }
 
+/**
+ * Streaming-safe variant. Given the cumulative assistant text so far, return the portion that
+ * is safe to surface as a delta now, plus whether the leading banner region is fully resolved.
+ *
+ * The banner always arrives at the START of the stream and may straddle chunk boundaries, so we
+ * must not classify the LAST line until we know it is complete (a trailing newline, or a later
+ * chunk, settles it). While the text seen so far is entirely banner-or-blank we return
+ * `{ body: "", settled: false }` and the caller holds emission; once a non-banner line appears
+ * we return everything from it onward and never re-buffer again. `start` is the byte offset of
+ * the body within `text` (after leading whitespace), so the caller can slice later chunks from
+ * the same offset — the banner is a stable leading prefix of the cumulative stream.
+ */
+export function splitLeadingBanner(text: string): {
+  body: string;
+  settled: boolean;
+  start: number;
+} {
+  const endsWithNewline = text.endsWith("\n");
+  const lines = text.split("\n");
+  // The final element is a partial line unless the text ended on a newline.
+  const lastIsPartial = !endsWithNewline;
+  let i = 0;
+  let offset = 0; // byte offset of line i within `text`
+  let sawBanner = false;
+  while (i < lines.length) {
+    const isLast = i === lines.length - 1;
+    if (isLast && lastIsPartial && sawBanner) {
+      // We are at the banner boundary with a still-arriving partial line. It could complete into
+      // either another banner line or the first line of the real answer — only a later chunk (or
+      // a newline) settles it, so hold. (When no banner has been seen, there is nothing to
+      // suppress: settle on the partial line and stream it without latency.)
+      return { body: "", settled: false, start: -1 };
+    }
+    if (!isBannerLine(lines[i])) break;
+    sawBanner = true;
+    offset += lines[i].length + 1; // +1 for the consumed "\n"
+    i++;
+  }
+  if (i >= lines.length) {
+    // Consumed every (complete) line as banner — nothing real has started yet.
+    return { body: "", settled: false, start: -1 };
+  }
+  const rest = text.slice(offset);
+  // Drop the blank line(s) that separate the banner from the answer, but never trim a
+  // banner-free stream — a genuine answer may legitimately begin with whitespace.
+  const ws = sawBanner ? rest.length - rest.replace(/^\s+/, "").length : 0;
+  return { body: rest.slice(ws), settled: true, start: offset + ws };
+}
+
 /** Split a resolved model id ("openai-codex/gpt-5.5") into provider + id. */
 function splitModel(model?: string): { provider?: string; id?: string } {
   if (!model) return {};
@@ -767,6 +844,16 @@ export function createSandboxAgentOtel(init: SandboxAgentOtelInit): SandboxAgent
   let textBlockId: string | undefined;
   let textEmitted = "";
   let anyTextDelta = false;
+  // Streaming banner suppression: pi-acp emits its startup banner as the FIRST agent message
+  // chunk, ahead of the real answer. The one-shot `finish()` path strips it from the coalesced
+  // text, but the streaming path flushes deltas as they arrive, before finish() ever runs — so
+  // the banner would leak to the client. We hold the leading deltas until the banner region is
+  // resolved (it may straddle chunk boundaries), then stream only the body past it. Once a real
+  // line has started, `bannerSettled` latches true and we never strip again.
+  let bannerSettled = false;
+  // Once the banner region resolves, the real answer begins at this byte offset in `accumulated`;
+  // every later chunk is streamed as `accumulated.slice(bannerEnd)` so the banner never reappears.
+  let bannerEnd = 0;
   let reasoningBlockId: string | undefined;
   let reasoningEmitted = "";
   let blockSeq = 0;
@@ -800,6 +887,25 @@ export function createSandboxAgentOtel(init: SandboxAgentOtelInit): SandboxAgent
     anyTextDelta = true;
   }
 
+  /**
+   * Stream the assistant text for the cumulative `accumulated`, with the leading startup banner
+   * suppressed. Until the banner region resolves we emit nothing; afterwards we stream the body
+   * past it. `body` is a prefix-growing view of the real answer, so `streamText`'s delta logic
+   * stays correct.
+   */
+  function streamAssistantText(): void {
+    if (bannerSettled) {
+      // The banner is a stable leading prefix; stream everything past it.
+      streamText(accumulated.slice(bannerEnd));
+      return;
+    }
+    const { body, settled, start } = splitLeadingBanner(accumulated);
+    if (!settled) return; // banner region still arriving — hold emission
+    bannerSettled = true;
+    bannerEnd = start;
+    if (body) streamText(body);
+  }
+
   /** Open (if needed) the reasoning block and emit the pure delta up to `target`. */
   function streamReasoning(target: string): void {
     closeText(); // a reasoning chunk ends any open text run
@@ -869,8 +975,9 @@ export function createSandboxAgentOtel(init: SandboxAgentOtelInit): SandboxAgent
       else accumulated += t;
       // Live deltas run independent of span emission (text, not a span), so they flow even
       // when the harness self-instruments (emitSpans=false). `accumulated` is the cumulative
-      // text, so the pure delta is its tail past what we already sent.
-      if (sink) streamText(accumulated);
+      // text, so the pure delta is its tail past what we already sent — minus the leading
+      // startup banner, which is held back until the body begins (see streamAssistantText).
+      if (sink) streamAssistantText();
       return;
     }
 
diff --git a/services/agent/tests/unit/sandbox-agent-capabilities.test.ts b/services/agent/tests/unit/sandbox-agent-capabilities.test.ts
index 6d8f80c081..b96f4a4ece 100644
--- a/services/agent/tests/unit/sandbox-agent-capabilities.test.ts
+++ b/services/agent/tests/unit/sandbox-agent-capabilities.test.ts
@@ -1,34 +1,25 @@
 /**
- * Unit tests for sandbox-agent capability mapping.
+ * Unit tests for sandbox-agent capability mapping, the fail-loud capability gate, and the
+ * debug assertions.
  *
  * Run: pnpm test (or: pnpm exec vitest run tests/unit/sandbox-agent-capabilities.test.ts)
  */
 import { describe, it } from "vitest";
 import assert from "node:assert/strict";
 
+import type { ResolvedToolSpec } from "../../src/protocol.ts";
 import {
+  assert as invariant,
+  assertRequiredCapabilities,
   mapCapabilities,
   probeCapabilities,
+  type ProbedCapabilities,
 } from "../../src/engines/sandbox_agent/capabilities.ts";
 
 describe("mapCapabilities", () => {
-  it("maps probed sandbox-agent capabilities and always enables usage", () => {
-    assert.deepEqual(
-      mapCapabilities("claude", {
-        capabilities: {
-          textMessages: false,
-          images: true,
-          fileAttachments: true,
-          mcpTools: true,
-          toolCalls: true,
-          reasoning: true,
-          planMode: true,
-          permissions: true,
-          streamingDeltas: true,
-          sessionLifecycle: true,
-        },
-      }),
-      {
+  it("maps probed sandbox-agent capabilities, always enables usage, marks the source probed", () => {
+    const out = mapCapabilities("claude", {
+      capabilities: {
         textMessages: false,
         images: true,
         fileAttachments: true,
@@ -39,26 +30,58 @@ describe("mapCapabilities", () => {
         permissions: true,
         streamingDeltas: true,
         sessionLifecycle: true,
-        usage: true,
       },
-    );
+    });
+    assert.equal(out.source, "probed");
+    assert.deepEqual(out.capabilities, {
+      textMessages: false,
+      images: true,
+      fileAttachments: true,
+      mcpTools: true,
+      toolCalls: true,
+      reasoning: true,
+      planMode: true,
+      permissions: true,
+      streamingDeltas: true,
+      sessionLifecycle: true,
+      usage: true,
+    });
+  });
+
+  it("falls back to no MCP for Pi and MCP for non-Pi harnesses, marked static", () => {
+    const pi = mapCapabilities("pi", undefined);
+    assert.equal(pi.source, "static");
+    assert.equal(pi.capabilities.mcpTools, false);
+
+    const claude = mapCapabilities("claude", undefined);
+    assert.equal(claude.source, "static");
+    assert.equal(claude.capabilities.mcpTools, true);
+  });
+
+  it("invariant: rejects an empty harness id", () => {
+    assert.throws(() => mapCapabilities("", undefined), /non-empty harness id/);
   });
 
-  it("falls back to no MCP for Pi and MCP for non-Pi harnesses", () => {
-    assert.equal(mapCapabilities("pi", undefined).mcpTools, false);
-    assert.equal(mapCapabilities("claude", undefined).mcpTools, true);
+  it("invariant: rejects a non-object probed capabilities payload", () => {
+    assert.throws(
+      () => mapCapabilities("claude", { capabilities: "yes" }),
+      /not an object/,
+    );
   });
 });
 
 describe("probeCapabilities", () => {
-  it("uses the daemon probe when available", async () => {
+  it("uses the daemon probe when available and reports the source", async () => {
     const sandbox = {
-      getAgent: async () => ({ capabilities: { mcpTools: true, permissions: true } }),
+      getAgent: async () => ({
+        capabilities: { mcpTools: true, permissions: true, toolCalls: true },
+      }),
     };
 
     const out = await probeCapabilities(sandbox, "claude");
-    assert.equal(out.mcpTools, true);
-    assert.equal(out.permissions, true);
+    assert.equal(out.source, "probed");
+    assert.equal(out.capabilities.mcpTools, true);
+    assert.equal(out.capabilities.permissions, true);
   });
 
   it("uses static fallback when probing fails", async () => {
@@ -68,6 +91,123 @@ describe("probeCapabilities", () => {
       },
     };
 
-    assert.equal((await probeCapabilities(sandbox, "pi")).mcpTools, false);
+    const out = await probeCapabilities(sandbox, "pi");
+    assert.equal(out.source, "static");
+    assert.equal(out.capabilities.mcpTools, false);
+  });
+
+  it("returns a complete boolean-valued flag set even when the probe omits flags", async () => {
+    const sandbox = {
+      // A partial probe payload: every absent flag must still come back a boolean.
+      getAgent: async () => ({ capabilities: { mcpTools: true } }),
+    };
+    const out = await probeCapabilities(sandbox, "claude");
+    for (const value of Object.values(out.capabilities)) {
+      assert.equal(typeof value, "boolean");
+    }
+  });
+
+  it("invariant: rejects a sandbox without getAgent", async () => {
+    await assert.rejects(
+      () => probeCapabilities({}, "claude"),
+      /requires a sandbox with getAgent/,
+    );
+  });
+});
+
+function probed(
+  flags: Record<string, boolean>,
+  source: "probed" | "static" = "probed",
+): ProbedCapabilities {
+  return { source, capabilities: flags as any };
+}
+
+const tool: ResolvedToolSpec = { name: "server_tool", kind: "callback" };
+
+describe("assertRequiredCapabilities (fail loud on tool delivery)", () => {
+  it("passes a non-Pi harness that advertises mcpTools + toolCalls", () => {
+    assert.doesNotThrow(() =>
+      assertRequiredCapabilities({
+        harness: "claude",
+        isPi: false,
+        probed: probed({ mcpTools: true, toolCalls: true }),
+        toolSpecs: [tool],
+      }),
+    );
+  });
+
+  it("throws a specific error when the harness lacks mcpTools", () => {
+    assert.throws(
+      () =>
+        assertRequiredCapabilities({
+          harness: "claude",
+          isPi: false,
+          probed: probed({ mcpTools: false, toolCalls: true }),
+          toolSpecs: [tool],
+        }),
+      /harness 'claude' cannot receive tools.*mcpTools:false.*1 tool/s,
+    );
+  });
+
+  it("throws naming both missing flags when neither is advertised", () => {
+    assert.throws(
+      () =>
+        assertRequiredCapabilities({
+          harness: "codex",
+          isPi: false,
+          probed: probed({ mcpTools: false, toolCalls: false }),
+          toolSpecs: [tool, { name: "second", kind: "callback" }],
+        }),
+      /mcpTools:false, toolCalls:false.*2 tool/s,
+    );
+  });
+
+  it("exempts Pi (tools ride its native extension, not the probed MCP flags)", () => {
+    assert.doesNotThrow(() =>
+      assertRequiredCapabilities({
+        harness: "pi_core",
+        isPi: true,
+        probed: probed({ mcpTools: false, toolCalls: false }),
+        toolSpecs: [tool],
+      }),
+    );
+  });
+
+  it("is a no-op when the run carries no tools", () => {
+    assert.doesNotThrow(() =>
+      assertRequiredCapabilities({
+        harness: "claude",
+        isPi: false,
+        probed: probed({ mcpTools: false, toolCalls: false }),
+        toolSpecs: [],
+      }),
+    );
+  });
+
+  it("logs the source so a failed-probe static guess is debuggable", () => {
+    const logs: string[] = [];
+    assert.throws(() =>
+      assertRequiredCapabilities({
+        harness: "claude",
+        isPi: false,
+        probed: probed({ mcpTools: false, toolCalls: true }, "static"),
+        toolSpecs: [tool],
+        log: (m) => logs.push(m),
+      }),
+    );
+    assert.ok(logs.some((m) => m.includes("(static)")));
+  });
+});
+
+describe("assert (debug invariant)", () => {
+  it("throws a prefixed message when the condition is false", () => {
+    assert.throws(
+      () => invariant(false, "boom"),
+      /\[sandbox-agent invariant\] boom/,
+    );
+  });
+
+  it("does nothing when the condition holds", () => {
+    assert.doesNotThrow(() => invariant(true, "fine"));
   });
 });
diff --git a/services/agent/tests/unit/sandbox-agent-errors.test.ts b/services/agent/tests/unit/sandbox-agent-errors.test.ts
index f3d9c15e23..ec303cdbf8 100644
--- a/services/agent/tests/unit/sandbox-agent-errors.test.ts
+++ b/services/agent/tests/unit/sandbox-agent-errors.test.ts
@@ -16,6 +16,18 @@ describe("conciseError", () => {
     );
   });
 
+  it("formats OpenAI quota failures as insufficient credit", () => {
+    assert.equal(
+      conciseError(
+        new Error(
+          "You exceeded your current quota, please check your plan and billing details.",
+        ),
+        "pi",
+      ),
+      "pi: the model provider account has insufficient credit (check the project's OpenAI key).",
+    );
+  });
+
   it("formats auth failures with the right provider hint", () => {
     assert.equal(
       conciseError(new Error("Authentication required"), "pi"),
@@ -23,6 +35,48 @@ describe("conciseError", () => {
     );
   });
 
+  it("names the resolved provider, not the harness, for a Pi+Anthropic run", () => {
+    // The bug: a Pi run against an Anthropic model that fails auth must NOT say "OpenAI key".
+    assert.equal(
+      conciseError(new Error("Authentication required"), "pi_core", "anthropic"),
+      "pi_core: model authentication failed — add the project's Anthropic key to the project vault, or log in (OAuth).",
+    );
+  });
+
+  it("names the resolved provider for a Pi+Anthropic credit failure", () => {
+    assert.equal(
+      conciseError(new Error("credit balance is too low"), "pi_core", "anthropic"),
+      "pi_core: the model provider account has insufficient credit (check the project's Anthropic key).",
+    );
+  });
+
+  it("keeps the OpenAI hint when the resolved provider is openai on Pi", () => {
+    assert.equal(
+      conciseError(new Error("insufficient_quota"), "pi_core", "openai"),
+      "pi_core: the model provider account has insufficient credit (check the project's OpenAI key).",
+    );
+  });
+
+  it("falls back to the harness default when no provider is resolved", () => {
+    // Un-migrated caller (no provider on the wire): keep the old harness-derived behavior.
+    assert.equal(
+      conciseError(new Error("401 unauthorized"), "claude"),
+      "claude: model authentication failed — add the project's Anthropic key to the project vault, or log in (OAuth).",
+    );
+    assert.equal(
+      conciseError(new Error("401 unauthorized"), "pi_core"),
+      "pi_core: model authentication failed — add the project's OpenAI key to the project vault, or log in (OAuth).",
+    );
+  });
+
+  it("falls back to the harness default for an unknown custom provider", () => {
+    // A custom router slug we have no key label for: do not invent one, use the harness default.
+    assert.equal(
+      conciseError(new Error("Authentication required"), "pi_core", "openai-codex"),
+      "pi_core: model authentication failed — add the project's OpenAI key to the project vault, or log in (OAuth).",
+    );
+  });
+
   it("falls back to the first line", () => {
     assert.equal(conciseError(new Error("first line\nsecond line"), "pi"), "first line");
   });
diff --git a/services/agent/tests/unit/sandbox-agent-orchestration.test.ts b/services/agent/tests/unit/sandbox-agent-orchestration.test.ts
index 440b65c545..e8e228facd 100644
--- a/services/agent/tests/unit/sandbox-agent-orchestration.test.ts
+++ b/services/agent/tests/unit/sandbox-agent-orchestration.test.ts
@@ -162,10 +162,14 @@ function fakeHarness(options: FakeOptions = {}) {
     }) as any,
     probeCapabilities: async () =>
       ({
-        mcpTools: true,
-        usage: true,
-        streamingDeltas: true,
-        ...(options.capabilities ?? {}),
+        source: "probed",
+        capabilities: {
+          mcpTools: true,
+          toolCalls: true,
+          usage: true,
+          streamingDeltas: true,
+          ...(options.capabilities ?? {}),
+        },
       }) as any,
     applyModel: async (_session, model, _log, options) => {
       calls.applyModelArgs.push({ model, options });
@@ -347,6 +351,54 @@ describe("runSandboxAgent orchestration", () => {
     });
   });
 
+  it("fails loud when a non-Pi harness probes mcpTools:false but the run carries tools", async () => {
+    // A7: the capability gate runs BEFORE the MCP bridge, so a harness whose probe reports it
+    // cannot receive tools fails with a SPECIFIC capability error (not the generic MCP-disabled
+    // line, and never a silent drop). This is the silent-degradation case the staff review
+    // flagged: a wrong/missing `mcpTools` flag must error, not change behavior quietly.
+    const { calls, deps } = fakeHarness({ capabilities: { mcpTools: false } });
+
+    const result = await runSandboxAgent(
+      {
+        harness: "claude",
+        prompt: "use the tool",
+        customTools: [{ name: "server_tool", kind: "callback" }],
+      } as AgentRunRequest,
+      undefined,
+      undefined,
+      deps,
+    );
+
+    assert.equal(result.ok, false);
+    if (result.ok) return;
+    assert.match(result.error ?? "", /harness 'claude' cannot receive tools/);
+    assert.match(result.error ?? "", /mcpTools:false/);
+    // The gate fires before the session is created, so no session/prompt happened.
+    assert.equal(calls.createSessionOptions, undefined);
+    // The acquired sandbox is still disposed in the finally.
+    assert.equal(calls.sandboxDestroyed, 1);
+  });
+
+  it("does not gate tool delivery for a Pi run even when mcpTools is false", async () => {
+    // Pi delivers tools through its native extension, not MCP, so a Pi run with tools and a
+    // `mcpTools:false` probe must proceed (the gate exempts Pi).
+    const { calls, deps } = fakeHarness({ capabilities: { mcpTools: false } });
+
+    const result = await runSandboxAgent(
+      {
+        harness: "pi_core",
+        prompt: "use the tool",
+        customTools: [{ name: "server_tool", kind: "callback" }],
+      } as AgentRunRequest,
+      undefined,
+      undefined,
+      deps,
+    );
+
+    assert.equal(result.ok, true);
+    assert.notEqual(calls.createSessionOptions, undefined);
+  });
+
   it("flushes a partial trace and cleans up on prompt errors", async () => {
     const { calls, deps } = fakeHarness({ promptError: new Error("boom") });
 
diff --git a/services/agent/tests/unit/sandbox-agent-pi-error.test.ts b/services/agent/tests/unit/sandbox-agent-pi-error.test.ts
new file mode 100644
index 0000000000..d9cf4c9410
--- /dev/null
+++ b/services/agent/tests/unit/sandbox-agent-pi-error.test.ts
@@ -0,0 +1,124 @@
+/**
+ * Unit tests for recovering a model error Pi swallows on the local sandbox-agent path.
+ *
+ * When Pi's provider call fails, Pi records the failed turn in its session transcript with
+ * `stopReason: "error"` + `errorMessage`, but reports a plain `end_turn` with no content over
+ * ACP. `findSwallowedPiError` reads that transcript so the empty turn can fail loud.
+ *
+ * Run: pnpm test (or: pnpm exec vitest run tests/unit/sandbox-agent-pi-error.test.ts)
+ */
+import { afterEach, describe, it } from "vitest";
+import assert from "node:assert/strict";
+import { mkdtempSync, mkdirSync, rmSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+
+import { findSwallowedPiError } from "../../src/engines/sandbox_agent/pi-error.ts";
+
+const dirs: string[] = [];
+
+function tempDir(): string {
+  const dir = mkdtempSync(join(tmpdir(), "agenta-pi-error-test-"));
+  dirs.push(dir);
+  return dir;
+}
+
+/** Write a Pi-style .jsonl transcript under `<piAgentDir>/sessions/<name>/` for `cwd`. */
+function writeTranscript(
+  piAgentDir: string,
+  name: string,
+  cwd: string,
+  messages: Array<Record<string, unknown>>,
+): void {
+  const dir = join(piAgentDir, "sessions", name);
+  mkdirSync(dir, { recursive: true });
+  const lines = [
+    JSON.stringify({ type: "session", version: 3, id: name, cwd }),
+    ...messages.map((m) => JSON.stringify(m)),
+  ];
+  writeFileSync(join(dir, `${name}.jsonl`), lines.join("\n") + "\n");
+}
+
+afterEach(() => {
+  for (const dir of dirs.splice(0)) rmSync(dir, { recursive: true, force: true });
+});
+
+describe("findSwallowedPiError", () => {
+  it("returns the last assistant errorMessage for a matching cwd", () => {
+    const piAgentDir = tempDir();
+    const cwd = "/tmp/agenta-sandbox-agent-abc123";
+    writeTranscript(piAgentDir, "--tmp-agenta-sandbox-agent-abc123--", cwd, [
+      { type: "message", message: { role: "user", content: [{ type: "text", text: "hi" }] } },
+      {
+        type: "message",
+        message: {
+          role: "assistant",
+          content: [],
+          stopReason: "error",
+          errorMessage: "You exceeded your current quota, please check your plan.",
+        },
+      },
+    ]);
+
+    const error = findSwallowedPiError(piAgentDir, cwd);
+    assert.equal(error, "You exceeded your current quota, please check your plan.");
+  });
+
+  it("returns undefined for a successful turn", () => {
+    const piAgentDir = tempDir();
+    const cwd = "/tmp/agenta-sandbox-agent-ok";
+    writeTranscript(piAgentDir, "--tmp-agenta-sandbox-agent-ok--", cwd, [
+      {
+        type: "message",
+        message: {
+          role: "assistant",
+          content: [{ type: "text", text: "hello" }],
+          stopReason: "end_turn",
+        },
+      },
+    ]);
+
+    assert.equal(findSwallowedPiError(piAgentDir, cwd), undefined);
+  });
+
+  it("does not surface an error cleared by a later successful turn", () => {
+    const piAgentDir = tempDir();
+    const cwd = "/tmp/agenta-sandbox-agent-recovered";
+    writeTranscript(piAgentDir, "--recovered--", cwd, [
+      {
+        type: "message",
+        message: { role: "assistant", content: [], stopReason: "error", errorMessage: "transient" },
+      },
+      {
+        type: "message",
+        message: {
+          role: "assistant",
+          content: [{ type: "text", text: "recovered" }],
+          stopReason: "end_turn",
+        },
+      },
+    ]);
+
+    assert.equal(findSwallowedPiError(piAgentDir, cwd), undefined);
+  });
+
+  it("ignores transcripts for a different cwd", () => {
+    const piAgentDir = tempDir();
+    writeTranscript(piAgentDir, "--other--", "/tmp/some-other-cwd", [
+      {
+        type: "message",
+        message: { role: "assistant", content: [], stopReason: "error", errorMessage: "nope" },
+      },
+    ]);
+
+    assert.equal(
+      findSwallowedPiError(piAgentDir, "/tmp/agenta-sandbox-agent-missing"),
+      undefined,
+    );
+  });
+
+  it("returns undefined when the sessions dir is absent", () => {
+    const piAgentDir = tempDir();
+    assert.equal(findSwallowedPiError(piAgentDir, "/tmp/whatever"), undefined);
+  });
+});
diff --git a/services/agent/tests/unit/startup-banner.test.ts b/services/agent/tests/unit/startup-banner.test.ts
new file mode 100644
index 0000000000..4912373961
--- /dev/null
+++ b/services/agent/tests/unit/startup-banner.test.ts
@@ -0,0 +1,212 @@
+/**
+ * Unit tests for pi-acp startup-banner stripping.
+ *
+ * pi-acp emits its startup banner as the FIRST agent message chunk, ahead of the real answer
+ * (a "pi vX.Y.Z" / "## Context" / AGENTS.md path list / "New version available" prelude). The
+ * playground renders the markdown, so the user sees a bare "Context" heading and an unprefixed
+ * absolute path. These tests pin both the coalesced stripper (`stripStartupBanner`, used by the
+ * one-shot `finish()` path) and the streaming-safe splitter (`splitLeadingBanner`, used to hold
+ * leading deltas until the banner region resolves), plus an end-to-end streaming assertion
+ * through `createSandboxAgentOtel`.
+ *
+ * Run: pnpm test (or: pnpm exec vitest run tests/unit/startup-banner.test.ts)
+ */
+import { describe, it } from "vitest";
+import assert from "node:assert/strict";
+
+import {
+  createSandboxAgentOtel,
+  isBannerLine,
+  splitLeadingBanner,
+  stripStartupBanner,
+} from "../../src/tracing/otel.ts";
+import type { AgentEvent } from "../../src/protocol.ts";
+
+// The exact banner the user saw, in its RAW (pre-render) markdown form: pi-acp's buildStartupInfo
+// output followed by buildUpdateNotice. The playground strips "## " and "- " when rendering, so
+// the user-visible text is the same minus those markers.
+const RAW_BANNER = [
+  "pi v0.79.4",
+  "---",
+  "",
+  "## Context",
+  "- /tmp/agenta-sandbox-agent-UxwfPW/AGENTS.md",
+  "",
+  "---",
+  "New version available: v0.80.2 (installed v0.79.4). Run: `npm i -g @earendil-works/pi-coding-agent`",
+].join("\n");
+// ^ verbatim from a live `pi-acp@0.0.29` session/new (`_meta.piAcp.startupInfo`) on this stack:
+//   "pi v0.79.4\n---\n\n## Context\n- /tmp/.../AGENTS.md\n\n---\nNew version available: v0.80.2 ...".
+
+// The same banner as the user reported it (already markdown-rendered): bare "Context", unprefixed
+// AGENTS.md path, no fenced npm command.
+const RENDERED_BANNER = [
+  "pi v0.79.4",
+  "Context",
+  "/tmp/agenta-sandbox-UxwfPW/AGENTS.md",
+  "New version available: v0.80.2 (installed v0.79.4).",
+  "Run: npm i -g @earendil-works/pi-coding-agent",
+].join("\n");
+
+describe("isBannerLine", () => {
+  it("matches every banner-line variant the user saw", () => {
+    for (const line of [
+      "pi v0.79.4",
+      "pi v0.80.2 (something)",
+      "---",
+      "",
+      "   ",
+      "## Context",
+      "Context",
+      "## Skills",
+      "Skills",
+      "- /tmp/agenta-sandbox-agent-UxwfPW/AGENTS.md",
+      "/tmp/agenta-sandbox-UxwfPW/AGENTS.md",
+      "- /pi-agent/skills/foo.md",
+      "New version available: v0.80.2 (installed v0.79.4). Run: `npm i -g @earendil-works/pi-coding-agent`",
+      "New version available: v0.80.2 (installed v0.79.4).",
+      "Run: `npm i -g @earendil-works/pi-coding-agent`",
+      "Run: npm i -g @earendil-works/pi-coding-agent",
+    ]) {
+      assert.equal(isBannerLine(line), true, `expected banner: ${JSON.stringify(line)}`);
+    }
+  });
+
+  it("does not match genuine answer lines", () => {
+    for (const line of [
+      "4",
+      "The answer is 4.",
+      "Here is the context you asked for:",
+      "I will run: npm test", // not "npm i -g"
+      "Contextual information follows", // not a bare heading
+      "See /etc/hosts for details", // not a .md path
+    ]) {
+      assert.equal(isBannerLine(line), false, `expected non-banner: ${JSON.stringify(line)}`);
+    }
+  });
+});
+
+describe("stripStartupBanner (coalesced / finish path)", () => {
+  it("strips the raw markdown banner the user saw", () => {
+    assert.equal(stripStartupBanner(RAW_BANNER + "\n4"), "4");
+  });
+
+  it("strips the rendered banner the user saw", () => {
+    assert.equal(stripStartupBanner(RENDERED_BANNER + "\nThe answer is 4."), "The answer is 4.");
+  });
+
+  it("strips a multi-line answer's banner but keeps the whole answer", () => {
+    const answer = "Line one of the answer.\nLine two.\n- bullet point";
+    assert.equal(stripStartupBanner(RAW_BANNER + "\n" + answer), answer);
+  });
+
+  it("leaves a clean answer untouched", () => {
+    assert.equal(stripStartupBanner("4"), "4");
+    assert.equal(
+      stripStartupBanner("The context of this question is simple: 4."),
+      "The context of this question is simple: 4.",
+    );
+  });
+
+  it("returns empty when the text is only the banner", () => {
+    assert.equal(stripStartupBanner(RAW_BANNER), "");
+  });
+});
+
+describe("splitLeadingBanner (streaming path)", () => {
+  it("holds emission while only banner-or-blank has arrived", () => {
+    for (const text of ["pi v0.79.4\n", RAW_BANNER, RAW_BANNER + "\n"]) {
+      const { body, settled } = splitLeadingBanner(text);
+      assert.deepEqual({ body, settled }, { body: "", settled: false });
+    }
+  });
+
+  it("does not judge an incomplete trailing line", () => {
+    // "Run: `npm i" could become the upgrade line OR a real answer; wait for the newline.
+    const { body, settled } = splitLeadingBanner("pi v0.79.4\n---\nRun: `npm i");
+    assert.deepEqual({ body, settled }, { body: "", settled: false });
+  });
+
+  it("settles and returns the body once a real line completes", () => {
+    // Streaming preserves the answer faithfully, including a trailing newline; only the leading
+    // blank line(s) between banner and answer are dropped.
+    const { body, settled, start } = splitLeadingBanner(RAW_BANNER + "\n4\n");
+    assert.equal(settled, true);
+    assert.equal(body, "4\n");
+    // `start` points exactly at the body within the original text.
+    assert.equal((RAW_BANNER + "\n4\n").slice(start, start + body.length), body);
+  });
+
+  it("holds the first real line until its newline arrives (boundary safety)", () => {
+    // At the banner boundary a still-arriving partial line ("4", no newline) could complete into
+    // either a banner line or the answer, so we hold and let the finish path strip it instead.
+    const { settled } = splitLeadingBanner(RAW_BANNER + "\n4");
+    assert.equal(settled, false);
+  });
+
+  it("settles immediately for a banner-free stream", () => {
+    const { body, settled, start } = splitLeadingBanner("4");
+    assert.deepEqual({ body, settled, start }, { body: "4", settled: true, start: 0 });
+  });
+});
+
+describe("createSandboxAgentOtel streaming suppresses the banner", () => {
+  const textChunk = (text: string) => ({
+    sessionUpdate: "agent_message_chunk",
+    content: { type: "text", text },
+  });
+
+  it("never streams banner deltas, only the real answer", () => {
+    const emitted: AgentEvent[] = [];
+    const run = createSandboxAgentOtel({
+      harness: "pi",
+      model: "openai/o4-mini",
+      emitSpans: false,
+      emit: (e) => emitted.push(e),
+    });
+    run.start({ prompt: "What is 2+2?" });
+    // pi streams pure deltas; the banner arrives first, then the answer, split across chunks.
+    run.handleUpdate(textChunk("pi v0.79.4\n---\n\n## Context\n- /tmp/x/AGENTS.md\n\n"));
+    run.handleUpdate(
+      textChunk("New version available: v0.80.2 (installed v0.79.4). Run: `npm i -g @earendil-works/pi-coding-agent`\n"),
+    );
+    // First real line completes with a newline -> the splitter settles and streams the body
+    // INCREMENTALLY (before finish), exercising the live path, not just the finish fallback.
+    run.handleUpdate(textChunk("The answer is 4.\n")); // pure delta (pi-style)
+    const beforeFinish = emitted.filter((e) => e.type === "message_delta").length;
+    run.handleUpdate(textChunk("Thanks!")); // pure delta (pi-style)
+    run.finish();
+    assert.ok(beforeFinish > 0, "body was not streamed incrementally before finish");
+
+    const streamed = emitted
+      .filter((e): e is Extract<AgentEvent, { type: "message_delta" }> => e.type === "message_delta")
+      .map((e) => e.delta)
+      .join("");
+    assert.equal(streamed, "The answer is 4.\nThanks!");
+    assert.ok(!streamed.includes("pi v0.79.4"), "banner version leaked");
+    assert.ok(!streamed.includes("New version available"), "upgrade notice leaked");
+    assert.ok(!streamed.includes("AGENTS.md"), "context file path leaked");
+  });
+
+  it("strips the banner on finish when the stream had no clean body line", () => {
+    // A short, banner-only-then-inline answer that never produced a newline before finish.
+    const emitted: AgentEvent[] = [];
+    const run = createSandboxAgentOtel({
+      harness: "pi",
+      model: "openai/o4-mini",
+      emitSpans: false,
+      emit: (e) => emitted.push(e),
+    });
+    run.start({ prompt: "hi" });
+    // The whole banner plus the answer arrive in ONE chunk with no trailing newline. The
+    // streaming splitter holds (last line partial), so finish() must do the stripping.
+    run.handleUpdate(textChunk(RAW_BANNER + "\n4"));
+    run.finish();
+
+    const streamed = emitted
+      .filter((e): e is Extract<AgentEvent, { type: "message_delta" }> => e.type === "message_delta")
+      .map((e) => e.delta)
+      .join("");
+    assert.equal(streamed, "4");
+  });
+});