Agenta-AI · mmabrouk · Jun 25, 2026 · Jun 24, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/docs/design/agent-workflows/interfaces/cross-service/runner-to-harness.md b/docs/design/agent-workflows/interfaces/cross-service/runner-to-harness.md
@@ -29,7 +29,16 @@ returns `HarnessCapabilities`:
 ```
 
 The probe gates real behavior. Tools only go over MCP when `mcpTools` is true, for example.
-If the probe fails, the runner falls back to a static capability policy.
+If the probe fails, the runner falls back to a static capability policy, and the result records
+whether the flags were `probed` or a `static` guess.
+
+The runner no longer silently degrades when a run requires a capability the harness lacks.
+`assertRequiredCapabilities` (in `capabilities.ts`) fails the run with a specific error — the
+same fail-loud pattern as the `*_UNSUPPORTED_MESSAGE` gates in `run-plan.ts` and the
+`CODE_TOOL_UNSUPPORTED_MESSAGE` gate in `tools/code.ts`. Today the asserted requirement is tool
+delivery: a non-Pi run carrying tool specs whose probe reports `mcpTools:false` or
+`toolCalls:false` errors instead of dropping the tools without a trace. Pi is exempt (its tools
+ride the native extension, not MCP), and a run with no tools is unaffected.
 
 **The ACP event stream.** The harness emits ACP updates that the runner maps to neutral
 events:
@@ -58,8 +67,11 @@ tears the sandbox down in the `finally` path.
 
 - **Harness selection and the `pi_core`/`pi_agenta` to `pi` remap.** New harnesses thread
   through here.
-- **The capability probe.** It gates tool delivery, permissions, and streaming. A wrong flag
-  silently changes behavior rather than erroring.
+- **The capability probe.** It gates tool delivery, permissions, and streaming. A run that
+  REQUIRES a capability the harness lacks now fails loud (`assertRequiredCapabilities`) rather
+  than silently dropping the behavior — today that covers tool delivery to a non-Pi harness.
+  Other flags (permissions, streaming) still degrade silently; assert the next requirement here
+  if a flag must be a hard gate.
 - **ACP event mapping.** A missed or mis-mapped update drops content from the stream.
 - **Pi versus Claude divergence.** Both run over ACP, but Pi takes tools natively and
   self-instruments traces, while Claude takes tools over MCP and the runner builds the spans.

diff --git a/services/agent/src/engines/sandbox_agent.ts b/services/agent/src/engines/sandbox_agent.ts
@@ -46,7 +46,11 @@ import {
   type ToolCallbackContext,
   resolveRunSessionId,
 } from "../protocol.ts";
-import { probeCapabilities } from "./sandbox_agent/capabilities.ts";
+import {
+  assert,
+  assertRequiredCapabilities,
+  probeCapabilities,
+} from "./sandbox_agent/capabilities.ts";
 import { buildDaemonEnv, resolveDaemonBinary } from "./sandbox_agent/daemon.ts";
 import {
   createCookieFetch,
@@ -55,6 +59,7 @@ import {
 import { conciseError } from "./sandbox_agent/errors.ts";
 import { buildSessionMcpServers } from "./sandbox_agent/mcp.ts";
 import { applyModel } from "./sandbox_agent/model.ts";
+import { findSwallowedPiError } from "./sandbox_agent/pi-error.ts";
 import {
   buildPiExtensionEnv,
   prepareLocalPiAssets,
@@ -81,7 +86,12 @@ function log(message: string): void {
 
 type Log = (message: string) => void;
 
-const CLAUDE_STRICT_DEPLOYMENTS = new Set(["custom", "bedrock", "vertex", "vertex_ai"]);
+const CLAUDE_STRICT_DEPLOYMENTS = new Set([
+  "custom",
+  "bedrock",
+  "vertex",
+  "vertex_ai",
+]);
 
 function applyClaudeConnectionEnv(
   env: Record<string, string>,
@@ -110,7 +120,10 @@ function applyClaudeConnectionEnv(
     env.CLAUDE_CODE_USE_VERTEX = "1";
   }
 
-  if (selectedModel && (baseUrl || (deployment && CLAUDE_STRICT_DEPLOYMENTS.has(deployment)))) {
+  if (
+    selectedModel &&
+    (baseUrl || (deployment && CLAUDE_STRICT_DEPLOYMENTS.has(deployment)))
+  ) {
     env.ANTHROPIC_MODEL = selectedModel;
     env.ANTHROPIC_CUSTOM_MODEL_OPTION = selectedModel;
     return true;
@@ -162,7 +175,12 @@ export async function runSandboxAgent(
     clearProviderEnv,
   });
   Object.assign(env, plan.secrets); // apply only the resolved provider keys
-  const strictModel = applyClaudeConnectionEnv(env, request, plan.acpAgent, logger);
+  const strictModel = applyClaudeConnectionEnv(
+    env,
+    request,
+    plan.acpAgent,
+    logger,
+  );
   // Pi self-instruments locally: propagate the trace context + public tool metadata into Pi
   // via the Agenta extension. Tool execution always relays back to this runner, which keeps
   // private specs, scoped env, callback endpoints, and callback auth in memory.
@@ -233,14 +251,36 @@ export async function runSandboxAgent(
       log: logger,
     });
 
+    // Sandbox-start invariant: `startSandboxAgent` must hand back a usable handle, or the
+    // probe/createSession below fail with an opaque "cannot read property of undefined".
+    assert(
+      sandbox && typeof sandbox.createSession === "function",
+      `sandbox provider '${plan.sandboxId}' returned no usable sandbox handle`,
+    );
+
     // Probe what this harness supports and branch on capabilities, not on the harness
     // name. Tool delivery: Pi loads our extension (native tools, set up above); any other
     // harness takes tools over MCP only when it advertises `mcpTools` (pi-acp does not
     // forward MCP, Claude/Codex do).
-    const capabilities = await (deps.probeCapabilities ?? probeCapabilities)(
+    const probed = await (deps.probeCapabilities ?? probeCapabilities)(
       sandbox,
       plan.acpAgent,
     );
+    const capabilities = probed.capabilities;
+
+    // Fail loud (A7): a run that REQUIRES a capability the harness lacks errors with a
+    // specific message instead of silently dropping the behavior, the way the
+    // `*_UNSUPPORTED_MESSAGE` gates in `run-plan.ts` do. Today: tool delivery to a non-Pi
+    // harness whose probe reports `mcpTools:false` / `toolCalls:false`. The throw is caught
+    // below and returned as `{ ok: false, error }`.
+    assertRequiredCapabilities({
+      harness: plan.harness,
+      isPi: plan.isPi,
+      probed,
+      toolSpecs: plan.toolSpecs,
+      log: logger,
+    });
+
     const mcpServers = buildSessionMcpServers({
       isPi: plan.isPi,
       capabilities,
@@ -354,6 +394,29 @@ export async function runSandboxAgent(
     const output = run.finish();
     await run.flush();
 
+    // Fail loud on a swallowed model error (A7 / "fail loud, not silent"). When Pi's provider
+    // call fails (out-of-quota, bad key, rate limit, unknown model, ...), Pi's pi-acp bridge
+    // reports the turn as a plain `end_turn` with NO content, so without this the run would
+    // return an `ok:true` empty turn and the user would see a silent "No response" instead of
+    // the real failure. On the LOCAL Pi path the error is recoverable from Pi's own session
+    // transcript; surface it as a run error. Only checked when the turn produced no output and
+    // ran no tools (a real tool-only turn legitimately has empty text), and never on Daytona
+    // (the transcript lives in the remote sandbox).
+    if (
+      plan.isPi &&
+      !plan.isDaytona &&
+      !output.trim() &&
+      !run.events().some((e) => e.type === "tool_call")
+    ) {
+      const piError = findSwallowedPiError(plan.sourcePiAgentDir, plan.cwd);
+      if (piError) {
+        return {
+          ok: false,
+          error: conciseError(new Error(piError), plan.harness, request.provider),
+        };
+      }
+    }
+
     return {
       ok: true,
       output,
@@ -376,7 +439,7 @@ export async function runSandboxAgent(
   } catch (err) {
     otel?.finish();
     await otel?.flush().catch(() => {});
-    return { ok: false, error: conciseError(err, plan.harness) };
+    return { ok: false, error: conciseError(err, plan.harness, request.provider) };
   } finally {
     await toolRelay?.stop().catch(() => {});
     await sandbox?.destroySandbox().catch(() => {});