Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,16 @@ returns `HarnessCapabilities`:
```

The probe gates real behavior. Tools only go over MCP when `mcpTools` is true, for example.
If the probe fails, the runner falls back to a static capability policy.
If the probe fails, the runner falls back to a static capability policy, and the result records
whether the flags were `probed` or a `static` guess.

The runner no longer silently degrades when a run requires a capability the harness lacks.
`assertRequiredCapabilities` (in `capabilities.ts`) fails the run with a specific error — the
same fail-loud pattern as the `*_UNSUPPORTED_MESSAGE` gates in `run-plan.ts` and the
`CODE_TOOL_UNSUPPORTED_MESSAGE` gate in `tools/code.ts`. Today the asserted requirement is tool
delivery: when a non-Pi run carries tool specs and its probe reports `mcpTools:false` or
`toolCalls:false`, the run fails with an error instead of dropping the tools without a trace. Pi
is exempt (its tools ride the native extension, not MCP), and a run with no tools is unaffected.

**The ACP event stream.** The harness emits ACP updates that the runner maps to neutral
events:
Expand Down Expand Up @@ -58,8 +67,11 @@ tears the sandbox down in the `finally` path.

- **Harness selection and the `pi_core`/`pi_agenta` to `pi` remap.** New harnesses thread
through here.
- **The capability probe.** It gates tool delivery, permissions, and streaming. A wrong flag
silently changes behavior rather than erroring.
- **The capability probe.** It gates tool delivery, permissions, and streaming. A run that
REQUIRES a capability the harness lacks now fails loud (`assertRequiredCapabilities`) rather
than silently dropping the behavior — today that covers tool delivery to a non-Pi harness.
Other flags (permissions, streaming) still degrade silently; if one of them must become a hard
gate, add that requirement to `assertRequiredCapabilities`.
- **ACP event mapping.** A missed or mis-mapped update drops content from the stream.
- **Pi versus Claude divergence.** Both run over ACP, but Pi takes tools natively and
self-instruments traces, while Claude takes tools over MCP and the runner builds the spans.
Expand Down
75 changes: 69 additions & 6 deletions services/agent/src/engines/sandbox_agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,11 @@ import {
type ToolCallbackContext,
resolveRunSessionId,
} from "../protocol.ts";
import { probeCapabilities } from "./sandbox_agent/capabilities.ts";
import {
assert,
assertRequiredCapabilities,
probeCapabilities,
} from "./sandbox_agent/capabilities.ts";
import { buildDaemonEnv, resolveDaemonBinary } from "./sandbox_agent/daemon.ts";
import {
createCookieFetch,
Expand All @@ -55,6 +59,7 @@ import {
import { conciseError } from "./sandbox_agent/errors.ts";
import { buildSessionMcpServers } from "./sandbox_agent/mcp.ts";
import { applyModel } from "./sandbox_agent/model.ts";
import { findSwallowedPiError } from "./sandbox_agent/pi-error.ts";
import {
buildPiExtensionEnv,
prepareLocalPiAssets,
Expand All @@ -81,7 +86,12 @@ function log(message: string): void {

type Log = (message: string) => void;

const CLAUDE_STRICT_DEPLOYMENTS = new Set(["custom", "bedrock", "vertex", "vertex_ai"]);
const CLAUDE_STRICT_DEPLOYMENTS = new Set([
"custom",
"bedrock",
"vertex",
"vertex_ai",
]);

function applyClaudeConnectionEnv(
env: Record<string, string>,
Expand Down Expand Up @@ -110,7 +120,10 @@ function applyClaudeConnectionEnv(
env.CLAUDE_CODE_USE_VERTEX = "1";
}

if (selectedModel && (baseUrl || (deployment && CLAUDE_STRICT_DEPLOYMENTS.has(deployment)))) {
if (
selectedModel &&
(baseUrl || (deployment && CLAUDE_STRICT_DEPLOYMENTS.has(deployment)))
) {
env.ANTHROPIC_MODEL = selectedModel;
env.ANTHROPIC_CUSTOM_MODEL_OPTION = selectedModel;
return true;
Expand Down Expand Up @@ -162,7 +175,12 @@ export async function runSandboxAgent(
clearProviderEnv,
});
Object.assign(env, plan.secrets); // apply only the resolved provider keys
const strictModel = applyClaudeConnectionEnv(env, request, plan.acpAgent, logger);
const strictModel = applyClaudeConnectionEnv(
env,
request,
plan.acpAgent,
logger,
);
// Pi self-instruments locally: propagate the trace context + public tool metadata into Pi
// via the Agenta extension. Tool execution always relays back to this runner, which keeps
// private specs, scoped env, callback endpoints, and callback auth in memory.
Expand Down Expand Up @@ -233,14 +251,36 @@ export async function runSandboxAgent(
log: logger,
});

// Sandbox-start invariant: `startSandboxAgent` must hand back a usable handle, or the
// probe/createSession below fail with an opaque "cannot read property of undefined".
assert(
sandbox && typeof sandbox.createSession === "function",
`sandbox provider '${plan.sandboxId}' returned no usable sandbox handle`,
);

// Probe what this harness supports and branch on capabilities, not on the harness
// name. Tool delivery: Pi loads our extension (native tools, set up above); any other
// harness takes tools over MCP only when it advertises `mcpTools` (pi-acp does not
// forward MCP, Claude/Codex do).
const capabilities = await (deps.probeCapabilities ?? probeCapabilities)(
const probed = await (deps.probeCapabilities ?? probeCapabilities)(
sandbox,
plan.acpAgent,
);
const capabilities = probed.capabilities;

// Fail loud (A7): a run that REQUIRES a capability the harness lacks errors with a
// specific message instead of silently dropping the behavior, the way the
// `*_UNSUPPORTED_MESSAGE` gates in `run-plan.ts` do. Today: tool delivery to a non-Pi
// harness whose probe reports `mcpTools:false` / `toolCalls:false`. The throw is caught
// below and returned as `{ ok: false, error }`.
assertRequiredCapabilities({
harness: plan.harness,
isPi: plan.isPi,
probed,
toolSpecs: plan.toolSpecs,
log: logger,
});

const mcpServers = buildSessionMcpServers({
isPi: plan.isPi,
capabilities,
Expand Down Expand Up @@ -354,6 +394,29 @@ export async function runSandboxAgent(
const output = run.finish();
await run.flush();

// Fail loud on a swallowed model error (A7 / "fail loud, not silent"). When Pi's provider
// call fails (out-of-quota, bad key, rate limit, unknown model, ...), Pi's pi-acp bridge
// reports the turn as a plain `end_turn` with NO content, so without this the run would
// return an `ok:true` empty turn and the user would see a silent "No response" instead of
// the real failure. On the LOCAL Pi path the error is recoverable from Pi's own session
// transcript; surface it as a run error. Only checked when the turn produced no output and
// ran no tools (a real tool-only turn legitimately has empty text), and never on Daytona
// (the transcript lives in the remote sandbox).
if (
plan.isPi &&
!plan.isDaytona &&
!output.trim() &&
!run.events().some((e) => e.type === "tool_call")
) {
const piError = findSwallowedPiError(plan.sourcePiAgentDir, plan.cwd);
if (piError) {
return {
ok: false,
error: conciseError(new Error(piError), plan.harness, request.provider),
};
}
}

return {
ok: true,
output,
Expand All @@ -376,7 +439,7 @@ export async function runSandboxAgent(
} catch (err) {
otel?.finish();
await otel?.flush().catch(() => {});
return { ok: false, error: conciseError(err, plan.harness) };
return { ok: false, error: conciseError(err, plan.harness, request.provider) };
} finally {
await toolRelay?.stop().catch(() => {});
await sandbox?.destroySandbox().catch(() => {});
Expand Down
Loading
Loading