diff --git a/docs/design/agent-workflows/projects/qa/f040-park-terminal-plan.md b/docs/design/agent-workflows/projects/qa/f040-park-terminal-plan.md new file mode 100644 index 0000000000..37d08b2fbf --- /dev/null +++ b/docs/design/agent-workflows/projects/qa/f040-park-terminal-plan.md @@ -0,0 +1,68 @@ +# F-040 fix plan: make a HITL park END the `/run` turn gracefully + +## The bug (confirmed, live) + +On the `/messages` (HITL) path, when the responder returns `park` +(`engines/sandbox_agent/permissions.ts`), the runner sends NO `respondPermission` and +relies on `session.prompt()` resolving. Claude-over-ACP does NOT end the turn on an +unanswered permission gate, so `session.prompt()` (`sandbox_agent.ts:~428`) blocks +forever. Consequences: + +1. The parked turn never terminates → its temp sandbox leaks (the `finally` never runs). +2. The egress (`vercel/stream.py`) never sees the run iterator end → no `finish` frame → + the SSE stream hangs. +3. The AI-SDK resume fires while the old stream is still open → errors → "agent run + failed" (Deny) / 5-min hang then `ERR_ABORTED` (Approve). + +## The fix (one move, runner-side) + +When the responder returns `park`, **end the current `/run` turn gracefully** instead of +holding the ACP connection open: + +- `permissions.ts` reports the park back to the orchestration loop (a callback / signal). +- `sandbox_agent.ts` races `session.prompt()` against a "parked" signal. On park it calls + `sandbox.destroySession(session.id)`, which (verified in the `sandbox-agent` package): + - `cancelPendingPermissionsForSession` resolves the pending permission RPC with + `{outcome: "cancelled"}` (NOT a reject → no F-024 clobber), and + - sends the managed `session/cancel`, so the in-flight `prompt()` resolves with + `StopReason::Cancelled`. +- The runner then returns a terminal result with `stopReason: "paused"` (a park terminal, + distinct from a model `cancelled`), so: + - the egress loop ends and emits a proper `finish` frame (`finishReason` maps to + `unknown`/omitted — fine; the FE re-runs on resume), + - the `finally` disposes the sandbox (NO leak). + +The resume then cold-replays as a fresh turn: `extractApprovalDecisions` resolves the +stored decision via the name+args anchor (already built, #4854), Approve → tool runs → +completes; Deny → clean denial (`#4859` FE path already maps deny → tool-error, model +continues). + +### Why `destroySession`, not a raw `session/cancel` + +The `sandbox-agent` package blocks a manual `session/cancel` via `rawSend` +(`MANUAL_CANCEL_ERROR`); only `destroySession` is allowed to send the managed cancel. +`destroySession` is exactly the right call: it cancels the pending permission cleanly AND +the in-flight prompt, then marks the session destroyed (we dispose the sandbox anyway). + +## FE resume contract (point 4) — already built, just verified + +`vercel/messages.py` `_tool_part_blocks` already converts an inline +`state:"approval-responded"` tool part into the `{approved}` envelope, and +`agentApprovalResume.ts` ensures the FE includes the responded tool part on resume. No FE +change needed; covered by existing tests. + +## Slices + +1. **Runner park-terminal** (`permissions.ts` + `sandbox_agent.ts`): park signal → + `destroySession` → terminal `stopReason:"paused"`; the `finally` disposes (no leak). +2. **Tests**: park emits a terminal result (stopReason paused, prompt unblocked); no + sandbox leak on park (destroySession + destroySandbox both called); the resume + cold-replay still resolves (existing test stays green). + +## Acceptance + +- Unit: a parked run RETURNS (does not hang), `stopReason:"paused"`, `destroySession` + called once, sandbox disposed once. +- Live: Claude + haiku + gated github tool + Ask rule → Approve completes with the real + result (no hang); Deny → clean denial (no "agent run failed"); NO leaked runner temp + sandbox after a parked→resumed turn. Reproduce 2-3x. diff --git a/sdks/python/agenta/sdk/agents/adapters/vercel/stream.py b/sdks/python/agenta/sdk/agents/adapters/vercel/stream.py index 4d59b0db9d..28c7f4cc27 100644 --- a/sdks/python/agenta/sdk/agents/adapters/vercel/stream.py +++ b/sdks/python/agenta/sdk/agents/adapters/vercel/stream.py @@ -26,6 +26,11 @@ "function_call": "tool-calls", "refusal": "content-filter", "content_filter": "content-filter", + # A HITL park ends the turn intentionally-but-incomplete (the FE then resumes on the + # user's decision). It is neither a model completion nor an error, so map it to the AI + # SDK's `other` rather than letting it fall through to `unknown` (F-040). + "paused": "other", + "cancelled": "other", } diff --git a/sdks/python/oss/tests/pytest/unit/agents/adapters/test_vercel_stream_park.py b/sdks/python/oss/tests/pytest/unit/agents/adapters/test_vercel_stream_park.py new file mode 100644 index 0000000000..093e71917d --- /dev/null +++ b/sdks/python/oss/tests/pytest/unit/agents/adapters/test_vercel_stream_park.py @@ -0,0 +1,84 @@ +"""Egress finish-on-park (F-040). + +A parked HITL turn now ends gracefully on the runner with a terminal ``stopReason:"paused"`` +result, so the streaming egress must drain to a clean ``finish`` frame (the FE then resumes on +the user's decision). Before the runner fix the parked turn never terminated, the stream hung, +and no ``finish`` ever arrived. This pins the egress side of that contract: given a parked +record stream, ``agent_run_to_vercel_parts`` yields the approval request AND a final ``finish``. +""" + +from __future__ import annotations + +from typing import Any, AsyncIterator, Dict, List + +import pytest + +from agenta.sdk.agents.adapters.vercel.stream import agent_run_to_vercel_parts +from agenta.sdk.agents.streaming import AgentRun + + +async def _records(items: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any]]: + for item in items: + yield item + + +def _parked_run() -> AgentRun: + """A runner record stream for a parked turn: the gated tool call, the approval request, + a ``done`` event, then the terminal ``paused`` result (what the F-040 runner fix emits).""" + return AgentRun( + _records( + [ + { + "kind": "event", + "event": { + "type": "tool_call", + "id": "tool-1", + "name": "github__get_user", + "input": {}, + }, + }, + { + "kind": "event", + "event": { + "type": "interaction_request", + "id": "perm-1", + "kind": "permission", + "payload": {"toolCallId": "tool-1"}, + }, + }, + {"kind": "event", "event": {"type": "done", "stopReason": "paused"}}, + { + "kind": "result", + "result": { + "ok": True, + "output": "", + "stopReason": "paused", + "sessionId": "conv-1", + "traceId": "trace-1", + }, + }, + ] + ) + ) + + +@pytest.mark.asyncio +async def test_parked_run_emits_approval_then_finish() -> None: + parts = [part async for part in agent_run_to_vercel_parts(_parked_run())] + types = [p.get("type") for p in parts] + + # The approval request reaches the FE so it can prompt the user ... + assert "tool-approval-request" in types + # ... and the stream drains to a clean finish (no immortal-park hang, F-040). + assert types[-1] == "finish" + assert types.count("finish") == 1 + + # The approval request carries the gated tool-call id so it attaches to the tool part. + approval = next(p for p in parts if p["type"] == "tool-approval-request") + assert approval["approvalId"] == "perm-1" + assert approval["toolCallId"] == "tool-1" + + # A park is intentional-but-incomplete, mapped to the AI SDK `other` finish reason (not + # a model completion `stop`, not `unknown`). + finish = parts[-1] + assert finish["finishReason"] == "other" diff --git a/services/agent/src/engines/sandbox_agent.ts b/services/agent/src/engines/sandbox_agent.ts index 001dec8b46..02c617b694 100644 --- a/services/agent/src/engines/sandbox_agent.ts +++ b/services/agent/src/engines/sandbox_agent.ts @@ -398,9 +398,32 @@ export async function runSandboxAgent( // decisions the HITLResponder falls back to the base policy and is byte-identical to the // old PolicyResponder, so `/invoke` is unchanged. const hasHumanSurface = !!(request.sessionId && request.sessionId.trim()); + // A park (cross-turn HITL) must END the turn gracefully, not hold the ACP connection open + // forever (F-040): Claude does not end a turn on an unanswered gate, so `session.prompt()` + // would block indefinitely, the sandbox would leak, and the egress would never emit a + // `finish` frame. On the first park we `destroySession`, which (a) resolves the pending + // permission RPC with `{outcome:"cancelled"}` — NOT a reject, so no F-024 clobber — and + // (b) sends the managed `session/cancel` so the in-flight `prompt()` resolves with a + // cancelled stop reason. The resume then cold-replays as a fresh turn (#4854). + let parked = false; + let resolveParked: (() => void) | undefined; + const parkedSignal = new Promise((resolve) => { + resolveParked = resolve; + }); + const onPark = (): void => { + if (parked) return; // first park wins; later gates this turn are moot once we cancel + parked = true; + resolveParked?.(); + // Cancel the in-flight prompt so `session.prompt()` returns instead of hanging. The + // managed cancel lives on the SandboxAgent handle (the `Session` wrapper has none). + // Best effort: the `finally` disposes the sandbox regardless, and the prompt is also + // raced against `parkedSignal` below so the turn ends even if this call rejects. + void sandbox.destroySession?.(session.id).catch(() => {}); + }; attachPermissionResponder({ session, run, + onPark, responder: deps.responderFactory?.(request.permissionPolicy) ?? new HITLResponder( @@ -425,11 +448,29 @@ export async function runSandboxAgent( ); } - const result = await session.prompt([ - { type: "text", text: plan.turnText }, + // Race the prompt against the park signal: on a HITL park the prompt either resolves with + // a cancelled stop reason (the managed `session/cancel` landed) or never resolves at all + // (the harness ignores it) — either way `parkedSignal` ends the turn so the runner returns + // promptly, the `finally` disposes the sandbox (no leak, F-040), and the egress emits a + // `finish` frame. When the park wins, the orphaned `prompt()` may later reject as the + // cancelled/torn-down connection unwinds; swallow that so it is not an `unhandledRejection` + // (the daemon teardown's `fetch failed` is expected on a cancel, not a run error). + const PARKED = Symbol("parked"); + const promptPromise = Promise.resolve( + session.prompt([{ type: "text", text: plan.turnText }]), + ); + promptPromise.catch(() => {}); + const raced = await Promise.race([ + promptPromise, + parkedSignal.then(() => PARKED), ]); await toolRelay?.stop(); - const stopReason = (result as any)?.stopReason; + // A parked turn is terminal-but-incomplete: stop reason `paused` tells the egress to emit + // a clean `finish` (the FE then resumes on the user's decision). A real prompt result keeps + // the harness's own stop reason. + const stopReason = + raced === PARKED || parked ? "paused" : (raced as any)?.stopReason; + const result = raced === PARKED ? undefined : raced; logger(`prompt stopReason=${stopReason}`); // Usage: Pi writes its totals to a file via the extension. Other harnesses report the diff --git a/services/agent/src/engines/sandbox_agent/permissions.ts b/services/agent/src/engines/sandbox_agent/permissions.ts index 2155c1a990..f4ecb12e5a 100644 --- a/services/agent/src/engines/sandbox_agent/permissions.ts +++ b/services/agent/src/engines/sandbox_agent/permissions.ts @@ -5,6 +5,15 @@ export interface AttachPermissionResponderInput { session: any; run: { emitEvent: (event: AgentEvent) => void }; responder: Responder; + /** + * Called when the responder PARKS a gate (cross-turn HITL). The orchestration loop uses + * this to END the turn gracefully: a parked Claude turn never resolves `session.prompt()` + * on its own (the harness does not end the turn on an unanswered gate), so without this the + * prompt blocks forever, the sandbox leaks, and the egress never emits a `finish` frame + * (F-040). The runner cancels the in-flight prompt (via `destroySession`) so the turn ends + * paused and the resume cold-replays. Fires at most once per turn (the first park wins). + */ + onPark?: () => void; } /** @@ -17,6 +26,7 @@ export function attachPermissionResponder({ session, run, responder, + onPark, }: AttachPermissionResponderInput): void { session.onPermissionRequest((req: any) => { const id = String(req?.id ?? ""); @@ -37,13 +47,17 @@ export function attachPermissionResponder({ void responder .onPermission({ id, availableReplies, raw: req }) .then((decision) => { - if (!req?.id) return; // PARK (cross-turn HITL): send NO reply. The `interaction_request` above is the last - // word on this tool call; the harness ends the turn with the tool PENDING and the next - // turn's stored decision resolves it. Replying `reject` here would make Claude emit a - // failed tool call ("User refused permission") that clobbers the approval prompt on the - // same tool-call id (F-024) — do NOT map park onto a reply. - if (decision === "park") return; + // word on this tool call; the next turn's stored decision resolves it. Replying + // `reject` here would make Claude emit a failed tool call ("User refused permission") + // that clobbers the approval prompt on the same tool-call id (F-024) — do NOT map park + // onto a reply. Instead signal the orchestration loop to end the turn gracefully: + // Claude never ends a turn on an unanswered gate, so the prompt would otherwise hang. + if (decision === "park") { + onPark?.(); + return; + } + if (!req?.id) return; return session.respondPermission( req.id, decisionToReply(decision, availableReplies) as any, diff --git a/services/agent/tests/unit/sandbox-agent-orchestration.test.ts b/services/agent/tests/unit/sandbox-agent-orchestration.test.ts index 5bef06141c..c90906db97 100644 --- a/services/agent/tests/unit/sandbox-agent-orchestration.test.ts +++ b/services/agent/tests/unit/sandbox-agent-orchestration.test.ts @@ -27,6 +27,12 @@ interface FakeOptions { promptError?: Error; permissionDecision?: PermissionDecision; emitPermission?: boolean; + // Model Claude-over-ACP: the prompt NEVER resolves on its own after a permission gate. The + // runner must end the turn another way (the park -> destroySession -> cancel path, F-040). + hangPrompt?: boolean; + // Make the managed cancel reject (and NOT resolve the hung prompt), so the only thing that + // ends the turn is the local park signal — proves the run still terminates if cancel fails. + destroySessionError?: Error; } function fakeHarness(options: FakeOptions = {}) { @@ -43,6 +49,7 @@ function fakeHarness(options: FakeOptions = {}) { workspaceCleanup: 0, sandboxDestroyed: 0, sandboxDisposed: 0, + sessionDestroyed: 0, toolRelayArgs: undefined as unknown[] | undefined, toolRelayStops: 0, permissionReplies: [] as Array<{ id: string; reply: string }>, @@ -57,6 +64,9 @@ function fakeHarness(options: FakeOptions = {}) { const events: AgentEvent[] = []; let eventHandler: ((event: any) => void) | undefined; let permissionHandler: ((request: any) => void) | undefined; + // The in-flight prompt resolver, so a `destroySession` (the managed cancel) can resolve a + // hung prompt with a cancelled stop reason — mirroring the real sandbox-agent package. + let resolveHungPrompt: ((value: any) => void) | undefined; const session = { id: "session-1", @@ -80,6 +90,13 @@ function fakeHarness(options: FakeOptions = {}) { }); } if (options.promptError) throw options.promptError; + if (options.hangPrompt) { + // Claude does not end a turn on an unanswered gate: the prompt hangs until the + // managed cancel (destroySession) resolves it with a cancelled stop reason. + return new Promise((resolve) => { + resolveHungPrompt = resolve; + }); + } return ( options.promptResult ?? { stopReason: "complete", @@ -94,6 +111,14 @@ function fakeHarness(options: FakeOptions = {}) { calls.createSessionOptions = opts; return session; }, + async destroySession(id: string) { + calls.sessionDestroyed += 1; + void id; + if (options.destroySessionError) throw options.destroySessionError; + // Managed cancel: resolve any in-flight prompt with a cancelled stop reason (the runner + // races this against the park signal, so the turn ends either way). Mirrors the package. + resolveHungPrompt?.({ stopReason: "cancelled" }); + }, async destroySandbox() { calls.sandboxDestroyed += 1; }, @@ -687,6 +712,82 @@ describe("runSandboxAgent default HITL responder wiring", () => { assert.deepEqual(calls.permissionReplies, []); }); + it("park ENDS the turn even when the prompt hangs: terminal stopReason 'paused', no harness reply (F-040)", async () => { + // The real regression: Claude does NOT end a turn on an unanswered gate, so without the + // park->cancel path `session.prompt()` blocks forever and the run never returns. With + // hangPrompt the fake prompt never resolves on its own; the run must still RETURN, driven + // by the park signal + the managed cancel. + const { calls, deps } = (() => { + const { calls, deps } = fakeHarness({ + emitPermission: true, + hangPrompt: true, + }); + delete deps.responderFactory; // engine HITLResponder -> parks (human surface, no decision) + return { calls, deps }; + })(); + + const result = await runSandboxAgent( + { + harness: "claude", + sessionId: "conv-1", + messages: [{ role: "user", content: "edit the file" }], + }, + undefined, + undefined, + deps, + ); + await flushPromises(); + + assert.equal(result.ok, true); + if (!result.ok) return; + // The turn returns terminal-but-incomplete: `paused` tells the egress to emit a clean + // `finish` so the FE can resume on the user's decision (no immortal-park hang, F-040). + assert.equal(result.stopReason, "paused"); + // No reply reached the harness (no F-024 clobber). + assert.deepEqual(calls.permissionReplies, []); + // The session was cancelled (managed `session/cancel` via destroySession) ... + assert.equal(calls.sessionDestroyed, 1); + // ... and the sandbox was disposed in the finally — the parked turn does NOT leak it. + assert.equal(calls.sandboxDestroyed, 1); + assert.equal(calls.sandboxDisposed, 1); + assert.equal(calls.workspaceCleanup, 1); + }); + + it("park terminates even if the managed cancel rejects (local park signal still ends the turn)", async () => { + // Defense in depth: if destroySession throws (the daemon already tore down, a network + // blip, etc.) and the prompt never resolves, the local `parkedSignal` must STILL end the + // turn so the run returns and the `finally` disposes the sandbox (no hang, no leak). + const { calls, deps } = (() => { + const { calls, deps } = fakeHarness({ + emitPermission: true, + hangPrompt: true, + destroySessionError: new Error("session already gone"), + }); + delete deps.responderFactory; + return { calls, deps }; + })(); + + const result = await runSandboxAgent( + { + harness: "claude", + sessionId: "conv-1", + messages: [{ role: "user", content: "edit the file" }], + }, + undefined, + undefined, + deps, + ); + await flushPromises(); + + assert.equal(result.ok, true); + if (!result.ok) return; + assert.equal(result.stopReason, "paused"); + assert.equal(calls.sessionDestroyed, 1, "cancel was attempted"); + // The turn still ended and the sandbox was disposed despite the failed cancel. + assert.equal(calls.sandboxDestroyed, 1); + assert.equal(calls.sandboxDisposed, 1); + }); + it("human surface with a stored approval resumes the tool (once)", async () => { const { calls, deps } = depsWithDefaultResponder();