From 23aab377c05261db4803f7debea3fcd2759c3520 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Wed, 24 Jun 2026 20:40:33 +0200
Subject: [PATCH 1/6] docs(agent): plan a schema-driven /run wire contract

Replace the hand-mirrored protocol.ts <-> wire.py contract (guarded only by
golden fixtures) with a single source of truth: dedicated Pydantic wire models,
exported JSON Schema, ajv validation in the standalone Node runner. Adds the
/run split decision (keep the turn unified; promote GET /capabilities + consume
the contract version on both transports), a structured error model
{ code, message, retryable } with a correctly-modeled cancelled outcome, an
in-band contractVersion, and a 9-step test-at-each-step migration (steps 1-6
behavior-preserving at v1; A10 error-model + A3 backend-removal/harness-rename
as one v2 cut; gated on the A1 versioning / A3 / A10 siblings). Codex-reviewed.
Composio, the tool gateway, connections, and MCP are unchanged. Docs only.

Claude-Session: https://claude.ai/code/session_01GYo3UEfvsZpncagqb28Mbc
---
 .../projects/wire-contract-schema/README.md   | 564 ++++++++++++++++++
 .../projects/wire-contract-schema/status.md   |  81 +++
 2 files changed, 645 insertions(+)
 create mode 100644 docs/design/agent-workflows/projects/wire-contract-schema/README.md
 create mode 100644 docs/design/agent-workflows/projects/wire-contract-schema/status.md

diff --git a/docs/design/agent-workflows/projects/wire-contract-schema/README.md b/docs/design/agent-workflows/projects/wire-contract-schema/README.md
new file mode 100644
index 0000000000..4b0c98ef04
--- /dev/null
+++ b/docs/design/agent-workflows/projects/wire-contract-schema/README.md
@@ -0,0 +1,564 @@
+# Project: A schema-driven `/run` contract
+
+| | |
+| --- | --- |
+| **Status** | Plan. Not started. Awaiting review before any code. |
+| **Type** | Engineering project (a sequenced, test-driven migration), not a one-shot change. |
+| **Scope** | Replace the hand-mirrored `/run` wire contract with a single schema source; add boundary validation; evaluate splitting `/run`; fold in a structured error model and a carried contract version. |
+| **Owner files (today)** | `services/agent/src/protocol.ts` (TS types), `sdks/python/agenta/sdk/agents/utils/wire.py` (Python mirror), `sdks/python/oss/tests/pytest/unit/agents/golden/` (fixtures), `sdks/python/oss/tests/pytest/unit/agents/test_wire_contract.py` + `services/agent/tests/unit/wire-contract.test.ts` (the two contract tests). |
+| **Reference** | The deep spec of the contract as built: [`../runner-interface/README.md`](../runner-interface/README.md). Its Section 12 ("Known gaps") names the exact gaps this project closes. The inventory page: [`../../interfaces/cross-service/service-to-agent-runner.md`](../../interfaces/cross-service/service-to-agent-runner.md). |
+| **Mirroring rule today** | `services/agent/CLAUDE.md` ("The wire contract is mirrored — change both sides"). |
+
+## 1. The problem, precisely
+
+The `/run` contract is the spine of the agent stack: the Python agent service builds a request,
+the Node runner executes a turn, and returns a result or a stream of events. The contract is
+**defined twice** and kept in sync **by hand**:
+
+- TypeScript: `services/agent/src/protocol.ts` declares `AgentRunRequest`, `AgentRunResult`,
+  the `AgentEvent` union, `HarnessCapabilities`, and the sub-objects (`ResolvedToolSpec`,
+  `ToolCallbackContext`, `McpServerConfig`, `SandboxPermission`, `TraceContext`, `WireSkill`,
+  `ContentBlock`, `ChatMessage`, `AgentUsage`, `RenderHint`, `StreamRecord`).
+- Python: `sdks/python/agenta/sdk/agents/utils/wire.py` (`request_to_wire` / `result_from_wire`)
+  plus the BaseModels in `sdks/python/agenta/sdk/agents/dtos.py` (`Message`, `AgentEvent`,
+  `AgentResult`, `HarnessCapabilities`, `TraceContext`, `SandboxPermission`, ...) re-create the
+  same field names by hand.
+
+The **only** guard against the two drifting is four golden fixtures
+(`golden/run_request.{pi,claude}.json`, `golden/run_result.{ok,error}.json`) asserted by two
+tests. The TS test adds a compile-time key guard (`KNOWN_REQUEST_KEYS` assigned to
+`(keyof AgentRunRequest)[]`), and the Python test holds a parallel `KNOWN_REQUEST_KEYS` set.
+
+This is brittle for concrete, observed reasons:
+
+1. **Two hand-kept key lists.** `KNOWN_REQUEST_KEYS` is duplicated in
+   `test_wire_contract.py` and `wire-contract.test.ts`. A new field means editing five places
+   (golden, `protocol.ts`, `wire.py`, both key lists) "deliberately", per the CLAUDE.md rule.
+2. **No runtime validation at the boundary.** `POST /run` JSON-parses the body and runs with
+   whatever fields are present; an empty body becomes `{}` (`server.ts`). A malformed or
+   misspelled field is silently ignored, not rejected. The contract is *implicitly all-optional*
+   (every TS field is `?`, every Python field defaults). A typo like `sandboxPermision` is
+   dropped on the floor with no error. This is `runner-interface/README.md` §12 gap
+   "No schema validation on the runner".
+3. **The version skew guard is exposed but unconsumed.** `version.ts` exports
+   `PROTOCOL_VERSION = 1` and `/health` returns it, but **no Python caller probes `/health`**
+   (verified: no reference to `runnerInfo`/`PROTOCOL_VERSION`/`/health` in the runner-calling
+   path). A client and runner can silently disagree across a major bump. §12 gap "The version
+   skew guard is not consumed".
+4. **The error model is a free string.** `AgentRunResult.error?: string` with no taxonomy and
+   no machine-readable code; `result_from_wire` turns any `ok:false` into a generic
+   `RuntimeError(f"Agent run failed: {error}")`. There is **no distinct cancelled outcome** —
+   a user/client abort surfaces (if at all) as a transport teardown or a generic error, not as a
+   first-class result. §12 names neither, but the user has scoped this as the A10 cleanup.
+
+The fix is a **single source of truth** plus **boundary validation**, sequenced so each step is
+a small change with a test that proves it, and folding in the sibling-project changes (A1
+versioning, A3 backend removal + harness rename, A10 error model).
+
+## 2. What this project changes vs leaves alone
+
+**In scope:**
+
+- One schema as the source of truth for the `/run` request, result, event union, capabilities,
+  and the sub-objects listed above.
+- Runtime validation of `/run` at the Node boundary (reject malformed input with a clear error).
+- A symmetric validation on the Python parse path (reject a malformed result).
+- A structured error object `{ code, message, retryable }` and a distinct `cancelled` outcome.
+- A contract version carried in the payload (not only on `/health`), and a probe that consumes
+  it.
+- A decision on splitting `/run` into more than one endpoint.
+- Replacing the four golden fixtures + two key lists with schema-derived checks (the golden
+  fixtures stay as *examples that must validate*, not as the only guard).
+
+**Explicitly unchanged by this work** (called out so reviewers do not expect movement):
+
+- **Composio, the tool gateway, connections, and MCP** all continue to work exactly as today.
+  They are inputs the service already resolves: `customTools` (gateway callback / code / client),
+  `toolCallback`, `mcpServers`, `connection` / `provider` / `endpoint` / `credentialMode`. This
+  project re-expresses their **shapes** in a schema and validates them; it does not change how
+  any of them resolve, route, or authenticate. The Composio key stays server-side; the gateway
+  callback still POSTs to `/tools/call`; MCP `stdio`/`http` shapes are preserved byte-for-byte.
+- The transports (HTTP + subprocess CLI) and the two modes (one-shot JSON + NDJSON streaming).
+- The harness shaping logic (`config.wire_tools()` etc.) — the schema describes the *output*
+  of that shaping, it does not move the shaping.
+- Tracing (`trace` / `TraceContext`) and the trace-export boundary.
+
+## 3. Current contract surface (assessment)
+
+The contract has four families. This is what a single schema has to cover.
+
+### 3.1 Request (`AgentRunRequest`)
+
+~30 top-level fields, **all optional on the wire**, grouped by job:
+
+| Group | Fields |
+| --- | --- |
+| engine + placement | `backend`, `harness`, `sandbox`, `sessionId` |
+| instructions | `agentsMd`, `systemPrompt`, `appendSystemPrompt` |
+| model + connection | `model`, `provider`, `connection {mode, slug?}`, `deployment`, `endpoint {baseUrl?, apiVersion?, region?, headers?}`, `credentialMode`, `secrets` |
+| turn | `prompt`, `messages` (`ChatMessage[]`) |
+| tools + skills | `tools` (string[]), `customTools` (`ResolvedToolSpec[]`), `toolCallback`, `mcpServers`, `skills` (`WireSkill[]`) |
+| policy + files | `permissionPolicy`, `sandboxPermission`, `harnessFiles` (`[{path, content}]`) |
+| tracing | `trace` (`TraceContext`) |
+
+Back-compat splits the schema must preserve: a plain-string `model` keeps `provider` /
+`connection` / `deployment` / `endpoint` / `credentialMode` **off** the wire; `mcpServers`,
+`skills`, `sandboxPermission`, `harnessFiles` are **omitted** (not null) when empty so a
+minimal payload stays byte-identical to the golden.
+
+### 3.2 Result (`AgentRunResult`)
+
+`ok` (bool), `output?`, `messages?`, `events?`, `usage?` (`AgentUsage`), `stopReason?`,
+`capabilities?` (`HarnessCapabilities`), `sessionId?`, `model?`, `traceId?`, `error?` (the free
+string this project replaces). `ok:false` raises in Python (`result_from_wire`).
+
+### 3.3 The event union (`AgentEvent`)
+
+A discriminated union on `type`: `message`, `thought`, the `message_*` / `reasoning_*` lifecycle
+trios, `tool_call`, `tool_result`, `interaction_request`, `data`, `file`, `usage`, `error`,
+`done`. Plus `StreamRecord = {kind:"event",event} | {kind:"result",result}` for NDJSON framing.
+Note: the Python side intentionally **drops unknown event types** on parse
+(`AgentEvent.from_wire` returns `None` for a typeless event), and a golden pins that. The schema
+must keep events **open/forward-compatible**, not closed.
+
+### 3.4 Sub-objects
+
+`ResolvedToolSpec` (the three-axis tool surface: `kind`/`runtime`/`code`/`env`/`callRef`,
+`needsApproval`, `render`, `readOnly`, `permission`), `ToolCallbackContext`, `McpServerConfig`,
+`SandboxPermission` (nested `network`, `filesystem`, `enforcement`), `HarnessCapabilities` (11
+boolean flags), `TraceContext`, `WireSkill` + `WireSkillFile`, `ContentBlock`, `ChatMessage`,
+`AgentUsage`, `RenderHint`.
+
+### 3.5 The existing golden/test machinery
+
+- `golden/run_request.pi.json` (full Pi shape: tools, skills, sandboxPermission, prompt overrides),
+  `golden/run_request.claude.json` (Claude shape: empty `tools`, `permissionPolicy:"deny"`,
+  `harnessFiles` with rendered `.claude/settings.json`).
+- `golden/run_result.ok.json` (includes a typeless event to pin the drop behavior),
+  `golden/run_result.error.json` (`{"ok": false, "error": "model exploded"}`).
+- Python `test_wire_contract.py`: builds payloads via the real configs and asserts `== golden`,
+  plus `set(payload) <= KNOWN_REQUEST_KEYS`.
+- TS `wire-contract.test.ts`: loads the goldens, asserts shapes through the runner helpers
+  (`resolvePromptText`, `messageText`, `resolveRunSessionId`), and the two compile-time guards
+  (`KNOWN_REQUEST_KEYS` / `CAPABILITY_KEYS` assigned to `keyof` types).
+
+The machinery is **good** and we keep its spirit: the goldens become "examples that must validate
+against the schema", and the duplicated key lists are replaced by schema-derived assertions.
+
+## 4. Design options for a single source of truth
+
+Three candidates, judged against this stack: **Python Pydantic 2 SDK** + a **standalone Node ESM
+runner package** (`services/agent`) that has its own `pnpm-lock.yaml`, runs through `tsx` with
+**no app compile step and no codegen toolchain today**, and is deliberately decoupled from the
+`web/` dependency graph. There is no JSON-Schema codegen, no `quicktype`,
+no `datamodel-code-generator`, and **no zod** anywhere in the runner or web (verified).
+
+### Option A — JSON Schema as source, codegen both sides
+
+Author the contract as hand-written JSON Schema files; generate TS types
+(`json-schema-to-typescript`) and Pydantic models (`datamodel-code-generator`) from them. Both
+sides validate with the schema at runtime (ajv on Node, `jsonschema`/Pydantic on Python).
+
+- **Pros:** language-neutral source; one artifact; both sides are generated, so neither drifts.
+- **Cons:** introduces **two new codegen toolchains** into a repo that has none for this, and a
+  build step into a package that intentionally has none (`services/agent/CLAUDE.md`: "no app
+  compile step"). Hand-writing JSON Schema is verbose and error-prone for a union as rich as
+  `AgentEvent` + `RenderHint`. The Python SDK already has hand-written BaseModels with custom
+  `to_wire`/`from_wire` (camelCase aliasing, the `model`-string back-compat split, the
+  drop-unknown-event behavior); regenerating them from schema would either lose that behavior or
+  require post-gen patching. High blast radius, fights the existing grain.
+
+### Option B — Pydantic as source, export JSON Schema, validate on the TS side (RECOMMENDED)
+
+Make **Python Pydantic models the source of truth** — but a **dedicated set of *wire* models**,
+NOT the existing semantic DTOs. This distinction is load-bearing (it was the sharpest review
+finding): the real contract today does not live in `dtos.py`'s classes — it lives in the **hand
+serializers** (`request_to_wire` builds a raw dict; `Message.to_wire`, `TraceContext.to_wire`,
+`AgentEvent.from_wire`, etc. do the camelCase + omit + drop-unknown work). The semantic DTOs use
+**snake_case** fields (`text_messages`, `mime_type`, `capture_content`) and an intentionally
+loose `AgentEvent` (`type: str` + free `data` dict, vs the real discriminated union in
+`protocol.ts`). Exporting `model_json_schema()` straight off those DTOs would produce the *wrong*
+schema (snake_case keys, a non-discriminated event). So:
+
+- Author new wire models in the SDK (e.g. `agents/wire_models.py`): `WireRunRequest`,
+  `WireRunResult`, and an **explicit discriminated `WireAgentEvent` union** (real variants on
+  `type`, plus an open fallback variant so unknown event types still validate, matching the
+  current drop-unknown tolerance) — with camelCase aliases (`populate_by_name=True`, as
+  `AgentConfig` already does), explicit nullability, and the exact field set the serializers emit.
+- These wire models become the single producer: `request_to_wire` / `result_from_wire` are
+  reimplemented in terms of them. The omit-when-empty behavior stays as serializer logic + golden
+  checks — `model_json_schema()` expresses "optional", not "omit when empty".
+- Pydantic 2's `model_json_schema()` exports the JSON Schema artifact **for free**, no new
+  toolchain. Commit it as `services/agent/contract/run-contract.schema.json`.
+- The TS runner validates incoming `/run` against it with one small validator (ajv, the one new
+  runner dep) — **no codegen, no build step**: the schema is data the runner loads.
+
+- **Pros:** fits the stack — Pydantic 2 is already the SDK's modeling layer (`pydantic>=2,<3`);
+  the producer (Python) is the natural source since it builds the request. Schema export is a
+  built-in, not a new tool. The runner gains real runtime validation with one dependency (ajv)
+  and zero build step, honoring the standalone-package constraint. The omit-when-empty behavior
+  stays in Python where it already lives and is tested. The exported schema becomes a
+  **CI-checked artifact**: a test fails if the committed schema drifts from the wire models.
+- **Cons:** requires writing dedicated wire models (a real cost, but it is the honest cost of a
+  single source — the alternative is the current double-maintenance). For the TS *types* there are
+  two sub-options: **(b1)** keep hand-written `protocol.ts` + a schema-derived guard, or **(b2)**
+  generate `protocol.ts` from the schema with `json-schema-to-typescript` as a committed artifact
+  (a checked-in generated file run by a script, not a runtime build step). A hand-written
+  `protocol.ts` + a top-level key guard does **not** catch *nullability* drift — the Claude golden
+  sends `sessionId: null` / `trace: null` while `protocol.ts` types them `?: string` (present-or-
+  absent, not nullable). So b1 needs schema-derived type-equivalence tests deeper than keys, and
+  **b2 (generate the types) is the safer end state**. This is an open question (§10).
+
+### Option C — A shared IDL (`.proto`, Smithy, etc.)
+
+Define the contract in a neutral IDL and generate both sides + a validator.
+
+- **Pros:** strongest neutrality; mature codegen.
+- **Cons:** the heaviest option for an internal JSON-over-HTTP/stdio boundary. The wire is JSON,
+  not protobuf; adopting proto means either proto-over-JSON (awkward) or changing the wire format
+  (out of scope and risky). Brings a build toolchain and a new language into a two-language repo
+  that wants fewer moving parts. The `AgentEvent` open-union + "drop unknown" semantics fit JSON
+  Schema's `additionalProperties`/`oneOf` better than proto's closed messages. Overkill.
+
+### Recommendation: Option B (Pydantic-as-source → exported JSON Schema → ajv validation in the runner)
+
+It is the only option that adds **one** runtime dependency (ajv) and **zero** runtime build
+steps, fits the Pydantic 2 stack, keeps the custom serialization semantics where they are tested,
+and turns the schema into a CI-checked artifact that makes drift a failing test instead of a
+code-review discipline. Source of truth = dedicated Pydantic **wire** models (not the semantic
+DTOs); the exported `run-contract.schema.json` is the shared artifact; the TS types are best
+**generated** from it (sub-option b2) so nullability drift cannot hide.
+
+## 5. Runtime validation at the boundary
+
+Today `/run` is implicitly all-optional and silently drops typos. The plan:
+
+- **Runner ingress (request).** In `server.ts` / `cli.ts`, after JSON-parse, validate the body
+  against `run-contract.schema.json` with ajv. On failure, return a structured `400` (HTTP) /
+  exit 1 (CLI) with the validator's error path — e.g. `{ ok:false, error:{ code:"invalid_request",
+  message:"sandboxPermission.network.mode: must be one of on|off|allowlist", retryable:false } }`.
+  Replace today's "empty body -> `{}` runs with defaults" with an explicit allowance: an empty
+  body is still valid (the contract tests rely on it), but a *present but malformed* body is
+  rejected. Decide on `additionalProperties`: start **permissive** (warn/log unknown top-level
+  keys, do not reject) to avoid breaking a newer service against an older runner, and tighten to
+  reject in a later, version-gated step. Unknown **event** types stay tolerated by design.
+- **Python egress (request) — optional symmetric guard.** `request_to_wire` can validate its own
+  output against the same schema in tests (and optionally under a debug flag at runtime), so the
+  producer cannot emit a payload the runner would reject. This is the producer-side half of the
+  guard and is cheap because the schema already exists.
+- **Python ingress (result).** `result_from_wire` gains schema validation of the result before
+  it constructs `AgentResult`, so a malformed runner result fails loudly rather than producing a
+  half-empty `AgentResult`.
+
+The validator is **not** a behavior change to any resolved input (tools, gateway, MCP,
+connections are validated for shape only, never re-resolved).
+
+## 6. The `/run` split decision
+
+The user agrees `/run` does too much. `/run` today conflates: (a) a one-shot turn, (b) a
+streaming turn (same route, switched by `Accept`), and (c) there is no separate way to ask "what
+can this runner do" except the unconsumed `/health`. Evaluated splits:
+
+### Keep as one endpoint: single-turn vs streaming
+
+**Do NOT split** one-shot and streaming into two endpoints. They share the identical
+`AgentRunRequest` and return the identical `AgentRunResult` (the streaming terminal `result`
+record is the same object with `events` emptied). The only difference is the `Accept` header
+selecting the framing. The `runner-interface` RFC §6 calls this the "symmetry guarantee", and
+both Python transports already parse both with the same `result_from_wire`. Splitting would
+duplicate the request schema and the dispatch for no contract benefit. Content negotiation
+(`Accept: application/x-ndjson`) is the right axis and is already in place. **Verdict: keep.**
+
+### Split out: a capability / contract probe
+
+**DO formalize the probe.** `/health` already returns `{status, runner, protocol, engines,
+harnesses}` but nothing consumes it, and `HarnessCapabilities` (per-harness, 11 flags) is only
+discoverable by doing a full run. Recommendation:
+
+- Keep `GET /health` as the cheap liveness + identity + **contract version** probe (it already
+  carries `protocol`). This is what the A1 version check consumes (Section 7).
+- Add `GET /capabilities` (or `GET /capabilities?harness=pi`) that returns the static **base**
+  `HarnessCapabilities` per harness **without running a turn**. Today capabilities are probed
+  per-run and returned in the result; a static probe lets the service/playground render UI and
+  pre-validate a request (e.g. reject `images` for a harness that lacks `fileAttachments`) before
+  spending a run. The probe must state base-vs-effective explicitly: some flags are
+  mode-dependent (`streamingDeltas` is derived at run time in `engines/sandbox_agent.ts`), so the
+  static probe returns **base** capabilities and the run result stays authoritative for
+  mode-dependent flags. This is additive, not a split of `/run`'s job.
+
+**Verdict: keep `/run` unified for the turn; promote a `/capabilities` probe and actually consume
+`/health`.** This removes work from the run path (capability discovery) without fragmenting the
+turn contract.
+
+### Considered and rejected
+
+- A separate `/cancel` endpoint: rejected. Cancellation is correctly modeled as transport
+  teardown (close the NDJSON connection / kill the subprocess), already wired for
+  `runSandboxAgent` over HTTP. A `/cancel` would need session affinity the cold runtime does not
+  have. The A10 change adds a *cancelled outcome* (Section 7), not a cancel endpoint.
+- A separate tool-callback or MCP endpoint on the runner: out of scope and unchanged — those are
+  the runner *calling out* (`/tools/call`) and the gateway/MCP surfaces, which this work does not
+  touch.
+
+## 7. Folding in the sibling projects (A1, A3, A10)
+
+This project assumes and coordinates with three parallel efforts. The schema is where they meet.
+
+### A3 — backend removal + harness rename (assumed end state)
+
+A3 removes the legacy in-process backend and the `backend` field, and renames harness values
+`pi -> pi_core` and `agenta -> pi_agenta`. The schema must land **after** or **with** A3, or it
+will pin the wrong enum. Concretely in the schema:
+
+- **Drop `backend`** from `AgentRunRequest` (the runner no longer dispatches on an engine id; one
+  engine path remains). Remove it from the key lists and goldens.
+- **`harness` enum becomes** `pi_core` | `pi_agenta` | `claude` (was `pi` | `agenta` | `claude`).
+  Update `version.ts` `HARNESSES`, the goldens, and the schema enum together.
+- Because A3 removes a field and changes an enum, it is a **breaking** contract change. It shares
+  **one** `PROTOCOL_VERSION` bump with the A10 error-model change (the single v2 cut in step 8) —
+  it does NOT get its own separate "also v2" bump (a second breaking change after a bump is not
+  incremental). The migration introduces the schema *first* at v1 (behavior-preserving, steps
+  1-6), then makes A3 + A10 the single v2 cut, so the schema work is not blocked on A3 landing. If
+  the team would rather decouple them, A3 becomes a distinct **v3** cut — but the two breaking
+  changes must not collapse into one ambiguous "v2" label.
+
+### A1 — versioning (coordinate: the schema carries/enables a contract version)
+
+A1 is the sibling project [`../contract-versioning/`](../contract-versioning/) (it owns the
+versioning strategy for the cross-service contracts). It independently found the same gap this
+project relies on: the runner advertises `protocol: 1` on `/health` (`version.ts`) but the Python
+client (`ts_runner.py`) never reads it — no negotiation, no skew guard. This project makes the
+schema **carry** the contract version so skew is detectable in-band, not only via `/health`:
+
+- Add an optional `contractVersion` (int major) to `AgentRunRequest` and `AgentRunResult`. The
+  service stamps the major it built for; the runner can reject a major it does not support with a
+  structured `{ code:"unsupported_contract_version", ... }` error (see A10) instead of silently
+  mis-parsing.
+- Consume the version on **both transports**, not only `/health` (the backend chooses HTTP or
+  subprocess in `adapters/sandbox_agent.py`, and the CLI has no health mode): the HTTP path probes
+  `GET /health.protocol`; the subprocess/CLI path needs a `--version`/`--protocol` mode (or reads
+  the in-band `contractVersion` echoed on the result). Either way the client refuses a runner
+  whose major it does not understand, closing the §12 "version skew guard is not consumed" gap.
+  The schema artifact itself is versioned by the same major.
+- Exact ownership of the bump mechanics stays with A1; this project provides the in-payload field
+  and the probe consumption. The two must agree on the major-number semantics (single int major,
+  surfaced on `/health` as `protocol` and in-band as `contractVersion`).
+
+### A10 — error model cleanup (in scope here)
+
+Replace `AgentRunResult.error?: string` with a structured error and add a distinct cancelled
+outcome:
+
+```jsonc
+// AgentRunResult, error branch
+{
+  "ok": false,
+  "error": {
+    "code": "model_error",            // taxonomy, see below
+    "message": "model exploded",      // human-readable, what today's string held
+    "retryable": false                // does a naive retry have a chance?
+  }
+}
+```
+
+- **Error taxonomy (`code`)**, a closed-ish enum the runner sets and the service can branch on:
+  `invalid_request` (schema validation failed), `unsupported_contract_version`,
+  `unsupported_harness`, `auth_error`, `quota_exceeded`, `rate_limited`, `configuration_error`,
+  `permission_denied`, `model_error`, `tool_error`, `mcp_error`, `sandbox_error`, `timeout`,
+  `cancelled`, `internal`. The `auth_error` / `quota_exceeded` / `rate_limited` codes are not
+  speculative: the runner already pattern-classifies these from provider error text in
+  `services/agent/src/engines/sandbox_agent/errors.ts` — the schema just gives that classification
+  a stable wire code. Keep the enum forward-compatible (an unknown code -> treat as `internal`),
+  mirroring the event "drop unknown" tolerance.
+- **`retryable`** lets the caller distinguish a transient `timeout` / `rate_limited` / `mcp_error`
+  from a permanent `invalid_request` / `unsupported_harness` / `auth_error`.
+- **Distinct cancelled outcome — but only where it is actually deliverable.** A user/client abort
+  is **not** a failure. The subtlety (a real review catch): a *client disconnect* mid-stream
+  cannot reliably receive a terminal record, because the disconnect is exactly what tears the
+  transport down — `server.ts` aborts the run *on* response `close`, and the Python streaming
+  transports treat a stream with no terminal `result` as an error (`ts_runner.py`). So:
+  - **Cooperative cancellation while the transport is still open** (e.g. an in-band stop signal,
+    or a future `/cancel`-style affordance): emit the terminal `{ ok:false, error:{code:
+    "cancelled"} }` record — the §8b "exactly one terminal result" invariant holds and the result
+    stays authoritative. Set `retryable:false` (or omit it) — a cancel is intentional, not a
+    transient fault.
+  - **Transport teardown (the disconnect case we have today)**: the terminal record cannot be
+    delivered; the Python side must map "generator cancelled / connection closed by us" to a
+    distinct **`CancelledError`-style outcome**, NOT the generic "stream ended without a terminal
+    result" `RuntimeError`. This is a Python-side parsing/exception change, not a wire record.
+  - Optionally also emit a `done` event with `stopReason:"cancelled"` for streams (useful as a
+    live signal), but the terminal result remains authoritative when the connection is alive.
+- **Migration:** `result_from_wire` must accept **both** the old free-string `error` and the new
+  structured object during the transition (parse a string into `{code:"internal", message:str,
+  retryable:false}`), so an old runner against a new service still parses. The structured form is
+  the v2 shape; the string form is read-compat only.
+
+This is a contract change (new error shape) and folds into the same v2 bump as A3.
+
+## 8. Incremental, test-at-each-step migration plan
+
+No big-bang. Each step is a small change plus the test that proves it. Steps 1-6 are
+**behavior-preserving at contract v1** (the schema must validate the *current* goldens), so they
+can land before A1/A3/A10. Steps 7-10 are the versioned (v2) changes that depend on the siblings.
+
+The sequence respects the shared-surface rule (`agent-coordination.md`): any change to
+`protocol.ts` / `wire.py` / golden / the two contract tests is coordinated, single-PR, both
+sides + golden together.
+
+1. **Add the Pydantic request/result models (no wire change).**
+   Add `AgentRunRequest` / `AgentRunResult` Pydantic models in the SDK that compose the existing
+   `dtos.py` sub-models with camelCase aliases, reproducing exactly what `request_to_wire` /
+   `result_from_wire` emit/parse today.
+   *Test:* a new unit test asserts `AgentRunRequest(...).model_dump(by_alias=True, exclude_none-ish)
+   == request_to_wire(...)` for the Pi, Claude, and Agenta payloads (round-trip parity with the
+   hand-built dicts and the goldens). Green before touching anything else.
+
+2. **Export the JSON Schema artifact + a freshness test.**
+   Add a script that writes `services/agent/contract/run-contract.schema.json` from
+   `model_json_schema()`. Commit the artifact.
+   *Test:* a CI test regenerates the schema in-memory and asserts it equals the committed file
+   (drift -> fail), mirroring how the goldens are regenerated deliberately.
+
+3. **Assert the existing goldens validate against the schema (Python side).**
+   *Test:* load each of the four goldens, validate against `run-contract.schema.json`
+   (`jsonschema`); all must pass. This proves the schema faithfully describes today's wire before
+   anything consumes it. No production code path changes yet.
+
+4. **Add ajv validation in the runner, behind a permissive mode (TS side).**
+   Add ajv (one dependency) and load `run-contract.schema.json`. Validate the `/run` body in
+   `server.ts`/`cli.ts`; in this step, on failure **log and continue** (permissive) so nothing
+   breaks, and assert the goldens validate.
+   *Test:* `wire-contract.test.ts` validates the goldens through ajv (must pass) and validates a
+   deliberately-malformed payload (must report the right error path) — without yet rejecting it.
+
+5. **Flip the runner to reject malformed requests (the boundary guard) — at v1 error shape.**
+   Change permissive -> reject: a present-but-malformed body is rejected with a `400`. **Ordering
+   subtlety (a real review catch):** at this point the result `error` is still the v1 free string
+   (`protocol.ts` `error?: string`), so this step must NOT emit the structured `{error:{code:...}}`
+   shape yet — emitting structured errors before the v2 error model lands would itself be a
+   contract change. So the v1 rejection returns a **string** error
+   (`{ok:false, error:"invalid_request: sandboxPermission.network.mode: must be one of ..."}`),
+   carrying the code as a stable prefix. The structured object replaces it in step 8. An empty
+   body still parses to a valid default request. Keep `additionalProperties` permissive (unknown
+   top-level keys logged, not rejected) for cross-version safety.
+   *Test:* `server.test.ts` asserts a malformed `/run` -> `400` with the `invalid_request:` prefix;
+   an empty body -> still runs; the goldens -> still accepted. CLI test mirrors via exit code.
+
+6. **Replace the duplicated key lists with schema-derived guards (+ a deeper-than-keys check).**
+   Swap the two hand-kept `KNOWN_REQUEST_KEYS` for: (Python) derive the allowed key set from the
+   schema's `properties`; (TS) drive the guard list from the schema property names so it cannot
+   silently fall behind. **But a key guard alone is not enough** (review catch): it misses
+   *nullability* drift — the Claude golden sends `sessionId: null` / `trace: null` while
+   `protocol.ts` types them `?: string`. Add a schema-derived check that compares each field's
+   nullability/optionality against the TS types (or, if sub-option b2 was chosen, generate
+   `protocol.ts` from the schema so the check is structural).
+   *Test:* `set(schema.properties) == set(KNOWN_REQUEST_KEYS)` on both sides, plus a nullability
+   assertion (`sessionId`/`trace` accept `null`). **End of the behavior-preserving phase: the
+   hand-maintained key lists are gone and the runtime guard is the schema. The TS *types* are
+   either generated (b2) or guarded deeper than keys (b1); the omit-when-empty serializer behavior
+   is still asserted by the goldens, not by the schema.**
+
+   --- the steps below are the v2 contract changes; they depend on the siblings ---
+
+7. **Add `contractVersion` in-band + consume the version on BOTH transports (with A1).**
+   Add optional `contractVersion` (additive, still v1-compatible) to the request/result schema;
+   service stamps it. Consume it on **both** transports, not only `/health` (review catch: the
+   backend picks HTTP *or* subprocess in `adapters/sandbox_agent.py`, and the CLI has no health
+   mode): the HTTP path probes `GET /health.protocol` once; the **subprocess/CLI path** gets a
+   `--version`/`--protocol` mode (or reads the in-band `contractVersion` echoed on the first
+   result) so a skewed CLI runner is also refused.
+   *Test:* a transport test stubs an incompatible `/health` major and asserts the HTTP adapter
+   refuses before `/run`; a CLI test asserts the subprocess version probe refuses an incompatible
+   major; a schema test asserts `contractVersion` round-trips.
+
+8. **The v2 cut: structured error model + cancelled outcome (A10) AND the A3 removal, together.**
+   These are **two breaking changes**, so they ship as **one `PROTOCOL_VERSION = 2` cut**, not as
+   two steps each calling itself "v2" (review catch: a second breaking change after a bump is not
+   incremental — bump once, change once). In this single cut:
+   - Result `error` becomes `{code, message, retryable}`; `result_from_wire` reads both the new
+     object and the old v1 string (string -> `{code:"internal", message:str}`) for read-compat
+     against an older runner. The step-5 `invalid_request:`-prefixed string rejection becomes the
+     structured object.
+   - Cancellation: cooperative cancel emits the terminal `{ok:false, error:{code:"cancelled"}}`;
+     transport-teardown cancel maps to a distinct Python `CancelledError` outcome (per §7 A10).
+   - A3: drop `backend` from the schema/models/goldens/guards; rename the `harness` enum to
+     `pi_core | pi_agenta | claude`; update `version.ts` `HARNESSES`.
+   - **Sequencing within the cut is itself incremental and test-guarded:** land read-compat error
+     parsing first (no behavior change), then flip the emitted shape, then the A3 removal — each
+     with its test green — but they release as one v2 because they share the breaking bump.
+   *Test:* `test_wire_contract.py` parses an old-string-error golden and a new-structured golden;
+   a runner test asserts cooperative cancel yields the terminal `cancelled` record and a disconnect
+   yields the Python `CancelledError`; regenerated Pi/Claude goldens (no `backend`, renamed
+   harness) assert the new shapes; a test asserts the schema rejects `backend` and the old
+   `pi`/`agenta` harness values. New goldens: `run_result.cancelled.json`,
+   `run_result.error_structured.json`. (If the team prefers to decouple, A3 can instead be a
+   separate **v3** cut — but it must not share v2's bump.)
+
+9. **Promote the capability probe: `GET /capabilities` (additive, can land any time).**
+   Add the static per-harness `HarnessCapabilities` route to the runner. Define explicitly whether
+   it returns **base** capabilities (what the harness supports at all) or **effective** ones (for a
+   given request/mode) — `streamingDeltas` is mode-dependent today (review catch:
+   `engines/sandbox_agent.ts` derives it at run time). Recommendation: return **base** capabilities
+   from this static probe and document that mode-dependent flags are only authoritative in a run
+   result. Optionally have the service pre-validate a request against the base capabilities.
+   *Test:* `server.test.ts` asserts `GET /capabilities` returns the schema-valid base capability
+   map per harness without running a turn; a schema test asserts the capability map validates.
+
+After these steps: one schema source (dedicated Pydantic wire models) -> one exported artifact ->
+runtime validation on both sides -> structured errors + a correctly-modeled cancelled outcome ->
+in-band + both-transport contract version -> a real capability probe, with a test at every step
+and no point where both sides are broken at once.
+
+## 9. Risks and mitigations
+
+- **Drift between `protocol.ts` types and the schema.** Mitigated by step 6's schema-derived guard
+  (keys **and** nullability, or generated TS types) + step-4 ajv validation of the goldens; a drift
+  fails `tsc` or a test.
+- **An older runner against a newer service (or vice versa).** Mitigated by permissive
+  `additionalProperties`, the read-compat error parsing (step 8), and the both-transport major
+  probe (step 7).
+- **The committed schema artifact going stale.** Mitigated by step 2's freshness test (regenerate
+  == committed), the same discipline the goldens already use.
+- **Sequencing against A1/A3/A10.** Steps 1-6 are independent and land first at v1; A10 + A3 are
+  the **single** v2 cut (step 8); the version probe (7) and capability probe (9) are additive.
+- **Standalone-package constraint.** Only one runner *runtime* dependency added (ajv); no runtime
+  build step; the schema is loaded as data, honoring `services/agent/CLAUDE.md`. (If TS types are
+  generated — sub-option b2 — that is a committed artifact produced by a script, not a build step.)
+
+## 10. Open questions for review
+
+1. **Wire models vs semantic DTOs, and where they live.** Confirmed direction: a **dedicated set
+   of wire models** (not the snake_case semantic DTOs in `dtos.py`). Open: place them in a new
+   `agents/wire_models.py` next to `dtos.py` (proposed) vs a dedicated contract package. Export the
+   artifact into `services/agent/contract/`.
+2. **TS types: hand-written + deep guard (b1) vs generated from the schema (b2).** Proposal: **b2
+   (generate `protocol.ts`)** — a key guard misses nullability drift (`sessionId`/`trace` are
+   `null` on the wire but `?:` in TS). If b1, the guard must compare nullability, not just keys.
+3. **`additionalProperties` end state.** Stay permissive forever (log unknowns) or tighten to
+   reject after the version probe is in place? Proposal: permissive for top-level request fields,
+   strict for nested objects where a typo is more likely a bug than a version skew.
+4. **Cancelled modeling.** Cooperative cancel -> terminal `error.code:"cancelled"`; transport
+   teardown -> distinct Python `CancelledError` (proposed). Optionally also a `done`
+   `stopReason:"cancelled"`. Needs A10 sign-off. `retryable` for cancel: `false`/omit.
+5. **Version-cut grouping.** A10 + A3 as one v2 cut (proposed) vs A3 as a separate v3. A1 owns the
+   final call on bump grouping.
+6. **`contractVersion` granularity + transport coverage.** Single int major (proposed, matches
+   `/health.protocol`) vs semver; and the subprocess/CLI version probe shape (`--protocol` flag vs
+   in-band echo). A1 owns the granularity.
+7. **Capability probe shape + base-vs-effective.** Return all harnesses (proposed) vs `?harness=`;
+   and the probe returns **base** capabilities (proposed), with mode-dependent flags
+   (`streamingDeltas`) authoritative only in a run result.
+
+## 11. Review
+
+This plan was reviewed by Codex (gpt-5.5, xhigh, read-only) on 2026-06-24. It verified the claims
+against the code and the verdict ("Option B is the right direction, but ...") drove six concrete
+corrections now folded in: (1) source from dedicated **wire** models, not the snake_case semantic
+DTOs; (2) a key guard misses **nullability** drift — generate TS types or test deeper; (3) A10 and
+A3 are **two** breaking changes -> one v2 cut (or A3 as v3), not two "v2" steps; (4) step 5 cannot
+emit the structured error shape while v1 still types `error` as a string; (5) the version probe
+must cover the **subprocess/CLI** transport, not only `/health`; (6) cancellation via a terminal
+record only works for **cooperative** cancel — a disconnect maps to a Python `CancelledError`.
+Also expanded the error taxonomy (`auth_error`/`quota_exceeded`/`rate_limited`/
+`configuration_error`/`permission_denied`, which `engines/sandbox_agent/errors.ts` already
+classifies) and added the base-vs-effective capability distinction.
diff --git a/docs/design/agent-workflows/projects/wire-contract-schema/status.md b/docs/design/agent-workflows/projects/wire-contract-schema/status.md
new file mode 100644
index 0000000000..4984a57b84
--- /dev/null
+++ b/docs/design/agent-workflows/projects/wire-contract-schema/status.md
@@ -0,0 +1,81 @@
+# Status: wire-contract-schema
+
+| | |
+| --- | --- |
+| **Phase** | Plan written + Codex-reviewed (corrections folded in). Awaiting human review. No code. |
+| **Owner** | wire-contract-schema (this session is A2 in the A1/A2/A3/A10 cohort) |
+| **Lane** | dedicated GitButler lane `feat/agent-wire-contract-schema-plan`, single commit (docs only) |
+| **Created** | 2026-06-24 |
+
+## What exists
+
+- `README.md` — the full plan: current-state assessment, the three source-of-truth options with
+  the Option B recommendation, the `/run` split decision, the A10 structured-error + cancelled
+  change, the A1 in-band contract-version coordination, a 9-step test-at-each-step migration, and a
+  Review section (§11) recording the Codex pass.
+
+## Decisions made in the plan
+
+1. **Schema source = dedicated Pydantic *wire* models (Option B), NOT the semantic DTOs.** Codex's
+   sharpest catch: the real contract lives in the hand serializers (`wire.py`, `*.to_wire`), and
+   the `dtos.py` classes are snake_case with a loose `AgentEvent` — exporting them would give the
+   wrong schema. So author `WireRunRequest` / `WireRunResult` / a discriminated `WireAgentEvent`
+   with camelCase aliases, export JSON Schema via `model_json_schema()`, validate in the runner
+   with ajv (one dep, no build step). JSON-Schema-as-source (A) and a shared IDL (C) both pull
+   codegen toolchains into a repo that has none and a package that forbids a build step.
+2. **TS types are best generated from the schema (b2).** A top-level key guard misses *nullability*
+   drift (`sessionId`/`trace` are `null` on the wire, `?:` in `protocol.ts`). Generate `protocol.ts`
+   as a committed artifact, or add nullability-deep type tests. Open question for review.
+3. **`/run` stays unified for the turn.** One-shot vs streaming is content negotiation (`Accept`),
+   not two endpoints — they share the request/result shapes. Promoted instead: a `GET /capabilities`
+   probe (static **base** per-harness capabilities, no run) and consuming the contract version on
+   **both** transports. Rejected: a `/cancel` endpoint.
+4. **Error model `{ code, message, retryable }`** with an expanded taxonomy (incl. `auth_error` /
+   `quota_exceeded` / `rate_limited` / `permission_denied`, which `errors.ts` already classifies)
+   and a cancelled outcome that is modeled correctly: terminal record for **cooperative** cancel,
+   a Python `CancelledError` for **transport-teardown** cancel (a disconnect cannot receive a
+   terminal record).
+5. **Behavior-preserving first.** Steps 1-6 introduce the schema, artifact, validation, and
+   boundary guard at contract **v1**. A10 + A3 are the **single** v2 cut (step 8), not two "v2"
+   steps. The version probe (7) and capability probe (9) are additive.
+
+## Codex review (2026-06-24)
+
+gpt-5.5, xhigh, read-only. Verdict: "Option B is the right direction, but the plan was too
+optimistic as written." Six corrections folded in: wire-models-not-DTOs, nullability drift,
+two-breaking-changes-one-cut, step-5 error-shape ordering, both-transport version probe,
+cooperative-vs-teardown cancellation. Plus taxonomy expansion and base-vs-effective capabilities.
+Full critique in README §11. Log at `/tmp/codex-out-wire-schema.log` (transient).
+
+## Coordination
+
+- **A1 (`contract-versioning`)** — sibling at `../contract-versioning/`; it found the same
+  unconsumed-`/health.protocol` gap. A1 owns version-number semantics + bump policy; this project
+  provides the in-band `contractVersion` field and the both-transport probe (step 7).
+- **A3 (backend removal + harness rename)** — assumed end state; schema drops `backend` and renames
+  the `harness` enum to `pi_core | pi_agenta | claude` as part of the v2 cut (step 8). Steps 1-6
+  are at v1, so the schema work does not block on A3.
+- **A10 (error model)** — folded into step 8.
+- **`sidecar-trust-and-sandbox-enforcement`** flagged a stale comment at `protocol.ts:149-150`
+  (sandbox enforcement is now real on Daytona). Noted; the v2 `SandboxPermission` schema work
+  should pick up the corrected wording.
+- **DOCS-ONLY.** No edit to `protocol.ts` / `wire.py` / golden / contract tests. Composio, the tool
+  gateway, connections, and MCP are described as existing and **unchanged**.
+
+## Next actions (after review)
+
+- Get sign-off on README §10 open questions (wire-model placement, b1-vs-b2 TS types,
+  `additionalProperties`, cancelled modeling, version-cut grouping, `contractVersion` granularity,
+  capability probe shape).
+- Confirm the v2-cut grouping (A10 + A3 together vs A3 as v3) with A1/A3 owners.
+- Then implement step 1 (dedicated wire models with round-trip parity tests against the goldens).
+
+## Open questions (see README §10 for full text)
+
+1. Wire-model placement: `agents/wire_models.py` (proposed) vs a contract package.
+2. TS types: generated from schema (b2, proposed) vs hand-written + deep guard (b1).
+3. `additionalProperties`: permissive top-level + strict nested (proposed).
+4. Cancelled modeling: cooperative terminal record + teardown `CancelledError` (proposed).
+5. Version-cut grouping: A10 + A3 one v2 (proposed) vs A3 as v3 — A1's call.
+6. `contractVersion` granularity + subprocess probe shape — A1's call.
+7. Capability probe: all harnesses + base capabilities (proposed).

From 80d271b11cae8486719d409fba12c70228b3ac64 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Wed, 24 Jun 2026 21:46:52 +0200
Subject: [PATCH 2/6] docs(agent): revise /run schema plan per author POC
 review

Address the four inline review comments on PR #4830:
- Drop all backward-compat-preservation framing (this is a pre-production
  POC; any wire shape may change freely).
- Pydantic stays the source for now; the exported JSON Schema interface
  ships in the SDK (the CATALOG_TYPES path), with a Fern investigation:
  Fern reads Pydantic->OpenAPI->clients, so it can see this interface
  later via the OpenAPI surface once the contract stabilizes (no hard
  blocker, only a timing call). Drop using the schema in the
  sidecar/runner for now.
- Remove all runner-side request validation (server.ts/cli.ts); no
  ingress validation, no ajv, no new runner dependency this phase.
- Keep the /capabilities probe (author endorsed it).

Also drop the versioning machinery (the pi/agenta rename, already landed,
is not versioned; any version field defers to A1's simple string + if/else
convention). Keep the structured error model + cancelled outcome and the
keep-/run-unified decision. Update status.md to match.

Claude-Session: https://claude.ai/code/session_01GYo3UEfvsZpncagqb28Mbc
---
 .../projects/wire-contract-schema/README.md   | 565 +++++++++---------
 .../projects/wire-contract-schema/status.md   | 146 +++--
 2 files changed, 376 insertions(+), 335 deletions(-)

diff --git a/docs/design/agent-workflows/projects/wire-contract-schema/README.md b/docs/design/agent-workflows/projects/wire-contract-schema/README.md
index 4b0c98ef04..262ed168eb 100644
--- a/docs/design/agent-workflows/projects/wire-contract-schema/README.md
+++ b/docs/design/agent-workflows/projects/wire-contract-schema/README.md
@@ -2,9 +2,9 @@
 
 | | |
 | --- | --- |
-| **Status** | Plan. Not started. Awaiting review before any code. |
-| **Type** | Engineering project (a sequenced, test-driven migration), not a one-shot change. |
-| **Scope** | Replace the hand-mirrored `/run` wire contract with a single schema source; add boundary validation; evaluate splitting `/run`; fold in a structured error model and a carried contract version. |
+| **Status** | Plan. Revised per author PR review on #4830 (2026-06-24). Pre-production POC — any wire shape may change freely; no back-compat burden. |
+| **Type** | Engineering project (a sequenced, test-driven change), not a one-shot change. |
+| **Scope** | Replace the hand-mirrored `/run` wire contract with a single schema source (Pydantic for now); **ship the exported JSON interface in the SDK** and investigate whether Fern can see it; fold in a structured error model and a carried contract version. **No sidecar/runner validation yet** — the contract is still brittle. |
 | **Owner files (today)** | `services/agent/src/protocol.ts` (TS types), `sdks/python/agenta/sdk/agents/utils/wire.py` (Python mirror), `sdks/python/oss/tests/pytest/unit/agents/golden/` (fixtures), `sdks/python/oss/tests/pytest/unit/agents/test_wire_contract.py` + `services/agent/tests/unit/wire-contract.test.ts` (the two contract tests). |
 | **Reference** | The deep spec of the contract as built: [`../runner-interface/README.md`](../runner-interface/README.md). Its Section 12 ("Known gaps") names the exact gaps this project closes. The inventory page: [`../../interfaces/cross-service/service-to-agent-runner.md`](../../interfaces/cross-service/service-to-agent-runner.md). |
 | **Mirroring rule today** | `services/agent/CLAUDE.md` ("The wire contract is mirrored — change both sides"). |
@@ -39,7 +39,8 @@ This is brittle for concrete, observed reasons:
    misspelled field is silently ignored, not rejected. The contract is *implicitly all-optional*
    (every TS field is `?`, every Python field defaults). A typo like `sandboxPermision` is
    dropped on the floor with no error. This is `runner-interface/README.md` §12 gap
-   "No schema validation on the runner".
+   "No schema validation on the runner". (Observed gap — but **not** fixed in this POC phase; a
+   boundary guard is a deferred follow-up, Section 8.)
 3. **The version skew guard is exposed but unconsumed.** `version.ts` exports
    `PROTOCOL_VERSION = 1` and `/health` returns it, but **no Python caller probes `/health`**
    (verified: no reference to `runnerInfo`/`PROTOCOL_VERSION`/`/health` in the runner-calling
@@ -51,33 +52,46 @@ This is brittle for concrete, observed reasons:
    a user/client abort surfaces (if at all) as a transport teardown or a generic error, not as a
    first-class result. §12 names neither, but the user has scoped this as the A10 cleanup.
 
-The fix is a **single source of truth** plus **boundary validation**, sequenced so each step is
-a small change with a test that proves it, and folding in the sibling-project changes (A1
-versioning, A3 backend removal + harness rename, A10 error model).
+The fix for now is a **single source of truth** (Pydantic wire models) whose **JSON interface
+ships in the SDK**, plus the A10 error model and a `/capabilities` probe — sequenced so each step
+is a small change with a test that proves it. Boundary validation, generated TS types, and
+versioning are **deferred** (the contract is still brittle; this is a pre-production POC). The
+A3 rename (backend removal + `pi`->`pi_core` / `agenta`->`pi_agenta`) has already landed in the
+working tree, so the wire models describe that current shape from the start.
 
 ## 2. What this project changes vs leaves alone
 
 **In scope:**
 
 - One schema as the source of truth for the `/run` request, result, event union, capabilities,
-  and the sub-objects listed above.
-- Runtime validation of `/run` at the Node boundary (reject malformed input with a clear error).
-- A symmetric validation on the Python parse path (reject a malformed result).
+  and the sub-objects listed above. **Source = Pydantic for now** (Section 4).
+- **The exported JSON Schema interface lives in the SDK** (alongside the existing `CATALOG_TYPES`
+  JSON interfaces), and an investigation of whether Fern can see/generate it across languages
+  (Section 4).
 - A structured error object `{ code, message, retryable }` and a distinct `cancelled` outcome.
 - A contract version carried in the payload (not only on `/health`), and a probe that consumes
   it.
-- A decision on splitting `/run` into more than one endpoint.
-- Replacing the four golden fixtures + two key lists with schema-derived checks (the golden
-  fixtures stay as *examples that must validate*, not as the only guard).
+- A decision on splitting `/run` (verdict: keep `/run` unified; promote a `/capabilities` probe).
+- Replacing the four golden fixtures + two key lists with schema-derived checks **on the Python
+  side** (the golden fixtures stay as *examples that must validate*, not as the only guard).
+
+**Deliberately NOT in scope for now (the contract is still brittle):**
+
+- **No request validation in the runner** (`server.ts` / `cli.ts`). We do not gate `/run` on the
+  schema yet. The runner keeps parsing the body as it does today.
+- **No use of the schema in the sidecar/runner at all.** No ajv, no new runner dependency, no
+  runtime validation step on the Node side. The schema is an SDK-side artifact for now.
+- These are deferred until the contract stabilizes; revisit when we want a hard boundary guard.
 
 **Explicitly unchanged by this work** (called out so reviewers do not expect movement):
 
-- **Composio, the tool gateway, connections, and MCP** all continue to work exactly as today.
-  They are inputs the service already resolves: `customTools` (gateway callback / code / client),
+- **Composio, the tool gateway, connections, and MCP** all continue to work as today. They are
+  inputs the service already resolves: `customTools` (gateway callback / code / client),
   `toolCallback`, `mcpServers`, `connection` / `provider` / `endpoint` / `credentialMode`. This
-  project re-expresses their **shapes** in a schema and validates them; it does not change how
-  any of them resolve, route, or authenticate. The Composio key stays server-side; the gateway
-  callback still POSTs to `/tools/call`; MCP `stdio`/`http` shapes are preserved byte-for-byte.
+  project re-expresses their **shapes** in a schema; it does not change how any of them resolve,
+  route, or authenticate. The Composio key stays server-side; the gateway callback still POSTs to
+  `/tools/call`; the MCP `stdio`/`http` shapes are unchanged. (We may still adjust any of these
+  shapes if the schema work surfaces a better one — this is a POC, not a frozen contract.)
 - The transports (HTTP + subprocess CLI) and the two modes (one-shot JSON + NDJSON streaming).
 - The harness shaping logic (`config.wire_tools()` etc.) — the schema describes the *output*
   of that shaping, it does not move the shaping.
@@ -101,10 +115,11 @@ The contract has four families. This is what a single schema has to cover.
 | policy + files | `permissionPolicy`, `sandboxPermission`, `harnessFiles` (`[{path, content}]`) |
 | tracing | `trace` (`TraceContext`) |
 
-Back-compat splits the schema must preserve: a plain-string `model` keeps `provider` /
-`connection` / `deployment` / `endpoint` / `credentialMode` **off** the wire; `mcpServers`,
-`skills`, `sandboxPermission`, `harnessFiles` are **omitted** (not null) when empty so a
-minimal payload stays byte-identical to the golden.
+Shape notes (the current serializer behavior, **not** a back-compat constraint — this is a
+pre-production POC and any of these may change freely): a plain-string `model` keeps `provider` /
+`connection` / `deployment` / `endpoint` / `credentialMode` off the wire; `mcpServers`, `skills`,
+`sandboxPermission`, `harnessFiles` are omitted (not null) when empty. The schema describes
+whatever shape we settle on; it does not exist to freeze today's bytes.
 
 ### 3.2 Result (`AgentRunResult`)
 
@@ -156,19 +171,19 @@ no `datamodel-code-generator`, and **no zod** anywhere in the runner or web (ver
 ### Option A — JSON Schema as source, codegen both sides
 
 Author the contract as hand-written JSON Schema files; generate TS types
-(`json-schema-to-typescript`) and Pydantic models (`datamodel-code-generator`) from them. Both
-sides validate with the schema at runtime (ajv on Node, `jsonschema`/Pydantic on Python).
+(`json-schema-to-typescript`) and Pydantic models (`datamodel-code-generator`) from them.
 
 - **Pros:** language-neutral source; one artifact; both sides are generated, so neither drifts.
 - **Cons:** introduces **two new codegen toolchains** into a repo that has none for this, and a
   build step into a package that intentionally has none (`services/agent/CLAUDE.md`: "no app
   compile step"). Hand-writing JSON Schema is verbose and error-prone for a union as rich as
   `AgentEvent` + `RenderHint`. The Python SDK already has hand-written BaseModels with custom
-  `to_wire`/`from_wire` (camelCase aliasing, the `model`-string back-compat split, the
-  drop-unknown-event behavior); regenerating them from schema would either lose that behavior or
-  require post-gen patching. High blast radius, fights the existing grain.
+  `to_wire`/`from_wire` (camelCase aliasing, the `model`-string split, the drop-unknown-event
+  behavior); regenerating them from schema would either lose that behavior or require post-gen
+  patching. High blast radius, fights the existing grain. Also: it does not put the interface in
+  the SDK the way the existing `CATALOG_TYPES` Pydantic-derived schemas already are (Section 4.1).
 
-### Option B — Pydantic as source, export JSON Schema, validate on the TS side (RECOMMENDED)
+### Option B — Pydantic as source, export the JSON interface into the SDK (RECOMMENDED)
 
 Make **Python Pydantic models the source of truth** — but a **dedicated set of *wire* models**,
 NOT the existing semantic DTOs. This distinction is load-bearing (it was the sharpest review
@@ -189,29 +204,68 @@ schema (snake_case keys, a non-discriminated event). So:
   reimplemented in terms of them. The omit-when-empty behavior stays as serializer logic + golden
   checks — `model_json_schema()` expresses "optional", not "omit when empty".
 - Pydantic 2's `model_json_schema()` exports the JSON Schema artifact **for free**, no new
-  toolchain. Commit it as `services/agent/contract/run-contract.schema.json`.
-- The TS runner validates incoming `/run` against it with one small validator (ajv, the one new
-  runner dep) — **no codegen, no build step**: the schema is data the runner loads.
+  toolchain. **This exported JSON interface ships in the SDK** — exactly the way the SDK already
+  exposes Pydantic-derived JSON Schemas through `CATALOG_TYPES` (Section 4.1). The immediate goal
+  is that the interface (the JSON) lives in the SDK; the runner does **not** consume it yet
+  (Section 5).
 
 - **Pros:** fits the stack — Pydantic 2 is already the SDK's modeling layer (`pydantic>=2,<3`);
   the producer (Python) is the natural source since it builds the request. Schema export is a
-  built-in, not a new tool. The runner gains real runtime validation with one dependency (ajv)
-  and zero build step, honoring the standalone-package constraint. The omit-when-empty behavior
-  stays in Python where it already lives and is tested. The exported schema becomes a
-  **CI-checked artifact**: a test fails if the committed schema drifts from the wire models.
+  built-in, not a new tool. It puts the interface in the SDK alongside the existing
+  `CATALOG_TYPES` JSON interfaces (one consistent mechanism). The omit-when-empty behavior stays
+  in Python where it already lives and is tested. The exported schema becomes a **CI-checked
+  artifact**: a test fails if the committed schema drifts from the wire models.
 - **Cons:** requires writing dedicated wire models (a real cost, but it is the honest cost of a
-  single source — the alternative is the current double-maintenance). For the TS *types* there are
-  two sub-options: **(b1)** keep hand-written `protocol.ts` + a schema-derived guard, or **(b2)**
-  generate `protocol.ts` from the schema with `json-schema-to-typescript` as a committed artifact
-  (a checked-in generated file run by a script, not a runtime build step). A hand-written
-  `protocol.ts` + a top-level key guard does **not** catch *nullability* drift — the Claude golden
-  sends `sessionId: null` / `trace: null` while `protocol.ts` types them `?: string` (present-or-
-  absent, not nullable). So b1 needs schema-derived type-equivalence tests deeper than keys, and
-  **b2 (generate the types) is the safer end state**. This is an open question (§10).
+  single source — the alternative is the current double-maintenance). The TS `protocol.ts` stays
+  **hand-written for now** — we do *not* generate it from the schema yet, because the runner does
+  not consume the schema yet (Section 5) and the contract is still brittle. Keeping the schema and
+  `protocol.ts` aligned stays a Python-side discipline for the moment (the Python goldens are the
+  guard). Generating `protocol.ts` from the schema is a later option once the contract settles.
+
+#### 4.1 The interface in the SDK, and whether Fern can see it
+
+The author's direction: get this interface (the JSON Schema) **into the SDK** now, and find out
+whether **Fern** can also see/generate it across languages. Findings, with concrete paths:
+
+- **The SDK already exposes Pydantic-derived JSON interfaces.** `CATALOG_TYPES` in
+  `sdks/python/agenta/sdk/utils/types.py` (line ~1265) is a dict of
+  `model_json_schema()` outputs for `Message`, `Messages`, `AgentConfigSchema`,
+  `SkillConfigSchema`, `PromptTemplate`, etc., each dereferenced. The agent workflow surfaces
+  them through `/inspect` via thin `x-ag-type-ref` markers (`services/oss/src/agent/schemas.py`),
+  and the playground resolves them against `GET /workflows/catalog/types/{type}`. **The wire
+  contract should ship the same way:** add the exported `WireRunRequest` / `WireRunResult` JSON
+  Schema next to `CATALOG_TYPES` (or as a sibling export) so the SDK is the single home of the
+  JSON interface. This is the immediate, low-risk goal.
+
+- **How Fern is used here.** Fern in this repo generates the multi-language API clients (Python +
+  TypeScript) under `clients/` and `web/packages/agenta-api-client/`. The pipeline
+  (`clients/scripts/generate.sh`) is: the FastAPI app (Pydantic models) emits **`/api/openapi.json`**
+  → the script writes an ephemeral `fern.config.json` + `generators.yml` and runs the
+  `fernapi/fern-python-sdk` and `fernapi/fern-typescript-sdk` generators against that OpenAPI
+  spec. There is **no `.fern/` API-definition directory checked in** and no Fern IDL; Fern's only
+  input is the generated OpenAPI document. So the chain is **Pydantic → OpenAPI → Fern → SDKs**.
+
+- **Can Fern see this interface? Yes, but only via OpenAPI — with one real caveat.** Fern reads
+  the OpenAPI spec, and that spec is built from the FastAPI/Pydantic models the *public API*
+  exposes. The `/run` contract is the **service ↔ runner spine**, not a public FastAPI endpoint,
+  so it does **not** appear in `openapi.json` today and Fern therefore cannot see it as-is. Two
+  ways to make Fern see it, neither needed for the immediate goal:
+  - **(a) Reference the wire models from a FastAPI surface.** If any endpoint (even an internal or
+    `/inspect`-style descriptor) types a field with the wire Pydantic models, FastAPI emits their
+    JSON Schema into `components/schemas` of `openapi.json`, and Fern then generates them in every
+    client language. This is the same path `AgentConfigSchema` already takes to reach the clients.
+  - **(b) Add a standalone OpenAPI fragment as a second Fern spec.** `generators.yml` takes a list
+    under `api.specs`; a hand-authored fragment that `$ref`s the exported `run-contract.schema.json`
+    could be added. Heavier and not worth it now.
+  - **Blocker / reason not to do it yet:** the contract is still brittle (it changes often as the
+    POC evolves), and putting it on the public OpenAPI surface would publish a moving target into
+    every generated client. So **for now**: export the JSON interface into the SDK (the
+    `CATALOG_TYPES`-style path), keep it out of the public OpenAPI spec, and let Fern pick it up
+    later once it stabilizes. The path is clear and there is no hard blocker — only a timing call.
 
 ### Option C — A shared IDL (`.proto`, Smithy, etc.)
 
-Define the contract in a neutral IDL and generate both sides + a validator.
+Define the contract in a neutral IDL and generate both sides.
 
 - **Pros:** strongest neutrality; mature codegen.
 - **Cons:** the heaviest option for an internal JSON-over-HTTP/stdio boundary. The wire is JSON,
@@ -220,38 +274,44 @@ Define the contract in a neutral IDL and generate both sides + a validator.
   that wants fewer moving parts. The `AgentEvent` open-union + "drop unknown" semantics fit JSON
   Schema's `additionalProperties`/`oneOf` better than proto's closed messages. Overkill.
 
-### Recommendation: Option B (Pydantic-as-source → exported JSON Schema → ajv validation in the runner)
-
-It is the only option that adds **one** runtime dependency (ajv) and **zero** runtime build
-steps, fits the Pydantic 2 stack, keeps the custom serialization semantics where they are tested,
-and turns the schema into a CI-checked artifact that makes drift a failing test instead of a
-code-review discipline. Source of truth = dedicated Pydantic **wire** models (not the semantic
-DTOs); the exported `run-contract.schema.json` is the shared artifact; the TS types are best
-**generated** from it (sub-option b2) so nullability drift cannot hide.
-
-## 5. Runtime validation at the boundary
-
-Today `/run` is implicitly all-optional and silently drops typos. The plan:
-
-- **Runner ingress (request).** In `server.ts` / `cli.ts`, after JSON-parse, validate the body
-  against `run-contract.schema.json` with ajv. On failure, return a structured `400` (HTTP) /
-  exit 1 (CLI) with the validator's error path — e.g. `{ ok:false, error:{ code:"invalid_request",
-  message:"sandboxPermission.network.mode: must be one of on|off|allowlist", retryable:false } }`.
-  Replace today's "empty body -> `{}` runs with defaults" with an explicit allowance: an empty
-  body is still valid (the contract tests rely on it), but a *present but malformed* body is
-  rejected. Decide on `additionalProperties`: start **permissive** (warn/log unknown top-level
-  keys, do not reject) to avoid breaking a newer service against an older runner, and tighten to
-  reject in a later, version-gated step. Unknown **event** types stay tolerated by design.
-- **Python egress (request) — optional symmetric guard.** `request_to_wire` can validate its own
-  output against the same schema in tests (and optionally under a debug flag at runtime), so the
-  producer cannot emit a payload the runner would reject. This is the producer-side half of the
-  guard and is cheap because the schema already exists.
-- **Python ingress (result).** `result_from_wire` gains schema validation of the result before
-  it constructs `AgentResult`, so a malformed runner result fails loudly rather than producing a
-  half-empty `AgentResult`.
-
-The validator is **not** a behavior change to any resolved input (tools, gateway, MCP,
-connections are validated for shape only, never re-resolved).
+### Recommendation: Option B (Pydantic-as-source → exported JSON interface in the SDK)
+
+Use **Pydantic as the source for now**. It fits the Pydantic 2 stack, keeps the custom
+serialization semantics where they are tested, exports the JSON Schema for free, and **puts the
+interface in the SDK** the same way `CATALOG_TYPES` already does — which is exactly the immediate
+goal. Source of truth = dedicated Pydantic **wire** models (not the semantic DTOs); the exported
+schema ships in the SDK as a CI-checked artifact (a test fails if it drifts from the wire models).
+
+Two deliberate constraints from the author's review:
+
+- **No runner/sidecar validation yet.** The runner does not load or validate against the schema;
+  there is no ajv, no new runner dependency, no build step. The contract is still brittle, so we
+  hold off on a hard boundary guard (Section 5).
+- **`protocol.ts` stays hand-written for now.** We do not generate TS types from the schema yet
+  (that only pays off once the runner consumes the schema). The Python goldens remain the guard.
+
+Fern can reach this interface later through the existing **Pydantic → OpenAPI → Fern → SDKs**
+pipeline once the contract stabilizes (Section 4.1); for now the interface lives in the SDK only.
+
+## 5. Validation — deferred (no runtime guard yet)
+
+Author's direction (PR review): **do not validate for the moment.** The contract is still
+brittle, so this project does **not** add a runtime boundary guard on either side yet.
+
+- **No runner ingress validation.** `server.ts` / `cli.ts` keep parsing the `/run` body exactly
+  as today (empty body → defaults, unknown fields ignored). No ajv, no new runner dependency, no
+  schema loaded on the Node side. A present-but-malformed body is still tolerated for now.
+- **No runtime Python validation either.** `request_to_wire` / `result_from_wire` are not gated
+  on the schema at runtime.
+
+What the schema *is* used for in this phase is **Python-side tests only**: the exported schema
+validates the existing goldens (an example-must-validate check) and can validate `request_to_wire`
+output in a unit test, so the schema is proven faithful without changing any production code path.
+That is the full extent of validation for now.
+
+When the contract stabilizes, a real boundary guard (runner ingress validation + a symmetric
+Python result check) is a natural follow-up — see Section 8 / Open questions. Until then it is
+explicitly out of scope.
 
 ## 6. The `/run` split decision
 
@@ -271,13 +331,14 @@ duplicate the request schema and the dispatch for no contract benefit. Content n
 
 ### Split out: a capability / contract probe
 
-**DO formalize the probe.** `/health` already returns `{status, runner, protocol, engines,
-harnesses}` but nothing consumes it, and `HarnessCapabilities` (per-harness, 11 flags) is only
-discoverable by doing a full run. Recommendation:
+**DO formalize the probe — the author endorsed this in review ("that's a good idea with
+capabilities").** `/health` already returns `{status, runner, protocol, engines, harnesses}` but
+nothing consumes it, and `HarnessCapabilities` (per-harness, 11 flags) is only discoverable by
+doing a full run. Recommendation:
 
 - Keep `GET /health` as the cheap liveness + identity + **contract version** probe (it already
   carries `protocol`). This is what the A1 version check consumes (Section 7).
-- Add `GET /capabilities` (or `GET /capabilities?harness=pi`) that returns the static **base**
+- Add `GET /capabilities` (or `GET /capabilities?harness=pi_core`) that returns the static **base**
   `HarnessCapabilities` per harness **without running a turn**. Today capabilities are probed
   per-run and returned in the result; a static probe lets the service/playground render UI and
   pre-validate a request (e.g. reject `images` for a harness that lacks `fileAttachments`) before
@@ -304,45 +365,37 @@ turn contract.
 
 This project assumes and coordinates with three parallel efforts. The schema is where they meet.
 
-### A3 — backend removal + harness rename (assumed end state)
+### A3 — backend removal + harness rename (already landed in the working tree)
+
+A3 removed the legacy in-process backend and the `backend` field, and renamed harness values
+`pi -> pi_core` and `agenta -> pi_agenta`. This is **no longer "assumed end state"** — it is
+already in the working tree (`version.ts` now declares `HARNESSES = ["pi_core","claude",
+"pi_agenta"]`; the pi golden is renamed `run_request.pi_core.json`; `engines/pi.ts` is deleted).
+So the schema simply describes that current shape:
 
-A3 removes the legacy in-process backend and the `backend` field, and renames harness values
-`pi -> pi_core` and `agenta -> pi_agenta`. The schema must land **after** or **with** A3, or it
-will pin the wrong enum. Concretely in the schema:
+- No `backend` field.
+- `harness` is `pi_core` | `pi_agenta` | `claude`.
 
-- **Drop `backend`** from `AgentRunRequest` (the runner no longer dispatches on an engine id; one
-  engine path remains). Remove it from the key lists and goldens.
-- **`harness` enum becomes** `pi_core` | `pi_agenta` | `claude` (was `pi` | `agenta` | `claude`).
-  Update `version.ts` `HARNESSES`, the goldens, and the schema enum together.
-- Because A3 removes a field and changes an enum, it is a **breaking** contract change. It shares
-  **one** `PROTOCOL_VERSION` bump with the A10 error-model change (the single v2 cut in step 8) —
-  it does NOT get its own separate "also v2" bump (a second breaking change after a bump is not
-  incremental). The migration introduces the schema *first* at v1 (behavior-preserving, steps
-  1-6), then makes A3 + A10 the single v2 cut, so the schema work is not blocked on A3 landing. If
-  the team would rather decouple them, A3 becomes a distinct **v3** cut — but the two breaking
-  changes must not collapse into one ambiguous "v2" label.
+Because this is a **pre-production POC, we do NOT version the pi/agenta rename.** There is no v1→v2
+cut for it, no downcaster, no `PROTOCOL_VERSION` bump tied to the rename — the wire just changes.
+The wire models are authored against today's renamed shape from the start.
 
-### A1 — versioning (coordinate: the schema carries/enables a contract version)
+### A1 — versioning (coordinate: a simple string version, the LLM-as-judge style)
 
 A1 is the sibling project [`../contract-versioning/`](../contract-versioning/) (it owns the
-versioning strategy for the cross-service contracts). It independently found the same gap this
-project relies on: the runner advertises `protocol: 1` on `/health` (`version.ts`) but the Python
-client (`ts_runner.py`) never reads it — no negotiation, no skew guard. This project makes the
-schema **carry** the contract version so skew is detectable in-band, not only via `/health`:
-
-- Add an optional `contractVersion` (int major) to `AgentRunRequest` and `AgentRunResult`. The
-  service stamps the major it built for; the runner can reject a major it does not support with a
-  structured `{ code:"unsupported_contract_version", ... }` error (see A10) instead of silently
-  mis-parsing.
-- Consume the version on **both transports**, not only `/health` (the backend chooses HTTP or
-  subprocess in `adapters/sandbox_agent.py`, and the CLI has no health mode): the HTTP path probes
-  `GET /health.protocol`; the subprocess/CLI path needs a `--version`/`--protocol` mode (or reads
-  the in-band `contractVersion` echoed on the result). Either way the client refuses a runner
-  whose major it does not understand, closing the §12 "version skew guard is not consumed" gap.
-  The schema artifact itself is versioned by the same major.
-- Exact ownership of the bump mechanics stays with A1; this project provides the in-payload field
-  and the probe consumption. The two must agree on the major-number semantics (single int major,
-  surfaced on `/health` as `protocol` and in-band as `contractVersion`).
+versioning strategy). Per the author's review, A1 is being simplified to **a plain string version
+plus an if/else branch — the same pattern the codebase already uses elsewhere** (the
+`x-ag-messages-version: "v1"` header and `VERCEL_MESSAGE_PROTOCOL_VERSION` string; the LLM-as-judge
+string-version + if/else dispatch). **No `{major, minor}` struct, no `contractVersion` field name,
+no upcaster/downcaster machinery.** This project defers to whatever simple string convention A1
+lands on and reuses it verbatim (do NOT invent a new scheme).
+
+It is still true that the runner advertises `protocol: 1` on `/health` (`version.ts`) but the
+Python client (`ts_runner.py`) never reads it. If A1 wants the version carried on the payload, it
+rides as the same simple string A1 chooses, stamped by the producer and branched on with a plain
+if/else on the consumer. Skew handling and any negotiation are A1's call; this project only agrees
+to carry the field A1 specifies in the wire models. Given the POC framing, even this is optional
+for now.
 
 ### A10 — error model cleanup (in scope here)
 
@@ -362,16 +415,17 @@ outcome:
 ```
 
 - **Error taxonomy (`code`)**, a closed-ish enum the runner sets and the service can branch on:
-  `invalid_request` (schema validation failed), `unsupported_contract_version`,
   `unsupported_harness`, `auth_error`, `quota_exceeded`, `rate_limited`, `configuration_error`,
   `permission_denied`, `model_error`, `tool_error`, `mcp_error`, `sandbox_error`, `timeout`,
   `cancelled`, `internal`. The `auth_error` / `quota_exceeded` / `rate_limited` codes are not
   speculative: the runner already pattern-classifies these from provider error text in
   `services/agent/src/engines/sandbox_agent/errors.ts` — the schema just gives that classification
   a stable wire code. Keep the enum forward-compatible (an unknown code -> treat as `internal`),
-  mirroring the event "drop unknown" tolerance.
+  mirroring the event "drop unknown" tolerance. (No `invalid_request` /
+  `unsupported_contract_version` codes for now — we are not validating requests or enforcing a
+  version at the boundary in this phase.)
 - **`retryable`** lets the caller distinguish a transient `timeout` / `rate_limited` / `mcp_error`
-  from a permanent `invalid_request` / `unsupported_harness` / `auth_error`.
+  from a permanent `unsupported_harness` / `auth_error` / `configuration_error`.
 - **Distinct cancelled outcome — but only where it is actually deliverable.** A user/client abort
   is **not** a failure. The subtlety (a real review catch): a *client disconnect* mid-stream
   cannot reliably receive a terminal record, because the disconnect is exactly what tears the
@@ -389,176 +443,147 @@ outcome:
   - Optionally also emit a `done` event with `stopReason:"cancelled"` for streams (useful as a
     live signal), but the terminal result remains authoritative when the connection is alive.
 - **Migration:** `result_from_wire` must accept **both** the old free-string `error` and the new
-  structured object during the transition (parse a string into `{code:"internal", message:str,
-  retryable:false}`), so an old runner against a new service still parses. The structured form is
-  the v2 shape; the string form is read-compat only.
+  structured object (parse a string into `{code:"internal", message:str, retryable:false}`). This
+  read-compat is cheap and avoids a hard flag-day, but because this is a POC we do **not** treat
+  the new error shape as a versioned cut — the wire just changes to the structured form.
 
-This is a contract change (new error shape) and folds into the same v2 bump as A3.
+This is a wire-shape change (the new structured error), made directly. No version bump is tied to
+it (POC).
 
-## 8. Incremental, test-at-each-step migration plan
+## 8. Incremental, test-at-each-step plan (POC-framed)
 
-No big-bang. Each step is a small change plus the test that proves it. Steps 1-6 are
-**behavior-preserving at contract v1** (the schema must validate the *current* goldens), so they
-can land before A1/A3/A10. Steps 7-10 are the versioned (v2) changes that depend on the siblings.
+No big-bang, but no versioning machinery either — this is a pre-production POC, so the wire just
+changes when it needs to. Each step is a small change plus the test that proves it. The
+heaviest items (runner-side validation, generating `protocol.ts`, version negotiation) are
+**deferred** until the contract stabilizes; they are listed at the end as follow-ups, not steps.
 
 The sequence respects the shared-surface rule (`agent-coordination.md`): any change to
 `protocol.ts` / `wire.py` / golden / the two contract tests is coordinated, single-PR, both
 sides + golden together.
 
-1. **Add the Pydantic request/result models (no wire change).**
-   Add `AgentRunRequest` / `AgentRunResult` Pydantic models in the SDK that compose the existing
-   `dtos.py` sub-models with camelCase aliases, reproducing exactly what `request_to_wire` /
-   `result_from_wire` emit/parse today.
-   *Test:* a new unit test asserts `AgentRunRequest(...).model_dump(by_alias=True, exclude_none-ish)
-   == request_to_wire(...)` for the Pi, Claude, and Agenta payloads (round-trip parity with the
-   hand-built dicts and the goldens). Green before touching anything else.
-
-2. **Export the JSON Schema artifact + a freshness test.**
-   Add a script that writes `services/agent/contract/run-contract.schema.json` from
-   `model_json_schema()`. Commit the artifact.
-   *Test:* a CI test regenerates the schema in-memory and asserts it equals the committed file
-   (drift -> fail), mirroring how the goldens are regenerated deliberately.
-
-3. **Assert the existing goldens validate against the schema (Python side).**
-   *Test:* load each of the four goldens, validate against `run-contract.schema.json`
-   (`jsonschema`); all must pass. This proves the schema faithfully describes today's wire before
-   anything consumes it. No production code path changes yet.
-
-4. **Add ajv validation in the runner, behind a permissive mode (TS side).**
-   Add ajv (one dependency) and load `run-contract.schema.json`. Validate the `/run` body in
-   `server.ts`/`cli.ts`; in this step, on failure **log and continue** (permissive) so nothing
-   breaks, and assert the goldens validate.
-   *Test:* `wire-contract.test.ts` validates the goldens through ajv (must pass) and validates a
-   deliberately-malformed payload (must report the right error path) — without yet rejecting it.
-
-5. **Flip the runner to reject malformed requests (the boundary guard) — at v1 error shape.**
-   Change permissive -> reject: a present-but-malformed body is rejected with a `400`. **Ordering
-   subtlety (a real review catch):** at this point the result `error` is still the v1 free string
-   (`protocol.ts` `error?: string`), so this step must NOT emit the structured `{error:{code:...}}`
-   shape yet — emitting structured errors before the v2 error model lands would itself be a
-   contract change. So the v1 rejection returns a **string** error
-   (`{ok:false, error:"invalid_request: sandboxPermission.network.mode: must be one of ..."}`),
-   carrying the code as a stable prefix. The structured object replaces it in step 8. An empty
-   body still parses to a valid default request. Keep `additionalProperties` permissive (unknown
-   top-level keys logged, not rejected) for cross-version safety.
-   *Test:* `server.test.ts` asserts a malformed `/run` -> `400` with the `invalid_request:` prefix;
-   an empty body -> still runs; the goldens -> still accepted. CLI test mirrors via exit code.
-
-6. **Replace the duplicated key lists with schema-derived guards (+ a deeper-than-keys check).**
-   Swap the two hand-kept `KNOWN_REQUEST_KEYS` for: (Python) derive the allowed key set from the
-   schema's `properties`; (TS) drive the guard list from the schema property names so it cannot
-   silently fall behind. **But a key guard alone is not enough** (review catch): it misses
-   *nullability* drift — the Claude golden sends `sessionId: null` / `trace: null` while
-   `protocol.ts` types them `?: string`. Add a schema-derived check that compares each field's
-   nullability/optionality against the TS types (or, if sub-option b2 was chosen, generate
-   `protocol.ts` from the schema so the check is structural).
-   *Test:* `set(schema.properties) == set(KNOWN_REQUEST_KEYS)` on both sides, plus a nullability
-   assertion (`sessionId`/`trace` accept `null`). **End of the behavior-preserving phase: the
-   hand-maintained key lists are gone and the runtime guard is the schema. The TS *types* are
-   either generated (b2) or guarded deeper than keys (b1); the omit-when-empty serializer behavior
-   is still asserted by the goldens, not by the schema.**
-
-   --- the steps below are the v2 contract changes; they depend on the siblings ---
-
-7. **Add `contractVersion` in-band + consume the version on BOTH transports (with A1).**
-   Add optional `contractVersion` (additive, still v1-compatible) to the request/result schema;
-   service stamps it. Consume it on **both** transports, not only `/health` (review catch: the
-   backend picks HTTP *or* subprocess in `adapters/sandbox_agent.py`, and the CLI has no health
-   mode): the HTTP path probes `GET /health.protocol` once; the **subprocess/CLI path** gets a
-   `--version`/`--protocol` mode (or reads the in-band `contractVersion` echoed on the first
-   result) so a skewed CLI runner is also refused.
-   *Test:* a transport test stubs an incompatible `/health` major and asserts the HTTP adapter
-   refuses before `/run`; a CLI test asserts the subprocess version probe refuses an incompatible
-   major; a schema test asserts `contractVersion` round-trips.
-
-8. **The v2 cut: structured error model + cancelled outcome (A10) AND the A3 removal, together.**
-   These are **two breaking changes**, so they ship as **one `PROTOCOL_VERSION = 2` cut**, not as
-   two steps each calling itself "v2" (review catch: a second breaking change after a bump is not
-   incremental — bump once, change once). In this single cut:
-   - Result `error` becomes `{code, message, retryable}`; `result_from_wire` reads both the new
-     object and the old v1 string (string -> `{code:"internal", message:str}`) for read-compat
-     against an older runner. The step-5 `invalid_request:`-prefixed string rejection becomes the
-     structured object.
-   - Cancellation: cooperative cancel emits the terminal `{ok:false, error:{code:"cancelled"}}`;
-     transport-teardown cancel maps to a distinct Python `CancelledError` outcome (per §7 A10).
-   - A3: drop `backend` from the schema/models/goldens/guards; rename the `harness` enum to
-     `pi_core | pi_agenta | claude`; update `version.ts` `HARNESSES`.
-   - **Sequencing within the cut is itself incremental and test-guarded:** land read-compat error
-     parsing first (no behavior change), then flip the emitted shape, then the A3 removal — each
-     with its test green — but they release as one v2 because they share the breaking bump.
-   *Test:* `test_wire_contract.py` parses an old-string-error golden and a new-structured golden;
-   a runner test asserts cooperative cancel yields the terminal `cancelled` record and a disconnect
-   yields the Python `CancelledError`; regenerated Pi/Claude goldens (no `backend`, renamed
-   harness) assert the new shapes; a test asserts the schema rejects `backend` and the old
-   `pi`/`agenta` harness values. New goldens: `run_result.cancelled.json`,
-   `run_result.error_structured.json`. (If the team prefers to decouple, A3 can instead be a
-   separate **v3** cut — but it must not share v2's bump.)
-
-9. **Promote the capability probe: `GET /capabilities` (additive, can land any time).**
-   Add the static per-harness `HarnessCapabilities` route to the runner. Define explicitly whether
-   it returns **base** capabilities (what the harness supports at all) or **effective** ones (for a
-   given request/mode) — `streamingDeltas` is mode-dependent today (review catch:
-   `engines/sandbox_agent.ts` derives it at run time). Recommendation: return **base** capabilities
-   from this static probe and document that mode-dependent flags are only authoritative in a run
-   result. Optionally have the service pre-validate a request against the base capabilities.
-   *Test:* `server.test.ts` asserts `GET /capabilities` returns the schema-valid base capability
-   map per harness without running a turn; a schema test asserts the capability map validates.
-
-After these steps: one schema source (dedicated Pydantic wire models) -> one exported artifact ->
-runtime validation on both sides -> structured errors + a correctly-modeled cancelled outcome ->
-in-band + both-transport contract version -> a real capability probe, with a test at every step
-and no point where both sides are broken at once.
+1. **Add the dedicated Pydantic wire models in the SDK (no wire change).**
+   Add `WireRunRequest` / `WireRunResult` (and the discriminated `WireAgentEvent`) wire models in
+   the SDK, with camelCase aliases, reproducing exactly what `request_to_wire` / `result_from_wire`
+   emit/parse today (against the *current* renamed shape — `pi_core` / `pi_agenta`, no `backend`).
+   *Test:* a unit test asserts `WireRunRequest(...).model_dump(by_alias=True, exclude-none-ish)
+   == request_to_wire(...)` for the pi_core, claude, and pi_agenta payloads (round-trip parity with
+   the goldens). Green before anything else.
+
+2. **Export the JSON interface into the SDK + a freshness test.**
+   Export `model_json_schema()` for the wire models and ship it in the SDK alongside the existing
+   `CATALOG_TYPES` JSON interfaces (Section 4.1). Commit the artifact.
+   *Test:* a test regenerates the schema in-memory and asserts it equals the committed export
+   (drift -> fail), the same discipline the goldens already use.
+
+3. **Assert the existing goldens validate against the exported schema (Python side, tests only).**
+   *Test:* load each golden, validate against the exported schema (`jsonschema`); all must pass.
+   This proves the schema faithfully describes today's wire. **No production code path changes, and
+   nothing on the runner side** — validation here is a test, not a runtime guard (Section 5).
+
+4. **Make the wire models the single producer.**
+   Reimplement `request_to_wire` / `result_from_wire` in terms of the wire models, keeping the
+   omit-when-empty serializer behavior. The goldens stay byte-identical (this is a refactor, the
+   models already match the wire from step 1).
+   *Test:* the existing golden wire-contract test stays green unchanged; add a parity test that the
+   reimplemented serializers equal the old output.
+
+5. **Replace the duplicated key lists with a schema-derived guard (Python side).**
+   Swap the hand-kept Python `KNOWN_REQUEST_KEYS` for a set derived from the exported schema's
+   `properties`, so the Python guard cannot silently fall behind. The TS `KNOWN_REQUEST_KEYS` guard
+   in `wire-contract.test.ts` stays hand-written for now (we are not generating `protocol.ts` or
+   touching the runner this phase).
+   *Test:* `set(schema.properties) == set(python KNOWN_REQUEST_KEYS)`.
+
+6. **Structured error model + cancelled outcome (A10).**
+   Result `error` becomes `{code, message, retryable}`; `result_from_wire` also reads the old free
+   string for read-compat (string -> `{code:"internal", message:str}`). Cancellation: cooperative
+   cancel emits the terminal `{ok:false, error:{code:"cancelled"}}`; transport-teardown cancel maps
+   to a distinct Python `CancelledError` (per §7 A10). This is a direct wire change — **no version
+   bump** (POC).
+   *Test:* `test_wire_contract.py` parses an old-string-error golden and a new-structured golden; a
+   transport test asserts a disconnect yields the Python `CancelledError`. New goldens:
+   `run_result.cancelled.json`, `run_result.error_structured.json`.
+
+7. **Promote the capability probe: `GET /capabilities` (additive, the author endorsed it).**
+   Add the static per-harness `HarnessCapabilities` route to the runner. It returns **base**
+   capabilities (what the harness supports at all); mode-dependent flags (`streamingDeltas`, derived
+   at run time in `engines/sandbox_agent.ts`) stay authoritative only in a run result. The service
+   can pre-render UI / pre-check a request against the base set.
+   *Test:* `server.test.ts` asserts `GET /capabilities` returns the base capability map per harness
+   without running a turn.
+
+### Deferred follow-ups (only once the contract stabilizes)
+
+These are explicitly **not** in this phase, per the author's review:
+
+- **Runner-side request validation.** Loading the schema in `server.ts` / `cli.ts` and rejecting a
+  malformed `/run` (with ajv or similar). The contract is too brittle to gate on yet.
+- **Generating `protocol.ts` from the schema.** Pays off only once the runner consumes the schema;
+  until then `protocol.ts` stays hand-written and the Python goldens are the guard.
+- **A version field + negotiation.** Owned by A1; if/when it lands it is a simple string version +
+  if/else (Section 7 A1), not a `{major, minor}` or upcaster/downcaster scheme.
+- **Fern generating the interface across languages.** Reachable later via Pydantic → OpenAPI → Fern
+  once the contract is stable enough to publish into the clients (Section 4.1).
+
+After this phase: one Pydantic wire-model source -> the JSON interface shipped in the SDK ->
+structured errors + a correctly-modeled cancelled outcome -> a real capability probe. No runner
+validation, no version machinery, no generated TS types — those are deferred until the contract
+settles.
 
 ## 9. Risks and mitigations
 
-- **Drift between `protocol.ts` types and the schema.** Mitigated by step 6's schema-derived guard
-  (keys **and** nullability, or generated TS types) + step-4 ajv validation of the goldens; a drift
-  fails `tsc` or a test.
-- **An older runner against a newer service (or vice versa).** Mitigated by permissive
-  `additionalProperties`, the read-compat error parsing (step 8), and the both-transport major
-  probe (step 7).
-- **The committed schema artifact going stale.** Mitigated by step 2's freshness test (regenerate
-  == committed), the same discipline the goldens already use.
-- **Sequencing against A1/A3/A10.** Steps 1-6 are independent and land first at v1; A10 + A3 are
-  the **single** v2 cut (step 8); the version probe (7) and capability probe (9) are additive.
-- **Standalone-package constraint.** Only one runner *runtime* dependency added (ajv); no runtime
-  build step; the schema is loaded as data, honoring `services/agent/CLAUDE.md`. (If TS types are
-  generated — sub-option b2 — that is a committed artifact produced by a script, not a build step.)
+- **Drift between `protocol.ts` types and the schema.** While `protocol.ts` stays hand-written and
+  the runner does not consume the schema, this drift is tolerated as a POC trade-off. The Python
+  goldens + the schema-derived Python key guard (step 5) catch Python-side drift; aligning the TS
+  types is a manual discipline for now. Generating `protocol.ts` is the deferred fix.
+- **The committed schema export going stale.** Mitigated by step 2's freshness test (regenerate ==
+  committed), the same discipline the goldens already use.
+- **Sequencing against A1.** A1 owns the version convention (a simple string + if/else); this
+  project only carries whatever field A1 specifies. The error model (step 6) and capability probe
+  (step 7) do not depend on A1.
+- **No boundary guard means typos still pass silently.** Accepted for now — the contract is too
+  brittle to gate on. The runner keeps today's behavior. Revisit with the deferred runner-side
+  validation once the contract stabilizes.
 
 ## 10. Open questions for review
 
-1. **Wire models vs semantic DTOs, and where they live.** Confirmed direction: a **dedicated set
-   of wire models** (not the snake_case semantic DTOs in `dtos.py`). Open: place them in a new
-   `agents/wire_models.py` next to `dtos.py` (proposed) vs a dedicated contract package. Export the
-   artifact into `services/agent/contract/`.
-2. **TS types: hand-written + deep guard (b1) vs generated from the schema (b2).** Proposal: **b2
-   (generate `protocol.ts`)** — a key guard misses nullability drift (`sessionId`/`trace` are
-   `null` on the wire but `?:` in TS). If b1, the guard must compare nullability, not just keys.
-3. **`additionalProperties` end state.** Stay permissive forever (log unknowns) or tighten to
-   reject after the version probe is in place? Proposal: permissive for top-level request fields,
-   strict for nested objects where a typo is more likely a bug than a version skew.
-4. **Cancelled modeling.** Cooperative cancel -> terminal `error.code:"cancelled"`; transport
+1. **Wire models placement.** A new `agents/wire_models.py` next to `dtos.py` (proposed) vs a
+   dedicated contract package. The exported JSON interface ships in the SDK alongside
+   `CATALOG_TYPES`.
+2. **Where exactly the exported interface is surfaced in the SDK.** As an entry in (or sibling of)
+   `CATALOG_TYPES` in `sdks/python/agenta/sdk/utils/types.py`, vs a standalone export. Either keeps
+   it SDK-resident; the `CATALOG_TYPES` path also makes it `/inspect`-discoverable.
+3. **Cancelled modeling.** Cooperative cancel -> terminal `error.code:"cancelled"`; transport
    teardown -> distinct Python `CancelledError` (proposed). Optionally also a `done`
-   `stopReason:"cancelled"`. Needs A10 sign-off. `retryable` for cancel: `false`/omit.
-5. **Version-cut grouping.** A10 + A3 as one v2 cut (proposed) vs A3 as a separate v3. A1 owns the
-   final call on bump grouping.
-6. **`contractVersion` granularity + transport coverage.** Single int major (proposed, matches
-   `/health.protocol`) vs semver; and the subprocess/CLI version probe shape (`--protocol` flag vs
-   in-band echo). A1 owns the granularity.
-7. **Capability probe shape + base-vs-effective.** Return all harnesses (proposed) vs `?harness=`;
+   `stopReason:"cancelled"`. `retryable` for cancel: `false`/omit.
+4. **Capability probe shape + base-vs-effective.** Return all harnesses (proposed) vs `?harness=`;
    and the probe returns **base** capabilities (proposed), with mode-dependent flags
    (`streamingDeltas`) authoritative only in a run result.
+5. **The deferred follow-ups (Section 8).** Confirm runner-side validation, generated `protocol.ts`,
+   the version field, and Fern publication are all out of scope for this POC phase.
 
 ## 11. Review
 
-This plan was reviewed by Codex (gpt-5.5, xhigh, read-only) on 2026-06-24. It verified the claims
-against the code and the verdict ("Option B is the right direction, but ...") drove six concrete
-corrections now folded in: (1) source from dedicated **wire** models, not the snake_case semantic
-DTOs; (2) a key guard misses **nullability** drift — generate TS types or test deeper; (3) A10 and
-A3 are **two** breaking changes -> one v2 cut (or A3 as v3), not two "v2" steps; (4) step 5 cannot
-emit the structured error shape while v1 still types `error` as a string; (5) the version probe
-must cover the **subprocess/CLI** transport, not only `/health`; (6) cancellation via a terminal
-record only works for **cooperative** cancel — a disconnect maps to a Python `CancelledError`.
-Also expanded the error taxonomy (`auth_error`/`quota_exceeded`/`rate_limited`/
-`configuration_error`/`permission_denied`, which `engines/sandbox_agent/errors.ts` already
-classifies) and added the base-vs-effective capability distinction.
+This plan was reviewed by Codex (gpt-5.5, xhigh, read-only) on 2026-06-24, then revised on
+2026-06-24 per the author's PR review on #4830. The author's direction simplified it toward the POC
+reality:
+
+- **No back-compat burden** — this is still an internal POC, so any wire shape may change freely
+  (the "must preserve the model/connection split" framing was dropped).
+- **Pydantic as the source for now**, with the immediate goal that the exported JSON interface
+  lives **in the SDK** (the `CATALOG_TYPES` path), plus a Fern investigation (Section 4.1): Fern
+  here is driven by Pydantic → OpenAPI → Fern → SDKs, so it can see this interface later via the
+  OpenAPI surface once the contract stabilizes — no hard blocker, only a timing call.
+- **No sidecar/runner validation yet** (no ajv, no new runner dependency) — the contract is still
+  brittle (Section 5); `protocol.ts` stays hand-written for now.
+- **No versioning machinery** — the pi/agenta rename (already landed) is not versioned, and any
+  version field defers to A1's simple string + if/else convention.
+- **Keep `/capabilities`** — the author endorsed the probe.
+
+Codex's earlier structural catches that survive the simplification: source from dedicated **wire**
+models (not the snake_case semantic DTOs); cancellation via a terminal record only works for
+**cooperative** cancel (a disconnect maps to a Python `CancelledError`); the error taxonomy is
+grounded in what `engines/sandbox_agent/errors.ts` already classifies; capabilities are
+base-vs-effective. The corrections that were about versioning/validation (two-breaking-changes-one-
+cut, the step-5 error-shape ordering, the both-transport version probe) are **moot** now that
+versioning and runner validation are deferred.
diff --git a/docs/design/agent-workflows/projects/wire-contract-schema/status.md b/docs/design/agent-workflows/projects/wire-contract-schema/status.md
index 4984a57b84..7fea08bf35 100644
--- a/docs/design/agent-workflows/projects/wire-contract-schema/status.md
+++ b/docs/design/agent-workflows/projects/wire-contract-schema/status.md
@@ -2,80 +2,96 @@
 
 | | |
 | --- | --- |
-| **Phase** | Plan written + Codex-reviewed (corrections folded in). Awaiting human review. No code. |
-| **Owner** | wire-contract-schema (this session is A2 in the A1/A2/A3/A10 cohort) |
+| **Phase** | Plan written + Codex-reviewed, then **revised per the author's PR review on #4830** (2026-06-24). Awaiting re-review. No code. |
+| **Owner** | wire-contract-schema (A2 in the A1/A2/A3/A10 cohort) |
 | **Lane** | dedicated GitButler lane `feat/agent-wire-contract-schema-plan`, single commit (docs only) |
 | **Created** | 2026-06-24 |
+| **Revised** | 2026-06-24 (author PR review) |
 
 ## What exists
 
-- `README.md` — the full plan: current-state assessment, the three source-of-truth options with
-  the Option B recommendation, the `/run` split decision, the A10 structured-error + cancelled
-  change, the A1 in-band contract-version coordination, a 9-step test-at-each-step migration, and a
-  Review section (§11) recording the Codex pass.
-
-## Decisions made in the plan
-
-1. **Schema source = dedicated Pydantic *wire* models (Option B), NOT the semantic DTOs.** Codex's
-   sharpest catch: the real contract lives in the hand serializers (`wire.py`, `*.to_wire`), and
-   the `dtos.py` classes are snake_case with a loose `AgentEvent` — exporting them would give the
-   wrong schema. So author `WireRunRequest` / `WireRunResult` / a discriminated `WireAgentEvent`
-   with camelCase aliases, export JSON Schema via `model_json_schema()`, validate in the runner
-   with ajv (one dep, no build step). JSON-Schema-as-source (A) and a shared IDL (C) both pull
-   codegen toolchains into a repo that has none and a package that forbids a build step.
-2. **TS types are best generated from the schema (b2).** A top-level key guard misses *nullability*
-   drift (`sessionId`/`trace` are `null` on the wire, `?:` in `protocol.ts`). Generate `protocol.ts`
-   as a committed artifact, or add nullability-deep type tests. Open question for review.
-3. **`/run` stays unified for the turn.** One-shot vs streaming is content negotiation (`Accept`),
-   not two endpoints — they share the request/result shapes. Promoted instead: a `GET /capabilities`
-   probe (static **base** per-harness capabilities, no run) and consuming the contract version on
-   **both** transports. Rejected: a `/cancel` endpoint.
-4. **Error model `{ code, message, retryable }`** with an expanded taxonomy (incl. `auth_error` /
-   `quota_exceeded` / `rate_limited` / `permission_denied`, which `errors.ts` already classifies)
-   and a cancelled outcome that is modeled correctly: terminal record for **cooperative** cancel,
-   a Python `CancelledError` for **transport-teardown** cancel (a disconnect cannot receive a
-   terminal record).
-5. **Behavior-preserving first.** Steps 1-6 introduce the schema, artifact, validation, and
-   boundary guard at contract **v1**. A10 + A3 are the **single** v2 cut (step 8), not two "v2"
-   steps. The version probe (7) and capability probe (9) are additive.
-
-## Codex review (2026-06-24)
-
-gpt-5.5, xhigh, read-only. Verdict: "Option B is the right direction, but the plan was too
-optimistic as written." Six corrections folded in: wire-models-not-DTOs, nullability drift,
-two-breaking-changes-one-cut, step-5 error-shape ordering, both-transport version probe,
-cooperative-vs-teardown cancellation. Plus taxonomy expansion and base-vs-effective capabilities.
-Full critique in README §11. Log at `/tmp/codex-out-wire-schema.log` (transient).
+- `README.md` — the plan, revised to the author's POC framing: current-state assessment, the three
+  source-of-truth options with the Option B recommendation (Pydantic-as-source **for now**, JSON
+  interface **in the SDK**, a Fern investigation in §4.1), the `/run` split decision (keep unified,
+  promote `/capabilities`), the A10 structured-error + cancelled change, A1 coordination on a
+  **simple string version**, a 7-step POC-framed plan with the heavy items deferred, and a Review
+  section (§11) recording both the Codex pass and the author's revision.
+
+## Author PR review (2026-06-24) — what changed
+
+Four inline comments on #4830, all addressed:
+
+1. **No back-compat burden** (README ~§3.1). Dropped all "the schema must preserve the
+   model/connection split / omit-when-empty bytes" framing. This is an internal POC; any wire shape
+   may change freely. Shape notes are now described as "current serializer behavior, not a
+   constraint." (README §Status, §1, §2, §3.1, §11.)
+2. **Pydantic-as-source now + interface in the SDK + Fern** (README ~§4 recommendation). Revised the
+   recommendation: Pydantic is the source for now; the immediate goal is that the exported JSON
+   Schema interface lives **in the SDK** (the `CATALOG_TYPES` path); added §4.1 investigating Fern.
+   Explicitly **dropped using the schema in the sidecar/runner** for now (contract still brittle).
+3. **No runner ingress validation** (README ~§5). Rewrote §5 as "validation — deferred": no ajv, no
+   runner dependency, no `server.ts`/`cli.ts` request validation. The schema is used in Python tests
+   only (goldens-must-validate). A boundary guard is a deferred follow-up.
+4. **Keep `/capabilities`** (README ~§6). The probe stays; the author endorsed it. Noted his
+   endorsement inline.
+
+## Fern findings (the §4.1 investigation)
+
+- Fern in this repo generates the multi-language API **clients** (Python + TS) under `clients/` and
+  `web/packages/agenta-api-client/`. The pipeline (`clients/scripts/generate.sh`) is
+  **Pydantic → `/api/openapi.json` → Fern (`fernapi/fern-python-sdk`, `fernapi/fern-typescript-sdk`)
+  → SDKs**. There is no checked-in `.fern/` IDL; Fern's only input is the generated OpenAPI doc.
+- The SDK **already** exposes Pydantic-derived JSON interfaces: `CATALOG_TYPES` in
+  `sdks/python/agenta/sdk/utils/types.py` (~line 1265) is a dict of `model_json_schema()` outputs,
+  surfaced via `/inspect` `x-ag-type-ref` markers (`services/oss/src/agent/schemas.py`). The wire
+  contract should ship the same way.
+- **Can Fern see this interface? Yes — but only via OpenAPI, with a caveat.** `/run` is the
+  service↔runner spine, not a public FastAPI endpoint, so it is not in `openapi.json` today and Fern
+  cannot see it as-is. Making Fern see it = reference the wire Pydantic models from a FastAPI surface
+  (FastAPI then emits them into `components/schemas`, the same path `AgentConfigSchema` takes). **No
+  hard blocker** — the only reason not to now is that the contract is brittle and publishing a moving
+  target into every generated client is premature. So: SDK-resident now, Fern later.
+
+## Decisions made in the (revised) plan
+
+1. **Schema source = dedicated Pydantic *wire* models (Option B), NOT the semantic DTOs**, authored
+   against the **already-landed** renamed shape (`pi_core` / `pi_agenta`, no `backend`). Export
+   `model_json_schema()` and ship it in the SDK alongside `CATALOG_TYPES`.
+2. **`protocol.ts` stays hand-written for now.** No generated TS types this phase (only pays off once
+   the runner consumes the schema). Python goldens are the guard.
+3. **`/run` stays unified for the turn.** Promote a `GET /capabilities` probe (static **base**
+   per-harness capabilities). Rejected: a `/cancel` endpoint.
+4. **Error model `{ code, message, retryable }`** with a grounded taxonomy and a cancelled outcome
+   (terminal record for cooperative cancel; Python `CancelledError` for transport-teardown cancel).
+   Made as a **direct wire change, no version bump** (POC).
+5. **No versioning machinery.** The pi/agenta rename is not versioned. Any version field defers to
+   A1's **simple string version + if/else** (the `x-ag-messages-version: "v1"` / LLM-as-judge
+   pattern) — no `{major, minor}`, no `contractVersion` name, no upcaster/downcaster.
+6. **No runner/sidecar validation yet** (deferred follow-up).
+
+## Deferred (Section 8 follow-ups, out of scope for this POC phase)
+
+- Runner-side request validation (ajv / boundary guard).
+- Generating `protocol.ts` from the schema.
+- A version field + negotiation (A1-owned, simple string).
+- Fern generating the interface across languages (via Pydantic → OpenAPI once stable).
 
 ## Coordination
 
-- **A1 (`contract-versioning`)** — sibling at `../contract-versioning/`; it found the same
-  unconsumed-`/health.protocol` gap. A1 owns version-number semantics + bump policy; this project
-  provides the in-band `contractVersion` field and the both-transport probe (step 7).
-- **A3 (backend removal + harness rename)** — assumed end state; schema drops `backend` and renames
-  the `harness` enum to `pi_core | pi_agenta | claude` as part of the v2 cut (step 8). Steps 1-6
-  are at v1, so the schema work does not block on A3.
-- **A10 (error model)** — folded into step 8.
-- **`sidecar-trust-and-sandbox-enforcement`** flagged a stale comment at `protocol.ts:149-150`
-  (sandbox enforcement is now real on Daytona). Noted; the v2 `SandboxPermission` schema work
-  should pick up the corrected wording.
-- **DOCS-ONLY.** No edit to `protocol.ts` / `wire.py` / golden / contract tests. Composio, the tool
-  gateway, connections, and MCP are described as existing and **unchanged**.
+- **A1 (`contract-versioning`)** — sibling at `../contract-versioning/`, being simplified by another
+  agent to a plain string version + if/else per the author. This project reuses whatever string
+  convention A1 lands on; does NOT invent its own. (Did not touch A1's README — another agent owns it.)
+- **A3 (backend removal + harness rename)** — **already landed in the working tree** (`version.ts`
+  has `pi_core`/`pi_agenta`, golden renamed `run_request.pi_core.json`, `engines/pi.ts` deleted). The
+  wire models describe that shape from the start; the rename is not versioned (POC).
+- **A10 (error model)** — folded into the plan (step 6) as a direct wire change.
+- **`sidecar-trust-and-sandbox-enforcement`** flagged a stale `protocol.ts:149-150` comment; noted.
+- **DOCS-ONLY.** No edit to `protocol.ts` / `wire.py` / golden / contract tests / `interfaces/*`.
+  Composio, the tool gateway, connections, and MCP are described as existing and unchanged.
 
 ## Next actions (after review)
 
-- Get sign-off on README §10 open questions (wire-model placement, b1-vs-b2 TS types,
-  `additionalProperties`, cancelled modeling, version-cut grouping, `contractVersion` granularity,
-  capability probe shape).
-- Confirm the v2-cut grouping (A10 + A3 together vs A3 as v3) with A1/A3 owners.
+- Get sign-off on README §10 open questions (wire-model placement, where the SDK surfaces the export,
+  cancelled modeling, capability probe shape, and the deferred follow-up list).
+- Confirm with A1 the exact simple string version convention to carry (if any) on the payload.
 - Then implement step 1 (dedicated wire models with round-trip parity tests against the goldens).
-
-## Open questions (see README §10 for full text)
-
-1. Wire-model placement: `agents/wire_models.py` (proposed) vs a contract package.
-2. TS types: generated from schema (b2, proposed) vs hand-written + deep guard (b1).
-3. `additionalProperties`: permissive top-level + strict nested (proposed).
-4. Cancelled modeling: cooperative terminal record + teardown `CancelledError` (proposed).
-5. Version-cut grouping: A10 + A3 one v2 (proposed) vs A3 as v3 — A1's call.
-6. `contractVersion` granularity + subprocess probe shape — A1's call.
-7. Capability probe: all harnesses + base capabilities (proposed).

From 3813d39c0f6cb702810e94f5ab3704f168d72313 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Wed, 24 Jun 2026 23:30:08 +0200
Subject: [PATCH 3/6] feat(agent): wire models as the /run schema source +
 canonical /inspect response

Implement the schema-driven /run contract plan (this PR's plan doc) plus the folded
/inspect follow-ups. Pre-production POC: no back-compat, no runtime validation.

- Dedicated Pydantic wire models (sdks/python/agenta/sdk/agents/wire_models.py:
  WireRunRequest / WireRunResult + an open WireAgentEvent) are the single schema source
  of truth, separate from the snake_case semantic DTOs. run_contract_schemas() exports
  their dereferenced camelCase JSON Schema, shipped in the SDK via CATALOG_TYPES
  (run_request / run_result), the same path agent_config takes. No new endpoint, no new
  toolchain.
- No validation: wire.py stays the dict producer (omit-when-empty lives there, pinned by
  goldens); the models are the schema authority + test guard only. Nothing gates a live
  /run. test_wire_models.py: freshness guard (committed catalog == fresh export), goldens
  validate + parse, request_to_wire output validates, schema props == KNOWN_REQUEST_KEYS.
- Issue 1: canonical WorkflowInspectResponse in models/workflows.py; handle_inspect_success
  normalizes the built WorkflowInvokeRequest into it, lifting WorkflowRevisionData to a flat
  top-level revision so schemas live at response.revision.schemas (was the latent-broken
  data.revision.data.schemas). The three /inspect routes return WorkflowInspectResponse. FE
  InspectWorkflowResponse type + store.ts read now resolve against the real body; the
  deprecated interface?.schemas fallback stays as a bridge for two sibling readers.
- Issue 4: typed /inspect outputs keyed per surface (invoke -> message, messages ->
  messages) in services/oss/src/agent/schemas.py; reuses existing catalog markers.

Deferred: /run version field, runner-side validation, GET /capabilities, generating
protocol.ts, structured-error/cancelled outcome, Fern publication.

Claude-Session: https://claude.ai/code/session_01GYo3UEfvsZpncagqb28Mbc
---
 .../projects/wire-contract-schema/status.md   |  49 ++-
 sdks/python/agenta/sdk/agents/utils/wire.py   |   8 +
 sdks/python/agenta/sdk/agents/wire_models.py  | 374 ++++++++++++++++++
 sdks/python/agenta/sdk/decorators/routing.py  |  30 +-
 sdks/python/agenta/sdk/models/workflows.py    |  31 ++
 sdks/python/agenta/sdk/utils/types.py         |   6 +
 .../pytest/unit/agents/test_wire_models.py    | 121 ++++++
 .../pytest/unit/test_inspect_response.py      | 103 +++++
 services/oss/src/agent/schemas.py             |  22 +-
 .../agenta-entities/src/workflow/api/api.ts   |  26 +-
 .../src/workflow/state/store.ts               |   6 +-
 .../inspectResponseSchemaResolution.test.ts   |  83 ++++
 12 files changed, 840 insertions(+), 19 deletions(-)
 create mode 100644 sdks/python/agenta/sdk/agents/wire_models.py
 create mode 100644 sdks/python/oss/tests/pytest/unit/agents/test_wire_models.py
 create mode 100644 sdks/python/oss/tests/pytest/unit/test_inspect_response.py
 create mode 100644 web/packages/agenta-entities/tests/unit/inspectResponseSchemaResolution.test.ts

diff --git a/docs/design/agent-workflows/projects/wire-contract-schema/status.md b/docs/design/agent-workflows/projects/wire-contract-schema/status.md
index 7fea08bf35..12420c086a 100644
--- a/docs/design/agent-workflows/projects/wire-contract-schema/status.md
+++ b/docs/design/agent-workflows/projects/wire-contract-schema/status.md
@@ -2,11 +2,56 @@
 
 | | |
 | --- | --- |
-| **Phase** | Plan written + Codex-reviewed, then **revised per the author's PR review on #4830** (2026-06-24). Awaiting re-review. No code. |
+| **Phase** | **Implemented** (2026-06-24). Pydantic wire models are the schema source of truth, exported into the SDK via `CATALOG_TYPES`; the `/inspect` canonical response + typed outputs landed. No runner/validation work (deferred). |
 | **Owner** | wire-contract-schema (A2 in the A1/A2/A3/A10 cohort) |
-| **Lane** | dedicated GitButler lane `feat/agent-wire-contract-schema-plan`, single commit (docs only) |
+| **Lane** | `feat/agent-wire-contract-schema-plan` (PR #4830), re-stacked on `feat/agent-contract-versioning-docs` (#4829). One PR = plan doc + impl. |
 | **Created** | 2026-06-24 |
 | **Revised** | 2026-06-24 (author PR review) |
+| **Implemented** | 2026-06-24 |
+
+## What shipped (the implementation)
+
+The plan's source-of-truth slice plus the folded `/inspect` follow-ups (architecture-followups
+issue 1 + typed outputs). Resolved every open question with the least-code option:
+
+- **Wire models as the single schema source of truth** —
+  `sdks/python/agenta/sdk/agents/wire_models.py`: dedicated camelCase Pydantic models
+  (`WireRunRequest`, `WireRunResult`, sub-objects, and an OPEN `WireAgentEvent` whose `type` is
+  optional so a typeless event is tolerated, mirroring the parser's drop behavior). NOT the
+  snake_case semantic DTOs. `run_contract_schemas()` exports their dereferenced, camelCase JSON
+  Schema.
+- **The JSON interface ships in the SDK** via `CATALOG_TYPES` (`run_request` / `run_result`), the
+  same path `agent_config` takes — so it is `/inspect`-discoverable through
+  `GET /workflows/catalog/types/{type}`. No new endpoint.
+- **Tests, no runtime validation** (`test_wire_models.py`): the committed catalog matches a fresh
+  export (freshness guard), all four goldens validate against the exported schema and parse into
+  the models, `request_to_wire` output validates, and the schema's property set equals
+  `KNOWN_REQUEST_KEYS` (the schema-derived key guard). Nothing gates a live `/run`.
+- **`wire.py` stays the dict producer** — least-code: the omit-when-empty behavior lives there and
+  is pinned by the goldens (a thing `model_json_schema()` cannot express). The models are the
+  *schema* authority and a docstring in `wire.py` points to them. No serializer rewrite.
+- **Issue 1 — canonical `/inspect` response**: `WorkflowInspectResponse` in
+  `sdks/python/agenta/sdk/models/workflows.py`; `handle_inspect_success` normalizes the
+  internally-built `WorkflowInvokeRequest` into it (`_to_inspect_response`), lifting the resolved
+  `WorkflowRevisionData` to a flat top-level `revision`, so schemas live at
+  `response.revision.schemas` (was the latent-broken `data.revision.data.schemas` nesting). The
+  three `/inspect` routes' `response_model` is now `WorkflowInspectResponse`. FE: the
+  `InspectWorkflowResponse` type and the `store.ts` read now resolve against the real body
+  (`revision.schemas`); the deprecated `interface?.schemas` fallback is kept on the type as a
+  migration bridge (two sibling readers still use it).
+- **Issue 4 — typed `/inspect` outputs**: `services/oss/src/agent/schemas.py` `AGENT_OUTPUTS_SCHEMA`
+  is keyed per output surface (`invoke` -> `message`, `messages` -> `messages`). Reuses existing
+  catalog markers, so the catalog-refs guard is unchanged. POC: no flat back-compat output field.
+
+### Deferred (noted in the PR body; NOT built)
+
+- The `/run` `version` field + dispatch (A1 already deferred it).
+- Runner-side request validation (no ajv, no runner dependency).
+- The `GET /capabilities` probe.
+- Generating `protocol.ts` from the schema; the structured-error / cancelled outcome; Fern
+  publication across languages.
+- `services/agent/CLAUDE.md`'s mirroring rule should mention the Pydantic wire models are now the
+  schema source — left for the runner owner (`services/agent/*` is their surface, not touched here).
 
 ## What exists
 
diff --git a/sdks/python/agenta/sdk/agents/utils/wire.py b/sdks/python/agenta/sdk/agents/utils/wire.py
index ae0e369c70..a4fd01f6a5 100644
--- a/sdks/python/agenta/sdk/agents/utils/wire.py
+++ b/sdks/python/agenta/sdk/agents/utils/wire.py
@@ -5,6 +5,14 @@
 under ``sdks/python/oss/tests/pytest/unit/agents/golden/`` (see ``test_wire_contract.py``).
 The runner drives one engine (the sandbox-agent ACP path); the ``harness`` field selects the
 agent, so there is no engine selector on the wire.
+
+The SCHEMA source of truth for this contract is the dedicated Pydantic wire models in
+``agenta.sdk.agents.wire_models`` (``WireRunRequest`` / ``WireRunResult``). Their exported JSON
+Schema ships in the SDK through ``CATALOG_TYPES`` and is asserted to describe exactly what the
+functions below emit/parse (``test_wire_models.py``). The serializer here stays a hand-built
+dict on purpose: the omit-when-empty behavior lives in this file (and is pinned by the goldens),
+which ``model_json_schema()`` cannot express. Add or rename a wire field in BOTH places (here and
+the wire models) plus ``protocol.ts`` and the goldens — the tests catch a one-sided change.
 """
 
 from __future__ import annotations
diff --git a/sdks/python/agenta/sdk/agents/wire_models.py b/sdks/python/agenta/sdk/agents/wire_models.py
new file mode 100644
index 0000000000..cce8381926
--- /dev/null
+++ b/sdks/python/agenta/sdk/agents/wire_models.py
@@ -0,0 +1,374 @@
+"""The ``/run`` wire contract as Pydantic models — the single schema source of truth.
+
+These models describe the EXACT camelCase JSON the Python producer emits and parses in
+``utils/wire.py`` (``request_to_wire`` / ``result_from_wire``) and the TS runner mirrors in
+``services/agent/src/protocol.ts``. They are deliberately a SEPARATE set from the semantic
+DTOs in ``dtos.py``: the DTOs are snake_case and intentionally loose (``AgentEvent`` is a free
+``type: str`` + ``data`` bag), while the real wire is camelCase with a discriminated event
+union. Exporting ``model_json_schema()`` off the DTOs would produce the wrong schema, so the
+contract lives here.
+
+What these models are for in this phase (a pre-production POC):
+
+- They are the schema authority: ``run_contract_schemas()`` exports their JSON Schema, which
+  ships in the SDK through ``CATALOG_TYPES`` (the same mechanism ``AgentConfigSchema`` uses to
+  reach the SDK / clients / ``/inspect``). A test asserts the committed catalog entry matches a
+  fresh export, so the schema cannot drift from these models.
+- They validate the golden fixtures and ``request_to_wire`` output in tests, proving the schema
+  faithfully describes today's wire.
+
+What they are NOT (deferred, per the project plan):
+
+- They are NOT a runtime guard. ``request_to_wire`` still builds a plain dict and the runner
+  still parses the body as-is; nothing validates against these models on a live ``/run``.
+- They do NOT carry a contract ``version`` field, structured errors, or a ``cancelled`` outcome
+  yet — those are deferred follow-ups. The result error stays the current free string.
+
+Conventions: every field is camelCase via an alias, with ``populate_by_name=True`` so the
+models also accept the Python field name. Optional fields default to ``None`` / empty, matching
+the implicitly-all-optional wire. ``extra="allow"`` keeps the models forward-compatible (an
+unknown field is not the schema's job to reject in this POC phase).
+"""
+
+from __future__ import annotations
+
+from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class _WireModel(BaseModel):
+    """Base for every wire model: camelCase aliases, accept-by-name, allow extra.
+
+    ``populate_by_name=True`` lets a producer construct with the Python field names while the
+    schema and ``model_dump(by_alias=True)`` speak camelCase. ``extra="allow"`` keeps the
+    contract open/forward-compatible (matching the runner's tolerant parsing); this POC does not
+    reject unknown fields.
+
+    ``__ag_type__`` is the catalog key a top-level model carries into ``CATALOG_TYPES`` (the
+    same role :class:`~agenta.sdk.utils.types.AgSchemaMixin` plays for the other catalog types).
+    It is NOT mixed in from ``utils/types`` on purpose: ``utils/types`` imports the agents
+    package, so importing it here would create a load cycle. ``ag_type()`` reads the marker.
+    """
+
+    model_config = ConfigDict(populate_by_name=True, extra="allow")
+
+    __ag_type__: ClassVar[Optional[str]] = None
+
+    @classmethod
+    def ag_type(cls) -> str:
+        if cls.__ag_type__ is None:
+            raise ValueError(f"{cls.__name__} does not define __ag_type__")
+        return cls.__ag_type__
+
+
+# ---------------------------------------------------------------------------
+# Shared sub-objects
+# ---------------------------------------------------------------------------
+
+
+class WireEndpoint(_WireModel):
+    """Non-secret connection config (mirrors ``Endpoint.to_wire``)."""
+
+    base_url: Optional[str] = Field(default=None, alias="baseUrl")
+    api_version: Optional[str] = Field(default=None, alias="apiVersion")
+    region: Optional[str] = None
+    headers: Optional[Dict[str, str]] = None
+
+
+class WireConnection(_WireModel):
+    """The author's credential-connection intent (``{mode, slug?}``)."""
+
+    mode: Literal["agenta", "self_managed"] = "agenta"
+    slug: Optional[str] = None
+
+
+class WireContentBlock(_WireModel):
+    """One content block of a message (mirrors ``ContentBlock.to_wire``)."""
+
+    type: str
+    text: Optional[str] = None
+    data: Optional[str] = None
+    mime_type: Optional[str] = Field(default=None, alias="mimeType")
+    uri: Optional[str] = None
+    tool_call_id: Optional[str] = Field(default=None, alias="toolCallId")
+    tool_name: Optional[str] = Field(default=None, alias="toolName")
+    input: Optional[Any] = None
+    output: Optional[Any] = None
+    is_error: Optional[bool] = Field(default=None, alias="isError")
+
+
+class WireChatMessage(_WireModel):
+    """A chat message on the wire: ``{role, content}`` (string or content blocks)."""
+
+    role: str
+    content: Union[str, List[WireContentBlock]] = ""
+
+
+class WireTraceContext(_WireModel):
+    """Agenta trace context threaded into a run (mirrors ``TraceContext.to_wire``)."""
+
+    traceparent: Optional[str] = None
+    baggage: Optional[str] = None
+    endpoint: Optional[str] = None
+    authorization: Optional[str] = None
+    capture_content: bool = Field(default=True, alias="captureContent")
+
+
+class WireToolCallback(_WireModel):
+    """Where callback (gateway) tools route their calls back to."""
+
+    endpoint: Optional[str] = None
+    authorization: Optional[str] = None
+
+
+class WireRenderHint(_WireModel):
+    """How a tool's result should be rendered by a client."""
+
+    kind: Optional[str] = None
+    component: Optional[str] = None
+
+
+class WireResolvedToolSpec(_WireModel):
+    """A resolved tool the runner delivers to the harness (the three-axis tool surface).
+
+    ``kind`` is the executor axis (``callback`` / ``code`` / ``client`` / ``builtin``);
+    ``needsApproval`` / ``render`` are the orthogonal axes; ``callRef`` / ``runtime`` / ``code``
+    / ``env`` are executor-specific. Extra fields are allowed so an executor variant the schema
+    has not enumerated still validates.
+    """
+
+    name: str
+    description: Optional[str] = None
+    input_schema: Optional[Dict[str, Any]] = Field(default=None, alias="inputSchema")
+    kind: Optional[str] = None
+    call_ref: Optional[str] = Field(default=None, alias="callRef")
+    runtime: Optional[str] = None
+    code: Optional[str] = None
+    env: Optional[Dict[str, str]] = None
+    needs_approval: Optional[bool] = Field(default=None, alias="needsApproval")
+    render: Optional[WireRenderHint] = None
+    read_only: Optional[bool] = Field(default=None, alias="readOnly")
+    permission: Optional[str] = None
+
+
+class WireMcpServer(_WireModel):
+    """A user-declared MCP server (stdio or http), mirrors ``mcp_servers_to_wire``."""
+
+    name: str
+    transport: Optional[str] = None
+    command: Optional[str] = None
+    args: Optional[List[str]] = None
+    env: Optional[Dict[str, str]] = None
+    url: Optional[str] = None
+    headers: Optional[Dict[str, str]] = None
+    tools: Optional[List[str]] = None
+    permission: Optional[str] = None
+
+
+class WireSkillFile(_WireModel):
+    """One bundled file in a resolved inline skill package."""
+
+    path: str
+    content: str
+    executable: Optional[bool] = None
+
+
+class WireSkill(_WireModel):
+    """A resolved inline skill package (mirrors ``skill_to_wire``)."""
+
+    name: str
+    description: Optional[str] = None
+    body: Optional[str] = None
+    files: Optional[List[WireSkillFile]] = None
+    disable_model_invocation: Optional[bool] = Field(
+        default=None, alias="disableModelInvocation"
+    )
+    allow_executable_files: Optional[bool] = Field(
+        default=None, alias="allowExecutableFiles"
+    )
+
+
+class WireNetworkEgress(_WireModel):
+    """The sandbox outbound-network policy (mirrors ``NetworkEgress``)."""
+
+    mode: Literal["on", "off", "allowlist"] = "on"
+    allowlist: List[str] = Field(default_factory=list)
+
+
+class WireSandboxPermission(_WireModel):
+    """The declared sandbox security boundary (mirrors ``SandboxPermission.to_wire``)."""
+
+    network: WireNetworkEgress = Field(default_factory=WireNetworkEgress)
+    filesystem: Optional[Literal["on", "readonly", "off"]] = None
+    enforcement: Literal["strict", "best_effort"] = "strict"
+
+
+class WireHarnessFile(_WireModel):
+    """One file the active harness's config renders into the session cwd before a run."""
+
+    path: str
+    content: str
+
+
+class WireHarnessCapabilities(_WireModel):
+    """What a harness can do, probed by the runner (the 11 boolean flags)."""
+
+    text_messages: bool = Field(default=True, alias="textMessages")
+    images: bool = False
+    file_attachments: bool = Field(default=False, alias="fileAttachments")
+    mcp_tools: bool = Field(default=False, alias="mcpTools")
+    tool_calls: bool = Field(default=False, alias="toolCalls")
+    reasoning: bool = False
+    plan_mode: bool = Field(default=False, alias="planMode")
+    permissions: bool = False
+    usage: bool = False
+    streaming_deltas: bool = Field(default=False, alias="streamingDeltas")
+    session_lifecycle: bool = Field(default=False, alias="sessionLifecycle")
+
+
+class WireAgentUsage(_WireModel):
+    """Token / cost usage rolled onto a workflow span."""
+
+    input: Optional[int] = None
+    output: Optional[int] = None
+    total: Optional[int] = None
+    cost: Optional[float] = None
+
+
+# ---------------------------------------------------------------------------
+# The event union (open / forward-compatible)
+# ---------------------------------------------------------------------------
+
+
+class WireAgentEvent(_WireModel):
+    """One structured event from a run, keyed by ``type``.
+
+    The Python parser (``AgentEvent.from_wire``) keeps the whole event verbatim and drops a
+    typeless event, so the wire event is intentionally OPEN: ``type`` is the discriminator and
+    ``extra="allow"`` carries the rest. ``type`` is OPTIONAL on the model on purpose — a
+    typeless event is dropped, not rejected (a golden pins exactly that), and the schema must
+    describe that tolerance rather than reject it. A closed discriminated union would also reject
+    the forward-compatible event types the runner may add, which contradicts the "drop unknown"
+    guarantee. The known ``type`` values are documented for readers, not enforced: ``message``,
+    ``thought``, the ``message_*`` / ``reasoning_*`` lifecycle trios, ``tool_call``,
+    ``tool_result``, ``interaction_request``, ``data``, ``file``, ``usage``, ``error``, ``done``.
+    """
+
+    type: Optional[str] = None
+
+
+# ---------------------------------------------------------------------------
+# The request
+# ---------------------------------------------------------------------------
+
+
+class WireRunRequest(_WireModel):
+    """The ``/run`` request payload — the exact field set ``request_to_wire`` may emit.
+
+    Every field is optional on the wire (the contract is implicitly all-optional), so the schema
+    expresses "optional" while the producer's omit-when-empty behavior stays in ``wire.py`` and
+    is pinned by the golden fixtures. The harness selects the agent (``pi_core`` / ``pi_agenta``
+    / ``claude``); there is no engine selector on the wire (A3 removed the legacy backend).
+    """
+
+    __ag_type__ = "run_request"
+
+    harness: Optional[str] = None
+    sandbox: Optional[str] = None
+    session_id: Optional[str] = Field(default=None, alias="sessionId")
+    agents_md: Optional[str] = Field(default=None, alias="agentsMd")
+    # Model + connection. ``model`` stays a plain string; the structured provider/connection
+    # fields ride alongside only when a resolved connection / model ref is present.
+    model: Optional[str] = None
+    provider: Optional[str] = None
+    connection: Optional[WireConnection] = None
+    deployment: Optional[str] = None
+    endpoint: Optional[WireEndpoint] = None
+    credential_mode: Optional[str] = Field(default=None, alias="credentialMode")
+    # Turn.
+    messages: Optional[List[WireChatMessage]] = None
+    # Secrets injected as harness env (provider keys); never written to the agent filesystem.
+    secrets: Optional[Dict[str, str]] = None
+    trace: Optional[WireTraceContext] = None
+    # Tools + skills.
+    tools: Optional[List[str]] = None
+    custom_tools: Optional[List[WireResolvedToolSpec]] = Field(
+        default=None, alias="customTools"
+    )
+    tool_callback: Optional[WireToolCallback] = Field(
+        default=None, alias="toolCallback"
+    )
+    mcp_servers: Optional[List[WireMcpServer]] = Field(default=None, alias="mcpServers")
+    skills: Optional[List[WireSkill]] = None
+    # Policy + prompt overrides + files.
+    permission_policy: Optional[str] = Field(default=None, alias="permissionPolicy")
+    system_prompt: Optional[str] = Field(default=None, alias="systemPrompt")
+    append_system_prompt: Optional[str] = Field(
+        default=None, alias="appendSystemPrompt"
+    )
+    sandbox_permission: Optional[WireSandboxPermission] = Field(
+        default=None, alias="sandboxPermission"
+    )
+    harness_files: Optional[List[WireHarnessFile]] = Field(
+        default=None, alias="harnessFiles"
+    )
+
+
+# ---------------------------------------------------------------------------
+# The result
+# ---------------------------------------------------------------------------
+
+
+class WireRunResult(_WireModel):
+    """The ``/run`` result payload — what ``result_from_wire`` parses.
+
+    ``ok`` is the outcome flag; on failure ``error`` is the current free string (a structured
+    error model is a deferred follow-up, not this phase). On success the run carries ``output``,
+    ``messages``, ``events``, ``usage``, ``stopReason``, ``capabilities``, plus the resolved
+    ``sessionId`` / ``model`` / ``traceId``.
+    """
+
+    __ag_type__ = "run_result"
+
+    ok: bool
+    output: Optional[str] = None
+    messages: Optional[List[WireChatMessage]] = None
+    events: Optional[List[WireAgentEvent]] = None
+    usage: Optional[WireAgentUsage] = None
+    stop_reason: Optional[str] = Field(default=None, alias="stopReason")
+    capabilities: Optional[WireHarnessCapabilities] = None
+    session_id: Optional[str] = Field(default=None, alias="sessionId")
+    model: Optional[str] = None
+    trace_id: Optional[str] = Field(default=None, alias="traceId")
+    error: Optional[str] = None
+
+
+# ---------------------------------------------------------------------------
+# The exported JSON interface
+# ---------------------------------------------------------------------------
+
+# The top-level wire models whose JSON Schema ships in the SDK. Each is keyed by its
+# ``x-ag-type`` so ``CATALOG_TYPES`` can carry it the same way it carries ``agent_config``.
+WIRE_CONTRACT_MODELS = (WireRunRequest, WireRunResult)
+
+
+def run_contract_schemas() -> Dict[str, Dict[str, Any]]:
+    """The exported JSON Schema of the ``/run`` wire models, keyed by ``x-ag-type``.
+
+    Uses ``model_json_schema(by_alias=True)`` so the emitted property names are the camelCase
+    wire keys, and dereferences ``$defs`` (the same treatment ``CATALOG_TYPES`` gives every other
+    entry, via ``_dereference_schema``) so the catalog entries are self-contained. This is the
+    single export point: ``CATALOG_TYPES`` adds these entries, and a freshness test asserts the
+    committed catalog matches a fresh call here so the schema cannot silently drift from the
+    models.
+    """
+    # Local import to avoid a module-load cycle: ``utils/types`` imports the agents package.
+    from ..utils.types import _dereference_schema
+
+    schemas: Dict[str, Dict[str, Any]] = {}
+    for model in WIRE_CONTRACT_MODELS:
+        schema = _dereference_schema(model.model_json_schema(by_alias=True))
+        schema["x-ag-type"] = model.ag_type()
+        schemas[model.ag_type()] = schema
+    return schemas
diff --git a/sdks/python/agenta/sdk/decorators/routing.py b/sdks/python/agenta/sdk/decorators/routing.py
index e4e8543f07..2516ea98f7 100644
--- a/sdks/python/agenta/sdk/decorators/routing.py
+++ b/sdks/python/agenta/sdk/decorators/routing.py
@@ -14,6 +14,7 @@
 from agenta.sdk.models.workflows import (
     WorkflowInvokeRequest,
     WorkflowInspectRequest,
+    WorkflowInspectResponse,
     WorkflowServiceStatus,
     WorkflowBatchResponse,
     WorkflowStreamingResponse,
@@ -350,11 +351,32 @@ async def handle_invoke_failure(exception: Exception) -> Response:
     return _make_json_response(error)
 
 
+def _to_inspect_response(
+    request: WorkflowInvokeRequest,
+) -> WorkflowInspectResponse:
+    """Normalize the internally-built ``WorkflowInvokeRequest`` into the canonical response.
+
+    ``workflow.inspect()`` builds its result as a ``WorkflowInvokeRequest`` (a REQUEST model), so
+    the resolved interface lands nested at ``data.revision.data``. The public ``/inspect``
+    contract is :class:`WorkflowInspectResponse` instead, which lifts that
+    :class:`WorkflowRevisionData` up to a flat top-level ``revision`` — so a client reads schemas
+    at ``response.revision.schemas`` rather than guessing the request envelope.
+    """
+    nested = (request.data.revision or {}) if request.data else {}
+    revision_data = nested.get("data") if isinstance(nested, dict) else None
+    return WorkflowInspectResponse(
+        version=request.version,
+        revision=revision_data,
+        meta=request.meta,
+    )
+
+
 async def handle_inspect_success(
     request: Optional[WorkflowInvokeRequest],
 ):
     if request:
-        return JSONResponse(request.model_dump(mode="json", exclude_none=True))
+        response = _to_inspect_response(request)
+        return JSONResponse(response.model_dump(mode="json", exclude_none=True))
 
     return JSONResponse({"details": {"message": "Workflow not found"}}, status_code=404)
 
@@ -544,7 +566,7 @@ def _add_agent_routes(target: Any, prefix: str) -> None:
                 self.path + "/inspect",
                 inspect_endpoint,
                 methods=["POST"],
-                response_model=WorkflowInvokeRequest,
+                response_model=WorkflowInspectResponse,
             )
             if agent_enabled:
                 _add_agent_routes(self.router_fallback, self.path)
@@ -568,7 +590,7 @@ def _add_agent_routes(target: Any, prefix: str) -> None:
                 "/inspect",
                 inspect_endpoint,
                 methods=["POST"],
-                response_model=WorkflowInvokeRequest,
+                response_model=WorkflowInspectResponse,
             )
             if agent_enabled:
                 _add_agent_routes(self.mount_root, "")
@@ -587,7 +609,7 @@ def _add_agent_routes(target: Any, prefix: str) -> None:
             "/inspect",
             inspect_endpoint,
             methods=["POST"],
-            response_model=WorkflowInvokeRequest,
+            response_model=WorkflowInspectResponse,
         )
         if agent_enabled:
             _add_agent_routes(sub_app, "")
diff --git a/sdks/python/agenta/sdk/models/workflows.py b/sdks/python/agenta/sdk/models/workflows.py
index 0779695a8e..18c212dd53 100644
--- a/sdks/python/agenta/sdk/models/workflows.py
+++ b/sdks/python/agenta/sdk/models/workflows.py
@@ -307,6 +307,37 @@ def _coerce_nested_models(cls, values: Dict[str, Any]) -> Dict[str, Any]:
 WorkflowServiceInspectRequest = WorkflowInspectRequest
 
 
+class WorkflowInspectResponse(Metadata):
+    """The canonical ``/inspect`` response: the resolved interface, flat and self-describing.
+
+    ``/inspect`` is a public edge — it tells the browser which form to render and which inputs,
+    parameters, and outputs a workflow has. The response used to be a ``WorkflowInvokeRequest``
+    (a REQUEST model carrying response semantics), which nested the schemas under
+    ``data.revision.data.schemas`` and made every client guess the envelope. This model is the
+    explicit response contract instead:
+
+    - ``revision`` is a :class:`WorkflowRevisionData` directly (it already owns ``uri`` / ``url``
+      / ``headers`` / ``schemas`` / ``parameters``), so the schemas live at the obvious
+      ``response.revision.schemas`` — no ``data.revision.data`` nesting.
+    - ``configuration`` and ``meta`` carry the resolved config and any interface metadata (the
+      agent workflow rides its per-harness connection capability in ``meta``).
+
+    Typed outputs (POC, no back-compat field): ``revision.schemas.outputs`` may be keyed per
+    output type (for example ``{"messages": {...}, "invoke": {...}}``) so a workflow with more
+    than one output surface describes each one. A single-output workflow still uses a plain
+    output schema. Consumers read the keyed shape when present and fall back to the plain one.
+    """
+
+    version: Optional[str] = "2025.07.14"
+
+    revision: Optional[WorkflowRevisionData] = None
+    configuration: Optional[Dict[str, Any]] = None
+
+
+# back-compat alias
+WorkflowServiceInspectResponse = WorkflowInspectResponse
+
+
 class WorkflowBaseResponse(TraceID, SpanID):
     version: Optional[str] = "2025.07.14"
 
diff --git a/sdks/python/agenta/sdk/utils/types.py b/sdks/python/agenta/sdk/utils/types.py
index d2536917de..70b89613a3 100644
--- a/sdks/python/agenta/sdk/utils/types.py
+++ b/sdks/python/agenta/sdk/utils/types.py
@@ -11,6 +11,7 @@
 from agenta.sdk.agents.dtos import HARNESS_IDENTITIES, SandboxPermission
 from agenta.sdk.agents.mcp import MCPServerConfig
 from agenta.sdk.agents.tools import ToolConfig
+from agenta.sdk.agents.wire_models import run_contract_schemas
 from agenta.sdk.utils.assets import supported_llm_models, model_metadata
 from agenta.sdk.utils.helpers import _PLACEHOLDER_RE
 from agenta.sdk.utils.rendering import (
@@ -1360,4 +1361,9 @@ class _SkillEmbedRefSchema(BaseModel):
     SkillConfigSchema.ag_type(): _dereference_schema(
         SkillConfigSchema.model_json_schema()
     ),
+    # The `/run` wire contract (request + result), exported from the dedicated Pydantic wire
+    # models in `agenta.sdk.agents.wire_models`. This puts the service<->runner wire interface in
+    # the SDK the same way the other catalog types are exposed; a freshness test asserts these
+    # entries match a fresh export so the schema cannot drift from the models.
+    **run_contract_schemas(),
 }
diff --git a/sdks/python/oss/tests/pytest/unit/agents/test_wire_models.py b/sdks/python/oss/tests/pytest/unit/agents/test_wire_models.py
new file mode 100644
index 0000000000..095c509586
--- /dev/null
+++ b/sdks/python/oss/tests/pytest/unit/agents/test_wire_models.py
@@ -0,0 +1,121 @@
+"""The ``/run`` wire models are the single schema source of truth.
+
+These tests prove the dedicated Pydantic wire models in ``agenta.sdk.agents.wire_models``
+faithfully describe the wire that ``request_to_wire`` / ``result_from_wire`` (in ``utils/wire.py``)
+produce and parse, and that the exported JSON Schema is the one shipped in the SDK through
+``CATALOG_TYPES``.
+
+This is the Python-side guard the project plan calls for:
+
+- The exported schema is committed into the SDK (``CATALOG_TYPES['run_request' | 'run_result']``)
+  and a freshness test asserts the catalog entry equals a fresh export, so the schema cannot
+  silently drift from the models.
+- The golden fixtures (the cross-language anchor) validate against the exported schema — an
+  "example must validate" check that proves the schema describes today's wire.
+- ``request_to_wire`` output validates against the schema, so the producer and the schema agree.
+- The request schema's property set equals the hand-kept ``KNOWN_REQUEST_KEYS`` in
+  ``test_wire_contract.py``, so a new wire field cannot land in one place and not the other.
+
+There is NO runtime validation in this phase (per the project plan): nothing here gates a live
+``/run``. These models are an SDK-side schema artifact and a test guard only.
+"""
+
+from __future__ import annotations
+
+import jsonschema
+import pytest
+
+from agenta.sdk.agents.wire_models import (
+    WireRunRequest,
+    WireRunResult,
+    run_contract_schemas,
+)
+from agenta.sdk.utils.types import CATALOG_TYPES
+
+from .test_wire_contract import (
+    KNOWN_REQUEST_KEYS,
+    _agenta_payload,
+    _claude_payload,
+    _pi_payload,
+)
+
+
+def test_run_contract_ships_in_the_sdk_catalog():
+    # The exported JSON interface lives in the SDK alongside the other catalog types, so a
+    # client / the playground / `/inspect` can resolve it the same way as `agent_config`.
+    assert "run_request" in CATALOG_TYPES
+    assert "run_result" in CATALOG_TYPES
+    assert CATALOG_TYPES["run_request"]["x-ag-type"] == "run_request"
+    assert CATALOG_TYPES["run_result"]["x-ag-type"] == "run_result"
+
+
+def test_committed_catalog_matches_a_fresh_export():
+    # Freshness: regenerate the schema in-memory and assert the committed catalog entry equals
+    # it (drift -> fail), the same discipline the goldens already use. If the wire models change,
+    # this fails until the export is regenerated.
+    fresh = run_contract_schemas()
+    assert CATALOG_TYPES["run_request"] == fresh["run_request"]
+    assert CATALOG_TYPES["run_result"] == fresh["run_result"]
+
+
+def test_exported_schema_is_dereferenced_and_camelcase():
+    # The exported schema is self-contained (no `$defs`/`$ref`, like every catalog entry) and
+    # speaks the camelCase wire keys, not the snake_case Python field names.
+    req = CATALOG_TYPES["run_request"]
+    assert "$defs" not in req
+    props = req["properties"]
+    assert "sessionId" in props and "session_id" not in props
+    assert "customTools" in props and "custom_tools" not in props
+
+
+def test_request_schema_properties_equal_known_request_keys():
+    # The schema-derived property set is exactly the hand-kept guard in `test_wire_contract.py`.
+    # This is the schema-derived key guard: a new wire field cannot be added to one without the
+    # other, so the two cannot silently fall out of step.
+    assert set(CATALOG_TYPES["run_request"]["properties"]) == KNOWN_REQUEST_KEYS
+
+
+@pytest.mark.parametrize(
+    "golden_name, model",
+    [
+        ("run_request.pi_core.json", WireRunRequest),
+        ("run_request.claude.json", WireRunRequest),
+        ("run_result.ok.json", WireRunResult),
+        ("run_result.error.json", WireRunResult),
+    ],
+)
+def test_goldens_parse_into_the_wire_models(golden, golden_name, model):
+    # Every golden parses cleanly into its wire model (by camelCase alias), proving the models
+    # accept the real wire. The ok-result golden includes a deliberately typeless event; the
+    # open `WireAgentEvent` (type optional) tolerates it, mirroring the parser's drop behavior.
+    model.model_validate(golden(golden_name))
+
+
+@pytest.mark.parametrize(
+    "golden_name, ag_type",
+    [
+        ("run_request.pi_core.json", "run_request"),
+        ("run_request.claude.json", "run_request"),
+        ("run_result.ok.json", "run_result"),
+        ("run_result.error.json", "run_result"),
+    ],
+)
+def test_goldens_validate_against_the_exported_schema(golden, golden_name, ag_type):
+    # "Examples must validate": each golden validates against the exported JSON Schema shipped in
+    # the SDK. This proves the schema describes today's wire. It is a TEST, not a runtime guard.
+    jsonschema.validate(golden(golden_name), CATALOG_TYPES[ag_type])
+
+
+def test_request_to_wire_output_validates_against_the_schema():
+    # The producer and the schema agree: the dict `request_to_wire` builds for each harness
+    # validates against the exported request schema and round-trips through the wire model.
+    for payload in (_pi_payload(), _claude_payload(), _agenta_payload()):
+        jsonschema.validate(payload, CATALOG_TYPES["run_request"])
+        WireRunRequest.model_validate(payload)
+
+
+def test_minimal_result_validates():
+    # A bare success result (the `result_from_wire` minimal case) is valid against the schema.
+    payload = {"ok": True}
+    jsonschema.validate(payload, CATALOG_TYPES["run_result"])
+    assert WireRunResult.model_validate(payload).ok is True
diff --git a/sdks/python/oss/tests/pytest/unit/test_inspect_response.py b/sdks/python/oss/tests/pytest/unit/test_inspect_response.py
new file mode 100644
index 0000000000..b6592674b8
--- /dev/null
+++ b/sdks/python/oss/tests/pytest/unit/test_inspect_response.py
@@ -0,0 +1,103 @@
+"""The ``/inspect`` response is the canonical :class:`WorkflowInspectResponse`.
+
+Architecture-followups issue 1: ``/inspect`` used to return a ``WorkflowInvokeRequest`` (a
+REQUEST model carrying response semantics), nesting the resolved interface at
+``data.revision.data.schemas`` so every client had to guess the envelope. ``handle_inspect_success``
+now normalizes that internally-built request into a flat :class:`WorkflowInspectResponse` whose
+``revision`` IS the :class:`WorkflowRevisionData`, so schemas live at the obvious
+``response["revision"]["schemas"]``.
+
+These are the acceptance criteria from
+``docs/design/agent-workflows/interfaces/architecture-followups.md`` issue 1:
+
+- The response exposes schemas at ``response["revision"]["schemas"]`` (not ``data.revision.data``).
+- The frontend can resolve schemas from the new shape.
+"""
+
+from __future__ import annotations
+
+import json
+
+from agenta.sdk.decorators.routing import _to_inspect_response
+from agenta.sdk.models.workflows import (
+    WorkflowInspectResponse,
+    WorkflowInvokeRequest,
+    WorkflowRequestData,
+    WorkflowRevision,
+    WorkflowRevisionData,
+)
+
+_RESOLVED_REVISION = WorkflowRevisionData(
+    uri="agenta:builtin:agent:v0",
+    schemas={
+        "inputs": {"type": "object", "properties": {"messages": {"type": "array"}}},
+        "parameters": {"type": "object"},
+        # Typed outputs keyed per output surface (issue 4): the POC shape, no flat field.
+        "outputs": {
+            "invoke": {"x-ag-type-ref": "message", "type": "object"},
+            "messages": {"x-ag-type-ref": "messages", "type": "array"},
+        },
+    },
+    parameters={"agent": {"model": "gpt-5.5"}},
+)
+
+
+def _built_invoke_request() -> WorkflowInvokeRequest:
+    """The internally-built inspect result (what ``workflow.inspect()`` returns today)."""
+    return WorkflowInvokeRequest(
+        meta={"harness_capabilities": {"pi_core": {}}},
+        data=WorkflowRequestData(
+            revision=WorkflowRevision(
+                id=None,
+                slug="agent",
+                version="v0",
+                name="Agent",
+                data=_RESOLVED_REVISION,
+            ).model_dump(mode="json", exclude_none=True),
+        ),
+    )
+
+
+def test_inspect_response_lifts_revision_to_top_level():
+    response = _to_inspect_response(_built_invoke_request())
+
+    assert isinstance(response, WorkflowInspectResponse)
+    assert response.revision is not None
+    # Schemas live at response.revision.schemas — not nested under data.revision.data.
+    assert response.revision.schemas is not None
+    assert response.revision.schemas.inputs == _RESOLVED_REVISION.schemas.inputs
+    assert response.revision.uri == "agenta:builtin:agent:v0"
+    assert response.revision.parameters == {"agent": {"model": "gpt-5.5"}}
+    # Interface metadata rides top-level meta.
+    assert response.meta == {"harness_capabilities": {"pi_core": {}}}
+
+
+def test_inspect_response_serializes_schemas_at_revision_schemas():
+    # The acceptance criterion in the words of a client: post /inspect, read response body,
+    # find schemas at body["revision"]["schemas"]. This is the exact path the frontend reads.
+    response = _to_inspect_response(_built_invoke_request())
+    body = json.loads(response.model_dump_json(exclude_none=True))
+
+    assert "revision" in body
+    assert "schemas" in body["revision"]
+    assert "inputs" in body["revision"]["schemas"]
+    # No request-envelope leakage: there is no top-level `data.revision.data` nesting.
+    assert "data" not in body
+
+
+def test_inspect_response_outputs_are_keyed_per_surface():
+    # Issue 4: outputs carry the typed shape keyed per output surface (messages / invoke).
+    response = _to_inspect_response(_built_invoke_request())
+    outputs = response.revision.schemas.outputs
+
+    assert set(outputs) == {"invoke", "messages"}
+    assert outputs["invoke"]["x-ag-type-ref"] == "message"
+    assert outputs["messages"]["x-ag-type-ref"] == "messages"
+
+
+def test_inspect_response_handles_a_request_with_no_revision():
+    # A built request with no resolved revision normalizes to an empty-revision response, not a
+    # crash (the inspect path can resolve nothing for an unknown URI).
+    response = _to_inspect_response(WorkflowInvokeRequest())
+    assert isinstance(response, WorkflowInspectResponse)
+    assert response.revision is None
diff --git a/services/oss/src/agent/schemas.py b/services/oss/src/agent/schemas.py
index 5a49a38a93..ccd2bb1f7b 100644
--- a/services/oss/src/agent/schemas.py
+++ b/services/oss/src/agent/schemas.py
@@ -67,12 +67,28 @@
     "properties": {"agent": AGENT_CONFIG_SCHEMA},
 }
 
-# Outputs: the final assistant message.
+# Outputs, keyed per output surface (the agent has two): `invoke` returns the single final
+# assistant message (the batch `/invoke` shape, `x-ag-type-ref: message`); `messages` returns the
+# ordered conversation the `/messages` route streams (`x-ag-type-ref: messages`). Keying outputs by
+# surface lets the playground render the right output view per route. POC, so no flat back-compat
+# output field: a consumer reads the keyed shape directly. Both refs already appear elsewhere in
+# AGENT_SCHEMAS, so this adds no new catalog marker.
 AGENT_OUTPUTS_SCHEMA = {
     "$schema": _SCHEMA,
-    "x-ag-type-ref": "message",
     "type": "object",
-    "description": "Final assistant message returned by the agent.",
+    "description": "Agent outputs, keyed per output surface (invoke / messages).",
+    "properties": {
+        "invoke": {
+            "x-ag-type-ref": "message",
+            "type": "object",
+            "description": "Final assistant message returned by a batch /invoke.",
+        },
+        "messages": {
+            "x-ag-type-ref": "messages",
+            "type": "array",
+            "description": "The ordered conversation the /messages route returns.",
+        },
+    },
 }
 
 AGENT_SCHEMAS = {
diff --git a/web/packages/agenta-entities/src/workflow/api/api.ts b/web/packages/agenta-entities/src/workflow/api/api.ts
index 44ee390dae..c8b5a4be47 100644
--- a/web/packages/agenta-entities/src/workflow/api/api.ts
+++ b/web/packages/agenta-entities/src/workflow/api/api.ts
@@ -415,12 +415,19 @@ export async function fetchWorkflowRevisionById(
 // ============================================================================
 
 /**
- * Response shape from the inspect endpoint.
- * Returns a WorkflowServiceRequest with resolved interface.
+ * Response shape from the `/inspect` endpoint.
+ *
+ * The canonical backend model is `WorkflowInspectResponse`
+ * (sdks/python/agenta/sdk/models/workflows.py): a flat response whose `revision` IS the
+ * resolved `WorkflowRevisionData`, so schemas live at `revision.schemas`. The endpoint no
+ * longer returns the old `WorkflowInvokeRequest` envelope that nested them under
+ * `data.revision.data.schemas`.
+ *
+ * `outputs` is typed per output surface (POC): `{invoke, messages}` for the agent workflow,
+ * or a single schema for a one-output workflow. The store reads either shape.
  */
 export interface InspectWorkflowResponse {
     version?: string
-    /** New shape (feat/extend-runnables): revision contains the resolved data */
     revision?: {
         uri?: string
         url?: string
@@ -432,7 +439,14 @@ export interface InspectWorkflowResponse {
         }
         parameters?: Record<string, unknown>
     }
-    /** @deprecated Old shape — kept for backward compat during migration */
+    configuration?: Record<string, unknown>
+    meta?: Record<string, unknown>
+    /**
+     * @deprecated Migration bridge for the old `WorkflowInvokeRequest` inspect envelope. The
+     * canonical response puts schemas at `revision.schemas`; read that first. Remove this once
+     * every reader (appUtils / evaluatorUtils) no longer needs the `?? interface?.schemas`
+     * fallback — i.e. once no deployed service returns the old envelope.
+     */
     interface?: {
         version?: string
         uri?: string
@@ -444,10 +458,6 @@ export interface InspectWorkflowResponse {
             outputs?: Record<string, unknown>
         }
     }
-    configuration?: {
-        script?: Record<string, unknown>
-        parameters?: Record<string, unknown>
-    }
 }
 
 /**
diff --git a/web/packages/agenta-entities/src/workflow/state/store.ts b/web/packages/agenta-entities/src/workflow/state/store.ts
index f7fc3d344b..83531a1cb8 100644
--- a/web/packages/agenta-entities/src/workflow/state/store.ts
+++ b/web/packages/agenta-entities/src/workflow/state/store.ts
@@ -1535,11 +1535,13 @@ export const workflowEntityAtomFamily = atomFamily((workflowId: string) =>
         let resolvedParams: Record<string, unknown> | null | undefined = null
 
         // (a) Inspect — primary source for any workflow with a URI.
-        // Returns interface.schemas.{inputs, parameters, outputs} directly.
+        // The canonical `WorkflowInspectResponse` puts the resolved interface at
+        // `revision.schemas.{inputs, parameters, outputs}` (revision IS the WorkflowRevisionData),
+        // so we read it directly. `outputs` may be typed per output surface ({invoke, messages}).
         const inspectQuery = get(workflowInspectAtomFamily(workflowId))
         const inspectData = inspectQuery.data ?? null
         if (inspectData) {
-            const inspectSchemas = inspectData.revision?.schemas ?? inspectData.interface?.schemas
+            const inspectSchemas = inspectData.revision?.schemas
             if (inspectSchemas) {
                 resolvedInputs = inspectSchemas.inputs
                 resolvedOutputs = inspectSchemas.outputs
diff --git a/web/packages/agenta-entities/tests/unit/inspectResponseSchemaResolution.test.ts b/web/packages/agenta-entities/tests/unit/inspectResponseSchemaResolution.test.ts
new file mode 100644
index 0000000000..411676de6c
--- /dev/null
+++ b/web/packages/agenta-entities/tests/unit/inspectResponseSchemaResolution.test.ts
@@ -0,0 +1,83 @@
+/**
+ * Schema resolution from the canonical `/inspect` response shape.
+ *
+ * Architecture-followups issue 1: `/inspect` now returns the canonical `WorkflowInspectResponse`
+ * (sdks/python/agenta/sdk/models/workflows.py) whose `revision` IS the resolved
+ * `WorkflowRevisionData`, so schemas live at `revision.schemas`. The store read
+ * (`web/packages/agenta-entities/src/workflow/state/store.ts`, the inspect branch) reads exactly
+ * that path. These tests pin that read against the real response shape so it cannot silently
+ * regress to resolving `undefined` (the latent break this fix closes).
+ *
+ * The store's read is an inline expression over the query data, not an exported function, so we
+ * reproduce that exact expression here over a typed `InspectWorkflowResponse`. The point is the
+ * CONTRACT: the canonical body resolves schemas; the old nested envelope does not.
+ */
+
+import {describe, expect, it} from "vitest"
+
+import type {InspectWorkflowResponse} from "../../src/workflow/api/api"
+
+// The exact read the store performs in its inspect branch (store.ts).
+function resolveInspectSchemas(inspectData: InspectWorkflowResponse | null) {
+    if (!inspectData) return null
+    const inspectSchemas = inspectData.revision?.schemas
+    if (!inspectSchemas) return null
+    return {
+        inputs: inspectSchemas.inputs,
+        outputs: inspectSchemas.outputs,
+        parameters: inspectSchemas.parameters,
+    }
+}
+
+describe("inspect response schema resolution", () => {
+    it("resolves schemas from the canonical revision.schemas shape", () => {
+        const body: InspectWorkflowResponse = {
+            version: "2025.07.14",
+            revision: {
+                uri: "agenta:builtin:agent:v0",
+                schemas: {
+                    inputs: {type: "object", properties: {messages: {type: "array"}}},
+                    parameters: {type: "object"},
+                    outputs: {
+                        invoke: {"x-ag-type-ref": "message", type: "object"},
+                        messages: {"x-ag-type-ref": "messages", type: "array"},
+                    },
+                },
+                parameters: {agent: {model: "gpt-5.5"}},
+            },
+            meta: {harness_capabilities: {}},
+        }
+
+        const resolved = resolveInspectSchemas(body)
+        expect(resolved).not.toBeNull()
+        expect(resolved?.inputs).toEqual({
+            type: "object",
+            properties: {messages: {type: "array"}},
+        })
+        expect(resolved?.parameters).toEqual({type: "object"})
+    })
+
+    it("exposes outputs keyed per output surface (invoke / messages)", () => {
+        const body: InspectWorkflowResponse = {
+            revision: {
+                schemas: {
+                    outputs: {
+                        invoke: {"x-ag-type-ref": "message"},
+                        messages: {"x-ag-type-ref": "messages"},
+                    },
+                },
+            },
+        }
+
+        const resolved = resolveInspectSchemas(body)
+        const outputs = resolved?.outputs as Record<string, Record<string, unknown>> | undefined
+        expect(outputs && Object.keys(outputs).sort()).toEqual(["invoke", "messages"])
+        expect(outputs?.invoke["x-ag-type-ref"]).toBe("message")
+        expect(outputs?.messages["x-ag-type-ref"]).toBe("messages")
+    })
+
+    it("resolves nothing when there is no revision (no crash, no stale schemas)", () => {
+        expect(resolveInspectSchemas({})).toBeNull()
+        expect(resolveInspectSchemas(null)).toBeNull()
+    })
+})

From 443daf8c3ab230f13b830b9fe34a268d4c451c44 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Thu, 25 Jun 2026 10:10:40 +0200
Subject: [PATCH 4/6] fix(agent): preserve resolved configuration in normalized
 /inspect response

---
 sdks/python/agenta/sdk/decorators/routing.py               | 7 +++++++
 sdks/python/oss/tests/pytest/unit/test_inspect_response.py | 3 +++
 2 files changed, 10 insertions(+)

diff --git a/sdks/python/agenta/sdk/decorators/routing.py b/sdks/python/agenta/sdk/decorators/routing.py
index 2516ea98f7..b312b8cb02 100644
--- a/sdks/python/agenta/sdk/decorators/routing.py
+++ b/sdks/python/agenta/sdk/decorators/routing.py
@@ -364,9 +364,16 @@ def _to_inspect_response(
     """
     nested = (request.data.revision or {}) if request.data else {}
     revision_data = nested.get("data") if isinstance(nested, dict) else None
+    # Carry the resolved config so the public boundary doesn't drop it: the FE reads
+    # ``configuration.parameters`` as a fallback when ``revision.parameters`` is absent.
+    parameters = (
+        revision_data.get("parameters") if isinstance(revision_data, dict) else None
+    )
+    configuration = {"parameters": parameters} if parameters is not None else None
     return WorkflowInspectResponse(
         version=request.version,
         revision=revision_data,
+        configuration=configuration,
         meta=request.meta,
     )
 
diff --git a/sdks/python/oss/tests/pytest/unit/test_inspect_response.py b/sdks/python/oss/tests/pytest/unit/test_inspect_response.py
index b6592674b8..b4fdff8766 100644
--- a/sdks/python/oss/tests/pytest/unit/test_inspect_response.py
+++ b/sdks/python/oss/tests/pytest/unit/test_inspect_response.py
@@ -68,6 +68,8 @@ def test_inspect_response_lifts_revision_to_top_level():
     assert response.revision.schemas.inputs == _RESOLVED_REVISION.schemas.inputs
     assert response.revision.uri == "agenta:builtin:agent:v0"
     assert response.revision.parameters == {"agent": {"model": "gpt-5.5"}}
+    # Resolved config is preserved at the public boundary, not dropped.
+    assert response.configuration == {"parameters": {"agent": {"model": "gpt-5.5"}}}
     # Interface metadata rides top-level meta.
     assert response.meta == {"harness_capabilities": {"pi_core": {}}}
 
@@ -101,3 +103,4 @@ def test_inspect_response_handles_a_request_with_no_revision():
     response = _to_inspect_response(WorkflowInvokeRequest())
     assert isinstance(response, WorkflowInspectResponse)
     assert response.revision is None
+    assert response.configuration is None

From 8d8bca30d35c60ae6f1d53d692b190521e3950d5 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Thu, 25 Jun 2026 00:00:37 +0200
Subject: [PATCH 5/6] feat(agent): harness-aware provider + model picker
 (inspect models map)

Phase A (SDK): add a per-provider models map to HarnessConnectionCapabilities and
populate HARNESS_CONNECTION_CAPABILITIES (Pi from supported_llm_models; Claude from a
new CLAUDE_MODEL_ALIASES constant), emitted on /inspect meta.harness_capabilities.

Phases B-D (FE): thread meta.harness_capabilities to the playground via a new
harnessCapabilitiesAtomFamily; retire the static FE capability map; build a
harness-filtered unified provider+model picker (selecting a model sets both provider
and model; standalone provider field removed); add an Agenta-managed vs self-managed
authentication toggle + a vault-fed connection picker. The model is ALWAYS a ModelRef.

Claude-Session: https://claude.ai/code/session_01GYo3UEfvsZpncagqb28Mbc
---
 .../documentation/agent-configuration.md      |  11 +-
 .../public-edge/workflow-inspect.md           |  11 +-
 .../projects/agent-model-picker/README.md     |  54 ++++
 .../projects/agent-model-picker/context.md    | 106 ++++++
 .../projects/agent-model-picker/plan.md       | 114 +++++++
 .../projects/agent-model-picker/research.md   | 130 ++++++++
 .../projects/agent-model-picker/status.md     | 103 ++++++
 sdks/python/agenta/sdk/agents/capabilities.py |  40 +++
 .../agents/connections/test_capabilities.py   |  45 +++
 .../agenta-entities/src/workflow/index.ts     |   7 +
 .../src/workflow/state/index.ts               |  10 +
 .../src/workflow/state/inspectMeta.ts         |  53 +++
 .../SchemaControls/AgentConfigControl.tsx     | 236 ++++++++------
 .../SchemaControls/connectionUtils.ts         | 304 +++++++++++++-----
 .../tests/unit/connectionUtils.test.ts        | 258 +++++++++------
 15 files changed, 1209 insertions(+), 273 deletions(-)
 create mode 100644 docs/design/agent-workflows/projects/agent-model-picker/README.md
 create mode 100644 docs/design/agent-workflows/projects/agent-model-picker/context.md
 create mode 100644 docs/design/agent-workflows/projects/agent-model-picker/plan.md
 create mode 100644 docs/design/agent-workflows/projects/agent-model-picker/research.md
 create mode 100644 docs/design/agent-workflows/projects/agent-model-picker/status.md
 create mode 100644 web/packages/agenta-entities/src/workflow/state/inspectMeta.ts

diff --git a/docs/design/agent-workflows/documentation/agent-configuration.md b/docs/design/agent-workflows/documentation/agent-configuration.md
index 43708589fa..785255c5da 100644
--- a/docs/design/agent-workflows/documentation/agent-configuration.md
+++ b/docs/design/agent-workflows/documentation/agent-configuration.md
@@ -54,7 +54,16 @@ existing control:
 
 - `agents_md` renders as a multiline text input labeled "Instructions". It falls back to a
   legacy `instructions` value when `agents_md` is missing.
-- `model` renders as a grouped choice control.
+- `model` renders as a unified, harness-filtered provider + model picker. The options come from
+  the agent's `/inspect` `meta.harness_capabilities[harness].models` (Pi: the vault providers'
+  catalog ids; Claude: its aliases), not the full shared catalog. Selecting a model sets BOTH the
+  model id and its provider, so there is no separate provider field. When `/inspect` publishes no
+  per-harness models (older agents / a standalone control), it falls back to the schema's full
+  grouped-choice catalog. Below the picker, an **Authentication** toggle chooses *Agenta-managed*
+  (a vault connection — "Project default" or a named connection picked from `GET /secrets/`) vs
+  *Self-managed* (the harness uses its own login; Agenta injects nothing). The form always writes
+  `model` as a structured `ModelRef` (`{provider, model, connection?}`), never a free-text string;
+  the connection rides in `model_ref.connection`. A raw-JSON escape hatch remains for power users.
 - `tools` renders as a flat array. Each entry uses `ToolItemControl`, the same tool object
   shape the prompt control uses.
 - `mcp_servers` renders as a flat array. Each entry uses `McpServerItemControl`, which is a
diff --git a/docs/design/agent-workflows/interfaces/public-edge/workflow-inspect.md b/docs/design/agent-workflows/interfaces/public-edge/workflow-inspect.md
index 9424169d28..95a8fea1cd 100644
--- a/docs/design/agent-workflows/interfaces/public-edge/workflow-inspect.md
+++ b/docs/design/agent-workflows/interfaces/public-edge/workflow-inspect.md
@@ -14,7 +14,7 @@ workflow provides it:
 ```jsonc
 {
   "version": "2025.07.14",
-  "meta": { "harness_capabilities": { /* per-harness provider/deployment limits */ } },
+  "meta": { "harness_capabilities": { /* per-harness providers, deployments, connection_modes, model_selection, models */ } },
   "data": {
     "revision": {
       "data": {
@@ -38,6 +38,15 @@ the form and the schema stay in one place. The `meta.harness_capabilities` block
 table the service uses server-side to reject unreachable provider and deployment choices, so
 the form can filter stored connections before the user submits when that metadata is present.
 
+Per harness, the block carries `providers`, `deployments`, `connection_modes`, `model_selection`,
+and `models`. `models` is a provider-keyed map of the selectable models the harness can reach: Pi
+publishes each vault provider's catalog ids; Claude publishes its alias set (`default`/`sonnet`/
+`opus`/`haiku` and their `[1m]` variants) under `anthropic`. The agent playground renders its
+harness-filtered provider + model picker straight from this map instead of the full shared
+catalog, and uses `model_selection` to interpret a value (`provider/id` for Pi vs `alias` for
+Claude). The table is published by `harness_capabilities_document()` in
+`sdks/python/agenta/sdk/agents/capabilities.py`.
+
 The shape of the config itself lives in [Agent config
 schema](agent-config-schema.md). This page covers what `/inspect` returns; that page covers
 the fields.
diff --git a/docs/design/agent-workflows/projects/agent-model-picker/README.md b/docs/design/agent-workflows/projects/agent-model-picker/README.md
new file mode 100644
index 0000000000..1a06de6b55
--- /dev/null
+++ b/docs/design/agent-workflows/projects/agent-model-picker/README.md
@@ -0,0 +1,54 @@
+# Agent playground: provider + model picker (harness-aware, inspect-driven)
+
+Make the **agent** playground pick a **provider + model** the way the completion/chat playground
+does, but **filtered to what the selected harness can actually reach**, with an explicit
+**Agenta-auth vs self-managed** choice and a **connection picker** for the credential. The
+per-harness reach (providers, models, connection modes) is published by the agent's `/inspect`
+response and the frontend renders from it.
+
+## The shape in one paragraph
+
+`/inspect` already publishes `meta.harness_capabilities` (per harness: `providers`,
+`deployments`, `connection_modes`, `model_selection`). This project **adds a per-provider model
+list** to that surface (Pi: the vault-reachable providers' model ids; Claude: its alias list), and
+**rewires the frontend to render from `/inspect`** instead of the current static hardcoded copy. The
+model picker becomes one unified control (selecting a model sets both provider and model id),
+filtered to the harness. The credential is chosen with a clear **Authentication** toggle — *Agenta*
+(managed: pick the project-default or a named connection from the vault) or *Self-managed* (the
+harness uses its own login; Agenta injects nothing). The connection rides in `model_ref.connection`
+inside the config the playground already sends; no new request field and no new vault route.
+
+## What already exists (do not rebuild)
+
+The parent [../provider-model-auth/](../provider-model-auth/) project (PR #4815, **merged** to
+`big-agents`) already shipped the backend and a *minimal* form:
+
+- `ModelRef` (`provider` + `model` + `params` + `connection`) in the agent config, coerced from a
+  bare string for back-compat.
+- A connection resolver that reads the existing `GET /secrets/` and injects **one** least-privilege
+  credential (replacing the whole-vault dump).
+- `/inspect` `meta.harness_capabilities` (the per-harness `providers`/`deployments`/
+  `connection_modes`/`model_selection` table) + server-side fail-loud reject.
+- A minimal connection form: a grouped model picker (unfiltered), a separate free-text/select
+  Provider field, a connection-mode select, a free-text slug, and a raw-JSON escape hatch.
+
+This project is the **playground UX + the inspect model-list addition** that the parent explicitly
+deferred.
+
+## Read in this order
+
+1. [context.md](context.md): why this exists, the exact merged state with `file:line`, goals,
+   non-goals, the three decisions taken.
+2. [research.md](research.md): the precise findings (inspect, the capability table, the model
+   picker, the connection form, vault listing, the completion/chat pattern), with citations.
+3. [plan.md](plan.md): the phased slices, backend through frontend, with the test strategy.
+4. [status.md](status.md): current state, decisions, open items, risks. Source of truth.
+
+## Related work
+
+- [../provider-model-auth/](../provider-model-auth/): the backend `ModelRef`/connection resolver
+  and the minimal form this project builds on.
+- [../model-config/](../model-config/): how a requested model becomes settable on each harness (the
+  Pi `auth.json`/`models.json` write, the custom-endpoint consumption). Out of scope here.
+- [../harness-capabilities/](../harness-capabilities/): owns the general capability-table mechanism;
+  this project extends the `providers`/`models` entries it consumes.
diff --git a/docs/design/agent-workflows/projects/agent-model-picker/context.md b/docs/design/agent-workflows/projects/agent-model-picker/context.md
new file mode 100644
index 0000000000..79c40f3ccc
--- /dev/null
+++ b/docs/design/agent-workflows/projects/agent-model-picker/context.md
@@ -0,0 +1,106 @@
+# Context
+
+## Why this exists
+
+The agent (harness) playground should let a user pick a **provider + model** like the completion and
+chat playgrounds do, but the agent case has an extra constraint the prompt case does not: **each
+harness can only reach some providers and models**. Claude Code reaches Anthropic only and selects by
+alias; Pi reaches eight vault-mapped providers and selects by `provider/id`. The picker must filter
+to the selected harness, and that per-harness reach must come from the agent itself (`/inspect`), not
+a list hardcoded in the frontend. The user also needs to choose **whether Agenta supplies the
+credential** (managed) or the **harness brings its own login** (self-managed), and when managed, to
+pick *which* stored connection.
+
+## Current state (merged on `big-agents`, PR #4815)
+
+The backend and a minimal form already landed. The remaining work is UX + one inspect addition.
+
+### Model selection
+- The agent config model field renders through the **same grouped picker** as completion/chat:
+  `AgentConfigControl` -> `GroupedChoiceControl` -> `SelectLLMProviderBase`
+  (`web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/AgentConfigControl.tsx:342-349`).
+- The picker's choices come from the **whole** shared LiteLLM catalog, **unfiltered by harness**:
+  `_model_catalog_type()` deep-copies `supported_llm_models` as grouped `choices`
+  (`sdks/python/agenta/sdk/utils/types.py:1046-1055`), registered as `CATALOG_TYPES["model"]`
+  (`types.py:1321`). The agent field declares `x-parameter: "grouped_choice"`
+  (`types.py:1088-1093`). Catalog source: `sdks/python/agenta/sdk/utils/assets.py:6-193`
+  (`supported_llm_models`, provider -> prefixed ids like `anthropic/claude-opus-4-7`).
+- A **second, redundant** free-text/select "Provider" field sits in the connection section
+  (`AgentConfigControl.tsx:355-380`), disjoint from the model picker.
+
+### Per-harness reach is already in `/inspect`
+- `/inspect` publishes `meta.harness_capabilities` via `harness_capabilities_document()`
+  (`services/oss/src/agent/app.py:294-300`). The table is
+  `sdks/python/agenta/sdk/agents/capabilities.py` with, per harness:
+  `providers`, `deployments`, `connection_modes`, `model_selection`
+  (`capabilities.py:57-95`). It has **no `models`** field.
+- Harness types: `pi_core`, `pi_agenta` (both reach the 8 `PI_VAULT_PROVIDERS`, `model_selection
+  "provider/id"`), `claude` (anthropic only, `model_selection "alias"`)
+  (`capabilities.py:41-50,76-95`; `HarnessType` `dtos.py:42-58`).
+- The agent service uses the same table for a server-side fail-loud reject
+  (`app.py:84-117`).
+
+### The frontend ignores inspect and uses a static copy
+- `connectionUtils.ts` holds a **hardcoded** `HARNESS_CONNECTION_CAPABILITIES`
+  (`web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/connectionUtils.ts:146-202`) with a
+  `TODO(harness-capabilities)` to consume `/inspect` `meta.harness_capabilities`. The form filters
+  the **Provider** select with `allowedProviders(harness)` but never filters the **model** picker.
+
+### Credential (connection)
+- The connection rides in the config as `model_ref.connection = {mode, slug}`; the backend coerces a
+  structured `model` into `ModelRef` (`sdks/python/agenta/sdk/agents/dtos.py:806-831`,
+  `_parse_agent_fields` `:929-960`). So "the frontend sends the connection" already works.
+- Modes: `agenta` (managed) and `self_managed` (`connections/models.py` `Connection.mode`); the
+  project default is `agenta` with no slug. The form has a mode `Select` and a **free-text** slug
+  field with a `TODO(provider-model-auth)` to become a picker once a connections endpoint exists
+  (`AgentConfigControl.tsx:382-421`).
+- Resolution reads the existing `GET /secrets/` (no new route): `VaultConnectionResolver`
+  (`sdks/python/agenta/sdk/agents/platform/connections.py:380-431`). The FE can list connections from
+  the existing `vaultSecretsQueryAtom` (`web/packages/agenta-entities/src/secret/state/atoms.ts:78-100`).
+
+## Goals
+
+1. Publish a **per-provider model list per harness** in `/inspect` `meta.harness_capabilities`
+   (Pi: vault-reachable providers' ids; Claude: its aliases). The frontend renders from it.
+2. Make the frontend **consume `/inspect`** for the capability map (providers, models, modes),
+   replacing the static hardcoded copy.
+3. **Filter the model picker to the selected harness** and unify it: selecting a model sets both
+   `provider` and `model`; drop the redundant standalone Provider field.
+4. Present a clear **Authentication** choice — *Agenta* (managed) vs *Self-managed* — and, for
+   Agenta, a **connection picker** (project default or a named connection) fed by the existing vault
+   list, filtered to the chosen provider.
+5. Keep the wire contract and the resolver unchanged; the connection still rides `model_ref.connection`.
+
+## Decisions taken (2026-06-24, with the user)
+
+1. **The per-provider model list is published in `/inspect`.** The backend builds and publishes the
+   exact per-harness, per-provider model list in `meta.harness_capabilities`; the frontend renders
+   straight from inspect. (Chosen over filtering the shared catalog client-side. Trade-off: the
+   model list is duplicated into the capability surface and must be kept fresh; mitigated by sourcing
+   it from the same `supported_llm_models` catalog on the backend.)
+2. **"Not Agenta authentication" means self-managed login only.** The harness uses its own credential
+   in the sandbox (env var or prior OAuth login); Agenta injects nothing. No per-run pasted-key
+   channel is added (matches the connection design and completion/chat).
+3. **Claude is presented as an alias dropdown** (default, sonnet, opus, haiku, and `[1m]` variants),
+   matching `model_selection: "alias"`. The alias list is added to `/inspect`.
+
+## Non-goals (v1)
+
+- A new vault storage model, write path, CRUD, or a new connections route. v1 reads `GET /secrets/`.
+- A per-run pasted API key / inline-secret channel.
+- Where a deployed agent's durable per-environment default connection lives (a parent open
+  decision; the config-stored path is unaffected).
+- Migrating the completion/prompt path onto the agent resolver. Completions keep their reader.
+- Pi consuming custom endpoints / cloud deployments (Azure/Bedrock/Vertex) — owned by
+  [../model-config/](../model-config/); v1 stays `direct`/fail-loud.
+
+## Constraints inherited from the codebase
+
+- `/inspect` `meta.harness_capabilities` must stay a plain JSON-able dict (no model import on the
+  consumer side) — `harness_capabilities_document()` (`capabilities.py:98-108`).
+- The SDK owns the capability table; the agent service imports it; the SDK must not import the
+  service (`../../documentation/ports-and-adapters.md`).
+- Frontend API calls go through the Fern client + a zod boundary (`web/CLAUDE.md`). The capability
+  map arrives inside the inspect/workflow schema response, not a new endpoint.
+- Any wire change updates Python (`utils/wire.py`) and TypeScript (`services/agent/src/protocol.ts`)
+  with the golden tests in one PR. This project's inspect-meta change is **not** a `/run` wire change.
diff --git a/docs/design/agent-workflows/projects/agent-model-picker/plan.md b/docs/design/agent-workflows/projects/agent-model-picker/plan.md
new file mode 100644
index 0000000000..15639e9c34
--- /dev/null
+++ b/docs/design/agent-workflows/projects/agent-model-picker/plan.md
@@ -0,0 +1,114 @@
+# Plan
+
+Five phases, backend-first so the frontend has real `/inspect` data to render. Each phase is
+independently shippable and green on its tests. Decisions taken are in [context.md](context.md).
+
+## Phase A — Backend: publish the per-provider model list in `/inspect`
+
+Add a `models` map to the per-harness capability surface so the frontend can render the picker
+straight from inspect.
+
+- **A1.** Extend `HarnessConnectionCapabilities` with
+  `models: Dict[str, List[str]] = {}` (provider family -> list of model ids/aliases), keeping the
+  existing `providers`/`deployments`/`connection_modes`/`model_selection`
+  (`sdks/python/agenta/sdk/agents/capabilities.py:57-73`).
+- **A2.** Populate `models` in `HARNESS_CONNECTION_CAPABILITIES` (`:76-95`):
+  - `pi_core` / `pi_agenta`: `{p: supported_llm_models[p] for p in PI_VAULT_PROVIDERS}` (import the
+    catalog from `agenta.sdk.utils.assets`). Keep the import light; the SDK already imports assets.
+  - `claude`: `{"anthropic": CLAUDE_MODEL_ALIASES}` — a new module constant
+    (`default, sonnet, opus, haiku` + `[1m]` variants per the harness matrix).
+- **A3.** Emit `models` from `harness_capabilities_document()` (`:98-108`) so `/inspect` `meta`
+  carries it. Optionally attach per-model pricing as `x-ag-metadata` from `model_metadata`
+  (nice-to-have; can defer to the FE looking it up). No `/run` wire change.
+- **A4.** Keep the SDK as the single source; the agent service already imports the table
+  (`services/oss/src/agent/app.py`). No service change beyond re-publishing.
+
+**Tests (A).** Extend the capability-table contract test
+(`sdks/python/oss/tests/pytest/unit/agents/connections/test_capabilities.py`): every harness has a
+`models` map; Pi's providers ⊆ `supported_llm_models`; Claude's models are the alias set; the
+document round-trips as a plain dict. Assert `model_selection` still matches per harness.
+
+## Phase B — Frontend: consume `/inspect`, retire the static map
+
+- **B1.** Find how the inspect/workflow-schema response reaches the playground (the atom/selector
+  that holds `parameters` schema for `agent_config`) and surface `meta.harness_capabilities`
+  alongside it. Add a selector (e.g. `harnessCapabilitiesAtom`) keyed to the current
+  workflow/revision. (This is the main FE unknown — see research.md "Open question".)
+- **B2.** Replace the static `HARNESS_CONNECTION_CAPABILITIES` in `connectionUtils.ts:146-202` with
+  the inspect-fed map. Keep `allowedProviders`/`allowedConnectionModes`/`harnessAllowsProvider` as
+  pure helpers but read from the passed-in capability object. Keep a **permissive fallback** (show
+  everything) when `meta` is absent (older agents, standalone). Remove the
+  `TODO(harness-capabilities)`.
+
+**Tests (B).** Unit-test the helpers against an inspect-shaped capability object (filter providers,
+modes; permissive fallback when missing).
+
+## Phase C — Frontend: harness-filtered, unified provider+model picker
+
+- **C1.** Build the grouped model options from `capabilities[harness].models` (provider -> ids) and
+  pass them to `GroupedChoiceControl` / `SelectLLMProviderBase` instead of the full catalog from the
+  schema's `choices`. Reuse `SelectLLMProviderBase`'s `ProviderGroup[]` shape; map pricing from
+  `model_metadata`/`x-ag-metadata` if present.
+- **C2.** Make the picker the single source of the provider: selecting a model sets both
+  `model_ref.model` and `model_ref.provider`. **Remove the standalone Provider field**
+  (`AgentConfigControl.tsx:355-380`) or render it read-only/derived. Honor `model_selection`:
+  - `provider/id` (Pi): value is the prefixed id; derive `provider` from the group.
+  - `alias` (Claude): value is the bare alias; `provider = "anthropic"`. Update
+    `connectionUtils` `composeModelValue`/`modelIdFromConfig` accordingly.
+- **C3.** On harness change, re-filter; if the current model is unreachable under the new harness,
+  clear it (or surface an inline warning) rather than sending an unsupported model.
+
+**Tests (C).** Extend the `connectionUtils` helper tests: option building from inspect models;
+alias coercion for Claude; provider derived from the picked model; harness-switch clears an
+unreachable model. (`web/packages/agenta-entity-ui/.../__tests__` per package practices.)
+
+## Phase D — Frontend: authentication choice + connection picker
+
+- **D1.** Replace the bare connection-mode `Select` with a clear **Authentication** control:
+  *Agenta (managed)* vs *Self-managed*. Map to `connection.mode` (`agenta` / `self_managed`).
+  Filter the options by `capabilities[harness].connection_modes`.
+- **D2.** For *Agenta*, replace the free-text slug (`AgentConfigControl.tsx:400-421`) with a
+  **connection picker**:
+  - Option "Project default" -> `slug = null`.
+  - Named connections -> from `vaultSecretsQueryAtom`
+    (`web/packages/agenta-entities/src/secret/state/atoms.ts:78-100`), filtered to the selected
+    provider and the harness's reachable providers. Show `header.name`; value = the slug.
+  - Keep an inline guard when a named connection is required but missing (existing behavior at
+    `AgentConfigControl.tsx:413-419`).
+- **D3.** For *Self-managed*, show a short note ("the harness uses its own login; Agenta injects
+  nothing") and send `connection: {mode: "self_managed"}`. Keep the raw-JSON escape hatch for power
+  users.
+
+**Tests (D).** Helper/unit tests: connection options filtered by provider + harness; "Project
+default" yields `slug=null`; self-managed yields `{mode:"self_managed"}`. No new backend route; this
+reuses `GET /secrets/`.
+
+## Phase E — Verification, docs, landing
+
+- **E1.** Live verification on the running stack (the agent feature-matrix), per the
+  `agent-workflows-qa` skill: Pi + OpenAI default connection; Pi + a named OpenAI connection; Pi +
+  Gemini; Claude alias (`opus`); a self-managed run. Confirm the picker filters per harness and the
+  resolved connection matches what was picked.
+- **E2.** Docs in the same PR via `keep-docs-in-sync`: update
+  `../../documentation/agent-configuration.md` (model/provider/connection UX), the interface
+  inventory entry for `/inspect` `meta` (now carries `models`), and this project's `status.md`.
+- **E3.** Land as a GitButler stacked branch on `big-agents` (per the repo's GitButler rules), this
+  feature's files only; coordinate shared-file edits (`capabilities.py`,
+  `connectionUtils.ts`, `AgentConfigControl.tsx`) via `../../scratch/agent-coordination.md`.
+
+## Test strategy summary
+
+| Phase | Backend | Frontend | Live |
+| --- | --- | --- | --- |
+| A | capability-table contract (+`models`) | — | — |
+| B | — | helper filter/fallback tests | — |
+| C | — | option-build + alias coercion tests | — |
+| D | — | connection-option tests | — |
+| E | — | — | feature-matrix matrix run |
+
+## Sequencing / dependencies
+
+- A before C/D (the FE needs `models` + the inspect-fed map).
+- B before C/D (both read the inspect-fed capability object).
+- C and D are independent once B lands; can go in either order or together.
+- E last.
diff --git a/docs/design/agent-workflows/projects/agent-model-picker/research.md b/docs/design/agent-workflows/projects/agent-model-picker/research.md
new file mode 100644
index 0000000000..f459c6be29
--- /dev/null
+++ b/docs/design/agent-workflows/projects/agent-model-picker/research.md
@@ -0,0 +1,130 @@
+# Research
+
+Findings from reading the merged code on `big-agents` (2026-06-24). Every claim cites `file:line`.
+Grouped by the four questions the request raises.
+
+## Q1. How model authentication works right now
+
+The whole-vault dump is **gone**; PR #4815 replaced it with a least-privilege resolver.
+
+- The agent config carries a structured `ModelRef` (`provider` + `model` + `params` +
+  `connection`): `sdks/python/agenta/sdk/agents/connections/models.py` — `ModelRef` (`:108-158`),
+  `Connection` (`mode` + `slug`, `:42-74`), `ResolvedConnection` (`provider`/`model`/`deployment`/
+  `credential_mode`/`env`/`endpoint`, `:160-201`). A bare-string `model` still works: it is coerced
+  in `dtos.py:_split_model_ref` (`:806-831`) and `_coerce_model_ref` (`:389-390`, `:506-507`).
+- At run time the service builds a `ModelRef` + `RuntimeAuthContext`, calls the resolver, and feeds
+  the result into `SessionConfig.secrets` / `resolved_connection`
+  (`services/oss/src/agent/app.py:_agent()`, the resolve around `:83`).
+- The resolver reads the **existing** `GET /secrets/`, builds an in-memory connection catalog, picks
+  **one** connection by `ModelRef` (named slug, or unique project default), and returns only that
+  connection's env: `sdks/python/agenta/sdk/agents/platform/connections.py` —
+  `_resolve_from_secrets` (`:346-368`), `VaultConnectionResolver.resolve` (`:391-431`). No new route
+  (`:4-9` header: "uses the existing `GET /secrets/` … connection is only a runtime read view").
+- Modes: `agenta` (managed) and `self_managed` (`models.py` `Connection.mode`,
+  `ConnectionMode = Literal["agenta","self_managed"]`). Project default = `agenta` with no slug.
+  `self_managed` injects nothing (the harness uses its own login).
+- The TS runner clears inherited provider env then applies only the resolved env (no cross-provider
+  leak): the parent project's Slice 4 (`services/agent/src/engines/sandbox_agent/daemon.ts`,
+  `pi.ts withRequestProviderEnv`). Out of scope to change here.
+
+Prior whole-vault behavior (now replaced) is documented in
+[../../scratch/notes-model-auth.md](../../scratch/notes-model-auth.md).
+
+## Q2. What is inside `/inspect`
+
+`/inspect` is published by the agent workflow decorator
+(`services/oss/src/agent/app.py:294-300`). It returns:
+
+- `inputs` / `parameters` / `outputs` JSON schemas (`services/oss/src/agent/schemas.py:105-109`);
+  `parameters` carries `x-ag-type-ref: "agent_config"` (the form-driving catalog type).
+- `meta.harness_capabilities` = `harness_capabilities_document()`
+  (`app.py:296`; `sdks/python/agenta/sdk/agents/capabilities.py:98-108`).
+
+The capability table (`capabilities.py`):
+
+```
+HarnessConnectionCapabilities (:57-73):
+  providers: List[str]          # the provider families the harness reaches
+  deployments: List[str]        # default ["direct"]
+  connection_modes: List[str]   # default ["agenta","self_managed"]
+  model_selection: str          # "provider/id" | "alias"
+
+HARNESS_CONNECTION_CAPABILITIES (:76-95):
+  pi_core / pi_agenta: providers = PI_VAULT_PROVIDERS (8), deployments=[direct],
+                       modes=[agenta,self_managed], model_selection="provider/id"
+  claude:              providers = ["anthropic"],
+                       deployments=[direct,custom,bedrock,vertex_ai,vertex],
+                       modes=[agenta,self_managed], model_selection="alias"
+PI_VAULT_PROVIDERS (:41-50): openai, anthropic, gemini, mistral, groq, minimax,
+                             together_ai, openrouter
+```
+
+**There is no `models` field.** That is the one inspect addition this project makes. The table is
+also consumed server-side for the fail-loud reject: `harness_allows_provider/mode/deployment`
+(`capabilities.py:111-147`), called pre/post resolve in `app.py:84-117`. There is a separate,
+unrelated runtime probe `HarnessCapabilities` (boolean feature flags, `dtos.py:108-147`) that is
+parsed but unused downstream — not relevant here.
+
+## Q3. How completion/chat picks a provider + model (the pattern to match)
+
+- Picker component: `web/packages/agenta-ui/src/SelectLLMProvider/SelectLLMProviderBase.tsx`
+  (`:31-340`) — a grouped provider dropdown; options shape `ProviderGroup[] = {label, options:
+  {label, value, metadata}[]}`. Cascading per-provider submenu when `showGroup` + model options.
+- Model list source (one catalog): `sdks/python/agenta/sdk/utils/assets.py` —
+  `supported_llm_models` (`:6-193`, provider -> prefixed ids), `providers_list` (`:195`),
+  `_build_model_metadata()` pricing (`:225-247`), `model_to_provider_mapping` (`:249-253`).
+- Schema injection: `MCField()` (`sdks/python/agenta/sdk/utils/types.py:49-69`) injects
+  `choices` + `x-ag-type: grouped_choice` + `x-ag-metadata` (pricing). The catalog model type is
+  `_model_catalog_type()` (`:1046-1055`), registered `CATALOG_TYPES["model"]` (`:1321`); the prompt
+  config references it via `x-ag-type-ref: "model"` (`:418`, `:571`).
+- Schema -> UI options: `getOptionsFromSchema()`
+  (`web/packages/agenta-shared/src/utils/schemaOptions.ts:17-54`) reads `choices` (grouped) or
+  `enum` (flat) + `x-ag-metadata`.
+- Credentials: resolved **server-side** from the vault; the completion request sends only the model
+  string, no key (`sdks/python/agenta/sdk/managers/secrets.py:get_provider_settings` `:158-246`,
+  model->provider via `model_to_provider_mapping`). **No per-request pasted-key override** — which is
+  why decision #2 (self-managed only) keeps agent parity with completion.
+
+The agent form already reuses this exact picker (`AgentConfigControl.tsx:342-349` ->
+`GroupedChoiceControl` -> `SelectLLMProviderBase`). The gap is the **options source** (full catalog,
+not harness-filtered) and the **redundant Provider field**.
+
+## Q4. How the connection is chosen and sent
+
+- The connection lives in `model_ref.connection` inside the config the playground already posts to
+  `/invoke`; backend parse `AgentConfig.from_params` / `_parse_agent_fields`
+  (`dtos.py:929-960`), structured-model coercion `_split_model_ref` (`:806-831`). **No new request
+  field is needed** to "send the connection".
+- Current form controls (`AgentConfigControl.tsx`): model picker (`:342-349`), a standalone Provider
+  free-text/select (`:355-380`), a connection-mode `Select` (`:382-398`), a **free-text** slug
+  (`:400-421`, with a `TODO` to become a picker), a raw-JSON escape hatch (`:423-464`).
+- Listing connections for a picker: the existing `GET /vault/secrets/` -> `vaultSecretsQueryAtom`
+  (`web/packages/agenta-entities/src/secret/state/atoms.ts:78-100`, fetched by `fetchVaultSecret`
+  `secret/api/api.ts:29-36`). Secret shape: `SecretResponseDTO` with `kind`
+  (`provider_key`/`custom_provider`), `header.name` (the connection name/slug), and the provider
+  kind (`api/oss/src/core/secrets/dtos.py`, `enums.py StandardProviderKind`/`CustomProviderKind`).
+  A named connection = a secret whose `header.name` is set; its provider is the standard kind or the
+  custom `provider_slug`. **No `GET /vault/connections` route exists** (the parent reworked to use
+  `GET /secrets/`); the design-doc reference to it is stale.
+- The FE static capability map to be replaced: `connectionUtils.ts:146-202`
+  (`HARNESS_CONNECTION_CAPABILITIES`, `allowedProviders`, `allowedConnectionModes`,
+  `harnessAllowsProvider`), with the `TODO(harness-capabilities)` to feed it from `/inspect`.
+
+## Source data for the new inspect `models` field
+
+- **Pi (`pi_core`, `pi_agenta`)**: `{provider: supported_llm_models[provider]}` for each provider in
+  `PI_VAULT_PROVIDERS`. All eight keys exist in `supported_llm_models` (verified: openai, anthropic,
+  gemini, mistral, groq, minimax, together_ai, openrouter). Model ids are provider-prefixed
+  (`openai/gpt-...`); the FE already handles that shape.
+- **Claude (`claude`)**: `{"anthropic": CLAUDE_MODEL_ALIASES}` — a new small constant. The alias set
+  per the harness facts: `default`, `sonnet`, `opus`, `haiku`, plus the `[1m]` long-context variants
+  (`docs/design/agent-workflows/projects/provider-model-auth/harness-provider-matrix.md:84-88`).
+- Pricing metadata (`model_metadata`, `assets.py:247`) can ride along per model as optional
+  `x-ag-metadata`, or the FE can look it up from the shared catalog. Treat as nice-to-have.
+
+## Open question to resolve in the plan
+
+How the inspect/workflow-schema response surfaces `meta.harness_capabilities` to the
+`AgentConfigControl` on the FE (which atom/selector carries the inspect `meta`). The control today
+reads only the static map; the implementation must thread inspect `meta` to it. This is the main FE
+plumbing unknown and is the first task of Phase B.
diff --git a/docs/design/agent-workflows/projects/agent-model-picker/status.md b/docs/design/agent-workflows/projects/agent-model-picker/status.md
new file mode 100644
index 0000000000..de402194c1
--- /dev/null
+++ b/docs/design/agent-workflows/projects/agent-model-picker/status.md
@@ -0,0 +1,103 @@
+# Status
+
+Source of truth for where this work stands. Keep it current.
+
+## State
+
+**Implemented (A-E); green; PR open on `big-agents`.** Built on the just-landed
+contract-versioning (#4829) and wire-schema (#4830) work; this lane is stacked on
+`feat/agent-wire-contract-schema-plan` (#4830). The parent
+[../provider-model-auth/](../provider-model-auth/) (PR #4815) is **merged** and provides the
+backend `ModelRef` + connection resolver + `/inspect` capability table + a minimal form.
+
+Last updated: 2026-06-24.
+
+## What landed
+
+- **Phase A (SDK).** `HarnessConnectionCapabilities` gained a `models: Dict[str, List[str]]`
+  field; `HARNESS_CONNECTION_CAPABILITIES` populates it — Pi from `supported_llm_models` for each
+  `PI_VAULT_PROVIDERS` entry (defensive `_pi_models()` skips a missing provider), Claude from a new
+  `CLAUDE_MODEL_ALIASES` constant (`default`/`sonnet`/`opus`/`haiku` + `[1m]` variants) under
+  `anthropic`. `harness_capabilities_document()` emits it (via `model_dump()`), so `/inspect`
+  `meta.harness_capabilities` carries per-harness models. NOT a `/run` wire change. Contract test
+  extended. (`sdks/python/agenta/sdk/agents/capabilities.py`.)
+- **Phase B (FE plumbing).** New inspect-meta atom
+  `harnessCapabilitiesAtomFamily(revisionId)` derives `meta.harness_capabilities` from
+  `workflowInspectAtomFamily` (`web/packages/agenta-entities/src/workflow/state/inspectMeta.ts`,
+  exported via the workflow barrel). `connectionUtils.ts` retired the static FE capability map; its
+  helpers now take the inspect-fed `HarnessCapabilitiesMap` and stay permissive when absent.
+- **Phase C (picker).** Unified harness-filtered provider + model picker
+  (`buildModelOptionGroups` + `SelectLLMProviderBase`): selecting a model sets BOTH provider and
+  model (`providerForModel` derives the provider from the group). The standalone Provider field is
+  gone. Harness switch clears an unreachable model (`harnessAllowsModel`). Falls back to the schema
+  catalog when inspect publishes no models.
+- **Phase D (auth).** Authentication `Segmented` toggle (Agenta-managed vs Self-managed) + a
+  connection picker (Project default / named connections from `vaultSecretsQueryAtom`,
+  `namedConnectionOptions` filtered to provider + harness). Self-managed shows a note. Raw-JSON
+  hatch kept.
+- **model = ModelRef fold-in (#4821 c3469645457).** `composeModelValue` ALWAYS returns a structured
+  ModelRef object — the bare-string path is dropped. `connectionFromConfig`/`modelIdFromConfig`
+  still READ a legacy bare string so an old stored config populates the picker.
+- **Phase E.** Docs synced: this status, the inspect interface inventory entry (now lists
+  `models`), and `documentation/agent-configuration.md` (picker + auth UX).
+
+## Decisions (taken with the user, 2026-06-24)
+
+1. **Per-provider model list is published in `/inspect`** `meta.harness_capabilities` (not filtered
+   client-side from the shared catalog). The backend builds it from `supported_llm_models` (Pi) and a
+   Claude alias constant; the frontend renders straight from inspect.
+2. **"Not Agenta auth" = self-managed login only.** No per-run pasted-key channel.
+3. **Claude is an alias dropdown** (`default/sonnet/opus/haiku` + `[1m]`), matching
+   `model_selection: "alias"`. Alias list added to `/inspect`.
+
+## What this project does NOT change
+
+- The `/run` wire contract and golden tests (the inspect-meta change is not a `/run` change).
+- The connection resolver and the `GET /secrets/`-backed resolution (parent's work).
+- Vault storage / routes. No `GET /vault/connections` (the parent reworked away from it).
+
+## Key facts grounding the plan (verified)
+
+- `/inspect` publishes `meta.harness_capabilities` (`services/oss/src/agent/app.py:294-300`); the
+  table is `sdks/python/agenta/sdk/agents/capabilities.py:57-95`; it has **no `models`** field yet.
+- The agent model picker is `GroupedChoiceControl` -> `SelectLLMProviderBase`
+  (`AgentConfigControl.tsx:342-349`), fed by the **whole** `supported_llm_models` catalog
+  (`types.py:1046-1055,1321`), **unfiltered by harness**.
+- The FE capability map is a **static** copy with a TODO to consume inspect
+  (`connectionUtils.ts:146-202`).
+- The connection rides `model_ref.connection` already; slug is **free text** today
+  (`AgentConfigControl.tsx:400-421`). Vault list available via `vaultSecretsQueryAtom`
+  (`secret/state/atoms.ts:78-100`).
+
+## Open items / risks
+
+- **FE plumbing (RESOLVED).** `harnessCapabilitiesAtomFamily(revisionId)` (new
+  `workflow/state/inspectMeta.ts`) derives `meta.harness_capabilities` from
+  `workflowInspectAtomFamily`. `AgentConfigControl` reads the open revision id from the optional
+  drill-in context (`useOptionalDrillIn().entityId`) and falls back permissively when absent
+  (standalone control / no drill-in provider).
+- **openai catalog ids are NOT provider-prefixed.** Contrary to the research note, `supported_llm_models["openai"]`
+  lists bare ids (`gpt-5.5`, `gpt-5.4`, …) while other providers are prefixed
+  (`anthropic/claude-...`). The picker (`buildModelOptionGroups`) uses the catalog id verbatim as
+  the option `value` and the provider comes from the group key, so the mix is handled — but the
+  contract test does NOT assert provider-prefixing for this reason. ModelRef carries provider
+  separately, so a bare openai id round-trips fine.
+- **Model freshness:** publishing the list in inspect duplicates it from the catalog; mitigated by
+  sourcing from `supported_llm_models` on the backend so there is still one edit point. Note in
+  the `update-llm-model-list` skill scope: editing the catalog also changes what `/inspect`
+  publishes for Pi harnesses.
+- **Claude alias list source:** `CLAUDE_MODEL_ALIASES` (`default`/`sonnet`/`opus`/`haiku` +
+  `[1m]`) from the harness matrix; revisit if the runner's accepted alias set changes
+  (`../model-config/`).
+- **Pricing metadata:** the picker accepts an optional `ModelMetadataMap` but the control does not
+  yet wire pricing into the inspect models (nice-to-have; the FE can look it up from the shared
+  catalog). Deferred.
+
+## Done
+
+1. Phase A: `models` in the capability table + document + contract test. DONE (green).
+2. Phase B: inspect `meta` threaded to the FE; static map retired. DONE.
+3. Phase C/D: harness-filtered unified picker + authentication/connection picker. DONE.
+4. Phase E: docs synced. DONE. Live feature-matrix verification is a follow-up (deferred).
+
+To implement, run the `implement-feature` skill against this folder.
diff --git a/sdks/python/agenta/sdk/agents/capabilities.py b/sdks/python/agenta/sdk/agents/capabilities.py
index 7d559d8e20..278f0de878 100644
--- a/sdks/python/agenta/sdk/agents/capabilities.py
+++ b/sdks/python/agenta/sdk/agents/capabilities.py
@@ -35,6 +35,8 @@
 
 from pydantic import BaseModel, Field
 
+from agenta.sdk.utils.assets import supported_llm_models
+
 # The eight Agenta-vault-mapped providers Pi reaches directly via its env-key map (a stored
 # ``provider_key`` secret of these drives Pi). Kept in agreement with the SDK resolver
 # provider-env maps.
@@ -49,11 +51,41 @@
     "openrouter",
 ]
 
+# Claude Code selects a model by alias, not a ``provider/id`` string. These are the aliases the
+# Claude harness accepts (``default``/``sonnet``/``opus``/``haiku``) plus their ``[1m]``
+# long-context variants. They live under the ``anthropic`` provider in the ``models`` map (Claude
+# reaches anthropic only). Revisit if the runner's accepted alias set changes (see
+# ``docs/design/agent-workflows/projects/model-config/``).
+CLAUDE_MODEL_ALIASES: List[str] = [
+    "default",
+    "sonnet",
+    "opus",
+    "haiku",
+    "default[1m]",
+    "sonnet[1m]",
+    "opus[1m]",
+    "haiku[1m]",
+]
+
 # Both modes every harness supports today. (No ``default`` mode: the project default is just
 # ``agenta`` with no slug.)
 _ALL_MODES = ["agenta", "self_managed"]
 
 
+def _pi_models() -> Dict[str, List[str]]:
+    """The per-provider model ids Pi reaches: the catalog entry for each vault provider.
+
+    Defensive against a provider missing from ``supported_llm_models`` (skip it) so a catalog
+    edit never breaks the capability document. The ids are provider-prefixed (``openai/gpt-...``),
+    the same shape the playground model picker already renders.
+    """
+    return {
+        provider: list(supported_llm_models[provider])
+        for provider in PI_VAULT_PROVIDERS
+        if provider in supported_llm_models
+    }
+
+
 class HarnessConnectionCapabilities(BaseModel):
     """The connection-relevant capabilities of one harness (the ``/inspect`` ``meta`` shape).
 
@@ -65,12 +97,17 @@ class HarnessConnectionCapabilities(BaseModel):
       (``["agenta", "self_managed"]``).
     - ``model_selection``: how a model is named for the harness (``"provider/id"`` exact for Pi,
       ``"alias"`` for Claude).
+    - ``models``: the selectable models per provider family. Pi publishes each vault provider's
+      catalog ids (provider-prefixed, e.g. ``openai/gpt-...``); Claude publishes its alias set
+      under ``anthropic``. The frontend renders the harness-filtered model picker straight from
+      this map instead of the full shared catalog.
     """
 
     providers: List[str] = Field(default_factory=list)
     deployments: List[str] = Field(default_factory=lambda: ["direct"])
     connection_modes: List[str] = Field(default_factory=lambda: list(_ALL_MODES))
     model_selection: str = "provider/id"
+    models: Dict[str, List[str]] = Field(default_factory=dict)
 
 
 HARNESS_CONNECTION_CAPABILITIES: Dict[str, HarnessConnectionCapabilities] = {
@@ -79,18 +116,21 @@ class HarnessConnectionCapabilities(BaseModel):
         deployments=["direct"],
         connection_modes=list(_ALL_MODES),
         model_selection="provider/id",
+        models=_pi_models(),
     ),
     "pi_agenta": HarnessConnectionCapabilities(
         providers=list(PI_VAULT_PROVIDERS),
         deployments=["direct"],
         connection_modes=list(_ALL_MODES),
         model_selection="provider/id",
+        models=_pi_models(),
     ),
     "claude": HarnessConnectionCapabilities(
         providers=["anthropic"],
         deployments=["direct", "custom", "bedrock", "vertex_ai", "vertex"],
         connection_modes=list(_ALL_MODES),
         model_selection="alias",
+        models={"anthropic": list(CLAUDE_MODEL_ALIASES)},
     ),
 }
 
diff --git a/sdks/python/oss/tests/pytest/unit/agents/connections/test_capabilities.py b/sdks/python/oss/tests/pytest/unit/agents/connections/test_capabilities.py
index 329c649c96..988d4e84a9 100644
--- a/sdks/python/oss/tests/pytest/unit/agents/connections/test_capabilities.py
+++ b/sdks/python/oss/tests/pytest/unit/agents/connections/test_capabilities.py
@@ -9,6 +9,7 @@
 from __future__ import annotations
 
 from agenta.sdk.agents.capabilities import (
+    CLAUDE_MODEL_ALIASES,
     HARNESS_CONNECTION_CAPABILITIES,
     PI_VAULT_PROVIDERS,
     harness_allows_deployment,
@@ -16,6 +17,7 @@
     harness_allows_provider,
     harness_capabilities_document,
 )
+from agenta.sdk.utils.assets import supported_llm_models
 
 
 def test_claude_is_anthropic_only():
@@ -76,3 +78,46 @@ def test_capabilities_document_shape():
         "vertex_ai",
         "vertex",
     ]
+
+
+def test_every_harness_publishes_a_models_map():
+    doc = harness_capabilities_document()
+    for harness in ("pi_core", "pi_agenta", "claude"):
+        assert isinstance(doc[harness]["models"], dict)
+        assert doc[harness]["models"], f"{harness} has an empty models map"
+
+
+def test_pi_models_are_a_subset_of_the_shared_catalog():
+    # Each Pi harness publishes, per vault provider, exactly that provider's catalog ids.
+    for harness in ("pi_core", "pi_agenta"):
+        models = HARNESS_CONNECTION_CAPABILITIES[harness].models
+        # Only the vault-mapped providers are published (no arbitrary catalog providers).
+        assert set(models) <= set(PI_VAULT_PROVIDERS)
+        assert set(models) == set(PI_VAULT_PROVIDERS)
+        for provider, ids in models.items():
+            # The published ids are exactly the shared catalog's ids for that provider
+            # (verbatim — most are provider-prefixed like ``anthropic/...``, but some
+            # providers, e.g. openai, list bare ids like ``gpt-5.5``).
+            assert ids == list(supported_llm_models[provider])
+
+
+def test_claude_models_are_the_alias_set_under_anthropic():
+    models = HARNESS_CONNECTION_CAPABILITIES["claude"].models
+    assert set(models) == {"anthropic"}
+    assert models["anthropic"] == list(CLAUDE_MODEL_ALIASES)
+    # Aliases, not provider-prefixed ids.
+    assert "opus" in models["anthropic"]
+    assert "opus[1m]" in models["anthropic"]
+    assert all("/" not in alias for alias in models["anthropic"])
+
+
+def test_models_round_trip_as_a_plain_dict():
+    doc = harness_capabilities_document()
+    # The published document is plain JSON-able dicts/lists, no model objects.
+    for harness, entry in doc.items():
+        assert isinstance(entry, dict)
+        assert isinstance(entry["models"], dict)
+        for provider, ids in entry["models"].items():
+            assert isinstance(provider, str)
+            assert isinstance(ids, list)
+            assert all(isinstance(model_id, str) for model_id in ids)
diff --git a/web/packages/agenta-entities/src/workflow/index.ts b/web/packages/agenta-entities/src/workflow/index.ts
index 01c635ebc3..9c520ff89b 100644
--- a/web/packages/agenta-entities/src/workflow/index.ts
+++ b/web/packages/agenta-entities/src/workflow/index.ts
@@ -46,6 +46,13 @@ export {workflowMolecule, type WorkflowMolecule, type WorkflowType} from "./stat
 
 export {deriveWorkflowTypeFromRevision} from "./state/helpers"
 
+// Per-harness capability map from the `/inspect` response `meta` (agent playground picker).
+export {
+    harnessCapabilitiesAtomFamily,
+    type HarnessCapabilities,
+    type HarnessCapabilitiesMap,
+} from "./state/inspectMeta"
+
 // ============================================================================
 // SCHEMAS & TYPES
 // ============================================================================
diff --git a/web/packages/agenta-entities/src/workflow/state/index.ts b/web/packages/agenta-entities/src/workflow/state/index.ts
index 08e4f50a06..9e031e2a4b 100644
--- a/web/packages/agenta-entities/src/workflow/state/index.ts
+++ b/web/packages/agenta-entities/src/workflow/state/index.ts
@@ -10,6 +10,16 @@
 
 export {workflowMolecule, type WorkflowMolecule} from "./molecule"
 
+// ============================================================================
+// INSPECT META (per-harness capability map from `/inspect` meta)
+// ============================================================================
+
+export {
+    harnessCapabilitiesAtomFamily,
+    type HarnessCapabilities,
+    type HarnessCapabilitiesMap,
+} from "./inspectMeta"
+
 // ============================================================================
 // HELPERS
 // ============================================================================
diff --git a/web/packages/agenta-entities/src/workflow/state/inspectMeta.ts b/web/packages/agenta-entities/src/workflow/state/inspectMeta.ts
new file mode 100644
index 0000000000..cba30851c4
--- /dev/null
+++ b/web/packages/agenta-entities/src/workflow/state/inspectMeta.ts
@@ -0,0 +1,53 @@
+/**
+ * Inspect-meta atoms
+ *
+ * Derived selectors over the workflow `/inspect` response `meta`. The agent service publishes
+ * `meta.harness_capabilities` (per harness: `providers` / `deployments` / `connection_modes` /
+ * `model_selection` / `models`) so the agent playground can render a harness-filtered provider +
+ * model picker straight from inspect instead of a static FE copy. These atoms thread that map to
+ * the config control keyed by the revision the playground has open.
+ *
+ * Source: `sdks/python/agenta/sdk/agents/capabilities.py` (the published shape) and
+ * `services/oss/src/agent/app.py` (`/inspect` `meta.harness_capabilities`).
+ */
+
+import {atom} from "jotai"
+import {atomFamily} from "jotai/utils"
+
+import {workflowInspectAtomFamily} from "./store"
+
+/** One harness's connection-relevant capabilities, as published on `/inspect` `meta`. */
+export interface HarnessCapabilities {
+    /** Provider families the harness can reach (a literal list; never `"*"`). */
+    providers: string[]
+    /** Deployment surfaces it can consume (`["direct"]` for Pi today). */
+    deployments?: string[]
+    /** Supported connection modes (`["agenta", "self_managed"]`). */
+    connection_modes: string[]
+    /** How a model is named: `"provider/id"` (Pi) or `"alias"` (Claude). */
+    model_selection: string
+    /** Selectable models per provider family (provider -> list of ids/aliases). */
+    models: Record<string, string[]>
+}
+
+/** The full per-harness capability map (harness type -> capabilities). */
+export type HarnessCapabilitiesMap = Record<string, HarnessCapabilities>
+
+function isRecord(value: unknown): value is Record<string, unknown> {
+    return typeof value === "object" && value !== null && !Array.isArray(value)
+}
+
+/**
+ * The per-harness capability map carried on the inspect response `meta.harness_capabilities`,
+ * keyed by the open revision. `null` when inspect has not resolved or the agent did not publish it
+ * (older agents / a non-agent workflow) — callers fall back to permissive behavior.
+ */
+export const harnessCapabilitiesAtomFamily = atomFamily((revisionId: string) =>
+    atom<HarnessCapabilitiesMap | null>((get) => {
+        const inspect = get(workflowInspectAtomFamily(revisionId))
+        const meta = inspect.data?.meta
+        if (!isRecord(meta)) return null
+        const caps = meta.harness_capabilities
+        return isRecord(caps) ? (caps as HarnessCapabilitiesMap) : null
+    }),
+)
diff --git a/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/AgentConfigControl.tsx b/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/AgentConfigControl.tsx
index d46baf5830..2b5a384ce1 100644
--- a/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/AgentConfigControl.tsx
+++ b/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/AgentConfigControl.tsx
@@ -11,23 +11,33 @@
  * from the SDK model (AgentConfigSchema in agenta.sdk.utils.types); the agent service ships a
  * thin `x-ag-type-ref` the playground resolves and reads back (services/oss/src/agent).
  */
-import {useCallback, useMemo, useState} from "react"
+import {useCallback, useEffect, useMemo, useState} from "react"
 
+import {vaultSecretsQueryAtom} from "@agenta/entities/secret"
 import type {SchemaProperty} from "@agenta/entities/shared"
+import {harnessCapabilitiesAtomFamily} from "@agenta/entities/workflow"
 import {LabeledField} from "@agenta/ui/components/presentational"
 import {useDrillInUI} from "@agenta/ui/drill-in"
+import {SelectLLMProviderBase} from "@agenta/ui/select-llm-provider"
 import {cn} from "@agenta/ui/styles"
 import {CaretDown, CaretRight, Plus} from "@phosphor-icons/react"
-import {Button, Select, Switch, Typography} from "antd"
+import {Button, Segmented, Select, Switch, Typography} from "antd"
+import {useAtomValue} from "jotai"
+
+import {useOptionalDrillIn} from "../components/MoleculeDrillInContext"
 
 import {ClaudePermissionsControl} from "./ClaudePermissionsControl"
 import {
     allowedConnectionModes,
-    allowedProviders,
+    buildModelOptionGroups,
     composeModelValue,
     connectionFromConfig,
+    harnessAllowsModel,
     modelIdFromConfig,
+    namedConnectionOptions,
+    providerForModel,
     type ConnectionMode,
+    type VaultConnectionEntry,
 } from "./connectionUtils"
 import {EnumSelectControl} from "./EnumSelectControl"
 import {GroupedChoiceControl} from "./GroupedChoiceControl"
@@ -39,11 +49,6 @@ import {ToolItemControl} from "./ToolItemControl"
 import {ToolSelectorPopover, type ToolSelectionMeta} from "./ToolSelectorPopover"
 import {type ToolObj} from "./toolUtils"
 
-const CONNECTION_MODE_LABELS: Record<ConnectionMode, string> = {
-    self_managed: "Self-managed",
-    agenta: "Agenta connection",
-}
-
 export interface AgentConfigControlProps {
     schema?: SchemaProperty | null
     label?: string
@@ -101,37 +106,92 @@ export function AgentConfigControl({
         [config, onChange],
     )
 
-    // Model + credential connection (the ModelRef). `config.model` is either a plain string
-    // (the default connection, kept byte-identical to today) or a structured object the SDK
-    // coerces into a ModelRef. The form edits the fields directly via composeModelValue.
+    // Per-harness capability map from the `/inspect` response meta, keyed by the open revision.
+    // Null when inspect hasn't resolved or the agent didn't publish it (older agents / standalone),
+    // in which case the connectionUtils helpers fall back permissively.
+    const drillIn = useOptionalDrillIn<unknown>()
+    const revisionId = drillIn?.entityId ?? null
+    const capabilities = useAtomValue(
+        useMemo(() => harnessCapabilitiesAtomFamily(revisionId ?? ""), [revisionId]),
+    )
+
+    // The project's stored connections (read-only) for the connection picker. The transformed
+    // vault list surfaces custom-provider connections as {type, name, provider}; the resolver
+    // matches a named connection by that name (the slug).
+    const vaultQuery = useAtomValue(vaultSecretsQueryAtom)
+    const vaultSecrets = useMemo(
+        () => (Array.isArray(vaultQuery.data) ? (vaultQuery.data as VaultConnectionEntry[]) : []),
+        [vaultQuery.data],
+    )
+
+    // Model + credential connection (the ModelRef). `config.model` is ALWAYS a structured ModelRef
+    // (the picker only ever produces one); a legacy bare string is read for display. The picker is
+    // harness-filtered: selecting a model sets BOTH the model id and its provider.
     const harness = typeof config.harness === "string" ? config.harness : null
     const modelId = useMemo(() => modelIdFromConfig(config.model), [config.model])
     const connection = useMemo(() => connectionFromConfig(config.model), [config.model])
-    const providerOptions = useMemo(() => allowedProviders(harness), [harness])
-    const providersOpen = providerOptions.includes("*")
-    const modeOptions = useMemo(() => allowedConnectionModes(harness), [harness])
+    const modeOptions = useMemo(
+        () => allowedConnectionModes(capabilities, harness),
+        [capabilities, harness],
+    )
+
+    // Harness-filtered model options, built straight from inspect meta. Empty when the harness
+    // publishes none (older agent / standalone) — fall back to the schema's full catalog picker.
+    const modelGroups = useMemo(
+        () => buildModelOptionGroups(capabilities, harness),
+        [capabilities, harness],
+    )
+    const hasInspectModels = modelGroups.length > 0
 
-    // Compose the new `config.model` from the current connection fields, overriding one of
-    // them. Empty provider/slug clear that part of the structured value.
+    // Compose the new `config.model` ModelRef from the current fields, overriding some. Picking a
+    // model derives its provider from the harness's published groups (sets both).
     const writeModel = useCallback(
         (patch: {
             modelId?: string | null
             provider?: string | null
             mode?: ConnectionMode
             slug?: string | null
-        }) =>
+        }) => {
+            const nextModelId = patch.modelId !== undefined ? patch.modelId : modelId
+            // When the model changes, derive the provider from the picked model; otherwise keep it.
+            let nextProvider: string | null
+            if (patch.provider !== undefined) {
+                nextProvider = patch.provider
+            } else if (patch.modelId !== undefined) {
+                nextProvider =
+                    providerForModel(capabilities, harness, nextModelId) ?? connection.provider
+            } else {
+                nextProvider = connection.provider
+            }
             setField(
                 "model",
                 composeModelValue({
-                    modelId: patch.modelId !== undefined ? patch.modelId : modelId,
-                    provider: patch.provider !== undefined ? patch.provider : connection.provider,
+                    modelId: nextModelId,
+                    provider: nextProvider,
                     mode: patch.mode !== undefined ? patch.mode : connection.mode,
                     slug: patch.slug !== undefined ? patch.slug : connection.slug,
                     // Carry through extra ModelRef keys (params, ...) the form does not edit.
                     existing: config.model,
                 }),
-            ),
-        [setField, modelId, connection, config.model],
+            )
+        },
+        [setField, modelId, connection, config.model, capabilities, harness],
+    )
+
+    // On harness switch, clear a model the new harness can't reach (rather than sending an
+    // unsupported model). Permissive when the new harness publishes no models.
+    useEffect(() => {
+        if (!harness || !modelId) return
+        if (!harnessAllowsModel(capabilities, harness, modelId)) {
+            writeModel({modelId: null, provider: null})
+        }
+        // Only react to harness/capabilities changes, not every model edit.
+    }, [harness, capabilities])
+
+    // Named connections selectable for the chosen provider under this harness (Agenta-managed).
+    const connectionOptions = useMemo(
+        () => namedConnectionOptions(vaultSecrets, capabilities, harness, connection.provider),
+        [vaultSecrets, capabilities, harness, connection.provider],
     )
 
     // Raw-JSON escape hatch for the whole `config.model` value (collapsed by default).
@@ -339,85 +399,81 @@ export function AgentConfigControl({
                 multiline
             />
 
-            <GroupedChoiceControl
-                schema={props.model}
-                label="Model"
-                value={modelId}
-                onChange={(v) => writeModel({modelId: v})}
-                withTooltip={withTooltip}
-                disabled={disabled}
-            />
-
-            {/* Connection (provider + credential mode + slug for the ModelRef) */}
-            <div className="flex flex-col gap-2">
-                <Typography.Text className="text-sm font-medium">Connection</Typography.Text>
-
+            {/* Unified provider + model picker. When the agent's `/inspect` publishes a
+                harness-filtered model list we render from it (selecting a model sets both the model
+                id and its provider). Otherwise we fall back to the schema's full catalog picker. */}
+            {hasInspectModels ? (
                 <LabeledField
-                    label="Provider"
-                    description="The provider family for this model. Leave empty to infer it."
-                    withTooltip
+                    label="Model"
+                    description="Filtered to the models this harness can reach. Selecting a model also sets its provider."
+                    withTooltip={withTooltip}
                 >
-                    {providersOpen ? (
-                        <TextInputControl
-                            value={connection.provider}
-                            onChange={(v) => writeModel({provider: v || null})}
-                            withTooltip={false}
-                            disabled={disabled}
-                            placeholder="e.g. openai (optional)"
-                        />
-                    ) : (
-                        <Select
-                            value={connection.provider ?? undefined}
-                            onChange={(v) => writeModel({provider: v ?? null})}
-                            options={providerOptions.map((p) => ({value: p, label: p}))}
-                            disabled={disabled}
-                            placeholder="Select provider..."
-                            allowClear
-                            className="w-full"
-                            size="small"
-                        />
-                    )}
-                </LabeledField>
-
-                <LabeledField
-                    label="Connection mode"
-                    description="Where this run's credential comes from."
-                    withTooltip
-                >
-                    <Select<ConnectionMode>
-                        value={connection.mode}
-                        onChange={(v) => writeModel({mode: v})}
-                        options={modeOptions.map((m) => ({
-                            value: m,
-                            label: CONNECTION_MODE_LABELS[m],
-                        }))}
+                    <SelectLLMProviderBase
+                        showGroup
+                        options={modelGroups}
+                        value={modelId ?? undefined}
+                        onChange={(v) => writeModel({modelId: (v as string) ?? null})}
                         disabled={disabled}
+                        placeholder="Select a model..."
                         className="w-full"
                         size="small"
                     />
                 </LabeledField>
+            ) : (
+                <GroupedChoiceControl
+                    schema={props.model}
+                    label="Model"
+                    value={modelId}
+                    onChange={(v) => writeModel({modelId: v})}
+                    withTooltip={withTooltip}
+                    disabled={disabled}
+                />
+            )}
 
-                {connection.mode === "agenta" && (
-                    <div className="flex flex-col gap-1">
-                        {/* TODO(provider-model-auth): this becomes a Select fed by an
-                            atomWithQuery over GET /vault/connections (Slice 2) once the Fern
-                            client exposes that endpoint. Free text until then. */}
-                        <TextInputControl
-                            label="Connection name"
-                            value={connection.slug}
-                            onChange={(v) => writeModel({slug: v || null})}
-                            description="The name of a connection in this project's vault."
+            {/* Authentication: Agenta-managed (a vault connection) vs self-managed (the harness
+                uses its own login; Agenta injects nothing). Maps to connection.mode. */}
+            <div className="flex flex-col gap-2">
+                <Typography.Text className="text-sm font-medium">Authentication</Typography.Text>
+
+                <Segmented<ConnectionMode>
+                    block
+                    size="small"
+                    value={connection.mode}
+                    onChange={(v) => writeModel({mode: v})}
+                    disabled={disabled}
+                    options={modeOptions.map((m) => ({
+                        value: m,
+                        label: m === "agenta" ? "Agenta-managed" : "Self-managed",
+                    }))}
+                />
+
+                {connection.mode === "agenta" ? (
+                    <LabeledField
+                        label="Connection"
+                        description="Which stored connection supplies the credential. Project default uses the project's provider key."
+                        withTooltip
+                    >
+                        <Select<string>
+                            value={connection.slug ?? "__default__"}
+                            onChange={(v) =>
+                                writeModel({slug: v === "__default__" ? null : (v ?? null)})
+                            }
+                            options={[
+                                {value: "__default__", label: "Project default"},
+                                ...connectionOptions.map((o) => ({value: o.value, label: o.label})),
+                            ]}
                             disabled={disabled}
-                            placeholder="e.g. openai-prod"
+                            className="w-full"
+                            size="small"
+                            showSearch
+                            optionFilterProp="label"
                         />
-                        {!connection.slug && (
-                            // The backend rejects an "agenta" connection with no slug; surface
-                            // it here rather than as a raw server validation error on save.
-                            <Typography.Text type="danger" className="text-xs">
-                                A connection name is required for an Agenta connection.
-                            </Typography.Text>
-                        )}
-                    </div>
+                    </LabeledField>
+                ) : (
+                    <Typography.Text type="secondary" className="text-xs">
+                        The harness uses its own login (env var or prior OAuth). Agenta injects
+                        nothing.
+                    </Typography.Text>
                 )}
 
                 <div className="flex items-center gap-2">
diff --git a/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/connectionUtils.ts b/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/connectionUtils.ts
index 8424eb4bbb..3e65e1af32 100644
--- a/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/connectionUtils.ts
+++ b/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/connectionUtils.ts
@@ -1,21 +1,29 @@
 /**
  * connectionUtils
  *
- * Pure helpers and a static capability map for the agent config's model + credential
- * connection (the `ModelRef` shape from the provider-model-auth project). The backend
- * accepts `config.model` either as a plain string (legacy, the default connection) or as a
- * structured object `{provider?, model, params?, connection?: {mode, slug?}}` that the SDK
- * coerces into a `ModelRef`. These helpers translate between the form fields the
- * AgentConfigControl renders and that on-the-wire value, keeping the default case
- * byte-identical to today (a plain string) so existing agents do not change shape.
+ * Pure helpers for the agent config's model + credential connection (the `ModelRef` shape from
+ * the provider-model-auth project). In this POC the agent config `model` is ALWAYS a structured
+ * `ModelRef` object — `{provider, model, params?, connection?: {mode, slug?}}` — never a free-text
+ * string. The harness-filtered unified picker (provider + model + authentication + connection) is
+ * the only way to set it, and it always produces a ModelRef. These helpers translate between the
+ * form fields the AgentConfigControl renders and that on-the-wire object.
  *
- * They live in their own module (not inline in AgentConfigControl) so the package unit
- * tests can import and exercise them without a React harness.
+ * The per-harness capability surface (which providers/models/connection-modes a harness can reach)
+ * is published on the `/inspect` response `meta.harness_capabilities`; the frontend renders from it
+ * via the passed-in `HarnessCapabilitiesMap` rather than a static FE copy. When the map is absent
+ * (older agents, a standalone control) the helpers fall back permissively.
  *
- * Design: docs/design/agent-workflows/projects/provider-model-auth/design.md (Concern 1:
- * ModelRef; Concern 3b: per-harness provider/mode gating).
+ * They live in their own module (not inline in AgentConfigControl) so the package unit tests can
+ * import and exercise them without a React harness.
+ *
+ * Design: docs/design/agent-workflows/projects/agent-model-picker/ (the picker UX + inspect model
+ * list) and provider-model-auth/design.md (Concern 1: ModelRef; Concern 3b: per-harness gating).
  */
 
+import type {HarnessCapabilities, HarnessCapabilitiesMap} from "@agenta/entities/workflow"
+
+export type {HarnessCapabilities, HarnessCapabilitiesMap}
+
 /**
  * A connection mode: where the credential comes from. Two modes only — `agenta` (a vault
  * connection; project-default when no slug, named when a slug is set) and `self_managed`
@@ -25,9 +33,9 @@ export type ConnectionMode = "agenta" | "self_managed"
 
 /** The connection fields the form edits, read back from `config.model`. */
 export interface ConnectionFields {
-    /** Logical provider family (e.g. "openai", "anthropic"); null when inferred. */
+    /** Logical provider family (e.g. "openai", "anthropic"); null when not yet picked. */
     provider: string | null
-    /** Credential mode. Defaults to "agenta" (the project default) for a bare-string model. */
+    /** Credential mode. Defaults to "agenta" (the project default). */
     mode: ConnectionMode
     /** Named connection slug; only meaningful when mode === "agenta". */
     slug: string | null
@@ -53,11 +61,11 @@ function coerceMode(mode: unknown): ConnectionMode {
 }
 
 /**
- * The picked model id, whatever the stored shape: a plain string is itself; an object
- * yields its `.model`. Returns null when neither is present.
+ * The picked model id from a stored ModelRef. A legacy bare-string `model` is still read (so an
+ * older stored config still populates the picker), but the form only ever writes a ModelRef.
  */
 export function modelIdFromConfig(model: unknown): string | null {
-    if (typeof model === "string") return model
+    if (typeof model === "string") return model || null
     if (isModelRefObject(model)) {
         return typeof model.model === "string" ? model.model : null
     }
@@ -65,8 +73,8 @@ export function modelIdFromConfig(model: unknown): string | null {
 }
 
 /**
- * The connection fields behind `config.model`: a plain string is the implicit default
- * connection (no provider override); an object exposes its provider and connection.
+ * The connection fields behind a stored ModelRef. A legacy bare string is read as the default
+ * (agenta, no slug) connection with no provider; an object exposes its provider and connection.
  */
 export function connectionFromConfig(model: unknown): ConnectionFields {
     if (isModelRefObject(model)) {
@@ -86,10 +94,9 @@ export interface ComposeModelValueArgs {
     mode: ConnectionMode
     slug: string | null
     /**
-     * The prior `config.model` value. When it is a structured object, its extra keys
-     * (notably `params`, set via the raw-JSON hatch) are carried through so a form edit
-     * never silently drops them. The form-managed keys (model/provider/connection) are then
-     * overwritten from the args.
+     * The prior `config.model` value. When it is a structured object, its extra keys (notably
+     * `params`, set via the raw-JSON hatch) are carried through so a form edit never silently
+     * drops them. The form-managed keys (model/provider/connection) are then overwritten.
      */
     existing?: unknown
 }
@@ -97,13 +104,12 @@ export interface ComposeModelValueArgs {
 const FORM_MANAGED_KEYS = new Set(["model", "provider", "connection"])
 
 /**
- * Compose the `config.model` value the backend expects from the form fields.
+ * Compose the `config.model` ModelRef the backend expects from the form fields.
  *
- * Keeps the plain string for the default `agenta` connection (no slug) with no provider
- * override AND no extra keys to preserve (so existing agents stay byte-identical). Otherwise
- * returns the structured object, emitting the `connection` only when it carries non-default
- * info (a `self_managed` mode, or an `agenta` slug) and the `slug` only for an agenta
- * connection. Extra keys on the prior object (e.g. `params`) ride through.
+ * Always returns the structured object (never a bare string): the picker always produces a
+ * ModelRef. The `connection` is emitted only when it carries non-default info (a `self_managed`
+ * mode, or an `agenta` slug); the `slug` is emitted only for an agenta connection. Extra keys on
+ * the prior object (e.g. `params`) ride through.
  */
 export function composeModelValue({
     modelId,
@@ -111,7 +117,7 @@ export function composeModelValue({
     mode,
     slug,
     existing,
-}: ComposeModelValueArgs): string | Record<string, unknown> {
+}: ComposeModelValueArgs): Record<string, unknown> {
     const id = modelId ?? ""
     const hasProvider = Boolean(provider)
 
@@ -122,18 +128,11 @@ export function composeModelValue({
             if (!FORM_MANAGED_KEYS.has(key)) extras[key] = val
         }
     }
-    const hasExtras = Object.keys(extras).length > 0
-
-    // The default agenta connection (agenta + no slug) carries no info beyond the model id, so
-    // with no provider override and no extras it stays a plain string (byte-identical to today).
-    const isDefaultConnection = mode === "agenta" && !slug
-    if (isDefaultConnection && !hasProvider && !hasExtras) {
-        return id
-    }
 
     const result: Record<string, unknown> = {...extras, model: id}
     if (hasProvider) result.provider = provider
 
+    const isDefaultConnection = mode === "agenta" && !slug
     if (!isDefaultConnection) {
         const connection: Record<string, unknown> = {mode}
         if (mode === "agenta" && slug) connection.slug = slug
@@ -144,72 +143,213 @@ export function composeModelValue({
 }
 
 // ---------------------------------------------------------------------------
-// Static per-harness capability map.
-//
-// A frontend copy of `sdks/python/agenta/sdk/agents/capabilities.py`, mirroring its REAL
-// entries: pi_core/pi_agenta reach the eight vault-mapped providers; claude is anthropic-only;
-// both modes (`agenta`/`self_managed`) on every harness. A harness with no entry is permissive.
+// Harness capability gating (fed from `/inspect` meta.harness_capabilities).
 //
-// TODO(harness-capabilities): the sibling harness-capabilities project replaces this static
-// map with one fed from `/inspect` `meta.harness_capabilities`. Keep it in agreement with the
-// SDK table until then.
+// The helpers read the passed-in capability map (keyed by harness type) instead of a static FE
+// copy. A harness with no entry — or a missing map (older agents / standalone) — is permissive.
 // ---------------------------------------------------------------------------
 
-interface HarnessConnectionCapabilities {
-    providers: string[]
-    connectionModes: ConnectionMode[]
-}
-
 const ALL_MODES: ConnectionMode[] = ["agenta", "self_managed"]
 
-// The eight Agenta-vault-mapped providers Pi reaches directly (mirrors PI_VAULT_PROVIDERS in
-// the SDK capabilities table).
-const PI_VAULT_PROVIDERS = [
-    "openai",
-    "anthropic",
-    "gemini",
-    "mistral",
-    "groq",
-    "minimax",
-    "together_ai",
-    "openrouter",
-]
-
-const HARNESS_CONNECTION_CAPABILITIES: Record<string, HarnessConnectionCapabilities> = {
-    pi_core: {providers: [...PI_VAULT_PROVIDERS], connectionModes: ALL_MODES},
-    pi_agenta: {providers: [...PI_VAULT_PROVIDERS], connectionModes: ALL_MODES},
-    claude: {providers: ["anthropic"], connectionModes: ALL_MODES},
+function capsFor(
+    capabilities: HarnessCapabilitiesMap | null | undefined,
+    harness: string | null | undefined,
+): HarnessCapabilities | null {
+    if (!capabilities || !harness) return null
+    return capabilities[harness] ?? null
 }
 
 /**
- * The provider families the harness can reach. A missing harness is permissive (returns `["*"]`,
- * so the form shows a free-text provider field).
+ * The provider families the harness can reach. A missing harness/capability is permissive
+ * (returns `["*"]`, so the form shows a free-text provider field).
  */
-export function allowedProviders(harness: string | null | undefined): string[] {
-    if (!harness) return ["*"]
-    const entry = HARNESS_CONNECTION_CAPABILITIES[harness]
-    return entry ? entry.providers : ["*"]
+export function allowedProviders(
+    capabilities: HarnessCapabilitiesMap | null | undefined,
+    harness: string | null | undefined,
+): string[] {
+    const entry = capsFor(capabilities, harness)
+    return entry?.providers?.length ? entry.providers : ["*"]
 }
 
 /**
- * The connection modes the harness supports. A missing harness is permissive (returns all
- * modes).
+ * The connection modes the harness supports. A missing harness/capability is permissive (returns
+ * both modes).
  */
-export function allowedConnectionModes(harness: string | null | undefined): ConnectionMode[] {
-    if (!harness) return ALL_MODES
-    const entry = HARNESS_CONNECTION_CAPABILITIES[harness]
-    return entry ? entry.connectionModes : ALL_MODES
+export function allowedConnectionModes(
+    capabilities: HarnessCapabilitiesMap | null | undefined,
+    harness: string | null | undefined,
+): ConnectionMode[] {
+    const entry = capsFor(capabilities, harness)
+    if (!entry?.connection_modes?.length) return ALL_MODES
+    return entry.connection_modes.filter(
+        (m): m is ConnectionMode => m === "agenta" || m === "self_managed",
+    )
 }
 
 /**
- * Whether the harness can reach the provider. A `"*"` entry matches any provider; otherwise
- * the match is case-insensitive on the provider family. A missing harness is permissive.
+ * Whether the harness can reach the provider. A `"*"` entry matches any provider; otherwise the
+ * match is case-insensitive on the provider family. A missing harness/capability is permissive.
  */
 export function harnessAllowsProvider(
+    capabilities: HarnessCapabilitiesMap | null | undefined,
     harness: string | null | undefined,
     provider: string,
 ): boolean {
-    const providers = allowedProviders(harness)
+    const providers = allowedProviders(capabilities, harness)
     if (providers.includes("*")) return true
     return providers.some((p) => p.toLowerCase() === provider.toLowerCase())
 }
+
+/**
+ * How the selected harness names a model: `"provider/id"` (Pi — value is the catalog id, provider
+ * is derived from the group) or `"alias"` (Claude — value is the bare alias, provider is the
+ * group). Defaults to `"provider/id"` when unknown.
+ */
+export function modelSelectionMode(
+    capabilities: HarnessCapabilitiesMap | null | undefined,
+    harness: string | null | undefined,
+): string {
+    return capsFor(capabilities, harness)?.model_selection ?? "provider/id"
+}
+
+// ---------------------------------------------------------------------------
+// Harness-filtered, unified model picker (provider + model in one control).
+// ---------------------------------------------------------------------------
+
+/** A grouped model option group, the shape SelectLLMProviderBase / GroupedChoiceControl expect. */
+export interface ModelOptionGroup {
+    label: string
+    options: {label: string; value: string; metadata?: Record<string, unknown>}[]
+}
+
+/** Optional per-model pricing metadata keyed `{provider: {modelId: {input, output}}}`. */
+export type ModelMetadataMap = Record<string, Record<string, Record<string, unknown>>>
+
+function titleizeProvider(provider: string): string {
+    return provider.charAt(0).toUpperCase() + provider.slice(1).replace(/_/g, " ")
+}
+
+/**
+ * Build the grouped model options for the harness from `capabilities[harness].models`
+ * (provider -> ids/aliases). Each option's `value` is the model id/alias and its group label is
+ * the provider — so selecting an option yields both the model and the provider it belongs to.
+ * Pricing rides along from `metadata` when present. Returns `[]` when the harness publishes no
+ * models (the caller then falls back to the schema's full catalog).
+ */
+export function buildModelOptionGroups(
+    capabilities: HarnessCapabilitiesMap | null | undefined,
+    harness: string | null | undefined,
+    metadata?: ModelMetadataMap | null,
+): ModelOptionGroup[] {
+    const models = capsFor(capabilities, harness)?.models
+    if (!models) return []
+    return Object.entries(models)
+        .filter(([, ids]) => Array.isArray(ids) && ids.length > 0)
+        .map(([provider, ids]) => ({
+            label: titleizeProvider(provider),
+            options: ids.map((id) => ({
+                label: id,
+                value: id,
+                metadata: metadata?.[provider]?.[id],
+            })),
+        }))
+}
+
+/**
+ * The provider family that owns a picked model id, derived from the harness's published models
+ * (the group the id sits in). Returns null when the id is not in any group (e.g. a stale id under
+ * a switched harness). Use this so picking a model sets BOTH provider and model.
+ */
+export function providerForModel(
+    capabilities: HarnessCapabilitiesMap | null | undefined,
+    harness: string | null | undefined,
+    modelId: string | null | undefined,
+): string | null {
+    if (!modelId) return null
+    const models = capsFor(capabilities, harness)?.models
+    if (!models) return null
+    for (const [provider, ids] of Object.entries(models)) {
+        if (Array.isArray(ids) && ids.includes(modelId)) return provider
+    }
+    return null
+}
+
+/**
+ * Whether a model id is reachable under the harness (present in any of its published model
+ * groups). A harness with no published models is permissive (returns true) so the schema-catalog
+ * fallback path is not over-cleared. Use to clear an unreachable model on harness switch.
+ */
+export function harnessAllowsModel(
+    capabilities: HarnessCapabilitiesMap | null | undefined,
+    harness: string | null | undefined,
+    modelId: string | null | undefined,
+): boolean {
+    if (!modelId) return true
+    const models = capsFor(capabilities, harness)?.models
+    if (!models || Object.keys(models).length === 0) return true
+    return Object.values(models).some((ids) => Array.isArray(ids) && ids.includes(modelId))
+}
+
+// ---------------------------------------------------------------------------
+// Connection picker (Agenta-managed): named connections from the vault list.
+//
+// Fed by the existing `GET /secrets/` via vaultSecretsQueryAtom (read-only). The backend resolver
+// matches a named connection by the secret's `header.name` (the slug); the transformed vault list
+// surfaces custom-provider connections as `{type: "custom_provider", name, provider}`. We list
+// those, filtered to the selected provider and the harness's reachable providers.
+// ---------------------------------------------------------------------------
+
+/** A vault entry as surfaced by transformSecret (the in-app `LlmProvider` shape, loosely typed). */
+export interface VaultConnectionEntry {
+    /** The secret kind: "provider_key" (standard, no custom name) or "custom_provider". */
+    type?: string
+    /** For custom_provider: the connection name (== header.name == the slug to send). */
+    name?: string
+    /** For custom_provider: the provider family (data.kind). */
+    provider?: string
+    /** For standard provider_key: the provider title (the provider family). */
+    title?: string
+}
+
+/** A named-connection option for the picker: `{label: header.name, value: slug}`. */
+export interface ConnectionOption {
+    label: string
+    value: string
+}
+
+/**
+ * Named connections selectable for a provider under a harness, from the vault list. Only
+ * custom-provider secrets carry a connection name (the slug the resolver matches on); standard
+ * provider keys are the implicit project default and are not listed here. Filtered to the chosen
+ * provider (case-insensitive) and, when no provider is chosen, to the harness's reachable
+ * providers.
+ */
+export function namedConnectionOptions(
+    secrets: VaultConnectionEntry[] | null | undefined,
+    capabilities: HarnessCapabilitiesMap | null | undefined,
+    harness: string | null | undefined,
+    provider: string | null | undefined,
+): ConnectionOption[] {
+    if (!secrets?.length) return []
+    const reachable = allowedProviders(capabilities, harness)
+    const anyProvider = reachable.includes("*")
+    const target = provider?.toLowerCase() || null
+
+    const out: ConnectionOption[] = []
+    const seen = new Set<string>()
+    for (const secret of secrets) {
+        if (secret.type !== "custom_provider") continue
+        const slug = secret.name?.trim()
+        if (!slug || seen.has(slug)) continue
+        const secretProvider = secret.provider?.toLowerCase() || null
+        if (target) {
+            if (secretProvider !== target) continue
+        } else if (!anyProvider && secretProvider) {
+            // No provider chosen yet: keep only connections the harness can reach.
+            if (!reachable.some((p) => p.toLowerCase() === secretProvider)) continue
+        }
+        seen.add(slug)
+        out.push({label: slug, value: slug})
+    }
+    return out
+}
diff --git a/web/packages/agenta-entity-ui/tests/unit/connectionUtils.test.ts b/web/packages/agenta-entity-ui/tests/unit/connectionUtils.test.ts
index 9ba173a5f6..f4e83892e7 100644
--- a/web/packages/agenta-entity-ui/tests/unit/connectionUtils.test.ts
+++ b/web/packages/agenta-entity-ui/tests/unit/connectionUtils.test.ts
@@ -1,28 +1,59 @@
 /**
- * Unit tests for the pure ModelRef <-> form helpers in connectionUtils.
+ * Unit tests for the pure ModelRef <-> form helpers and the harness-filtered picker helpers in
+ * connectionUtils.
  *
- * These back the agent config's Connection sub-form (provider-model-auth, PR 5). The
- * helpers are extracted so the round-trip between `config.model` and the form fields is
- * testable without a React harness. Runs under @agenta/entity-ui's own vitest runner.
+ * These back the agent config's unified provider + model + connection picker (agent-model-picker).
+ * The model is ALWAYS a structured ModelRef (never a bare string); the picker filters to what the
+ * selected harness publishes on `/inspect` meta.harness_capabilities. The helpers are extracted so
+ * the round-trip and the option-building are testable without a React harness. Runs under
+ * @agenta/entity-ui's own vitest runner.
  */
 import {describe, expect, it} from "vitest"
 
 import {
     allowedConnectionModes,
     allowedProviders,
+    buildModelOptionGroups,
     composeModelValue,
     connectionFromConfig,
+    harnessAllowsModel,
     harnessAllowsProvider,
     modelIdFromConfig,
+    modelSelectionMode,
+    namedConnectionOptions,
+    providerForModel,
+    type HarnessCapabilitiesMap,
 } from "../../src/DrillInView/SchemaControls/connectionUtils"
 
+// An inspect-shaped capability map (the `/inspect` meta.harness_capabilities payload).
+const CAPABILITIES: HarnessCapabilitiesMap = {
+    pi_core: {
+        providers: ["openai", "anthropic", "gemini"],
+        deployments: ["direct"],
+        connection_modes: ["agenta", "self_managed"],
+        model_selection: "provider/id",
+        models: {
+            openai: ["gpt-5.5", "gpt-5.4"],
+            anthropic: ["anthropic/claude-opus-4-7"],
+            gemini: ["gemini/gemini-2.5-pro"],
+        },
+    },
+    claude: {
+        providers: ["anthropic"],
+        deployments: ["direct", "custom", "bedrock"],
+        connection_modes: ["agenta", "self_managed"],
+        model_selection: "alias",
+        models: {anthropic: ["opus", "sonnet", "opus[1m]"]},
+    },
+}
+
 describe("connectionUtils: modelIdFromConfig", () => {
-    it("returns a plain string model as itself", () => {
-        expect(modelIdFromConfig("gpt-5.5")).toBe("gpt-5.5")
+    it("reads .model from a structured ModelRef", () => {
+        expect(modelIdFromConfig({model: "gpt-5.5", provider: "openai"})).toBe("gpt-5.5")
     })
 
-    it("reads .model from a structured object", () => {
-        expect(modelIdFromConfig({model: "gpt-5.5", provider: "openai"})).toBe("gpt-5.5")
+    it("still reads a legacy bare-string model", () => {
+        expect(modelIdFromConfig("gpt-5.5")).toBe("gpt-5.5")
     })
 
     it("returns null for absent or malformed values", () => {
@@ -34,15 +65,7 @@ describe("connectionUtils: modelIdFromConfig", () => {
 })
 
 describe("connectionUtils: connectionFromConfig", () => {
-    it("treats a plain string as the implicit default (agenta, no slug) connection", () => {
-        expect(connectionFromConfig("gpt-5.5")).toEqual({
-            provider: null,
-            mode: "agenta",
-            slug: null,
-        })
-    })
-
-    it("reads provider and connection from a structured object", () => {
+    it("reads provider and connection from a ModelRef", () => {
         expect(
             connectionFromConfig({
                 model: "gpt-5.5",
@@ -54,7 +77,6 @@ describe("connectionUtils: connectionFromConfig", () => {
 
     it("defaults the mode to agenta when the connection block is absent or unknown", () => {
         expect(connectionFromConfig({model: "gpt-5.5"}).mode).toBe("agenta")
-        // The removed "default" mode (and any bogus value) maps to agenta.
         expect(connectionFromConfig({model: "gpt-5.5", connection: {mode: "default"}}).mode).toBe(
             "agenta",
         )
@@ -62,23 +84,20 @@ describe("connectionUtils: connectionFromConfig", () => {
             "agenta",
         )
     })
-})
 
-describe("connectionUtils: composeModelValue", () => {
-    it("keeps the plain string for the default (agenta, no slug) connection with no provider", () => {
-        expect(
-            composeModelValue({modelId: "gpt-5.5", provider: null, mode: "agenta", slug: null}),
-        ).toBe("gpt-5.5")
+    it("reads a legacy bare string as the default connection with no provider", () => {
+        expect(connectionFromConfig("gpt-5.5")).toEqual({
+            provider: null,
+            mode: "agenta",
+            slug: null,
+        })
     })
+})
 
-    it("emits a structured object once a provider is overridden", () => {
+describe("connectionUtils: composeModelValue (always a ModelRef)", () => {
+    it("returns a structured object even for the default connection (no bare string)", () => {
         expect(
-            composeModelValue({
-                modelId: "gpt-5.5",
-                provider: "openai",
-                mode: "agenta",
-                slug: null,
-            }),
+            composeModelValue({modelId: "gpt-5.5", provider: "openai", mode: "agenta", slug: null}),
         ).toEqual({model: "gpt-5.5", provider: "openai"})
     })
 
@@ -100,21 +119,12 @@ describe("connectionUtils: composeModelValue", () => {
     it("omits the slug for a self_managed connection", () => {
         expect(
             composeModelValue({
-                modelId: "claude-opus-4-8",
-                provider: null,
+                modelId: "opus",
+                provider: "anthropic",
                 mode: "self_managed",
                 slug: null,
             }),
-        ).toEqual({model: "claude-opus-4-8", connection: {mode: "self_managed"}})
-    })
-
-    it("round-trips a default string through the helpers as a string", () => {
-        const fields = connectionFromConfig("gpt-5.5")
-        const round = composeModelValue({
-            modelId: modelIdFromConfig("gpt-5.5"),
-            ...fields,
-        })
-        expect(round).toBe("gpt-5.5")
+        ).toEqual({model: "opus", provider: "anthropic", connection: {mode: "self_managed"}})
     })
 
     it("round-trips a structured object through the helpers", () => {
@@ -128,80 +138,130 @@ describe("connectionUtils: composeModelValue", () => {
         expect(round).toEqual(value)
     })
 
-    it("preserves extra ModelRef keys (params) on a form edit", () => {
+    it("preserves extra ModelRef keys (params) and provider on a form edit", () => {
         const existing = {
             model: "gpt-5.5",
+            provider: "openai",
             params: {reasoning_effort: "high"},
             connection: {mode: "agenta", slug: "openai-prod"},
         }
-        // The user swaps the model id; provider/connection/params must survive.
         const fields = connectionFromConfig(existing)
-        const round = composeModelValue({
-            modelId: "gpt-5.6",
-            ...fields,
-            existing,
-        })
+        const round = composeModelValue({modelId: "gpt-5.6", ...fields, existing})
         expect(round).toEqual({
             params: {reasoning_effort: "high"},
             model: "gpt-5.6",
+            provider: "openai",
             connection: {mode: "agenta", slug: "openai-prod"},
         })
     })
+})
 
-    it("keeps extras even for a default connection (no longer a bare string)", () => {
-        const existing = {model: "gpt-5.5", params: {temperature: 0.2}}
-        const round = composeModelValue({
-            modelId: "gpt-5.5",
-            provider: null,
-            mode: "agenta",
-            slug: null,
-            existing,
-        })
-        expect(round).toEqual({params: {temperature: 0.2}, model: "gpt-5.5"})
+describe("connectionUtils: capability gating (inspect-fed)", () => {
+    it("reads providers and modes from the passed-in capability map", () => {
+        expect(allowedProviders(CAPABILITIES, "pi_core")).toEqual(["openai", "anthropic", "gemini"])
+        expect(allowedProviders(CAPABILITIES, "claude")).toEqual(["anthropic"])
+        expect(allowedConnectionModes(CAPABILITIES, "pi_core")).toEqual(["agenta", "self_managed"])
+        expect(harnessAllowsProvider(CAPABILITIES, "claude", "anthropic")).toBe(true)
+        expect(harnessAllowsProvider(CAPABILITIES, "claude", "Anthropic")).toBe(true)
+        expect(harnessAllowsProvider(CAPABILITIES, "claude", "openai")).toBe(false)
     })
 
-    it("changing the model id preserves a set connection", () => {
-        const existing = {
-            model: "gpt-5.5",
-            provider: "openai",
-            connection: {mode: "agenta", slug: "openai-prod"},
-        }
-        const fields = connectionFromConfig(existing)
-        const round = composeModelValue({modelId: "gpt-5.6", ...fields, existing})
-        expect(round).toEqual({
-            model: "gpt-5.6",
-            provider: "openai",
-            connection: {mode: "agenta", slug: "openai-prod"},
+    it("exposes the per-harness model selection mode", () => {
+        expect(modelSelectionMode(CAPABILITIES, "pi_core")).toBe("provider/id")
+        expect(modelSelectionMode(CAPABILITIES, "claude")).toBe("alias")
+    })
+
+    it("is permissive when the map or harness is missing", () => {
+        expect(allowedProviders(null, "pi_core")).toEqual(["*"])
+        expect(allowedProviders(CAPABILITIES, "future-harness")).toEqual(["*"])
+        expect(allowedProviders(CAPABILITIES, null)).toEqual(["*"])
+        expect(allowedConnectionModes(undefined, "pi_core")).toEqual(["agenta", "self_managed"])
+        expect(harnessAllowsProvider(CAPABILITIES, "future-harness", "whatever")).toBe(true)
+        expect(modelSelectionMode(null, "pi_core")).toBe("provider/id")
+    })
+})
+
+describe("connectionUtils: harness-filtered model picker", () => {
+    it("builds grouped options from the harness's published models", () => {
+        const groups = buildModelOptionGroups(CAPABILITIES, "pi_core")
+        const byLabel = Object.fromEntries(
+            groups.map((g) => [g.label, g.options.map((o) => o.value)]),
+        )
+        expect(byLabel["Openai"]).toEqual(["gpt-5.5", "gpt-5.4"])
+        expect(byLabel["Anthropic"]).toEqual(["anthropic/claude-opus-4-7"])
+        expect(byLabel["Gemini"]).toEqual(["gemini/gemini-2.5-pro"])
+    })
+
+    it("groups Claude aliases under anthropic (alias selection)", () => {
+        const groups = buildModelOptionGroups(CAPABILITIES, "claude")
+        expect(groups).toHaveLength(1)
+        expect(groups[0].label).toBe("Anthropic")
+        expect(groups[0].options.map((o) => o.value)).toEqual(["opus", "sonnet", "opus[1m]"])
+    })
+
+    it("attaches pricing metadata when provided", () => {
+        const metadata = {openai: {"gpt-5.5": {input: 1, output: 2}}}
+        const groups = buildModelOptionGroups(CAPABILITIES, "pi_core", metadata)
+        const openai = groups.find((g) => g.label === "Openai")!
+        expect(openai.options.find((o) => o.value === "gpt-5.5")?.metadata).toEqual({
+            input: 1,
+            output: 2,
         })
     })
+
+    it("returns [] when the harness publishes no models (FE falls back to the catalog)", () => {
+        expect(buildModelOptionGroups(null, "pi_core")).toEqual([])
+        expect(buildModelOptionGroups(CAPABILITIES, "future-harness")).toEqual([])
+    })
+
+    it("derives the provider from the picked model (sets both provider and model)", () => {
+        expect(providerForModel(CAPABILITIES, "pi_core", "gpt-5.5")).toBe("openai")
+        expect(providerForModel(CAPABILITIES, "pi_core", "gemini/gemini-2.5-pro")).toBe("gemini")
+        // Claude alias derives anthropic.
+        expect(providerForModel(CAPABILITIES, "claude", "opus")).toBe("anthropic")
+        // A stale id under the wrong harness derives nothing.
+        expect(providerForModel(CAPABILITIES, "claude", "gpt-5.5")).toBeNull()
+    })
+
+    it("clears a model unreachable under a switched harness", () => {
+        // gpt-5.5 is a pi_core model; not reachable on claude.
+        expect(harnessAllowsModel(CAPABILITIES, "pi_core", "gpt-5.5")).toBe(true)
+        expect(harnessAllowsModel(CAPABILITIES, "claude", "gpt-5.5")).toBe(false)
+        expect(harnessAllowsModel(CAPABILITIES, "claude", "opus")).toBe(true)
+        // No published models -> permissive (don't over-clear the catalog fallback).
+        expect(harnessAllowsModel(CAPABILITIES, "future-harness", "anything")).toBe(true)
+        expect(harnessAllowsModel(CAPABILITIES, "pi_core", null)).toBe(true)
+    })
 })
 
-describe("connectionUtils: harness capability gating", () => {
-    it("pi_core and pi_agenta reach the vault providers (real list, not a wildcard) and both modes", () => {
-        // Real list, not "*": the eight vault-mapped providers (mirrors the SDK table).
-        expect(allowedProviders("pi_core")).toContain("openai")
-        expect(allowedProviders("pi_core")).toContain("together_ai")
-        expect(allowedProviders("pi_core")).not.toContain("*")
-        expect(allowedProviders("pi_agenta")).toEqual(allowedProviders("pi_core"))
-        expect(allowedConnectionModes("pi_core")).toEqual(["agenta", "self_managed"])
-        expect(harnessAllowsProvider("pi_core", "openai")).toBe(true)
-        // An unmapped provider is NOT reachable (the wildcard is gone).
-        expect(harnessAllowsProvider("pi_core", "anything")).toBe(false)
-    })
-
-    it("claude is narrow: anthropic only", () => {
-        expect(allowedProviders("claude")).toEqual(["anthropic"])
-        expect(harnessAllowsProvider("claude", "anthropic")).toBe(true)
-        expect(harnessAllowsProvider("claude", "Anthropic")).toBe(true)
-        expect(harnessAllowsProvider("claude", "openai")).toBe(false)
-        // both connection modes
-        expect(allowedConnectionModes("claude")).toEqual(["agenta", "self_managed"])
-    })
-
-    it("is permissive for an unknown or missing harness", () => {
-        expect(allowedProviders("future-harness")).toEqual(["*"])
-        expect(allowedProviders(null)).toEqual(["*"])
-        expect(allowedConnectionModes(undefined)).toEqual(["agenta", "self_managed"])
-        expect(harnessAllowsProvider("future-harness", "whatever")).toBe(true)
+describe("connectionUtils: named connection options (vault list)", () => {
+    const SECRETS = [
+        {type: "provider_key", title: "openai", name: "OPENAI_API_KEY"},
+        {type: "custom_provider", name: "openai-prod", provider: "openai"},
+        {type: "custom_provider", name: "openai-staging", provider: "openai"},
+        {type: "custom_provider", name: "anthropic-prod", provider: "anthropic"},
+    ]
+
+    it("lists custom-provider connections filtered to the chosen provider", () => {
+        const opts = namedConnectionOptions(SECRETS, CAPABILITIES, "pi_core", "openai")
+        expect(opts.map((o) => o.value)).toEqual(["openai-prod", "openai-staging"])
+        // value is the slug == header.name (what the resolver matches on).
+        expect(opts[0]).toEqual({label: "openai-prod", value: "openai-prod"})
+    })
+
+    it("excludes standard provider_key secrets (the implicit project default)", () => {
+        const opts = namedConnectionOptions(SECRETS, CAPABILITIES, "pi_core", "openai")
+        expect(opts.some((o) => o.value === "OPENAI_API_KEY")).toBe(false)
+    })
+
+    it("with no provider chosen, keeps only connections the harness can reach", () => {
+        // claude reaches anthropic only -> openai connections dropped.
+        const opts = namedConnectionOptions(SECRETS, CAPABILITIES, "claude", null)
+        expect(opts.map((o) => o.value)).toEqual(["anthropic-prod"])
+    })
+
+    it("returns [] for an empty or missing vault list", () => {
+        expect(namedConnectionOptions([], CAPABILITIES, "pi_core", "openai")).toEqual([])
+        expect(namedConnectionOptions(null, CAPABILITIES, "pi_core", "openai")).toEqual([])
     })
 })

From c406a5d2724c33dbd0eb4f4ac6f10c5188769bf8 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Thu, 25 Jun 2026 12:38:33 +0200
Subject: [PATCH 6/6] fix(agent): publish harness capabilities on
 request-driven /inspect

The agent playground's model picker showed an empty provider/model list: the
agent /inspect response carried no meta.harness_capabilities, so the FE had
nothing to render. Two breaks, fixed here.

Backend: the routed agent workflow sets meta=harness_capabilities, but the
playground posts /inspect WITH a revision, which takes the request-driven
inspect_workflow path. That builds a fresh workflow from the request (no meta)
and never consults the routed instance's meta, so the capabilities were dropped.
WorkflowRevisionData has no meta field, so the interface registry cannot carry
it. Add a META_REGISTRY (mirrors register_interface/retrieve_interface); the
agent service registers its meta under the builtin URI; workflow.inspect()
merges the registered meta (request/decorator meta wins per key). Now /inspect
emits meta.harness_capabilities on the request-driven path too.

Frontend: the inspect atom skipped the fetch when the revision already carried
all schemas inline (a redundancy optimization). The agent stores its schemas
inline, so inspect never ran and the capabilities never reached the FE. Fetch
inspect for the agent regardless (detected by the builtin agent URI, since the
is_agent flag is not reliably stored).

Regression test pins the exact broken path: a request-driven inspect of the
agent URI normalizes to a response carrying per-harness models.

Claude-Session: https://claude.ai/code/session_01GYo3UEfvsZpncagqb28Mbc
---
 sdks/python/agenta/sdk/decorators/running.py  | 13 ++++++-
 .../agenta/sdk/engines/running/utils.py       | 31 +++++++++++++++
 services/oss/src/agent/app.py                 | 20 +++++++---
 .../unit/agent/test_builtin_uri_binding.py    | 39 ++++++++++++++++++-
 .../src/workflow/state/store.ts               | 18 ++++++++-
 5 files changed, 113 insertions(+), 8 deletions(-)

diff --git a/sdks/python/agenta/sdk/decorators/running.py b/sdks/python/agenta/sdk/decorators/running.py
index 74f151081b..7b58c22a23 100644
--- a/sdks/python/agenta/sdk/decorators/running.py
+++ b/sdks/python/agenta/sdk/decorators/running.py
@@ -43,6 +43,7 @@
     retrieve_handler,
     retrieve_interface,
     retrieve_configuration,
+    retrieve_meta,
     parse_uri,
 )
 from agenta.sdk.engines.running.handlers import remote_forward_v0
@@ -455,6 +456,16 @@ async def inspect(
                 running_ctx.parameters = self.parameters
 
                 if self.default_request is None:
+                    # The request-driven inspect path builds a fresh workflow from the request
+                    # (no `meta`), so the routed instance's `meta` (e.g. the agent's
+                    # `harness_capabilities`) is unreachable. Fall back to the URI's registered
+                    # interface meta, with the request/decorator meta winning per key.
+                    registered_meta = retrieve_meta(self.uri)
+                    inspect_meta = (
+                        {**registered_meta, **(self.meta or {})}
+                        if registered_meta
+                        else self.meta
+                    )
                     self.default_request = WorkflowInvokeRequest(
                         #
                         references=self.references,
@@ -464,7 +475,7 @@ async def inspect(
                         #
                         flags=self.flags,
                         tags=self.tags,
-                        meta=self.meta,
+                        meta=inspect_meta,
                         #
                         data=WorkflowRequestData(
                             revision=WorkflowRevision(
diff --git a/sdks/python/agenta/sdk/engines/running/utils.py b/sdks/python/agenta/sdk/engines/running/utils.py
index 1ecec3a9ab..83fa7cde6f 100644
--- a/sdks/python/agenta/sdk/engines/running/utils.py
+++ b/sdks/python/agenta/sdk/engines/running/utils.py
@@ -323,6 +323,12 @@ def _catalog_entry() -> dict:
     ),
 )
 
+# Per-URI interface metadata (the `/inspect` response `meta`). `WorkflowRevisionData` has no
+# `meta` field, so the routed workflow's `meta=` (e.g. the agent's `harness_capabilities`) cannot
+# ride the interface registry; this registry carries it so the request-driven inspect path can
+# emit it. Process-local, like the others.
+META_REGISTRY: dict = dict()
+
 # Global registry for workflow handlers organized by URI structure.
 #
 # URI Format: provider:kind:key:version
@@ -535,6 +541,31 @@ def retrieve_configuration(uri: Optional[str] = None) -> Optional[dict]:
     return _get_with_latest(CONFIGURATION_REGISTRY, provider, kind, key, version)
 
 
+def register_meta(meta: dict, uri: str) -> str:
+    """Register (or OVERRIDE) the interface ``meta`` for a URI in the meta registry.
+
+    Mirrors :func:`register_interface`: a service process binds the ``meta`` it wants ``/inspect``
+    to publish under a builtin URI (e.g. the agent service registers ``harness_capabilities``).
+    Process-local. The stored dict is copied so a later mutation of the caller's dict cannot leak
+    into the registry.
+    """
+    provider, kind, key, version = parse_uri(uri)
+    if not provider or not kind or not key or not version:
+        raise ValueError(f"Invalid URI: {uri}")
+    META_REGISTRY.setdefault(provider, {}).setdefault(kind, {}).setdefault(key, {})[
+        version
+    ] = dict(meta)
+    return uri
+
+
+def retrieve_meta(uri: Optional[str] = None) -> Optional[dict]:
+    if not uri:
+        return None
+    provider, kind, key, version = parse_uri(uri)
+
+    return _get_with_latest(META_REGISTRY, provider, kind, key, version)
+
+
 def is_user_custom_uri(uri: Optional[str] = None) -> bool:
     if not uri:
         return True
diff --git a/services/oss/src/agent/app.py b/services/oss/src/agent/app.py
index 0dcf81e4ae..00a4ed5077 100644
--- a/services/oss/src/agent/app.py
+++ b/services/oss/src/agent/app.py
@@ -47,7 +47,11 @@
 from agenta.sdk.agents.platform import resolve_connection
 
 from agenta.sdk.decorators.tracing import auto_instrument
-from agenta.sdk.engines.running.utils import register_handler, register_interface
+from agenta.sdk.engines.running.utils import (
+    register_handler,
+    register_interface,
+    register_meta,
+)
 from agenta.sdk.models.workflows import WorkflowRevisionData
 
 from agenta.sdk.utils.logging import get_module_logger
@@ -296,10 +300,15 @@ def create_agent_app():
     #    only instruments inside `_register_handler`, which it skips once a handler exists in the
     #    registry. Registering the raw `_agent` would lose tracing instrumentation; registering the
     #    instrumented one keeps it (mirrors chat.py, whose registry handler is pre-instrumented).
-    # 2. OVERRIDE the interface registry with the service interface (AGENT_SCHEMAS + the inspect
-    #    `meta`), so `retrieve_interface(AGENT_URI)` returns the SAME data `/inspect` advertises.
-    #    `register_interface` replaces (not setdefault), unlike the SDK's minimal seed.
-    # 3. Build the workflow against the URI. `ag.workflow.__init__` then resolves the (instrumented)
+    # 2. OVERRIDE the interface registry with the service interface (AGENT_SCHEMAS), so
+    #    `retrieve_interface(AGENT_URI)` returns the SAME schemas `/inspect` advertises.
+    #    `register_interface` replaces (not setdefault), unlike the SDK's minimal seed. It carries
+    #    only `WorkflowRevisionData` (schemas), which has no `meta` field — so the inspect `meta`
+    #    goes through `register_meta` (step 3), not here.
+    # 3. Register the inspect `meta` for the URI. The request-driven `/inspect` path builds a fresh
+    #    workflow from the request (which has no `meta`), so the routed instance's `meta=` below
+    #    would not survive; the meta registry is what `workflow.inspect()` reads to emit it.
+    # 4. Build the workflow against the URI. `ag.workflow.__init__` resolves the (instrumented)
     #    handler and merges the registered interface; the passed `schemas`/`meta` still win.
     #
     # The per-harness connection capability rides the inspect response `meta`, NOT a fourth
@@ -313,6 +322,7 @@ def create_agent_app():
         WorkflowRevisionData(uri=AGENT_URI, schemas=AGENT_SCHEMAS),
         uri=AGENT_URI,
     )
+    register_meta(meta, uri=AGENT_URI)
     routed = ag.workflow(uri=AGENT_URI, schemas=AGENT_SCHEMAS, meta=meta)(_agent)
     # is_agent gates the agent-only `/messages` route (next to /invoke).
     ag.route("/", app=app, flags={"is_chat": True, "is_agent": True})(routed)
diff --git a/services/oss/tests/pytest/unit/agent/test_builtin_uri_binding.py b/services/oss/tests/pytest/unit/agent/test_builtin_uri_binding.py
index f5d1cae2ab..75eaf8ac8c 100644
--- a/services/oss/tests/pytest/unit/agent/test_builtin_uri_binding.py
+++ b/services/oss/tests/pytest/unit/agent/test_builtin_uri_binding.py
@@ -15,7 +15,16 @@
 
 from __future__ import annotations
 
-from agenta.sdk.engines.running.utils import retrieve_handler, retrieve_interface
+import pytest
+
+from agenta.sdk.decorators.routing import _to_inspect_response
+from agenta.sdk.decorators.running import inspect_workflow
+from agenta.sdk.engines.running.utils import (
+    retrieve_handler,
+    retrieve_interface,
+    retrieve_meta,
+)
+from agenta.sdk.models.workflows import WorkflowInspectRequest
 
 from oss.src.agent import app
 from oss.src.agent.schemas import AGENT_SCHEMAS
@@ -57,3 +66,31 @@ def test_rebuilding_the_app_keeps_the_binding_stable():
     assert retrieve_handler(_AGENT_URI) is not None
     interface = retrieve_interface(_AGENT_URI)
     assert interface is not None and interface.uri == _AGENT_URI
+
+
+def test_register_meta_publishes_harness_capabilities():
+    # The interface registry carries only schemas (`WorkflowRevisionData` has no `meta`), so the
+    # agent's `harness_capabilities` rides the meta registry instead.
+    meta = retrieve_meta(_AGENT_URI)
+    assert meta is not None
+    assert "harness_capabilities" in meta
+    assert set(meta["harness_capabilities"]) == {"pi_core", "pi_agenta", "claude"}
+
+
+@pytest.mark.asyncio
+async def test_request_driven_inspect_emits_harness_capabilities():
+    # The exact broken path: the playground posts `/inspect` with a `revision`, so the endpoint
+    # takes the request-driven `inspect_workflow` branch (NOT `wf.inspect()`). The request carries
+    # no `meta`, so without the meta registry the per-harness models would be dropped. Assert the
+    # normalized response carries them — and that each harness publishes its model map.
+    built = await inspect_workflow(
+        request=WorkflowInspectRequest(revision={"uri": _AGENT_URI})
+    )
+    response = _to_inspect_response(built)
+
+    assert response.meta is not None
+    caps = response.meta["harness_capabilities"]
+    assert set(caps) == {"pi_core", "pi_agenta", "claude"}
+    # Claude reaches anthropic only, with its alias set; Pi publishes per-provider model lists.
+    assert caps["claude"]["models"]["anthropic"]
+    assert caps["pi_core"]["models"]["openai"]
diff --git a/web/packages/agenta-entities/src/workflow/state/store.ts b/web/packages/agenta-entities/src/workflow/state/store.ts
index 83531a1cb8..b4e27e005a 100644
--- a/web/packages/agenta-entities/src/workflow/state/store.ts
+++ b/web/packages/agenta-entities/src/workflow/state/store.ts
@@ -56,6 +56,11 @@ import {
 // HELPERS
 // ============================================================================
 
+// The builtin agent workflow URI. The agent publishes its per-harness capabilities only in
+// the `/inspect` response `meta`, so the inspect atom must fetch for this URI even when the
+// stored revision already carries its schemas inline.
+const AGENT_BUILTIN_URI = "agenta:builtin:agent:v0"
+
 function getStore(options?: StoreOptions) {
     return options?.store ?? getDefaultStore()
 }
@@ -1053,12 +1058,23 @@ export const workflowInspectAtomFamily = atomFamily((revisionId: string) =>
         // Skip inspect when the revision already carries all schemas inline.
         // The merge step (workflowEntityAtomFamily) gives server schemas
         // precedence, so fetching inspect would be redundant.
+        // Exception: an agent publishes its per-harness capabilities ONLY in the inspect
+        // response `meta` (harness_capabilities), never in the stored schemas — so the agent
+        // playground's model picker needs inspect even when the schemas are inline. The
+        // `is_agent` flag is not reliably stored on the revision, so detect the agent by its
+        // builtin URI too.
         const serverSchemas = serverData?.data?.schemas
+        const isAgent = (serverData?.flags?.is_agent ?? false) || uri === AGENT_BUILTIN_URI
         const hasAllSchemas =
             !!serverSchemas?.inputs && !!serverSchemas?.outputs && !!serverSchemas?.parameters
 
         const isEnabled =
-            get(sessionAtom) && !!projectId && !!uri && !!serviceUrl && hasUrl && !hasAllSchemas
+            get(sessionAtom) &&
+            !!projectId &&
+            !!uri &&
+            !!serviceUrl &&
+            hasUrl &&
+            (isAgent || !hasAllSchemas)
 
         return {
             queryKey: ["workflows", "inspect", revisionId, uri, serviceUrl, projectId],