diff --git a/docs/design/agent-workflows/documentation/ground-truth.md b/docs/design/agent-workflows/documentation/ground-truth.md index c19454276e..173692d7c4 100644 --- a/docs/design/agent-workflows/documentation/ground-truth.md +++ b/docs/design/agent-workflows/documentation/ground-truth.md @@ -63,8 +63,12 @@ this page and the referenced code as the source of truth. - Warm daemon sessions, ACP `session/load`, and session fork are not wired. - `AgentaHarness` ships placeholder Agenta preamble, persona, and skill set. It does run on sandbox-agent local and Daytona, verified by the QA matrix (`projects/qa/findings.md`, F-002). -- The agent is not registered as a first-class built-in workflow type. The builtin interface - exists in the SDK, but the handler is still bound directly (`services/oss/src/agent/app.py:138`). +- The live agent handler is bound to the builtin URI `agenta:builtin:agent:v0`: + `create_agent_app()` (`services/oss/src/agent/app.py`) registers the instrumented `_agent` and the + service interface under that URI, so `retrieve_handler` / `retrieve_interface` return the live + handler and the same schemas `/inspect` advertises (the interface override is process-local to the + agent service). The harness in the agent_config interface carries a versioned slug + display name + per option (`HARNESS_IDENTITIES`); the stored/wire harness value stays the bare string. - Per-request model override is not honored on the Pi ACP path. pi-acp accepts only its default model and silently falls back (`projects/qa/findings.md`, F-007). - Remote (`http`) MCP servers are skipped by the runner path. 
Local stdio MCP is the path diff --git a/docs/design/agent-workflows/documentation/protocol.md b/docs/design/agent-workflows/documentation/protocol.md index 26eae473d4..e40e222251 100644 --- a/docs/design/agent-workflows/documentation/protocol.md +++ b/docs/design/agent-workflows/documentation/protocol.md @@ -108,7 +108,7 @@ Request fields include: | Field | Meaning | | --- | --- | -| `harness` | Harness id: `pi_core`, `pi_agenta`, or `claude`. `pi_core` and `pi_agenta` both drive the `pi` ACP agent; `pi_agenta` is Pi with Agenta's forced skills, prompt, and policy. `claude` drives the `claude` ACP agent. | +| `harness` | Harness id, the bare string `pi_core`, `pi_agenta`, or `claude`. `pi_core` and `pi_agenta` both drive the `pi` ACP agent; `pi_agenta` is Pi with Agenta's forced skills, prompt, and policy. `claude` drives the `claude` ACP agent. The wire value is bare; the agent_config *interface* dresses each value with a versioned slug + display name (see [Agent config schema](../interfaces/public-edge/agent-config-schema.md)), but the wire and the runner selector are unchanged. | | `sandbox` | Sandbox id, usually `local` or `daytona`. | | `sessionId` | External conversation id. The runtime is cold and receives history in `messages`. | | `agentsMd` | Instructions that become `AGENTS.md`. | diff --git a/docs/design/agent-workflows/interfaces/README.md b/docs/design/agent-workflows/interfaces/README.md index 0823392566..88d645d705 100644 --- a/docs/design/agent-workflows/interfaces/README.md +++ b/docs/design/agent-workflows/interfaces/README.md @@ -42,9 +42,9 @@ page. 
`Status` is read from each page's prose: **stable** (wired and unlikely to | Interface | Blast radius | Owner file(s) | Status | Tests | |---|---|---|---|---| | [`/invoke`](public-edge/workflow-invoke.md) | public | `decorators/routing.py`, `models/workflows.py`, `agent/app.py` | stable | `unit/agent/`, `utils/test_messages_endpoint.py` | -| [`/inspect`](public-edge/workflow-inspect.md) | public | `agent/schemas.py`, `models/workflows.py`, `decorators/routing.py` | stable | `unit/agents/test_dtos_agent_config.py` | +| [`/inspect`](public-edge/workflow-inspect.md) | public | `agent/schemas.py`, `agent/app.py` (builtin-URI binding), `models/workflows.py`, `decorators/routing.py` | stable | `unit/agents/test_dtos_agent_config.py`, `unit/agent/test_builtin_uri_binding.py` | | [`/messages`](public-edge/agent-messages.md) | public | `adapters/vercel/{routing,messages,stream}.py`, `agentRequest.ts` | evolving (create-or-resume not observable until storage lands) | `utils/test_messages_endpoint.py`, `unit/agents/test_ui_messages.py` | -| [Agent config schema](public-edge/agent-config-schema.md) | public | `agent/schemas.py`, `sdk/utils/types.py`, `agents/dtos.py` | stable | `unit/agents/test_dtos_agent_config.py` | +| [Agent config schema](public-edge/agent-config-schema.md) | public | `agent/schemas.py`, `sdk/utils/types.py`, `agents/dtos.py` (`HARNESS_IDENTITIES`) | stable | `unit/agents/test_dtos_agent_config.py`, `unit/agents/test_harness_identity.py` | | [`/run`](cross-service/service-to-agent-runner.md) | cross-service (the spine) | `protocol.ts`, `utils/wire.py`, `utils/ts_runner.py`, `server.ts`/`cli.ts` | stable (pinned by golden) | `unit/agents/test_wire_contract.py` + `golden/`, `services/agent/tests/unit/wire-contract.test.ts` | | [Runner to harness](cross-service/runner-to-harness.md) | cross-service (ACP) | `engines/sandbox_agent.ts` + `sandbox_agent/{run-plan,capabilities,permissions}.ts` | evolving | `services/agent/tests/unit/sandbox-agent-*.test.ts` | | 
[Runner to MCP server](cross-service/runner-to-mcp-server.md) | cross-service | `agents/mcp/`, `engines/sandbox_agent/mcp.ts`, `tools/{mcp-bridge,mcp-server,relay}.ts` | evolving (stdio wired; remote deferred) | `services/agent/tests/unit/mcp-servers.test.ts` | @@ -52,7 +52,7 @@ page. `Status` is read from each page's prose: **stable** (wired and unlikely to | [Service and runner trace export](cross-service/service-and-runner-trace-export.md) | cross-service | `agent/tracing.py`, `tracing/otel.ts`, `extensions/agenta.ts` | stable | `services/agent/tests/unit/` | | [Service to vault and tool providers](cross-service/service-to-vault-and-tool-providers.md) | cross-service (external) | `agent/app.py`, `platform/{resolve,connections}.py`, `agents/capabilities.py`, `tools/router.py` | stable | `unit/agents/connections/`, `unit/agents/platform/`, `unit/agents/tools/` | | [Agent service handler](in-service/agent-service-handler.md) | in-service | `services/oss/src/agent/app.py` | stable | `services/oss/tests/pytest/unit/agent/` | -| [Neutral runtime DTOs](in-service/neutral-runtime-dtos.md) | in-service | `agents/dtos.py` | stable | `unit/agents/test_dtos_*.py` | +| [Neutral runtime DTOs](in-service/neutral-runtime-dtos.md) | in-service | `agents/dtos.py` | stable | `unit/agents/test_dtos_*.py`, `test_harness_identity.py` | | [Runtime ports](in-service/runtime-ports.md) | in-service | `agents/interfaces.py` | evolving (`LocalBackend` stub) | `unit/agents/test_environment_lifecycle.py`, `test_harness_adapters.py` | | [Backend adapter](in-service/backend-adapter.md) | in-service | `agents/adapters/sandbox_agent.py` | stable | `unit/agents/test_runner_adapter_config.py`, `test_environment_lifecycle.py` | | [Harness adapters](in-service/harness-adapters.md) | in-service | `agents/adapters/harnesses.py`, `agents/dtos.py` | stable | `unit/agents/test_harness_adapters.py`, `test_dtos_harness_configs.py` | diff --git 
a/docs/design/agent-workflows/interfaces/in-service/agent-service-handler.md b/docs/design/agent-workflows/interfaces/in-service/agent-service-handler.md index 5c812bf3a2..6f0dfac1cb 100644 --- a/docs/design/agent-workflows/interfaces/in-service/agent-service-handler.md +++ b/docs/design/agent-workflows/interfaces/in-service/agent-service-handler.md @@ -33,6 +33,23 @@ The handler (`_agent` in `app.py`) takes the workflow envelope's pieces: `{"role": "assistant", "content": result.output}`. 9. Record usage. +## App build: binding the builtin URI + +`create_agent_app()` binds the handler to the canonical builtin URI `agenta:builtin:agent:v0` +instead of letting it fall to an auto `user:custom:...` URI, so the handler and the interface +`/inspect` advertises share one identity. The order avoids two traps: + +1. **Instrument before registering.** `register_handler(auto_instrument(_agent), uri=...)` — not the + raw `_agent`. `ag.workflow` only instruments inside its own `_register_handler`, which it skips + once a handler already exists in the registry, so the service registers the instrumented one. +2. **Override the interface.** `register_interface(...)` REPLACES the SDK's minimal seed for the + URI with the service interface (`AGENT_SCHEMAS`), so `retrieve_interface(uri)` returns what + `/inspect` advertises. This is process-local to the agent service; the API catalog still builds + from the SDK defaults in its own process. + +Then `ag.workflow(uri="agenta:builtin:agent:v0", schemas=AGENT_SCHEMAS, meta=...)(_agent)` resolves +the instrumented handler and merges the registered interface (the passed `schemas`/`meta` win). 
+ ## Owned by - `services/oss/src/agent/app.py` diff --git a/docs/design/agent-workflows/interfaces/in-service/neutral-runtime-dtos.md b/docs/design/agent-workflows/interfaces/in-service/neutral-runtime-dtos.md index b15f3c6397..e5d5c45afb 100644 --- a/docs/design/agent-workflows/interfaces/in-service/neutral-runtime-dtos.md +++ b/docs/design/agent-workflows/interfaces/in-service/neutral-runtime-dtos.md @@ -22,7 +22,12 @@ All in `dtos.py`. The ones that carry the most weight: by harness name. Built by `from_params(...)`. The editable schema is [Agent config schema](../public-edge/agent-config-schema.md). - **`RunSelection`**: `harness` (default `pi_core`), `sandbox` (default `local`), - `permission_policy` (`auto` | `deny`). + `permission_policy` (`auto` | `deny`). The `harness` value is the bare `HarnessType` string. +- **`HarnessType` and `HARNESS_IDENTITIES`**: the closed harness enum plus the single source for + each harness's interface identity — a versioned slug (`agenta:harness:<harness>:v0`, the repo's + slug grammar) and a display name. The agent_config schema builds its harness `oneOf` from + `HARNESS_IDENTITIES`; the stored/wire value stays the bare enum string, so only the interface + gains the slug + name. See [Agent config schema](../public-edge/agent-config-schema.md). - **`SessionConfig`**: everything one run needs, assembled by the handler: the agent config, secrets, resolved connection, permission policy, trace, session id, and the resolved tool and MCP inputs. diff --git a/docs/design/agent-workflows/interfaces/public-edge/agent-config-schema.md b/docs/design/agent-workflows/interfaces/public-edge/agent-config-schema.md index 38675eee13..f53e7599f3 100644 --- a/docs/design/agent-workflows/interfaces/public-edge/agent-config-schema.md +++ b/docs/design/agent-workflows/interfaces/public-edge/agent-config-schema.md @@ -23,7 +23,7 @@ The fields and the full schema follow. | `model` | string (`grouped_choice`) | `"gpt-5.5"` | Model the agent runs on. 
A plain id (`"gpt-5.5"`) or a structured `{provider, connection}` ref. See [Model connection resolution](../in-service/model-connection-resolution.md). | | `tools` | `ToolConfig[]` | `[]` | Runnable tools: `builtin`, `gateway`, `code`, or `client`. See [Tool models and resolution](../in-service/tool-models-and-resolution.md). | | `mcp_servers` | `MCPServerConfig[]` | `[]` | Declared MCP servers; secret env resolved from the vault at run time. See [MCP models and resolution](../in-service/mcp-models-and-resolution.md). | -| `harness` | `"pi_core" \| "claude" \| "pi_agenta"` | `"pi_core"` | The coding agent to drive. `pi_core` and `pi_agenta` both drive the `pi` ACP agent; `pi_agenta` adds Agenta's forced skills, prompt, and policy. | +| `harness` | `"pi_core" \| "claude" \| "pi_agenta"` (see slug+name note) | `"pi_core"` | The coding agent to drive. `pi_core` and `pi_agenta` both drive the `pi` ACP agent; `pi_agenta` adds Agenta's forced skills, prompt, and policy. | | `sandbox` | `"local" \| "daytona"` | `"local"` | Where it runs. | | `permission_policy` | `"auto" \| "deny"` | `"auto"` | How a gating harness (Claude Code) handles tool-use prompts in a headless run. | | `sandbox_permission` | `SandboxPermission \| null` | `null` (form pre-fills one) | The declared network and filesystem boundary. See [Sandbox permission](../in-service/sandbox-permission.md). | @@ -33,6 +33,35 @@ Note that `harness`, `sandbox`, and `permission_policy` are the run selection. T reads them from the same `parameters` object via `RunSelection.from_params(...)`, not just from `AgentConfig`. +### Harness as a slug + display name + +The `harness` field's JSON Schema carries both a flat `enum` of the bare values (back-compat +for any consumer that reads `schema.enum`) AND a `oneOf` of per-option entries, each a versioned +**slug** identity plus a **display name**, built from one SDK source +(`HARNESS_IDENTITIES` in `sdks/python/agenta/sdk/agents/dtos.py`). 
The slug follows the repo's +`agenta:<namespace>:<name>:v<N>` grammar (mirroring `agenta:builtin:agent:v0`), namespace +`harness`: + +```jsonc +"harness": { + "type": "string", + "default": "pi_core", + "enum": ["pi_core", "pi_agenta", "claude"], + "oneOf": [ + { "const": "pi_core", "title": "Pi", "x-ag-harness-slug": "agenta:harness:pi_core:v0" }, + { "const": "pi_agenta", "title": "Pi (Agenta)", "x-ag-harness-slug": "agenta:harness:pi_agenta:v0" }, + { "const": "claude", "title": "Claude Code", "x-ag-harness-slug": "agenta:harness:claude:v0" } + ] +} +``` + +The **stored/wire value stays the bare string** (`const`): the runner reads it as the runtime +selector and the frontend keys connection gating off it, so the `/run` wire is unchanged. The +playground `EnumSelectControl` reads the `oneOf` `title` for the dropdown label and writes the +bare `const` back. The slug is the harness contract's versioned identity in the interface only; +versioning the contract (`/run` `version`, the `/health` skew read) is deferred (see the +[contract-versioning project](../../projects/contract-versioning/README.md)). + ## The default config `/inspect` ships this as the value the form starts from. It is the canonical example of diff --git a/docs/design/agent-workflows/interfaces/public-edge/workflow-inspect.md b/docs/design/agent-workflows/interfaces/public-edge/workflow-inspect.md index 92a31a523c..9424169d28 100644 --- a/docs/design/agent-workflows/interfaces/public-edge/workflow-inspect.md +++ b/docs/design/agent-workflows/interfaces/public-edge/workflow-inspect.md @@ -45,6 +45,10 @@ the fields. ## Owned by - `services/oss/src/agent/schemas.py`: builds the input, parameter, and output schemas. 
+- `services/oss/src/agent/app.py`: `create_agent_app()` binds the live `_agent` handler AND the + service interface to the builtin URI `agenta:builtin:agent:v0` (via `register_handler` / + `register_interface`), so `retrieve_handler` / `retrieve_interface` return the live handler and + the same schemas `/inspect` advertises. The handler and the interface share one identity. - `sdks/python/agenta/sdk/models/workflows.py`: the inspect response model. - `sdks/python/agenta/sdk/decorators/routing.py`: the generic inspect route. @@ -53,6 +57,11 @@ the fields. - **Catalog type markers.** `agent_config` and `messages` bind the schema to a playground control. Renaming a marker without updating the catalog breaks the form silently. - **The config default.** `/inspect` ships the default agent config the form starts from. - Keep it in sync with what the runtime actually accepts. + Keep it in sync with what the runtime actually accepts. The SDK builtin config registry entry + (`CONFIGURATION_REGISTRY` for `agent:v0`) uses the same `build_agent_v0_default()` builder, so a + URI-dispatched run with no parameters gets the same default. - **Harness capability metadata.** The form filters connections from this block. If it drifts from the server-side table, the form offers choices the run will reject. +- **The builtin URI binding.** The live handler and interface are registered under + `agenta:builtin:agent:v0` at app build time. The interface override is process-local (the agent + service process), so the API process's catalog still builds from the SDK defaults. 
diff --git a/docs/design/agent-workflows/projects/contract-versioning/README.md b/docs/design/agent-workflows/projects/contract-versioning/README.md new file mode 100644 index 0000000000..239e2b0f95 --- /dev/null +++ b/docs/design/agent-workflows/projects/contract-versioning/README.md @@ -0,0 +1,225 @@ +# Contract versioning for the agent-workflows feature + +A proposal for putting a single, explicit version on the cross-service contract between the +Python agent service and the Node runner sidecar, and reading it. The center of gravity is +the `/run` spine between those two, because they deploy independently and their contract +today carries no version that anything checks. + +This is a proposal, not an implementation. It changes no code and no contract. Where it +names Composio, the tool gateway, connections, or MCP, it describes them only as things that +exist; this work leaves them unchanged. + +We are **preproduction**: there is no deployed fleet to keep compatible and no back-compat +burden. The point of versioning here is to fail loudly on a skewed deploy during development, +not to migrate a live install base. That framing keeps the whole design small. + +Graduated from the interface inventory at `../../interfaces/`, which maps every boundary in +the feature and is the source of truth for the contract shapes referenced here. + +## The shape in one paragraph + +The public edge versions itself: `/messages` stamps `x-ag-messages-version: v1` on every +response. The cross-service spine does not. The runner already exports a `protocol: 1` field +on `GET /health` and `version.ts` even documents it as "the version-skew guard," but nothing +reads it: the Python client (`ts_runner.py`) POSTs straight to `/run` and never probes +`/health`. So the guard is documented intent with no consumer. Meanwhile both sides ignore +unknown fields silently (Python via `data.get(...)`, the runner via a raw +`JSON.parse(...) 
as AgentRunRequest` cast with no validation), which makes additive changes +safe but makes a new *semantic* field a silent no-op on an old peer. The recommendation is +deliberately small: carry one **string** version on the `/run` payload, read the runner's +`protocol` off `/health` before the first run, and dispatch behavior with a plain `if/elif` +on the version string — exactly the way the LLM-as-a-judge evaluator already does it. No new +adapter machinery, no version negotiation protocol. + +## Problem + +The agent-workflows feature is a distributed system whose parts ship on different cadences: + +- The Python agent service (`services/oss/src/agent/` + `sdks/python/agenta/sdk/agents/`) + ships with the API / SDK release. +- The Node runner sidecar (`services/agent/`) is a standalone pnpm package with its own + lockfile and Docker image, deployed as a sidecar that can lag or lead the service. +- The sandbox-agent harness layer is a **pinned npm dependency** of the runner + (`sandbox-agent@0.4.2` in `services/agent/package.json`), so the harness contract version + is baked into whichever runner image is deployed. + +Because these deploy independently, a field can change on one side and reach an older version +on the other. Today the spine between them has no version anyone checks. A future change such +as a new agent config shape can reach a skewed peer and fail in a way that is hard to +attribute, instead of being rejected cleanly with a "your runner is too old" message. + +A sibling effort (A2, `wire-contract-schema`) is designing a schema-driven `/run` contract to +replace the hand-mirror between `protocol.ts` and `wire.py`. A schema source of truth is the +natural place to carry the version string this proposal adds. + +## Current-state matrix + +Every cross-service, cross-process, or external contract in the feature, and whether it +carries any version or skew check today. (P) = process boundary, (X) = external boundary, +(E) = public edge. 
+ +| Contract | Boundary | Version today | Notes | +|---|---|---|---| +| `/run` request + result | service -> runner (P) | **None on the payload.** Runner advertises `protocol: 1` on `/health` but no field on `/run` itself. `ts_runner.py` never reads `/health`. | The spine. `protocol.ts` hand-mirrored by `wire.py`, pinned by golden fixtures. Unknown fields ignored silently on both sides. | +| `/run` streaming (NDJSON `{kind:"event"\|"result"}`) | service -> runner (P) | None (shares the `/run` contract). | The stream record envelope (`kind`) rides the already-unversioned `/run`. | +| `GET /health` runner identity | service <- runner (P) | **Carries `protocol: 1`, `runner` build version, `engines`, `harnesses`.** Advertised but **unconsumed**. | The only place a version lives on the spine. `version.ts` calls it "the version-skew guard"; there is no guard. This is the seam to build on. | +| `harness` selection | inside `/run` (P) | Versionless enum. Closed `HarnessType` enum in Python; **free `string` on the TS wire** (`harness?: string`). | The harness values are now `pi_core` / `pi_agenta` / `claude` (see below). | +| `POST /tools/call` (gateway callback) | runner -> Agenta (P/X) | None. | The tool gateway and Composio are UNCHANGED by this work; listed only to show the boundary is unversioned. | +| `POST /tools/resolve` | service -> Agenta tool resolution (P) | None. | UNCHANGED by this work. | +| Runner-owned MCP stdio bridge | runner -> harness (P) | **Yes, external standard:** MCP `protocolVersion`, defaults `"2025-06-18"`. | The one boundary that already does it right, because it follows the MCP spec. UNCHANGED by this work. | +| OTLP trace export (`/api/otlp/v1/traces`) | runner -> Agenta (X) | **Yes, external standard:** OTLP proto; endpoint path carries `v1`. | Standards-versioned. Trace pipeline UNCHANGED by this work. | +| Vault / connection / secret resolution | service -> Agenta (P) | None. | UNCHANGED by this work. 
| +| `/invoke` (batch) | client -> service (E) | Generic `WorkflowInvokeRequest` envelope; `/inspect` carries a dated `"version": "2025.07.14"`. | Public edge; shared across all workflow types. | +| `/inspect` (interface description) | client -> service (E) | Dated envelope `version` + `x-ag-type-ref` schema markers (`agent_config`, `messages`). | The agent config schema lives behind `x-ag-type-ref: "agent_config"`. | +| `/messages` (chat stream) | client -> service (E) | **Yes:** `x-ag-messages-version: v1` on responses; `VERCEL_MESSAGE_PROTOCOL_VERSION = "v1"`. | The model to copy: an explicit, named, header-carried version. | +| pinned `sandbox-agent@0.4.2` npm dep | runner -> harness lib (build-time) | **Yes, semver, but baked into the image.** | The runner image *is* the version. `/health` could expose it; today it exposes nothing about it. | + +### Where versioning is missing, ranked by blast radius + +1. **The `/run` payload has no version field.** This is the spine. A skewed deploy (new + service + old runner, or the reverse) has no clean failure mode. +2. **`/health`'s `protocol: 1` has no consumer.** The skew guard exists on paper. Wiring a + reader is the single highest-leverage, lowest-risk fix. +3. **The runner cannot tell the service what it can do.** `/health` lists `engines` and + `harnesses` but the service ignores them, so it cannot decline to send a harness id a + runner does not advertise. +4. **Stream record envelope, tool-callback envelope, and resolution shapes are unversioned**, + but they are lower risk: callback/resolve are request-scoped within one logical release + path, and the stream envelope rides `/run`. + +### Preproduction: the harness rename is not a versioning event + +The harnesses were renamed `pi` -> `pi_core` and `agenta` -> `pi_agenta` (the values now live +in `HarnessType` at `sdks/python/agenta/sdk/agents/dtos.py`). 
Because we are preproduction, +that rename simply landed; it does **not** get its own version, downcaster, or compatibility +window. The same holds for the rename of the `pi` / `agenta` in-process artifacts. We version +the *contract shape*, not every naming change made before there is anything deployed. + +## Recommendation + +Three small parts, each shippable on its own. + +### 1. One version string on the `/run` payload, named the way the repo already names versions + +Add a `version` string to the `/run` request and result. Use the convention already in the +codebase — **do not invent a new field name or scheme.** Two existing conventions apply: + +- **A plain version string in the payload, matching the evaluator convention.** Built-in + evaluators store their interface version as a plain string in their parameters + (`"version": "5"` for the LLM-as-a-judge evaluator, `"3"` for code eval — see + `api/oss/src/resources/evaluators/evaluators.py`). The `/run` payload should carry the same + kind of field: a plain `version` string the service stamps and the runner reads. +- **A versioned slug for the contract identity, matching the workflow-URI convention.** Every + built-in workflow interface is identified by a colon-delimited slug whose final segment is + the version: `agenta:builtin:agent:v0`, `agenta:builtin:llm:v0`, etc. + (`sdks/python/agenta/sdk/engines/running/interfaces.py`). The `/run` contract is itself an + interface, so its natural identity is a slug of the same shape — e.g. + `agenta:runner:run:v0` — with the trailing `v0` bumped to `v1` when the shape breaks. + +Concretely: the payload carries a `version` string (`"v0"`), and `/health`'s existing +`protocol` integer is the same number surfaced for the cheap liveness probe. The version is +part of the schema, so it lives wherever A2's schema source of truth ends up. We never use a +`contractVersion`-style field — we reuse the field name and the `v<N>` spelling the repo +already uses. + +### 2. 
Make the runner expose a versioned harness slug, following the workflow-URI convention + +The harness id on the wire is a free string today (`harness?: string`). When the harness +naming carries a version, give it the same colon-delimited, `v<N>`-suffixed shape the workflow +interfaces use: + +- The dominant slug convention in the repo is `agenta:<namespace>:<name>:v0` + (`agenta:builtin:agent:v0` in `interfaces.py`; lowercase `v`, colon delimiter, version as + the final segment). Mirror it for a versioned harness identity rather than inventing a + parallel format. +- The harness *values* themselves (`pi_core`, `pi_agenta`, `claude`) stay as they are; this is + about giving the harness contract a versioned identity if and when its shape changes, using + the existing slug grammar — not about renaming the harnesses again. + +`/health` already returns `harnesses`. The service should read that list and decline a harness +the runner does not advertise, with a clear error, instead of forwarding an unknown value into +the sandbox-agent SDK where it fails opaquely. + +### 3. Read the version and dispatch with a plain if/elif, like the evaluator does + +This is the whole "evolution" story, and it is intentionally not a framework. The codebase +already evolves a contract by reading a version string and branching on it. The canonical +example is the LLM-as-a-judge evaluator handler +(`sdks/python/agenta/sdk/engines/running/handlers.py`, `auto_ai_critique_v0`): + +```python +template_version = str(parameters.get("version") or "3") + +# Per-version default. Existing versions are unchanged: v2 -> fstring, +# v3/v4 -> curly. v5 introduces mustache as the default ... +if template_version == "2": + default_format = "fstring" +elif template_version == "5": + default_format = "mustache" +else: + default_format = "curly" +``` + +It reads the stored string, branches with `if/elif/else`, and keeps a comment explaining what +each version does. 
The code evaluator does the same +(`version if declared_version in ("2", "3") else "2"`, then `templates.get("v2" if version == +"3" else "v1")`). + +The `/run` contract should evolve the same way: + +- The service reads the runner's `protocol` off `/health` once per runner endpoint and the + `version` echoed on results. If the runner's major is older than the minimum the service + understands, the run is **rejected with an explicit skew error** ("agent runner protocol vN + is older than what this service supports; upgrade the runner") before any work starts. That + is the guard `version.ts` already describes; this gives it a consumer. +- Where a shape actually differs between versions, the producer/consumer **branches on the + version string** at the one place that cares, exactly like the evaluator handler — not + through a layer of upcaster/downcaster functions. A new optional field is read when present + and defaulted when absent; a renamed value is mapped in the branch. There is no separate + adapter module, no version-keyed translation chain, and no per-version golden directory. + +Why this is enough: preproduction means we do not need to interoperate a `v0` service with a +`v1` runner across a long deprecation window. We need a clear, attributable failure on skew, +plus the ability to keep a couple of older code paths alive behind an `if/elif` while the two +sides catch up in the same dev cycle. The evaluator already proves this pattern carries +several live versions (`v2`..`v5`) with nothing more than a string and a branch. + +### Why this approach over the alternatives + +- **Versioned URL paths (`/v2/run`).** Heavier: forks routing, multiplies handlers. A payload + version string plus an `if/elif` is lighter and composes with A2's single schema. 
+- **A `contractVersion` `{major, minor}` struct with upcasting/downcasting adapters.** + Rejected: it invents a field name and a translation framework the repo does not use, and it + buys cross-version interoperability we do not need preproduction. The evaluator's plain + string + branch is the convention; mirror it. +- **Do nothing and rely on additive-only changes.** This is the status quo. It works until + the first non-additive change, and the silent-ignore behavior actively hides the break. +- **Strict validation that rejects unknown fields.** That belongs to A2's schema work, not + here. A version string plus the `/health` read is the cheap, high-value slice. + +## Compatibility with the sibling efforts (A2, A3) + +- **A2 (`wire-contract-schema`, schema source of truth).** This proposal puts the `version` + string *in the schema*, so A2 owns where it lives and this work owns that it is read and + branched on. If A2 splits `/run` into multiple endpoints, the same `version` string + the + `/health` read + an `if/elif` apply per resulting contract. +- **A3 (backend removal + `pi` -> `pi_core`, `agenta` -> `pi_agenta`).** Preproduction, so the + rename is not a versioned change — it simply landed (the values are already in + `HarnessType`). Removing the legacy in-process backend removes an *engine*, not a contract + version; `/health`'s `engines` list already advertises which engines a runner has. + +## Open questions + +- **Who owns the version string after A2?** Today `PROTOCOL_VERSION` lives in `version.ts` and + is mirrored by intent in `wire.py`. With a schema source of truth, generate it from the + schema so the two sides cannot drift. Confirm with A2. +- **Slug spelling for the contract identity.** `agenta:runner:run:v0` mirrors + `agenta:builtin:agent:v0`. Confirm the namespace segment (`runner`) with A2 when the schema + lands. 
+- **Floor enforcement location: service-side, runner-side, or both?** Service-side reject on + the `/health` read is the cheapest and catches the common "old runner" case before any run. + A runner-side check on the inbound `version` would also catch "old service." Service-side + first is the minimum. +- **Should `/health` advertise the baked `sandbox-agent` version?** Low cost (one more field on + an existing endpoint); it would make "this runner's sandbox-agent is too old" diagnosable. + Out of scope to decide here, flagged for A2/A3. diff --git a/docs/design/agent-workflows/projects/contract-versioning/build-notes.md b/docs/design/agent-workflows/projects/contract-versioning/build-notes.md new file mode 100644 index 0000000000..b656904b42 --- /dev/null +++ b/docs/design/agent-workflows/projects/contract-versioning/build-notes.md @@ -0,0 +1,123 @@ +# Contract-versioning implementation — build notes + +Session: https://claude.ai/code/session_01GYo3UEfvsZpncagqb28Mbc +Date: 2026-06-24 +Lane: `feat/agent-contract-versioning-docs` (PR #4829), stacked on `refactor/agent-harness-rename` (#4833). + +This file records the judgment calls for the implementation slice of the (revised) contract-versioning +proposal. The README is the spec. Two pieces are in scope; the rest is deliberately deferred. + +## Scope implemented + +1. **Harness as slug + display name in the interface** (README §2 / author review). +2. **Issue 2** — bind the builtin URI `agenta:builtin:agent:v0` to the live `_agent` handler + (architecture-followups.md #2). + +## Explicitly deferred (POC, "we don't need version yet") + +- `version` string on `/run`, the if/elif version dispatch, the `/health` `protocol` skew-guard read. + These remain the documented (deferred) design in the README. Not implemented here. 
+ +## Decision: the minimal harness slug + name representation + +The author asked for the harness in the **interface** to be a slug (mirroring +`agenta:builtin:agent:v0` from `interfaces.py`) plus a display name, not a bare enum string. + +### What "the interface" is, and what stays bare + +The harness appears in four places: + +- **`AgentConfigSchema.harness`** (`sdks/python/agenta/sdk/utils/types.py`) — the JSON Schema the + `/inspect` agent_config catalog type advertises and the playground renders. THIS is "the interface." +- **`AgentConfig` runtime parse + `RunSelection.harness`** (`agents/dtos.py`) — the stored/runtime value. +- **`HarnessType` enum** (`agents/dtos.py`) — the closed enum the runtime/wire use. +- **The wire `harness` field** (`wire.py` → `protocol.ts`) — the runtime SELECTOR the runner reads to + pick the ACP agent (`run-plan.ts`: `harness === "pi_core" || "pi_agenta" ? "pi" : harness`). + +The minimal representation gives the slug+name to the **interface only** and leaves the **stored/wire +value bare** (`pi_core` / `pi_agenta` / `claude`). Why bare-value-stays: + +- The wire `harness` value is a runtime selector consumed verbatim by the runner and by FE + `connectionUtils` (`allowedProviders("pi_core")`). Promoting it to a full slug would ripple into the + runner agent-selection, the golden fixtures, both wire tests, `RunSelection`, and the FE read/write — + a large change for what the doc frames as a preproduction identity restructuring. The README §2 itself + says the harness *values* "stay as they are." +- Keeping the value bare means the `/run` wire shape DOES NOT change, so `protocol.ts` / `wire.py` / + the golden fixtures are untouched (the scope's "if the wire harness field shape changes" condition is + false). The wire-contract tests stay green unchanged. 
+ +### The representation + +One SDK source of truth: a small registry mapping each `HarnessType` to a versioned **slug** and a +**display name**, in `agents/dtos.py` (next to `HarnessType`). The slug convention mirrors +`agenta:builtin:agent:v0`: `agenta:harness:<value>:v0`. + +- `pi_core` → slug `agenta:harness:pi_core:v0`, name "Pi" +- `pi_agenta` → slug `agenta:harness:pi_agenta:v0`, name "Pi (Agenta)" +- `claude` → slug `agenta:harness:claude:v0`, name "Claude Code" + +`AgentConfigSchema.harness` changes from `Literal[...]` to a `oneOf` of +`{const: <value>, title: <name>, x-ag-slug: <slug>}` entries (one source: built from the +registry). The stored value is still the bare `const` string; the slug rides as `x-ag-slug` metadata and +the name as the option `title`. This is the JSON-Schema-native "enum of values, each with a display +title" — no parallel format invented. + +The FE `EnumSelectControl` learns to read a `oneOf` of `{const, title}` (in addition to a flat `enum`), +so the harness dropdown shows the display names. It writes back the bare `const` value, so +`config.harness` and the wire are unchanged. + +This is the minimal-complexity representation that satisfies #4829: it gives the harness a versioned slug +identity + display name in the interface, reuses the repo's slug grammar, and changes neither the wire +nor the runtime selector. + +## Decision: issue 2 — bind the builtin URI without the import-ordering trap + +`create_agent_app()` registers `_agent` via `ag.workflow(...)` with no URI, so it gets an auto +`user:custom:...` URI. Issue 2 wants it bound to `agenta:builtin:agent:v0` so `retrieve_handler` / +`retrieve_interface` for that URI return the live handler/interface. + +### The import-ordering trap + +`workflow.__init__` (running.py) calls `_retrieve_handler(self.uri)` for non-custom URIs. If the builtin +URI is passed before the interface is registered, the lookup can fail at import time.
The fix passes the +URI through `ag.workflow(uri=...)` and ensures the SDK registers the agent interface (the SDK's +`agent_v0_interface`) for that URI BEFORE the handler constructor runs, so the non-custom-URI lookup +resolves. Verified against running.py's ordering. + +## Codex review refinements (xhigh, read-only) + +Codex reviewed both decisions. Verdict: Decision 1 mostly right; Decision 2 as first drafted was +unsafe. Folded in: + +- **Keep parent `enum` AND add `oneOf` titles.** Do NOT drop the flat `enum`. The FE schema + validator handles `enum` before `oneOf` and does not enforce `const`; keeping `enum` preserves + every existing consumer while `oneOf` adds the display labels. So the harness field carries both + `enum: [values]` and `oneOf: [{const,title,x-ag-harness-slug}]`. +- **Slug key name: `x-ag-harness-slug`**, not the generic `x-ag-slug` (specific to the harness). +- **Issue 2 — instrument BEFORE registering.** `workflow.__call__` only runs `auto_instrument` + inside `_register_handler` when `self.handler is None`. Pre-registering the RAW `_agent` makes + `__init__`'s `_retrieve_handler` set `self.handler` to the raw callable, so instrumentation is + skipped. Fix: `instrumented = auto_instrument(_agent)`, register THAT under the builtin URI, then + `ag.workflow(uri=..., schemas=AGENT_SCHEMAS, meta=...)(_agent)`. Mirrors chat.py, whose registry + `chat_v0` is already instrumented. +- **Stale `CONFIGURATION_REGISTRY` agent entry.** Binding the URI makes `workflow.__init__` apply + `CONFIGURATION_REGISTRY["agenta"]["builtin"]["agent"]["v0"]` as the default parameters when the + caller passes none. That entry is a flat `{model, agents_md}`, not the `{"agent": ...}` shape with + service defaults. Fixed it to `{"agent": build_agent_v0_default()}` (the shared SDK builder, one + owner), so the SDK builtin config matches the interface default. 
+- **`register_interface` must REPLACE, not setdefault.** `INTERFACE_REGISTRY` already has + `agent:v0` (the SDK minimal interface). To make `retrieve_interface(uri)` return what `/inspect` + advertises (AGENT_SCHEMAS) in the agent-service process, the service explicitly OVERRIDES the + registry entry under the builtin URI (an explicit set, not setdefault). This is a process-local + override; it does NOT change the API process's catalog output (the API builds its catalog from the + SDK `INTERFACE_REGISTRY` in its own process). Documented as such. +- **`register_handler` uses `setdefault`** — a second `create_agent_app()` call in the same process + won't replace the handler. Benign (the instrumented handler is identical), but tests that rebuild + the app are written to tolerate it. + +## Tests + +- Wire contract (both sides) stay green unchanged (wire harness value is bare). +- New issue-2 acceptance tests (architecture-followups §2 acceptance criteria). +- New harness-slug tests (registry + schema oneOf shape + FE EnumSelectControl oneOf reading). +- ruff format + check; pnpm lint-fix; pnpm test + typecheck in services/agent. diff --git a/docs/design/agent-workflows/projects/contract-versioning/status.md b/docs/design/agent-workflows/projects/contract-versioning/status.md new file mode 100644 index 0000000000..1ffb739936 --- /dev/null +++ b/docs/design/agent-workflows/projects/contract-versioning/status.md @@ -0,0 +1,83 @@ +# Contract versioning — implementation status + +Status: LANDED (2026-06-24) — code + tests + docs done; on the GitButler lane +`feat/agent-contract-versioning-docs` (PR #4829), stacked on `refactor/agent-harness-rename` +(#4833) for human review (NOT merged to big-agents). + +The README is the (revised) spec — a preproduction POC. This implementation slice does the +identity/slug restructuring + architecture-followups issue 2, and DEFERS the version-dispatch +machinery exactly as the README documents. + +## What landed + +### 1. 
Harness as a slug + display name in the interface + +- One SDK source of truth: `HARNESS_IDENTITIES` (a `HarnessIdentity` list) in + `sdks/python/agenta/sdk/agents/dtos.py`, mapping each `HarnessType` to a versioned slug + (`agenta:harness:<value>:v0`, the repo's `agenta:...:v0` grammar mirroring + `agenta:builtin:agent:v0`) and a display name (`Pi` / `Pi (Agenta)` / `Claude Code`). +- `AgentConfigSchema.harness` (`sdks/python/agenta/sdk/utils/types.py`) is now a `str` field whose + JSON Schema carries BOTH a flat `enum` (back-compat for `schema.enum` consumers) AND a `oneOf` + of `{const, title, x-ag-harness-slug}` built from `HARNESS_IDENTITIES`. +- FE `EnumSelectControl` (`web/packages/agenta-entity-ui/.../EnumSelectControl.tsx`) reads a `oneOf` + of `{const, title}` for option labels (preferring it over the flat `enum`), still writing the + bare `const` value back. +- The **stored/wire/runtime harness value stays the bare string** (`pi_core` / `pi_agenta` / + `claude`): the runner reads it as the agent selector and FE connection gating keys off it, so + `protocol.ts` / `wire.py` / the golden fixtures / both wire-contract tests are UNCHANGED. The + slug+name is interface-only. + +Minimal-complexity rationale (and Codex's review) are in `build-notes.md`. + +### 2. Issue 2 — bind the builtin URI to the live handler + +- `create_agent_app()` (`services/oss/src/agent/app.py`) now binds the live handler to + `agenta:builtin:agent:v0`: + 1. `register_handler(auto_instrument(_agent), uri=...)` — instrument BEFORE registering, so the + bound handler keeps tracing (mirrors chat.py, whose registry handler is pre-instrumented). + 2. `register_interface(...)` — a new helper in `engines/running/utils.py` that REPLACES (not + setdefault) the SDK's minimal `agent_v0_interface` seed for the URI, so + `retrieve_interface(uri)` returns the same schemas `/inspect` advertises. Process-local to the + agent service. + 3.
`ag.workflow(uri="agenta:builtin:agent:v0", schemas=AGENT_SCHEMAS, meta=...)(_agent)`. +- Fixed the stale `CONFIGURATION_REGISTRY` agent entry (`engines/running/utils.py`): it was a flat + `{model, agents_md}`; now `{"agent": build_agent_v0_default()}`, the canonical shape via the + shared builder, so a URI-dispatched run with no parameters gets the interface default. + +The import-ordering trap (`workflow.__init__` calls `_retrieve_handler` for non-custom URIs) is +avoided by registering the instrumented handler first; the binding mechanics + Codex's three +prioritized corrections are in `build-notes.md`. + +## Deferred (as the README documents — POC, "we don't need version yet") + +- The `version` string on `/run`, the if/elif version dispatch, and the `/health` `protocol` + skew-guard read. Left as the documented (deferred) design in the README. + +## Tests (all green) + +- SDK unit: 945 (incl. 5 new `test_harness_identity.py`). +- Wire contract: Python 20 (`test_wire_contract.py`, unchanged) + TS 160 (`pnpm test`), typecheck + clean — the wire is unchanged. +- Service agent unit: 38 (incl. 4 new `test_builtin_uri_binding.py`). +- FE: entity-ui 23 (incl. 5 new `enumSelectControl.test.ts` + 18 connectionUtils), playground 26. +- ruff format + check clean; FE prettier clean. + +## Docs synced + +- `documentation/`: `ground-truth.md` (binding + slug note, replacing the stale "not registered"), + `protocol.md` (wire harness is bare; interface dresses it), and (no change needed) + `ports-and-adapters.md` / `runner-to-harness.md` (the runner selector is unchanged). +- `interfaces/`: `public-edge/agent-config-schema.md` (harness slug+name section), + `public-edge/workflow-inspect.md` (binding in Owned by + Watch), + `in-service/agent-service-handler.md` (app-build binding section), + `in-service/neutral-runtime-dtos.md` (`HARNESS_IDENTITIES`), and the `interfaces/README.md` index + rows (new test files). 
+- Left `interfaces/architecture-followups.md` (another session's untracked file) untouched; issue 2 + resolution is recorded here instead. + +## GitButler + +Stacked `feat/agent-contract-versioning-docs` on `refactor/agent-harness-rename` (#4833) so the +code edits depend on A3's renamed files. Staged only this project's + the implemented files; did +NOT sweep the unassigned tree files (`.husky/*`, `architecture-followups.md`, the coordination +board). NOT merged to big-agents. diff --git a/sdks/python/agenta/sdk/agents/__init__.py b/sdks/python/agenta/sdk/agents/__init__.py index b54c01f548..c523d666b5 100644 --- a/sdks/python/agenta/sdk/agents/__init__.py +++ b/sdks/python/agenta/sdk/agents/__init__.py @@ -58,8 +58,10 @@ AgentResult, ClaudeAgentConfig, ContentBlock, + HARNESS_IDENTITIES, HarnessAgentConfig, HarnessCapabilities, + HarnessIdentity, HarnessType, Message, NetworkEgress, @@ -147,6 +149,8 @@ "ClaudeAgentConfig", "AgentaAgentConfig", "HarnessType", + "HarnessIdentity", + "HARNESS_IDENTITIES", "HarnessCapabilities", "ContentBlock", "Message", diff --git a/sdks/python/agenta/sdk/agents/dtos.py b/sdks/python/agenta/sdk/agents/dtos.py index 7de31f8444..2ce3199ea2 100644 --- a/sdks/python/agenta/sdk/agents/dtos.py +++ b/sdks/python/agenta/sdk/agents/dtos.py @@ -58,6 +58,53 @@ def coerce(cls, value: "HarnessType | str") -> "HarnessType": return cls(str(value).lower()) +# --------------------------------------------------------------------------- +# Harness identity in the interface: a versioned slug + a display name +# --------------------------------------------------------------------------- + +# The harness contract's versioned identity, in the repo's slug grammar +# (``agenta:<namespace>:<key>:v<N>``, mirroring ``agenta:builtin:agent:v0`` in +# ``engines/running/interfaces.py``). The namespace is ``harness`` and the trailing ``v0`` is +# bumped only when the harness contract shape breaks.
This is purely the INTERFACE identity the +# agent_config schema advertises; the stored/wire harness VALUE stays the bare enum string +# (``pi_core`` / ``pi_agenta`` / ``claude``), which the runner reads as the runtime selector. + + +class HarnessIdentity(BaseModel): + """One harness's interface identity: its bare value, versioned slug, and display name. + + ``value`` is the wire/runtime selector (the ``HarnessType`` value); ``slug`` is the + versioned contract identity in the repo's slug grammar; ``name`` is the human-facing label + the playground dropdown shows. This is the single source the agent_config schema builds the + harness ``oneOf`` from, so the slug, name, and value never drift across the SDK, the service + schema, and the frontend control.""" + + value: str + slug: str + name: str + + +# One entry per ``HarnessType``. The slug version is ``v0`` for every harness today (the +# contract has not broken). ``HARNESS_IDENTITIES`` is the single source of truth. +HARNESS_IDENTITIES: List[HarnessIdentity] = [ + HarnessIdentity( + value=HarnessType.PI.value, + slug=f"agenta:harness:{HarnessType.PI.value}:v0", + name="Pi", + ), + HarnessIdentity( + value=HarnessType.AGENTA.value, + slug=f"agenta:harness:{HarnessType.AGENTA.value}:v0", + name="Pi (Agenta)", + ), + HarnessIdentity( + value=HarnessType.CLAUDE.value, + slug=f"agenta:harness:{HarnessType.CLAUDE.value}:v0", + name="Claude Code", + ), +] + + # Permission policy for harness tool use in a headless run. ``auto`` approves (tools are # backend-resolved and trusted, no human to prompt); ``deny`` rejects. 
PermissionPolicy = Literal["auto", "deny"] diff --git a/sdks/python/agenta/sdk/engines/running/utils.py b/sdks/python/agenta/sdk/engines/running/utils.py index 9a5b3444f3..1ecec3a9ab 100644 --- a/sdks/python/agenta/sdk/engines/running/utils.py +++ b/sdks/python/agenta/sdk/engines/running/utils.py @@ -6,6 +6,7 @@ WorkflowFlags, WorkflowRevisionData, ) +from agenta.sdk.utils.types import build_agent_v0_default from agenta.sdk.engines.running.handlers import ( # --- NEW URI @@ -293,17 +294,12 @@ def _catalog_entry() -> dict: # --- OLD URI chat=dict(v0=WorkflowRevisionData()), completion=dict(v0=WorkflowRevisionData()), + # The agent builtin's default parameters use the canonical `{"agent": ...}` shape from + # the shared `build_agent_v0_default` builder (one owner; see utils/types.py), so a run + # that binds `agenta:builtin:agent:v0` with no parameters gets the same default the + # interface advertises instead of a stale flat `{model, agents_md}`. agent=dict( - v0=WorkflowRevisionData( - parameters={ - "model": "gpt-5.5", - "agents_md": ( - "You are a friendly hello-world agent running on the " - "Agenta agent service.\n\n- Greet the user warmly.\n- " - "Answer the user's message in one or two short sentences." - ), - } - ) + v0=WorkflowRevisionData(parameters={"agent": build_agent_v0_default()}) ), echo=dict(v0=WorkflowRevisionData()), auto_exact_match=dict(v0=WorkflowRevisionData()), @@ -464,6 +460,25 @@ def register_handler(fn: Callable, uri: Optional[str] = None) -> str: return uri +def register_interface(interface: WorkflowRevisionData, uri: str) -> str: + """Register (or OVERRIDE) the interface for a URI in the global interface registry. + + Unlike :func:`register_handler`'s ``setdefault``, this REPLACES any existing entry, so a + service process can bind its own richer interface under a builtin URI that the SDK already + seeds with a minimal default (e.g. 
the agent service overrides ``agenta:builtin:agent:v0`` so + ``retrieve_interface`` returns the same schemas ``/inspect`` advertises). This is a + process-local registration: it changes only the process that calls it (the agent service), not + the API process that builds the catalog from the SDK defaults. + """ + provider, kind, key, version = parse_uri(uri) + if not provider or not kind or not key or not version: + raise ValueError(f"Invalid URI: {uri}") + INTERFACE_REGISTRY.setdefault(provider, {}).setdefault(kind, {}).setdefault( + key, {} + )[version] = interface + return uri + + def _get_with_latest( registry: dict, provider: Optional[str] = None, diff --git a/sdks/python/agenta/sdk/utils/types.py b/sdks/python/agenta/sdk/utils/types.py index 0c2d5c13e1..d2536917de 100644 --- a/sdks/python/agenta/sdk/utils/types.py +++ b/sdks/python/agenta/sdk/utils/types.py @@ -8,7 +8,7 @@ from pydantic import Field, model_validator, AliasChoices -from agenta.sdk.agents.dtos import SandboxPermission +from agenta.sdk.agents.dtos import HARNESS_IDENTITIES, SandboxPermission from agenta.sdk.agents.mcp import MCPServerConfig from agenta.sdk.agents.tools import ToolConfig from agenta.sdk.utils.assets import supported_llm_models, model_metadata @@ -1062,6 +1062,38 @@ def _model_catalog_type() -> dict: "- Answer the user's message in one or two short sentences." ) +# The single source of the run-selection defaults. The SDK builtin interface +# (`agenta:builtin:agent:v0`) and the agent service (`AGENT_SCHEMAS` / the value +# `AgentConfig.from_params` falls back to) both consume these via `build_agent_v0_default`, so a new +# default changes one place. The harness default also seeds `AgentConfigSchema.harness`. +_DEFAULT_HARNESS = "pi_core" +_DEFAULT_SANDBOX = "local" +_DEFAULT_PERMISSION_POLICY = "auto" + +# The schema key carrying each harness option's versioned slug identity (the contract identity in +# the repo's `agenta:...:v0` grammar). 
Specific to the harness rather than a generic `x-ag-slug`. +_HARNESS_SLUG_KEY = "x-ag-harness-slug" + + +def _harness_field_schema_extra() -> Dict[str, Any]: + """Build the harness field's JSON-Schema extras from the single ``HARNESS_IDENTITIES`` source. + + Carries BOTH a flat ``enum`` of the bare values (so every existing consumer that reads + ``schema.enum`` keeps working) and a ``oneOf`` of ``{const, title, x-ag-harness-slug}`` (so the + playground shows the display name and the harness's versioned slug identity rides alongside its + bare value). The stored/wire harness value is still the bare ``const`` string.""" + return { + "enum": [identity.value for identity in HARNESS_IDENTITIES], + "oneOf": [ + { + "const": identity.value, + "title": identity.name, + _HARNESS_SLUG_KEY: identity.slug, + } + for identity in HARNESS_IDENTITIES + ], + } + class AgentConfigSchema(AgSchemaMixin): """The playground's editable agent config (the ``agent`` element), as one semantic type. @@ -1107,13 +1139,18 @@ class AgentConfigSchema(AgSchemaMixin): "secret env from the vault at run time; tokens never live in the config." ), ) - harness: Literal["pi_core", "claude", "pi_agenta"] = Field( - default="pi_core", + # The harness is a plain string field whose JSON Schema carries a versioned slug + display name + # per option (see `_harness_field_schema_extra`). The stored value is the bare harness string + # (`pi_core` / `pi_agenta` / `claude`) — the runtime/wire selector — so this stays a `str`, not a + # `Literal` (a Literal would emit its own `enum` that collides with the curated extras). + harness: str = Field( + default=_DEFAULT_HARNESS, title="Harness", description=( "Coding agent to drive: pi_core (plain Pi), claude, or pi_agenta (Pi with " "Agenta's forced skills, tools, and base instructions)." 
), + json_schema_extra=_harness_field_schema_extra(), ) sandbox: Literal["local", "daytona"] = Field( default="local", @@ -1148,17 +1185,6 @@ class AgentConfigSchema(AgSchemaMixin): ) -# The single source of the `agent_config` default. The SDK builtin interface -# (`agenta:builtin:agent:v0`) and the agent service (`AGENT_SCHEMAS` / the value -# `AgentConfig.from_params` falls back to) both consume this, so a new default field changes one -# place. Service-only choices are named args, never a second copy: the platform default skill is -# an `@ag.embed` the service inlines, and the declared sandbox boundary is the playground -# pre-fill. The harness default lives on `AgentConfigSchema.harness` (this mirrors it). -_DEFAULT_HARNESS = "pi_core" -_DEFAULT_SANDBOX = "local" -_DEFAULT_PERMISSION_POLICY = "auto" - - def build_agent_v0_default( *, skill_slug: Optional[str] = None, diff --git a/sdks/python/oss/tests/pytest/unit/agents/test_harness_identity.py b/sdks/python/oss/tests/pytest/unit/agents/test_harness_identity.py new file mode 100644 index 0000000000..1a3f050ef7 --- /dev/null +++ b/sdks/python/oss/tests/pytest/unit/agents/test_harness_identity.py @@ -0,0 +1,59 @@ +"""The harness interface identity: a versioned slug + display name per harness. + +The harness in the agent_config interface is structured as a slug (the repo's +``agenta:...:v0`` grammar, mirroring ``agenta:builtin:agent:v0``) plus a display name, built +from one SDK source (``HARNESS_IDENTITIES``). The stored/wire harness VALUE stays the bare enum +string, so the runtime selector and the golden wire contract are unchanged; only the interface +gains the slug+name structure. These tests pin that contract. +""" + +from __future__ import annotations + +from agenta.sdk.agents import HARNESS_IDENTITIES, HarnessType +from agenta.sdk.utils.types import CATALOG_TYPES + + +def test_one_identity_per_harness_type(): + # Every HarnessType value has exactly one identity, and nothing extra. 
+ by_value = {identity.value: identity for identity in HARNESS_IDENTITIES} + assert set(by_value) == {h.value for h in HarnessType} + assert len(by_value) == len(HARNESS_IDENTITIES) + + +def test_slug_follows_the_repo_versioned_slug_grammar(): + # Mirrors `agenta:builtin:agent:v0`: namespace `harness`, the bare value, trailing `v0`. + for identity in HARNESS_IDENTITIES: + assert identity.slug == f"agenta:harness:{identity.value}:v0" + assert identity.name # a non-empty display name + + +def test_identity_value_is_the_bare_harness_string(): + # The identity's `value` is the bare HarnessType value (the runtime/wire selector), NOT the + # slug — so the wire/runner contract is unchanged. + values = {identity.value for identity in HARNESS_IDENTITIES} + assert values == {"pi_core", "pi_agenta", "claude"} + + +def test_agent_config_harness_field_carries_enum_and_oneOf_from_the_registry(): + # The agent_config catalog type's harness field carries BOTH a flat `enum` (back-compat for + # every `schema.enum` consumer) and a `oneOf` of `{const, title, x-ag-harness-slug}` built from + # the same registry, so the playground shows the display name + slug while writing the bare value. + harness = CATALOG_TYPES["agent_config"]["properties"]["harness"] + + assert harness["type"] == "string" + assert harness["default"] == "pi_core" + assert harness["enum"] == [identity.value for identity in HARNESS_IDENTITIES] + + one_of = harness["oneOf"] + assert len(one_of) == len(HARNESS_IDENTITIES) + for entry, identity in zip(one_of, HARNESS_IDENTITIES): + assert entry["const"] == identity.value + assert entry["title"] == identity.name + assert entry["x-ag-harness-slug"] == identity.slug + + +def test_harness_oneOf_const_values_match_the_enum(): + # The `oneOf` consts and the flat `enum` describe the same value set, so a control reading + # either shape offers the same harnesses. 
+ harness = CATALOG_TYPES["agent_config"]["properties"]["harness"] + assert [entry["const"] for entry in harness["oneOf"]] == harness["enum"] diff --git a/services/oss/src/agent/app.py b/services/oss/src/agent/app.py index 2882836af1..0dcf81e4ae 100644 --- a/services/oss/src/agent/app.py +++ b/services/oss/src/agent/app.py @@ -46,6 +46,10 @@ from agenta.sdk.agents.platform import resolve_connection +from agenta.sdk.decorators.tracing import auto_instrument +from agenta.sdk.engines.running.utils import register_handler, register_interface +from agenta.sdk.models.workflows import WorkflowRevisionData + from agenta.sdk.utils.logging import get_module_logger from oss.src.agent.config import load_config, runner_dir, runner_url @@ -279,22 +283,37 @@ async def _agent_vercel_stream(harness, session_config, msgs): await harness.cleanup() +AGENT_URI = "agenta:builtin:agent:v0" + + def create_agent_app(): app = ag.create_app() - # The builtin agent workflow interface (`agenta:builtin:agent:v0`, `agent_v0_interface` - # in the SDK) now exists, but this service still registers the handler directly, so it - # gets an auto URI (`user:custom:...`) and runs locally. Binding the handler to the - # builtin URI is the remaining step. + # Bind the live `_agent` handler to the builtin URI `agenta:builtin:agent:v0` (issue 2: one + # canonical identity for the agent workflow). The SDK seeds the registries for this URI with a + # minimal default interface; the service is the authoritative live owner in its own process, so: + # + # 1. Instrument `_agent`, then register THAT under the builtin URI. Order matters: `ag.workflow` + # only instruments inside `_register_handler`, which it skips once a handler exists in the + # registry. Registering the raw `_agent` would lose tracing instrumentation; registering the + # instrumented one keeps it (mirrors chat.py, whose registry handler is pre-instrumented). + # 2. 
OVERRIDE the interface registry with the service interface (AGENT_SCHEMAS + the inspect + # `meta`), so `retrieve_interface(AGENT_URI)` returns the SAME data `/inspect` advertises. + # `register_interface` replaces (not setdefault), unlike the SDK's minimal seed. + # 3. Build the workflow against the URI. `ag.workflow.__init__` then resolves the (instrumented) + # handler and merges the registered interface; the passed `schemas`/`meta` still win. # # The per-harness connection capability rides the inspect response `meta`, NOT a fourth - # `AGENT_SCHEMAS` schema key (`JsonSchemas` allows only inputs/parameters/outputs). The - # frontend reads `meta.harness_capabilities` and intersects it with the existing `/secrets/` - # payload projected as connections; the agent service imports the - # SAME SDK table (above) for its server-side reject, never calling its own `/inspect`. - routed = ag.workflow( - schemas=AGENT_SCHEMAS, - meta={"harness_capabilities": harness_capabilities_document()}, - )(_agent) + # `AGENT_SCHEMAS` schema key (`JsonSchemas` allows only inputs/parameters/outputs). The frontend + # reads `meta.harness_capabilities` and intersects it with the existing `/secrets/` payload + # projected as connections; the agent service imports the SAME SDK table (above) for its + # server-side reject, never calling its own `/inspect`. + meta = {"harness_capabilities": harness_capabilities_document()} + register_handler(auto_instrument(_agent), uri=AGENT_URI) + register_interface( + WorkflowRevisionData(uri=AGENT_URI, schemas=AGENT_SCHEMAS), + uri=AGENT_URI, + ) + routed = ag.workflow(uri=AGENT_URI, schemas=AGENT_SCHEMAS, meta=meta)(_agent) # is_agent gates the agent-only `/messages` route (next to /invoke). 
ag.route("/", app=app, flags={"is_chat": True, "is_agent": True})(routed) return app diff --git a/services/oss/tests/pytest/unit/agent/test_builtin_uri_binding.py b/services/oss/tests/pytest/unit/agent/test_builtin_uri_binding.py new file mode 100644 index 0000000000..f5d1cae2ab --- /dev/null +++ b/services/oss/tests/pytest/unit/agent/test_builtin_uri_binding.py @@ -0,0 +1,59 @@ +"""Issue 2: the agent builtin URI is bound to the live handler + interface. + +`create_agent_app()` registers the live `_agent` handler under `agenta:builtin:agent:v0` and +overrides the interface registry with the service interface, so the builtin URI and the live +service identity are one. These tests are the acceptance criteria from +`docs/design/agent-workflows/interfaces/architecture-followups.md` issue 2: + +- `retrieve_handler("agenta:builtin:agent:v0")` returns the live handler (not None). +- `retrieve_interface("agenta:builtin:agent:v0")` returns the same schemas `/inspect` advertises + (the service `AGENT_SCHEMAS`), not the SDK's minimal builtin interface. + +Importing `oss.src.agent.app` builds the app (module-level `create_agent_app()`), which performs +the binding, so the registries are populated by the time these tests run. +""" + +from __future__ import annotations + +from agenta.sdk.engines.running.utils import retrieve_handler, retrieve_interface + +from oss.src.agent import app +from oss.src.agent.schemas import AGENT_SCHEMAS + +_AGENT_URI = "agenta:builtin:agent:v0" + + +def test_retrieve_handler_returns_the_live_handler(): + handler = retrieve_handler(_AGENT_URI) + assert handler is not None + assert callable(handler) + + +def test_bound_handler_is_instrumented_not_the_raw_agent(): + # The handler registered under the URI is the auto-instrumented `_agent`, not the raw function: + # `ag.workflow` only instruments inside `_register_handler`, which it skips once a handler + # exists in the registry, so the service must register the instrumented one itself. 
+ handler = retrieve_handler(_AGENT_URI) + assert handler is not app._agent + + +def test_retrieve_interface_matches_what_inspect_advertises(): + # The interface bound under the builtin URI carries the SAME schemas `/inspect` advertises + # (the service `AGENT_SCHEMAS`), not the SDK's minimal builtin interface. One identity, one + # interface, so the inspect path and the catalog/invoke path agree. + interface = retrieve_interface(_AGENT_URI) + assert interface is not None + assert interface.uri == _AGENT_URI + assert interface.schemas is not None + assert interface.schemas.inputs == AGENT_SCHEMAS["inputs"] + assert interface.schemas.parameters == AGENT_SCHEMAS["parameters"] + assert interface.schemas.outputs == AGENT_SCHEMAS["outputs"] + + +def test_rebuilding_the_app_keeps_the_binding_stable(): + # A second build in the same process must not break the binding (the handler register is a + # setdefault, the interface override is idempotent). The acceptance criteria still hold. + app.create_agent_app() + assert retrieve_handler(_AGENT_URI) is not None + interface = retrieve_interface(_AGENT_URI) + assert interface is not None and interface.uri == _AGENT_URI diff --git a/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/EnumSelectControl.tsx b/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/EnumSelectControl.tsx index 5e52ca01cd..41d764fdc4 100644 --- a/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/EnumSelectControl.tsx +++ b/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/EnumSelectControl.tsx @@ -41,11 +41,38 @@ export interface EnumSelectControlProps { } /** - * Extract options from schema enum + * Extract options from a schema's enum or a `oneOf` of `{const, title}` entries. + * + * Two shapes are supported. A flat `enum` (the common case) maps each value through + * `formatEnumLabel`. 
A `oneOf` of `{const, title}` entries (used by the agent harness field, + * where each option carries a display name and a versioned slug identity alongside its bare + * value) keeps the bare `const` as the value and shows the `title` as the label, so the option + * value the control writes back is unchanged while the dropdown reads clearly. */ -function getEnumOptions( +export function getEnumOptions( schema: SchemaProperty | null | undefined, ): {value: string; label: string}[] { + const oneOf = (schema as {oneOf?: unknown})?.oneOf + if (Array.isArray(oneOf)) { + const options = oneOf + .filter( + (entry): entry is {const: unknown; title?: unknown} => + !!entry && + typeof entry === "object" && + "const" in (entry as Record<string, unknown>), + ) + .map((entry) => ({ + value: String(entry.const), + label: + typeof entry.title === "string" && entry.title + ? entry.title + : formatEnumLabel(entry.const), + })) + if (options.length > 0) { + return options + } + } + if (!schema?.enum || !Array.isArray(schema.enum)) { return [] } diff --git a/web/packages/agenta-entity-ui/tests/unit/enumSelectControl.test.ts b/web/packages/agenta-entity-ui/tests/unit/enumSelectControl.test.ts new file mode 100644 index 0000000000..16156341e2 --- /dev/null +++ b/web/packages/agenta-entity-ui/tests/unit/enumSelectControl.test.ts @@ -0,0 +1,75 @@ +/** + * Unit tests for `getEnumOptions`, the pure schema -> dropdown-options helper behind + * EnumSelectControl. The agent harness field carries a `oneOf` of `{const, title}` (each option + * has a display name and a versioned slug identity); this verifies the control reads that shape + * for labels while still keeping the bare `const` as the written value, and still handles the + * flat `enum` shape every other consumer uses. Runs under @agenta/entity-ui's vitest runner.
+ */ +import {describe, expect, it} from "vitest" + +import type {SchemaProperty} from "@agenta/entities/shared" + +import {getEnumOptions} from "../../src/DrillInView/SchemaControls/EnumSelectControl" + +describe("getEnumOptions: flat enum", () => { + it("maps each enum value to a value/label option", () => { + const schema = {type: "string", enum: ["local", "daytona"]} as SchemaProperty + const options = getEnumOptions(schema) + expect(options.map((o) => o.value)).toEqual(["local", "daytona"]) + // labels come from formatEnumLabel; the value stays the bare string. + expect(options.every((o) => typeof o.label === "string" && o.label.length > 0)).toBe(true) + }) + + it("returns [] for a schema with no enum and no oneOf", () => { + expect(getEnumOptions({type: "string"} as SchemaProperty)).toEqual([]) + expect(getEnumOptions(null)).toEqual([]) + expect(getEnumOptions(undefined)).toEqual([]) + }) +}) + +describe("getEnumOptions: oneOf of {const,title} (the agent harness field)", () => { + const harnessSchema = { + type: "string", + enum: ["pi_core", "pi_agenta", "claude"], + oneOf: [ + {const: "pi_core", title: "Pi", "x-ag-harness-slug": "agenta:harness:pi_core:v0"}, + { + const: "pi_agenta", + title: "Pi (Agenta)", + "x-ag-harness-slug": "agenta:harness:pi_agenta:v0", + }, + { + const: "claude", + title: "Claude Code", + "x-ag-harness-slug": "agenta:harness:claude:v0", + }, + ], + } as unknown as SchemaProperty + + it("uses the bare const as the value and the title as the label", () => { + const options = getEnumOptions(harnessSchema) + expect(options).toEqual([ + {value: "pi_core", label: "Pi"}, + {value: "pi_agenta", label: "Pi (Agenta)"}, + {value: "claude", label: "Claude Code"}, + ]) + }) + + it("prefers oneOf titles over the flat enum when both are present", () => { + // The harness schema carries both shapes; the labels must be the display names, not the + // formatEnumLabel of the bare values. 
+ const labels = getEnumOptions(harnessSchema).map((o) => o.label) + expect(labels).toEqual(["Pi", "Pi (Agenta)", "Claude Code"]) + }) + + it("falls back to formatEnumLabel when a oneOf entry has no title", () => { + const schema = { + type: "string", + oneOf: [{const: "pi_core"}, {const: "claude", title: "Claude Code"}], + } as unknown as SchemaProperty + const options = getEnumOptions(schema) + expect(options[0].value).toBe("pi_core") + expect(typeof options[0].label).toBe("string") + expect(options[1]).toEqual({value: "claude", label: "Claude Code"}) + }) +})