diff --git a/api/oss/src/core/tools/dtos.py b/api/oss/src/core/tools/dtos.py
index 224a956227..f0dca37e28 100644
--- a/api/oss/src/core/tools/dtos.py
+++ b/api/oss/src/core/tools/dtos.py
@@ -46,6 +46,9 @@ class ToolCatalogAction(BaseModel):
     #
     categories: List[str] = []
     logo: Optional[str] = None
+    #
+    # From the MCP behavioral hints: True (read-only), False (mutating), None (unknown).
+    read_only: Optional[bool] = None
 
 
 class ToolCatalogActionDetails(ToolCatalogAction):
@@ -274,6 +277,7 @@ class ResolvedAgentTool(BaseModel):
     description: Optional[str] = None
     input_schema: Optional[Dict[str, Any]] = None
     call_ref: str
+    read_only: Optional[bool] = None
 
 
 class AgentToolsResolution(BaseModel):
diff --git a/api/oss/src/core/tools/providers/composio/adapter.py b/api/oss/src/core/tools/providers/composio/adapter.py
index 80fcdee100..e388d7072e 100644
--- a/api/oss/src/core/tools/providers/composio/adapter.py
+++ b/api/oss/src/core/tools/providers/composio/adapter.py
@@ -16,7 +16,10 @@
 )
 from oss.src.core.tools.interfaces import ToolsGatewayInterface
 from oss.src.core.tools.exceptions import AdapterError
-from oss.src.core.tools.providers.composio.catalog import ComposioCatalogClient
+from oss.src.core.tools.providers.composio.catalog import (
+    ComposioCatalogClient,
+    _derive_read_only,
+)
 
 
 log = get_module_logger(__name__)
@@ -173,6 +176,7 @@ async def get_action(
             if input_params or output_params
             else None,
             scopes=item.get("scopes") or None,
+            read_only=_derive_read_only(item.get("tags")),
         )
 
     # -----------------------------------------------------------------------
diff --git a/api/oss/src/core/tools/providers/composio/catalog.py b/api/oss/src/core/tools/providers/composio/catalog.py
index 0ffa589027..555eb995fe 100644
--- a/api/oss/src/core/tools/providers/composio/catalog.py
+++ b/api/oss/src/core/tools/providers/composio/catalog.py
@@ -359,6 +359,22 @@ def _parse_integration_detail(item: Dict[str, Any]) -> ToolCatalogIntegration:
     )
 
 
+def _derive_read_only(raw_tags: Any) -> Optional[bool]:
+    """Distil the MCP behavioral hint tags into a single read-only signal.
+
+    ``readOnlyHint`` with no mutating hint -> read-only; ``destructiveHint`` or
+    ``updateHint`` -> mutating. No hint present -> unknown (``None``); never guessed.
+    """
+    if not isinstance(raw_tags, list):
+        return None
+    tags = {t for t in raw_tags if isinstance(t, str)}
+    if "destructiveHint" in tags or "updateHint" in tags:
+        return False
+    if "readOnlyHint" in tags:
+        return True
+    return None
+
+
 def _parse_action(item: Dict[str, Any], integration_key: str) -> ToolCatalogAction:
     raw_tags = item.get("tags")
     # Tags mix MCP hint flags with semantic categories — strip the known hints
@@ -381,4 +397,5 @@ def _parse_action(item: Dict[str, Any], integration_key: str) -> ToolCatalogActi
         name=item.get("name", ""),
         description=item.get("description"),
         categories=categories,
+        read_only=_derive_read_only(raw_tags),
     )
diff --git a/api/oss/src/core/tools/service.py b/api/oss/src/core/tools/service.py
index 03c8244f5d..e304519497 100644
--- a/api/oss/src/core/tools/service.py
+++ b/api/oss/src/core/tools/service.py
@@ -583,4 +583,5 @@ async def _resolve_composio_tool(
             description=action.description,
             input_schema=input_schema,
             call_ref=call_ref,
+            read_only=action.read_only,
         )
diff --git a/api/oss/tests/pytest/unit/tools/test_agent_resolution.py b/api/oss/tests/pytest/unit/tools/test_agent_resolution.py
index 12ad49266a..1689a3f635 100644
--- a/api/oss/tests/pytest/unit/tools/test_agent_resolution.py
+++ b/api/oss/tests/pytest/unit/tools/test_agent_resolution.py
@@ -10,6 +10,7 @@
 
 from oss.src.apis.fastapi.tools.models import ToolResolveRequest
 from oss.src.core.tools.dtos import AgentBuiltinTool, AgentComposioTool
+from oss.src.core.tools.providers.composio.catalog import _derive_read_only
 from oss.src.core.tools.service import ToolsService
 
 
@@ -59,6 +60,7 @@ async def _action(**_kwargs):
             schemas=SimpleNamespace(
                 inputs={"type": "object", "properties": {}},
             ),
+            read_only=True,
         )
 
     monkeypatch.setattr(service, "resolve_connection_by_slug", _connection)
@@ -77,3 +79,25 @@ async def _action(**_kwargs):
     )
     assert result.builtins == ["read"]
     assert result.custom[0].call_ref == "tools.composio.github.GET_USER.c1"
+    assert result.custom[0].read_only is True
+
+
+@pytest.mark.parametrize(
+    "tags, expected",
+    [
+        (["readOnlyHint"], True),
+        (["updateHint"], False),
+        (["destructiveHint"], False),
+        # A mutating hint wins even when readOnlyHint is also present.
+        (["destructiveHint", "readOnlyHint"], False),
+        (["updateHint", "readOnlyHint"], False),
+        # Unknown == None (never guess), not False.
+        ([], None),
+        (None, None),
+        (["unrelatedHint"], None),
+        # Non-list input is ignored.
+        ("readOnlyHint", None),
+    ],
+)
+def test_derive_read_only_tag_matrix(tags, expected):
+    assert _derive_read_only(tags) is expected
diff --git a/docs/design/agent-workflows/projects/capability-config/README.md b/docs/design/agent-workflows/projects/capability-config/README.md
new file mode 100644
index 0000000000..6e0d0be7eb
--- /dev/null
+++ b/docs/design/agent-workflows/projects/capability-config/README.md
@@ -0,0 +1,31 @@
+# Capability and permission configuration
+
+How an author configures what an Agenta agent may do (files, network, tools, tool approvals),
+and how those controls enforce end to end: from the playground form, through the SDK and agent
+service, to the runner and the harness. Graduated from the scratch notes in
+`../../scratch/capability-architecture.md` on 2026-06-23.
+
+## The shape in one paragraph
+
+Three configuration layers, each with one job and one enforcement point. **Layer 1, harness
+configuration:** the runner translates author kwargs into the harness's own config (a
+`.claude/settings.json` for Claude, `builtin_names` for Pi). **Layer 2, sandbox permission:** an
+optional `sandbox_permission` field draws the network and filesystem boundary, enforced by the
+backend when it provisions the sandbox. **Layer 3, tool permission:** a per-tool permission
+(always-allow / ask / deny), enforced at the runner relay for resolved tools and at the harness
+permission plane for builtins. The work spans the playground frontend, the schema, the SDK, the
+service, and the runner.
+
+## Files
+
+- `context.md` — why this exists, goals, non-goals, background, how it relates to the sibling
+  projects.
+- `proposal.md` — the three-layer design. The canonical spec.
+- `plan.md` — phased execution plan, end to end including the playground frontend.
+- `research.md` — current-state codebase findings and exact insertion points (backend, runner,
+  frontend), plus the library facts the design rests on.
+- `status.md` — progress, decisions, and open questions. The source of truth for state.
+
+## Status
+
+Code-complete and reviewed; backend + runner + FE built and green, live-QA'd on the running stack. Live Daytona egress + Claude behavioral cells pending credentials. See `status.md`.
diff --git a/docs/design/agent-workflows/projects/capability-config/context.md b/docs/design/agent-workflows/projects/capability-config/context.md
new file mode 100644
index 0000000000..525a000fb6
--- /dev/null
+++ b/docs/design/agent-workflows/projects/capability-config/context.md
@@ -0,0 +1,73 @@
+# Context
+
+## Why this work exists
+
+Every agent run has to be governed. The author needs to say what the agent may touch, and the
+system has to enforce that across two harnesses (`pi` and `claude`) and three backends
+(sandbox-agent local, sandbox-agent on Daytona, and a future in-process local SDK).
+
+Today almost none of this is wired:
+
+- The runner drops Pi's `builtin_names`, so even Pi's own tool selection has no effect on the
+  sandbox-agent path.
+- The runner never restricts Claude. It creates the session with only `cwd` and `mcpServers`,
+  so "Claude without web" or "Claude read-only" is not expressible.
+- The runner never sets a network boundary, so a Daytona run has full egress by default.
+- `permission_policy` is the only live control, and it is coarse (auto or deny, all tools at
+  once) and effective on Claude only.
+- The playground renders every config field unconditionally, with no per-harness gating and no
+  way to set capability or per-tool approval.
+
+So a request as simple as "give this agent web access but not write access" cannot be
+expressed, on either harness, from the playground or the SDK. This project makes capability and
+permission a real, configurable, end-to-end feature.
+
+## Goals
+
+1. A three-layer configuration model the author can set: harness configuration, sandbox
+   permission, and per-tool permission. Each layer has one job and one enforcement point.
+2. End to end. The playground frontend is in scope: the config form gains the new sections, and
+   the agent chat gains a tool-approval surface for the "ask" permission.
+3. Honest enforcement. The sandbox layer is authoritative for the network and the filesystem. A
+   run fails loud when a backend cannot deliver a requested guarantee, rather than pretending.
+4. Sensible defaults. Read-only tools default to always-allow and mutating tools to ask, using
+   Composio's read/write metadata, so the author does not label every tool by hand.
+
+## Non-goals (for now)
+
+- **Pi MCP.** Deferred. When built it follows the same permission pattern as Claude
+  (settings-style `mcp__<server>` rules). Tracked in `../harness-capabilities/`.
+- **A real filesystem jail.** No backend confines the filesystem today; the local cwd is a temp
+  dir, not a jail. Layer 2 ships network first; filesystem stays tool-plane only until a backend
+  can enforce it.
+- **Durable / unattended HITL approval.** The "ask" permission this project ships asks the user
+  in the open chat. The global, durable approval channel that survives a closed tab or a
+  scheduled run is Flow 7 in `../../scratch/flows-and-capabilities.md`, a later milestone.
+- **A sandbox boundary for the local backend.** The local sidecar is the host; it cannot enforce
+  Layer 2. That is by design, and the fail-loud rule covers it.
+
+## Background
+
+The runtime splits work across a Python agent service (`services/oss/src/agent/`, decides what
+to run) and a TypeScript runner (`services/agent/`, runs it). The runner drives the harness over
+an ACP bridge, `sandbox-agent`, on a chosen backend. The SDK (`sdks/python/agenta/sdk/agents/`)
+owns the neutral config, the ports, and the per-harness adapters.
+
+Three earlier scratch documents set up this project, and their facts are folded into
+`research.md`:
+
+- `../../scratch/capability-map.md` — the current-state web/exec/read/write cut: what each
+  harness can do, what is on by default, what the backend changes.
+- `../../scratch/capability-architecture.md` — the design exploration this project's
+  `proposal.md` cleans up.
+- `../../scratch/flows-and-capabilities.md` — the user-facing flows, including Flow 7 (HITL).
+
+## Relation to sibling projects
+
+- `../harness-capabilities/` declares which capabilities each harness supports (the static
+  capability table) and owns the deferred Pi-MCP work. This project sets the capability *values*
+  the author chooses; that project declares which choices a harness can honor. They meet at the
+  schema and the fail-loud check.
+- `../model-config/` is the same static-then-dynamic pattern for the model axis. Layer 1's
+  Claude `model` setting overlaps it.
+- `../skills-config/` configures forced skills, a different axis on the same agent config.
diff --git a/docs/design/agent-workflows/projects/capability-config/plan.md b/docs/design/agent-workflows/projects/capability-config/plan.md
new file mode 100644
index 0000000000..eedae886f1
--- /dev/null
+++ b/docs/design/agent-workflows/projects/capability-config/plan.md
@@ -0,0 +1,181 @@
+# Execution plan
+
+Phased, end to end. Each phase is shippable and de-risks the next. File references and exact
+insertion points live in `research.md`; this plan names the work and the acceptance bar. Some
+line numbers in `research.md` are approximate and must be reconfirmed at implementation time.
+
+The dependency spine: Phase 0 unblocks Pi and the metadata defaults. Phases 1 to 3 build the
+three layers in the backend and runner. Phases 4 and 5 surface them in the playground. Phase 6
+verifies live. Phases 1 and the Composio half of Phase 0 are independent and can run in
+parallel.
+
+## Phase 0: prerequisites
+
+Two latent gaps that the rest of the design assumes are closed.
+
+- **(S0b) Investigate Pi's real control surface.** The runner does drop `request.tools`
+  (`run-plan.ts:101`), but `pi-acp@0.0.29` spawns `pi --mode rpc --no-themes` with no `tools`
+  field on `newSession` (`index.js:135`), so honoring `builtin_names` is not simply a matter of
+  forwarding `request.tools`. Find the actual lever — a `pi` CLI flag, a Pi config file written
+  into the cwd, or an ACP session field — or conclude that Pi built-in restriction is unsupported
+  over sandbox-agent. Read-only; record the finding in `research.md`.
+- **(S0a) Stop stripping Composio read/write hints.** Carry a `read_only` flag from the catalog
+  through to the resolved tool, so Phase 3 can default permissions from it.
+  (`api/oss/src/core/tools/providers/composio/catalog.py`, `api/oss/src/core/tools/dtos.py`, the
+  resolved `ToolSpec`.)
+
+Acceptance: S0b ends with a written finding (the lever, or "unsupported → fail loud"); S0a makes
+a resolved Composio tool carry `read_only` on the wire (read-only for `readOnlyHint`, mutating for
+`destructiveHint`/`updateHint`). If S0b finds Pi restriction unsupported, Layer 1 for Pi fails
+loud rather than silently granting the full tool set.
+
+## Phase 1: Layer 2, sandbox permission (backend)
+
+The network boundary, enforced at sandbox provisioning. Filesystem is declared but not enforced
+yet (no backend confines it).
+
+- Add an optional `sandbox_permission` object to `AgentConfig` and the `agent_config` schema
+  (presets plus `network_egress` with `mode` and `enforcement`). (`dtos.py`,
+  `sdk/utils/types.py` `AgentConfigSchema`, `services/oss/src/agent/schemas.py`.)
+- **Thread the policy across the Python→TS wire**, because Daytona is provisioned in the runner,
+  not in Python: `SessionConfig` → `request_to_wire` (`sdk/agents/utils/wire.py`) →
+  `AgentRunRequest` (`protocol.ts`) → `buildRunPlan` (`run-plan.ts`) → `buildSandboxProvider`.
+  Move the golden fixtures and both wire-contract tests with the new field. The Python
+  `Backend.create_sandbox` also gains the typed policy for the local/in-process path.
+- Apply it on Daytona: set `networkBlockAll` / `networkAllowList` on the provider `create`
+  object. (`services/agent/src/engines/sandbox_agent/provider.ts`.)
+- Fail loud when a backend cannot enforce a requested guarantee (local sidecar, local SDK),
+  unless the author sets a per-axis opt-out.
+- **(S1g) Close the runner-host hole before claiming `network: off`.** `relay.ts` runs `code` and
+  gateway/callback tools on the runner host, so a network-blocked Daytona sandbox does not confine
+  them. When `network: off`/`exec: off`, reject or strip `code` tools, gateway/callback tools, and
+  stdio MCP servers — or move their execution into the sandbox. The runner must not report
+  `network: off` while a runner-side tool is still reachable. See `status.md` open question 1.
+
+Acceptance: a `network: off` config on Daytona blocks egress (a `curl` from `bash` fails) **and**
+a `code`/gateway tool under that config either refuses at plan time or runs inside the sandbox,
+never on the runner host; the same config on local fails loud unless opted out.
+
+## Phase 2: Layer 1, harness configuration (settings.json)
+
+The runner renders author kwargs into the harness's native config.
+
+- Define the `harness_options` surface for Claude: permission mode plus per-tool allow/deny/ask
+  rules. Decide raw `ClaudeCodeSettings` passthrough vs a curated subset. (`dtos.py`
+  `harness_options`.)
+- In the runner, write `.claude/settings.json` into the session cwd before `createSession`.
+  (`run-plan.ts` cwd creation; the engine before `sandbox.createSession` in `sandbox_agent.ts`.)
+- Derive the baseline rules from Layer 2 and the read-only profile (for example `network: off`
+  emits `deny: ["WebFetch","WebSearch"]`; read-only emits `deny: ["Write","Edit","Bash"]`).
+
+Acceptance: a Claude run with a settings.json `deny` rule does not call the denied tool; a
+`defaultMode` setting takes effect.
+
+## Phase 3: Layer 3, tool permission (backend + runner)
+
+Per-tool permission: always-allow / ask / deny.
+
+- Carry the permission on each tool's spec and each MCP server's spec. The frontend already
+  stores `agenta_metadata.permission_mode` on each tool; define its canonical values
+  (always-allow/ask/deny) and serialize them. `permission_policy` stays as the global default.
+  Add `read_only` to `ToolSpec` (Phase 0). (`dtos.py`, `tools/models.py`.)
+- Enforce resolved tools at the relay: deny refuses, ask parks, always-allow runs.
+  (`services/agent/src/tools/relay.ts`.)
+- Enforce Claude builtins through the responder: pass the per-tool map to the responder, which
+  reads the tool name off the permission request and applies the permission.
+  (`responder.ts`, `engines/sandbox_agent/permissions.ts`.)
+- Default the permission from the `read_only` flag: read-only to always-allow, mutating to ask.
+  The author overrides.
+
+Known risk handled in S1g (Phase 1), not here: resolved `code` **and** gateway/callback tools run
+on the runner host, not the sandbox, so a network-blocked Daytona run does not confine their
+egress. The interim guard (reject `code`, gateway/callback, and stdio MCP when `network`/`exec`
+are off) versus the target (move resolved-tool execution into the sandbox) is decided there. See
+`status.md` open question 1.
+
+Acceptance: a tool set to deny never runs; a tool set to ask raises an approval request; a
+read-only Composio tool defaults to always-allow.
+
+## Phase 4: playground form (the three sections)
+
+The form is generic-schema-driven, so most fields appear once the schema carries them. The
+work is the controls and the gating.
+
+- Render the new sections in `AgentConfigControl.tsx`: a sandbox-permission section (presets
+  plus network/filesystem toggles), an advanced harness-config section, and a per-tool
+  permission control (allow / ask / deny on each tool). NOTE (S3a finding): `permission_mode` is
+  NOT round-tripped in the FE today — this control must be built from scratch, writing
+  `agenta_metadata.permission_mode` with the `allow|ask|deny` vocabulary (the SDK already accepts
+  that key via `AliasChoices`, so no mapping is needed).
+- Default the per-tool control from the tool's `read_only` metadata.
+- Gate fields by the live `harness` value (hide `permission`-mode controls for Pi, hide
+  `mcp_servers` where not applicable), reading the harness-capabilities map from `/inspect`.
+  (`AgentConfigControl.tsx`, `SchemaPropertyRenderer.tsx`.)
+
+Acceptance: an author sets a preset and a per-tool permission in the playground, saves, and the
+values persist on the variant config and reach the run.
+
+## Phase 5: playground HITL approval surface (the "ask" path)
+
+The chat already exposes `addToolApprovalResponse` (`AgentChatPanel.tsx:86`) **and** already
+renders approve/deny buttons (`ToolPart.tsx:153`). So the UI is largely built; the missing piece
+is the runtime parked/resumed path, not the buttons.
+
+- Map the runner's `interaction_request` (permission) event onto the ai-sdk approval part, so a
+  Layer 3 "ask" in the runtime surfaces as the existing approval UI, and route the user's
+  decision back through `addToolApprovalResponse` to resume (or reject) the parked call.
+  (`web/oss/src/components/AgentChatSlice/`: `AgentChatPanel.tsx`, `AgentMessage.tsx`,
+  `ToolPart.tsx`; runner responder seam `responder.ts`, `permissions.ts`.)
+- Confirm the responder actually parks the call and resumes it on the answer (the underestimated
+  part), rather than only rendering a prompt.
+
+Acceptance: a tool set to ask shows the approve/deny prompt in the playground chat, and the
+answer resolves the parked call end to end.
+
+## Phase 6 (S6): tests and live verification
+
+**Unit / golden — DONE.** The settings.json builder, the relay permission enforcement, the
+`resolvePermission` table, the `HITLResponder`, the Composio `read_only` mapping, and the
+permission default ladder are all unit-tested; the new wire fields cross the golden fixtures and
+both contract tests. 293 SDK + 15 API + 10 services/oss + 177 TS pass.
+
+**Live — PENDING (needs a deployed stack).** The running dev stack is
+`agenta-ee-dev-wp-b2-rendering` (traefik `:8280`, sandbox-agent sidecar `:8765`). The SDK changes
+need a container reload and the TS-runner changes need a sandbox-agent image rebuild before these
+checks reflect this branch. Run each check and record pass/fail here:
+
+1. **L2 Daytona `network: off` blocks egress (E3).** Config `sandbox_permission.network.mode=off`,
+   harness `pi` or `claude`, sandbox `daytona`. Make the agent run `curl https://example.com` (or
+   a `code` tool doing a fetch). Expect egress to FAIL. Confirm the Daytona create call carried
+   `networkBlockAll: true` (sidecar logs). Re-run with `enforcement=strict` on the LOCAL sidecar:
+   expect the run to fail loud at plan time, not silently.
+2. **L2 runner-host guard (S1g).** Config `network: off`, `enforcement=strict`, plus a `code` or
+   gateway tool (or a stdio MCP server). Expect the run to be REJECTED at plan time with the
+   runner-host message. With `enforcement=best_effort`, expect it to run.
+3. **L1 Claude `settings.json` deny (E2/E3, claude).** Author `harness_options.claude.permissions.deny=["WebFetch"]`
+   (or `network:off` to derive it). Ask Claude to fetch a URL. Expect Claude NOT to call WebFetch.
+   Confirm `<cwd>/.claude/settings.json` was written (sidecar logs / Daytona FS). Also verify a
+   `defaultMode` takes effect.
+4. **L1 MCP `mcp__<server>` rule.** An MCP server with `permission:"deny"` → confirm Claude cannot
+   call its tools; `permission:"allow"` → confirm it runs without a prompt.
+5. **L3 per-tool deny / read-only default.** A gateway tool with `permission:"deny"` → the relay
+   refuses it (refusal string in the transcript, tool not executed). A read-only Composio tool with
+   no explicit permission → defaults to allow (runs without a prompt). A mutating one → defaults to
+   ask. Verify the permission survived the save (it is a TOP-LEVEL `permission` key on the tool,
+   not in `agenta_metadata`).
+6. **L3 HITL multi-turn round-trip (the deferred S5 unknown).** In the `/messages` playground chat,
+   trigger an `ask` tool. Turn 1: expect a `tool-approval-request` to surface (approve/deny buttons)
+   and the tool NOT to run. Approve. Turn 2: expect the gate to re-raise and the tool to actually
+   run; deny variant: expect refusal. THIS is the unverified design assumption (cold-replay
+   re-raise) — see `../../scratch/open-issues.md`. If turn 2 does not re-raise, apply the fallback
+   noted there.
+7. **Surfaces.** Drive checks 1–5 via BOTH the SDK (E4 script pulling config + running on host) and
+   the playground UI (`mcp__chrome-devtools__*` against `:8280`). The playground form (S4) must
+   render the three sections, gate the Claude section to the claude harness, and persist values
+   that reach the run.
+
+**Pin a replay regression.** Capture one green `/run` per layer, redact volatile fields, and write
+a replay test (`agent-replay-test` skill) so the cells stay green cost-free in CI.
+
+Acceptance: the matrix in `../qa/` gains capability + per-tool-permission rows, green on the
+harness/backend pairings the enforceability table claims.
diff --git a/docs/design/agent-workflows/projects/capability-config/proposal.md b/docs/design/agent-workflows/projects/capability-config/proposal.md
new file mode 100644
index 0000000000..a40fb76d25
--- /dev/null
+++ b/docs/design/agent-workflows/projects/capability-config/proposal.md
@@ -0,0 +1,222 @@
+# Proposal: configuring agent capabilities and permissions
+
+The canonical design. How an author controls what an agent may do, and how those controls reach
+each harness and backend. Why this matters and how it fits the other agent-workflows projects is
+in `context.md`; the codebase findings are in `research.md`; the work is phased in `plan.md`.
+
+## The problem
+
+Every agent run has to be governed. The author needs to say what the agent may touch (files,
+the network, which tools), and the system has to enforce that across two harnesses (`pi` and
+`claude`) and three backends (sandbox-agent local, sandbox-agent on Daytona, and a future
+in-process local SDK).
+
+Today almost none of this is wired. The runner drops Pi's tool selection, never restricts
+Claude, and never sets a network boundary. So a request as simple as "give this agent web
+access but not write access" is not expressible on either harness.
+
+The fix is three configuration layers, each with one job and one enforcement point. This
+document defines the three and shows how each reaches `pi` and `claude`.
+
+## The three layers
+
+The three layers answer three different questions. Keep them separate. Collapsing them is what
+made the current code confusing.
+
+1. **Harness configuration** sets how the harness itself behaves: its permission mode and its
+   own tool rules.
+2. **Sandbox permission** sets what the running process can physically do: reach the network,
+   write the filesystem.
+3. **Tool permission** sets what happens to a single tool call: run it, ask a human, or refuse
+   it.
+
+Layer 1 configures the agent. Layer 2 draws the security boundary. Layer 3 governs each call.
+The next three sections take them in turn.
+
+### Layer 1: harness configuration (author kwargs to a settings file)
+
+The author sets harness-specific options in the existing generic `harness_options` kwargs (a map
+keyed by harness: `harness_options.claude.permissions`, `harness_options.pi.*`, and so on, room
+for any future harness). The author never writes harness-native config by hand and the generic
+interface stays free of per-harness fields. The SDK's per-harness adapter (in Python) translates
+those neutral options into the harness's native config files, and the runner writes those files
+into the session cwd before the session starts. The runner stays harness-agnostic: it materializes
+a generic `harnessFiles` list (`{path, content}`) and knows nothing about Claude. This scales:
+ten harnesses means ten Python adapters, not ten first-party interface fields.
+
+For **Claude**, that native config is a `.claude/settings.json` file written into the session's
+working directory. The Python claude adapter renders it; the Claude ACP adapter reads it. The Claude ACP adapter reads it, because it builds the underlying SDK query
+with `settingSources: ["user", "project", "local"]` (`acp-agent.js:954`). Through that one file
+the runner sets:
+
+- the permission mode, via `permissions.defaultMode` (`default`, `acceptEdits`, `plan`, or
+  `bypassPermissions`), which the adapter reads at session start (`acp-agent.js:935`);
+- per-tool rules, via `permissions.allow` / `deny` / `ask`, in Claude's rule syntax (`"Read"`,
+  `"Bash(npm run:*)"`, `"mcp__<server>__<tool>"`);
+- `env` and `model`.
+
+This is the clean delivery path, and it is the only one. The other channel, the
+`_meta.claudeCode.options` passthrough, never arrives: sandbox-agent strips `_meta` from the
+session request (`index.d.ts:2778`). The settings file does not depend on that channel, so it
+works over sandbox-agent today with no upstream change. The runner already owns the working
+directory (a temp dir), so it writes the file before it creates the session. For the mode alone
+there is also a runtime control, `session.setMode(modeId)`, if we ever want to switch mode
+mid-run.
+
+For **Pi**, the lever is thinner. Pi exposes no settings file and no permission mode over the
+ACP bridge; its probe reports `permissions: false`. Pi's Layer 1 is its built-in tool selection,
+`builtin_names`: which native tools (read, write, edit, bash) the model is given at all. The
+runner must honor that list, which it drops today. If Pi later grows a config surface, it
+attaches here.
+
+### Layer 2: sandbox permission (the security boundary)
+
+An optional `sandbox_permission` field on `AgentConfig` declares what the process may physically
+do: its network egress and its filesystem access. The backend enforces it when it provisions
+the sandbox. The harness does not, and the runner logic does not.
+
+This layer is the only real boundary for the network and the filesystem. A harness tool rule can
+hide Claude's `WebFetch`, but Pi can still `curl` from `bash`, so "no web" is a guarantee only
+when the sandbox blocks egress. On Daytona the backend sets `networkBlockAll` or a
+`networkAllowList` (CIDR ranges) at create time. The local sidecar and the future local SDK have
+no sandbox, so they cannot enforce this layer at all.
+
+That gap must stay honest. When a config asks for a network or filesystem guarantee that the
+chosen backend cannot deliver, the run fails loud, unless the author sets an explicit per-axis
+opt-out for local development. We never tell the author the web is off when it is not. Filesystem
+confinement is not real on any backend today, so Layer 2 ships network first and declares
+filesystem without enforcing it.
+
+### Layer 3: tool permission (the sidecar-managed permission policy)
+
+This layer is the sidecar's own permission policy, and it carries the human-in-the-loop gate. It
+subsumes `permission_policy`: that auto/deny switch is just this layer's global default.
+
+For each tool, the author assigns one permission, stored on the tool's own spec:
+
+- **allow.** The call runs with no prompt. If the tool is one we resolved (a gateway or
+  code tool, not a harness builtin), the runner runs it through the relay. This is today's
+  auto-accept behavior.
+- **ask.** The runner raises a human-in-the-loop request and waits for the answer. For now it
+  asks the user in the playground chat. Later it raises a durable approval event, so a triggered
+  or scheduled run can be answered even when no one has the chat open (Flow 7).
+- **deny.** The call never runs.
+
+The permission lives next to the thing it governs. A resolved tool carries its permission on
+its tool spec; an MCP server carries one on its server spec, which the runner renders as a
+whole-server `mcp__<server>` rule or a per-tool `mcp__<server>__<tool>` rule. Anything with no
+explicit permission falls back to the global default, `permission_policy`. Harness builtins have
+no spec, so their permission is rendered in Layer 1 instead: Claude builtins as settings.json
+rules, Pi builtins as `builtin_names` selection.
+
+Where Layer 3 is enforced depends on where the tool runs, and this is the subtle part. There
+are two cases.
+
+Resolved tools (gateway, code) run in the runner, through the relay. The runner is the choke
+point, so it applies the permission directly: allow runs the call, ask parks it, deny
+refuses it. This works the same on `pi` and `claude`.
+
+Harness builtins (Claude `Bash`/`Edit`/`Read`, Pi `bash`/`read`) run inside the harness, where
+the runner cannot intercept them. For Claude, the settings.json `allow`/`deny`/`ask` rules set
+the static baseline, and any call that still needs a decision arrives at the runner's responder
+through Claude's permission callback, carrying the tool name. The responder applies the
+permission there. For Pi, builtins cannot be gated, because Pi never asks. The only way to deny
+a Pi builtin is to not grant it in Layer 1.
+
+The author should not have to label every tool by hand. Composio already tells us whether a tool
+reads or writes, through hint tags (`readOnlyHint`, `destructiveHint`, `updateHint`) that the
+catalog parser strips today. We will keep them, carry a read-only flag onto the tool, and
+default read-only tools to allow and mutating tools to ask. The author overrides any
+default.
+
+## Where `permission_policy` fits
+
+Folded in. `permission_policy` (auto or deny) is the global default of Layer 3: the answer the
+sidecar gives for any tool with no explicit permission and no human. The HITL gate, the
+per-tool permissions, and `permission_policy` are one thing, the sidecar-managed permission
+policy. There is no separate permission plane.
+
+One implementation caution, so the fold does not blur two ideas in code. Layer 3 carries two
+distinct things, and they must stay distinct internally even though they are one policy to the
+author:
+
+- the **permission** of a tool — allow, ask, or deny. This is static capability config and
+  rides on the tool's spec.
+- the **responder mode** for an `ask` that reaches the runtime with no human — block and wait for
+  the UI, emit a durable approval event, auto-allow, or deny. This is a runtime answering choice,
+  and `permission_policy` is its default.
+
+A tool set to `ask` always asks; what happens to that ask when nobody is watching is the
+responder mode. Collapsing the two — treating `permission_policy` as if it were a fourth
+permission — is the mistake to avoid.
+
+## A worked example: web off, read-only, on Daytona
+
+Take an author who sets `sandbox_permission.network: off` and a read-only tool profile, and runs
+on Daytona.
+
+- Layer 1 gives Claude a `.claude/settings.json` that denies `Write`, `Edit`, `Bash`,
+  `WebFetch`, and `WebSearch`. Pi gets a `builtin_names` of `read` only.
+- Layer 2 tells the Daytona backend to create the sandbox with `networkBlockAll: true`.
+- Layer 3 has little to do here, because the write and web tools are already gone. Any resolved
+  tool the agent still calls runs through the relay under its permission.
+
+Claude now holds no write or web tools, and the VM has no egress. Pi holds only `read`, and its
+`bash` is gone, so it cannot `curl` even if it tried, and the VM would block it anyway. Both
+harnesses are safe, with the tool layer and the sandbox layer reinforcing each other.
+
+Run the same config on the local sidecar and Layer 1 still works: Claude loses the tools, Pi
+gets only `read`. Layer 2 cannot: the local provider is the host, with no egress control. So the
+run fails loud unless the author opted into unsafe local execution. That is the
+honest-degradation rule in action.
+
+## Known risk: the runner-host execution surface
+
+The "sandbox layer is authoritative" claim holds only for tools that run inside the sandbox.
+Resolved `code` **and** gateway/callback tools do not: the relay runs both in the runner process
+(`tools/relay.ts` — `runCodeTool` for `code`, `callAgentaTool` for gateway). So a network-blocked
+Daytona sandbox confines the harness's own `bash` and `WebFetch`, but it does not confine a
+resolved code tool or a gateway tool, which run on the runner host with the runner's network. A
+network-blocked agent can still egress through either.
+
+This project does not hand-wave that. The guarantee is gated at Phase 1, with a target and an
+interim guard:
+
+- **Target:** move resolved-tool execution into the sandbox, so one boundary covers everything.
+- **Interim guard:** when `network: off` or `exec: off`, reject or remove `code` tools,
+  gateway/callback tools, and stdio MCP servers (which are arbitrary commands), and confine the
+  runner host separately.
+
+The decision is tracked in `status.md`. Until it lands, `network: off` is only a full guarantee
+for harness-native tools, so the runner must not *claim* `network: off` while a runner-side tool
+is still reachable.
+
+## What each pairing can enforce
+
+The guarantees vary by harness and backend. The design must record that variance, not discover
+it at runtime.
+
+| Capability | Pi tool layer | Claude tool layer | Daytona sandbox layer | Local sandbox layer |
+| --- | --- | --- | --- | --- |
+| network off | none (no web tool; curl remains) | deny WebFetch/WebSearch | enforce (networkBlockAll) | cannot, fail loud |
+| network allowlist | none | none (no per-host tool gate) | enforce (networkAllowList CIDR) | cannot, fail loud |
+| no code execution | drop `bash` from builtins | deny Bash + mode | partial (interpreters remain) | tool layer only |
+| read-only | drop write/edit/bash | deny Write/Edit/Bash | no fs jail today | no fs jail today |
+
+The empty cells are the honest gaps. Pi has no web tool, so its web access is purely a sandbox
+concern. Neither harness has a per-host web gate, so a network allowlist is a sandbox guarantee
+for both. This is why the network boundary has to live in Layer 2 to be real.
+
+## Decisions
+
+1. Three layers, three jobs. Harness configuration, sandbox permission, tool permission.
+2. Claude config ships as `.claude/settings.json`, written into the session cwd before session
+   start. No `_meta`, no upstream change.
+3. MCP permissions are settings.json `mcp__` rules. The separate per-server `tools` allowlist
+   that the runner parses but never enforces is dropped.
+4. Pi MCP stays out of scope, and follows the Claude pattern when built.
+5. Composio hints drive Layer 3 defaults: keep them, carry a read-only flag, default read-only
+   to allow and mutating to ask.
+6. The sandbox layer is authoritative for the network; it declares filesystem confinement but
+   enforces none today (no backend has a real fs jail). The tool layer is best effort. A run
+   fails loud when a backend cannot deliver a requested guarantee.
diff --git a/docs/design/agent-workflows/projects/capability-config/research.md b/docs/design/agent-workflows/projects/capability-config/research.md
new file mode 100644
index 0000000000..8c630e6b2e
--- /dev/null
+++ b/docs/design/agent-workflows/projects/capability-config/research.md
@@ -0,0 +1,204 @@
+# Research: current state and insertion points
+
+Self-contained map of how the runtime governs an agent today and where the three layers attach.
+All claims trace to code or installed package source. The deep web/exec/read/write current-state
+cut lives in `../../scratch/capability-map.md`; this doc adds the enforcement mechanics and the
+exact insertion points for the backend, the runner, and the frontend. Line numbers are accurate
+as of 2026-06-23 and a few are approximate (flagged); reconfirm at implementation time.
+
+## 1. Enforcement mechanics (the load-bearing library facts)
+
+**Claude config reaches the harness through `.claude/settings.json`.** The Claude ACP adapter
+builds the underlying SDK query with `settingSources: ["user", "project", "local"]`
+(`@zed-industries/claude-agent-acp` `acp-agent.js:954`), so the SDK reads
+`<cwd>/.claude/settings.json` and honors its `permissions.allow` / `deny` / `ask` rules. The
+adapter reads `permissions.defaultMode` for the initial permission mode (`acp-agent.js:935`).
+The `_meta.claudeCode.options` channel (disallowedTools, permissionMode) is unusable, because
+sandbox-agent strips `_meta` from the session request
+(`sessionInit?: Omit<NewSessionRequest, "_meta">`, `sandbox-agent/dist/index.d.ts:2778`). So the
+settings file is the one clean delivery path. For the mode alone there is a runtime control,
+`session.setMode(modeId)` (`index.d.ts:3064`; modes `default`/`acceptEdits`/`plan`/
+`bypassPermissions`, `acp-agent.js:1046-1078`). The runner owns the cwd (a temp dir), so it can
+write the file before `createSession`.
+
+**Pi has no permission gate over ACP, and its built-in tool restriction is backend-dependent.**
+Its permission probe reports `permissions: false`. The built-in tool lever (S0b finding, verified
+in code) is real but only reachable on one backend:
+
+- The Pi SDK natively supports restriction: `createAgentSession({ tools, excludeTools, noTools })`
+  and the CLI flags `--tools` / `--no-builtin-tools` / `--exclude-tools`
+  (`@earendil-works/pi-coding-agent@0.79.4` `dist/core/sdk.d.ts`, `dist/cli/args.js:79-96`).
+- The **in-process** Pi engine already uses it: `pi.ts:311-314` passes `tools: toolAllowlist`
+  (built from `request.tools`) into `createAgentSession()`. So Pi Layer 1 **works on the
+  in-process / local backend**.
+- The **sandbox-agent ACP** path does not. `pi-acp@0.0.29` hardcodes the spawn as
+  `pi --mode rpc --no-themes` (`pi-acp/dist/index.js:134-142`) and its `newSession` accepts only
+  `cwd` + `mcpServers` (`index.js:1701`) — no `tools`/`excludeTools`/`noTools` forwarded. And
+  `sandbox_agent.ts:208-212` passes only `{ agent, cwd, sessionInit: { cwd, mcpServers } }`,
+  dropping `request.tools` (`run-plan.ts:101`).
+
+So Pi's Layer 1 is supported in-process and **unsupported over sandbox-agent ACP** until pi-acp
+forwards the flags. Design consequence: honor `builtin_names` on the in-process backend, and
+**fail loud** when an author requests Pi built-in restriction on a sandbox-agent backend
+(`buildRunPlan` validation in `sandbox_agent.ts`, ~:108-115), rather than silently granting the
+full set. Future lever: patch/fork pi-acp to add `--tools`/`--exclude-tools` to the spawn args
+(or upstream it), which would lift the restriction to the ACP path too.
+
+**The permission responder seam already exists.** When the harness raises an ACP permission
+request, `attachPermissionResponder` emits an `interaction_request` event and asks a `Responder`
+(`engines/sandbox_agent/permissions.ts:16-45`). Today `PolicyResponder` answers headlessly from
+`permission_policy` (`responder.ts:44-62`). The request carries the tool call, so a per-tool
+responder can branch on the tool name.
+
+**Resolved tools run in the runner, not the sandbox.** Gateway and code tools execute runner-side
+through the relay (`tools/relay.ts`), regardless of backend. This is the basis for enforcing
+Layer 3 at the relay, and it is also the security caveat in section 5.
+
+## 2. Backend, SDK, and runner insertion points
+
+From a code sweep on 2026-06-23. Reconfirm line numbers when implementing.
+
+**Config DTOs** (`sdks/python/agenta/sdk/agents/dtos.py`):
+- `AgentConfig` (~309-369) holds `instructions`, `model`, `tools`, `mcp_servers`, `skills`,
+  `harness_options`. Add optional `sandbox_permission` here.
+- `RunSelection` (~372-395) already holds `harness`, `sandbox`, `permission_policy`.
+- `SessionConfig` (~571-620) is the wire bag. Per-tool permissions ride on each `ToolSpec`, not
+  a separate map here; MCP permissions ride on each `mcp_servers` entry.
+- `PiAgentConfig` / `ClaudeAgentConfig` / `AgentaAgentConfig` (~468-564) and their `wire_tools()`
+  are where tool/permission serialization happens.
+- `ToolSpec` (`tools/models.py:95-157`) already has `needs_approval` and `render`; add the
+  permission (always-allow/ask/deny) and the `read_only` flag, and serialize both in
+  `to_wire()`. Each `mcp_servers` entry gains a matching permission field.
+
+**Schema generation:**
+- `AgentConfigSchema` (`sdk/utils/types.py` ~1065-1138) is the `agent_config` catalog type the
+  playground renders. Add `sandbox_permission`, and a per-tool permission shape on `tools`.
+- `services/oss/src/agent/schemas.py` holds `_DEFAULT_AGENT_CONFIG` and the `x-ag-type-ref`
+  reference; add the default for the new field.
+
+**Concrete `sandbox_permission` schema (decision 11).** The first-slice shape, kept minimal and
+extensible:
+
+```python
+class NetworkEgress(BaseModel):
+    mode: Literal["on", "off", "allowlist"] = "on"
+    allowlist: list[str] = []  # CIDR ranges; honored when mode == "allowlist"
+
+class SandboxPermission(BaseModel):
+    network: NetworkEgress = NetworkEgress()
+    filesystem: Optional[Literal["on", "readonly", "off"]] = None  # declared, NOT enforced today
+    enforcement: Literal["strict", "best_effort"] = "strict"       # strict = fail loud; best_effort = local opt-out
+```
+
+Daytona maps `network.mode == "off"` → `networkBlockAll: true`, and `"allowlist"` →
+`networkAllowList: <cidr[]>`. `filesystem` is carried and surfaced but enforces nothing until a
+backend gains an fs jail. `enforcement: "strict"` makes a backend that cannot deliver a requested
+network guarantee (local sidecar, local SDK) fail loud; `"best_effort"` is the per-axis opt-out
+for local development. Named presets (e.g. "locked down") are FE sugar deferred to S4, not a wire
+concept.
+
+**Ports for Layer 2 — the policy crosses the Python→TS wire.** Daytona is provisioned in the TS
+runner (`buildSandboxProvider`), not in the Python `Backend.create_sandbox`, so the policy must
+travel on the run request, not only as a Python call argument. The full path (Codex-confirmed):
+
+- Python authoring → wire: add `sandbox_permission` to `AgentConfig`
+  (`sdks/python/agenta/sdk/agents/dtos.py`), carry it on `SessionConfig`, and serialize it in
+  `request_to_wire` (`sdk/agents/utils/wire.py`) so it lands in the `/run` request.
+- TS runner: surface it on `AgentRunRequest` (`services/agent/src/protocol.ts`), thread it through
+  `buildRunPlan` (`engines/sandbox_agent/run-plan.ts`) into `buildSandboxProvider`, and set it on
+  the Daytona `create` object (`engines/sandbox_agent/provider.ts` ~14-43, which sets
+  `snapshot`/`target`/`envVars`/`ephemeral` only today) as `networkBlockAll` / `networkAllowList`.
+- The golden wire fixtures (`.../golden/run_request.*.json`) and both contract tests
+  (`test_wire_contract.py`, `tests/unit/wire-contract.test.ts`) move together with the new field.
+- The Python `Backend.create_sandbox` (`interfaces.py` ~155, parameterless) still gains the typed
+  policy for the local/in-process backend path, and `Environment` (~177-232) derives it from
+  `session_config`. For shared sandboxes (`sandbox_per_session=False`), reject a post-create
+  policy change or cache by policy. But the wire path above is the one that reaches Daytona.
+
+**Runner enforcement:**
+- `run-plan.ts` (~67-145) builds the cwd; write `.claude/settings.json` here or just before
+  `sandbox.createSession` in `sandbox_agent.ts` (~195).
+- `responder.ts` `PolicyResponder` and `permissions.ts` `attachPermissionResponder`: thread the
+  per-tool map into the responder.
+- `relay.ts` `executeRelayedTool` (~92-113): enforce deny/ask/allow for resolved tools.
+- `mcp.ts:26`: the per-server `tools` allowlist is parsed but unenforced; replace it with
+  settings.json `mcp__<server>` / `mcp__<server>__<tool>` rules.
+
+## 3. Frontend insertion points
+
+From a code sweep on 2026-06-23. The form is generic-schema-driven, which is the good news:
+fields that the schema declares render through the existing pipeline.
+
+**The agent config form** (`web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/`):
+- `AgentConfigControl.tsx` (the composite, ~186-323) renders instructions, model, tools,
+  mcp_servers, harness, sandbox, permission_policy. It reads the live `harness` off the config
+  object. New sections render here.
+- `SchemaPropertyRenderer.tsx` (~80-217) routes a schema field to a control by type
+  (`enum` → EnumSelectControl, `grouped_choice` → GroupedChoiceControl, `agent_config` →
+  AgentConfigControl). A custom `x-ag-type` dispatches to a custom control.
+- `McpServerItemControl.tsx` and `ToolItemControl.tsx` are the per-item controls.
+
+**Already present and reusable:**
+- CORRECTION (S3a, verified 2026-06-23): the FE does **not** round-trip
+  `tool.agenta_metadata.permission_mode` today — there are zero `permission_mode` references in
+  `web/`. The earlier "head start" claim was a future-state assumption. `ToolItemControl` /
+  `AgentConfigControl` round-trip a free-form `agenta_metadata` bag with no permission. So S4 must
+  **build** the per-tool permission control, not just wire it. Good news: the SDK `permission`
+  field accepts `permission_mode` / `permissionMode` via `AliasChoices`, so if S4 writes
+  `agenta_metadata.permission_mode` with the `allow|ask|deny` vocabulary, it deserializes into
+  `permission` with no mapping and no breaking change.
+- The HITL "ask" surface DOES have a head start (confirmed in S2 review): the agent chat uses
+  ai-sdk `useChat` and already exposes `addToolApprovalResponse`
+  (`web/oss/src/components/AgentChatSlice/AgentChatPanel.tsx:86`), and `ToolPart.tsx:153` already
+  renders approve/deny buttons. The missing piece for S5 is the runtime parked/resumed path:
+  mapping the runner's `interaction_request` onto the ai-sdk approval part and resuming the call.
+
+**Schema and persistence flow:** the schema arrives through the workflow molecule
+(`agenta-entities/src/workflow/state/molecule.ts`, `parametersSchema`), and config edits persist
+through `updateConfiguration` → `updateWorkflowDraftAtom`. New fields under `data.parameters.*`
+persist automatically once the schema declares them and `AgentConfigControl` calls `setField`.
+
+**Per-harness gating is not built.** Today every field renders unconditionally. Gating reads the
+live `harness` value and a harness-capabilities map (the `/inspect` capabilities document from
+`../harness-capabilities/`) and hides inapplicable fields.
+
+## 4. Composio read/write metadata
+
+Composio returns MCP behavioral hint tags per action (`readOnlyHint`, `destructiveHint`,
+`updateHint`, `idempotentHint`, ...). The catalog parser strips them as noise
+(`api/oss/src/core/tools/providers/composio/catalog.py:278,362`), and `ToolCatalogAction` /
+`ToolCatalogActionDetails` (`api/oss/src/core/tools/dtos.py:41-54`) carry no mutation field. So
+the read-vs-write signal exists at the source and we discard it. Phase 0 keeps it: derive
+`read_only` from `readOnlyHint` (and treat `destructiveHint`/`updateHint` as mutating), carry it
+on the catalog action and the resolved tool, and default Layer 3 permissions from it.
+
+## 5. Security caveat: the runner-host execution surface
+
+This is the sharpest correctness risk and the design must not hide it. `executeRelayedTool`
+(`tools/relay.ts` ~:92-113) runs **both** resolved `code` tools (`runCodeTool`, ~:101) and
+gateway/callback tools (`callAgentaTool` over HTTP, ~:106) in the runner process, not in the
+sandbox. So Daytona's `networkBlockAll` confines the harness's own `bash`/`WebFetch` (which run
+in the VM) but does **not** confine a resolved `code` tool *or* a gateway/callback tool, both of
+which run on the runner host with the runner's network. A network-blocked Daytona agent can still
+egress through either. Two ways out, gating the `network: off` guarantee in Phase 1 (S1g):
+
+- **(a) target:** move resolved-tool execution into the sandbox, so the sandbox plane is truly
+  authoritative for every tool.
+- **(b) interim guard:** keep the relay runner-side, but when `network: off` or `exec: off`,
+  reject or remove `code` tools, **gateway/callback tools**, and stdio MCP servers (MCP servers
+  are arbitrary commands), and confine the runner host separately. The guard must cover
+  gateway/callback, not only `code` and MCP — that was the gap Codex caught.
+
+## 6. Uncertainties to resolve during implementation
+
+1. How the Layer 2 policy threads into `buildSandboxProvider` (no config param today).
+2. Mutation detection for `read_only` enforcement on arbitrary resolved-tool code (explicit flag
+   vs input inspection). The honest first cut is to treat `read_only` as an advisory default for
+   the permission, not a hard runtime block on resolved tools.
+3. `LocalBackend.create_sandbox` signature parity with the new policy parameter.
+4. The exact `.claude/settings.json` contents validated against Claude Code's settings schema,
+   and the `mcp__<server>__<tool>` naming validated on a live run.
+5. Resolved: per-tool permissions live on the tool spec, and MCP permissions live on the MCP
+   server spec. (The FE does NOT round-trip `permission_mode` yet — see the S3a correction in
+   section 3; S4 builds that control.)
+   Keep the wire field name consistent across SDK, wire, and FE.
diff --git a/docs/design/agent-workflows/projects/capability-config/status.md b/docs/design/agent-workflows/projects/capability-config/status.md
new file mode 100644
index 0000000000..095d2d541d
--- /dev/null
+++ b/docs/design/agent-workflows/projects/capability-config/status.md
@@ -0,0 +1,330 @@
+# Status
+
+Source of truth for where this project stands. Keep it current.
+
+## State
+
+**Code-complete, reviewed, green. Live QA + branch pending.** All three layers plus Phase 0
+are implemented across backend, runner, and frontend, each slice paired with a review and tests.
+Full suites pass together: **293 SDK agents + 15 API tools + 10 services/oss + 177 TS**, tsc and
+ruff clean. What remains is (1) the live QA pass against a deployed stack (checklist in
+`plan.md` S6) and (2) landing the work on a GitButler lane. Codex reviewed the project (below)
+and its feedback is folded in. Built under the `implement-feature` skill, slice by slice.
+
+## Live QA (2026-06-24, against :8280, direct sandbox-agent /run)
+
+Ran on the deployed stack (`agenta-ee-dev-wp-b2-rendering`, sidecar mounts `services/agent/src` +
+tsx, so a restart deployed the runner changes). Drove `/run` directly with forced un-guessable
+tokens. **No feature-code bugs found.**
+
+- **L3 per-tool `deny` — PASS.** Identical code tool, only `permission` flipped: `deny` →
+  `tool_result="Tool 'get_build_id' is denied by policy."`, token `BUILDID_A7F3_QA` absent, code
+  never ran; `allow` → token present. (relay.ts enforcement.)
+- **L1 Claude `.claude/settings.json` — write PASS, behavior BLOCKED.** Captured the file the
+  runner actually wrote mid-run: `{"permissions":{"deny":["WebFetch","WebSearch"]}}` (exactly the
+  `claudeSettings.deny`). The behavioral half (Claude refusing WebFetch) needs a real
+  `secrets.ANTHROPIC_API_KEY` — none was obtainable (vault scan correctly safety-denied), runs die
+  at Claude auth. The write is the mechanism; Claude honoring it is its documented `settingSources`
+  behavior (code-verified earlier).
+- **L2 runner-host guard — PASS.** daytona+code+`network:off`/strict and local+`network:off`/strict
+  both rejected at plan time with the runner-host / local-cannot-enforce messages; `best_effort`
+  control allowed the run. (run-plan.ts guard.)
+- **L2 Daytona `networkBlockAll` — INCONCLUSIVE (environment, not feature).** Code maps
+  `off`→`{networkBlockAll:true}` correctly and the field reaches Daytona without error, but the
+  Daytona org blocks ALL outbound, so the `network:on` positive control also fails. Needs a Daytona
+  target with working egress to demonstrate off-vs-on.
+
+**Finding — backend-routing footgun (now hardened):** `backend:"pi"` (legacy in-process
+`engines/pi.ts`) enforced NONE of the three layers — a silent bypass. Only `backend:"sandbox-agent"`
+(the default) enforces. Addressed by making the in-process engine fail loud (see changelog).
+
+What still needs a live run before full sign-off: the Daytona egress positive control (needs an
+egress-capable target) and the Claude behavioral refusal + HITL multi-turn round-trip (need a
+project Anthropic key). The enforcement LOGIC is otherwise live-proven.
+
+## Decisions made
+
+1. **Three layers, three jobs.** Harness configuration (Layer 1), sandbox permission (Layer 2),
+   tool permission (Layer 3). Each has one enforcement point.
+2. **Claude config ships as `.claude/settings.json`,** written into the session cwd before
+   session start. It carries the permission mode and the per-tool allow/deny/ask rules. No
+   `_meta`, no upstream sandbox-agent change.
+3. **MCP permissions are settings.json `mcp__` rules** (`mcp__<server>` or
+   `mcp__<server>__<tool>`). The unenforced per-server `tools` allowlist is dropped.
+4. **Pi MCP stays out of scope,** and adopts the Claude pattern when built (tracked in
+   `../harness-capabilities/`).
+5. **Composio read/write hints drive Layer 3 defaults:** keep the hints, carry a `read_only`
+   flag, default read-only to always-allow and mutating to ask.
+6. **The sandbox layer is authoritative for the network.** It declares filesystem confinement but enforces none today (no fs jail on any backend). The tool layer is best
+   effort. A run fails loud when a backend cannot deliver a requested guarantee.
+7. **Scope is end to end, including the playground frontend:** the config form (Phase 4) and the
+   chat tool-approval surface (Phase 5).
+8. **`sandbox_permission` is the confirmed name for Layer 2.**
+9. **`permission_policy` folds into Layer 3 as its global default.** The HITL gate, the per-tool
+   permissions, and `permission_policy` are one sidecar-managed permission policy. There is no
+   separate permission plane.
+10. **Permissions live on the spec they govern:** a resolved tool's permission on its tool
+    spec, an MCP server's permission on its server spec. Not a separate map. Harness builtins
+    have no spec, so their permission is rendered in Layer 1 (Claude settings.json rules, Pi
+    `builtin_names`).
+11. **`sandbox_permission` shape (S1).** A typed object: `network` = `{ mode: on|off|allowlist,
+    allowlist: [cidr] }` (default `on`); `filesystem` = `on|readonly|off` declared but **not
+    enforced** today; `enforcement` = `strict|best_effort` (default `strict` = fail loud when the
+    backend cannot deliver the requested guarantee; `best_effort` is the per-axis local opt-out).
+    The first slice ships `network: off` enforcement on Daytona; `allowlist` and named presets are
+    follow-ups (presets become FE sugar in S4). Concrete schema in `research.md` §2.
+12. **Runner-host hole → interim guard now, sandbox-execution later (S1g).** Best-judgment call
+    (user authorized "use best judgement, note it"): ship the interim guard — when `network: off`
+    or `exec: off`, reject at plan time any runner-side-executed tool (`code`, gateway/callback,
+    stdio MCP) unless it can run inside the sandbox. The target (move resolved-tool execution into
+    the sandbox so one boundary covers everything) is recorded as deferred future work, not built
+    in this project. The runner must never *report* `network: off` while a runner-side tool is
+    still reachable.
+
+## Resolved (2026-06-23, user)
+
+- `sandbox_permission` is the confirmed name for Layer 2.
+- `permission_policy` folds into Layer 3 as the global default of the sidecar-managed permission
+  policy. The HITL gate is the same thing, not a separate plane.
+- Per-tool permissions live on the tool spec; MCP permissions live on the MCP server spec.
+
+## Codex review (2026-06-23)
+
+Codex (gpt-5.5, xhigh, read-only) reviewed the project. Verdict: "Good direction, not ready to
+implement as written" — the three-layer shape is right, three seams need work. Summary of
+its seven points:
+
+**Acting on (Codex correct, verified in code):**
+
+1. **Runner-host guard is under-scoped and too late.** `relay.ts` `executeRelayedTool` runs
+   *both* `code` (`runCodeTool`, ~:101) and gateway/callback (`callAgentaTool`, ~:106) tools in
+   the runner process. The interim guard must cover gateway/callback too, not just `code` + stdio
+   MCP, and it gates Phase 1's `network: off` acceptance, not Phase 3. Folded into the guard
+   scope in `proposal.md`/`research.md` and into Phase 1 acceptance in `plan.md`.
+2. **Layer 2 wire path was vague.** Daytona is provisioned in the TS runner's
+   `buildSandboxProvider`, not in `Backend.create_sandbox` (parameterless). The policy must cross
+   the Python→TS wire: `SessionConfig` → `request_to_wire` → `AgentRunRequest` → `buildRunPlan` →
+   provider `create`. Pinned in `research.md` section 2 and Phase 1.
+3. **Pi Phase 0 may be impossible as written.** The runner does drop `request.tools`
+   (`run-plan.ts:101`), but `pi-acp@0.0.29` spawns `pi --mode rpc --no-themes` with **no `tools`
+   field on `newSession`** (`index.js:135`). Phase 0's Pi half is now an investigation: find Pi's
+   real control surface (CLI args, a Pi config file, or an ACP session field) or mark Pi
+   builtin-restriction unsupported on sandbox-agent and fail loud. Verified in code.
+4. **Filesystem-authority contradiction.** The proposal said the sandbox layer is authoritative
+   for the filesystem in one place and "declared, not enforced" in another. Corrected to: Layer 2
+   is authoritative for the network; it declares filesystem but enforces nothing today.
+5. **Phase 5 is more "wire the runtime" than "build UI."** The approve/deny buttons already exist
+   (`ToolPart.tsx:153`) on top of `addToolApprovalResponse`. The real work is the parked/resumed
+   responder path, not rendering. Sharpened in `plan.md` Phase 5.
+
+**Holding the user's decisions (Codex conflicts, recorded not applied):**
+
+6. **Keep `sandbox_permission`.** Codex prefers `sandbox_policy`/`environment_policy` because
+   "permission" is overloaded. The user confirmed `sandbox_permission` twice; keeping it. Note
+   recorded so the objection is not lost.
+7. **Keep `permission_policy` folded into Layer 3.** The user decided this. Codex's fair
+   sub-distinction is captured as a note: Layer 3 holds two things that must not be conflated in
+   code — the per-tool *permission* (allow/ask/deny, static, on the spec) and the *responder
+   mode* (what a headless `ask` does: block for UI / emit durable approval / auto-allow / deny).
+   `permission_policy` is the responder-mode default; both live in the one sidecar-managed policy.
+
+## Open questions
+
+1. **The runner-host execution hole (Phase 1/3).** Resolved tools — `code` **and**
+   gateway/callback — run on the runner host, so a network-blocked Daytona sandbox does not
+   confine them. Pick the target (move execution into the sandbox) versus the interim guard
+   (reject `code`, gateway/callback, and stdio MCP when exec/network are off). Blocks the
+   `network: off` guarantee. See `research.md` section 5.
+2. **`read_only` enforcement strength.** Is `read_only` a hard runtime block on resolved tools,
+   or only an advisory default for the permission? The honest first cut is advisory.
+3. ~~**Pi's real control surface (Phase 0).**~~ **Resolved by S0b.** Backend-dependent: supported
+   in-process (`pi.ts:311` passes `tools`), unsupported over sandbox-agent ACP (`pi-acp@0.0.29`
+   forwards nothing). Design: honor `builtin_names` in-process, fail loud over sandbox-agent.
+4. **The implementation branch.** No pre-named `capability-config` lane exists. Best-judgment
+   plan: a new GitButler lane `feat/agent-capability-config` created at Phase 6, anchored on the
+   appropriate parent (the `docs/agent-harness-capabilities` lane or `main`), confirmed with the
+   user before any push. Recorded per the "use best judgement, note it" instruction.
+
+## Slices (implementation cut)
+
+Smallest shippable, independently reviewable units. Each names its acceptance check. Status:
+`todo` / `wip` / `done`.
+
+- **S0a — Composio `read_only` on the wire** (`done`, reviewed). Stopped stripping the Composio
+  mutation hints; `read_only` now flows catalog (`_derive_read_only`) → `ToolCatalogAction` /
+  `ResolvedAgentTool` → SDK `ToolSpec.to_wire()` (camelCase `readOnly`, `exclude_none`) → TS
+  `protocol.ts` `ResolvedToolSpec`. Golden + both contract tests updated. 218 SDK + 15 API + 7 TS
+  tests pass; reviewer APPROVED (precedence both-hints→mutating, no behavior change without a hint,
+  single resolution site covered). Unblocks S3 defaults.
+- **S0b — Pi control-surface investigation** (`done`). Finding (in `research.md` §1): Pi built-in
+  restriction is **backend-dependent**. The Pi SDK supports `tools`/`excludeTools`/`noTools`
+  (+ CLI `--tools`/`--no-builtin-tools`); the in-process engine already passes it (`pi.ts:311`).
+  But `pi-acp@0.0.29` hardcodes `pi --mode rpc --no-themes` and forwards nothing, so restriction
+  is **unsupported over sandbox-agent ACP** → honor `builtin_names` in-process, fail loud over
+  sandbox-agent. Future lever: patch/fork pi-acp to add the flags to its spawn.
+- **S1a — `sandbox_permission` config + wire plumbing** (`done`, reviewed). Model
+  (`NetworkEgress`/`SandboxPermission`) on `AgentConfig`, threaded `from_params` →
+  `_to_harness_config` (all 3 harnesses) → `request_to_wire` (`sandboxPermission`) → `protocol.ts`
+  `AgentRunRequest` → `buildRunPlan`/`RunPlan`. Catalog schema + `_DEFAULT_AGENT_CONFIG` declare
+  it; both `KNOWN_REQUEST_KEYS` guards + both contract tests updated; optionality proven (no key
+  when unset, claude golden omits it). 220 SDK + 111 TS pass, tsc + ruff clean. Reviewer APPROVED.
+  NO enforcement (deferred to S1b via `TODO(S1b)`).
+- **S1b — Daytona enforcement + fail-loud** (`done`, reviewed). `daytonaNetworkFields` maps
+  `off`→`networkBlockAll:true`, `allowlist`(non-empty)→`networkAllowList:"a,b"`,
+  `allowlist`(empty)→`networkBlockAll:true` (block-all, the must-fix from review — empty list =
+  allow nothing, never default-open); `buildSandboxProvider` spreads it into the Daytona `create`.
+  `buildRunPlan` fails loud (`strict` + restricted) on the local sidecar. Daytona network fields
+  verified against `@daytonaio/sdk` (`networkBlockAll?: boolean`, `networkAllowList?: string`).
+  Live `network: off` egress check deferred to Phase 3. typecheck + 126 TS tests pass.
+- **S1g — Runner-host guard** (`done`, reviewed). `buildRunPlan` rejects, at plan time (before
+  any cwd/temp-dir allocation), a `strict` + restricted-network run that carries a runner-side
+  tool: `executableToolSpecs` (code **and** gateway/callback — reviewer confirmed it filters out
+  only `kind:"client"`, so Codex's hole is closed) or a stdio MCP server (`hasStdioMcpServer`,
+  mirrors `mcp.ts` delivery rule). `best_effort` is the opt-out. Covered by run-plan unit tests.
+
+## Deferred follow-ups (defer-todo)
+
+- **S1b-py — Python in-process fail-loud.** A `network: off` + `strict` run on the in-process
+  Python SDK backend does NOT fail loud (it has no sandbox). `SessionConfig` intentionally omits
+  sandbox concerns, so this needs the typed policy threaded through `Backend.create_sandbox` /
+  `Environment.create_session` / `LocalBackend` (`sdks/python/agenta/sdk/agents/interfaces.py`,
+  `adapters/local.py`). Repro: run the in-process SDK path with `sandbox_permission.network=off`,
+  `enforcement=strict` — expect a loud failure, observe a silent unconfined run.
+- **S1b-pi-inproc — TS in-process Pi fail-loud.** Symmetric gap inside `services/agent`:
+  `engines/pi.ts` (in-process Pi, not the sandbox-agent path) has no sandbox-permission handling,
+  so `network: off` + `strict` runs unconfined there while the local-sidecar path correctly
+  rejects. Add the same fail-loud check to the in-process engine. Reviewer-flagged, nice-to-have.
+- **S4-readonly-fe — surface Composio `read_only` in the FE tool catalog.** The per-tool
+  permission control defaults from `read_only`, but the FE tool catalog / `ToolSelectionMeta`
+  (`ToolSelectorPopover.tsx`) doesn't carry `read_only` yet, so the default shows "Inherit policy"
+  until an author picks. The backend catalog already exposes `read_only` (S0a); plumb it from the
+  `/tools` catalog response into the added tool object so the default auto-populates. Layer 3 works
+  on explicit author choice without this; nice-to-have.
+- **S2 — Layer 1 Claude `.claude/settings.json`** (`done`, reviewed). `claudeSettings` (mode +
+  allow/deny/ask) flows `harness_options["claude"]["permissions"]` → wire → `prepareWorkspace`,
+  which writes `<cwd>/.claude/settings.json` for Claude (local `mkdirSync`+`writeFileSync`,
+  Daytona `mkdirFs`+`writeFsFile`), merging author rules with Layer-2-derived denies
+  (`network≠on`→WebFetch/WebSearch; `filesystem` readonly/off→Write/Edit), deduped. Pure
+  `buildClaudeSettings` returns `undefined` for Pi (no file) and when fully-open+unset. Structured
+  as `RuleSet[]` for S3. Reviewer live-verified the ACP adapter reads it + rejects invalid modes;
+  mkdir-before-write safe both ways. 136 TS + 274 SDK pass, tsc + ruff clean. Live rule-syntax
+  check deferred to Phase 3. mcp__ tool-spec rules + Layer-3 permissions deferred to S3.
+- **S3a — Layer 3 permission plumbing** (`done`, self-reviewed). `permission`
+  (`allow`/`ask`/`deny`) on `ToolSpecBase` + `MCPServerConfig`/`ResolvedMCPServer` + `protocol.ts`
+  `ResolvedToolSpec`/`McpServerConfig`. `effective_permission()` default ladder: explicit wins →
+  `needs_approval`→ask → `read_only` true→allow/false→ask → unset (runner falls to
+  `permissionPolicy`). `AliasChoices` accepts the FE's `permission_mode`. Goldens + contract tests
+  updated (sub-key, no `KNOWN_REQUEST_KEYS` change). 148 TS + 286 SDK pass. NO enforcement (S3b).
+- **S3b — Layer 3 enforcement** (`done`, reviewed). Relay enforces resolved-tool permission
+  (`resolvePermission`: deny→refusal string before any execution, allow→run, ask/unset→headless
+  `permissionPolicy`); `permissionPolicy` threaded into `startToolRelay`, same resolver as the
+  Claude-builtin responder. `claude-settings.ts` renders per-MCP-server `mcp__<server>` rules
+  (name verified against `toAcpMcpServers`). Responder untouched (Claude builtins handled by S2
+  settings.json + existing `PolicyResponder`); no fragile per-resolved-tool mcp rules. HITL "ask"
+  surfacing deferred to S5 (`TODO(S5)`). 166 TS pass. Reviewer APPROVED (deny-before-execute +
+  MCP-name both verified against downstream consumers).
+- **S4 — Playground form** (`done`, reviewed + fixed). New controls: `SandboxPermissionControl`
+  (network mode/allowlist/filesystem/enforcement), `ClaudePermissionsControl` (mode +
+  allow/deny/ask, gated to Claude harness, collapsible advanced), per-tool `ToolPermissionControl`
+  (allow/ask/deny). Persist under `data.parameters.agent.*` (`sandbox_permission`,
+  `harness_options.claude.permissions` preserving sibling slices, top-level tool `permission`).
+  Non-destructive; lint + typecheck clean. Live browser check deferred to Phase 3.
+- **S4b — Layer 3 persistence fix** (`done`). Review caught that per-tool permission was written
+  into `agenta_metadata` (stripped on save) AND that the authored config layer didn't carry it.
+  Fixed end to end: `ToolConfigBase.permission` (AliasChoices), `_copy_tool_metadata` +
+  `_apply_tool_metadata` + `platform/gateway.py` `CallbackToolSpec` (the gateway/playground path,
+  an extra miss) all carry it, FE writes top-level `permission` (survives strip). Round-trip
+  tests prove config dict → `ToolSpec.to_wire()` carries it (52 pass). Also fixed the `&#10;`
+  placeholder cosmetic.
+- **S5 — Playground HITL parked/resume** (`done` core, live round-trip deferred). The HITL path
+  was ~60% built (responder seam, `interaction_request` emission, Vercel egress/ingress, FE
+  approve/deny UI + auto-resume). This slice added the missing runtime: `HITLResponder`
+  (responder.ts) applies a stored approval decision on resume, parks (deny) on first occurrence
+  when a human surface is present (`hasHumanSurface = !!request.sessionId`; verified `/invoke`
+  leaves it None so headless is byte-identical to `PolicyResponder`), and falls to `basePolicy`
+  headless. `extractApprovalDecisions` reads the existing Vercel-converted `tool_result`
+  `{approved}` envelope by `toolCallId` — no new wire carrier. 177 TS pass. DEFERRED to live
+  verification: the multi-turn round-trip (does cold-replay re-raise the gate on turn 2 so the
+  decision applies, and does the harness re-attempt the tool) — exact live test in `open-issues.md`.
+  Relay-tool HITL deferred as S5.2 (relay is fire-and-forget; see `open-issues.md`).
+- **S6 — Tests + live verification** (`todo`). Unit, golden wire, and the QA-matrix cells the
+  enforceability table claims; pin one green run as a replay test.
+
+Current run starts with **S0a** (independent, low-risk) and **S0b** (read-only investigation) in
+parallel.
+
+## Next steps
+
+1. **Live QA** against a deployed stack: run the `plan.md` S6 checklist (Daytona egress block,
+   Claude settings.json, per-tool deny, HITL round-trip). Needs SDK container reload + a
+   sandbox-agent image rebuild + Daytona/Anthropic keys.
+2. **Land the branch.** The work is on the GitButler lane `feat/agent-capability-config`. Push +
+   open the PR when ready (`but push <lane>` then `gh pr create --base <parent>`).
+3. **After live QA:** update the user-facing `documentation/` pages and resolve the deferred
+   follow-ups (S1b-py, S1b-pi-inproc, S4-readonly-fe, S5.2 relay HITL, the S5 multi-turn round-trip).
+
+## Landing (branch handoff)
+
+The non-shared work is committed to the GitButler lane **`feat/agent-capability-config`**
+(commit `97581d25ef`, based on `main`): 32 files, +2451 lines — all new code + the project
+docs. Verified the concurrent skills-config slice was NOT swept in.
+
+**Left UNASSIGNED on purpose** (co-edited by the concurrent skills-config slice; both slices
+extend the same core config/wire surface, so wholesale-staging would steal the other session's
+hunks). Hunk-split these from the skills slice — or land the two slices together — before the
+lane builds/pushes:
+
+- `sdks/python/agenta/sdk/agents/dtos.py` (SandboxPermission, ClaudePermissions, wire methods)
+- `sdks/python/agenta/sdk/agents/utils/wire.py`, `adapters/harnesses.py`, `__init__.py`
+- `sdks/python/agenta/sdk/utils/types.py` (AgentConfigSchema fields)
+- `services/oss/src/agent/schemas.py` (`_DEFAULT_AGENT_CONFIG` default)
+- `services/agent/src/protocol.ts` (the new wire interfaces), `engines/sandbox_agent.ts`
+  (HITLResponder + relay policy wiring), `engines/sandbox_agent/run-plan.ts`
+- `web/.../SchemaControls/AgentConfigControl.tsx`, `index.ts`
+- `sdks/python/oss/tests/.../golden/run_request.{pi,claude}.json`, `test_wire_contract.py`
+- `services/agent/tests/unit/{wire-contract,sandbox-agent-run-plan,sandbox-agent-orchestration}.test.ts`
+
+**Finish + push** (after the shared files are assigned to the lane):
+
+```bash
+but rub <shared-file-or-hunk> feat/agent-capability-config   # per shared file/hunk
+but commit feat/agent-capability-config --only -m "..."       # the shared remainder
+but push feat/agent-capability-config
+gh pr create --head feat/agent-capability-config --base main
+```
+
+## Changelog
+
+- 2026-06-23: Project created from `../../scratch/capability-architecture.md`. Design, plan,
+  research, and status written. Scope extended to the playground frontend.
+- 2026-06-23: Confirmed `sandbox_permission` naming; folded `permission_policy` into Layer 3 as
+  its global default; placed per-tool permissions on the tool spec and MCP permissions on the
+  MCP server spec.
+- 2026-06-23: Codex reviewed (verdict above). Verified two claims in code: `pi-acp@0.0.29`
+  spawns `pi --mode rpc` with no `tools` field, and `relay.ts` runs gateway/callback tools
+  runner-side alongside `code`. Folded the feedback in, broadened the runner-host guard to
+  gateway/callback, fixed the filesystem-authority contradiction, and re-scoped Pi Phase 0 to an
+  investigation. Started `implement-feature`; cut the slice list (S0a–S6).
+- 2026-06-23: Landed Phase 0 + Layer 2 (code-complete, reviewed). S0a (Composio `read_only`,
+  reviewed), S0b (Pi finding), S1a (`sandbox_permission` config + wire, reviewed), S1b+S1g
+  (Daytona enforcement + runner-host guard, reviewed; fixed the empty-allowlist→default-open
+  footgun to block-all). All green: 220 SDK + 126 TS tests, tsc + ruff clean. Live Daytona/Claude
+  end-to-end verification batched to a later Phase-3 pass. Two in-process fail-loud gaps deferred
+  (S1b-py, S1b-pi-inproc). Next: S2 (Layer 1 Claude settings.json).
+- 2026-06-24: LIVE QA on :8280 + PR. Pushed PR #4811 (non-shared slice). Live-proved L3 deny
+  (token absent on deny, present on allow), the L1 settings.json write (captured the real file),
+  and the L2 runner-host guard (both reject messages + best_effort control). Daytona egress block
+  is code-correct but env-inconclusive (org blocks all egress); Claude behavioral refusal needs a
+  project Anthropic key. Found + FIXED + live-verified a footgun: `backend:"pi"` (in-process engine)
+  silently ignored all 3 layers; added `unenforceableCapabilityConfig` fail-loud guard in
+  `engines/pi.ts` (rejects restrictive sandbox_permission + deny/ask permissions), 8 tests, live
+  rejection confirmed. 185 TS green. Note: `engines/pi.ts` is in the shared-files set (carries the
+  guard); test `pi-capability-guard.test.ts` is new.
+- 2026-06-24: Landed Layers 1 + 3 + the playground (code-complete, reviewed). S2 (Claude
+  `.claude/settings.json`, reviewed), S3a (permission plumbing), S3b (relay + MCP-rule
+  enforcement, reviewed), S4 (playground form, reviewed) + S4b (fixed the `agenta_metadata`-strip
+  + authored-config-layer gap so per-tool permission persists end to end), S5 (HITL cross-turn
+  responder core; multi-turn round-trip + relay HITL deferred to live verification / S5.2). Full
+  suites green together: 293 SDK + 15 API + 10 services/oss + 177 TS, tsc + ruff clean. Corrected
+  two stale research claims (FE `permission_mode` round-trip did not exist; Pi restriction is
+  backend-dependent). Remaining: live QA (S6 checklist) + push the branch.
diff --git a/sdks/python/agenta/sdk/agents/adapters/claude_settings.py b/sdks/python/agenta/sdk/agents/adapters/claude_settings.py
new file mode 100644
index 0000000000..ca9bf842fa
--- /dev/null
+++ b/sdks/python/agenta/sdk/agents/adapters/claude_settings.py
@@ -0,0 +1,194 @@
+"""Layer 1 for Claude: render the harness's permission settings into a ``.claude/settings.json``.
+
+This is the claude adapter. It builds the full file CONTENT in Python (the translation used to
+live in the TS runner's ``claude-settings.ts``); the runner is now a dumb file-writer that drops
+whatever ``harnessFiles`` the adapter produced into the session cwd. The Claude ACP adapter reads
+``<cwd>/.claude/settings.json`` because it builds its SDK query with
+``settingSources: ["user", "project", "local"]`` (and applies ``permissions.defaultMode``); that
+file is the only clean Claude-config path because the sandbox-agent daemon strips ACP ``_meta``.
+
+Three rule sources merge here:
+ - the AUTHOR's options (Layer 1), read from the generic ``harness_options["claude"]["permissions"]``
+   slice: ``default_mode`` + per-tool ``allow``/``deny``/``ask`` strings. This is the only place the
+   claude-specific shape of that slice is known.
+ - rules DERIVED from ``sandbox_permission`` (Layer 2): baseline reinforcement of the sandbox
+   boundary as Claude-tool rules (block web tools when egress is off, block edits when the
+   filesystem is read-only/off). A safety floor, not the primary enforcement.
+ - rules DERIVED from per-MCP-server ``permission`` (Layer 3, S3b): each user MCP server with a
+   set permission becomes a whole-server ``mcp__<server>`` allow/ask/deny rule.
+
+Layer 3 enforcement is split by tool source: resolved tools (code / gateway-callback) run
+runner-side and are enforced at the relay, NOT here. Only the per-MCP-server permission lands in
+this file.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any, Dict, List, Optional, Sequence
+
+# Claude Code's four permission modes (its ``permissions.defaultMode``); any other authored value
+# is dropped.
+PERMISSION_MODES = frozenset({"default", "acceptEdits", "plan", "bypassPermissions"})
+
+# Where the rendered settings land, relative to the session cwd.
+SETTINGS_PATH = ".claude/settings.json"
+
+
+def _string_list(value: Any) -> List[str]:
+    """Keep only the string entries of an authored allow/deny/ask value; default to ``[]``."""
+    if not isinstance(value, list):
+        return []
+    return [item for item in value if isinstance(item, str)]
+
+
+def _parse_author_permissions(slice_: Any) -> Dict[str, Any]:
+    """Parse the untyped author block from ``harness_options["claude"]["permissions"]``.
+
+    ``default_mode`` (also accepted as ``defaultMode``) survives only when it is one of the four
+    valid modes; ``allow``/``deny``/``ask`` become string lists. This is where the claude-specific
+    knowledge of that slice lives. Returns ``{mode?, allow, deny, ask}`` (mode omitted when unset
+    or invalid).
+    """
+    if not isinstance(slice_, dict):
+        return {"allow": [], "deny": [], "ask": []}
+    out: Dict[str, Any] = {}
+    mode = slice_.get("default_mode", slice_.get("defaultMode"))
+    if isinstance(mode, str) and mode in PERMISSION_MODES:
+        out["mode"] = mode
+    out["allow"] = _string_list(slice_.get("allow"))
+    out["deny"] = _string_list(slice_.get("deny"))
+    out["ask"] = _string_list(slice_.get("ask"))
+    return out
+
+
+def _dedupe(values: Sequence[str]) -> List[str]:
+    """Dedupe in first-seen order, dropping falsy entries."""
+    seen: set[str] = set()
+    out: List[str] = []
+    for value in values:
+        if not value or value in seen:
+            continue
+        seen.add(value)
+        out.append(value)
+    return out
+
+
+def _rules_from_sandbox_permission(sandbox_permission: Any) -> Dict[str, List[str]]:
+    """Derive baseline Claude-tool rules from the Layer-2 sandbox boundary.
+
+    These reinforce the declared boundary at the harness level (the sandbox provider is the real
+    enforcement):
+     - network not fully ``on`` (off / allowlist) -> deny the web tools (``WebFetch``, ``WebSearch``);
+     - filesystem ``readonly`` or ``off`` -> deny the mutating file tools (``Write``, ``Edit``).
+
+    Accepts either a :class:`~agenta.sdk.agents.dtos.SandboxPermission` or a plain dict (network is
+    a nested object with a ``mode``; filesystem is a plain string).
+    """
+    deny: List[str] = []
+    if sandbox_permission is None:
+        return {"deny": deny}
+
+    network = _get(sandbox_permission, "network")
+    network_mode = _get(network, "mode") if network is not None else None
+    if network is not None and (network_mode or "on") != "on":
+        deny.extend(["WebFetch", "WebSearch"])
+
+    filesystem = _get(sandbox_permission, "filesystem")
+    if filesystem in ("readonly", "off"):
+        deny.extend(["Write", "Edit"])
+
+    return {"deny": deny}
+
+
+def _rules_from_mcp_permissions(mcp_servers: Any) -> Dict[str, List[str]]:
+    """Derive whole-server Claude rules from each MCP server's Layer-3 ``permission`` (S3b).
+
+    Claude addresses a whole MCP server as ``mcp__<serverName>`` (a per-tool rule is
+    ``mcp__<server>__<tool>``); the server name is the ``name`` carried to the runtime verbatim.
+    ``allow``/``ask``/``deny`` route to the matching list; a server with no permission contributes
+    nothing (falls back to the global policy). Accepts a list of
+    :class:`~agenta.sdk.agents.mcp.models.ResolvedMCPServer` or plain dicts.
+    """
+    allow: List[str] = []
+    ask: List[str] = []
+    deny: List[str] = []
+    for server in mcp_servers or []:
+        name = _get(server, "name")
+        permission = _get(server, "permission")
+        if not permission or not name:
+            continue
+        rule = f"mcp__{name}"
+        if permission == "allow":
+            allow.append(rule)
+        elif permission == "ask":
+            ask.append(rule)
+        elif permission == "deny":
+            deny.append(rule)
+    return {"allow": allow, "ask": ask, "deny": deny}
+
+
+def _get(obj: Any, key: str) -> Any:
+    """Read ``key`` off a pydantic model (attribute) or a plain dict (item)."""
+    if obj is None:
+        return None
+    if isinstance(obj, dict):
+        return obj.get(key)
+    return getattr(obj, key, None)
+
+
+def build_claude_settings_files(
+    harness_options: Optional[Dict[str, Any]],
+    sandbox_permission: Any = None,
+    mcp_servers: Any = None,
+) -> List[Dict[str, str]]:
+    """Build the Claude ``settings.json`` as a generic ``harnessFiles`` entry, or ``[]`` if none.
+
+    Reads the author's Layer-1 options from ``harness_options["claude"]["permissions"]``, merges
+    them with the Layer-2-derived rules (from ``sandbox_permission``) and the Layer-3-derived MCP
+    rules (from ``mcp_servers``), dedupes each list, and emits the smallest valid file:
+    ``permissions.defaultMode`` is set only when authored (and valid), and each allow/deny/ask list
+    appears only when non-empty. When there is nothing to write at all (no author options AND no
+    derived rules) it returns ``[]`` so the runner writes no file.
+
+    Returns ``[{"path": ".claude/settings.json", "content": <json str>}]`` or ``[]``.
+    """
+    author = _parse_author_permissions(_claude_permissions_slice(harness_options))
+
+    # Merge order: author rules first, then derived rules (Layer 2, then Layer 3). ``_dedupe``
+    # keeps first-seen order, so an author rule wins its position and derived rules append.
+    sandbox_rules = _rules_from_sandbox_permission(sandbox_permission)
+    mcp_rules = _rules_from_mcp_permissions(mcp_servers)
+
+    allow = _dedupe([*author["allow"], *mcp_rules.get("allow", [])])
+    deny = _dedupe(
+        [*author["deny"], *sandbox_rules.get("deny", []), *mcp_rules.get("deny", [])]
+    )
+    ask = _dedupe([*author["ask"], *mcp_rules.get("ask", [])])
+
+    permissions: Dict[str, Any] = {}
+    if "mode" in author:
+        permissions["defaultMode"] = author["mode"]
+    if allow:
+        permissions["allow"] = allow
+    if deny:
+        permissions["deny"] = deny
+    if ask:
+        permissions["ask"] = ask
+
+    # Nothing authored and nothing derived -> no file (the boundary-free Claude run is unchanged).
+    if not permissions:
+        return []
+
+    content = json.dumps({"permissions": permissions}, indent=2)
+    return [{"path": SETTINGS_PATH, "content": content}]
+
+
+def _claude_permissions_slice(harness_options: Optional[Dict[str, Any]]) -> Any:
+    """Pull the ``claude.permissions`` slice from the generic per-harness options map."""
+    if not isinstance(harness_options, dict):
+        return None
+    claude = harness_options.get("claude")
+    if not isinstance(claude, dict):
+        return None
+    return claude.get("permissions")
diff --git a/sdks/python/agenta/sdk/agents/mcp/models.py b/sdks/python/agenta/sdk/agents/mcp/models.py
index 37c3f6806b..f782dd8c0b 100644
--- a/sdks/python/agenta/sdk/agents/mcp/models.py
+++ b/sdks/python/agenta/sdk/agents/mcp/models.py
@@ -4,7 +4,14 @@
 
 from typing import Any, Dict, List, Literal, Optional
 
-from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic import AliasChoices, BaseModel, ConfigDict, Field, model_validator
+
+
+# Layer-3 per-server permission (same value set as a tool's): ``allow`` runs with
+# no prompt, ``ask`` raises a human-in-the-loop request, ``deny`` never runs. Absent means the
+# runner falls back to the global ``permissionPolicy`` default. An MCP server carries no
+# ``read_only`` hint, so there is no default to compute: an explicit author value or nothing.
+Permission = Literal["allow", "ask", "deny"]
 
 
 class MCPServerConfig(BaseModel):
@@ -18,6 +25,12 @@ class MCPServerConfig(BaseModel):
     url: Optional[str] = None
     secrets: Dict[str, str] = Field(default_factory=dict)
     tools: List[str] = Field(default_factory=list)
+    permission: Optional[Permission] = Field(
+        default=None,
+        validation_alias=AliasChoices(
+            "permission", "permission_mode", "permissionMode"
+        ),
+    )
 
     @model_validator(mode="after")
     def _validate_transport(self) -> "MCPServerConfig":
@@ -38,6 +51,7 @@ class ResolvedMCPServer(BaseModel):
     env: Dict[str, str] = Field(default_factory=dict, repr=False)
     url: Optional[str] = None
     tools: List[str] = Field(default_factory=list)
+    permission: Optional[Permission] = None
 
     @model_validator(mode="after")
     def _validate_transport(self) -> "ResolvedMCPServer":
@@ -62,4 +76,6 @@ def to_wire(self) -> Dict[str, Any]:
             wire["url"] = self.url
         if self.tools:
             wire["tools"] = list(self.tools)
+        if self.permission is not None:
+            wire["permission"] = self.permission
         return wire
diff --git a/sdks/python/agenta/sdk/agents/mcp/resolver.py b/sdks/python/agenta/sdk/agents/mcp/resolver.py
index 6ce78162dd..1593f9de8d 100644
--- a/sdks/python/agenta/sdk/agents/mcp/resolver.py
+++ b/sdks/python/agenta/sdk/agents/mcp/resolver.py
@@ -63,6 +63,7 @@ async def resolve(
                     env=env,
                     url=server_config.url,
                     tools=list(server_config.tools),
+                    permission=server_config.permission,
                 )
             )
         return resolved
diff --git a/sdks/python/agenta/sdk/agents/platform/gateway.py b/sdks/python/agenta/sdk/agents/platform/gateway.py
index 3fc9a41692..c297aee048 100644
--- a/sdks/python/agenta/sdk/agents/platform/gateway.py
+++ b/sdks/python/agenta/sdk/agents/platform/gateway.py
@@ -195,6 +195,8 @@ async def resolve(
                     call_ref=str(raw_spec["call_ref"]),
                     needs_approval=tool_config.needs_approval,
                     render=tool_config.render,
+                    permission=tool_config.permission,
+                    read_only=raw_spec.get("read_only"),
                 )
             )
 
diff --git a/sdks/python/agenta/sdk/agents/tools/compat.py b/sdks/python/agenta/sdk/agents/tools/compat.py
index d2ddf16b4b..033cafdc78 100644
--- a/sdks/python/agenta/sdk/agents/tools/compat.py
+++ b/sdks/python/agenta/sdk/agents/tools/compat.py
@@ -56,6 +56,12 @@ def _copy_tool_metadata(
         result["needs_approval"] = source["needs_approval"]
     if isinstance(source.get("render"), dict):
         result["render"] = dict(source["render"])
+    # Layer-3 permission: accept any of the keys the FE may write (the playground used
+    # ``permission_mode``); the config model's ``Permission`` field validates the value.
+    for key in ("permission", "permission_mode", "permissionMode"):
+        if key in source:
+            result["permission"] = source[key]
+            break
     return result
 
 
diff --git a/sdks/python/agenta/sdk/agents/tools/models.py b/sdks/python/agenta/sdk/agents/tools/models.py
index 6e467f51dd..ecf2a5416a 100644
--- a/sdks/python/agenta/sdk/agents/tools/models.py
+++ b/sdks/python/agenta/sdk/agents/tools/models.py
@@ -19,13 +19,30 @@ def _empty_object_schema() -> Dict[str, Any]:
     return {"type": "object", "properties": {}}
 
 
+# Layer-3 per-tool permission: ``allow`` runs with no prompt, ``ask`` raises a
+# human-in-the-loop request, ``deny`` never runs. Absent means "fall back to the global
+# ``permissionPolicy`` default" (the runner resolves that, in a later slice).
+Permission = Literal["allow", "ask", "deny"]
+
+
 class ToolConfigBase(BaseModel):
     """Fields shared by every persisted tool declaration."""
 
-    model_config = ConfigDict(extra="forbid")
+    model_config = ConfigDict(extra="forbid", populate_by_name=True)
 
     needs_approval: bool = False
     render: Optional[Dict[str, Any]] = None
+    # Layer-3 permission the author set on this tool. Mirrors
+    # ``ToolSpecBase.permission``: accepts ``permission_mode``/``permissionMode`` too (the keys
+    # the playground writes), so an FE-set value deserializes onto the ``extra="forbid"`` config
+    # without a breaking change. ``_apply_tool_metadata`` then carries it onto the resolved spec.
+    permission: Optional[Permission] = Field(
+        default=None,
+        validation_alias=AliasChoices(
+            "permission", "permission_mode", "permissionMode"
+        ),
+        serialization_alias="permission",
+    )
 
 
 class BuiltinToolConfig(ToolConfigBase):
@@ -114,6 +131,43 @@ class ToolSpecBase(BaseModel):
         serialization_alias="needsApproval",
     )
     render: Optional[Dict[str, Any]] = None
+    read_only: Optional[bool] = Field(
+        default=None,
+        validation_alias=AliasChoices("read_only", "readOnly"),
+        serialization_alias="readOnly",
+    )
+    # Layer-3 permission the author set on this tool. Accepts ``permission_mode``
+    # too, the key the playground writes into ``agenta_metadata``, so an FE-set value
+    # deserializes without a later breaking change. Unset means "no explicit author choice";
+    # ``effective_permission`` then derives a default from ``read_only`` / ``needs_approval``.
+    permission: Optional[Permission] = Field(
+        default=None,
+        validation_alias=AliasChoices(
+            "permission", "permission_mode", "permissionMode"
+        ),
+        serialization_alias="permission",
+    )
+
+    def effective_permission(self) -> Optional[Permission]:
+        """Resolve the permission that rides the wire, by this precedence:
+
+        1. An explicit author ``permission`` wins outright.
+        2. Else, when ``needs_approval`` is set, the default is ``"ask"`` (approval beats the
+           read-only auto-allow: an author who asked to be prompted still gets prompted).
+        3. Else, default from ``read_only``: ``True`` -> ``"allow"`` (read-only tools are safe to
+           auto-run), ``False`` -> ``"ask"`` (mutating tools prompt).
+        4. Else (``read_only`` is ``None`` and nothing explicit) -> ``None`` (unset), so the runner
+           falls back to the global ``permissionPolicy`` default in a later slice.
+        """
+        if self.permission is not None:
+            return self.permission
+        if self.needs_approval:
+            return "ask"
+        if self.read_only is True:
+            return "allow"
+        if self.read_only is False:
+            return "ask"
+        return None
 
     def to_wire(self) -> Dict[str, Any]:
         wire = self.model_dump(
@@ -125,6 +179,11 @@ def to_wire(self) -> Dict[str, Any]:
             wire.pop("needsApproval", None)
         if not wire.get("env"):
             wire.pop("env", None)
+        permission = self.effective_permission()
+        if permission is not None:
+            wire["permission"] = permission
+        else:
+            wire.pop("permission", None)
         return wire
 
 
diff --git a/sdks/python/agenta/sdk/agents/tools/resolver.py b/sdks/python/agenta/sdk/agents/tools/resolver.py
index 54f4c8b03f..306b9d0134 100644
--- a/sdks/python/agenta/sdk/agents/tools/resolver.py
+++ b/sdks/python/agenta/sdk/agents/tools/resolver.py
@@ -40,6 +40,7 @@ def _apply_tool_metadata(tool_spec: ToolSpec, tool_config: ToolConfig) -> ToolSp
         update={
             "needs_approval": tool_config.needs_approval,
             "render": tool_config.render,
+            "permission": tool_config.permission,
         }
     )
 
diff --git a/sdks/python/oss/tests/pytest/unit/agents/adapters/__init__.py b/sdks/python/oss/tests/pytest/unit/agents/adapters/__init__.py
new file mode 100644
index 0000000000..0597a675ca
--- /dev/null
+++ b/sdks/python/oss/tests/pytest/unit/agents/adapters/__init__.py
@@ -0,0 +1 @@
+# Unit tests for the agent runtime adapters.
diff --git a/sdks/python/oss/tests/pytest/unit/agents/adapters/test_claude_settings.py b/sdks/python/oss/tests/pytest/unit/agents/adapters/test_claude_settings.py
new file mode 100644
index 0000000000..7b5c26a7b3
--- /dev/null
+++ b/sdks/python/oss/tests/pytest/unit/agents/adapters/test_claude_settings.py
@@ -0,0 +1,176 @@
+"""The Python claude adapter: ``build_claude_settings_files`` (Layer 1 + Layer 2/3 derivation).
+
+This is the translation that used to live in the TS runner's ``claude-settings.ts``. It merges
+three rule sources into one ``.claude/settings.json``: the author's
+``harness_options["claude"]["permissions"]`` slice, the Layer-2 sandbox-boundary derivation, and
+the per-MCP-server Layer-3 permissions. The runner is now a dumb writer of the rendered
+``harnessFiles`` entry this produces.
+"""
+
+from __future__ import annotations
+
+import json
+
+from agenta.sdk.agents.adapters.claude_settings import build_claude_settings_files
+from agenta.sdk.agents.dtos import SandboxPermission
+from agenta.sdk.agents.mcp import ResolvedMCPServer
+
+
+def _settings(files):
+    """Unwrap the single rendered file and parse its JSON content."""
+    assert len(files) == 1
+    assert files[0]["path"] == ".claude/settings.json"
+    return json.loads(files[0]["content"])
+
+
+def _claude(permissions):
+    return {"claude": {"permissions": permissions}}
+
+
+def test_renders_author_mode_and_rules():
+    files = build_claude_settings_files(
+        _claude(
+            {
+                "default_mode": "acceptEdits",
+                "allow": ["Read", "Bash(npm run:*)"],
+                "deny": ["Write"],
+                "ask": ["mcp__github__create_issue"],
+            }
+        )
+    )
+    assert _settings(files) == {
+        "permissions": {
+            "defaultMode": "acceptEdits",
+            "allow": ["Read", "Bash(npm run:*)"],
+            "deny": ["Write"],
+            "ask": ["mcp__github__create_issue"],
+        }
+    }
+
+
+def test_content_is_json_dumps_indent_2():
+    # The exact serialization matters (it is pinned by the golden); assert the indent-2 form.
+    files = build_claude_settings_files(_claude({"deny": ["Write"]}))
+    assert files[0]["content"] == json.dumps(
+        {"permissions": {"deny": ["Write"]}}, indent=2
+    )
+
+
+def test_network_off_denies_web_tools():
+    files = build_claude_settings_files(
+        None, SandboxPermission(network={"mode": "off"})
+    )
+    assert _settings(files)["permissions"]["deny"] == ["WebFetch", "WebSearch"]
+
+
+def test_network_allowlist_denies_web_tools():
+    files = build_claude_settings_files(
+        None,
+        SandboxPermission(network={"mode": "allowlist", "allowlist": ["10.0.0.0/8"]}),
+    )
+    assert _settings(files)["permissions"]["deny"] == ["WebFetch", "WebSearch"]
+
+
+def test_filesystem_readonly_denies_write_edit():
+    files = build_claude_settings_files(None, SandboxPermission(filesystem="readonly"))
+    assert _settings(files)["permissions"]["deny"] == ["Write", "Edit"]
+
+
+def test_filesystem_off_denies_write_edit():
+    files = build_claude_settings_files(None, SandboxPermission(filesystem="off"))
+    assert _settings(files)["permissions"]["deny"] == ["Write", "Edit"]
+
+
+def test_mcp_permission_deny_renders_server_rule():
+    server = ResolvedMCPServer(
+        name="github", transport="http", url="https://x", permission="deny"
+    )
+    files = build_claude_settings_files(None, None, [server])
+    perms = _settings(files)["permissions"]
+    assert perms["deny"] == ["mcp__github"]
+    assert "allow" not in perms
+    assert "ask" not in perms
+
+
+def test_mcp_permissions_route_to_their_lists_and_skip_unset():
+    servers = [
+        ResolvedMCPServer(
+            name="filesystem", transport="http", url="https://x", permission="allow"
+        ),
+        ResolvedMCPServer(
+            name="github", transport="http", url="https://x", permission="ask"
+        ),
+        ResolvedMCPServer(
+            name="shell", transport="http", url="https://x", permission="deny"
+        ),
+        ResolvedMCPServer(name="unset", transport="http", url="https://x"),
+    ]
+    perms = _settings(build_claude_settings_files(None, None, servers))["permissions"]
+    assert perms["allow"] == ["mcp__filesystem"]
+    assert perms["ask"] == ["mcp__github"]
+    assert perms["deny"] == ["mcp__shell"]
+
+
+def test_merges_author_with_derived_and_dedupes():
+    # Author `WebFetch` keeps its position; the network-derived `WebFetch` is deduped, and the
+    # filesystem-derived `Write`/`Edit` append.
+    files = build_claude_settings_files(
+        _claude({"default_mode": "plan", "deny": ["WebFetch"]}),
+        SandboxPermission(
+            network={"mode": "allowlist", "allowlist": ["10.0.0.0/8"]},
+            filesystem="readonly",
+        ),
+    )
+    perms = _settings(files)["permissions"]
+    assert perms["deny"] == ["WebFetch", "WebSearch", "Write", "Edit"]
+    assert perms["defaultMode"] == "plan"
+
+
+def test_invalid_default_mode_dropped():
+    files = build_claude_settings_files(
+        _claude({"default_mode": "yolo", "deny": ["Write"]})
+    )
+    perms = _settings(files)["permissions"]
+    assert "defaultMode" not in perms
+    assert perms["deny"] == ["Write"]
+
+
+def test_accepts_camelcase_default_mode_alias():
+    files = build_claude_settings_files(_claude({"defaultMode": "plan"}))
+    assert _settings(files)["permissions"]["defaultMode"] == "plan"
+
+
+def test_accepts_plain_dicts_for_sandbox_and_mcp():
+    # The builder duck-types its inputs, so plain dicts (not pydantic models) work too.
+    files = build_claude_settings_files(
+        None,
+        {"network": {"mode": "off"}},
+        [{"name": "github", "permission": "deny"}],
+    )
+    perms = _settings(files)["permissions"]
+    assert perms["deny"] == ["WebFetch", "WebSearch", "mcp__github"]
+
+
+def test_empty_inputs_render_nothing():
+    assert build_claude_settings_files(None) == []
+    assert build_claude_settings_files({}) == []
+    assert build_claude_settings_files(_claude({})) == []
+    # network `on` + filesystem `on` derive nothing.
+    assert (
+        build_claude_settings_files(
+            None, SandboxPermission(network={"mode": "on"}, filesystem="on")
+        )
+        == []
+    )
+    # an MCP server with no permission contributes nothing.
+    assert (
+        build_claude_settings_files(
+            None, None, [ResolvedMCPServer(name="x", transport="http", url="https://x")]
+        )
+        == []
+    )
+
+
+def test_non_claude_slice_renders_nothing():
+    # Only the `claude` slice is the claude adapter's concern; a `pi` slice contributes nothing.
+    assert build_claude_settings_files({"pi": {"system": "You are Pi."}}) == []
diff --git a/sdks/python/oss/tests/pytest/unit/agents/mcp/test_resolver.py b/sdks/python/oss/tests/pytest/unit/agents/mcp/test_resolver.py
index a8a97ab6f0..983571f379 100644
--- a/sdks/python/oss/tests/pytest/unit/agents/mcp/test_resolver.py
+++ b/sdks/python/oss/tests/pytest/unit/agents/mcp/test_resolver.py
@@ -60,6 +60,32 @@ async def test_missing_mcp_secret_is_explicit():
         )
 
 
+async def test_permission_rides_the_wire_when_set():
+    # An author's per-server permission is carried onto the resolved server and serialized.
+    servers = await MCPResolver(secret_provider=DictSecretProvider({})).resolve(
+        [MCPServerConfig(name="github", command="npx", permission="ask")]
+    )
+    assert servers[0].permission == "ask"
+    assert servers[0].to_wire()["permission"] == "ask"
+
+
+async def test_permission_absent_from_wire_when_unset():
+    # No permission declared -> no `permission` key (a server has no read_only to default from).
+    servers = await MCPResolver(secret_provider=DictSecretProvider({})).resolve(
+        [MCPServerConfig(name="github", command="npx")]
+    )
+    assert servers[0].permission is None
+    assert "permission" not in servers[0].to_wire()
+
+
+def test_permission_accepts_fe_permission_mode_alias():
+    # The playground writes `permission_mode`; the server config deserializes it.
+    config = MCPServerConfig.model_validate(
+        {"name": "github", "command": "npx", "permission_mode": "deny"}
+    )
+    assert config.permission == "deny"
+
+
 async def test_mcp_compatibility_policy_can_omit_missing_secret():
     servers = await MCPResolver(
         secret_provider=DictSecretProvider({}),
diff --git a/sdks/python/oss/tests/pytest/unit/agents/platform/test_gateway_http.py b/sdks/python/oss/tests/pytest/unit/agents/platform/test_gateway_http.py
index d41407a07d..d44008aa57 100644
--- a/sdks/python/oss/tests/pytest/unit/agents/platform/test_gateway_http.py
+++ b/sdks/python/oss/tests/pytest/unit/agents/platform/test_gateway_http.py
@@ -42,6 +42,7 @@ async def test_gateway_metadata_and_description_fallback_are_preserved(
                     "description": None,
                     "input_schema": {"type": "object"},
                     "call_ref": "tools.composio.github.GET_USER.c1",
+                    "read_only": True,
                 }
             ]
         },
@@ -58,7 +59,12 @@ async def test_gateway_metadata_and_description_fallback_are_preserved(
     assert spec.description == "get_user"  # falls back to name when null
     assert spec.needs_approval is True
     assert spec.render == {"kind": "component", "component": "User"}
+    assert spec.read_only is True  # the catalog read-only hint reaches the spec
     assert spec.to_wire()["needsApproval"] is True
+    assert spec.to_wire()["readOnly"] is True
+    # needs_approval beats the read-only auto-allow: the gateway resolver's effective
+    # permission is "ask" (pins the real precedence end to end, not just the model helper).
+    assert spec.to_wire()["permission"] == "ask"
     assert isinstance(resolved.tool_callback, ToolCallback)
     assert resolved.tool_callback.endpoint == "https://api.x/api/tools/call"
     assert resolved.tool_callback.authorization == "Access tok"
diff --git a/sdks/python/oss/tests/pytest/unit/agents/tools/test_models.py b/sdks/python/oss/tests/pytest/unit/agents/tools/test_models.py
index f823b4f32c..ace0549665 100644
--- a/sdks/python/oss/tests/pytest/unit/agents/tools/test_models.py
+++ b/sdks/python/oss/tests/pytest/unit/agents/tools/test_models.py
@@ -40,6 +40,8 @@ def test_code_spec_serializes_only_runner_fields():
         "env": {"TOKEN": "secret"},
         "needsApproval": True,
         "render": {"kind": "component", "component": "Calculator"},
+        # needs_approval with no explicit permission -> derived `ask`.
+        "permission": "ask",
     }
 
 
@@ -61,3 +63,62 @@ def test_secret_values_are_hidden_from_repr():
         env={"TOKEN": "do-not-print"},
     )
     assert "do-not-print" not in repr(spec)
+
+
+# --- Layer-3 permission default ladder (S3a) -----------------------------------------
+
+
+def _spec(**kwargs):
+    return CallbackToolSpec(
+        name="t",
+        description="t",
+        call_ref="tools.composio.x.Y.c1",
+        **kwargs,
+    )
+
+
+def test_permission_explicit_author_value_wins():
+    # An explicit author permission wins over any read_only/needs_approval default.
+    spec = _spec(read_only=False, permission="allow")
+    assert spec.effective_permission() == "allow"
+    assert spec.to_wire()["permission"] == "allow"
+
+
+def test_permission_default_from_read_only_true_is_allow():
+    spec = _spec(read_only=True)
+    assert spec.effective_permission() == "allow"
+    assert spec.to_wire()["permission"] == "allow"
+
+
+def test_permission_default_from_read_only_false_is_ask():
+    spec = _spec(read_only=False)
+    assert spec.effective_permission() == "ask"
+    assert spec.to_wire()["permission"] == "ask"
+
+
+def test_permission_needs_approval_beats_read_only_auto_allow():
+    # needs_approval (no explicit permission) forces `ask` even when read_only would allow.
+    spec = _spec(read_only=True, needs_approval=True)
+    assert spec.effective_permission() == "ask"
+    assert spec.to_wire()["permission"] == "ask"
+
+
+def test_permission_absent_when_all_unset():
+    # read_only is None and nothing explicit -> no permission on the wire (runner falls back).
+    spec = _spec()
+    assert spec.effective_permission() is None
+    assert "permission" not in spec.to_wire()
+
+
+def test_permission_accepts_fe_permission_mode_alias():
+    # The playground writes `permission_mode` into agenta_metadata; the spec deserializes it.
+    spec = CallbackToolSpec.model_validate(
+        {
+            "name": "t",
+            "description": "t",
+            "callRef": "tools.composio.x.Y.c1",
+            "permission_mode": "deny",
+        }
+    )
+    assert spec.permission == "deny"
+    assert spec.to_wire()["permission"] == "deny"
diff --git a/sdks/python/oss/tests/pytest/unit/agents/tools/test_parsing.py b/sdks/python/oss/tests/pytest/unit/agents/tools/test_parsing.py
index 2f707a7ab5..aa7b9c18bf 100644
--- a/sdks/python/oss/tests/pytest/unit/agents/tools/test_parsing.py
+++ b/sdks/python/oss/tests/pytest/unit/agents/tools/test_parsing.py
@@ -65,6 +65,60 @@ def test_compat_parser_does_not_flip_string_false_needs_approval():
     assert approved.needs_approval is True
 
 
+def test_compat_parser_carries_top_level_permission_on_typed_config():
+    # A canonical typed config dict carries the author's Layer-3 permission end to end.
+    gateway = coerce_tool_config(
+        {
+            "type": "gateway",
+            "integration": "github",
+            "action": "GET_USER",
+            "connection": "c1",
+            "permission": "deny",
+        }
+    )
+    assert isinstance(gateway, GatewayToolConfig)
+    assert gateway.permission == "deny"
+
+
+def test_compat_parser_carries_permission_from_gateway_slug():
+    # The playground writes a gateway slug + a sibling permission; both survive coercion.
+    gateway = coerce_tool_config(
+        {
+            "function": {"name": "tools.composio.github.GET_USER.c1"},
+            "permission": "deny",
+        }
+    )
+    assert isinstance(gateway, GatewayToolConfig)
+    assert gateway.permission == "deny"
+
+
+def test_compat_parser_accepts_permission_mode_alias_for_permission():
+    # The legacy FE key `permission_mode` deserializes to the same `permission` field.
+    gateway = coerce_tool_config(
+        {
+            "type": "gateway",
+            "integration": "github",
+            "action": "GET_USER",
+            "connection": "c1",
+            "permission_mode": "deny",
+        }
+    )
+    assert gateway.permission == "deny"
+
+
+def test_compat_parser_omits_permission_when_absent():
+    # Backward compatible: a tool with no permission leaves the field unset.
+    gateway = coerce_tool_config(
+        {
+            "type": "gateway",
+            "integration": "github",
+            "action": "GET_USER",
+            "connection": "c1",
+        }
+    )
+    assert gateway.permission is None
+
+
 def test_coerce_tool_configs_rejects_invalid_on_error():
     with pytest.raises(ValueError):
         coerce_tool_configs(["read"], on_error="bogus")  # type: ignore[arg-type]
diff --git a/services/agent/src/engines/sandbox_agent/provider.ts b/services/agent/src/engines/sandbox_agent/provider.ts
index 6ab03dd052..13fd392fa5 100644
--- a/services/agent/src/engines/sandbox_agent/provider.ts
+++ b/services/agent/src/engines/sandbox_agent/provider.ts
@@ -1,15 +1,44 @@
 import { local } from "sandbox-agent/local";
 import { daytona } from "sandbox-agent/daytona";
 
+import type { SandboxPermission } from "../../protocol.ts";
 import { applyDaytonaClientEnv, daytonaEnvVars } from "./daytona.ts";
 
+/**
+ * Translate the Layer 2 network policy into Daytona create fields. Daytona enforces egress
+ * at the sandbox boundary: `networkBlockAll` blocks all outbound, `networkAllowList` is a
+ * COMMA-SEPARATED CIDR string (not an array). `mode: "on"` (or no policy) leaves both unset
+ * so the sandbox stays default-open. The create object is cast `as any` at the call site, so
+ * these pass through even though the daytona wrapper's create type does not surface them.
+ *
+ * `mode: "allowlist"` with an EMPTY list maps to `networkBlockAll` (block-all), not default-open:
+ * "allow these zero ranges" is faithfully read as "allow nothing", and it keeps this mapping
+ * consistent with `buildRunPlan`, which already treats any `mode !== "on"` as a restricted
+ * boundary. Leaving it unset would silently grant full egress — the opposite of the author's
+ * intent — so an empty allowlist locks down rather than opens up.
+ */
+export function daytonaNetworkFields(
+  sandboxPermission: SandboxPermission | undefined,
+): { networkBlockAll: true } | { networkAllowList: string } | {} {
+  const network = sandboxPermission?.network;
+  if (network?.mode === "off") return { networkBlockAll: true };
+  if (network?.mode === "allowlist") {
+    const allowlist = network.allowlist ?? [];
+    if (allowlist.length > 0) return { networkAllowList: allowlist.join(",") };
+    return { networkBlockAll: true };
+  }
+  return {};
+}
+
 /**
  * Build the sandbox-agent provider for the requested axis.
  *
  * Daytona needs an image or snapshot that carries the daemon and harness CLI. The
  * code-evaluator `DAYTONA_SNAPSHOT` is intentionally not reused because it has no daemon.
  * Provider keys come from the request secrets. Pi's self-managed login is only uploaded
- * when no key is available.
+ * when no key is available. The Layer 2 network policy (S1b) is enforced on Daytona via
+ * `networkBlockAll` / `networkAllowList`; `buildRunPlan` rejects restricted policies the
+ * local provider cannot enforce before this is reached.
  */
 export function buildSandboxProvider(
   sandboxId: string,
@@ -17,6 +46,7 @@ export function buildSandboxProvider(
   binaryPath: string | undefined,
   piExtEnv: Record<string, string>,
   secrets: Record<string, string>,
+  sandboxPermission?: SandboxPermission,
 ) {
   if (sandboxId === "daytona") {
     applyDaytonaClientEnv();
@@ -31,6 +61,7 @@ export function buildSandboxProvider(
         // suppresses that so the snapshot is used as-is.
         ...(snapshot ? { snapshot, image: undefined } : {}),
         ...(target ? { target } : {}),
+        ...daytonaNetworkFields(sandboxPermission),
         envVars: daytonaEnvVars(piExtEnv, secrets),
         ephemeral: true,
       } as any,
diff --git a/services/agent/src/responder.ts b/services/agent/src/responder.ts
index 63348764e7..ea2fe92e0f 100644
--- a/services/agent/src/responder.ts
+++ b/services/agent/src/responder.ts
@@ -9,14 +9,18 @@
  *
  *   - `PolicyResponder` is the headless answer (a fixed `auto` / `deny` policy, no human).
  *     It reproduces the previous behavior exactly and is what `/invoke` uses.
- *   - A cross-turn responder (the `/messages` HITL path) slots in here later: it surfaces the
- *     request to the browser, ends the turn, and resolves on the next turn's reply. The
- *     harness adapter does not change when the responder does.
+ *   - `HITLResponder` is the cross-turn responder (the `/messages` HITL path): it parks an
+ *     un-decided permission (the `interaction_request` already went to the browser), ends the
+ *     turn, and resolves on the next turn's stored reply. With no decision and no human
+ *     surface it falls back to the base policy, so the headless path is unchanged. The harness
+ *     adapter does not change when the responder does.
  *
  * Resolution is modeled as `allow` / `deny`; the adapter maps that onto the harness's
  * available ACP replies via `decisionToReply`.
  */
 
+import type { AgentRunRequest, ContentBlock } from "./protocol.ts";
+
 export type PermissionPolicy = "auto" | "deny";
 
 export type PermissionDecision = "allow" | "deny";
@@ -49,13 +53,121 @@ export class PolicyResponder implements Responder {
   }
 }
 
+/**
+ * A lookup of approval decisions the user already made on a prior turn, keyed by the
+ * identifier carried on the permission request. Both the tool-call id and the tool name are
+ * indexed because a cold-replayed run mints fresh ACP permission/tool-call ids each turn, so
+ * the stable cross-turn anchor is whichever the harness re-presents. `toolCallId` is the
+ * precise match; `toolName` is the fallback when the id was not preserved across the boundary.
+ */
+export type ApprovalDecisions = ReadonlyMap<string, PermissionDecision>;
+
+/**
+ * The cross-turn human-in-the-loop responder for the `/messages` path.
+ *
+ * It answers a permission gate three ways, in order:
+ *   1. The user already decided (a stored `decisions` entry for this tool-call id or tool
+ *      name) -> apply it. THIS IS THE RESUME PATH: turn N parks, turn N+1 carries the reply.
+ *   2. No stored decision and there is a human surface (`hasHumanSurface`) -> `deny` to PARK.
+ *      The `interaction_request` was already emitted upstream (the FE prompts), so denying
+ *      here just declines to run the unapproved tool this turn; the turn ends safely and a
+ *      later turn carrying the decision resolves it via branch 1.
+ *   3. No stored decision and no human surface (headless `/invoke`) -> the `basePolicy`
+ *      decision. This branch is byte-identical to `PolicyResponder`, so `/invoke` is unchanged.
+ *
+ * Pure: every input (decisions, base policy, surface flag) is injected; no I/O.
+ */
+export class HITLResponder implements Responder {
+  constructor(
+    private readonly decisions: ApprovalDecisions,
+    private readonly basePolicy: PermissionPolicy,
+    private readonly hasHumanSurface: boolean,
+  ) {}
+
+  async onPermission(request: PermissionRequest): Promise<PermissionDecision> {
+    const stored = this.lookup(request);
+    if (stored) return stored;
+    if (this.hasHumanSurface) return "deny"; // park: do not run the unapproved tool this turn
+    return this.basePolicy === "deny" ? "deny" : "allow"; // headless: PolicyResponder parity
+  }
+
+  private lookup(request: PermissionRequest): PermissionDecision | undefined {
+    for (const key of permissionRequestKeys(request)) {
+      const decision = this.decisions.get(key);
+      if (decision) return decision;
+    }
+    return undefined;
+  }
+}
+
+/** The identifiers a stored decision may be keyed by: the gated tool-call id, then its name. */
+function permissionRequestKeys(request: PermissionRequest): string[] {
+  const keys: string[] = [];
+  const raw = request.raw as
+    | { toolCall?: { toolCallId?: unknown; name?: unknown } }
+    | undefined;
+  const toolCallId = raw?.toolCall?.toolCallId;
+  if (typeof toolCallId === "string" && toolCallId) keys.push(toolCallId);
+  const name = raw?.toolCall?.name;
+  if (typeof name === "string" && name) keys.push(name);
+  return keys;
+}
+
+/**
+ * Build the approval-decision lookup from the inbound run request's message history.
+ *
+ * The signal is the converted approval reply that the Vercel adapter
+ * (`_approval_response_blocks`) already produces: a `tool_result` content block keyed by
+ * `toolCallId` whose `output` is an `{ approved: boolean }` envelope. That envelope shape is
+ * unique to an approval response (an ordinary tool result carries the tool's real output, not
+ * an `approved` flag), so no new wire carrier is needed. We index each decision by its
+ * `toolCallId` and, when the block also names a tool, by `toolName` (the cross-turn fallback
+ * when a cold replay did not preserve the id).
+ */
+export function extractApprovalDecisions(
+  request: AgentRunRequest,
+): Map<string, PermissionDecision> {
+  const decisions = new Map<string, PermissionDecision>();
+  for (const message of request.messages ?? []) {
+    const content = message?.content;
+    if (!Array.isArray(content)) continue;
+    for (const block of content) {
+      const decision = approvalDecisionOf(block);
+      if (!decision) continue;
+      if (block.toolCallId) decisions.set(block.toolCallId, decision);
+      if (block.toolName) decisions.set(block.toolName, decision);
+    }
+  }
+  return decisions;
+}
+
+/** A `tool_result` block whose `output` is an `{ approved: boolean }` envelope -> a decision. */
+function approvalDecisionOf(
+  block: ContentBlock,
+): PermissionDecision | undefined {
+  if (!block || block.type !== "tool_result") return undefined;
+  const output = block.output;
+  if (
+    output &&
+    typeof output === "object" &&
+    !Array.isArray(output) &&
+    typeof (output as { approved?: unknown }).approved === "boolean"
+  ) {
+    return (output as { approved: boolean }).approved ? "allow" : "deny";
+  }
+  return undefined;
+}
+
 /**
  * Resolve the permission policy with the same precedence as before: an explicit per-run
  * `permissionPolicy: "deny"` or the `SANDBOX_AGENT_DENY_PERMISSIONS` env flips to deny; the
  * default is auto-allow, because backend-resolved tools are trusted and the run is headless.
  */
 export function policyFromRequest(permissionPolicy?: string): PermissionPolicy {
-  if (permissionPolicy === "deny" || process.env.SANDBOX_AGENT_DENY_PERMISSIONS === "true") {
+  if (
+    permissionPolicy === "deny" ||
+    process.env.SANDBOX_AGENT_DENY_PERMISSIONS === "true"
+  ) {
     return "deny";
   }
   return "auto";
diff --git a/services/agent/src/tools/code.ts b/services/agent/src/tools/code.ts
index d4d8db92b8..14544b0cfd 100644
--- a/services/agent/src/tools/code.ts
+++ b/services/agent/src/tools/code.ts
@@ -1,197 +1,27 @@
 /**
- * Code-tool executor: run a resolved `code` tool's snippet in the agent sandbox.
+ * Code-tool sidecar execution gate.
  *
- * A code tool ships a snippet (`code`) + a runtime (`python` | `node`) + a scoped `env` (the
- * tool's declared vault secrets, resolved server-side). Unlike a `callback` tool, it never
- * touches Agenta's /tools/call — it runs locally where the harness runs, which is exactly why
- * its secrets are injected here as subprocess env (and nowhere else).
- *
- * Entry convention (same for both runtimes): the snippet defines a top-level `main`. A bare
- * `def main(**inputs)` / `function main(inputs)` is found automatically; an explicit export
- * (`module.exports.main` / `exports.main` / `module.exports = fn` in Node) is also accepted.
- * Python calls `main(**inputs)` (keyword args from the tool input object); Node calls
- * `main(inputs)` (the input object) and may return a promise. The return value is
- * JSON-serialized and handed to the model as the tool result.
- *
- * Shared by every delivery path that runs code locally: engines/pi.ts (in-process Pi),
- * extensions/agenta.ts (Pi under sandbox-agent), tools/mcp-server.ts (the MCP bridge for other
- * harnesses).
+ * The code-tool interface still exists and code tools are still advertised to harnesses. The
+ * sidecar no longer executes author-supplied snippets locally, though: every delivery path
+ * funnels a `kind: "code"` call through this function, so throwing here makes direct Pi,
+ * sandbox Pi, and the ACP/MCP bridge fail consistently without changing the public wire shape.
  */
-import { spawn } from "node:child_process";
-import { mkdtempSync, rmSync, writeFileSync } from "node:fs";
-import { tmpdir } from "node:os";
-import { join } from "node:path";
-
-/** Per-call budget for a code tool. Surfaced as a tool error on timeout. */
-export const CODE_TOOL_TIMEOUT_MS = Number(
-  process.env.AGENTA_AGENT_CODE_TOOL_TIMEOUT_MS ?? 30000,
-);
-
-// argv[1] is the snippet path (python `-c`/node `-e` put the first extra arg at argv[1]);
-// the tool input arrives as JSON on stdin. Both bootstraps evaluate the snippet in a fresh
-// scope and pick up a top-level `main` (a bare `def main`/`function main`), falling back to an
-// explicit export. Either way the contract is: define a callable `main`.
-const PY_BOOTSTRAP = `import sys, json
-_path = sys.argv[1]
-with open(_path) as _f:
-    _src = _f.read()
-_ns = {}
-exec(compile(_src, _path, "exec"), _ns)
-if not callable(_ns.get("main")):
-    sys.stderr.write("code tool must define a callable main(**inputs)")
-    sys.exit(1)
-_args = json.loads(sys.stdin.read() or "{}")
-_out = _ns["main"](**_args)
-sys.stdout.write(json.dumps(_out))
-`;
-
-// `require(path)` would only see CommonJS exports, so a bare top-level `function main` (which
-// exports nothing under CommonJS) would be invisible. Instead read the source and evaluate it
-// in a scope that captures a top-level `main`, while still honoring an explicit
-// `module.exports.main` / `exports.main` / `module.exports = fn`.
-const NODE_BOOTSTRAP = `const fs = require("fs");
-const path = process.argv[1];
-const src = fs.readFileSync(path, "utf8");
-const mod = { exports: {} };
-const factory = new Function(
-  "exports",
-  "require",
-  "module",
-  "__filename",
-  "__dirname",
-  src +
-    "\\n;return (typeof main !== 'undefined' ? main : (module.exports && (module.exports.main || module.exports.default)) || module.exports);",
-);
-const fn = factory(mod.exports, require, mod, path, require("path").dirname(path));
-if (typeof fn !== "function") {
-  process.stderr.write("code tool must define or export a callable main(inputs) function");
-  process.exit(1);
-}
-const args = JSON.parse(fs.readFileSync(0, "utf8") || "{}");
-Promise.resolve(fn(args))
-  .then((out) => process.stdout.write(JSON.stringify(out === undefined ? null : out)))
-  .catch((err) => { process.stderr.write(String((err && err.stack) || err)); process.exit(1); });
-`;
 
 export type CodeRuntime = "python" | "node";
 
-// The minimal set of host env vars a python3/node runtime needs to start. Deliberately
-// excludes everything secret-bearing or sidecar-specific: no AGENTA_*, no *_API_KEY /
-// *_TOKEN, no COMPOSIO_* / DAYTONA_*, no provider keys that the in-process Pi path writes
-// into process.env before a run. Only the tool's declared scoped `env` is layered on top.
-const BASE_ENV_ALLOWLIST = [
-  "PATH",
-  "HOME",
-  "LANG",
-  "LC_ALL",
-  "LC_CTYPE",
-  "TZ",
-  "TMPDIR",
-  "TEMP",
-  "TMP",
-  "NODE_PATH",
-  // Windows essentials, copied only when present.
-  "SystemRoot",
-  "ComSpec",
-];
-
-/** Build the child env from a minimal allowlist (copied only when set) plus scoped secrets. */
-function buildChildEnv(
-  env: Record<string, string> | undefined,
-): Record<string, string> {
-  const base: Record<string, string> = {};
-  for (const key of BASE_ENV_ALLOWLIST) {
-    const value = process.env[key];
-    if (value !== undefined) base[key] = value;
-  }
-  return { ...base, ...(env ?? {}) };
-}
+export const CODE_TOOL_UNSUPPORTED_MESSAGE =
+  "Code tools are not supported by the sidecar.";
 
 /**
- * Run a code tool's snippet and return its JSON-serialized output as text. Throws on a
- * non-zero exit, a timeout, or an abort; callers turn the throw into a tool-error result so
- * the model loop continues.
+ * Fail a code-tool invocation. Callers turn this throw into a tool-error result so the model
+ * loop continues rather than crashing the whole run.
  */
 export async function runCodeTool(
-  runtime: CodeRuntime | undefined,
-  code: string,
-  env: Record<string, string> | undefined,
-  args: unknown,
-  signal?: AbortSignal,
+  _runtime: CodeRuntime | undefined,
+  _code: string,
+  _env: Record<string, string> | undefined,
+  _args: unknown,
+  _signal?: AbortSignal,
 ): Promise<string> {
-  const dir = mkdtempSync(join(tmpdir(), "agenta-code-"));
-  try {
-    const isNode = runtime === "node";
-    const scriptPath = join(dir, isNode ? "tool.js" : "tool.py");
-    writeFileSync(scriptPath, code ?? "", "utf8");
-
-    const command = isNode ? "node" : "python3";
-    const childArgs = isNode
-      ? ["-e", NODE_BOOTSTRAP, scriptPath]
-      : ["-c", PY_BOOTSTRAP, scriptPath];
-
-    return await new Promise<string>((resolve, reject) => {
-      const child = spawn(command, childArgs, {
-        // The child inherits ONLY a minimal startup allowlist (PATH, HOME, locale/temp, and
-        // Windows essentials when present) plus the tool's declared scoped secrets. It does
-        // NOT inherit the sidecar's process.env, so provider keys (OPENAI_API_KEY, etc.) that
-        // the in-process Pi path writes into process.env, AGENTA_* config, and other secret-
-        // bearing vars never reach an author-supplied snippet. Nothing is written to the
-        // agent-visible filesystem beyond the temp dir.
-        env: buildChildEnv(env),
-        stdio: ["pipe", "pipe", "pipe"],
-      });
-
-      let stdout = "";
-      let stderr = "";
-      let settled = false;
-      const finish = (fn: () => void) => {
-        if (settled) return;
-        settled = true;
-        clearTimeout(timer);
-        if (signal) signal.removeEventListener("abort", onAbort);
-        fn();
-      };
-
-      const timer = setTimeout(() => {
-        child.kill("SIGKILL");
-        finish(() =>
-          reject(new Error(`code tool timed out after ${CODE_TOOL_TIMEOUT_MS}ms`)),
-        );
-      }, CODE_TOOL_TIMEOUT_MS);
-
-      const onAbort = () => {
-        child.kill("SIGKILL");
-        finish(() => reject(new Error("aborted")));
-      };
-      if (signal) {
-        if (signal.aborted) onAbort();
-        else signal.addEventListener("abort", onAbort, { once: true });
-      }
-
-      child.stdout.on("data", (d) => (stdout += d));
-      child.stderr.on("data", (d) => (stderr += d));
-      child.on("error", (err) => finish(() => reject(err)));
-      child.on("close", (exitCode) =>
-        finish(() => {
-          if (exitCode === 0) resolve(stdout.trim());
-          else
-            reject(
-              new Error(
-                `code tool exited ${exitCode}: ${stderr.slice(0, 500) || "(no stderr)"}`,
-              ),
-            );
-        }),
-      );
-
-      child.stdin.write(JSON.stringify(args ?? {}));
-      child.stdin.end();
-    });
-  } finally {
-    try {
-      rmSync(dir, { recursive: true, force: true });
-    } catch {
-      // best-effort cleanup of the throwaway snippet dir
-    }
-  }
+  throw new Error(CODE_TOOL_UNSUPPORTED_MESSAGE);
 }
diff --git a/services/agent/src/tools/dispatch.ts b/services/agent/src/tools/dispatch.ts
index e8e2d25e38..89fdda10d1 100644
--- a/services/agent/src/tools/dispatch.ts
+++ b/services/agent/src/tools/dispatch.ts
@@ -9,7 +9,7 @@
  * advertise/skip behavior for `client` tools — only the execution itself is shared.
  *
  * The three executor kinds (see `ResolvedToolSpec`):
- *  - `code`: run the snippet in a sandbox subprocess with its scoped secret `env`.
+ *  - `code`: advertised to harnesses, but rejected by the sidecar as unsupported.
  *  - `client`: browser-fulfilled across a turn boundary; never executed in-sandbox (throws).
  *  - `callback` (default): POST back through Agenta's /tools/call so the Composio key and
  *    connection auth stay server-side. On Daytona the in-sandbox process can't reach Agenta,
@@ -96,7 +96,7 @@ export async function relayToolCall(
  * Execute one resolved tool and return its result text. Throws on failure; every call site
  * turns the throw into a tool-error result so the model loop continues rather than crashing.
  *
- *  - `code`   → run the snippet locally (scoped secret env), no callback/relay.
+ *  - `code`   -> reject as unsupported by the sidecar, no callback/relay.
  *  - `client` → throw: browser-fulfilled, never executed in-sandbox.
  *  - default/`callback` → relay through the runner when `opts.relayDir` is set (Daytona),
  *    else POST directly to `opts.endpoint`.
diff --git a/services/agent/src/tools/relay.ts b/services/agent/src/tools/relay.ts
index 4889b110af..824cbe16dd 100644
--- a/services/agent/src/tools/relay.ts
+++ b/services/agent/src/tools/relay.ts
@@ -20,6 +20,7 @@ import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync } from
 import { callAgentaTool } from "./callback.ts";
 import { runCodeTool } from "./code.ts";
 import type { ResolvedToolSpec, ToolCallbackContext } from "../protocol.ts";
+import type { PermissionPolicy } from "../responder.ts";
 
 export const RELAY_REQ_SUFFIX = ".req.json";
 export const RELAY_RES_SUFFIX = ".res.json";
@@ -44,6 +45,27 @@ export function sanitizeRelayId(id: string): string {
 
 export const sleep = (ms: number): Promise<void> => new Promise((r) => setTimeout(r, ms));
 
+/**
+ * Layer 3 enforcement (S3b): resolve a resolved-tool spec's `permission` to a concrete
+ * runner-side decision. `allow` runs, `deny` never runs; `ask` and an UNSET permission
+ * degrade to the run's headless `permissionPolicy` (`auto` -> allow, `deny` -> deny).
+ *
+ * Resolved tools (code / gateway-callback) run runner-side via the relay, harness-agnostic, so
+ * this is where their permission is enforced (Claude builtins are enforced at Layer 1 via
+ * .claude/settings.json instead). Surfacing an `ask` to a live human is the cross-turn HITL
+ * path (S5); here `ask` is a headless run, so it collapses onto the policy.
+ */
+export function resolvePermission(
+  permission: string | undefined,
+  policy: PermissionPolicy,
+): "allow" | "deny" {
+  if (permission === "allow") return "allow";
+  if (permission === "deny") return "deny";
+  // `ask` or unset: headless, so defer to the run policy.
+  // TODO(S5): surface ask to HITL instead of collapsing onto permissionPolicy.
+  return policy === "deny" ? "deny" : "allow";
+}
+
 export interface RelayHost {
   list: (dir: string) => Promise<string[]>;
   read: (path: string) => Promise<string>;
@@ -93,7 +115,19 @@ async function executeRelayedTool(
   spec: ResolvedToolSpec,
   req: RelayRequest,
   callback: ToolCallbackContext | undefined,
+  policy: PermissionPolicy,
 ): Promise<string> {
+  // Layer 3 enforcement (S3b): gate the call on the spec's permission before it runs.
+  // `deny` returns a refusal string (not a throw) so the harness folds it into the tool
+  // result and the model loop continues. `ask`/unset degrade to the headless policy.
+  const decision = resolvePermission(spec.permission, policy);
+  if (decision === "deny") {
+    if (spec.permission === "deny") {
+      return `Tool '${spec.name}' is denied by policy.`;
+    }
+    // ask/unset that the headless policy refused. TODO(S5): surface ask to HITL.
+    return `Tool '${spec.name}' requires approval; denied in headless mode.`;
+  }
   if (spec.kind === "client") {
     throw new Error(`client tool '${spec.name}' is browser-fulfilled and cannot be executed`);
   }
@@ -114,7 +148,7 @@ async function executeRelayedTool(
 
 /**
  * Runner-side relay loop. Polls the sandbox relay dir for request files, executes each
- * against Agenta's /tools/call (which the runner can reach), and writes the response file
+ * against the private spec in memory, and writes the response file
  * the in-sandbox extension is waiting on. Returns `stop()` to end the loop and drain any
  * in-flight executions; call it once the prompt resolves.
  */
@@ -123,6 +157,7 @@ export function startToolRelay(
   relayDir: string,
   specs: ResolvedToolSpec[],
   callback: ToolCallbackContext | undefined,
+  policy: PermissionPolicy,
 ): { stop: () => Promise<void> } {
   let active = true;
   const seen = new Set<string>();
@@ -141,6 +176,7 @@ export function startToolRelay(
         spec,
         { ...req, toolCallId: req.toolCallId ?? id },
         callback,
+        policy,
       );
       res = { ok: true, text };
     } catch (err) {
diff --git a/services/agent/tests/unit/code-tool.test.ts b/services/agent/tests/unit/code-tool.test.ts
index 5a3566614d..5914797459 100644
--- a/services/agent/tests/unit/code-tool.test.ts
+++ b/services/agent/tests/unit/code-tool.test.ts
@@ -1,89 +1,25 @@
 /**
- * Unit test for the code-tool executor (runCodeTool).
+ * Unit test for the code-tool sidecar execution gate.
  *
- * Exercises both runtimes end-to-end through real subprocesses: a python tool, node tools
- * written as a bare top-level `function main` (the F2 regression) and as an explicit
- * `module.exports.main`, an async node `main`, the F3 env-isolation guarantee (provider keys
- * do NOT leak in; declared scoped secrets DO), and the non-zero-exit reject path.
- *
- * Needs `python3` and `node` on PATH (both present locally and on ubuntu CI runners).
- *
- * Run: pnpm test (or: pnpm exec vitest run tests/unit/code-tool.test.ts)
+ * Code tools remain part of the public tool interface and can still be advertised to harnesses,
+ * but the sidecar must not execute author-supplied snippets locally.
  */
 import { describe, it } from "vitest";
 import assert from "node:assert/strict";
 
-import { runCodeTool } from "../../src/tools/code.ts";
+import { CODE_TOOL_UNSUPPORTED_MESSAGE, runCodeTool } from "../../src/tools/code.ts";
 
 describe("runCodeTool", () => {
-  it("runs a python bare `def main(**kw)`", async () => {
-    const code = 'def main(**kw):\n    return {"sum": kw.get("a", 0) + kw.get("b", 0)}\n';
-    const out = await runCodeTool("python", code, undefined, { a: 2, b: 3 });
-    assert.deepEqual(JSON.parse(out), { sum: 5 }, "python bare main returns the right JSON");
-  });
-
-  it("runs a node bare top-level `function main` (F2 regression)", async () => {
-    const code = "function main(inputs) { return { got: inputs }; }";
-    const out = await runCodeTool("node", code, undefined, { hello: "world" });
-    assert.deepEqual(
-      JSON.parse(out),
-      { got: { hello: "world" } },
-      "node bare function main executes and echoes the input",
-    );
-  });
-
-  it("runs a node explicit `module.exports.main`", async () => {
-    const code = "module.exports.main = function (inputs) { return { via: 'exports', got: inputs }; };";
-    const out = await runCodeTool("node", code, undefined, { x: 1 });
-    assert.deepEqual(
-      JSON.parse(out),
-      { via: "exports", got: { x: 1 } },
-      "node module.exports.main works",
-    );
-  });
-
-  it("runs an async node `main` returning a Promise", async () => {
-    const code =
-      "async function main(inputs) { await new Promise((r) => setTimeout(r, 5)); return { doubled: inputs.n * 2 }; }";
-    const out = await runCodeTool("node", code, undefined, { n: 21 });
-    assert.deepEqual(JSON.parse(out), { doubled: 42 }, "node async main resolves");
-  });
-
-  it("F3: provider keys do NOT leak; scoped secrets DO", async () => {
-    const hadKey = "OPENAI_API_KEY" in process.env;
-    const prevKey = process.env.OPENAI_API_KEY;
-    process.env.OPENAI_API_KEY = "leak-me-xyz";
-    try {
-      // The provider key sits in process.env but must not reach the snippet.
-      const leakCode = "function main() { return { key: process.env.OPENAI_API_KEY ?? 'absent' }; }";
-      const leakOut = await runCodeTool("node", leakCode, undefined, {});
-      assert.deepEqual(
-        JSON.parse(leakOut),
-        { key: "absent" },
-        "F3: OPENAI_API_KEY did NOT leak into the snippet env",
-      );
-
-      // A secret declared on the tool (passed via the scoped `env` arg) must be visible.
-      const scopedCode =
-        "function main() { return { secret: process.env.MY_TOOL_SECRET ?? 'absent' }; }";
-      const scopedOut = await runCodeTool("node", scopedCode, { MY_TOOL_SECRET: "ok" }, {});
-      assert.deepEqual(
-        JSON.parse(scopedOut),
-        { secret: "ok" },
-        "F3: scoped MY_TOOL_SECRET IS visible to the snippet",
-      );
-    } finally {
-      if (hadKey) process.env.OPENAI_API_KEY = prevKey;
-      else delete process.env.OPENAI_API_KEY;
-    }
-  });
-
-  it("rejects when the snippet throws / exits non-zero", async () => {
-    const code = "function main() { throw new Error('boom'); }";
+  it("fails with a clear unsupported error instead of executing code", async () => {
     await assert.rejects(
-      () => runCodeTool("node", code, undefined, {}),
-      /boom|exited/,
-      "a throwing snippet rejects",
+      () =>
+        runCodeTool(
+          "node",
+          "function main() { return { executed: true }; }",
+          { MY_TOOL_SECRET: "secret" },
+          { input: true },
+        ),
+      new RegExp(CODE_TOOL_UNSUPPORTED_MESSAGE),
     );
   });
 });
diff --git a/services/agent/tests/unit/pi-capability-guard.test.ts b/services/agent/tests/unit/pi-capability-guard.test.ts
new file mode 100644
index 0000000000..653fe084d9
--- /dev/null
+++ b/services/agent/tests/unit/pi-capability-guard.test.ts
@@ -0,0 +1,76 @@
+import { describe, it } from "vitest";
+import assert from "node:assert";
+
+import { unenforceableCapabilityConfig } from "../../src/engines/pi.ts";
+import type { AgentRunRequest } from "../../src/protocol.ts";
+
+/**
+ * The in-process `pi` backend has no sandbox and runs tools without the relay, so it must reject
+ * capability config it cannot enforce instead of silently ignoring it (the backend-routing footgun
+ * found in live QA). `sandbox-agent` is the enforcing backend.
+ */
+function req(extra: Partial<AgentRunRequest>): AgentRunRequest {
+  return { harness: "pi", messages: [], ...extra } as AgentRunRequest;
+}
+
+describe("unenforceableCapabilityConfig (in-process pi guard)", () => {
+  it("passes a plain request (no capability config)", () => {
+    assert.equal(unenforceableCapabilityConfig(req({})), undefined);
+  });
+
+  it("passes a permissive sandbox_permission (network on, no fs restriction)", () => {
+    assert.equal(
+      unenforceableCapabilityConfig(
+        req({ sandboxPermission: { network: { mode: "on" }, enforcement: "strict" } }),
+      ),
+      undefined,
+    );
+  });
+
+  it("rejects a restricted network", () => {
+    const msg = unenforceableCapabilityConfig(
+      req({ sandboxPermission: { network: { mode: "off" }, enforcement: "strict" } }),
+    );
+    assert.match(msg ?? "", /cannot enforce sandbox_permission\.network='off'/);
+  });
+
+  it("rejects an allowlist network too", () => {
+    const msg = unenforceableCapabilityConfig(
+      req({
+        sandboxPermission: { network: { mode: "allowlist", allowlist: ["10.0.0.0/8"] }, enforcement: "strict" },
+      }),
+    );
+    assert.match(msg ?? "", /network='allowlist'/);
+  });
+
+  it("rejects a restricting filesystem", () => {
+    const msg = unenforceableCapabilityConfig(
+      req({ sandboxPermission: { network: { mode: "on" }, filesystem: "readonly", enforcement: "strict" } }),
+    );
+    assert.match(msg ?? "", /filesystem='readonly'/);
+  });
+
+  it("rejects a deny tool permission and names the tool", () => {
+    const msg = unenforceableCapabilityConfig(
+      req({ customTools: [{ kind: "code", name: "danger", permission: "deny" } as never] }),
+    );
+    assert.match(msg ?? "", /does not enforce tool permissions/);
+    assert.match(msg ?? "", /danger/);
+  });
+
+  it("rejects an ask tool permission", () => {
+    const msg = unenforceableCapabilityConfig(
+      req({ customTools: [{ kind: "code", name: "t", permission: "ask" } as never] }),
+    );
+    assert.match(msg ?? "", /deny\/ask/);
+  });
+
+  it("passes an allow permission (no enforcement needed)", () => {
+    assert.equal(
+      unenforceableCapabilityConfig(
+        req({ customTools: [{ kind: "code", name: "t", permission: "allow" } as never] }),
+      ),
+      undefined,
+    );
+  });
+});
diff --git a/services/agent/tests/unit/sandbox-agent-provider.test.ts b/services/agent/tests/unit/sandbox-agent-provider.test.ts
new file mode 100644
index 0000000000..2e33286f95
--- /dev/null
+++ b/services/agent/tests/unit/sandbox-agent-provider.test.ts
@@ -0,0 +1,54 @@
+/**
+ * Unit tests for the Layer 2 network policy -> Daytona create field mapping (S1b).
+ *
+ * The mapping is tested directly because the real `daytona()` provider closes over its
+ * create object and constructs a Daytona client (needs API-key env), so it cannot be
+ * inspected through `buildSandboxProvider`. The orchestration test covers that the run
+ * plan's `sandboxPermission` reaches `buildSandboxProvider` as the new argument.
+ *
+ * Run: pnpm test (or: pnpm exec vitest run tests/unit/sandbox-agent-provider.test.ts)
+ */
+import { describe, it } from "vitest";
+import assert from "node:assert/strict";
+
+import { daytonaNetworkFields } from "../../src/engines/sandbox_agent/provider.ts";
+
+describe("daytonaNetworkFields", () => {
+  it("blocks all egress for network:off", () => {
+    assert.deepEqual(
+      daytonaNetworkFields({ network: { mode: "off" }, enforcement: "strict" }),
+      { networkBlockAll: true },
+    );
+  });
+
+  it("renders a non-empty allowlist as a comma-separated CIDR string", () => {
+    assert.deepEqual(
+      daytonaNetworkFields({
+        network: { mode: "allowlist", allowlist: ["a", "b"] },
+        enforcement: "strict",
+      }),
+      { networkAllowList: "a,b" },
+    );
+  });
+
+  it("blocks all egress for an empty allowlist (allow zero ranges == allow nothing)", () => {
+    assert.deepEqual(
+      daytonaNetworkFields({
+        network: { mode: "allowlist", allowlist: [] },
+        enforcement: "strict",
+      }),
+      { networkBlockAll: true },
+    );
+  });
+
+  it("leaves the sandbox default-open for network:on", () => {
+    assert.deepEqual(
+      daytonaNetworkFields({ network: { mode: "on" }, enforcement: "strict" }),
+      {},
+    );
+  });
+
+  it("leaves the sandbox default-open when no policy is set", () => {
+    assert.deepEqual(daytonaNetworkFields(undefined), {});
+  });
+});
diff --git a/services/agent/tests/unit/tool-bridge.test.ts b/services/agent/tests/unit/tool-bridge.test.ts
index fcd7eb6a13..0371633b20 100644
--- a/services/agent/tests/unit/tool-bridge.test.ts
+++ b/services/agent/tests/unit/tool-bridge.test.ts
@@ -2,8 +2,8 @@
  * Unit tests for buildToolMcpServers (the tool MCP bridge attachment decision).
  *
  * Regression cover for F4: attachment must be decided per tool kind, not on the callback
- * endpoint alone. A `code` tool runs locally in mcp-server.ts and needs no endpoint, so a run
- * whose tools are all `code` must still attach the `agenta-tools` server. Only `callback`-kind
+ * endpoint alone. A `code` tool is still advertised through mcp-server.ts and needs no endpoint,
+ * so a run whose tools are all `code` must still attach the `agenta-tools` server. Only `callback`-kind
  * tools require AGENTA_TOOL_CALLBACK_ENDPOINT; missing it must degrade those tools, not drop the
  * whole server. `client` tools are browser-fulfilled and never justify attaching the bridge.
  *
@@ -109,7 +109,7 @@ describe("buildToolMcpServers", () => {
     ];
     const out = buildToolMcpServers(specs, relayDir);
     assert.notDeepEqual(out, [], "mixed run with no endpoint must not return []");
-    assert.equal(out.length, 1, "still attaches the server so the code tool works");
+    assert.equal(out.length, 1, "still attaches the server so the code tool is advertised");
     assert.equal(
       envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"),
       undefined,
diff --git a/services/agent/tests/unit/tool-dispatch.test.ts b/services/agent/tests/unit/tool-dispatch.test.ts
index af27dc991f..a6ca4e8e85 100644
--- a/services/agent/tests/unit/tool-dispatch.test.ts
+++ b/services/agent/tests/unit/tool-dispatch.test.ts
@@ -7,9 +7,9 @@
  * advertising behavior that stays per-site:
  *  - buildCustomTools (pi.ts) skips `client` specs, builds a tool per `code`/`callback` spec,
  *    and skips a `callback` spec with no callback endpoint.
- *  - runResolvedTool runs a real `code` snippet end-to-end (python) and throws for `client`.
+ *  - runResolvedTool advertises code tools but fails their sidecar execution, and throws for `client`.
  *
- * No network and no harness: the `code` path shells out to python3 (available locally); the
+ * No network and no harness: the `code` path now fails before any subprocess; the
  * `callback`/relay paths are not exercised here (they need a live /tools/call or a relay dir).
  *
  * Run: pnpm test (or: pnpm exec vitest run tests/unit/tool-dispatch.test.ts)
@@ -66,15 +66,14 @@ describe("buildCustomTools routing", () => {
 });
 
 describe("runResolvedTool", () => {
-  it("runs a code spec end-to-end (python)", async () => {
-    const text = await runResolvedTool(codeSpec, { greeting: "hi", n: 3 }, {
-      toolCallId: "call-1",
-    });
-    const parsed = JSON.parse(text);
-    assert.deepEqual(
-      parsed,
-      { echo: { greeting: "hi", n: 3 } },
-      "code tool runs the snippet and returns its JSON output containing the input",
+  it("fails code specs with a clear unsupported error", async () => {
+    await assert.rejects(
+      () =>
+        runResolvedTool(codeSpec, { greeting: "hi", n: 3 }, {
+          toolCallId: "call-1",
+        }),
+      /Code tools are not supported by the sidecar\./,
+      "code tools remain advertised but are not executable by the sidecar",
     );
   });
 
diff --git a/services/agent/tests/unit/tool-relay-permission.test.ts b/services/agent/tests/unit/tool-relay-permission.test.ts
new file mode 100644
index 0000000000..5c4cff38db
--- /dev/null
+++ b/services/agent/tests/unit/tool-relay-permission.test.ts
@@ -0,0 +1,109 @@
+/**
+ * Unit tests for Layer-3 permission enforcement in the runner-side tool relay (S3b).
+ *
+ *  - `resolvePermission` is the pure ladder: allow/deny are honored as-is; ask/unset degrade to
+ *    the headless permission policy (auto -> allow, deny -> deny).
+ *  - `startToolRelay` enforces that ladder before executing a relayed tool: a `deny` spec is
+ *    refused before execution and an `allow` code spec reaches the sidecar unsupported gate.
+ *
+ * The relay loop is driven over a real temp dir via `localRelayHost`: write a `<id>.req.json`,
+ * poll for the `<id>.res.json` the runner writes back. A `code` spec is used so execution
+ * needs no network or callback; the sidecar now returns a deterministic unsupported error
+ * instead of spawning a runtime.
+ *
+ * Run: pnpm test (or: pnpm exec vitest run tests/unit/tool-relay-permission.test.ts)
+ */
+import { describe, it } from "vitest";
+import assert from "node:assert/strict";
+import { existsSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+
+import {
+  localRelayHost,
+  resolvePermission,
+  startToolRelay,
+  type RelayResponse,
+} from "../../src/tools/relay.ts";
+import type { ResolvedToolSpec } from "../../src/protocol.ts";
+
+const codeSpec = (
+  name: string,
+  permission?: ResolvedToolSpec["permission"],
+): ResolvedToolSpec => ({
+  name,
+  kind: "code",
+  runtime: "python",
+  code: 'def main(**kw):\n    return {"ran": True, "echo": kw}\n',
+  permission,
+});
+
+/** Drive one tool call through the relay loop and return the response the runner wrote. */
+async function relayOnce(
+  spec: ResolvedToolSpec,
+  policy: "auto" | "deny",
+): Promise<RelayResponse> {
+  const dir = mkdtempSync(join(tmpdir(), "agenta-relay-disp-"));
+  try {
+    const id = "call-1";
+    writeFileSync(
+      join(dir, `${id}.req.json`),
+      JSON.stringify({ toolName: spec.name, toolCallId: id, args: { a: 1 } }),
+    );
+    const relay = startToolRelay(localRelayHost(), dir, [spec], undefined, policy);
+    const resPath = join(dir, `${id}.res.json`);
+    const deadline = Date.now() + 5000;
+    while (Date.now() < deadline && !existsSync(resPath)) {
+      await new Promise((r) => setTimeout(r, 20));
+    }
+    await relay.stop();
+    assert.ok(existsSync(resPath), "the relay wrote a response file");
+    return JSON.parse(readFileSync(resPath, "utf-8")) as RelayResponse;
+  } finally {
+    rmSync(dir, { recursive: true, force: true });
+  }
+}
+
+describe("resolvePermission", () => {
+  const cases: Array<[ResolvedToolSpec["permission"], "auto" | "deny", "allow" | "deny"]> = [
+    ["allow", "auto", "allow"],
+    ["allow", "deny", "allow"],
+    ["deny", "auto", "deny"],
+    ["deny", "deny", "deny"],
+    ["ask", "auto", "allow"],
+    ["ask", "deny", "deny"],
+    [undefined, "auto", "allow"],
+    [undefined, "deny", "deny"],
+    // A garbage/unrecognized permission must fall to the policy (never auto-allow).
+    ["bogus" as ResolvedToolSpec["permission"], "auto", "allow"],
+    ["bogus" as ResolvedToolSpec["permission"], "deny", "deny"],
+  ];
+
+  for (const [permission, policy, expected] of cases) {
+    it(`permission=${permission ?? "unset"} policy=${policy} -> ${expected}`, () => {
+      assert.equal(resolvePermission(permission, policy), expected);
+    });
+  }
+});
+
+describe("startToolRelay permission enforcement", () => {
+  it("refuses a deny spec without executing its code", async () => {
+    const res = await relayOnce(codeSpec("blocked", "deny"), "auto");
+    assert.equal(res.ok, true, "a policy refusal rides as an ok tool result, not an error");
+    assert.equal(res.text, "Tool 'blocked' is denied by policy.");
+    // The refusal string is the whole result: the snippet's `{"ran": true}` never appears.
+    assert.ok(!String(res.text).includes("ran"), "the denied tool's code did not run");
+  });
+
+  it("returns unsupported for an allow code spec", async () => {
+    const res = await relayOnce(codeSpec("permitted", "allow"), "auto");
+    assert.equal(res.ok, false);
+    assert.match(res.error ?? "", /Code tools are not supported by the sidecar\./);
+  });
+
+  it("refuses an unset spec when the headless policy is deny", async () => {
+    const res = await relayOnce(codeSpec("gated", undefined), "deny");
+    assert.equal(res.ok, true);
+    assert.equal(res.text, "Tool 'gated' requires approval; denied in headless mode.");
+  });
+});
diff --git a/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/ClaudePermissionsControl.tsx b/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/ClaudePermissionsControl.tsx
new file mode 100644
index 0000000000..96445f3d83
--- /dev/null
+++ b/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/ClaudePermissionsControl.tsx
@@ -0,0 +1,166 @@
+/**
+ * ClaudePermissionsControl
+ *
+ * Layer 1 of the agent capability config, Claude-only: the Claude harness's own permission knobs.
+ * These map 1:1 to Claude Code's `permissions` settings block and persist into the neutral
+ * `harness_options.claude.permissions` bag on the agent config (the SDK's `ClaudePermissions`,
+ * agenta.sdk.agents.dtos). The shape:
+ *   { default_mode?: "default"|"acceptEdits"|"plan"|"bypassPermissions",
+ *     allow: string[], deny: string[], ask: string[] }
+ *
+ * It is rendered as a collapsed "advanced" section by AgentConfigControl and is hidden when the
+ * harness is not Claude (nothing here applies to Pi, which never gates tool use). Non-destructive:
+ * an author who touches nothing leaves `harness_options` untouched.
+ */
+import {memo, useCallback, useMemo} from "react"
+
+import {LabeledField} from "@agenta/ui/components/presentational"
+import {cn} from "@agenta/ui/styles"
+import {Input, Select} from "antd"
+
+type ClaudePermissionMode = "default" | "acceptEdits" | "plan" | "bypassPermissions"
+
+interface ClaudePermissionsValue {
+    default_mode?: ClaudePermissionMode | null
+    allow?: string[]
+    deny?: string[]
+    ask?: string[]
+}
+
+export interface ClaudePermissionsControlProps {
+    /** The `harness_options.claude.permissions` value (object or null/undefined). */
+    value?: Record<string, unknown> | null
+    /** Called with the next permissions object. */
+    onChange: (value: Record<string, unknown>) => void
+    /** Disable the control */
+    disabled?: boolean
+    /** Additional CSS classes */
+    className?: string
+}
+
+const MODE_OPTIONS: {value: ClaudePermissionMode; label: string}[] = [
+    {value: "default", label: "Default (prompt on each gated tool)"},
+    {value: "acceptEdits", label: "Accept edits (auto-accept file edits)"},
+    {value: "plan", label: "Plan (read-only planning)"},
+    {value: "bypassPermissions", label: "Bypass (skip every gate)"},
+]
+
+/** Read the current value, defaulting the rule lists to empty arrays. */
+function readValue(value: Record<string, unknown> | null | undefined): {
+    defaultMode: ClaudePermissionMode | null
+    allow: string[]
+    deny: string[]
+    ask: string[]
+} {
+    const v = (value ?? {}) as ClaudePermissionsValue
+    return {
+        defaultMode: (v.default_mode as ClaudePermissionMode | null | undefined) ?? null,
+        allow: Array.isArray(v.allow) ? v.allow : [],
+        deny: Array.isArray(v.deny) ? v.deny : [],
+        ask: Array.isArray(v.ask) ? v.ask : [],
+    }
+}
+
+/** Split a textarea (one rule per line) into a trimmed, non-empty rule list. */
+function parseRules(text: string): string[] {
+    return text
+        .split("\n")
+        .map((s) => s.trim())
+        .filter(Boolean)
+}
+
+export const ClaudePermissionsControl = memo(function ClaudePermissionsControl({
+    value,
+    onChange,
+    disabled = false,
+    className,
+}: ClaudePermissionsControlProps) {
+    const current = useMemo(() => readValue(value), [value])
+
+    // Compose the full permissions object, overriding one slice. `default_mode` is only written
+    // when set so an author who only edits rules never emits a null mode.
+    const write = useCallback(
+        (patch: {
+            defaultMode?: ClaudePermissionMode | null
+            allow?: string[]
+            deny?: string[]
+            ask?: string[]
+        }) => {
+            const next = {...current, ...patch}
+            const out: Record<string, unknown> = {
+                allow: next.allow,
+                deny: next.deny,
+                ask: next.ask,
+            }
+            if (next.defaultMode) out.default_mode = next.defaultMode
+            onChange(out)
+        },
+        [current, onChange],
+    )
+
+    return (
+        <div className={cn("flex flex-col gap-2", className)}>
+            <LabeledField
+                label="Permission mode"
+                description="Claude Code's default permission mode for this headless run."
+                withTooltip
+            >
+                <Select<ClaudePermissionMode>
+                    value={current.defaultMode ?? undefined}
+                    onChange={(v) => write({defaultMode: v ?? null})}
+                    options={MODE_OPTIONS}
+                    disabled={disabled}
+                    placeholder="Claude default"
+                    allowClear
+                    className="w-full"
+                    size="small"
+                />
+            </LabeledField>
+
+            <LabeledField
+                label="Allow rules"
+                description='Per-tool allow rules, one per line (e.g. "Read", "Bash(npm run:*)").'
+                withTooltip
+            >
+                <Input.TextArea
+                    value={current.allow.join("\n")}
+                    onChange={(e) => write({allow: parseRules(e.target.value)})}
+                    disabled={disabled}
+                    placeholder={"Read\nBash(npm run:*)"}
+                    rows={2}
+                    className="resize-y font-mono text-xs"
+                />
+            </LabeledField>
+
+            <LabeledField
+                label="Ask rules"
+                description="Per-tool rules that prompt before use, one per line."
+                withTooltip
+            >
+                <Input.TextArea
+                    value={current.ask.join("\n")}
+                    onChange={(e) => write({ask: parseRules(e.target.value)})}
+                    disabled={disabled}
+                    placeholder="Bash(rm:*)"
+                    rows={2}
+                    className="resize-y font-mono text-xs"
+                />
+            </LabeledField>
+
+            <LabeledField
+                label="Deny rules"
+                description="Per-tool rules that are always blocked, one per line."
+                withTooltip
+            >
+                <Input.TextArea
+                    value={current.deny.join("\n")}
+                    onChange={(e) => write({deny: parseRules(e.target.value)})}
+                    disabled={disabled}
+                    placeholder={"Write\nmcp__server__tool"}
+                    rows={2}
+                    className="resize-y font-mono text-xs"
+                />
+            </LabeledField>
+        </div>
+    )
+})
diff --git a/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/SandboxPermissionControl.tsx b/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/SandboxPermissionControl.tsx
new file mode 100644
index 0000000000..eb6ddd4aa1
--- /dev/null
+++ b/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/SandboxPermissionControl.tsx
@@ -0,0 +1,192 @@
+/**
+ * SandboxPermissionControl
+ *
+ * Layer 2 of the agent capability config: the sandbox security boundary the agent runs inside
+ * (`sandbox_permission` on the agent config). It applies to every harness. The shape mirrors
+ * the SDK's `SandboxPermission` (agenta.sdk.agents.dtos):
+ *   { network: { mode: "on"|"off"|"allowlist", allowlist: string[] },
+ *     filesystem?: "on"|"readonly"|"off",
+ *     enforcement: "strict"|"best_effort" }
+ *
+ * The emitted `agent_config` schema exposes `sandbox_permission` as a nullable object (anyOf),
+ * so the generic SchemaPropertyRenderer can only show a "click to expand" drill-in indicator for
+ * it. This control renders the four knobs as a real form instead: a network-mode selector, a
+ * CIDR allowlist shown only when mode = allowlist, a filesystem selector, and an enforcement
+ * toggle. It is non-destructive: an unset value stays `null` (no declared boundary) until the
+ * author changes something.
+ */
+import {memo, useCallback, useMemo} from "react"
+
+import {LabeledField} from "@agenta/ui/components/presentational"
+import {cn} from "@agenta/ui/styles"
+import {Input, Select, Typography} from "antd"
+
+type NetworkMode = "on" | "off" | "allowlist"
+type FilesystemMode = "on" | "readonly" | "off"
+type Enforcement = "strict" | "best_effort"
+
+interface NetworkEgress {
+    mode: NetworkMode
+    allowlist: string[]
+}
+
+interface SandboxPermissionValue {
+    network?: NetworkEgress
+    filesystem?: FilesystemMode | null
+    enforcement?: Enforcement
+}
+
+export interface SandboxPermissionControlProps {
+    /** The `sandbox_permission` value (object or null) from the agent config. */
+    value?: Record<string, unknown> | null
+    /** Called with the next `sandbox_permission` object (never null once the author touches it). */
+    onChange: (value: Record<string, unknown>) => void
+    /** Disable the control */
+    disabled?: boolean
+    /** Additional CSS classes */
+    className?: string
+}
+
+const NETWORK_MODE_OPTIONS: {value: NetworkMode; label: string}[] = [
+    {value: "on", label: "Allow all egress"},
+    {value: "off", label: "Block all egress"},
+    {value: "allowlist", label: "Allowlist (CIDR ranges)"},
+]
+
+const FILESYSTEM_OPTIONS: {value: FilesystemMode; label: string}[] = [
+    {value: "on", label: "Read/write"},
+    {value: "readonly", label: "Read-only"},
+    {value: "off", label: "No access"},
+]
+
+const ENFORCEMENT_OPTIONS: {value: Enforcement; label: string}[] = [
+    {value: "strict", label: "Strict (fail if unenforceable)"},
+    {value: "best_effort", label: "Best effort"},
+]
+
+/** Read the current value with the same defaults the SDK applies, so the form is never blank. */
+function readValue(value: Record<string, unknown> | null | undefined): {
+    networkMode: NetworkMode
+    allowlist: string[]
+    filesystem: FilesystemMode | null
+    enforcement: Enforcement
+} {
+    const v = (value ?? {}) as SandboxPermissionValue
+    const network = (v.network ?? {}) as NetworkEgress
+    return {
+        networkMode: network.mode ?? "on",
+        allowlist: Array.isArray(network.allowlist) ? network.allowlist : [],
+        filesystem: (v.filesystem as FilesystemMode | null | undefined) ?? null,
+        enforcement: v.enforcement ?? "strict",
+    }
+}
+
+export const SandboxPermissionControl = memo(function SandboxPermissionControl({
+    value,
+    onChange,
+    disabled = false,
+    className,
+}: SandboxPermissionControlProps) {
+    const current = useMemo(() => readValue(value), [value])
+
+    // Compose the full `sandbox_permission` object from the current state, overriding one slice.
+    // We always write the network object (with mode + allowlist) and enforcement; filesystem is
+    // only written when set (it is declared, not enforced, and stays omitted otherwise).
+    const write = useCallback(
+        (patch: {
+            networkMode?: NetworkMode
+            allowlist?: string[]
+            filesystem?: FilesystemMode | null
+            enforcement?: Enforcement
+        }) => {
+            const next = {...current, ...patch}
+            const out: Record<string, unknown> = {
+                network: {
+                    mode: next.networkMode,
+                    allowlist: next.networkMode === "allowlist" ? next.allowlist : [],
+                },
+                enforcement: next.enforcement,
+            }
+            if (next.filesystem) out.filesystem = next.filesystem
+            onChange(out)
+        },
+        [current, onChange],
+    )
+
+    return (
+        <div className={cn("flex flex-col gap-2", className)}>
+            <Typography.Text className="text-sm font-medium">Sandbox permissions</Typography.Text>
+
+            <LabeledField
+                label="Network egress"
+                description="Outbound network access for the sandbox. Declared config; enforced by the runner in a later slice."
+                withTooltip
+            >
+                <Select<NetworkMode>
+                    value={current.networkMode}
+                    onChange={(v) => write({networkMode: v})}
+                    options={NETWORK_MODE_OPTIONS}
+                    disabled={disabled}
+                    className="w-full"
+                    size="small"
+                />
+            </LabeledField>
+
+            {current.networkMode === "allowlist" && (
+                <LabeledField
+                    label="Allowlist"
+                    description="CIDR ranges allowed for outbound egress, one per line (e.g. 10.0.0.0/8)."
+                    withTooltip
+                >
+                    <Input.TextArea
+                        value={current.allowlist.join("\n")}
+                        onChange={(e) =>
+                            write({
+                                allowlist: e.target.value
+                                    .split("\n")
+                                    .map((s) => s.trim())
+                                    .filter(Boolean),
+                            })
+                        }
+                        disabled={disabled}
+                        placeholder={"10.0.0.0/8\n192.168.0.0/16"}
+                        rows={3}
+                        className="resize-y font-mono text-xs"
+                    />
+                </LabeledField>
+            )}
+
+            <LabeledField
+                label="Filesystem"
+                description="Declared filesystem access for the sandbox. Optional; leave unset for no declared boundary."
+                withTooltip
+            >
+                <Select<FilesystemMode>
+                    value={current.filesystem ?? undefined}
+                    onChange={(v) => write({filesystem: v ?? null})}
+                    options={FILESYSTEM_OPTIONS}
+                    disabled={disabled}
+                    placeholder="Not declared"
+                    allowClear
+                    className="w-full"
+                    size="small"
+                />
+            </LabeledField>
+
+            <LabeledField
+                label="Enforcement"
+                description="Strict fails the run when the boundary cannot be applied; best effort continues."
+                withTooltip
+            >
+                <Select<Enforcement>
+                    value={current.enforcement}
+                    onChange={(v) => write({enforcement: v})}
+                    options={ENFORCEMENT_OPTIONS}
+                    disabled={disabled}
+                    className="w-full"
+                    size="small"
+                />
+            </LabeledField>
+        </div>
+    )
+})
diff --git a/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/ToolItemControl.tsx b/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/ToolItemControl.tsx
index 205bb21c85..70bfee0613 100644
--- a/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/ToolItemControl.tsx
+++ b/web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/ToolItemControl.tsx
@@ -23,7 +23,7 @@ import {CollapseToggleButton, getCollapseStyle} from "@agenta/ui/components/pres
 import {useDrillInUI} from "@agenta/ui/drill-in"
 import {getProviderIcon} from "@agenta/ui/select-llm-provider"
 import {CopySimple, MinusCircle} from "@phosphor-icons/react"
-import {Button, Tooltip, Typography} from "antd"
+import {Button, Select, Tooltip, Typography} from "antd"
 import clsx from "clsx"
 
 import {TOOL_PROVIDERS_META, TOOL_SPECS, parseGatewayFunctionName, type ToolObj} from "./toolUtils"
@@ -435,6 +435,98 @@ function defaultRenderProviderIcon(providerKey: string): React.ReactNode {
     return <Icon className="w-4 h-4" />
 }
 
+// ============================================================================
+// PER-TOOL PERMISSION (Layer 3: allow / ask / deny)
+// ============================================================================
+
+/**
+ * The tool-use permission vocabulary. Stored as a TOP-LEVEL `permission` key on the tool
+ * object (not inside `agenta_metadata`, which `stripAgentaMetadataDeep` removes on every
+ * save/run path). The SDK tool config/spec deserializes it with no mapping
+ * (`AliasChoices("permission", "permission_mode", "permissionMode")`). Unset means the tool
+ * inherits the global permission policy.
+ */
+type ToolPermission = "allow" | "ask" | "deny"
+
+const PERMISSION_OPTIONS: {value: ToolPermission; label: string}[] = [
+    {value: "allow", label: "Allow"},
+    {value: "ask", label: "Ask"},
+    {value: "deny", label: "Deny"},
+]
+
+/**
+ * Default the permission from the tool's `read_only` metadata, when the catalog supplied it:
+ * `read_only === true` → allow (no side effect), `read_only === false` → ask, unknown → unset
+ * (inherit the global policy). This is a display default only — it is not written back unless
+ * the author changes it.
+ */
+function defaultPermissionFromReadOnly(
+    metadata: Record<string, unknown> | undefined,
+): ToolPermission | undefined {
+    const readOnly = metadata?.read_only
+    if (readOnly === true) return "allow"
+    if (readOnly === false) return "ask"
+    return undefined
+}
+
+function isPermission(value: unknown): value is ToolPermission {
+    return value === "allow" || value === "ask" || value === "deny"
+}
+
+/**
+ * Read the stored permission. Canonical home is the TOP-LEVEL `permission` key (it survives
+ * `stripAgentaMetadataDeep`); fall back to any legacy `agenta_metadata.permission_mode` so values
+ * written before this change still display.
+ */
+function readPermission(
+    topLevel: unknown,
+    metadata: Record<string, unknown> | undefined,
+): ToolPermission | undefined {
+    if (isPermission(topLevel)) return topLevel
+    if (isPermission(metadata?.permission_mode)) return metadata.permission_mode
+    return undefined
+}
+
+interface ToolPermissionControlProps {
+    /** The current top-level `permission` value on the tool (canonical store). */
+    permission: unknown
+    /** The current `agenta_metadata` of the tool (read for the read_only default + legacy mode). */
+    metadata: Record<string, unknown> | undefined
+    /** Called with the chosen permission (or null to clear back to the global policy). */
+    onChange: (permission: ToolPermission | null) => void
+    disabled?: boolean
+}
+
+/** Compact allow/ask/deny selector for one tool. Displays the read_only-derived default unwritten. */
+function ToolPermissionControl({
+    permission,
+    metadata,
+    onChange,
+    disabled,
+}: ToolPermissionControlProps) {
+    const stored = readPermission(permission, metadata)
+    const displayDefault = defaultPermissionFromReadOnly(metadata)
+    return (
+        <div className="flex items-center gap-2 px-1 py-0.5">
+            <Tooltip title="How tool-use is gated: allow auto-approves, ask prompts, deny blocks. Unset inherits the global permission policy.">
+                <Typography.Text type="secondary" className="text-xs">
+                    Permission
+                </Typography.Text>
+            </Tooltip>
+            <Select<ToolPermission>
+                value={stored ?? undefined}
+                onChange={(v) => onChange(v ?? null)}
+                options={PERMISSION_OPTIONS}
+                disabled={disabled}
+                placeholder={displayDefault ? `${displayDefault} (default)` : "Inherit policy"}
+                allowClear
+                size="small"
+                className="w-[160px]"
+            />
+        </div>
+    )
+}
+
 export interface ToolItemControlProps {
     /** Tool value (object or JSON string) */
     value: unknown
@@ -500,6 +592,16 @@ export const ToolItemControl = memo(function ToolItemControl({
         [onChange, agentaMetadata],
     )
 
+    // The current `agenta_metadata` as an object (read for the permission control's read_only
+    // default and any legacy `permission_mode`).
+    const metadataObj = useMemo(
+        () =>
+            agentaMetadata && typeof agentaMetadata === "object" && !Array.isArray(agentaMetadata)
+                ? (agentaMetadata as Record<string, unknown>)
+                : undefined,
+        [agentaMetadata],
+    )
+
     const {
         toolObj,
         editorText,
@@ -507,6 +609,47 @@ export const ToolItemControl = memo(function ToolItemControl({
         onEditorChange,
     } = useToolState(cleanedValue, isReadOnly, handleChange)
 
+    // The current top-level `permission` on the tool (canonical store, survives metadata strip).
+    // Read off the LIVE editor object (`toolObj`), not the `cleanedValue` prop: the JSON editor
+    // mutates `toolObj` inside `useToolState` and only propagates to the parent (and back into
+    // `value`/`cleanedValue`) asynchronously, so the prop lags in-flight edits.
+    const topLevelPermission = useMemo(
+        () =>
+            toolObj && typeof toolObj === "object" && !Array.isArray(toolObj)
+                ? (toolObj as Record<string, unknown>).permission
+                : undefined,
+        [toolObj],
+    )
+
+    // Set (or clear) the TOP-LEVEL `permission` key on this tool, leaving the rest of the tool
+    // definition intact. It lives at the top level (not inside `agenta_metadata`) so it survives
+    // `stripAgentaMetadataDeep` on the save/run path. Clearing removes the key (and any legacy
+    // `agenta_metadata.permission_mode`) so the tool falls back to the global policy.
+    //
+    // Compose off the LIVE editor object (`toolObj`), the same source `handleChange` propagates
+    // through `useToolState`, so a permission change merges with in-flight JSON edits instead of
+    // clobbering them with the stale `cleanedValue` prop.
+    const handlePermissionChange = useCallback(
+        (permission: "allow" | "ask" | "deny" | null) => {
+            if (!onChange) return
+            const base = (toolObj && typeof toolObj === "object" ? toolObj : {}) as Record<
+                string,
+                unknown
+            >
+            const {permission: _dropPermission, ...restBase} = base
+            // Drop any legacy `permission_mode` stored inside `agenta_metadata`; the top-level key
+            // is now canonical.
+            const {permission_mode: _dropLegacy, ...restMeta} = metadataObj ?? {}
+            const withMeta =
+                Object.keys(restMeta).length > 0
+                    ? {...restBase, agenta_metadata: restMeta}
+                    : restBase
+            const merged = permission ? {...withMeta, permission} : withMeta
+            onChange(merged as ToolObj)
+        },
+        [onChange, metadataObj, toolObj],
+    )
+
     const functionName =
         (toolObj as Record<string, unknown>)?.function &&
         typeof (toolObj as Record<string, unknown>).function === "object"
@@ -655,12 +798,20 @@ export const ToolItemControl = memo(function ToolItemControl({
                     containerRef={containerRef}
                 />
                 {!minimized && (
-                    <textarea
-                        className="font-mono text-xs p-2 border rounded min-h-[120px] resize-y w-full"
-                        value={editorText}
-                        onChange={(e) => onEditorChange(e.target.value)}
-                        readOnly={isReadOnly}
-                    />
+                    <>
+                        <textarea
+                            className="font-mono text-xs p-2 border rounded min-h-[120px] resize-y w-full"
+                            value={editorText}
+                            onChange={(e) => onEditorChange(e.target.value)}
+                            readOnly={isReadOnly}
+                        />
+                        <ToolPermissionControl
+                            permission={topLevelPermission}
+                            metadata={metadataObj}
+                            onChange={handlePermissionChange}
+                            disabled={isReadOnly}
+                        />
+                    </>
                 )}
             </div>
         )
@@ -727,6 +878,14 @@ export const ToolItemControl = memo(function ToolItemControl({
                     />
                 }
             />
+            {!minimized && (
+                <ToolPermissionControl
+                    permission={topLevelPermission}
+                    metadata={metadataObj}
+                    onChange={handlePermissionChange}
+                    disabled={isReadOnly}
+                />
+            )}
         </div>
     )
 })