diff --git a/sdks/python/agenta/sdk/agents/connections/errors.py b/sdks/python/agenta/sdk/agents/connections/errors.py index becd44d2c9..2d01f0888e 100644 --- a/sdks/python/agenta/sdk/agents/connections/errors.py +++ b/sdks/python/agenta/sdk/agents/connections/errors.py @@ -46,12 +46,16 @@ class MissingProviderError(ConnectionResolutionError): NOT a tolerated self-managed/OAuth fallback case: the config itself is underspecified. """ - def __init__(self, *, model: str) -> None: + def __init__(self, *, model: str, hint_provider: str = "openai") -> None: + # The example provider in the hint is harness-appropriate: a Claude harness must read + # "anthropic/", never "openai/" (Claude reaches Anthropic only). The + # caller passes the harness's default provider so the suggested prefix is reachable. super().__init__( - f"model '{model}' needs a provider prefix (e.g. 'openai/{model}') " + f"model '{model}' needs a provider prefix (e.g. '{hint_provider}/{model}') " "or a structured {provider, model}; a bare model id can't resolve a credential" ) self.model = model + self.hint_provider = hint_provider class AmbiguousConnectionError(ConnectionResolutionError): diff --git a/sdks/python/agenta/sdk/agents/platform/connections.py b/sdks/python/agenta/sdk/agents/platform/connections.py index cf175a4752..73cff4334f 100644 --- a/sdks/python/agenta/sdk/agents/platform/connections.py +++ b/sdks/python/agenta/sdk/agents/platform/connections.py @@ -18,6 +18,10 @@ from agenta.sdk.utils.logging import get_module_logger +from ..capabilities import ( + CLAUDE_MODEL_ALIASES, + HARNESS_CONNECTION_CAPABILITIES, +) from ..connections import ( AmbiguousConnectionError, ConnectionNotFoundError, @@ -46,6 +50,47 @@ "openrouter": "OPENROUTER_API_KEY", } +# The Claude harness selects a model by a bare alias (``haiku``/``sonnet``/``opus`` + ``[1m]``) +# or by a dated id (``claude-opus-4-8``), never with a ``provider/`` prefix. Those bare ids are +# unambiguously Anthropic, so the F-017 "needs a provider prefix" rule must not reject them: a +# bare alias resolves to ``anthropic`` here before the fail-loud check. The canonical alias set +# lives in ``capabilities.py`` (the ``/inspect`` surface) so the two never drift. +_CLAUDE_ALIASES: Set[str] = {alias.lower() for alias in CLAUDE_MODEL_ALIASES} + + +def _inferred_claude_provider(model: ModelRef) -> Optional[str]: + """Return ``"anthropic"`` when a bare model id is a known Claude alias, else ``None``. + + Only fires for a bare id (no explicit ``provider``). A known alias (the ``capabilities.py`` + set) or any ``claude-*`` dated id (the Anthropic naming convention) is unambiguously + Anthropic, so it resolves to the ``anthropic`` provider rather than failing loud as a + provider-less model. + """ + if model.provider: + return None + bare = (model.model or "").strip().lower() + if not bare: + return None + if bare in _CLAUDE_ALIASES or bare.startswith("claude-"): + return "anthropic" + return None + + +def _harness_default_provider(harness: Optional[str]) -> str: + """The provider to suggest in a missing-provider hint for ``harness``. + + Claude reaches Anthropic only, so its hint must read ``anthropic/``; every other + harness defaults to ``openai`` (the existing hint). Derived from the capability table's + provider list so a harness's reachable providers stay the single source of truth. + """ + caps = HARNESS_CONNECTION_CAPABILITIES.get(harness or "") + if caps and caps.providers: + if "openai" in caps.providers: + return "openai" + return caps.providers[0] + return "openai" + + # Extras keys the current UI stores on custom_provider secrets, normalized to harness env. _SNAKE_EXTRA_ENV_ALIASES: Dict[str, str] = { "aws_region_name": "AWS_REGION", @@ -299,14 +344,19 @@ def _candidate_pool( def _choose_default( - candidates: Sequence[_ConnectionCandidate], model: ModelRef + candidates: Sequence[_ConnectionCandidate], + model: ModelRef, + harness: Optional[str] = None, ) -> _ConnectionCandidate: pool = _candidate_pool(candidates, model) if not pool and not model.provider: # A bare model id (no provider prefix) matched nothing by model id, so there is no # provider to look a credential up against. Fail loud with an actionable message rather # than degrade to no-credential and surface later as a misleading "add your key" error. - raise MissingProviderError(model=model.model) + # The hint names the harness-reachable provider (anthropic for Claude, not openai). + raise MissingProviderError( + model=model.model, hint_provider=_harness_default_provider(harness) + ) if len(pool) == 1: return pool[0] default_named = [candidate for candidate in pool if candidate.slug == "default"] @@ -350,9 +400,15 @@ def _choose_named( def _resolve_from_secrets( - *, secrets: Sequence[Any], model: ModelRef + *, secrets: Sequence[Any], model: ModelRef, harness: Optional[str] = None ) -> ResolvedConnection: connection = model.connection + # A bare Claude alias (haiku/sonnet/opus + [1m]) or a dated claude-* id is unambiguously + # Anthropic: infer the provider so the F-017 fail-loud rule does not reject a documented + # Claude model id. Inference only fills a missing provider; an explicit provider is honored. + inferred = _inferred_claude_provider(model) + if inferred: + model = model.model_copy(update={"provider": inferred}) if connection.mode == "self_managed": return ResolvedConnection( provider=model.provider or "", @@ -368,7 +424,7 @@ def _resolve_from_secrets( chosen = ( _choose_named(candidates, model, slug) if slug - else _choose_default(candidates, model) + else _choose_default(candidates, model, harness) ) provider = chosen.resolved_provider(model) env = chosen.resolved_env(provider) @@ -434,7 +490,7 @@ async def resolve( data = response.json() or [] if not isinstance(data, list): raise ConnectionResolutionError("connection resolution returned a non-list") - return _resolve_from_secrets(secrets=data, model=model) + return _resolve_from_secrets(secrets=data, model=model, harness=context.harness) class _StaticSecretsResolver: @@ -447,4 +503,6 @@ async def resolve( model: ModelRef, context: RuntimeAuthContext, ) -> ResolvedConnection: - return _resolve_from_secrets(secrets=self._secrets, model=model) + return _resolve_from_secrets( + secrets=self._secrets, model=model, harness=context.harness + ) diff --git a/sdks/python/agenta/sdk/agents/utils/ts_runner.py b/sdks/python/agenta/sdk/agents/utils/ts_runner.py index 590f47cd1c..92599a8108 100644 --- a/sdks/python/agenta/sdk/agents/utils/ts_runner.py +++ b/sdks/python/agenta/sdk/agents/utils/ts_runner.py @@ -11,8 +11,24 @@ import os from typing import Any, AsyncIterator, Dict, Optional, Sequence +from agenta.sdk.utils.logging import get_module_logger + _DEFAULT_TIMEOUT = float(os.getenv("AGENTA_AGENT_RUNNER_TIMEOUT_SECONDS", "180")) +log = get_module_logger(__name__) + + +def _transport_error(user_message: str, *, detail: str) -> RuntimeError: + """A transport RuntimeError whose surfaced text is clean; the raw detail is logged only. + + The HTTP body / process stderr / raw stdout that pinpoints a transport failure is internal: + log it for diagnosis but keep it out of the caller/UI-facing error, which gets only the + short ``user_message``. Mirrors ``wire.sanitize_runner_error`` for the result boundary. + """ + if detail: + log.warning("agent: %s | detail: %s", user_message, detail) + return RuntimeError(user_message) + async def deliver_http( base_url: str, @@ -27,10 +43,12 @@ async def deliver_http( async with httpx.AsyncClient(timeout=timeout) as client: response = await client.post(url, json=payload) # Any non-2xx is a transport failure; 4xx left to fall through surfaces as an opaque - # JSON parse error instead of a clear runner failure. + # JSON parse error instead of a clear runner failure. The response body can carry internal + # detail, so it is logged, not surfaced. if response.status_code >= 400: - raise RuntimeError( - f"Agent runner HTTP {response.status_code}: {response.text[:1000]}" + raise _transport_error( + f"Agent runner HTTP {response.status_code}", + detail=response.text[:1000], ) return response.json() @@ -67,14 +85,16 @@ async def deliver_subprocess( out = stdout.decode("utf-8", "replace") err = stderr.decode("utf-8", "replace") if not out.strip(): - raise RuntimeError( - f"Agent runner returned no output. exit={proc.returncode} stderr={err[-2000:]}" + raise _transport_error( + "Agent runner returned no output", + detail=f"exit={proc.returncode} stderr={err[-2000:]}", ) try: return json.loads(out) except json.JSONDecodeError as exc: - raise RuntimeError( - f"Agent runner returned invalid JSON. stdout={out[:500]} stderr={err[-1000:]}" + raise _transport_error( + "Agent runner returned invalid JSON", + detail=f"stdout={out[:500]} stderr={err[-1000:]}", ) from exc @@ -110,8 +130,9 @@ async def deliver_http_stream( ) as response: if response.status_code >= 400: body = await response.aread() - raise RuntimeError( - f"Agent runner HTTP {response.status_code}: {body[:1000]!r}" + raise _transport_error( + f"Agent runner HTTP {response.status_code}", + detail=repr(body[:1000]), ) async for line in response.aiter_lines(): line = line.strip() @@ -175,9 +196,10 @@ async def deliver_subprocess_stream( err = b"" if proc.stderr is not None: err = await proc.stderr.read() - raise RuntimeError( - "Agent runner stream ended without a terminal result record. " - f"exit={proc.returncode} stderr={err.decode('utf-8', 'replace')[-2000:]}" + raise _transport_error( + "Agent runner stream ended without a terminal result record", + detail=f"exit={proc.returncode} " + f"stderr={err.decode('utf-8', 'replace')[-2000:]}", ) finally: if proc.returncode is None: diff --git a/sdks/python/agenta/sdk/agents/utils/wire.py b/sdks/python/agenta/sdk/agents/utils/wire.py index cee5c67b53..c78f570bc1 100644 --- a/sdks/python/agenta/sdk/agents/utils/wire.py +++ b/sdks/python/agenta/sdk/agents/utils/wire.py @@ -17,8 +17,11 @@ from __future__ import annotations +import re from typing import Any, Dict, List, Optional, Sequence +from agenta.sdk.utils.logging import get_module_logger + from ..dtos import ( AgentEvent, AgentResult, @@ -29,6 +32,38 @@ TraceContext, ) +log = get_module_logger(__name__) + +# The user-facing error must not carry an internal stack/path dump. Cap the surfaced line and +# strip the patterns that leak implementation detail; the full text is logged, never shown. +_ERROR_MAX_LEN = 300 +# A stack frame leaked into the message ("at fn (/abs/path:12:3)" / 'File "/abs/path", line 12'). +_STACK_FRAME_RE = re.compile(r"\b(at\s+\S+\s*\(|File\s+\"|/[\w./-]+:\d+)") + + +def sanitize_runner_error(error: Any) -> str: + """Reduce a runner ``error`` to one clean user-facing line, logging the full detail. + + The runner already concise-formats its known auth/credit failures, but the fall-through case + returns the raw first line of an SDK/JS error, and the transport errors (HTTP/stderr/stdout + dumps) carry internal text. This is the single boundary that reaches the caller/UI, so it + keeps the actionable message, drops stack-frame and path noise, caps the length, and logs the + untruncated original for the trace/logs. A clean concise message passes through unchanged. + """ + raw = "" if error is None else str(error) + if raw and ( + len(raw) > _ERROR_MAX_LEN or "\n" in raw or _STACK_FRAME_RE.search(raw) + ): + log.warning("agent: runner reported a failure: %s", raw) + # Keep only the first line; a multi-line body is a stack dump, never the message. + message = raw.split("\n", 1)[0].strip() + # If even the first line is a raw stack frame, fall back to a generic line. + if not message or _STACK_FRAME_RE.match(message): + return "agent run failed" + if len(message) > _ERROR_MAX_LEN: + message = message[: _ERROR_MAX_LEN - 1].rstrip() + "…" + return message + def request_to_wire( *, @@ -91,10 +126,13 @@ def result_from_wire(data: Dict[str, Any]) -> AgentResult: """Parse a ``/run`` result JSON into an :class:`AgentResult`. Raises ``RuntimeError`` when the runner reported a failure, so the caller surfaces a - clear message rather than handing the model an empty reply. + clear message rather than handing the model an empty reply. The runner ``error`` is + sanitized at this boundary (one clean line, no stack/path leak); the full detail is logged. """ if not data.get("ok"): - raise RuntimeError(f"Agent run failed: {data.get('error')}") + raise RuntimeError( + f"Agent run failed: {sanitize_runner_error(data.get('error'))}" + ) messages: List[Message] = [] for raw in data.get("messages") or []: diff --git a/sdks/python/oss/tests/pytest/unit/agents/platform/test_connections_http.py b/sdks/python/oss/tests/pytest/unit/agents/platform/test_connections_http.py index 02bd06e8af..bfa6e1daa1 100644 --- a/sdks/python/oss/tests/pytest/unit/agents/platform/test_connections_http.py +++ b/sdks/python/oss/tests/pytest/unit/agents/platform/test_connections_http.py @@ -128,7 +128,7 @@ async def test_bare_model_without_provider_fails_loud(fake_http, connection): # F-017: a bare model id (no `provider/` prefix) that matches no vault candidate by model # id has no provider to look a credential up against. It must fail loud with an actionable # message, not degrade silently to no-credential (which surfaced as a misleading auth error - # even when the key was present). + # even when the key was present). On a Pi harness the hint suggests an openai/ prefix. fake_http(connections, payload=[_provider_key("openai-prod", "openai", "sk-prod")]) with pytest.raises(MissingProviderError) as exc: await VaultConnectionResolver(connection).resolve( @@ -138,6 +138,56 @@ async def test_bare_model_without_provider_fails_loud(fake_http, connection): assert "openai/gpt-4o-mini" in str(exc.value) +async def test_missing_provider_hint_is_harness_correct_for_claude( + fake_http, connection +): + # F-031: the missing-provider hint must name a harness-REACHABLE provider. On a Claude + # harness (Anthropic only) an unrecognized bare id must read "anthropic/", never + # "openai/" (which Claude cannot reach). Use a non-alias bare id so it still fails loud. + fake_http( + connections, payload=[_provider_key("anthropic-prod", "anthropic", "sk-ant")] + ) + with pytest.raises(MissingProviderError) as exc: + await VaultConnectionResolver(connection).resolve( + model=ModelRef.coerce("some-unknown-model"), + context=RuntimeAuthContext(harness="claude"), + ) + message = str(exc.value) + assert "anthropic/some-unknown-model" in message + assert "openai/" not in message + + +async def test_bare_claude_alias_resolves_to_anthropic(fake_http, connection): + # F-031: a bare Claude alias (haiku/sonnet/opus + [1m]) is unambiguously Anthropic, so the + # F-017 prefix rule must NOT reject it. It resolves against the vault's anthropic key the + # same way the documented `anthropic/haiku` form does, instead of failing loud. + fake_http( + connections, payload=[_provider_key("anthropic-prod", "anthropic", "sk-ant")] + ) + for alias in ("haiku", "sonnet", "opus", "opus[1m]"): + resolved = await VaultConnectionResolver(connection).resolve( + model=ModelRef.coerce(alias), + context=RuntimeAuthContext(harness="claude"), + ) + assert resolved.provider == "anthropic", alias + assert resolved.model == alias, alias + assert resolved.env == {"ANTHROPIC_API_KEY": "sk-ant"}, alias + + +async def test_bare_claude_dated_id_resolves_to_anthropic(fake_http, connection): + # F-031: a bare dated Anthropic id (claude-opus-4-8) is also unambiguously Anthropic via the + # claude-* naming convention, so it resolves rather than failing loud on a missing prefix. + fake_http( + connections, payload=[_provider_key("anthropic-prod", "anthropic", "sk-ant")] + ) + resolved = await VaultConnectionResolver(connection).resolve( + model=ModelRef.coerce("claude-opus-4-8"), + context=RuntimeAuthContext(harness="claude"), + ) + assert resolved.provider == "anthropic" + assert resolved.model == "claude-opus-4-8" + + async def test_bare_model_matching_a_candidate_infers_the_provider( fake_http, connection ): diff --git a/sdks/python/oss/tests/pytest/unit/agents/test_wire_contract.py b/sdks/python/oss/tests/pytest/unit/agents/test_wire_contract.py index c1385940cb..0fb0d0aa1e 100644 --- a/sdks/python/oss/tests/pytest/unit/agents/test_wire_contract.py +++ b/sdks/python/oss/tests/pytest/unit/agents/test_wire_contract.py @@ -32,7 +32,11 @@ ToolCallback, TraceContext, ) -from agenta.sdk.agents.utils.wire import request_to_wire, result_from_wire +from agenta.sdk.agents.utils.wire import ( + request_to_wire, + result_from_wire, + sanitize_runner_error, +) # The full set of top-level keys ``request_to_wire`` may emit. The TS ``AgentRunRequest`` # interface must declare a superset of these. Adding a key here without adding it to @@ -361,6 +365,52 @@ def test_result_from_wire_raises_on_failure(golden): result_from_wire(golden("run_result.error.json")) +def test_sanitize_runner_error_passes_clean_message_through(): + # A concise, single-line message (what conciseError emits for known cases) is unchanged. + clean = "pi_core: model authentication failed — add the project's Anthropic key." + assert sanitize_runner_error(clean) == clean + + +def test_sanitize_runner_error_strips_multiline_stack_to_first_line(): + raw = ( + "TypeError: cannot read x\n" + " at run (/app/services/agent/src/engine.ts:12:3)\n" + " at process (/app/node_modules/foo.js:99:1)" + ) + # Only the first line survives; the stack frames never reach the caller. + assert sanitize_runner_error(raw) == "TypeError: cannot read x" + + +def test_sanitize_runner_error_falls_back_when_first_line_is_a_stack_frame(): + raw = 'File "/abs/secret/path.py", line 12, in run\n raise ValueError("boom")' + assert sanitize_runner_error(raw) == "agent run failed" + + +def test_sanitize_runner_error_caps_length(): + raw = "x" * 1000 + result = sanitize_runner_error(raw) + assert len(result) <= 300 + assert result.endswith("…") + + +def test_sanitize_runner_error_handles_none_and_empty(): + assert sanitize_runner_error(None) == "agent run failed" + assert sanitize_runner_error("") == "agent run failed" + + +def test_result_from_wire_sanitizes_a_leaky_error(): + leaky = { + "ok": False, + "error": "boom\n at run (/app/src/engine.ts:1:1)", + } + with pytest.raises(RuntimeError) as exc: + result_from_wire(leaky) + message = str(exc.value) + assert "boom" in message + assert "/app/src/engine.ts" not in message + assert "\n" not in message + + def test_result_from_wire_minimal_ok(): # A bare success: empty output, empty collections, no capabilities. result = result_from_wire({"ok": True}) diff --git a/services/agent/src/engines/sandbox_agent.ts b/services/agent/src/engines/sandbox_agent.ts index 311c650ef6..5d0c9b844c 100644 --- a/services/agent/src/engines/sandbox_agent.ts +++ b/services/agent/src/engines/sandbox_agent.ts @@ -190,6 +190,10 @@ export async function runSandboxAgent( ? buildPiExtensionEnv(request, !plan.isDaytona, { relayDir: plan.relayDir, usageOutPath: plan.usageOutPath, + // The materialized skill names (author + forced `_agenta.*`) so Pi's own agent span + // records which skills loaded (F-029); local Pi self-instruments, so the runner's + // sandbox-agent otel has no span to stamp here. + skills: plan.skillDirs.map((s) => s.name), }) : {}; Object.assign(env, piExtEnv); // local daemon inherits it; daytona gets it via envVars @@ -293,6 +297,9 @@ export async function runSandboxAgent( isPi: plan.isPi, capabilities, harness: plan.harness, + // Daytona: skip the internal loopback HTTP MCP channel (unreachable from the in-sandbox + // harness); gateway tools are delivered through the Daytona file relay started below. + isDaytona: plan.isDaytona, toolSpecs: plan.toolSpecs, userMcpServers: request.mcpServers, relayDir: plan.relayDir, @@ -321,6 +328,9 @@ export async function runSandboxAgent( const run = (deps.createOtel ?? createSandboxAgentOtel)({ harness: plan.harness, model, + // The names of every skill that materialized for this run (author + forced `_agenta.*`), + // stamped on the agent span so the trace shows which skills loaded (F-029). + skills: plan.skillDirs.map((s) => s.name), traceparent: request.trace?.traceparent, baggage: request.trace?.baggage, endpoint: request.trace?.endpoint, @@ -400,34 +410,38 @@ export async function runSandboxAgent( }); run.setUsage(usage); - const output = run.finish(); - await run.flush(); - - // Fail loud on a swallowed model error (A7 / "fail loud, not silent"). When Pi's provider - // call fails (out-of-quota, bad key, rate limit, unknown model, ...), Pi's pi-acp bridge - // reports the turn as a plain `end_turn` with NO content, so without this the run would - // return an `ok:true` empty turn and the user would see a silent "No response" instead of - // the real failure. On the LOCAL Pi path the error is recoverable from Pi's own session - // transcript; surface it as a run error. Only checked when the turn produced no output and - // ran no tools (a real tool-only turn legitimately has empty text), and never on Daytona - // (the transcript lives in the remote sandbox). - if ( + // Peek for a swallowed model error BEFORE finishing the trace so the error message + provider + // can be stamped on the still-open agent span (F-030). When Pi's provider call fails + // (out-of-quota, bad key, rate limit, unknown model, ...), Pi's pi-acp bridge reports the + // turn as a plain `end_turn` with NO content, so without this the run would return an + // `ok:true` empty turn and the user would see a silent "No response" instead of the real + // failure. On the LOCAL Pi path the error is recoverable from Pi's own session transcript. + // Only checked when the turn produced no output and ran no tools (a real tool-only turn + // legitimately has empty text), and never on Daytona (the transcript lives in the remote + // sandbox). + const swallowedPiError = plan.isPi && !plan.isDaytona && - !output.trim() && + !run.output().trim() && !run.events().some((e) => e.type === "tool_call") - ) { - const piError = findSwallowedPiError(plan.sourcePiAgentDir, plan.cwd); - if (piError) { - return { - ok: false, - error: conciseError( - new Error(piError), - plan.harness, - request.provider, - ), - }; - } + ? findSwallowedPiError(plan.sourcePiAgentDir, plan.cwd) + : undefined; + let swallowedError: string | undefined; + if (swallowedPiError) { + swallowedError = conciseError( + new Error(swallowedPiError), + plan.harness, + request.provider, + ); + run.recordError(swallowedError, request.provider); + } + + const output = run.finish(); + await run.flush(); + + // Fail loud on the swallowed error detected above (A7 / "fail loud, not silent"). + if (swallowedError) { + return { ok: false, error: swallowedError }; } return { @@ -450,11 +464,15 @@ export async function runSandboxAgent( traceId: run.traceId(), }; } catch (err) { + const error = conciseError(err, plan.harness, request.provider); + // Stamp the error message + provider on the agent span before finishing it (F-030), so a + // trace carries the same diagnostic the response does (it previously held only a count). + otel?.recordError(error, request.provider); otel?.finish(); await otel?.flush().catch(() => {}); return { ok: false, - error: conciseError(err, plan.harness, request.provider), + error, }; } finally { await toolRelay?.stop().catch(() => {}); diff --git a/services/agent/src/engines/sandbox_agent/pi-assets.ts b/services/agent/src/engines/sandbox_agent/pi-assets.ts index 8c6b975040..50856056bc 100644 --- a/services/agent/src/engines/sandbox_agent/pi-assets.ts +++ b/services/agent/src/engines/sandbox_agent/pi-assets.ts @@ -33,7 +33,7 @@ export const EXTENSION_BUNDLE = export function buildPiExtensionEnv( request: AgentRunRequest, tracing: boolean, - opts: { relayDir?: string; usageOutPath?: string } = {}, + opts: { relayDir?: string; usageOutPath?: string; skills?: string[] } = {}, ): Record { const env: Record = {}; const trace = tracing ? request.trace : undefined; @@ -42,6 +42,11 @@ export function buildPiExtensionEnv( if (trace?.authorization) env.AGENTA_OTLP_AUTHORIZATION = trace.authorization; if (trace && trace.captureContent === false) env.AGENTA_CAPTURE_CONTENT = "false"; + // The skills that materialized for this run (author + forced `_agenta.*`), so Pi's own agent + // span records which skills loaded (F-029). Only set under tracing (the extension's only span + // consumer); a JSON array string the extension parses. + if (trace && opts.skills && opts.skills.length > 0) + env.AGENTA_SKILLS_LOADED = JSON.stringify(opts.skills); const specs = publicToolSpecs( (request.customTools as ResolvedToolSpec[]) ?? [], diff --git a/services/agent/src/extensions/agenta.ts b/services/agent/src/extensions/agenta.ts index d44810db79..2ec51ebf7d 100644 --- a/services/agent/src/extensions/agenta.ts +++ b/services/agent/src/extensions/agenta.ts @@ -35,6 +35,19 @@ function log(message: string): void { process.stderr.write(`[agenta-pi-ext] ${message}\n`); } +/** Parse the JSON array of loaded skill names from AGENTA_SKILLS_LOADED; [] on absent/malformed. */ +function parseSkillsLoaded(raw: string | undefined): string[] { + if (!raw) return []; + try { + const parsed = JSON.parse(raw); + return Array.isArray(parsed) + ? parsed.filter((s): s is string => typeof s === "string") + : []; + } catch { + return []; + } +} + /** Register public tool metadata as Pi tools whose execution relays to the runner. */ function registerTools(pi: ExtensionAPI): void { const raw = process.env.AGENTA_TOOL_PUBLIC_SPECS; @@ -78,8 +91,12 @@ function registerTools(pi: ExtensionAPI): void { const factory = (pi: ExtensionAPI): void => { // Fully inert unless Agenta wired this run (so it is safe to install globally in a // shared Pi agent dir — a normal `pi` session with no Agenta env does nothing). - const hasTracing = !!(process.env.AGENTA_TRACEPARENT || process.env.AGENTA_OTLP_ENDPOINT); - const hasTools = !!(process.env.AGENTA_TOOL_PUBLIC_SPECS && process.env.AGENTA_TOOL_RELAY_DIR); + const hasTracing = !!( + process.env.AGENTA_TRACEPARENT || process.env.AGENTA_OTLP_ENDPOINT + ); + const hasTools = !!( + process.env.AGENTA_TOOL_PUBLIC_SPECS && process.env.AGENTA_TOOL_RELAY_DIR + ); const usageOut = process.env.AGENTA_USAGE_OUT; if (!hasTracing && !hasTools && !usageOut) return; @@ -96,6 +113,9 @@ const factory = (pi: ExtensionAPI): void => { endpoint: process.env.AGENTA_OTLP_ENDPOINT, authorization: process.env.AGENTA_OTLP_AUTHORIZATION, captureContent: process.env.AGENTA_CAPTURE_CONTENT !== "false", + // The skills that loaded for this run (author + forced `_agenta.*`), stamped on the agent + // span so a trace shows which skills surfaced (F-029). A JSON array string from the runner. + skills: parseSkillsLoaded(process.env.AGENTA_SKILLS_LOADED), }); otel.register(pi); // lifecycle handlers (spans + usage accumulation) diff --git a/services/agent/src/tracing/otel.ts b/services/agent/src/tracing/otel.ts index eb4acbe012..589eca2da8 100644 --- a/services/agent/src/tracing/otel.ts +++ b/services/agent/src/tracing/otel.ts @@ -198,7 +198,8 @@ function orderParentFirst(spans: ReadableSpan[]): ReadableSpan[] { const ordered: ReadableSpan[] = []; const visit = (s: ReadableSpan) => { ordered.push(s); - for (const child of childrenOf.get(s.spanContext().spanId) ?? []) visit(child); + for (const child of childrenOf.get(s.spanContext().spanId) ?? []) + visit(child); }; roots.forEach(visit); // Any spans not reached (defensive) get appended so nothing is dropped. @@ -221,7 +222,8 @@ function parentContext(traceparent?: string): Context | undefined { traceId, spanId, // Honor the incoming sampled bit; default to sampled so child spans record. - traceFlags: (parseInt(flags, 16) & 1) === 1 ? TraceFlags.SAMPLED : TraceFlags.NONE, + traceFlags: + (parseInt(flags, 16) & 1) === 1 ? TraceFlags.SAMPLED : TraceFlags.NONE, isRemote: true, }; return trace.setSpanContext(ROOT_CONTEXT, spanContext); @@ -249,6 +251,12 @@ export interface RunConfig { provider?: string; /** Resolved model id, set after the model is picked. */ requestModel?: string; + /** + * Skill names materialized for this run (author + forced `_agenta.*`), stamped on the agent + * span so a trace shows which skills loaded (F-029). Set on the local-Pi path, where Pi's own + * extension owns the agent span (the runner's sandbox-agent otel is span-less there). + */ + skills?: string[]; /** Filled by the extension on agent_start so the runner can flush/return it. */ traceId?: string; } @@ -354,7 +362,9 @@ function applyAssistant(span: Span, msg: any, capture: boolean): void { span.setAttribute("gen_ai.response.model", msg.responseModel ?? msg.model); if (msg.responseId) span.setAttribute("gen_ai.response.id", msg.responseId); if (msg.stopReason) - span.setAttribute("gen_ai.response.finish_reasons", [String(msg.stopReason)]); + span.setAttribute("gen_ai.response.finish_reasons", [ + String(msg.stopReason), + ]); const u = msg.usage; if (u) { @@ -372,8 +382,12 @@ function applyAssistant(span: Span, msg: any, capture: boolean): void { if (u.cacheRead) span.setAttribute("gen_ai.usage.cache_read_input_tokens", u.cacheRead); if (u.cacheWrite) - span.setAttribute("gen_ai.usage.cache_creation_input_tokens", u.cacheWrite); - if (u.cost?.total != null) span.setAttribute("gen_ai.usage.cost", u.cost.total); + span.setAttribute( + "gen_ai.usage.cache_creation_input_tokens", + u.cacheWrite, + ); + if (u.cost?.total != null) + span.setAttribute("gen_ai.usage.cost", u.cost.total); } emitMessages(span, "llm.output_messages", [msg], capture); @@ -415,6 +429,7 @@ export function createAgentaOtel( sessionId: init.sessionId, provider: init.provider, requestModel: init.requestModel, + skills: init.skills, }; const tracer = trace.getTracer("agenta-pi-otel", "0.1.0"); @@ -457,11 +472,22 @@ export function createAgentaOtel( agentSpan.setAttribute("openinference.span.kind", "AGENT"); agentSpan.setAttribute("gen_ai.operation.name", "invoke_agent"); agentSpan.setAttribute("gen_ai.agent.name", "pi"); + // F-029: record which skills loaded (author + forced `_agenta.*`) on Pi's own agent span, + // so a local-Pi trace shows the surfaced skills, not just the author config echoed + // elsewhere. The set is passed from the runner via AGENTA_SKILLS_LOADED. + if (config.skills && config.skills.length > 0) { + agentSpan.setAttribute("ag.agent.skills.loaded", config.skills); + agentSpan.setAttribute("ag.agent.skills.count", config.skills.length); + } if (config.sessionId) { agentSpan.setAttribute("session.id", config.sessionId); agentSpan.setAttribute("gen_ai.conversation.id", config.sessionId); } - setInputs(agentSpan, { prompt: pendingPrompt ?? "" }, config.captureContent); + setInputs( + agentSpan, + { prompt: pendingPrompt ?? "" }, + config.captureContent, + ); const traceId = agentSpan.spanContext().traceId; config.traceId = traceId; @@ -479,24 +505,39 @@ export function createAgentaOtel( pi.on("turn_start", async (event: any) => { const parent = agentCtx ?? context.active(); - const name = event?.turnIndex != null ? `turn ${event.turnIndex}` : "turn"; + const name = + event?.turnIndex != null ? `turn ${event.turnIndex}` : "turn"; const span = tracer.startSpan(name, undefined, parent); span.setAttribute("openinference.span.kind", "CHAIN"); - if (event?.turnIndex != null) span.setAttribute("pi.turn.index", event.turnIndex); - currentTurn = { span, ctx: trace.setSpan(parent, span), index: event?.turnIndex }; + if (event?.turnIndex != null) + span.setAttribute("pi.turn.index", event.turnIndex); + currentTurn = { + span, + ctx: trace.setSpan(parent, span), + index: event?.turnIndex, + }; }); pi.on("before_provider_request", async (_event: any, ctx: any) => { const parent = currentTurn?.ctx ?? agentCtx ?? context.active(); const modelId = config.requestModel ?? ctx?.model?.id; const providerName = config.provider ?? ctx?.model?.provider; - llmSpan = tracer.startSpan(modelId ? `chat ${modelId}` : "chat", undefined, parent); + llmSpan = tracer.startSpan( + modelId ? `chat ${modelId}` : "chat", + undefined, + parent, + ); llmSpan.setAttribute("openinference.span.kind", "LLM"); llmSpan.setAttribute("gen_ai.operation.name", "chat"); if (providerName) llmSpan.setAttribute("gen_ai.system", providerName); if (modelId) llmSpan.setAttribute("gen_ai.request.model", modelId); if (lastContextMessages) - emitMessages(llmSpan, "llm.input_messages", lastContextMessages, config.captureContent); + emitMessages( + llmSpan, + "llm.input_messages", + lastContextMessages, + config.captureContent, + ); }); pi.on("message_end", async (event: any) => { @@ -510,18 +551,28 @@ export function createAgentaOtel( pi.on("tool_execution_start", async (event: any) => { const parent = currentTurn?.ctx ?? agentCtx ?? context.active(); - const name = event?.toolName ? `execute_tool ${event.toolName}` : "execute_tool"; + const name = event?.toolName + ? `execute_tool ${event.toolName}` + : "execute_tool"; const span = tracer.startSpan(name, undefined, parent); span.setAttribute("openinference.span.kind", "TOOL"); span.setAttribute("gen_ai.operation.name", "execute_tool"); - if (event?.toolName) span.setAttribute("gen_ai.tool.name", event.toolName); - if (event?.toolCallId) span.setAttribute("gen_ai.tool.call.id", event.toolCallId); - setInputs(span, (event?.args as Record) ?? {}, config.captureContent); + if (event?.toolName) + span.setAttribute("gen_ai.tool.name", event.toolName); + if (event?.toolCallId) + span.setAttribute("gen_ai.tool.call.id", event.toolCallId); + setInputs( + span, + (event?.args as Record) ?? {}, + config.captureContent, + ); if (event?.toolCallId) toolSpans.set(event.toolCallId, span); }); pi.on("tool_execution_end", async (event: any) => { - const span = event?.toolCallId ? toolSpans.get(event.toolCallId) : undefined; + const span = event?.toolCallId + ? toolSpans.get(event.toolCallId) + : undefined; if (!span) return; setOutput(span, toolResultText(event?.result), config.captureContent); if (event?.isError) span.setStatus({ code: SpanStatusCode.ERROR }); @@ -546,16 +597,24 @@ export function createAgentaOtel( pi.on("agent_end", async (event: any) => { if (!agentSpan) return; - setOutput(agentSpan, lastAssistantText(event?.messages), config.captureContent); + setOutput( + agentSpan, + lastAssistantText(event?.messages), + config.captureContent, + ); // Stamp the run total on the agent span so it shows the agent's tokens/cost even // though Agenta cannot roll the per-turn LLM spans up across batches. if (runUsage.total > 0) { agentSpan.setAttribute("gen_ai.usage.input_tokens", runUsage.input); agentSpan.setAttribute("gen_ai.usage.output_tokens", runUsage.output); agentSpan.setAttribute("gen_ai.usage.prompt_tokens", runUsage.input); - agentSpan.setAttribute("gen_ai.usage.completion_tokens", runUsage.output); + agentSpan.setAttribute( + "gen_ai.usage.completion_tokens", + runUsage.output, + ); agentSpan.setAttribute("gen_ai.usage.total_tokens", runUsage.total); - if (runUsage.cost > 0) agentSpan.setAttribute("gen_ai.usage.cost", runUsage.cost); + if (runUsage.cost > 0) + agentSpan.setAttribute("gen_ai.usage.cost", runUsage.cost); } agentSpan.end(); agentSpan = undefined; @@ -593,7 +652,8 @@ export function createAgentaOtel( function acpBlockText(block: any): string { if (!block) return ""; if (typeof block === "string") return block; - if (block.type === "text" && typeof block.text === "string") return block.text; + if (block.type === "text" && typeof block.text === "string") + return block.text; return ""; } @@ -726,6 +786,12 @@ export interface SandboxAgentOtelInit extends Partial { harness?: string; /** Resolved model id ("openai-codex/gpt-5.5"); set on the LLM span. */ model?: string; + /** + * Skill names actually materialized for this run — BOTH the author-supplied skills and the + * forced Agenta platform `_agenta.*` skills the server injected. Stamped on the agent span so + * a trace shows which skills loaded (F-029), not just the author config echoed elsewhere. + */ + skills?: string[]; /** * Emit the span tree from the ACP event stream. Default true. Set false when the * harness instruments itself (e.g. Pi via the agenta extension propagates the trace @@ -756,6 +822,12 @@ export interface SandboxAgentOtel { emitEvent(event: AgentEvent): void; /** End all open spans. Returns the accumulated assistant text. */ finish(): string; + /** + * Record a run-level error on the agent span: the user-facing message (F-030) plus the + * provider that failed, and an OTel exception event, so a trace carries the same diagnostic + * the HTTP response does (it previously showed only an error COUNT). Call before finish/flush. + */ + recordError(message: string, provider?: string): void; /** Set final run usage before finish/flush so events and exported spans carry final totals. */ setUsage(usage: AgentUsage | undefined): void; /** Flush this run's trace to Agenta (invoke_agent has a remote parent). */ @@ -774,7 +846,9 @@ export interface SandboxAgentOtel { * Build an ACP-event-driven tracer scoped to a single sandbox-agent run. Call `start` once, * `handleUpdate` for every ACP session update, then `finish` + `await flush`. */ -export function createSandboxAgentOtel(init: SandboxAgentOtelInit): SandboxAgentOtel { +export function createSandboxAgentOtel( + init: SandboxAgentOtelInit, +): SandboxAgentOtel { ensureProvider(); const capture = init.captureContent !== false; @@ -918,10 +992,16 @@ export function createSandboxAgentOtel(init: SandboxAgentOtelInit): SandboxAgent record({ type: "reasoning_start", id: reasoningBlockId }); } record({ type: "reasoning_delta", id: reasoningBlockId, delta }); - reasoningEmitted = target.startsWith(reasoningEmitted) ? target : reasoningEmitted + delta; + reasoningEmitted = target.startsWith(reasoningEmitted) + ? target + : reasoningEmitted + delta; } - function start(input: { prompt?: string; messages?: any[]; sessionId?: string }): void { + function start(input: { + prompt?: string; + messages?: any[]; + sessionId?: string; + }): void { // Span-less mode (harness self-instruments): only track the trace id so the run can // report it; the harness emits the spans under the propagated parent. if (!emitSpans) { @@ -934,6 +1014,12 @@ export function createSandboxAgentOtel(init: SandboxAgentOtelInit): SandboxAgent agentSpan.setAttribute("openinference.span.kind", "AGENT"); agentSpan.setAttribute("gen_ai.operation.name", "invoke_agent"); agentSpan.setAttribute("gen_ai.agent.name", init.harness ?? "agent"); + // F-029: stamp the skills that actually materialized (author + forced `_agenta.*`) so a + // trace shows which skills loaded, not just the author config echoed on the workflow span. + if (init.skills && init.skills.length > 0) { + agentSpan.setAttribute("ag.agent.skills.loaded", init.skills); + agentSpan.setAttribute("ag.agent.skills.count", init.skills.length); + } const sessionId = input.sessionId ?? init.sessionId; if (sessionId) { agentSpan.setAttribute("session.id", sessionId); @@ -950,7 +1036,11 @@ export function createSandboxAgentOtel(init: SandboxAgentOtelInit): SandboxAgent turnSpan.setAttribute("pi.turn.index", 0); turnCtx = trace.setSpan(agentCtx, turnSpan); - llmSpan = tracer.startSpan(modelId ? `chat ${modelId}` : "chat", undefined, turnCtx); + llmSpan = tracer.startSpan( + modelId ? `chat ${modelId}` : "chat", + undefined, + turnCtx, + ); llmSpan.setAttribute("openinference.span.kind", "LLM"); llmSpan.setAttribute("gen_ai.operation.name", "chat"); if (provider) llmSpan.setAttribute("gen_ai.system", provider); @@ -1009,7 +1099,12 @@ export function createSandboxAgentOtel(init: SandboxAgentOtelInit): SandboxAgent setInputs(span, update.rawInput as Record, capture); } toolSpans.set(id, { span, name: String(name) }); - record({ type: "tool_call", id: String(id), name: String(name), input: update.rawInput }); + record({ + type: "tool_call", + id: String(id), + name: String(name), + input: update.rawInput, + }); // A tool_call can arrive already completed (status set up front). maybeCloseTool(id, update); return; @@ -1029,8 +1124,8 @@ export function createSandboxAgentOtel(init: SandboxAgentOtelInit): SandboxAgent usage = { input: usage?.input ?? 0, output: usage?.output ?? 0, - total: typeof total === "number" ? total : usage?.total ?? 0, - cost: typeof cost === "number" ? cost : usage?.cost ?? 0, + total: typeof total === "number" ? total : (usage?.total ?? 0), + cost: typeof cost === "number" ? cost : (usage?.cost ?? 0), }; record({ type: "usage", ...usage }); } @@ -1043,14 +1138,68 @@ export function createSandboxAgentOtel(init: SandboxAgentOtelInit): SandboxAgent if (!entry) return; const status = update?.status; if (status !== "completed" && status !== "failed") return; - const out = acpToolContentText(update.content) || acpToolContentText(update.rawOutput); + const out = + acpToolContentText(update.content) || + acpToolContentText(update.rawOutput); if (entry.span) { setOutput(entry.span, out, capture); - if (status === "failed") entry.span.setStatus({ code: SpanStatusCode.ERROR }); + if (status === "failed") + entry.span.setStatus({ code: SpanStatusCode.ERROR }); entry.span.end(); } toolSpans.delete(id); - record({ type: "tool_result", id, output: out, isError: status === "failed" }); + record({ + type: "tool_result", + id, + output: out, + isError: status === "failed", + }); + } + + /** + * Stamp a run-level error (the user-facing message + the provider that failed) and an OTel + * exception event so a trace carries the same diagnostic the HTTP response does (F-030). + * + * Two cases: + * - This tracer owns the agent span (emitSpans=true: Claude / gateway / Daytona) — stamp it + * directly, before finish() ends it. + * - The harness self-instruments (emitSpans=false: local Pi emits its own spans in another + * process, which never carry the error and are already flushed) — emit a standalone error + * span as a child of the caller's traceparent, so the error still reaches the /invoke trace. + * Idempotent and best-effort: a second call or a tracing failure must never break the run. + */ + function recordError(message: string, errorProvider?: string): void { + const text = message || "agent run failed"; + const stamp = (span: Span): void => { + span.setAttribute("ag.error.message", text); + const failedProvider = errorProvider ?? provider; + if (failedProvider) + span.setAttribute("ag.error.provider", failedProvider); + span.recordException({ name: "AgentRunError", message: text }); + span.setStatus({ code: SpanStatusCode.ERROR, message: text }); + }; + try { + if (agentSpan) { + stamp(agentSpan); + return; + } + // No owned span (harness self-instruments). Emit a standalone error span under the + // caller's traceparent so the failure is visible in the /invoke trace. + const parent = parentContext(init.traceparent); + const errSpan = tracer.startSpan("agent_error", undefined, parent); + errSpan.setAttribute("openinference.span.kind", "AGENT"); + errSpan.setAttribute("gen_ai.operation.name", "invoke_agent"); + errSpan.setAttribute("gen_ai.agent.name", init.harness ?? "agent"); + stamp(errSpan); + errSpan.end(); + // The standalone span shares the caller's trace id; make sure the run reports it so the + // engine flushes this trace (a self-instrumenting run otherwise only tracked it from the + // traceparent, which is the same id — but set it defensively if it was never resolved). + runTraceId = runTraceId ?? errSpan.spanContext().traceId; + traceTargets.set(runTraceId, { endpoint, authorization }); + } catch { + // tracing must never break the run + } } function finish(): string { @@ -1110,6 +1259,7 @@ export function createSandboxAgentOtel(init: SandboxAgentOtelInit): SandboxAgent handleUpdate, emitEvent: record, finish, + recordError, setUsage, flush: () => flushTrace(runTraceId), traceId: () => runTraceId, diff --git a/services/agent/tests/unit/otel-skills-error.test.ts b/services/agent/tests/unit/otel-skills-error.test.ts new file mode 100644 index 0000000000..e203718f54 --- /dev/null +++ b/services/agent/tests/unit/otel-skills-error.test.ts @@ -0,0 +1,219 @@ +/** + * Unit tests for the tracing additions in `tracing/otel.ts`: + * + * - F-029: the loaded skills (author + forced `_agenta.*`) are stamped on the agent span via + * `ag.agent.skills.loaded` / `ag.agent.skills.count`, on BOTH the sandbox-agent ACP tracer + * (`createSandboxAgentOtel`, non-Pi / Daytona) and Pi's own extension tracer + * (`createAgentaOtel`, local Pi). + * - F-030: `recordError` stamps the user-facing error message + the provider that failed plus + * an exception event on the agent span, so an error run's trace carries the same diagnostic + * the HTTP response does. When the harness self-instruments (no owned span) it emits a + * standalone `agent_error` span so the failure still reaches the trace. + * + * Spans export over OTLP from a module-level provider, so we spy on the OTel API tracer and + * capture the attributes/exceptions each span records, rather than wiring an in-memory exporter. + * + * Run: pnpm exec vitest run tests/unit/otel-skills-error.test.ts + */ +import { afterEach, describe, expect, it, vi } from "vitest"; +import { trace, type Span } from "@opentelemetry/api"; + +import { + createSandboxAgentOtel, + createAgentaOtel, +} from "../../src/tracing/otel.ts"; + +interface FakeSpan { + name: string; + attributes: Record; + exceptions: Array<{ name?: string; message?: string }>; + status?: { code: number; message?: string }; + ended: boolean; +} + +/** Replace the OTel tracer so every span built records into a captured array. */ +function spyTracer(): FakeSpan[] { + const spans: FakeSpan[] = []; + const makeSpan = (name: string): Span => { + const span: FakeSpan = { + name, + attributes: {}, + exceptions: [], + ended: false, + }; + spans.push(span); + const api = { + setAttribute(key: string, value: unknown) { + span.attributes[key] = value; + return api; + }, + setAttributes(attrs: Record) { + Object.assign(span.attributes, attrs); + return api; + }, + recordException(exc: { name?: string; message?: string }) { + span.exceptions.push(exc); + }, + setStatus(status: { code: number; message?: string }) { + span.status = status; + return api; + }, + end() { + span.ended = true; + }, + spanContext() { + return { + traceId: "0".repeat(32), + spanId: "0".repeat(16), + traceFlags: 1, + }; + }, + isRecording() { + return true; + }, + addEvent() { + return api; + }, + updateName() { + return api; + }, + }; + return api as unknown as Span; + }; + vi.spyOn(trace, "getTracer").mockReturnValue({ + startSpan: (name: string) => makeSpan(name), + startActiveSpan: ((name: string, fn: (s: Span) => unknown) => + fn(makeSpan(name))) as any, + } as any); + return spans; +} + +afterEach(() => { + vi.restoreAllMocks(); +}); + +describe("otel skills + error tracing", () => { + it("stamps loaded skills on the sandbox-agent agent span (F-029)", () => { + const spans = spyTracer(); + const otel = createSandboxAgentOtel({ + harness: "claude", + model: "anthropic/claude-haiku", + emitSpans: true, + skills: ["weather-oracle", "_agenta.agenta-getting-started"], + }); + otel.start({ prompt: "hi" }); + + const agentSpan = spans.find((s) => s.name === "invoke_agent"); + expect(agentSpan).toBeTruthy(); + expect(agentSpan?.attributes["ag.agent.skills.loaded"]).toEqual([ + "weather-oracle", + "_agenta.agenta-getting-started", + ]); + expect(agentSpan?.attributes["ag.agent.skills.count"]).toBe(2); + }); + + it("omits the skills attributes when no skills loaded", () => { + const spans = spyTracer(); + const otel = createSandboxAgentOtel({ + harness: "claude", + model: "anthropic/claude-haiku", + emitSpans: true, + }); + otel.start({ prompt: "hi" }); + const agentSpan = spans.find((s) => s.name === "invoke_agent"); + expect(agentSpan?.attributes["ag.agent.skills.loaded"]).toBeUndefined(); + expect(agentSpan?.attributes["ag.agent.skills.count"]).toBeUndefined(); + }); + + it("stamps loaded skills on Pi's own agent span (F-029, local Pi)", () => { + const spans = spyTracer(); + const otel = createAgentaOtel({ + captureContent: true, + skills: ["weather-oracle", "_agenta.agenta-getting-started"], + }); + // Drive the Pi lifecycle: register handlers, then fire before_agent_start + agent_start. + const handlers: Record Promise> = {}; + otel.register({ + on: (name: string, fn: (e: any) => Promise) => { + handlers[name] = fn; + }, + } as any); + return (async () => { + await handlers["before_agent_start"]?.({ prompt: "hi" }); + await handlers["agent_start"]?.({}); + const agentSpan = spans.find((s) => s.name === "invoke_agent"); + expect(agentSpan?.attributes["ag.agent.skills.loaded"]).toEqual([ + "weather-oracle", + "_agenta.agenta-getting-started", + ]); + expect(agentSpan?.attributes["ag.agent.skills.count"]).toBe(2); + })(); + }); + + it("records the error message + provider + exception on the owned agent span (F-030)", () => { + const spans = spyTracer(); + const otel = createSandboxAgentOtel({ + harness: "claude", + model: "anthropic/claude-haiku", + emitSpans: true, + }); + otel.start({ prompt: "hi" }); + otel.recordError( + "claude: model authentication failed — add the project's Anthropic key.", + "anthropic", + ); + + const agentSpan = spans.find((s) => s.name === "invoke_agent"); + expect(agentSpan?.attributes["ag.error.message"]).toContain( + "model authentication failed", + ); + expect(agentSpan?.attributes["ag.error.provider"]).toBe("anthropic"); + expect(agentSpan?.exceptions.length).toBe(1); + expect(agentSpan?.exceptions[0].message).toContain( + "model authentication failed", + ); + // ERROR status (SpanStatusCode.ERROR = 2). + expect(agentSpan?.status?.code).toBe(2); + }); + + it("emits a standalone error span when the harness self-instruments (F-030, local Pi)", () => { + const spans = spyTracer(); + const otel = createSandboxAgentOtel({ + harness: "pi", + model: "openai/gpt-4o-mini", + // Local Pi: the runner's sandbox-agent tracer is span-less; Pi emits its own spans. + emitSpans: false, + traceparent: `00-${"a".repeat(32)}-${"b".repeat(16)}-01`, + }); + otel.start({ prompt: "hi" }); + // No invoke_agent span exists yet (span-less mode). + expect(spans.find((s) => s.name === "invoke_agent")).toBeUndefined(); + + otel.recordError( + "pi_core: model authentication failed — add the project's Anthropic key.", + "anthropic", + ); + + const errSpan = spans.find((s) => s.name === "agent_error"); + expect(errSpan).toBeTruthy(); + expect(errSpan?.attributes["ag.error.message"]).toContain( + "model authentication failed", + ); + expect(errSpan?.attributes["ag.error.provider"]).toBe("anthropic"); + expect(errSpan?.exceptions.length).toBe(1); + expect(errSpan?.ended).toBe(true); + }); + + it("falls back to the init model provider when recordError omits one", () => { + const spans = spyTracer(); + const otel = createSandboxAgentOtel({ + harness: "claude", + model: "anthropic/claude-haiku", + emitSpans: true, + }); + otel.start({ prompt: "hi" }); + otel.recordError("some failure"); + const agentSpan = spans.find((s) => s.name === "invoke_agent"); + expect(agentSpan?.attributes["ag.error.provider"]).toBe("anthropic"); + }); +}); diff --git a/services/agent/tests/unit/sandbox-agent-orchestration.test.ts b/services/agent/tests/unit/sandbox-agent-orchestration.test.ts index 4fe7fb9c4b..f3bff1ec0f 100644 --- a/services/agent/tests/unit/sandbox-agent-orchestration.test.ts +++ b/services/agent/tests/unit/sandbox-agent-orchestration.test.ts @@ -52,6 +52,7 @@ function fakeHarness(options: FakeOptions = {}) { }>, runFinished: 0, runFlushed: 0, + recordedErrors: [] as Array<{ message: string; provider?: string }>, }; const events: AgentEvent[] = []; let eventHandler: ((event: any) => void) | undefined; @@ -121,6 +122,12 @@ function fakeHarness(options: FakeOptions = {}) { calls.runFinished += 1; return options.output ?? "assistant output"; }, + recordError(message: string, provider?: string) { + calls.recordedErrors.push({ message, provider }); + }, + output() { + return options.output ?? "assistant output"; + }, async flush() { calls.runFlushed += 1; }, diff --git a/services/agent/tests/unit/sandbox-agent-pi-assets.test.ts b/services/agent/tests/unit/sandbox-agent-pi-assets.test.ts index cb0042aca3..f5d5d5527f 100644 --- a/services/agent/tests/unit/sandbox-agent-pi-assets.test.ts +++ b/services/agent/tests/unit/sandbox-agent-pi-assets.test.ts @@ -107,6 +107,41 @@ describe("buildPiExtensionEnv", () => { assert.equal(env.AGENTA_TOOL_PUBLIC_SPECS, undefined); assert.equal(env.AGENTA_TOOL_RELAY_DIR, undefined); }); + + it("carries the loaded skill names under tracing (F-029)", () => { + const request = { + trace: { + traceparent: "00-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-bbbbbbbbbbbbbbbb-01", + }, + } as AgentRunRequest; + + const env = buildPiExtensionEnv(request, true, { + skills: ["weather-oracle", "_agenta.agenta-getting-started"], + }); + + assert.deepEqual(JSON.parse(env.AGENTA_SKILLS_LOADED ?? "[]"), [ + "weather-oracle", + "_agenta.agenta-getting-started", + ]); + }); + + it("omits the loaded skills env when there are none or tracing is off", () => { + const request = { + trace: { + traceparent: "00-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-bbbbbbbbbbbbbbbb-01", + }, + } as AgentRunRequest; + + assert.equal( + buildPiExtensionEnv(request, true, { skills: [] }).AGENTA_SKILLS_LOADED, + undefined, + ); + assert.equal( + buildPiExtensionEnv(request, false, { skills: ["x"] }) + .AGENTA_SKILLS_LOADED, + undefined, + ); + }); }); describe("writeSystemPromptLocal", () => {