Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
200 changes: 15 additions & 185 deletions services/agent/src/tools/code.ts
Original file line number Diff line number Diff line change
@@ -1,197 +1,27 @@
/**
* Code-tool executor: run a resolved `code` tool's snippet in the agent sandbox.
* Code-tool sidecar execution gate.
*
* A code tool ships a snippet (`code`) + a runtime (`python` | `node`) + a scoped `env` (the
* tool's declared vault secrets, resolved server-side). Unlike a `callback` tool, it never
* touches Agenta's /tools/call — it runs locally where the harness runs, which is exactly why
* its secrets are injected here as subprocess env (and nowhere else).
*
* Entry convention (same for both runtimes): the snippet defines a top-level `main`. A bare
* `def main(**inputs)` / `function main(inputs)` is found automatically; an explicit export
* (`module.exports.main` / `exports.main` / `module.exports = fn` in Node) is also accepted.
* Python calls `main(**inputs)` (keyword args from the tool input object); Node calls
* `main(inputs)` (the input object) and may return a promise. The return value is
* JSON-serialized and handed to the model as the tool result.
*
* Shared by every delivery path that runs code locally: engines/pi.ts (in-process Pi),
* extensions/agenta.ts (Pi under sandbox-agent), tools/mcp-server.ts (the MCP bridge for other
* harnesses).
* The code-tool interface still exists and code tools are still advertised to harnesses. The
* sidecar no longer executes author-supplied snippets locally, though: every delivery path
* funnels a `kind: "code"` call through this function, so throwing here makes direct Pi,
* sandbox Pi, and the ACP/MCP bridge fail consistently without changing the public wire shape.
*/
import { spawn } from "node:child_process";
import { mkdtempSync, rmSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";

/** Per-call budget for a code tool. Surfaced as a tool error on timeout. */
export const CODE_TOOL_TIMEOUT_MS = Number(
process.env.AGENTA_AGENT_CODE_TOOL_TIMEOUT_MS ?? 30000,
);

// argv[1] is the snippet path (python `-c`/node `-e` put the first extra arg at argv[1]);
// the tool input arrives as JSON on stdin. Both bootstraps evaluate the snippet in a fresh
// scope and pick up a top-level `main` (a bare `def main`/`function main`), falling back to an
// explicit export. Either way the contract is: define a callable `main`.
const PY_BOOTSTRAP = `import sys, json
_path = sys.argv[1]
with open(_path) as _f:
_src = _f.read()
_ns = {}
exec(compile(_src, _path, "exec"), _ns)
if not callable(_ns.get("main")):
sys.stderr.write("code tool must define a callable main(**inputs)")
sys.exit(1)
_args = json.loads(sys.stdin.read() or "{}")
_out = _ns["main"](**_args)
sys.stdout.write(json.dumps(_out))
`;

// `require(path)` would only see CommonJS exports, so a bare top-level `function main` (which
// exports nothing under CommonJS) would be invisible. Instead read the source and evaluate it
// in a scope that captures a top-level `main`, while still honoring an explicit
// `module.exports.main` / `exports.main` / `module.exports = fn`.
const NODE_BOOTSTRAP = `const fs = require("fs");
const path = process.argv[1];
const src = fs.readFileSync(path, "utf8");
const mod = { exports: {} };
const factory = new Function(
"exports",
"require",
"module",
"__filename",
"__dirname",
src +
"\\n;return (typeof main !== 'undefined' ? main : (module.exports && (module.exports.main || module.exports.default)) || module.exports);",
);
const fn = factory(mod.exports, require, mod, path, require("path").dirname(path));
if (typeof fn !== "function") {
process.stderr.write("code tool must define or export a callable main(inputs) function");
process.exit(1);
}
const args = JSON.parse(fs.readFileSync(0, "utf8") || "{}");
Promise.resolve(fn(args))
.then((out) => process.stdout.write(JSON.stringify(out === undefined ? null : out)))
.catch((err) => { process.stderr.write(String((err && err.stack) || err)); process.exit(1); });
`;

export type CodeRuntime = "python" | "node";

// The minimal set of host env vars a python3/node runtime needs to start. Deliberately
// excludes everything secret-bearing or sidecar-specific: no AGENTA_*, no *_API_KEY /
// *_TOKEN, no COMPOSIO_* / DAYTONA_*, no provider keys that the in-process Pi path writes
// into process.env before a run. Only the tool's declared scoped `env` is layered on top.
const BASE_ENV_ALLOWLIST = [
"PATH",
"HOME",
"LANG",
"LC_ALL",
"LC_CTYPE",
"TZ",
"TMPDIR",
"TEMP",
"TMP",
"NODE_PATH",
// Windows essentials, copied only when present.
"SystemRoot",
"ComSpec",
];

/** Build the child env from a minimal allowlist (copied only when set) plus scoped secrets. */
function buildChildEnv(
env: Record<string, string> | undefined,
): Record<string, string> {
const base: Record<string, string> = {};
for (const key of BASE_ENV_ALLOWLIST) {
const value = process.env[key];
if (value !== undefined) base[key] = value;
}
return { ...base, ...(env ?? {}) };
}
export const CODE_TOOL_UNSUPPORTED_MESSAGE =
"Code tools are not supported by the sidecar.";

/**
* Run a code tool's snippet and return its JSON-serialized output as text. Throws on a
* non-zero exit, a timeout, or an abort; callers turn the throw into a tool-error result so
* the model loop continues.
* Fail a code-tool invocation. Callers turn this throw into a tool-error result so the model
* loop continues rather than crashing the whole run.
*/
export async function runCodeTool(
runtime: CodeRuntime | undefined,
code: string,
env: Record<string, string> | undefined,
args: unknown,
signal?: AbortSignal,
_runtime: CodeRuntime | undefined,
_code: string,
_env: Record<string, string> | undefined,
_args: unknown,
_signal?: AbortSignal,
): Promise<string> {
const dir = mkdtempSync(join(tmpdir(), "agenta-code-"));
try {
const isNode = runtime === "node";
const scriptPath = join(dir, isNode ? "tool.js" : "tool.py");
writeFileSync(scriptPath, code ?? "", "utf8");

const command = isNode ? "node" : "python3";
const childArgs = isNode
? ["-e", NODE_BOOTSTRAP, scriptPath]
: ["-c", PY_BOOTSTRAP, scriptPath];

return await new Promise<string>((resolve, reject) => {
const child = spawn(command, childArgs, {
// The child inherits ONLY a minimal startup allowlist (PATH, HOME, locale/temp, and
// Windows essentials when present) plus the tool's declared scoped secrets. It does
// NOT inherit the sidecar's process.env, so provider keys (OPENAI_API_KEY, etc.) that
// the in-process Pi path writes into process.env, AGENTA_* config, and other secret-
// bearing vars never reach an author-supplied snippet. Nothing is written to the
// agent-visible filesystem beyond the temp dir.
env: buildChildEnv(env),
stdio: ["pipe", "pipe", "pipe"],
});

let stdout = "";
let stderr = "";
let settled = false;
const finish = (fn: () => void) => {
if (settled) return;
settled = true;
clearTimeout(timer);
if (signal) signal.removeEventListener("abort", onAbort);
fn();
};

const timer = setTimeout(() => {
child.kill("SIGKILL");
finish(() =>
reject(new Error(`code tool timed out after ${CODE_TOOL_TIMEOUT_MS}ms`)),
);
}, CODE_TOOL_TIMEOUT_MS);

const onAbort = () => {
child.kill("SIGKILL");
finish(() => reject(new Error("aborted")));
};
if (signal) {
if (signal.aborted) onAbort();
else signal.addEventListener("abort", onAbort, { once: true });
}

child.stdout.on("data", (d) => (stdout += d));
child.stderr.on("data", (d) => (stderr += d));
child.on("error", (err) => finish(() => reject(err)));
child.on("close", (exitCode) =>
finish(() => {
if (exitCode === 0) resolve(stdout.trim());
else
reject(
new Error(
`code tool exited ${exitCode}: ${stderr.slice(0, 500) || "(no stderr)"}`,
),
);
}),
);

child.stdin.write(JSON.stringify(args ?? {}));
child.stdin.end();
});
} finally {
try {
rmSync(dir, { recursive: true, force: true });
} catch {
// best-effort cleanup of the throwaway snippet dir
}
}
throw new Error(CODE_TOOL_UNSUPPORTED_MESSAGE);
}
4 changes: 2 additions & 2 deletions services/agent/src/tools/dispatch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
* advertise/skip behavior for `client` tools — only the execution itself is shared.
*
* The three executor kinds (see `ResolvedToolSpec`):
* - `code`: run the snippet in a sandbox subprocess with its scoped secret `env`.
* - `code`: advertised to harnesses, but rejected by the sidecar as unsupported.
* - `client`: browser-fulfilled across a turn boundary; never executed in-sandbox (throws).
* - `callback` (default): POST back through Agenta's /tools/call so the Composio key and
* connection auth stay server-side. On Daytona the in-sandbox process can't reach Agenta,
Expand Down Expand Up @@ -96,7 +96,7 @@ export async function relayToolCall(
* Execute one resolved tool and return its result text. Throws on failure; every call site
* turns the throw into a tool-error result so the model loop continues rather than crashing.
*
* - `code` → run the snippet locally (scoped secret env), no callback/relay.
* - `code` -> reject as unsupported by the sidecar, no callback/relay.
* - `client` → throw: browser-fulfilled, never executed in-sandbox.
* - default/`callback` → relay through the runner when `opts.relayDir` is set (Daytona),
* else POST directly to `opts.endpoint`.
Expand Down
90 changes: 13 additions & 77 deletions services/agent/tests/unit/code-tool.test.ts
Original file line number Diff line number Diff line change
@@ -1,89 +1,25 @@
/**
* Unit test for the code-tool executor (runCodeTool).
* Unit test for the code-tool sidecar execution gate.
*
* Exercises both runtimes end-to-end through real subprocesses: a python tool, node tools
* written as a bare top-level `function main` (the F2 regression) and as an explicit
* `module.exports.main`, an async node `main`, the F3 env-isolation guarantee (provider keys
* do NOT leak in; declared scoped secrets DO), and the non-zero-exit reject path.
*
* Needs `python3` and `node` on PATH (both present locally and on ubuntu CI runners).
*
* Run: pnpm test (or: pnpm exec vitest run tests/unit/code-tool.test.ts)
* Code tools remain part of the public tool interface and can still be advertised to harnesses,
* but the sidecar must not execute author-supplied snippets locally.
*/
import { describe, it } from "vitest";
import assert from "node:assert/strict";

import { runCodeTool } from "../../src/tools/code.ts";
import { CODE_TOOL_UNSUPPORTED_MESSAGE, runCodeTool } from "../../src/tools/code.ts";

describe("runCodeTool", () => {
it("runs a python bare `def main(**kw)`", async () => {
const code = 'def main(**kw):\n return {"sum": kw.get("a", 0) + kw.get("b", 0)}\n';
const out = await runCodeTool("python", code, undefined, { a: 2, b: 3 });
assert.deepEqual(JSON.parse(out), { sum: 5 }, "python bare main returns the right JSON");
});

it("runs a node bare top-level `function main` (F2 regression)", async () => {
const code = "function main(inputs) { return { got: inputs }; }";
const out = await runCodeTool("node", code, undefined, { hello: "world" });
assert.deepEqual(
JSON.parse(out),
{ got: { hello: "world" } },
"node bare function main executes and echoes the input",
);
});

it("runs a node explicit `module.exports.main`", async () => {
const code = "module.exports.main = function (inputs) { return { via: 'exports', got: inputs }; };";
const out = await runCodeTool("node", code, undefined, { x: 1 });
assert.deepEqual(
JSON.parse(out),
{ via: "exports", got: { x: 1 } },
"node module.exports.main works",
);
});

it("runs an async node `main` returning a Promise", async () => {
const code =
"async function main(inputs) { await new Promise((r) => setTimeout(r, 5)); return { doubled: inputs.n * 2 }; }";
const out = await runCodeTool("node", code, undefined, { n: 21 });
assert.deepEqual(JSON.parse(out), { doubled: 42 }, "node async main resolves");
});

it("F3: provider keys do NOT leak; scoped secrets DO", async () => {
const hadKey = "OPENAI_API_KEY" in process.env;
const prevKey = process.env.OPENAI_API_KEY;
process.env.OPENAI_API_KEY = "leak-me-xyz";
try {
// The provider key sits in process.env but must not reach the snippet.
const leakCode = "function main() { return { key: process.env.OPENAI_API_KEY ?? 'absent' }; }";
const leakOut = await runCodeTool("node", leakCode, undefined, {});
assert.deepEqual(
JSON.parse(leakOut),
{ key: "absent" },
"F3: OPENAI_API_KEY did NOT leak into the snippet env",
);

// A secret declared on the tool (passed via the scoped `env` arg) must be visible.
const scopedCode =
"function main() { return { secret: process.env.MY_TOOL_SECRET ?? 'absent' }; }";
const scopedOut = await runCodeTool("node", scopedCode, { MY_TOOL_SECRET: "ok" }, {});
assert.deepEqual(
JSON.parse(scopedOut),
{ secret: "ok" },
"F3: scoped MY_TOOL_SECRET IS visible to the snippet",
);
} finally {
if (hadKey) process.env.OPENAI_API_KEY = prevKey;
else delete process.env.OPENAI_API_KEY;
}
});

it("rejects when the snippet throws / exits non-zero", async () => {
const code = "function main() { throw new Error('boom'); }";
it("fails with a clear unsupported error instead of executing code", async () => {
await assert.rejects(
() => runCodeTool("node", code, undefined, {}),
/boom|exited/,
"a throwing snippet rejects",
() =>
runCodeTool(
"node",
"function main() { return { executed: true }; }",
{ MY_TOOL_SECRET: "secret" },
{ input: true },
),
new RegExp(CODE_TOOL_UNSUPPORTED_MESSAGE),
);
});
});
6 changes: 3 additions & 3 deletions services/agent/tests/unit/tool-bridge.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
* Unit tests for buildToolMcpServers (the tool MCP bridge attachment decision).
*
* Regression cover for F4: attachment must be decided per tool kind, not on the callback
* endpoint alone. A `code` tool runs locally in mcp-server.ts and needs no endpoint, so a run
* whose tools are all `code` must still attach the `agenta-tools` server. Only `callback`-kind
* endpoint alone. A `code` tool is still advertised through mcp-server.ts and needs no endpoint,
* so a run whose tools are all `code` must still attach the `agenta-tools` server. Only `callback`-kind
* tools require AGENTA_TOOL_CALLBACK_ENDPOINT; missing it must degrade those tools, not drop the
* whole server. `client` tools are browser-fulfilled and never justify attaching the bridge.
*
Expand Down Expand Up @@ -109,7 +109,7 @@ describe("buildToolMcpServers", () => {
];
const out = buildToolMcpServers(specs, relayDir);
assert.notDeepEqual(out, [], "mixed run with no endpoint must not return []");
assert.equal(out.length, 1, "still attaches the server so the code tool works");
assert.equal(out.length, 1, "still attaches the server so the code tool is advertised");
assert.equal(
envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"),
undefined,
Expand Down
21 changes: 10 additions & 11 deletions services/agent/tests/unit/tool-dispatch.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
* advertising behavior that stays per-site:
* - buildCustomTools (pi.ts) skips `client` specs, builds a tool per `code`/`callback` spec,
* and skips a `callback` spec with no callback endpoint.
* - runResolvedTool runs a real `code` snippet end-to-end (python) and throws for `client`.
* - runResolvedTool advertises code tools but fails their sidecar execution, and throws for `client`.
*
* No network and no harness: the `code` path shells out to python3 (available locally); the
* No network and no harness: the `code` path now fails before any subprocess; the
* `callback`/relay paths are not exercised here (they need a live /tools/call or a relay dir).
*
* Run: pnpm test (or: pnpm exec vitest run tests/unit/tool-dispatch.test.ts)
Expand Down Expand Up @@ -66,15 +66,14 @@ describe("buildCustomTools routing", () => {
});

describe("runResolvedTool", () => {
it("runs a code spec end-to-end (python)", async () => {
const text = await runResolvedTool(codeSpec, { greeting: "hi", n: 3 }, {
toolCallId: "call-1",
});
const parsed = JSON.parse(text);
assert.deepEqual(
parsed,
{ echo: { greeting: "hi", n: 3 } },
"code tool runs the snippet and returns its JSON output containing the input",
it("fails code specs with a clear unsupported error", async () => {
await assert.rejects(
() =>
runResolvedTool(codeSpec, { greeting: "hi", n: 3 }, {
toolCallId: "call-1",
}),
/Code tools are not supported by the sidecar\./,
"code tools remain advertised but are not executable by the sidecar",
);
});

Expand Down
Loading