Skip to content

Commit 6a3bf99

Browse files
committed
chore: add basic eval
1 parent f81cd2d commit 6a3bf99

7 files changed

Lines changed: 345 additions & 4 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ yarn-error.log*
77
lerna-debug.log*
88
.pnpm-debug.log*
99

10+
trace.json
11+
trace.json.gz
12+
1013
# Diagnostic reports (https://nodejs.org/api/report.html)
1114
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
1215

GEMINI.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,12 @@
44
- Use `npm run build` to run tsc and test build.
55
- Use `npm run test` to build and run tests, run all tests to verify correctness.
66
- Use `npm run test path-to-test.ts` to build and run a single test file, for example, `npm run test tests/McpContext.test.ts`.
7+
8+
## Rules for TypeScript
9+
10+
- Do not use `any` type.
11+
- Do not use the `as` keyword for type assertions (casts).
12+
- Do not use the non-null assertion operator (`!`).
13+
- Do not use `// @ts-ignore` comments.
14+
- Do not use `// @ts-nocheck` comments.
15+
- Do not use `// @ts-expect-error` comments.

package-lock.json

Lines changed: 14 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121
"test:only": "npm run build && node scripts/test.mjs --test-only",
2222
"test:update-snapshots": "npm run build && node scripts/test.mjs --test-update-snapshots",
2323
"prepare": "node --experimental-strip-types scripts/prepare.ts",
24-
"verify-server-json-version": "node --experimental-strip-types scripts/verify-server-json-version.ts"
24+
"verify-server-json-version": "node --experimental-strip-types scripts/verify-server-json-version.ts",
25+
"eval": "npm run build && node --experimental-strip-types --test scripts/eval_gemini.ts"
2526
},
2627
"files": [
2728
"build/src",
@@ -39,6 +40,7 @@
3940
"mcpName": "io.github.ChromeDevTools/chrome-devtools-mcp",
4041
"devDependencies": {
4142
"@eslint/js": "^9.35.0",
43+
"@google/generative-ai": "^0.24.1",
4244
"@modelcontextprotocol/sdk": "1.25.2",
4345
"@rollup/plugin-commonjs": "^29.0.0",
4446
"@rollup/plugin-json": "^6.1.0",

scripts/eval_gemini.ts

Lines changed: 275 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,275 @@
1+
/**
2+
* @license
3+
* Copyright 2026 Google LLC
4+
* SPDX-License-Identifier: Apache-2.0
5+
*/
6+
7+
import fs from 'node:fs';
8+
import path from 'node:path';
9+
import {describe, test} from 'node:test';
10+
11+
import {
12+
GoogleGenerativeAI,
13+
type FunctionDeclaration,
14+
SchemaType,
15+
} from '@google/generative-ai';
16+
import {Client} from '@modelcontextprotocol/sdk/client/index.js';
17+
import {StdioClientTransport} from '@modelcontextprotocol/sdk/client/stdio.js';
18+
19+
const ROOT_DIR = path.resolve(import.meta.dirname, '..');
20+
const SCENARIOS_DIR = path.join(import.meta.dirname, 'eval_scenarios');
21+
22+
// Define schema for our test scenarios
23+
/**
 * One tool invocation recorded while a scenario runs. The runner appends an
 * entry for every known tool the model asks to call, and the full list is
 * handed to the scenario's `expectations` callback.
 */
export interface CapturedFunctionCall {
24+
// Original MCP tool name (the name passed to `client.callTool`), not the
// sanitized name that was advertised to the model.
name: string;
25+
// Arguments forwarded to the tool; an empty object when the model supplied
// arguments that were not a plain object.
args: Record<string, unknown>;
26+
}
27+
28+
/**
 * Shape of the `scenario` object that every file in the scenarios directory
 * is expected to export.
 */
interface TestScenario {
29+
// Text sent to the model as the first chat message.
prompt: string;
30+
// Upper bound on the number of model turns processed by the runner's loop.
maxTurns: number;
31+
// Optional check invoked once with every captured tool call after the turn
// loop ends; the scenario files in this change signal failure by throwing
// (via `node:assert`).
expectations?: (calls: CapturedFunctionCall[]) => void;
32+
}
33+
34+
/**
 * Dynamically imports a scenario module and returns its `scenario` export.
 *
 * @param scenarioPath - Filesystem path of the scenario module. Callers pass
 *   an absolute path; a relative path is resolved against the current working
 *   directory by `pathToFileURL`.
 * @returns The scenario definition exported by the module.
 * @throws Error when the module has no truthy `scenario` export, or when the
 *   export lacks a string `prompt`, a numeric `maxTurns`, or has a
 *   non-function `expectations`.
 */
async function loadScenario(scenarioPath: string): Promise<TestScenario> {
  // A raw absolute path is not a valid ESM specifier on Windows (`C:\...` is
  // parsed as a URL with scheme `c:`), and characters such as `#`, `?` or `%`
  // would be misread as URL syntax. Convert to a file:// URL first.
  // `node:url` is loaded here so this function does not depend on the
  // file-level import list.
  const {pathToFileURL} = await import('node:url');

  // We assume the test file exports a 'scenario' object.
  const module = await import(pathToFileURL(scenarioPath).href);
  const scenario = module.scenario;
  if (!scenario) {
    throw new Error(
      `Scenario file ${scenarioPath} does not export a 'scenario' object.`,
    );
  }

  // Fail fast on malformed scenarios: without a numeric `maxTurns` the turn
  // loop condition is always false, so the run would silently execute zero
  // turns and only fail later inside the expectations with a confusing diff.
  if (
    typeof scenario.prompt !== 'string' ||
    typeof scenario.maxTurns !== 'number' ||
    Number.isNaN(scenario.maxTurns)
  ) {
    throw new Error(
      `Scenario file ${scenarioPath} must export a 'scenario' with a string 'prompt' and a numeric 'maxTurns'.`,
    );
  }
  if (
    scenario.expectations !== undefined &&
    typeof scenario.expectations !== 'function'
  ) {
    throw new Error(
      `Scenario file ${scenarioPath} has a non-function 'expectations' property.`,
    );
  }

  return scenario;
}
45+
46+
/** Returns true when `v` is a non-null object that is not an array. */
function isRecord(v: unknown): v is Record<string, unknown> {
  return typeof v === 'object' && v !== null && !Array.isArray(v);
}

/** JSON Schema keywords removed before a tool schema is sent to Gemini. */
const STRIPPED_SCHEMA_KEYWORDS: ReadonlySet<string> = new Set([
  'default',
  'additionalProperties',
  'exclusiveMinimum',
]);

/**
 * Returns a deep copy of `schema` without the keywords listed in
 * `STRIPPED_SCHEMA_KEYWORDS`. Non-object values are returned unchanged.
 *
 * The keys of a `properties` map are parameter names rather than schema
 * keywords, so they are never filtered: a tool parameter that happens to be
 * called `default` must survive sanitisation.
 */
function cleanSchemaRecursive(schema: unknown): unknown {
  if (Array.isArray(schema)) {
    return schema.map(item => cleanSchemaRecursive(item));
  }
  if (!isRecord(schema)) {
    return schema;
  }

  const out: Record<string, unknown> = {};
  for (const [key, value] of Object.entries(schema)) {
    if (STRIPPED_SCHEMA_KEYWORDS.has(key)) {
      continue;
    }
    if (key === 'properties' && isRecord(value)) {
      const cleanedProperties: Record<string, unknown> = {};
      for (const [propertyName, propertySchema] of Object.entries(value)) {
        cleanedProperties[propertyName] = cleanSchemaRecursive(propertySchema);
      }
      out[key] = cleanedProperties;
    } else {
      out[key] = cleanSchemaRecursive(value);
    }
  }
  return out;
}

/** Maps an MCP tool name to the name advertised to Gemini (`-` and `.` become `_`). */
function sanitizeToolName(name: string): string {
  return name.replace(/[-.]/g, '_');
}

/**
 * Runs one eval scenario end to end: loads the scenario, starts the compiled
 * MCP server over stdio, exposes its tools to a Gemini chat session, executes
 * the tool calls the model requests for at most `scenario.maxTurns` model
 * turns, and finally invokes the scenario's `expectations` with every call
 * that was executed.
 *
 * @param scenarioPath - Path of the scenario module (resolved to absolute).
 * @param apiKey - Gemini API key used to create the generative model client.
 * @throws Error when the scenario cannot be loaded, the built server is
 *   missing, two tools sanitize to the same name, a Gemini or MCP call
 *   rejects, or the scenario's expectations throw.
 */
async function runSingleScenario(
  scenarioPath: string,
  apiKey: string,
): Promise<void> {
  const absolutePath = path.resolve(scenarioPath);
  console.log(`\n### Running Scenario: ${absolutePath} ###`);

  const scenario = await loadScenario(absolutePath);

  // Path to the compiled MCP server
  const serverPath = path.join(ROOT_DIR, 'build/src/index.js');
  if (!fs.existsSync(serverPath)) {
    throw new Error(
      `MCP server not found at ${serverPath}. Please run 'npm run build' first.`,
    );
  }

  // Forward the current environment, dropping undefined entries so the
  // value type is a plain string map.
  const env: Record<string, string> = {};
  for (const [key, value] of Object.entries(process.env)) {
    if (value !== undefined) {
      env[key] = value;
    }
  }

  const transport = new StdioClientTransport({
    command: 'node',
    args: [serverPath],
    env,
  });
  const client = new Client(
    {name: 'gemini-eval-client', version: '1.0.0'},
    {capabilities: {}},
  );

  try {
    await client.connect(transport);

    const toolsResult = await client.listTools();
    const mcpTools = toolsResult.tools;

    // Map of sanitized names to original names, used to route the model's
    // calls back to the MCP server. A collision would silently send calls
    // to the wrong tool, so treat it as a hard error.
    const contentToolsMap = new Map<string, string>();
    for (const tool of mcpTools) {
      const sanitized = sanitizeToolName(tool.name);
      const existing = contentToolsMap.get(sanitized);
      if (existing !== undefined && existing !== tool.name) {
        throw new Error(
          `Tools '${existing}' and '${tool.name}' both map to '${sanitized}' for Gemini.`,
        );
      }
      contentToolsMap.set(sanitized, tool.name);
    }

    // Convert MCP tools to Gemini function declarations
    const functionDeclarations: FunctionDeclaration[] = mcpTools.map(tool => {
      const inputSchema: unknown = tool.inputSchema;
      const properties =
        isRecord(inputSchema) && 'properties' in inputSchema
          ? inputSchema.properties
          : {};
      const required =
        isRecord(inputSchema) &&
        'required' in inputSchema &&
        Array.isArray(inputSchema.required)
          ? inputSchema.required
          : [];

      return {
        name: sanitizeToolName(tool.name),
        description: tool.description?.substring(0, 1024) || '',
        // NOTE(review): cleanSchemaRecursive returns `unknown`, so this
        // assertion is unchecked; it conflicts with the "no `as`" rule in
        // GEMINI.md and should be replaced by a validating guard if the SDK
        // schema type can be checked structurally.
        parameters: cleanSchemaRecursive({
          type: SchemaType.OBJECT,
          properties,
          required,
        }) as FunctionDeclaration['parameters'],
      };
    });

    const genAI = new GoogleGenerativeAI(apiKey);
    const model = genAI.getGenerativeModel({
      model: 'gemini-3-pro-preview',
      tools: [{functionDeclarations}],
    });

    const chat = model.startChat({
      systemInstruction: {
        role: 'system',
        parts: [{text: `Use available tools.`}],
      },
    });

    const expectations = scenario.expectations;
    const allCalls: CapturedFunctionCall[] = [];

    // Execute turns
    let turnCount = 0;
    console.log(`\n--- Turn 1 (User) ---`);
    console.log(scenario.prompt);

    let result = await chat.sendMessage(scenario.prompt);
    let response = result.response;

    while (turnCount < scenario.maxTurns) {
      turnCount++;
      console.log(`\n--- Turn ${turnCount} (Model) ---`);
      const text = response.text();
      if (text) {
        console.log(`Text: ${text}`);
      }

      const functionCalls = response.functionCalls() ?? [];
      if (functionCalls.length === 0) {
        console.log('No tool calls. Interaction finished.');
        break;
      }

      console.log(`Function Calls: ${JSON.stringify(functionCalls, null, 2)}`);

      const functionResponses = [];
      for (const call of functionCalls) {
        const originalName = contentToolsMap.get(call.name);
        if (!originalName) {
          // Report the problem back to the model instead of aborting the run.
          console.error(`Unknown tool called: ${call.name}`);
          functionResponses.push({
            functionResponse: {
              name: call.name,
              response: {error: `Unknown tool: ${call.name}`},
            },
          });
          continue;
        }

        const safeArgs = isRecord(call.args) ? call.args : {};

        // Log the arguments that are actually sent to the tool.
        console.log(
          `Executing tool: ${originalName} with args: ${JSON.stringify(safeArgs)}`,
        );

        // Recorded before execution so that calls whose execution fails are
        // still visible to the scenario's expectations.
        allCalls.push({
          name: originalName,
          args: safeArgs,
        });

        try {
          const toolResult = await client.callTool({
            name: originalName,
            arguments: safeArgs,
          });

          functionResponses.push({
            functionResponse: {
              name: call.name,
              response: {name: call.name, content: toolResult},
            },
          });
        } catch (e) {
          const errorMessage = e instanceof Error ? e.message : String(e);
          console.error(`Error executing tool ${originalName}:`, e);
          functionResponses.push({
            functionResponse: {
              name: call.name,
              response: {error: errorMessage},
            },
          });
        }
      }

      // Send tool results back
      console.log(`Sending ${functionResponses.length} tool outputs back...`);
      result = await chat.sendMessage(functionResponses);
      response = result.response;
    }

    console.log('\nVerifying expectations...');
    if (expectations) {
      expectations(allCalls);
    }
  } finally {
    // Close the transport even when closing the client throws, so a failed
    // client shutdown cannot skip the transport cleanup.
    try {
      await client.close();
    } finally {
      await transport.close();
    }
  }
}
259+
260+
// Registers one node:test case per scenario module found in SCENARIOS_DIR.
void describe('Gemini Eval Scenarios', () => {
  const geminiKey = process.env.GEMINI_API_KEY;

  // Only TypeScript and JavaScript files are treated as scenarios.
  const isScenarioFile = (entry: string): boolean =>
    entry.endsWith('.ts') || entry.endsWith('.js');

  const scenarioFiles = fs
    .readdirSync(SCENARIOS_DIR)
    .filter(entry => isScenarioFile(entry));

  scenarioFiles.forEach(scenarioFile => {
    void test(scenarioFile, async () => {
      // The key is checked per test so that each scenario reports the
      // missing configuration as its own failure.
      if (!geminiKey) {
        throw new Error('GEMINI_API_KEY environment variable is required.');
      }
      const scenarioPath = path.join(SCENARIOS_DIR, scenarioFile);
      await runSingleScenario(scenarioPath, geminiKey);
    });
  });
});
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
/**
2+
* @license
3+
* Copyright 2026 Google LLC
4+
* SPDX-License-Identifier: Apache-2.0
5+
*/
6+
import assert from 'node:assert';
7+
8+
import {type CapturedFunctionCall} from '../eval_gemini.ts';
9+
10+
/**
 * Eval scenario: a plain navigation request. Within a single model turn the
 * model must issue exactly one `navigate_page` call whose only argument is
 * the requested URL.
 */
export const scenario = {
  prompt: 'Navigate to https://developers.chrome.com and tell me if it worked.',
  maxTurns: 1,
  expectations: (calls: CapturedFunctionCall[]) => {
    const expectedCalls = [
      {
        name: 'navigate_page',
        args: {url: 'https://developers.chrome.com'},
      },
    ];
    assert.deepStrictEqual(calls, expectedCalls);
  },
};
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/**
2+
* @license
3+
* Copyright 2026 Google LLC
4+
* SPDX-License-Identifier: Apache-2.0
5+
*/
6+
import assert from 'node:assert';
7+
8+
import {type CapturedFunctionCall} from '../eval_gemini.ts';
9+
10+
/**
 * Eval scenario: a performance check. Within two model turns the model must
 * make exactly two tool calls: first open the page (`navigate_page` or
 * `new_page`), then start a trace (`performance_start_trace`).
 */
export const scenario = {
  prompt: 'Check the performance of https://developers.chrome.com',
  maxTurns: 2,
  expectations: (calls: CapturedFunctionCall[]) => {
    const calledNames = calls.map(call => call.name);
    const summary = JSON.stringify(calledNames);

    // Include the actual tool names in every failure so a failed eval run
    // shows what the model did, not just "false == true".
    assert.strictEqual(
      calledNames.length,
      2,
      `Expected exactly 2 tool calls, got ${calledNames.length}: ${summary}`,
    );

    const [openingCall, tracingCall] = calledNames;
    assert.ok(
      openingCall === 'navigate_page' || openingCall === 'new_page',
      `Expected first call to be 'navigate_page' or 'new_page', got: ${summary}`,
    );
    assert.strictEqual(
      tracingCall,
      'performance_start_trace',
      `Expected second call to be 'performance_start_trace', got: ${summary}`,
    );
  },
};

0 commit comments

Comments
 (0)