Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*

trace.json
trace.json.gz

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

Expand Down
13 changes: 12 additions & 1 deletion GEMINI.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,15 @@
- Use only scripts from `package.json` to run commands.
- Use `npm run build` to run tsc and test build.
- Use `npm run test` to build and run tests, run all tests to verify correctness.
- use `npm run test path-to-test.ts` to build and run a single test file, for example, `npm run test tests/McpContext.test.ts`.
- Use `npm run test path-to-test.ts` to build and run a single test file, for example, `npm run test tests/McpContext.test.ts`.
- Use `npm run format` to fix formatting and get linting errors.

## Rules for TypeScript

- Do not use `any` type.
- Do not use `as` keyword for type casting.
- Do not use the `!` non-null assertion operator.
- Do not use `// @ts-ignore` comments.
- Do not use `// @ts-nocheck` comments.
- Do not use `// @ts-expect-error` comments.
- Prefer `for..of` instead of `forEach`.
17 changes: 14 additions & 3 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
"test:only": "npm run build && node scripts/test.mjs --test-only",
"test:update-snapshots": "npm run build && node scripts/test.mjs --test-update-snapshots",
"prepare": "node --experimental-strip-types scripts/prepare.ts",
"verify-server-json-version": "node --experimental-strip-types scripts/verify-server-json-version.ts"
"verify-server-json-version": "node --experimental-strip-types scripts/verify-server-json-version.ts",
"eval": "npm run build && node --experimental-strip-types --test scripts/eval_gemini.ts"
},
"files": [
"build/src",
Expand All @@ -39,6 +40,7 @@
"mcpName": "io.github.ChromeDevTools/chrome-devtools-mcp",
"devDependencies": {
"@eslint/js": "^9.35.0",
"@google/generative-ai": "^0.24.1",
"@modelcontextprotocol/sdk": "1.25.2",
"@rollup/plugin-commonjs": "^29.0.0",
"@rollup/plugin-json": "^6.1.0",
Expand Down
268 changes: 268 additions & 0 deletions scripts/eval_gemini.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/

import fs from 'node:fs';
import path from 'node:path';
import {describe, test} from 'node:test';
import {pathToFileURL} from 'node:url';

import {
  GoogleGenerativeAI,
  type FunctionDeclaration,
  SchemaType,
} from '@google/generative-ai';
import {Client} from '@modelcontextprotocol/sdk/client/index.js';
import {StdioClientTransport} from '@modelcontextprotocol/sdk/client/stdio.js';

const ROOT_DIR = path.resolve(import.meta.dirname, '..');
const SCENARIOS_DIR = path.join(import.meta.dirname, 'eval_scenarios');

// Define schema for our test scenarios
/**
 * One tool invocation requested by the model during a scenario run.
 *
 * The runner records an entry for every call it forwards to the MCP server,
 * in the order the calls are executed.
 */
export interface CapturedFunctionCall {
// Original MCP tool name (not the sanitized name sent to Gemini).
name: string;
// Arguments passed to the tool; an empty object when the model supplied none.
args: Record<string, unknown>;
}

/**
 * Shape of the `scenario` object every eval scenario module must export.
 */
export interface TestScenario {
// First (and only) user message sent to the model.
prompt: string;
// Upper bound on the number of model turns processed before verification.
maxTurns: number;
// Called once after the conversation ends with every captured tool call;
// expected to throw (e.g. via `node:assert`) when the calls are not as desired.
expectations: (calls: CapturedFunctionCall[]) => void;
}

/**
 * Loads an eval scenario module and returns its exported `scenario` object.
 *
 * @param scenarioPath - File-system path of the scenario module. Callers pass
 *   an absolute path; a relative path is resolved against the current working
 *   directory by `pathToFileURL`.
 * @returns The value of the module's `scenario` export.
 * @throws Error if the module does not export a truthy `scenario`.
 */
async function loadScenario(scenarioPath: string): Promise<TestScenario> {
  // `import()` expects a module specifier, not a file-system path. An absolute
  // Windows path (`C:\...`) is not a valid specifier and is rejected by the
  // ESM loader, so always go through a `file://` URL.
  const module = await import(pathToFileURL(scenarioPath).href);
  if (!module.scenario) {
    throw new Error(
      `Scenario file ${scenarioPath} does not export a 'scenario' object.`,
    );
  }
  return module.scenario;
}

// Helpers used to sanitize MCP tool input schemas for Gemini
/**
 * Type guard for plain "dictionary-like" values.
 *
 * @returns `true` when `v` is an object that is neither `null` nor an array.
 */
function isRecord(v: unknown): v is Record<string, unknown> {
  if (v === null || Array.isArray(v)) {
    return false;
  }
  return typeof v === 'object';
}

// JSON Schema keywords that are stripped before a schema is handed to Gemini.
const UNSUPPORTED_SCHEMA_KEYWORDS: ReadonlySet<string> = new Set([
  'default',
  'additionalProperties',
  'exclusiveMinimum',
]);

/**
 * Cleans the value of a `properties` keyword.
 *
 * The keys of a `properties` object are user-chosen parameter names, not
 * schema keywords, so they are all kept (a parameter may legitimately be
 * called `default`); only the sub-schemas they map to are cleaned.
 */
function cleanPropertyMap(properties: unknown): unknown {
  if (
    typeof properties !== 'object' ||
    properties === null ||
    Array.isArray(properties)
  ) {
    // Not a name -> schema map; fall back to generic cleaning.
    return cleanSchemaRecursive(properties);
  }
  const out: Record<string, unknown> = {};
  for (const [name, subSchema] of Object.entries(properties)) {
    out[name] = cleanSchemaRecursive(subSchema);
  }
  return out;
}

/**
 * Returns a deep copy of `schema` with the keywords listed in
 * `UNSUPPORTED_SCHEMA_KEYWORDS` removed at every nesting level.
 *
 * - Primitives and `null` are returned unchanged.
 * - Arrays are mapped element by element (including arrays nested in arrays).
 * - Only own enumerable keys of objects are copied.
 * - Keys directly under a `properties` keyword are treated as parameter names
 *   and are never dropped.
 *
 * @param schema - Any JSON-like value, typically a JSON Schema object.
 * @returns The cleaned copy; the input is not mutated.
 */
const cleanSchemaRecursive = (schema: unknown): unknown => {
  if (Array.isArray(schema)) {
    return schema.map(item => cleanSchemaRecursive(item));
  }
  if (typeof schema !== 'object' || schema === null) {
    return schema;
  }

  const out: Record<string, unknown> = {};
  for (const [key, value] of Object.entries(schema)) {
    if (UNSUPPORTED_SCHEMA_KEYWORDS.has(key)) {
      continue;
    }
    out[key] =
      key === 'properties'
        ? cleanPropertyMap(value)
        : cleanSchemaRecursive(value);
  }
  return out;
};

/**
 * Converts an MCP tool name into the sanitized form sent to Gemini:
 * every `-` and `.` is replaced by `_`.
 */
function sanitizeToolName(name: string): string {
  return name.replace(/[-.]/g, '_');
}

/**
 * Runs one eval scenario end to end.
 *
 * Steps:
 * 1. Load the scenario module.
 * 2. Start the compiled MCP server (`build/src/index.js`) with `node` through
 *    a `StdioClientTransport` and list its tools.
 * 3. Expose the tools to a Gemini chat as function declarations.
 * 4. Send the scenario prompt, then for up to `scenario.maxTurns` model turns
 *    execute every requested tool call against the MCP server and feed the
 *    results back.
 * 5. Invoke `scenario.expectations` with all captured tool calls.
 *
 * @param scenarioPath - Path of the scenario module (resolved to absolute).
 * @param apiKey - Gemini API key.
 * @throws Error if the server build is missing, if two tools collapse to the
 *   same sanitized name, or whatever the scenario's expectations throw.
 */
async function runSingleScenario(
  scenarioPath: string,
  apiKey: string,
): Promise<void> {
  const absolutePath = path.resolve(scenarioPath);
  console.log(`\n### Running Scenario: ${absolutePath} ###`);

  let client: Client | undefined;
  let transport: StdioClientTransport | undefined;

  try {
    const scenario = await loadScenario(absolutePath);

    // Path to the compiled MCP server
    const serverPath = path.join(ROOT_DIR, 'build/src/index.js');
    if (!fs.existsSync(serverPath)) {
      throw new Error(
        `MCP server not found at ${serverPath}. Please run 'npm run build' first.`,
      );
    }

    // Forward the current environment, dropping entries whose value is
    // undefined so that `env` is a plain string-to-string map.
    const env: Record<string, string> = {};
    for (const [key, value] of Object.entries(process.env)) {
      if (value !== undefined) {
        env[key] = value;
      }
    }

    transport = new StdioClientTransport({
      command: 'node',
      args: [serverPath],
      env,
    });

    client = new Client(
      {name: 'gemini-eval-client', version: '1.0.0'},
      {capabilities: {}},
    );

    await client.connect(transport);

    const toolsResult = await client.listTools();
    const mcpTools = toolsResult.tools;

    // Convert MCP tools to Gemini function declarations and, in the same
    // pass, remember which original tool each sanitized name stands for.
    const functionDeclarations: FunctionDeclaration[] = [];
    const contentToolsMap = new Map<string, string>();
    for (const tool of mcpTools) {
      const sanitized = sanitizeToolName(tool.name);

      // Two distinct tools mapping to one name would make calls ambiguous.
      const clash = contentToolsMap.get(sanitized);
      if (clash !== undefined) {
        throw new Error(
          `Tools '${clash}' and '${tool.name}' both map to the sanitized name '${sanitized}'.`,
        );
      }
      contentToolsMap.set(sanitized, tool.name);

      functionDeclarations.push({
        name: sanitized,
        description: tool.description?.substring(0, 1024) ?? '',
        // cleanSchemaRecursive returns `unknown`; the cast bridges to the
        // SDK's parameter schema type.
        parameters: cleanSchemaRecursive({
          type: SchemaType.OBJECT,
          properties:
            isRecord(tool.inputSchema) && 'properties' in tool.inputSchema
              ? tool.inputSchema.properties
              : {},
          required:
            isRecord(tool.inputSchema) &&
            'required' in tool.inputSchema &&
            Array.isArray(tool.inputSchema.required)
              ? tool.inputSchema.required
              : [],
        }) as FunctionDeclaration['parameters'],
      });
    }

    const genAI = new GoogleGenerativeAI(apiKey);
    const model = genAI.getGenerativeModel({
      model: 'gemini-3-pro-preview',
      tools: [{functionDeclarations}],
    });

    const chat = model.startChat({
      systemInstruction: {
        role: 'system',
        parts: [{text: `Use available tools.`}],
      },
    });

    const expectations = scenario.expectations;
    const allCalls: CapturedFunctionCall[] = [];

    // Execute turns
    let turnCount = 0;
    console.log(`\n--- Turn 1 (User) ---`);
    console.log(scenario.prompt);

    let result = await chat.sendMessage(scenario.prompt);
    let response = result.response;

    while (turnCount < scenario.maxTurns) {
      turnCount++;
      console.log(`\n--- Turn ${turnCount} (Model) ---`);
      const text = response.text();
      if (text) {
        console.log(`Text: ${text}`);
      }

      const functionCalls = response.functionCalls();
      if (!functionCalls || functionCalls.length === 0) {
        console.log('No tool calls. Interaction finished.');
        break;
      }

      console.log(`Function Calls: ${JSON.stringify(functionCalls, null, 2)}`);

      const functionResponses = [];
      for (const call of functionCalls) {
        const originalName = contentToolsMap.get(call.name);
        if (!originalName) {
          // Report the problem to the model instead of aborting the run.
          console.error(`Unknown tool called: ${call.name}`);
          functionResponses.push({
            functionResponse: {
              name: call.name,
              response: {error: `Unknown tool: ${call.name}`},
            },
          });
          continue;
        }

        const safeArgs = isRecord(call.args) ? call.args : {};

        console.log(
          `Executing tool: ${originalName} with args: ${JSON.stringify(call.args)}`,
        );

        // Record the call before executing it so that failed executions
        // still count towards the scenario's expectations.
        allCalls.push({
          name: originalName,
          args: safeArgs,
        });

        try {
          const toolResult = await client.callTool({
            name: originalName,
            arguments: safeArgs,
          });

          functionResponses.push({
            functionResponse: {
              name: call.name,
              response: {name: call.name, content: toolResult},
            },
          });
        } catch (e) {
          const errorMessage = e instanceof Error ? e.message : String(e);
          console.error(`Error executing tool ${originalName}:`, e);
          functionResponses.push({
            functionResponse: {
              name: call.name,
              response: {error: errorMessage},
            },
          });
        }
      }

      // Send tool results back
      console.log(`Sending ${functionResponses.length} tool outputs back...`);
      result = await chat.sendMessage(functionResponses);
      response = result.response;
    }

    console.log('\nVerifying expectations...');
    expectations(allCalls);
  } finally {
    // Always attempt to close the transport, even if closing the client
    // rejects; otherwise the spawned server process could be left running.
    try {
      await client?.close();
    } finally {
      await transport?.close();
    }
  }
}
// Every scenario talks to Gemini, so refuse to load without an API key.
const apiKey = process.env.GEMINI_API_KEY;
if (!apiKey) {
  throw new Error('GEMINI_API_KEY environment variable is required.');
}

// Register one test per scenario module found in SCENARIOS_DIR.
void describe('Gemini Eval Scenarios', () => {
  const isScenarioModule = (fileName: string): boolean =>
    fileName.endsWith('.ts') || fileName.endsWith('.js');

  for (const fileName of fs.readdirSync(SCENARIOS_DIR)) {
    if (!isScenarioModule(fileName)) {
      continue;
    }
    const scenarioFile = path.join(SCENARIOS_DIR, fileName);
    void test(fileName, async () => {
      await runSingleScenario(scenarioFile, apiKey);
    });
  }
});
21 changes: 21 additions & 0 deletions scripts/eval_scenarios/navigation_test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import assert from 'node:assert';

import type {TestScenario} from '../eval_gemini.ts';

/**
 * Navigation scenario: the model is expected to issue exactly one
 * `navigate_page` call targeting the requested URL and nothing else.
 */
export const scenario: TestScenario = {
  prompt: 'Navigate to https://developers.chrome.com and tell me if it worked.',
  maxTurns: 1,
  expectations: recordedCalls => {
    const expectedCalls = [
      {
        name: 'navigate_page',
        args: {url: 'https://developers.chrome.com'},
      },
    ];
    assert.deepStrictEqual(recordedCalls, expectedCalls);
  },
};
20 changes: 20 additions & 0 deletions scripts/eval_scenarios/performance_test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import assert from 'node:assert';

import type {TestScenario} from '../eval_gemini.ts';

/**
 * Performance scenario: the model is expected to open the page (through
 * either `navigate_page` or `new_page`) and then start a performance trace,
 * in exactly two tool calls.
 */
export const scenario: TestScenario = {
  prompt: 'Check the performance of https://developers.chrome.com',
  maxTurns: 2,
  expectations: calls => {
    // Work on the names only and include them in failure messages so a
    // failing run shows what the model actually called.
    const names = calls.map(call => call.name);
    const seen = JSON.stringify(names);

    assert.strictEqual(
      names.length,
      2,
      `Expected exactly 2 tool calls, got ${names.length}: ${seen}`,
    );
    assert.ok(
      names[0] === 'navigate_page' || names[0] === 'new_page',
      `Expected first call to be 'navigate_page' or 'new_page', got: ${seen}`,
    );
    assert.strictEqual(
      names[1],
      'performance_start_trace',
      `Expected second call to be 'performance_start_trace', got: ${seen}`,
    );
  },
};