Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 38 additions & 5 deletions scripts/eval_gemini.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import fs from 'node:fs';
import path from 'node:path';
import {describe, test} from 'node:test';
import {describe, test, before, after, afterEach} from 'node:test';

import {
GoogleGenerativeAI,
Expand All @@ -18,6 +18,7 @@ import {StdioClientTransport} from '@modelcontextprotocol/sdk/client/stdio.js';

const ROOT_DIR = path.resolve(import.meta.dirname, '..');
const SCENARIOS_DIR = path.join(import.meta.dirname, 'eval_scenarios');
import {TestServer} from '../build/tests/server.js';

// Define schema for our test scenarios
export interface CapturedFunctionCall {
Expand All @@ -29,6 +30,10 @@ export interface TestScenario {
prompt: string;
maxTurns: number;
expectations: (calls: CapturedFunctionCall[]) => void;
htmlRoute?: {
path: string;
htmlContent: string;
};
}

async function loadScenario(scenarioPath: string): Promise<TestScenario> {
Expand Down Expand Up @@ -78,6 +83,7 @@ const cleanSchemaRecursive = (schema: unknown): unknown => {
async function runSingleScenario(
scenarioPath: string,
apiKey: string,
server: TestServer,
): Promise<void> {
const absolutePath = path.resolve(scenarioPath);
console.log(`\n### Running Scenario: ${absolutePath} ###`);
Expand All @@ -88,6 +94,17 @@ async function runSingleScenario(
try {
const scenario = await loadScenario(absolutePath);

if (scenario.htmlRoute) {
server.addHtmlRoute(
scenario.htmlRoute.path,
scenario.htmlRoute.htmlContent,
);
scenario.prompt = scenario.prompt.replace(
'<TEST_URL>',
server.getRoute(scenario.htmlRoute.path),
);
}

// Path to the compiled MCP server
const serverPath = path.join(ROOT_DIR, 'build/src/index.js');
if (!fs.existsSync(serverPath)) {
Expand Down Expand Up @@ -148,7 +165,7 @@ async function runSingleScenario(

const genAI = new GoogleGenerativeAI(apiKey);
const model = genAI.getGenerativeModel({
model: 'gemini-3-pro-preview',
model: 'gemini-2.5-flash',
tools: [{functionDeclarations}],
});

Expand All @@ -167,7 +184,9 @@ async function runSingleScenario(
console.log(`\n--- Turn 1 (User) ---`);
console.log(scenario.prompt);

let result = await chat.sendMessage(scenario.prompt);
let result = await chat.sendMessage(scenario.prompt, {
timeout: 5000,
});
let response = result.response;

while (turnCount < scenario.maxTurns) {
Expand Down Expand Up @@ -256,13 +275,27 @@ if (!apiKey) {
}

// Registers the eval suite: every .ts/.js file found in SCENARIOS_DIR becomes
// one node:test case that is executed through runSingleScenario.
void describe('Gemini Eval Scenarios', () => {
// A single TestServer, bound to the port returned by TestServer.randomPort(),
// is shared by all scenarios in this suite.
const server = new TestServer(TestServer.randomPort());

// Start the shared server once, before any scenario runs.
before(async () => {
await server.start();
});

// Stop the shared server once, after all scenarios have finished.
after(async () => {
await server.stop();
});

// NOTE(review): presumably restore() clears routes registered through
// addHtmlRoute so one scenario's page does not leak into the next — confirm
// against the TestServer implementation.
afterEach(() => {
server.restore();
});

// Scenario discovery is synchronous and purely extension-based.
const files = fs.readdirSync(SCENARIOS_DIR).filter(file => {
return file.endsWith('.ts') || file.endsWith('.js');
});

// NOTE(review): this is a rendered diff without +/- markers, so both the old
// and the new registration lines appear below. The two-argument
// runSingleScenario call looks like the pre-change version; the call that
// also passes `server` (with a 60_000 ms per-test timeout) looks like its
// replacement — confirm against the merged file.
for (const file of files) {
void test(file, async () => {
await runSingleScenario(path.join(SCENARIOS_DIR, file), apiKey);
void test(file, {timeout: 60_000}, async () => {
await runSingleScenario(path.join(SCENARIOS_DIR, file), apiKey, server);
});
}
});
24 changes: 24 additions & 0 deletions scripts/eval_scenarios/snapshot_test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import assert from 'node:assert';

import type {TestScenario} from '../eval_gemini.ts';

/**
 * Eval scenario: ask the model to read a small HTML page and check that it
 * opens the page and then snapshots it.
 *
 * - `prompt` carries the literal `<TEST_URL>` placeholder.
 * - `htmlRoute` describes the page (path and markup) the scenario needs.
 * - `expectations` throws an AssertionError unless exactly two tool calls were
 *   captured: `navigate_page` or `new_page` first, then `take_snapshot`.
 *   Failure messages include the tool names that were actually called so a
 *   failing eval run can be diagnosed from the log alone.
 */
export const scenario: TestScenario = {
  prompt: 'Read the content of <TEST_URL>',
  maxTurns: 3,
  htmlRoute: {
    path: '/test.html',
    htmlContent: '<h1>Hello World</h1><p>This is a test.</p>',
  },
  expectations: calls => {
    // Collect the names once so every failure message can show the full sequence.
    const names = calls.map(call => call.name);
    const observed = `[${names.join(', ')}]`;

    assert.strictEqual(
      names.length,
      2,
      `expected exactly 2 tool calls, got ${names.length}: ${observed}`,
    );

    const [first, second] = names;
    // Either tool is an acceptable way to open the page.
    assert.ok(
      first === 'navigate_page' || first === 'new_page',
      `expected first call to be navigate_page or new_page, got ${first} in ${observed}`,
    );
    assert.strictEqual(
      second,
      'take_snapshot',
      `expected second call to be take_snapshot, got ${second} in ${observed}`,
    );
  },
};
2 changes: 1 addition & 1 deletion tests/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import {before, after, afterEach} from 'node:test';

import {html} from './utils.js';

class TestServer {
export class TestServer {
#port: number;
#server: Server;

Expand Down