Skip to content

Commit dc39ab5

Browse files
committed
chore: support an html route in evals
1 parent 257b994 commit dc39ab5

3 files changed

Lines changed: 49 additions & 3 deletions

File tree

scripts/eval_gemini.ts

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import {StdioClientTransport} from '@modelcontextprotocol/sdk/client/stdio.js';
1818

1919
const ROOT_DIR = path.resolve(import.meta.dirname, '..');
2020
const SCENARIOS_DIR = path.join(import.meta.dirname, 'eval_scenarios');
21+
import {TestServer} from '../build/tests/server.js';
2122

2223
// Define schema for our test scenarios
2324
export interface CapturedFunctionCall {
@@ -29,6 +30,10 @@ export interface TestScenario {
2930
prompt: string;
3031
maxTurns: number;
3132
expectations: (calls: CapturedFunctionCall[]) => void;
33+
htmlRoute?: {
34+
path: string;
35+
htmlContent: string;
36+
};
3237
}
3338

3439
async function loadScenario(scenarioPath: string): Promise<TestScenario> {
@@ -84,10 +89,24 @@ async function runSingleScenario(
8489

8590
let client: Client | undefined;
8691
let transport: StdioClientTransport | undefined;
92+
let server: TestServer | undefined;
8793

8894
try {
8995
const scenario = await loadScenario(absolutePath);
9096

97+
if (scenario.htmlRoute) {
98+
server = new TestServer(TestServer.randomPort());
99+
await server.start();
100+
server.addHtmlRoute(
101+
scenario.htmlRoute.path,
102+
scenario.htmlRoute.htmlContent,
103+
);
104+
scenario.prompt = scenario.prompt.replace(
105+
'<TEST_URL>',
106+
server.getRoute(scenario.htmlRoute.path),
107+
);
108+
}
109+
91110
// Path to the compiled MCP server
92111
const serverPath = path.join(ROOT_DIR, 'build/src/index.js');
93112
if (!fs.existsSync(serverPath)) {
@@ -148,7 +167,7 @@ async function runSingleScenario(
148167

149168
const genAI = new GoogleGenerativeAI(apiKey);
150169
const model = genAI.getGenerativeModel({
151-
model: 'gemini-3-pro-preview',
170+
model: 'gemini-2.5-flash',
152171
tools: [{functionDeclarations}],
153172
});
154173

@@ -167,7 +186,9 @@ async function runSingleScenario(
167186
console.log(`\n--- Turn 1 (User) ---`);
168187
console.log(scenario.prompt);
169188

170-
let result = await chat.sendMessage(scenario.prompt);
189+
let result = await chat.sendMessage(scenario.prompt, {
190+
timeout: 5000,
191+
});
171192
let response = result.response;
172193

173194
while (turnCount < scenario.maxTurns) {
@@ -248,6 +269,7 @@ async function runSingleScenario(
248269
} finally {
249270
await client?.close();
250271
await transport?.close();
272+
await server?.stop();
251273
}
252274
}
253275
const apiKey = process.env.GEMINI_API_KEY;
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/**
2+
* @license
3+
* Copyright 2026 Google LLC
4+
* SPDX-License-Identifier: Apache-2.0
5+
*/
6+
import assert from 'node:assert';
7+
8+
import type {TestScenario} from '../eval_gemini.ts';
9+
10+
/**
 * Eval scenario: ask the model to read a page served from a local HTML route.
 *
 * - `prompt` contains the literal placeholder `<TEST_URL>`. When `htmlRoute`
 *   is set, the eval runner (scripts/eval_gemini.ts) starts a test server,
 *   registers `htmlContent` at `path`, and substitutes the placeholder with
 *   the URL of that route before sending the prompt.
 * - `maxTurns` is the turn limit the runner's conversation loop checks against.
 * - `expectations` requires exactly two captured tool calls: first a
 *   navigation (`navigate_page` or `new_page`), then `take_snapshot`.
 *   The assertions throw if the captured calls differ.
 */
export const scenario: TestScenario = {
11+
prompt: 'Read the content of <TEST_URL>',
12+
maxTurns: 3,
13+
htmlRoute: {
14+
path: '/test.html',
15+
htmlContent: '<h1>Hello World</h1><p>This is a test.</p>',
16+
},
17+
expectations: calls => {
18+
assert.strictEqual(calls.length, 2);
19+
assert.ok(
20+
calls[0].name === 'navigate_page' || calls[0].name === 'new_page',
21+
);
22+
assert.ok(calls[1].name === 'take_snapshot');
23+
},
24+
};

tests/server.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ import {before, after, afterEach} from 'node:test';
1313

1414
import {html} from './utils.js';
1515

16-
class TestServer {
16+
export class TestServer {
1717
#port: number;
1818
#server: Server;
1919

0 commit comments

Comments
 (0)