From ea5b97059015bfc559215ce727be6efe5eaee5d4 Mon Sep 17 00:00:00 2001 From: Alex Rudenko Date: Wed, 14 Jan 2026 18:07:33 +0100 Subject: [PATCH] chore: support an html route in evals --- scripts/eval_gemini.ts | 43 ++++++++++++++++++++++--- scripts/eval_scenarios/snapshot_test.ts | 24 ++++++++++++++ tests/server.ts | 2 +- 3 files changed, 63 insertions(+), 6 deletions(-) create mode 100644 scripts/eval_scenarios/snapshot_test.ts diff --git a/scripts/eval_gemini.ts b/scripts/eval_gemini.ts index 8163dc641..c099399ca 100644 --- a/scripts/eval_gemini.ts +++ b/scripts/eval_gemini.ts @@ -6,7 +6,7 @@ import fs from 'node:fs'; import path from 'node:path'; -import {describe, test} from 'node:test'; +import {describe, test, before, after, afterEach} from 'node:test'; import { GoogleGenerativeAI, @@ -18,6 +18,7 @@ import {StdioClientTransport} from '@modelcontextprotocol/sdk/client/stdio.js'; const ROOT_DIR = path.resolve(import.meta.dirname, '..'); const SCENARIOS_DIR = path.join(import.meta.dirname, 'eval_scenarios'); +import {TestServer} from '../build/tests/server.js'; // Define schema for our test scenarios export interface CapturedFunctionCall { @@ -29,6 +30,10 @@ export interface TestScenario { prompt: string; maxTurns: number; expectations: (calls: CapturedFunctionCall[]) => void; + htmlRoute?: { + path: string; + htmlContent: string; + }; } async function loadScenario(scenarioPath: string): Promise { @@ -78,6 +83,7 @@ const cleanSchemaRecursive = (schema: unknown): unknown => { async function runSingleScenario( scenarioPath: string, apiKey: string, + server: TestServer, ): Promise { const absolutePath = path.resolve(scenarioPath); console.log(`\n### Running Scenario: ${absolutePath} ###`); @@ -88,6 +94,17 @@ async function runSingleScenario( try { const scenario = await loadScenario(absolutePath); + if (scenario.htmlRoute) { + server.addHtmlRoute( + scenario.htmlRoute.path, + scenario.htmlRoute.htmlContent, + ); + scenario.prompt = scenario.prompt.replace( + '', + server.getRoute(scenario.htmlRoute.path), + ); + } + // Path to the compiled MCP server const serverPath = path.join(ROOT_DIR, 'build/src/index.js'); if (!fs.existsSync(serverPath)) { @@ -148,7 +165,7 @@ async function runSingleScenario( const genAI = new GoogleGenerativeAI(apiKey); const model = genAI.getGenerativeModel({ - model: 'gemini-3-pro-preview', + model: 'gemini-2.5-flash', tools: [{functionDeclarations}], }); @@ -167,7 +184,9 @@ async function runSingleScenario( console.log(`\n--- Turn 1 (User) ---`); console.log(scenario.prompt); - let result = await chat.sendMessage(scenario.prompt); + let result = await chat.sendMessage(scenario.prompt, { + timeout: 5000, + }); let response = result.response; while (turnCount < scenario.maxTurns) { @@ -256,13 +275,27 @@ if (!apiKey) { } void describe('Gemini Eval Scenarios', () => { + const server = new TestServer(TestServer.randomPort()); + + before(async () => { + await server.start(); + }); + + after(async () => { + await server.stop(); + }); + + afterEach(() => { + server.restore(); + }); + const files = fs.readdirSync(SCENARIOS_DIR).filter(file => { return file.endsWith('.ts') || file.endsWith('.js'); }); for (const file of files) { - void test(file, async () => { - await runSingleScenario(path.join(SCENARIOS_DIR, file), apiKey); + void test(file, {timeout: 60_000}, async () => { + await runSingleScenario(path.join(SCENARIOS_DIR, file), apiKey, server); }); } }); diff --git a/scripts/eval_scenarios/snapshot_test.ts b/scripts/eval_scenarios/snapshot_test.ts new file mode 100644 index 000000000..c5f43ff55 --- /dev/null +++ b/scripts/eval_scenarios/snapshot_test.ts @@ -0,0 +1,24 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ +import assert from 'node:assert'; + +import type {TestScenario} from '../eval_gemini.ts'; + +export const scenario: TestScenario = { + prompt: 'Read the content of ', + maxTurns: 3, + htmlRoute: { + path: '/test.html', + htmlContent: '

Hello World

This is a test.

', + }, + expectations: calls => { + assert.strictEqual(calls.length, 2); + assert.ok( + calls[0].name === 'navigate_page' || calls[0].name === 'new_page', + ); + assert.ok(calls[1].name === 'take_snapshot'); + }, +}; diff --git a/tests/server.ts b/tests/server.ts index fb6fcd5b4..7278861af 100644 --- a/tests/server.ts +++ b/tests/server.ts @@ -13,7 +13,7 @@ import {before, after, afterEach} from 'node:test'; import {html} from './utils.js'; -class TestServer { +export class TestServer { #port: number; #server: Server;