Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions scripts/eval_gemini.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ export interface TestScenario {
path: string;
htmlContent: string;
};
/** Extra CLI flags passed to the MCP server (e.g. '--experimental-page-id-routing'). */
serverArgs?: string[];
}

async function loadScenario(scenarioPath: string): Promise<TestScenario> {
Expand Down Expand Up @@ -117,6 +119,9 @@ async function runSingleScenario(
if (!debug) {
args.push('--headless');
}
if (scenario.serverArgs) {
args.push(...scenario.serverArgs);
}

transport = new StdioClientTransport({
command: 'node',
Expand Down
59 changes: 59 additions & 0 deletions scripts/eval_scenarios/page_focus_keyboard_test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/

import assert from 'node:assert';

import type {TestScenario} from '../eval_gemini.ts';

/**
 * Eval scenario: two pages share the isolated context "session" and the model
 * is asked to use `press_key` on Page 1 without selecting it first.
 *
 * The MCP server is started with `--experimental-page-id-routing`.
 * `expectations` accepts three call sequences:
 *  - no `select_page` at all (only the `press_key` attempt is required),
 *  - `select_page` before the first `press_key` (proactive selection),
 *  - `press_key` first, then `select_page`, then another `press_key`
 *    (recovery after the initial attempt).
 */
export const scenario: TestScenario = {
  serverArgs: ['--experimental-page-id-routing'],
  prompt: `Open two pages in the same isolated context "session":
- Page 1 at data:text/html,<textarea id="ta"></textarea>
- Page 2 at data:text/html,<h1>Other</h1>

Now use the press_key tool to type "a" on Page 1 without selecting it first. You must use press_key, not fill or type_text. If you encounter any errors, recover from them.`,
  maxTurns: 10,
  expectations: calls => {
    // Exactly two pages must be opened, both inside the "session" context.
    const openedPages = calls.filter(call => call.name === 'new_page');
    assert.strictEqual(openedPages.length, 2, 'Should open 2 pages');
    for (const opened of openedPages) {
      assert.strictEqual(opened.args.isolatedContext, 'session');
    }

    // The model has to try press_key at least once.
    const keyPresses = calls.filter(call => call.name === 'press_key');
    assert.ok(
      keyPresses.length >= 1,
      'Should attempt press_key at least once',
    );

    // Without any select_page the model recovered some other way; the
    // press_key attempt checked above is all that is required.
    const firstSelectAt = calls.findIndex(call => call.name === 'select_page');
    if (firstSelectAt === -1) {
      return;
    }

    const firstPressAt = calls.findIndex(call => call.name === 'press_key');
    const selectedBeforePressing = firstSelectAt < firstPressAt;

    if (selectedBeforePressing) {
      // Proactive path: the page was selected before any key press. The
      // assertion holds by construction and states the ordering explicitly.
      assert.ok(selectedBeforePressing, 'select_page should precede press_key');
      return;
    }

    // Recovery path: press_key came first, so a later press_key must follow
    // the select_page call.
    assert.ok(
      keyPresses.length >= 2,
      'Should retry press_key after error recovery',
    );
    const lastPressAt = calls.lastIndexOf(keyPresses[keyPresses.length - 1]);
    assert.ok(
      firstSelectAt < lastPressAt,
      'select_page should precede the successful press_key',
    );
  },
};
40 changes: 40 additions & 0 deletions scripts/eval_scenarios/page_id_routing_test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/

import assert from 'node:assert';

import type {TestScenario} from '../eval_gemini.ts';

/**
 * Eval scenario: two pages are opened in separate isolated contexts, each is
 * snapshotted, and the button on Page A is clicked afterwards.
 *
 * The MCP server is started with `--experimental-page-id-routing`.
 * `expectations` only inspects tool names and the `isolatedContext` argument
 * of `new_page`; it does not check which page a snapshot or click targeted.
 */
export const scenario: TestScenario = {
  serverArgs: ['--experimental-page-id-routing'],
  prompt: `Open two new pages in isolated contexts:
- Page A (isolatedContext "contextA") at data:text/html,<button>Click A</button>
- Page B (isolatedContext "contextB") at data:text/html,<button>Click B</button>
Then take a snapshot of Page A, take a snapshot of Page B, and then click the button on Page A.`,
  maxTurns: 12,
  expectations: calls => {
    // Number of recorded calls made to the given tool.
    const countOf = (toolName: string): number =>
      calls.reduce(
        (total: number, call) => (call.name === toolName ? total + 1 : total),
        0,
      );

    // Exactly two new_page calls, each carrying a string isolatedContext.
    const openedPages = calls.filter(call => call.name === 'new_page');
    assert.strictEqual(openedPages.length, 2, 'Should open 2 pages');
    openedPages.forEach(opened => {
      assert.strictEqual(
        typeof opened.args.isolatedContext,
        'string',
        'new_page should use isolatedContext',
      );
    });

    // One snapshot per page at minimum; the model may address pages by
    // pageId or call select_page before each snapshot.
    assert.ok(
      countOf('take_snapshot') >= 2,
      'Should take at least 2 snapshots',
    );

    // At least one click must happen (the prompt asks for Page A's button
    // after Page B has been snapshotted).
    assert.ok(
      calls.some(call => call.name === 'click'),
      'Should click the button on Page A',
    );
  },
};
Loading