From 7028a7762d52748a6e0ae7e94950c440626da5aa Mon Sep 17 00:00:00 2001 From: Alex Rudenko Date: Tue, 20 Jan 2026 11:30:17 +0100 Subject: [PATCH] chore: improve eval scenarios --- docs/tool-reference.md | 2 +- package-lock.json | 740 ++++++++++++++++++++++- package.json | 2 +- scripts/eval_gemini.ts | 234 ++----- scripts/eval_scenarios/console_test.ts | 15 +- scripts/eval_scenarios/emulation_test.ts | 6 +- scripts/eval_scenarios/input_test.ts | 22 +- scripts/eval_scenarios/network_test.ts | 15 +- src/tools/performance.ts | 5 +- 9 files changed, 830 insertions(+), 211 deletions(-) diff --git a/docs/tool-reference.md b/docs/tool-reference.md index 830644ca3..9dccd3e7e 100644 --- a/docs/tool-reference.md +++ b/docs/tool-reference.md @@ -234,7 +234,7 @@ **Parameters:** - **autoStop** (boolean) **(required)**: Determines if the trace recording should be automatically stopped. -- **reload** (boolean) **(required)**: Determines if, once tracing has started, the page should be automatically reloaded. +- **reload** (boolean) **(required)**: Determines if, once tracing has started, the current selected page should be automatically reloaded. Navigate the page to the right URL using the [`navigate_page`](#navigate_page) tool BEFORE starting the trace if reload or autoStop is set to true. - **filePath** (string) _(optional)_: The absolute file path, or a file path relative to the current working directory, to save the raw trace data. For example, trace.json.gz (compressed) or trace.json (uncompressed). --- diff --git a/package-lock.json b/package-lock.json index ad04fdb9c..396efc60a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -13,7 +13,7 @@ }, "devDependencies": { "@eslint/js": "^9.35.0", - "@google/generative-ai": "^0.24.1", + "@google/genai": "^1.37.0", "@modelcontextprotocol/sdk": "1.25.2", "@rollup/plugin-commonjs": "^29.0.0", "@rollup/plugin-json": "^6.1.0", @@ -321,14 +321,27 @@ "node": "^18.18.0 || ^20.9.0 || >=21.1.0" } }, - "node_modules/@google/generative-ai": { - "version": "0.24.1", - "resolved": "https://registry.npmjs.org/@google/generative-ai/-/generative-ai-0.24.1.tgz", - "integrity": "sha512-MqO+MLfM6kjxcKoy0p1wRzG3b4ZZXtPI+z2IE26UogS2Cm/XHO+7gGRBh6gcJsOiIVoH93UwKvW4HdgiOZCy9Q==", + "node_modules/@google/genai": { + "version": "1.37.0", + "resolved": "https://registry.npmjs.org/@google/genai/-/genai-1.37.0.tgz", + "integrity": "sha512-of85LwNqretPhBHHEljUY05OSsQVUPyw5n1RdNkUpKR36kgHq7CVVEGY4GplVbFaqbdT3DXfLSv5B/Avbe5vXw==", "dev": true, "license": "Apache-2.0", + "dependencies": { + "google-auth-library": "^10.3.0", + "protobufjs": "^7.5.4", + "ws": "^8.18.0" + }, "engines": { - "node": ">=18.0.0" + "node": ">=20.0.0" + }, + "peerDependencies": { + "@modelcontextprotocol/sdk": "^1.25.2" + }, + "peerDependenciesMeta": { + "@modelcontextprotocol/sdk": { + "optional": true + } } }, "node_modules/@hono/node-server": { @@ -396,6 +409,80 @@ "url": "https://github.com/sponsors/nzakas" } }, + "node_modules/@isaacs/cliui": { + "version": "8.0.2", + "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz", + "integrity": "sha512-O8jcjabXaleOG9DQ0+ARXWZBTfnP4WNAqzuiJK7ll44AmxGKv/J2M4TPjxjY3znBCfvBXFzucm1twdyFybFqEA==", + "dev": true, + "license": "ISC", + "dependencies": { + "string-width": "^5.1.2", + "string-width-cjs": "npm:string-width@^4.2.0", + "strip-ansi": "^7.0.1", + "strip-ansi-cjs": "npm:strip-ansi@^6.0.1", + "wrap-ansi": "^8.1.0", + "wrap-ansi-cjs": "npm:wrap-ansi@^7.0.0" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/@isaacs/cliui/node_modules/ansi-styles": { + "version": "6.2.3", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.3.tgz", + "integrity": "sha512-4Dj6M28JB+oAH8kFkTLUo+a2jwOFkuqb3yucU0CANcRRUbxS0cP0nZYCGjcc3BNXwRIsUVmDGgzawme7zvJHvg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/@isaacs/cliui/node_modules/emoji-regex": { + "version": "9.2.2", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-9.2.2.tgz", + "integrity": "sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@isaacs/cliui/node_modules/string-width": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-5.1.2.tgz", + "integrity": "sha512-HnLOCR3vjcY8beoNLtcjZ5/nxn2afmME6lhrDrebokqMap+XbeW8n9TXpPDOqdGK5qcI3oT0GKTW6wC7EMiVqA==", + "dev": true, + "license": "MIT", + "dependencies": { + "eastasianwidth": "^0.2.0", + "emoji-regex": "^9.2.2", + "strip-ansi": "^7.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/@isaacs/cliui/node_modules/wrap-ansi": { + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-8.1.0.tgz", + "integrity": "sha512-si7QWI6zUMq56bESFvagtmzMdGOtoxfR+Sez11Mobfc7tm+VkUckk9bW2UeffTGVUbOksxmSw0AA2gs8g71NCQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-styles": "^6.1.0", + "string-width": "^5.0.1", + "strip-ansi": "^7.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/wrap-ansi?sponsor=1" + } + }, "node_modules/@jridgewell/sourcemap-codec": { "version": "1.5.5", "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz", @@ -480,6 +567,91 @@ "@tybys/wasm-util": "^0.10.0" } }, + "node_modules/@pkgjs/parseargs": { + "version": "0.11.0", + "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz", + "integrity": "sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==", + "dev": true, + "license": "MIT", + "optional": true, + "engines": { + "node": ">=14" + } + }, + "node_modules/@protobufjs/aspromise": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz", + "integrity": "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==", + "dev": true, + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/base64": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz", + "integrity": "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==", + "dev": true, + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/codegen": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz", + "integrity": "sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==", + "dev": true, + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/eventemitter": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz", + "integrity": "sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==", + "dev": true, + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/fetch": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz", + "integrity": "sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==", + "dev": true, + "license": "BSD-3-Clause", + "dependencies": { + "@protobufjs/aspromise": "^1.1.1", + "@protobufjs/inquire": "^1.1.0" + } + }, + "node_modules/@protobufjs/float": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz", + "integrity": "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==", + "dev": true, + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/inquire": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz", + "integrity": "sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==", + "dev": true, + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/path": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz", + "integrity": "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==", + "dev": true, + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/pool": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz", + "integrity": "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==", + "dev": true, + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/utf8": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz", + "integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==", + "dev": true, + "license": "BSD-3-Clause" + }, "node_modules/@puppeteer/browsers": { "version": "2.11.1", "resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.11.1.tgz", @@ -2211,6 +2383,27 @@ "bare-path": "^3.0.0" } }, + "node_modules/base64-js": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", + "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, "node_modules/basic-ftp": { "version": "5.1.0", "resolved": "https://registry.npmjs.org/basic-ftp/-/basic-ftp-5.1.0.tgz", @@ -2221,6 +2414,16 @@ "node": ">=10.0.0" } }, + "node_modules/bignumber.js": { + "version": "9.3.1", + "resolved": "https://registry.npmjs.org/bignumber.js/-/bignumber.js-9.3.1.tgz", + "integrity": "sha512-Ko0uX15oIUS7wJ3Rb30Fs6SkVbLmPBAKdlm7q9+ak9bbIeFf0MwuBsQV6z7+X768/cHsfg+WlysDWJcmthjsjQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": "*" + } + }, "node_modules/body-parser": { "version": "2.2.1", "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-2.2.1.tgz", @@ -2266,6 +2469,13 @@ "node": "*" } }, + "node_modules/buffer-equal-constant-time": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/buffer-equal-constant-time/-/buffer-equal-constant-time-1.0.1.tgz", + "integrity": "sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==", + "dev": true, + "license": "BSD-3-Clause" + }, "node_modules/bytes": { "version": "3.1.2", "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", @@ -2746,6 +2956,23 @@ "node": ">= 0.4" } }, + "node_modules/eastasianwidth": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz", + "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==", + "dev": true, + "license": "MIT" + }, + "node_modules/ecdsa-sig-formatter": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/ecdsa-sig-formatter/-/ecdsa-sig-formatter-1.0.11.tgz", + "integrity": "sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "safe-buffer": "^5.0.1" + } + }, "node_modules/ee-first": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz", @@ -3501,6 +3728,13 @@ "express": ">= 4.11" } }, + "node_modules/extend": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", + "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==", + "dev": true, + "license": "MIT" + }, "node_modules/extract-zip": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-2.0.1.tgz", @@ -3595,6 +3829,30 @@ } } }, + "node_modules/fetch-blob": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/fetch-blob/-/fetch-blob-3.2.0.tgz", + "integrity": "sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/jimmywarting" + }, + { + "type": "paypal", + "url": "https://paypal.me/jimmywarting" + } + ], + "license": "MIT", + "dependencies": { + "node-domexception": "^1.0.0", + "web-streams-polyfill": "^3.0.3" + }, + "engines": { + "node": "^12.20 || >= 14.13" + } + }, "node_modules/file-entry-cache": { "version": "8.0.0", "resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-8.0.0.tgz", @@ -3680,6 +3938,36 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/foreground-child": { + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.1.tgz", + "integrity": "sha512-gIXjKqtFuWEgzFRJA9WCQeSJLZDjgJUOMCMzxtvFq/37KojM1BFGufqsCy0r4qSQmYLsZYMeyRqzIWOMup03sw==", + "dev": true, + "license": "ISC", + "dependencies": { + "cross-spawn": "^7.0.6", + "signal-exit": "^4.0.1" + }, + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/formdata-polyfill": { + "version": "4.0.10", + "resolved": "https://registry.npmjs.org/formdata-polyfill/-/formdata-polyfill-4.0.10.tgz", + "integrity": "sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g==", + "dev": true, + "license": "MIT", + "dependencies": { + "fetch-blob": "^3.1.2" + }, + "engines": { + "node": ">=12.20.0" + } + }, "node_modules/forwarded": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", @@ -3756,6 +4044,37 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/gaxios": { + "version": "7.1.3", + "resolved": "https://registry.npmjs.org/gaxios/-/gaxios-7.1.3.tgz", + "integrity": "sha512-YGGyuEdVIjqxkxVH1pUTMY/XtmmsApXrCVv5EU25iX6inEPbV+VakJfLealkBtJN69AQmh1eGOdCl9Sm1UP6XQ==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "extend": "^3.0.2", + "https-proxy-agent": "^7.0.1", + "node-fetch": "^3.3.2", + "rimraf": "^5.0.1" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/gcp-metadata": { + "version": "8.1.2", + "resolved": "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-8.1.2.tgz", + "integrity": "sha512-zV/5HKTfCeKWnxG0Dmrw51hEWFGfcF2xiXqcA3+J90WDuP0SvoiSO5ORvcBsifmx/FoIjgQN3oNOGaQ5PhLFkg==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "gaxios": "^7.0.0", + "google-logging-utils": "^1.0.0", + "json-bigint": "^1.0.0" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/generator-function": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/generator-function/-/generator-function-2.0.1.tgz", @@ -3890,6 +4209,27 @@ "node": ">= 14" } }, + "node_modules/glob": { + "version": "10.5.0", + "resolved": "https://registry.npmjs.org/glob/-/glob-10.5.0.tgz", + "integrity": "sha512-DfXN8DfhJ7NH3Oe7cFmu3NCu1wKbkReJ8TorzSAFbSKrlNaQSKfIzqYqVY8zlbs2NLBbWpRiU52GX2PbaBVNkg==", + "dev": true, + "license": "ISC", + "dependencies": { + "foreground-child": "^3.1.0", + "jackspeak": "^3.1.2", + "minimatch": "^9.0.4", + "minipass": "^7.1.2", + "package-json-from-dist": "^1.0.0", + "path-scurry": "^1.11.1" + }, + "bin": { + "glob": "dist/esm/bin.mjs" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, "node_modules/glob-parent": { "version": "6.0.2", "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-6.0.2.tgz", @@ -3933,6 +4273,35 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/google-auth-library": { + "version": "10.5.0", + "resolved": "https://registry.npmjs.org/google-auth-library/-/google-auth-library-10.5.0.tgz", + "integrity": "sha512-7ABviyMOlX5hIVD60YOfHw4/CxOfBhyduaYB+wbFWCWoni4N7SLcV46hrVRktuBbZjFC9ONyqamZITN7q3n32w==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "base64-js": "^1.3.0", + "ecdsa-sig-formatter": "^1.0.11", + "gaxios": "^7.0.0", + "gcp-metadata": "^8.0.0", + "google-logging-utils": "^1.0.0", + "gtoken": "^8.0.0", + "jws": "^4.0.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/google-logging-utils": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/google-logging-utils/-/google-logging-utils-1.1.3.tgz", + "integrity": "sha512-eAmLkjDjAFCVXg7A1unxHsLf961m6y17QFqXqAXGj/gVkKFrEICfStRfwUlGNfeCEjNRa32JEWOUTlYXPyyKvA==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": ">=14" + } + }, "node_modules/gopd": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", @@ -3946,6 +4315,20 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/gtoken": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/gtoken/-/gtoken-8.0.0.tgz", + "integrity": "sha512-+CqsMbHPiSTdtSO14O51eMNlrp9N79gmeqmXeouJOhfucAedHw9noVe/n5uJk3tbKE6a+6ZCQg3RPhVhHByAIw==", + "dev": true, + "license": "MIT", + "dependencies": { + "gaxios": "^7.0.0", + "jws": "^4.0.0" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/has-bigints": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/has-bigints/-/has-bigints-1.1.0.tgz", @@ -4639,6 +5022,22 @@ "dev": true, "license": "ISC" }, + "node_modules/jackspeak": { + "version": "3.4.3", + "resolved": "https://registry.npmjs.org/jackspeak/-/jackspeak-3.4.3.tgz", + "integrity": "sha512-OGlZQpz2yfahA/Rd1Y8Cd9SIEsqvXkLVoSw/cgwhnhFMDbsQFeZYoJJ7bIZBS9BcamUW96asq/npPWugM+RQBw==", + "dev": true, + "license": "BlueOak-1.0.0", + "dependencies": { + "@isaacs/cliui": "^8.0.2" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + }, + "optionalDependencies": { + "@pkgjs/parseargs": "^0.11.0" + } + }, "node_modules/jose": { "version": "6.1.3", "resolved": "https://registry.npmjs.org/jose/-/jose-6.1.3.tgz", @@ -4694,6 +5093,16 @@ "js-yaml": "bin/js-yaml.js" } }, + "node_modules/json-bigint": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/json-bigint/-/json-bigint-1.0.0.tgz", + "integrity": "sha512-SiPv/8VpZuWbvLSMtTDU8hEfrZWg/mH/nV/b4o0CYbSxu1UIQPLdwKOCIyLQX+VIPO5vrLX3i8qtqFyhdPSUSQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "bignumber.js": "^9.0.0" + } + }, "node_modules/json-buffer": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.1.tgz", @@ -4742,6 +5151,29 @@ "json5": "lib/cli.js" } }, + "node_modules/jwa": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.1.tgz", + "integrity": "sha512-hRF04fqJIP8Abbkq5NKGN0Bbr3JxlQ+qhZufXVr0DvujKy93ZCbXZMHDL4EOtodSbCWxOqR8MS1tXA5hwqCXDg==", + "dev": true, + "license": "MIT", + "dependencies": { + "buffer-equal-constant-time": "^1.0.1", + "ecdsa-sig-formatter": "1.0.11", + "safe-buffer": "^5.0.1" + } + }, + "node_modules/jws": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/jws/-/jws-4.0.1.tgz", + "integrity": "sha512-EKI/M/yqPncGUUh44xz0PxSidXFr/+r0pA70+gIYhjv+et7yxM+s29Y+VGDkovRofQem0fs7Uvf4+YmAdyRduA==", + "dev": true, + "license": "MIT", + "dependencies": { + "jwa": "^2.0.1", + "safe-buffer": "^5.0.1" + } + }, "node_modules/keyv": { "version": "4.5.4", "resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz", @@ -4803,6 +5235,13 @@ "dev": true, "license": "MIT" }, + "node_modules/long": { + "version": "5.3.2", + "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz", + "integrity": "sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==", + "dev": true, + "license": "Apache-2.0" + }, "node_modules/lru-cache": { "version": "7.18.3", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-7.18.3.tgz", @@ -4905,6 +5344,16 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/minipass": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.2.tgz", + "integrity": "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==", + "dev": true, + "license": "ISC", + "engines": { + "node": ">=16 || 14 >=14.17" + } + }, "node_modules/mitt": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/mitt/-/mitt-3.0.1.tgz", @@ -4972,6 +5421,56 @@ "node": ">= 0.4.0" } }, + "node_modules/node-domexception": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz", + "integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==", + "deprecated": "Use your platform's native DOMException instead", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/jimmywarting" + }, + { + "type": "github", + "url": "https://paypal.me/jimmywarting" + } + ], + "license": "MIT", + "engines": { + "node": ">=10.5.0" + } + }, + "node_modules/node-fetch": { + "version": "3.3.2", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-3.3.2.tgz", + "integrity": "sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==", + "dev": true, + "license": "MIT", + "dependencies": { + "data-uri-to-buffer": "^4.0.0", + "fetch-blob": "^3.1.4", + "formdata-polyfill": "^4.0.10" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/node-fetch" + } + }, + "node_modules/node-fetch/node_modules/data-uri-to-buffer": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-4.0.1.tgz", + "integrity": "sha512-0R9ikRb668HB7QDxT1vkpuUBtqc53YyAwMwGeUFKRojY/NWKvdZ+9UYtRfGmhqNbRkTSVpMbmyhXipFFv2cb/A==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 12" + } + }, "node_modules/object-assign": { "version": "4.1.1", "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", @@ -5204,6 +5703,13 @@ "node": ">= 14" } }, + "node_modules/package-json-from-dist": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/package-json-from-dist/-/package-json-from-dist-1.0.1.tgz", + "integrity": "sha512-UEZIS3/by4OC8vL3P2dTXRETpebLI2NiI5vIrjaD/5UtrkFX/tNbwjTSRAGC/+7CAo2pIcBaRgWmcBBHcsaCIw==", + "dev": true, + "license": "BlueOak-1.0.0" + }, "node_modules/package-name-regex": { "version": "2.0.6", "resolved": "https://registry.npmjs.org/package-name-regex/-/package-name-regex-2.0.6.tgz", @@ -5286,6 +5792,30 @@ "dev": true, "license": "MIT" }, + "node_modules/path-scurry": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-1.11.1.tgz", + "integrity": "sha512-Xa4Nw17FS9ApQFJ9umLiJS4orGjm7ZzwUrwamcGQuHSzDyth9boKDaycYdDcZDuqYATXw4HFXgaqWTctW/v1HA==", + "dev": true, + "license": "BlueOak-1.0.0", + "dependencies": { + "lru-cache": "^10.2.0", + "minipass": "^5.0.0 || ^6.0.2 || ^7.0.0" + }, + "engines": { + "node": ">=16 || 14 >=14.18" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/path-scurry/node_modules/lru-cache": { + "version": "10.4.3", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.4.3.tgz", + "integrity": "sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==", + "dev": true, + "license": "ISC" + }, "node_modules/path-to-regexp": { "version": "8.3.0", "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-8.3.0.tgz", @@ -5390,6 +5920,31 @@ "node": ">=0.4.0" } }, + "node_modules/protobufjs": { + "version": "7.5.4", + "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.5.4.tgz", + "integrity": "sha512-CvexbZtbov6jW2eXAvLukXjXUW1TzFaivC46BpWc/3BpcCysb5Vffu+B3XHMm8lVEuy2Mm4XGex8hBSg1yapPg==", + "dev": true, + "hasInstallScript": true, + "license": "BSD-3-Clause", + "dependencies": { + "@protobufjs/aspromise": "^1.1.2", + "@protobufjs/base64": "^1.1.2", + "@protobufjs/codegen": "^2.0.4", + "@protobufjs/eventemitter": "^1.1.0", + "@protobufjs/fetch": "^1.1.0", + "@protobufjs/float": "^1.0.2", + "@protobufjs/inquire": "^1.1.0", + "@protobufjs/path": "^1.1.2", + "@protobufjs/pool": "^1.1.0", + "@protobufjs/utf8": "^1.1.0", + "@types/node": ">=13.7.0", + "long": "^5.0.0" + }, + "engines": { + "node": ">=12.0.0" + } + }, "node_modules/proxy-addr": { "version": "2.0.7", "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz", @@ -5640,6 +6195,22 @@ "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1" } }, + "node_modules/rimraf": { + "version": "5.0.10", + "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-5.0.10.tgz", + "integrity": "sha512-l0OE8wL34P4nJH/H2ffoaniAokM2qSmrtXHmlpvYr5AVVX8msAyW0l8NVJFDxlSK4u3Uh/f41cQheDVdnYijwQ==", + "dev": true, + "license": "ISC", + "dependencies": { + "glob": "^10.3.7" + }, + "bin": { + "rimraf": "dist/esm/bin.mjs" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, "node_modules/rollup": { "version": "4.55.1", "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.55.1.tgz", @@ -6049,6 +6620,19 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/signal-exit": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz", + "integrity": "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==", + "dev": true, + "license": "ISC", + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, "node_modules/sinon": { "version": "21.0.1", "resolved": "https://registry.npmjs.org/sinon/-/sinon-21.0.1.tgz", @@ -6267,6 +6851,52 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/string-width-cjs": { + "name": "string-width", + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "dev": true, + "license": "MIT", + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/string-width-cjs/node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/string-width-cjs/node_modules/emoji-regex": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", + "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", + "dev": true, + "license": "MIT" + }, + "node_modules/string-width-cjs/node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/string.prototype.trim": { "version": "1.2.10", "resolved": "https://registry.npmjs.org/string.prototype.trim/-/string.prototype.trim-1.2.10.tgz", @@ -6342,6 +6972,30 @@ "url": "https://github.com/chalk/strip-ansi?sponsor=1" } }, + "node_modules/strip-ansi-cjs": { + "name": "strip-ansi", + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/strip-ansi-cjs/node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + } + }, "node_modules/strip-bom": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-3.0.0.tgz", @@ -6740,6 +7394,16 @@ "node": ">= 0.8" } }, + "node_modules/web-streams-polyfill": { + "version": "3.3.3", + "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.3.3.tgz", + "integrity": "sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 8" + } + }, "node_modules/webdriver-bidi-protocol": { "version": "0.3.10", "resolved": "https://registry.npmjs.org/webdriver-bidi-protocol/-/webdriver-bidi-protocol-0.3.10.tgz", @@ -6880,6 +7544,70 @@ "url": "https://github.com/chalk/wrap-ansi?sponsor=1" } }, + "node_modules/wrap-ansi-cjs": { + "name": "wrap-ansi", + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", + "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-styles": "^4.0.0", + "string-width": "^4.1.0", + "strip-ansi": "^6.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/wrap-ansi?sponsor=1" + } + }, + "node_modules/wrap-ansi-cjs/node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/wrap-ansi-cjs/node_modules/emoji-regex": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", + "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", + "dev": true, + "license": "MIT" + }, + "node_modules/wrap-ansi-cjs/node_modules/string-width": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "dev": true, + "license": "MIT", + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/wrap-ansi-cjs/node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/wrap-ansi/node_modules/ansi-styles": { "version": "6.2.3", "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.3.tgz", diff --git a/package.json b/package.json index c9ad6c709..bfcae662c 100644 --- a/package.json +++ b/package.json @@ -40,7 +40,7 @@ "mcpName": "io.github.ChromeDevTools/chrome-devtools-mcp", "devDependencies": { "@eslint/js": "^9.35.0", - "@google/generative-ai": "^0.24.1", + "@google/genai": "^1.37.0", "@modelcontextprotocol/sdk": "1.25.2", "@rollup/plugin-commonjs": "^29.0.0", "@rollup/plugin-json": "^6.1.0", diff --git a/scripts/eval_gemini.ts b/scripts/eval_gemini.ts index 462e6af7d..6a3e9662d 100644 --- a/scripts/eval_gemini.ts +++ b/scripts/eval_gemini.ts @@ -8,11 +8,7 @@ import fs from 'node:fs'; import path from 'node:path'; import {parseArgs} from 'node:util'; -import { - GoogleGenerativeAI, - type FunctionDeclaration, - SchemaType, -} from '@google/generative-ai'; +import {GoogleGenAI, mcpToTool} from '@google/genai'; import {Client} from '@modelcontextprotocol/sdk/client/index.js'; import {StdioClientTransport} from '@modelcontextprotocol/sdk/client/stdio.js'; @@ -47,37 +43,6 @@ async function loadScenario(scenarioPath: string): Promise { return module.scenario; } -function isRecord(v: unknown): v is Record { - return typeof v === 'object' && v !== null && !Array.isArray(v); -} - -const cleanSchemaRecursive = (schema: unknown): unknown => { - if (!isRecord(schema)) { - return schema; - } - - const out: Record = {}; - for (const key in schema) { - if ( - key === 'default' || - key === 'additionalProperties' || - key === 'exclusiveMinimum' - ) { - continue; - } - - const value = schema[key]; - if (Array.isArray(value)) { - out[key] = value.map(cleanSchemaRecursive); - } else if (isRecord(value)) { - out[key] = cleanSchemaRecursive(value); - } else { - out[key] = value; - } - } - return out; -}; - async function runSingleScenario( scenarioPath: string, apiKey: string, @@ -99,7 +64,12 @@ async function runSingleScenario( let transport: StdioClientTransport | undefined; try { - const scenario = await loadScenario(absolutePath); + const loadedScenario = await loadScenario(absolutePath); + const scenario = {...loadedScenario}; + + // Append random queryid to avoid caching issues and test distinct runs + const randomId = Math.floor(Math.random() * 1000000); + scenario.prompt = `${scenario.prompt}\nqueryid=${randomId}`; if (scenario.htmlRoute) { server.addHtmlRoute( @@ -147,137 +117,46 @@ async function runSingleScenario( await client.connect(transport); - const toolsResult = await client.listTools(); - const mcpTools = toolsResult.tools; - - // Convert MCP tools to Gemini function declarations - const functionDeclarations: FunctionDeclaration[] = mcpTools.map(tool => ({ - name: tool.name.replace(/-/g, '_').replace(/\./g, '_'), // Sanitize name for Gemini - description: tool.description?.substring(0, 1024) || '', - parameters: cleanSchemaRecursive({ - type: SchemaType.OBJECT, - properties: - isRecord(tool.inputSchema) && 'properties' in tool.inputSchema - ? tool.inputSchema.properties - : {}, - required: - isRecord(tool.inputSchema) && - 'required' in tool.inputSchema && - Array.isArray(tool.inputSchema.required) - ? tool.inputSchema.required - : [], - }) as FunctionDeclaration['parameters'], - })); - - // Keep a map of sanitized names to original names for execution - const contentToolsMap = new Map(); - for (const tool of mcpTools) { - const sanitized = tool.name.replace(/-/g, '_').replace(/\./g, '_'); - contentToolsMap.set(sanitized, tool.name); - } - - const genAI = new GoogleGenerativeAI(apiKey); - const model = genAI.getGenerativeModel({ - model: modelId, - tools: [{functionDeclarations}], - }); - - const chat = model.startChat({ - systemInstruction: { - role: 'system', - parts: [{text: `Use available tools.`}], - }, - }); - - const expectations = scenario.expectations; const allCalls: CapturedFunctionCall[] = []; + const originalCallTool = client.callTool.bind(client); + client.callTool = async (request, schema) => { + // NOTE: request.name is the original name as the MCP client sees it. + // mcpToTool handles the conversion from Gemini sanitized name to original name. + debugLog( + `Executing tool: ${request.name} with args: ${JSON.stringify(request.arguments)}`, + ); + allCalls.push({ + name: request.name, + args: (request.arguments as Record) || {}, + }); + return originalCallTool(request, schema); + }; - // Execute turns - let turnCount = 0; - debugLog(`\n--- Turn 1 (User) ---`); - debugLog(scenario.prompt); - - let result = await chat.sendMessage(scenario.prompt, { - timeout: 5000, - }); - let response = result.response; - - while (turnCount < scenario.maxTurns) { - turnCount++; - debugLog(`\n--- Turn ${turnCount} (Model) ---`); - const text = response.text(); - if (text) { - debugLog(`Text: ${text}`); - } - - const functionCalls = response.functionCalls(); - if (functionCalls && functionCalls.length > 0) { - debugLog(`Function Calls: ${JSON.stringify(functionCalls, null, 2)}`); - - const functionResponses = []; - for (const call of functionCalls) { - const originalName = contentToolsMap.get(call.name); - if (!originalName) { - console.error(`Unknown tool called: ${call.name}`); - functionResponses.push({ - functionResponse: { - name: call.name, - response: {error: `Unknown tool: ${call.name}`}, - }, - }); - continue; - } - - const safeArgs = isRecord(call.args) ? call.args : {}; + const ai = new GoogleGenAI({apiKey}); - debugLog( - `Executing tool: ${originalName} with args: ${JSON.stringify(call.args)}`, - ); + debugLog(`\n--- Prompt ---\n${scenario.prompt}`); - allCalls.push({ - name: originalName, - args: safeArgs, - }); - - try { - const toolResult = await client.callTool({ - name: originalName, - arguments: safeArgs, - }); - - functionResponses.push({ - functionResponse: { - name: call.name, - response: {name: call.name, content: toolResult}, - }, - }); - } catch (e) { - const errorMessage = e instanceof Error ? e.message : String(e); - console.error(`Error executing tool ${originalName}:`, e); - functionResponses.push({ - functionResponse: { - name: call.name, - response: {error: errorMessage}, - }, - }); - } - } + const result = await ai.models.generateContent({ + model: modelId, + contents: scenario.prompt, + config: { + tools: [mcpToTool(client)], + automaticFunctionCalling: { + maximumRemoteCalls: scenario.maxTurns, + }, + }, + }); - // Send tool results back - debugLog(`Sending ${functionResponses.length} tool outputs back...`); - result = await chat.sendMessage(functionResponses); - response = result.response; - } else { - debugLog('No tool calls. Interaction finished.'); - break; - } - } + debugLog(`\n--- Response ---\n${result.text}`); debugLog('\nVerifying expectations...'); - expectations(allCalls); + scenario.expectations(allCalls); } finally { - await client?.close(); - await transport?.close(); + try { + await client?.close(); + } catch (e) { + console.error('Error closing client:', e); + } } } @@ -297,12 +176,18 @@ async function main() { type: 'boolean', default: false, }, + repeat: { + type: 'boolean', + default: false, + }, }, allowPositionals: true, }); const modelId = values.model; const debug = values.debug; + const repeat = values.repeat; + const scenarioFiles = positionals.length > 0 ? positionals.map(p => path.resolve(p)) @@ -319,16 +204,25 @@ async function main() { try { for (const scenarioPath of scenarioFiles) { - try { - await runSingleScenario(scenarioPath, apiKey, server, modelId, debug); - console.log(`✔ ${path.relative(ROOT_DIR, scenarioPath)}`); - successCount++; - } catch (e) { - console.error(`✖ ${path.relative(ROOT_DIR, scenarioPath)}`); - console.error(e); - failureCount++; - } finally { - server.restore(); + for (let i = 1; i <= (repeat ? 3 : 1); i++) { + try { + if (debug) { + console.log( + `Running scenario: ${path.relative(ROOT_DIR, scenarioPath)} (Run ${i}/3)`, + ); + } + await runSingleScenario(scenarioPath, apiKey, server, modelId, debug); + console.log(`✔ ${path.relative(ROOT_DIR, scenarioPath)} (Run ${i})`); + successCount++; + } catch (e) { + console.error( + `✖ ${path.relative(ROOT_DIR, scenarioPath)} (Run ${i})`, + ); + console.error(e); + failureCount++; + } finally { + server.restore(); + } } } } finally { diff --git a/scripts/eval_scenarios/console_test.ts b/scripts/eval_scenarios/console_test.ts index 94ce4da92..d82abc2a3 100644 --- a/scripts/eval_scenarios/console_test.ts +++ b/scripts/eval_scenarios/console_test.ts @@ -21,12 +21,15 @@ export const scenario: TestScenario = { `, }, expectations: calls => { - const navigate = calls.find( - c => c.name === 'navigate_page' || c.name === 'new_page', + assert.strictEqual(calls.length, 2); + assert.ok( + calls[0].name === 'navigate_page' || calls[0].name === 'new_page', + 'First call should be navigation', + ); + assert.strictEqual( + calls[1].name, + 'list_console_messages', + 'Second call should be list_console_messages', ); - const listMessages = calls.find(c => c.name === 'list_console_messages'); - - assert.ok(navigate, 'Should navigate to the page'); - assert.ok(listMessages, 'Should list console messages'); }, }; diff --git a/scripts/eval_scenarios/emulation_test.ts b/scripts/eval_scenarios/emulation_test.ts index 4b6f01c81..2eebac836 100644 --- a/scripts/eval_scenarios/emulation_test.ts +++ b/scripts/eval_scenarios/emulation_test.ts @@ -12,8 +12,8 @@ export const scenario: TestScenario = { prompt: 'Emulate offline network conditions.', maxTurns: 2, expectations: calls => { - const emulate = calls.find(c => c.name === 'emulate'); - assert.ok(emulate, 'Should call emulate tool'); - assert.strictEqual(emulate.args.networkConditions, 'Offline'); + assert.strictEqual(calls.length, 1); + assert.strictEqual(calls[0].name, 'emulate'); + assert.strictEqual(calls[0].args.networkConditions, 'Offline'); }, }; diff --git a/scripts/eval_scenarios/input_test.ts b/scripts/eval_scenarios/input_test.ts index ca1e28d25..6078e7f96 100644 --- a/scripts/eval_scenarios/input_test.ts +++ b/scripts/eval_scenarios/input_test.ts @@ -11,7 +11,7 @@ import type {TestScenario} from '../eval_gemini.ts'; export const scenario: TestScenario = { prompt: 'Go to , fill the input with "hello world" and click the button.', - maxTurns: 3, + maxTurns: 4, htmlRoute: { path: '/input_test.html', htmlContent: ` @@ -20,20 +20,12 @@ export const scenario: TestScenario = { `, }, expectations: calls => { - // Expected sequence: navigate -> fill -> click - // But model might take snapshot in between or do things in parallel if supported (but standard loop is sequential turns usually) - // We just check if the tools were called. - - const navigate = calls.find( - c => c.name === 'navigate_page' || c.name === 'new_page', + assert.strictEqual(calls.length, 4); + assert.ok( + calls[0].name === 'navigate_page' || calls[0].name === 'new_page', ); - const fill = calls.find(c => c.name === 'fill'); - const click = calls.find(c => c.name === 'click'); - - assert.ok(navigate, 'Should navigate to the page'); - assert.ok(fill, 'Should fill the input'); - assert.ok(click, 'Should click the button'); - - assert.strictEqual(fill.args.value, 'hello world'); + assert.ok(calls[1].name === 'take_snapshot'); + assert.ok(calls[2].name === 'fill'); + assert.ok(calls[3].name === 'click'); }, }; diff --git a/scripts/eval_scenarios/network_test.ts b/scripts/eval_scenarios/network_test.ts index 480c3ec7b..bacb26ac6 100644 --- a/scripts/eval_scenarios/network_test.ts +++ b/scripts/eval_scenarios/network_test.ts @@ -21,12 +21,15 @@ export const scenario: TestScenario = { `, }, expectations: calls => { - const navigate = calls.find( - c => c.name === 'navigate_page' || c.name === 'new_page', + assert.strictEqual(calls.length, 2); + assert.ok( + calls[0].name === 'navigate_page' || calls[0].name === 'new_page', + 'First call should be navigation', + ); + assert.strictEqual( + calls[1].name, + 'list_network_requests', + 'Second call should be list_network_requests', ); - const listRequests = calls.find(c => c.name === 'list_network_requests'); - - assert.ok(navigate, 'Should navigate to the page'); - assert.ok(listRequests, 'Should list network requests'); }, }; diff --git a/src/tools/performance.ts b/src/tools/performance.ts index a6f8e718e..fb3728d16 100644 --- a/src/tools/performance.ts +++ b/src/tools/performance.ts @@ -30,8 +30,7 @@ const filePathSchema = zod export const startTrace = defineTool({ name: 'performance_start_trace', - description: - 'Starts a performance trace recording on the selected page. This can be used to look for performance problems and insights to improve the performance of the page. It will also report Core Web Vital (CWV) scores for the page.', + description: `Starts a performance trace recording on the selected page. This can be used to look for performance problems and insights to improve the performance of the page. It will also report Core Web Vital (CWV) scores for the page.`, annotations: { category: ToolCategory.PERFORMANCE, readOnlyHint: false, @@ -40,7 +39,7 @@ export const startTrace = defineTool({ reload: zod .boolean() .describe( - 'Determines if, once tracing has started, the page should be automatically reloaded.', + 'Determines if, once tracing has started, the current selected page should be automatically reloaded. Navigate the page to the right URL using the navigate_page tool BEFORE starting the trace if reload or autoStop is set to true.', ), autoStop: zod .boolean()