diff --git a/scripts/generate-docs.ts b/scripts/generate-docs.ts index 47740b5fc..9e59bdaf5 100644 --- a/scripts/generate-docs.ts +++ b/scripts/generate-docs.ts @@ -20,6 +20,7 @@ interface ToolWithAnnotations extends Tool { annotations?: { title?: string; category?: typeof ToolCategory; + conditions?: string[]; }; } @@ -265,31 +266,39 @@ async function generateToolDocumentation(): Promise { console.log('Generating tool documentation from definitions...'); // Convert ToolDefinitions to ToolWithAnnotations - const toolsWithAnnotations: ToolWithAnnotations[] = tools.map(tool => { - const properties: Record = {}; - const required: string[] = []; - - for (const [key, schema] of Object.entries( - tool.schema as unknown as Record, - )) { - const info = getZodTypeInfo(schema); - properties[key] = info; - if (isRequired(schema)) { - required.push(key); + const toolsWithAnnotations: ToolWithAnnotations[] = tools + .filter(tool => { + if (!tool.annotations.conditions) { + return true; + } + // Only include unconditional tools. + return tool.annotations.conditions.length === 0; + }) + .map(tool => { + const properties: Record = {}; + const required: string[] = []; + + for (const [key, schema] of Object.entries( + tool.schema as unknown as Record, + )) { + const info = getZodTypeInfo(schema); + properties[key] = info; + if (isRequired(schema)) { + required.push(key); + } } - } - return { - name: tool.name, - description: tool.description, - inputSchema: { - type: 'object', - properties, - required, - }, - annotations: tool.annotations, - }; - }); + return { + name: tool.name, + description: tool.description, + inputSchema: { + type: 'object', + properties, + required, + }, + annotations: tool.annotations, + }; + }); console.log(`Found ${toolsWithAnnotations.length} tools`); diff --git a/src/cli.ts b/src/cli.ts index db2680587..e1a623e78 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -152,6 +152,11 @@ export const cliOptions = { describe: 'Whether to enable automation over DevTools targets', hidden: true, }, + experimentalVision: { + type: 'boolean', + describe: 'Whether to enable vision tools', + hidden: true, + }, experimentalIncludeAllPages: { type: 'boolean', describe: diff --git a/src/main.ts b/src/main.ts index 84bb6d9b5..ab6ee3019 100644 --- a/src/main.ts +++ b/src/main.ts @@ -121,6 +121,12 @@ function registerTool(tool: ToolDefinition): void { ) { return; } + if ( + tool.annotations.conditions?.includes('computerVision') && + !args.experimentalVision + ) { + return; + } server.registerTool( tool.name, { diff --git a/src/tools/ToolDefinition.ts b/src/tools/ToolDefinition.ts index f3eaaa7eb..909feec13 100644 --- a/src/tools/ToolDefinition.ts +++ b/src/tools/ToolDefinition.ts @@ -24,6 +24,7 @@ export interface ToolDefinition< * If true, the tool does not modify its environment. */ readOnlyHint: boolean; + conditions?: string[]; }; schema: Schema; handler: ( diff --git a/src/tools/input.ts b/src/tools/input.ts index 505568922..bfbcf6b66 100644 --- a/src/tools/input.ts +++ b/src/tools/input.ts @@ -12,6 +12,11 @@ import {parseKey} from '../utils/keyboard.js'; import {ToolCategory} from './categories.js'; import {defineTool} from './ToolDefinition.js'; +const dblClickSchema = zod + .boolean() + .optional() + .describe('Set to true for double clicks. Default is false.'); + export const click = defineTool({ name: 'click', description: `Clicks on the provided element`, @@ -25,10 +30,7 @@ export const click = defineTool({ .describe( 'The uid of an element on the page from the page content snapshot', ), - dblClick: zod - .boolean() - .optional() - .describe('Set to true for double clicks. Default is false.'), + dblClick: dblClickSchema, }, handler: async (request, response, context) => { const uid = request.params.uid; @@ -51,6 +53,35 @@ export const click = defineTool({ }, }); +export const clickAt = defineTool({ + name: 'click_at', + description: `Clicks at the provided coordinates`, + annotations: { + category: ToolCategory.INPUT, + readOnlyHint: false, + conditions: ['computerVision'], + }, + schema: { + x: zod.number().describe('The x coordinate'), + y: zod.number().describe('The y coordinate'), + dblClick: dblClickSchema, + }, + handler: async (request, response, context) => { + const page = context.getSelectedPage(); + await context.waitForEventsAfterAction(async () => { + await page.mouse.click(request.params.x, request.params.y, { + clickCount: request.params.dblClick ? 2 : 1, + }); + }); + response.appendResponseLine( + request.params.dblClick + ? `Successfully double clicked at the coordinates` + : `Successfully clicked at the coordinates`, + ); + response.includeSnapshot(); + }, +}); + export const hover = defineTool({ name: 'hover', description: `Hover over the provided element`, diff --git a/tests/index.test.ts b/tests/index.test.ts index 5970e890b..7b864e45f 100644 --- a/tests/index.test.ts +++ b/tests/index.test.ts @@ -12,8 +12,13 @@ import {Client} from '@modelcontextprotocol/sdk/client/index.js'; import {StdioClientTransport} from '@modelcontextprotocol/sdk/client/stdio.js'; import {executablePath} from 'puppeteer'; +import type {ToolDefinition} from '../src/tools/ToolDefinition'; + describe('e2e', () => { - async function withClient(cb: (client: Client) => Promise) { + async function withClient( + cb: (client: Client) => Promise, + extraArgs: string[] = [], + ) { const transport = new StdioClientTransport({ command: 'node', args: [ @@ -22,6 +27,7 @@ describe('e2e', () => { '--isolated', '--executable-path', executablePath(), + ...extraArgs, ], }); const client = new Client( @@ -90,8 +96,11 @@ describe('e2e', () => { continue; } const fileTools = await import(`../src/tools/${file}`); - for (const maybeTool of Object.values(fileTools)) { + for (const maybeTool of Object.values(fileTools)) { if ('name' in maybeTool) { + if (maybeTool.annotations?.conditions?.includes('computerVision')) { + continue; + } definedNames.push(maybeTool.name); } } @@ -100,4 +109,15 @@ describe('e2e', () => { assert.deepStrictEqual(exposedNames, definedNames); }); }); + + it('has experimental vision tools', async () => { + await withClient( + async client => { + const {tools} = await client.listTools(); + const clickAt = tools.find(t => t.name === 'click_at'); + assert.ok(clickAt); + }, + ['--experimental-vision'], + ); + }); }); diff --git a/tests/tools/input.test.ts b/tests/tools/input.test.ts index f1f7fc593..cb8ae7111 100644 --- a/tests/tools/input.test.ts +++ b/tests/tools/input.test.ts @@ -17,6 +17,7 @@ import { fillForm, uploadFile, pressKey, + clickAt, } from '../../src/tools/input.js'; import {parseKey} from '../../src/utils/keyboard.js'; import {serverHooks} from '../server.js'; @@ -183,6 +184,67 @@ describe('input', () => { }); }); + describe('click_at', () => { + it('clicks at coordinates', async () => { + await withMcpContext(async (response, context) => { + const page = context.getSelectedPage(); + await page.setContent( + html`
`, + ); + await context.createTextSnapshot(); + await clickAt.handler( + { + params: { + x: 50, + y: 50, + }, + }, + response, + context, + ); + assert.strictEqual( + response.responseLines[0], + 'Successfully clicked at the coordinates', + ); + assert.ok(response.includeSnapshot); + assert.ok(await page.$('text/clicked')); + }); + }); + + it('double clicks at coordinates', async () => { + await withMcpContext(async (response, context) => { + const page = context.getSelectedPage(); + await page.setContent( + html`
`, + ); + await context.createTextSnapshot(); + await clickAt.handler( + { + params: { + x: 50, + y: 50, + dblClick: true, + }, + }, + response, + context, + ); + assert.strictEqual( + response.responseLines[0], + 'Successfully double clicked at the coordinates', + ); + assert.ok(response.includeSnapshot); + assert.ok(await page.$('text/dblclicked')); + }); + }); + }); + describe('fill', () => { it('fills out an input', async () => { await withMcpContext(async (response, context) => {