Skip to content

Commit a94dc0e

Browse files
authored
chore: experimental vision mode (#745)
This PR adds the `--experimental-vision` argument, which exposes an additional tool for visual automation (currently, `click_at(x, y)`).
1 parent 5c1ecf8 commit a94dc0e

7 files changed

Lines changed: 163 additions & 29 deletions

File tree

scripts/generate-docs.ts

Lines changed: 32 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ interface ToolWithAnnotations extends Tool {
2020
annotations?: {
2121
title?: string;
2222
category?: typeof ToolCategory;
23+
conditions?: string[];
2324
};
2425
}
2526

@@ -265,31 +266,39 @@ async function generateToolDocumentation(): Promise<void> {
265266
console.log('Generating tool documentation from definitions...');
266267

267268
// Convert ToolDefinitions to ToolWithAnnotations
268-
const toolsWithAnnotations: ToolWithAnnotations[] = tools.map(tool => {
269-
const properties: Record<string, TypeInfo> = {};
270-
const required: string[] = [];
271-
272-
for (const [key, schema] of Object.entries(
273-
tool.schema as unknown as Record<string, ZodSchema>,
274-
)) {
275-
const info = getZodTypeInfo(schema);
276-
properties[key] = info;
277-
if (isRequired(schema)) {
278-
required.push(key);
269+
const toolsWithAnnotations: ToolWithAnnotations[] = tools
270+
.filter(tool => {
271+
if (!tool.annotations.conditions) {
272+
return true;
273+
}
274+
// Only include unconditional tools.
275+
return tool.annotations.conditions.length === 0;
276+
})
277+
.map(tool => {
278+
const properties: Record<string, TypeInfo> = {};
279+
const required: string[] = [];
280+
281+
for (const [key, schema] of Object.entries(
282+
tool.schema as unknown as Record<string, ZodSchema>,
283+
)) {
284+
const info = getZodTypeInfo(schema);
285+
properties[key] = info;
286+
if (isRequired(schema)) {
287+
required.push(key);
288+
}
279289
}
280-
}
281290

282-
return {
283-
name: tool.name,
284-
description: tool.description,
285-
inputSchema: {
286-
type: 'object',
287-
properties,
288-
required,
289-
},
290-
annotations: tool.annotations,
291-
};
292-
});
291+
return {
292+
name: tool.name,
293+
description: tool.description,
294+
inputSchema: {
295+
type: 'object',
296+
properties,
297+
required,
298+
},
299+
annotations: tool.annotations,
300+
};
301+
});
293302

294303
console.log(`Found ${toolsWithAnnotations.length} tools`);
295304

src/cli.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,11 @@ export const cliOptions = {
152152
describe: 'Whether to enable automation over DevTools targets',
153153
hidden: true,
154154
},
155+
experimentalVision: {
156+
type: 'boolean',
157+
describe: 'Whether to enable vision tools',
158+
hidden: true,
159+
},
155160
experimentalIncludeAllPages: {
156161
type: 'boolean',
157162
describe:

src/main.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,12 @@ function registerTool(tool: ToolDefinition): void {
121121
) {
122122
return;
123123
}
124+
if (
125+
tool.annotations.conditions?.includes('computerVision') &&
126+
!args.experimentalVision
127+
) {
128+
return;
129+
}
124130
server.registerTool(
125131
tool.name,
126132
{

src/tools/ToolDefinition.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ export interface ToolDefinition<
2424
* If true, the tool does not modify its environment.
2525
*/
2626
readOnlyHint: boolean;
27+
conditions?: string[];
2728
};
2829
schema: Schema;
2930
handler: (

src/tools/input.ts

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@ import {parseKey} from '../utils/keyboard.js';
1212
import {ToolCategory} from './categories.js';
1313
import {defineTool} from './ToolDefinition.js';
1414

15+
const dblClickSchema = zod
16+
.boolean()
17+
.optional()
18+
.describe('Set to true for double clicks. Default is false.');
19+
1520
export const click = defineTool({
1621
name: 'click',
1722
description: `Clicks on the provided element`,
@@ -25,10 +30,7 @@ export const click = defineTool({
2530
.describe(
2631
'The uid of an element on the page from the page content snapshot',
2732
),
28-
dblClick: zod
29-
.boolean()
30-
.optional()
31-
.describe('Set to true for double clicks. Default is false.'),
33+
dblClick: dblClickSchema,
3234
},
3335
handler: async (request, response, context) => {
3436
const uid = request.params.uid;
@@ -51,6 +53,35 @@ export const click = defineTool({
5153
},
5254
});
5355

56+
export const clickAt = defineTool({
57+
name: 'click_at',
58+
description: `Clicks at the provided coordinates`,
59+
annotations: {
60+
category: ToolCategory.INPUT,
61+
readOnlyHint: false,
62+
conditions: ['computerVision'],
63+
},
64+
schema: {
65+
x: zod.number().describe('The x coordinate'),
66+
y: zod.number().describe('The y coordinate'),
67+
dblClick: dblClickSchema,
68+
},
69+
handler: async (request, response, context) => {
70+
const page = context.getSelectedPage();
71+
await context.waitForEventsAfterAction(async () => {
72+
await page.mouse.click(request.params.x, request.params.y, {
73+
clickCount: request.params.dblClick ? 2 : 1,
74+
});
75+
});
76+
response.appendResponseLine(
77+
request.params.dblClick
78+
? `Successfully double clicked at the coordinates`
79+
: `Successfully clicked at the coordinates`,
80+
);
81+
response.includeSnapshot();
82+
},
83+
});
84+
5485
export const hover = defineTool({
5586
name: 'hover',
5687
description: `Hover over the provided element`,

tests/index.test.ts

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,13 @@ import {Client} from '@modelcontextprotocol/sdk/client/index.js';
1212
import {StdioClientTransport} from '@modelcontextprotocol/sdk/client/stdio.js';
1313
import {executablePath} from 'puppeteer';
1414

15+
import type {ToolDefinition} from '../src/tools/ToolDefinition';
16+
1517
describe('e2e', () => {
16-
async function withClient(cb: (client: Client) => Promise<void>) {
18+
async function withClient(
19+
cb: (client: Client) => Promise<void>,
20+
extraArgs: string[] = [],
21+
) {
1722
const transport = new StdioClientTransport({
1823
command: 'node',
1924
args: [
@@ -22,6 +27,7 @@ describe('e2e', () => {
2227
'--isolated',
2328
'--executable-path',
2429
executablePath(),
30+
...extraArgs,
2531
],
2632
});
2733
const client = new Client(
@@ -90,8 +96,11 @@ describe('e2e', () => {
9096
continue;
9197
}
9298
const fileTools = await import(`../src/tools/${file}`);
93-
for (const maybeTool of Object.values<object>(fileTools)) {
99+
for (const maybeTool of Object.values<ToolDefinition>(fileTools)) {
94100
if ('name' in maybeTool) {
101+
if (maybeTool.annotations?.conditions?.includes('computerVision')) {
102+
continue;
103+
}
95104
definedNames.push(maybeTool.name);
96105
}
97106
}
@@ -100,4 +109,15 @@ describe('e2e', () => {
100109
assert.deepStrictEqual(exposedNames, definedNames);
101110
});
102111
});
112+
113+
it('has experimental vision tools', async () => {
114+
await withClient(
115+
async client => {
116+
const {tools} = await client.listTools();
117+
const clickAt = tools.find(t => t.name === 'click_at');
118+
assert.ok(clickAt);
119+
},
120+
['--experimental-vision'],
121+
);
122+
});
103123
});

tests/tools/input.test.ts

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import {
1717
fillForm,
1818
uploadFile,
1919
pressKey,
20+
clickAt,
2021
} from '../../src/tools/input.js';
2122
import {parseKey} from '../../src/utils/keyboard.js';
2223
import {serverHooks} from '../server.js';
@@ -183,6 +184,67 @@ describe('input', () => {
183184
});
184185
});
185186

187+
describe('click_at', () => {
188+
it('clicks at coordinates', async () => {
189+
await withMcpContext(async (response, context) => {
190+
const page = context.getSelectedPage();
191+
await page.setContent(
192+
html`<div
193+
style="width: 100px; height: 100px; background: red;"
194+
onclick="this.innerText = 'clicked'"
195+
></div>`,
196+
);
197+
await context.createTextSnapshot();
198+
await clickAt.handler(
199+
{
200+
params: {
201+
x: 50,
202+
y: 50,
203+
},
204+
},
205+
response,
206+
context,
207+
);
208+
assert.strictEqual(
209+
response.responseLines[0],
210+
'Successfully clicked at the coordinates',
211+
);
212+
assert.ok(response.includeSnapshot);
213+
assert.ok(await page.$('text/clicked'));
214+
});
215+
});
216+
217+
it('double clicks at coordinates', async () => {
218+
await withMcpContext(async (response, context) => {
219+
const page = context.getSelectedPage();
220+
await page.setContent(
221+
html`<div
222+
style="width: 100px; height: 100px; background: red;"
223+
ondblclick="this.innerText = 'dblclicked'"
224+
></div>`,
225+
);
226+
await context.createTextSnapshot();
227+
await clickAt.handler(
228+
{
229+
params: {
230+
x: 50,
231+
y: 50,
232+
dblClick: true,
233+
},
234+
},
235+
response,
236+
context,
237+
);
238+
assert.strictEqual(
239+
response.responseLines[0],
240+
'Successfully double clicked at the coordinates',
241+
);
242+
assert.ok(response.includeSnapshot);
243+
assert.ok(await page.$('text/dblclicked'));
244+
});
245+
});
246+
});
247+
186248
describe('fill', () => {
187249
it('fills out an input', async () => {
188250
await withMcpContext(async (response, context) => {

0 commit comments

Comments
 (0)