Skip to content

Commit 9a99baa

Browse files
Hein van Vuuren and claude committed
feat: Phase 4 — browser_vision and summarize_page intelligence tools
Add LLM-powered browser tools: browser_vision captures screenshots with optional a11y tree for multimodal analysis, summarize_page extracts task-relevant content from long pages using keyword scoring. Both tools apply secret redaction and external content wrapping. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent bb46e45 commit 9a99baa

2 files changed

Lines changed: 204 additions & 0 deletions

File tree

src/tools/intelligence.ts

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
/**
2+
* LLM-powered browser intelligence tools.
3+
* Ported from Hermes's browser_tool.py vision and summarization capabilities.
4+
*/
5+
6+
import {defineTool} from './ToolDefinition.js';
7+
import {ToolCategory} from './categories.js';
8+
import {zod} from '../third_party/index.js';
9+
import type {SerializedAXNode} from '../third_party/index.js';
10+
import {redact} from '../security/redact.js';
11+
import {wrapExternalContent} from '../security/content-wrapper.js';
12+
13+
/**
 * Serialize an accessibility tree node and all of its descendants into
 * readable text: one line per node, in depth-first order.
 *
 * Lightweight alternative to formatSnapshotNode that works with raw
 * puppeteer SerializedAXNode (no id assignment required).
 *
 * Each line has the form `<indent><role> "<name>" value="<value>"`. The name
 * part is omitted when the node has no name; the value part is omitted when
 * the node has no value or when the value merely repeats the name.
 *
 * @param node - Root of the (sub)tree to serialize.
 * @param depth - Nesting level of `node`; each level adds one space of indent.
 * @returns The newline-joined lines for `node` and its descendants.
 */
function serializeAXTree(node: SerializedAXNode, depth = 0): string {
  // A single accumulator avoids re-joining the children's text at every level.
  const lines: string[] = [];

  const visit = (current: SerializedAXNode, level: number): void => {
    const name = current.name ?? '';
    // Puppeteer types `value` as `string | number`. Normalize to text so that
    // a legitimate numeric 0 (e.g. a slider at its minimum) is not discarded
    // by a truthiness check.
    const value = String(current.value ?? '');

    let line = `${' '.repeat(level)}${current.role ?? ''}`;
    if (name) line += ` "${name}"`;
    if (value !== '' && value !== name) line += ` value="${value}"`;
    lines.push(line);

    for (const child of current.children ?? []) {
      visit(child, level + 1);
    }
  };

  visit(node, depth);
  return lines.join('\n');
}
37+
38+
/**
 * `browser_vision` tool.
 *
 * Captures a screenshot of the selected page and attaches it to the response
 * as an image so the calling LLM can answer `question` from it. When
 * `includeSnapshot` is set, a serialized accessibility tree (capped in length)
 * is appended as additional, externally-sourced context.
 *
 * The caller's question and the accessibility-tree text are both passed
 * through `redact` before being emitted, and the page-derived text is wrapped
 * with `wrapExternalContent`.
 */
export const browserVision = defineTool({
  name: 'browser_vision',
  description:
    'Analyze the current page visually using AI. Takes a screenshot and sends it to the calling LLM for multimodal analysis. Useful for understanding page layout, finding UI elements, or answering visual questions about the page.',
  annotations: {
    category: ToolCategory.DEBUGGING,
    readOnlyHint: true,
  },
  schema: {
    question: zod
      .string()
      .describe(
        'What to analyze about the page (e.g. "What products are shown?", "Is there a login form?", "Describe the layout")',
      ),
    includeSnapshot: zod
      .boolean()
      .optional()
      .describe(
        'Whether to also include the accessibility tree for richer analysis. Defaults to false.',
      ),
  },
  handler: async (request, response, context) => {
    // Cap on accessibility-tree text, matching the limit Hermes uses.
    const maxSnapshotChars = 3000;

    const page = context.getSelectedPage();

    // Capture the screenshot. The format is requested explicitly so it can
    // never drift from the MIME type declared on the attachment below.
    const screenshotBuffer = await page.screenshot({
      type: 'png',
      optimizeForSpeed: true,
    });
    const screenshotBase64 = Buffer.from(screenshotBuffer).toString('base64');

    // Optionally build accessibility-tree context.
    let analysisContext = '';
    if (request.params.includeSnapshot) {
      const axTree = await page.accessibility.snapshot({
        includeIframes: true,
        interestingOnly: true,
      });
      if (axTree) {
        // Redact BEFORE truncating so a secret straddling the cut-off cannot
        // slip through as an unrecognizable fragment.
        const snapshotText = redact(serializeAXTree(axTree));
        // Only claim truncation when text was actually dropped.
        const label =
          snapshotText.length > maxSnapshotChars
            ? 'Accessibility tree (truncated)'
            : 'Accessibility tree';
        analysisContext = `\n\n${label}:\n${snapshotText.slice(0, maxSnapshotChars)}`;
      }
    }

    // Attach the screenshot as an image in the response.
    response.attachImage({
      data: screenshotBase64,
      mimeType: 'image/png',
    });

    // Add the question and context as text.
    const questionText = redact(request.params.question);
    const pageUrl = page.url();
    response.appendResponseLine('## Vision Analysis Request');
    response.appendResponseLine(`**Page:** ${pageUrl}`);
    response.appendResponseLine(`**Question:** ${questionText}`);
    if (analysisContext) {
      // Page-derived text is untrusted; mark it as external content.
      response.appendResponseLine(
        wrapExternalContent(analysisContext, pageUrl),
      );
    }
    response.appendResponseLine(
      '\nPlease analyze the attached screenshot to answer the question.',
    );
  },
});
104+
105+
/**
 * Choose the snapshot lines most relevant to `task` within a character budget.
 *
 * A line scores one point per task keyword it contains (keywords are the
 * whitespace-separated, lower-cased task words longer than two characters),
 * plus 0.5 when it mentions an interactive role. Lines are taken in
 * descending score order (ties in document order) and returned in document
 * order.
 *
 * Kept module-private on purpose: tools.ts spreads every export of this
 * module into the tool list.
 *
 * @param lines - Snapshot text split into lines.
 * @param task - The caller's task description.
 * @param maxChars - Maximum length of the selected lines once joined by '\n'.
 * @returns The selected lines, in their original order.
 */
function selectTaskRelevantLines(
  lines: readonly string[],
  task: string,
  maxChars: number,
): string[] {
  const taskWords = new Set(
    task
      .toLowerCase()
      .split(/\s+/)
      .filter(word => word.length > 2),
  );
  // Interactive elements tend to carry role names like button, link, etc.
  const interactiveRole =
    /\b(button|link|textbox|combobox|checkbox|radio|tab)\b/;

  const scored = lines.map((line, idx) => {
    const lower = line.toLowerCase();
    let score = 0;
    for (const word of taskWords) {
      if (lower.includes(word)) score++;
    }
    if (interactiveRole.test(lower)) score += 0.5;
    return {line, score, idx};
  });
  scored.sort((a, b) => b.score - a.score || a.idx - b.idx);

  const picked: typeof scored = [];
  let usedChars = 0;
  for (const item of scored) {
    // Once half the budget is used, stop padding with irrelevant lines.
    if (item.score === 0 && usedChars > maxChars * 0.5) break;
    // Every line after the first also costs one newline separator.
    const cost = item.line.length + (picked.length > 0 ? 1 : 0);
    // Skip (rather than stop at) a line that does not fit, so one oversized
    // high-scoring line cannot starve the whole summary.
    if (usedChars + cost > maxChars) continue;
    picked.push(item);
    usedChars += cost;
  }

  // Restore document order using the recorded index. Looking lines up by
  // their text would collapse duplicate lines onto a single position.
  picked.sort((a, b) => a.idx - b.idx);
  return picked.map(item => item.line);
}

/**
 * `summarize_page` tool.
 *
 * Serializes the selected page's accessibility tree. If it fits within
 * `maxChars` it is returned whole; otherwise a task-focused extract is built
 * by keyword scoring (see `selectTaskRelevantLines`). Page-derived text and
 * the echoed task are passed through `redact`, and page-derived text is
 * wrapped with `wrapExternalContent`.
 */
export const summarizePage = defineTool({
  name: 'summarize_page',
  description:
    'Get a task-focused summary of the current page content. Takes an accessibility snapshot and returns a concise summary focused on the specified task or question. Useful when page snapshots are too long to process.',
  annotations: {
    category: ToolCategory.DEBUGGING,
    readOnlyHint: true,
  },
  schema: {
    task: zod
      .string()
      .describe(
        'What you need from this page (e.g. "find the pricing table", "extract all product names", "locate the search functionality")',
      ),
    maxChars: zod
      .number()
      .optional()
      .describe('Maximum characters in the summary. Defaults to 4000.'),
  },
  handler: async (request, response, context) => {
    const defaultMaxChars = 4000;

    const page = context.getSelectedPage();
    const axTree = await page.accessibility.snapshot({
      includeIframes: true,
      interestingOnly: true,
    });

    if (!axTree) {
      response.appendResponseLine('No page content available to summarize.');
      return;
    }

    const fullSnapshot = serializeAXTree(axTree);
    // A non-positive budget could only ever yield an empty summary, so fall
    // back to the documented default instead.
    const requestedMax = request.params.maxChars;
    const maxChars =
      requestedMax !== undefined && requestedMax > 0
        ? requestedMax
        : defaultMaxChars;
    const pageUrl = page.url();

    response.appendResponseLine(`## Page Summary (${pageUrl})`);

    // Short pages are returned whole.
    if (fullSnapshot.length <= maxChars) {
      response.appendResponseLine(
        wrapExternalContent(redact(fullSnapshot), pageUrl),
      );
      return;
    }

    // For long pages, extract task-relevant sections. The echoed task is
    // redacted, consistent with how browser_vision treats its question.
    response.appendResponseLine(
      `Full page is ${fullSnapshot.length} characters. Showing task-focused extract for: "${redact(request.params.task)}"`,
    );
    response.appendResponseLine('');

    const lines = fullSnapshot.split('\n');
    const relevantLines = selectTaskRelevantLines(
      lines,
      request.params.task,
      maxChars,
    );
    const extract = redact(relevantLines.join('\n'));

    response.appendResponseLine(wrapExternalContent(extract, pageUrl));
    // Report the length of the text actually shown.
    response.appendResponseLine(
      `\n(Showing ${relevantLines.length} of ${lines.length} lines, ${extract.length} chars)`,
    );
  },
});

src/tools/tools.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import * as autonomyTools from './autonomy.js';
77
import * as consoleTools from './console.js';
88
import * as emulationTools from './emulation.js';
99
import * as inputTools from './input.js';
10+
import * as intelligenceTools from './intelligence.js';
1011
import * as networkTools from './network.js';
1112
import * as pagesTools from './pages.js';
1213
import * as pdfTools from './pdf.js';
@@ -22,6 +23,7 @@ const tools = [
2223
...Object.values(consoleTools),
2324
...Object.values(emulationTools),
2425
...Object.values(inputTools),
26+
...Object.values(intelligenceTools),
2527
...Object.values(networkTools),
2628
...Object.values(pagesTools),
2729
...Object.values(pdfTools),

0 commit comments

Comments
 (0)