|
| 1 | +/** |
| 2 | + * LLM-powered browser intelligence tools. |
| 3 | + * Ported from Hermes's browser_tool.py vision and summarization capabilities. |
| 4 | + */ |
| 5 | + |
| 6 | +import {defineTool} from './ToolDefinition.js'; |
| 7 | +import {ToolCategory} from './categories.js'; |
| 8 | +import {zod} from '../third_party/index.js'; |
| 9 | +import type {SerializedAXNode} from '../third_party/index.js'; |
| 10 | +import {redact} from '../security/redact.js'; |
| 11 | +import {wrapExternalContent} from '../security/content-wrapper.js'; |
| 12 | + |
/**
 * Serialize an accessibility tree node into readable, indented text.
 *
 * Lightweight alternative to formatSnapshotNode that works with a raw
 * puppeteer SerializedAXNode (no id assignment required).
 *
 * Each node is rendered on its own line as `<indent><role> "<name>" value="<value>"`.
 * The name part is omitted when empty; the value part is omitted when it is
 * empty or merely repeats the name. Children follow their parent, one
 * indentation unit deeper per level.
 *
 * @param node - Root of the (sub)tree to serialize.
 * @param depth - Nesting level of `node`; determines the indentation width.
 * @returns The subtree rendered as newline-separated lines.
 */
function serializeAXTree(node: SerializedAXNode, depth = 0): string {
  const indent = ' '.repeat(depth);
  const role = node.role ?? '';
  const name = node.name ?? '';
  // Stringify before testing for emptiness: a numeric value of 0 (e.g. a
  // slider at its minimum) is meaningful and must not be dropped by a
  // truthiness check.
  const value = node.value == null ? '' : String(node.value);

  let line = `${indent}${role}`;
  if (name !== '') line += ` "${name}"`;
  if (value !== '' && value !== name) line += ` value="${value}"`;

  const parts = [line];
  for (const child of node.children ?? []) {
    parts.push(serializeAXTree(child, depth + 1));
  }
  return parts.join('\n');
}
| 37 | + |
/**
 * Maximum number of accessibility-tree characters attached to a vision
 * request (mirrors the limit used by Hermes).
 */
const VISION_SNAPSHOT_CHAR_LIMIT = 3000;

/**
 * `browser_vision` tool.
 *
 * Captures a screenshot of the selected page, attaches it to the response as
 * an image, and appends the (redacted) question so that the caller can answer
 * it from the image. When `includeSnapshot` is set, a serialized
 * accessibility tree — capped at {@link VISION_SNAPSHOT_CHAR_LIMIT}
 * characters — is appended as wrapped external content.
 */
export const browserVision = defineTool({
  name: 'browser_vision',
  description:
    'Analyze the current page visually using AI. Takes a screenshot and sends it to the calling LLM for multimodal analysis. Useful for understanding page layout, finding UI elements, or answering visual questions about the page.',
  annotations: {
    category: ToolCategory.DEBUGGING,
    readOnlyHint: true,
  },
  schema: {
    question: zod
      .string()
      .describe(
        'What to analyze about the page (e.g. "What products are shown?", "Is there a login form?", "Describe the layout")',
      ),
    includeSnapshot: zod
      .boolean()
      .optional()
      .describe(
        'Whether to also include the accessibility tree for richer analysis. Defaults to false.',
      ),
  },
  handler: async (request, response, context) => {
    const page = context.getSelectedPage();

    // Capture the screenshot (no `type` given, so Puppeteer's default PNG
    // format applies) and base64-encode it for the image attachment.
    const screenshotBuffer = await page.screenshot({
      optimizeForSpeed: true,
    });
    const screenshotBase64 = Buffer.from(screenshotBuffer).toString('base64');

    // Optionally include accessibility tree context.
    let analysisContext = '';
    if (request.params.includeSnapshot) {
      const axTree = await page.accessibility.snapshot({
        includeIframes: true,
        interestingOnly: true,
      });
      if (axTree) {
        const snapshotText = serializeAXTree(axTree);
        // Only claim truncation when content was actually cut off; an
        // unconditional "(truncated)" label wrongly suggests missing content
        // for small pages.
        const heading =
          snapshotText.length > VISION_SNAPSHOT_CHAR_LIMIT
            ? 'Accessibility tree (truncated):'
            : 'Accessibility tree:';
        analysisContext = `\n\n${heading}\n${snapshotText.slice(0, VISION_SNAPSHOT_CHAR_LIMIT)}`;
      }
    }

    // Attach the screenshot as an image in the response.
    response.attachImage({
      data: screenshotBase64,
      mimeType: 'image/png',
    });

    // Add the question and context as text. The question is passed through
    // `redact` before being echoed back.
    const questionText = redact(request.params.question);
    const pageUrl = page.url();
    response.appendResponseLine('## Vision Analysis Request');
    response.appendResponseLine(`**Page:** ${pageUrl}`);
    response.appendResponseLine(`**Question:** ${questionText}`);
    if (analysisContext) {
      // Page-derived text is wrapped so it is marked as external content.
      response.appendResponseLine(
        wrapExternalContent(analysisContext, pageUrl),
      );
    }
    response.appendResponseLine(
      '\nPlease analyze the attached screenshot to answer the question.',
    );
  },
});
| 104 | + |
/** Character budget used by `summarize_page` when `maxChars` is not usable. */
const DEFAULT_SUMMARY_MAX_CHARS = 4000;

/** Role names of interactive elements; matching lines get a small score boost. */
const INTERACTIVE_ROLE_PATTERN =
  /\b(button|link|textbox|combobox|checkbox|radio|tab)\b/;

/**
 * Split a task description into lowercase keywords for relevance scoring.
 *
 * Splits on any run of non-letter/non-digit characters so that punctuation
 * attached to a word (`"table,"`, `"functionality."`) does not prevent a
 * match. Words of two characters or fewer are discarded.
 */
function extractTaskKeywords(task: string): Set<string> {
  return new Set(
    task
      .toLowerCase()
      .split(/[^\p{L}\p{N}]+/u)
      .filter(word => word.length > 2),
  );
}

/**
 * Choose the snapshot lines most relevant to `keywords` that fit within
 * `maxChars` once joined with newlines, returned in original page order.
 *
 * A line scores one point per keyword it contains, plus 0.5 if it mentions an
 * interactive role. Lines are considered best-first (ties broken by page
 * order); zero-score lines are only used to fill up to half the budget.
 */
function selectRelevantLines(
  lines: readonly string[],
  keywords: ReadonlySet<string>,
  maxChars: number,
): string[] {
  const scored = lines.map((line, idx) => {
    const lower = line.toLowerCase();
    let score = 0;
    for (const keyword of keywords) {
      if (lower.includes(keyword)) score++;
    }
    if (INTERACTIVE_ROLE_PATTERN.test(lower)) score += 0.5;
    return {line, score, idx};
  });

  const ranked = [...scored].sort(
    (a, b) => b.score - a.score || a.idx - b.idx,
  );

  const picked: typeof scored = [];
  let charCount = 0;
  for (const item of ranked) {
    if (item.score === 0 && charCount > maxChars * 0.5) break;
    // Account for the newline separator that joining will insert.
    const cost = item.line.length + (picked.length > 0 ? 1 : 0);
    // Skip (rather than stop at) a line that does not fit, so a single
    // oversized line cannot starve the rest of the extract.
    if (charCount + cost > maxChars) continue;
    picked.push(item);
    charCount += cost;
  }

  // Restore page order using each line's own index; looking lines up by text
  // would misplace duplicated lines.
  picked.sort((a, b) => a.idx - b.idx);
  return picked.map(item => item.line);
}

/**
 * `summarize_page` tool.
 *
 * Serializes the selected page's accessibility tree. If it fits within the
 * character budget it is returned in full; otherwise a keyword-scored extract
 * of the most task-relevant lines is returned. Page-derived text is always
 * passed through `wrapExternalContent`.
 */
export const summarizePage = defineTool({
  name: 'summarize_page',
  description:
    'Get a task-focused summary of the current page content. Takes an accessibility snapshot and returns a concise summary focused on the specified task or question. Useful when page snapshots are too long to process.',
  annotations: {
    category: ToolCategory.DEBUGGING,
    readOnlyHint: true,
  },
  schema: {
    task: zod
      .string()
      .describe(
        'What you need from this page (e.g. "find the pricing table", "extract all product names", "locate the search functionality")',
      ),
    maxChars: zod
      .number()
      .optional()
      .describe('Maximum characters in the summary. Defaults to 4000.'),
  },
  handler: async (request, response, context) => {
    const page = context.getSelectedPage();
    const axTree = await page.accessibility.snapshot({
      includeIframes: true,
      interestingOnly: true,
    });

    if (!axTree) {
      response.appendResponseLine('No page content available to summarize.');
      return;
    }

    const fullSnapshot = serializeAXTree(axTree);
    // A missing, zero, negative or NaN budget falls back to the default
    // instead of silently producing an empty summary.
    const requestedMax = request.params.maxChars;
    const maxChars =
      requestedMax !== undefined && requestedMax > 0
        ? requestedMax
        : DEFAULT_SUMMARY_MAX_CHARS;
    const pageUrl = page.url();

    response.appendResponseLine(`## Page Summary (${pageUrl})`);

    if (fullSnapshot.length <= maxChars) {
      response.appendResponseLine(wrapExternalContent(fullSnapshot, pageUrl));
      return;
    }

    // For long pages, extract task-relevant sections. The task is redacted
    // before being echoed, consistent with how browser_vision treats its
    // question; scoring still uses the original text.
    const displayTask = redact(request.params.task);
    response.appendResponseLine(
      `Full page is ${fullSnapshot.length} characters. Showing task-focused extract for: "${displayTask}"`,
    );
    response.appendResponseLine('');

    const lines = fullSnapshot.split('\n');
    const relevantLines = selectRelevantLines(
      lines,
      extractTaskKeywords(request.params.task),
      maxChars,
    );
    const extract = relevantLines.join('\n');

    response.appendResponseLine(wrapExternalContent(extract, pageUrl));
    response.appendResponseLine(
      `\n(Showing ${relevantLines.length} of ${lines.length} lines, ${extract.length} chars)`,
    );
  },
});
0 commit comments