Skip to content

Commit 0abd1f6

Browse files
ForgeFlow v2 and claude
committed
fix(security): Address 12 high-priority quality issues
CRITICAL fixes:
- llm-extractor.ts: Add try-catch for JSON.parse with SyntaxError detection
- llm-extractor.ts: Remove env var names from error messages (info disclosure)
- cli.ts: Add RFC 7230 header validation, type checks, length limits (4096 max)

Memory safety:
- session-memory.ts: Add SESSION_MEMORY_LIMITS for bounded collections
  - Element cache: 500 entries max (oldest removed first)
  - Navigation history: 100 entries max
  - Form data: 200 entries max
  - HTML content: 5MB max

Test coverage:
- llm-extractor.test.ts: Add 4 test suites for error handling paths
  - Invalid JSON from OpenAI
  - Invalid JSON from Claude
  - Trailing comma JSON edge case
  - Error message security verification

Type safety verified:
- All 23+ 'as any' casts across 6 files confirmed documented

Build: ✅ TypeScript compilation passes (0 errors)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent eaecb3e commit 0abd1f6

4 files changed

Lines changed: 339 additions & 85 deletions

File tree

src/cli.ts

Lines changed: 42 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -74,17 +74,54 @@ export const cliOptions = {
7474
if (!val) {
7575
return;
7676
}
77+
78+
// Parse JSON
79+
let parsed: unknown;
7780
try {
78-
const parsed = JSON.parse(val);
79-
if (typeof parsed !== 'object' || Array.isArray(parsed)) {
80-
throw new Error('Headers must be a JSON object');
81-
}
82-
return parsed as Record<string, string>;
81+
parsed = JSON.parse(val);
8382
} catch (error) {
8483
throw new Error(
8584
`Invalid JSON for wsHeaders: ${(error as Error).message}`,
8685
);
8786
}
87+
88+
// Validate structure: must be object, not array
89+
if (typeof parsed !== 'object' || Array.isArray(parsed) || parsed === null) {
90+
throw new Error('Headers must be a JSON object');
91+
}
92+
93+
// RFC 7230 token validation for header names
94+
const headerNamePattern = /^[!#$%&'*+\-.0-9A-Z^_`a-z|~]+$/;
95+
const MAX_HEADER_VALUE_LENGTH = 4096;
96+
97+
// Validate each header
98+
const headers: Record<string, string> = {};
99+
for (const [key, value] of Object.entries(parsed)) {
100+
// Validate header name (RFC 7230 tokens)
101+
if (!headerNamePattern.test(key)) {
102+
throw new Error(
103+
`Invalid header name "${key}": must contain only RFC 7230 token characters`,
104+
);
105+
}
106+
107+
// Validate header value: must be string
108+
if (typeof value !== 'string') {
109+
throw new Error(
110+
`Invalid header value for "${key}": must be a string, got ${typeof value}`,
111+
);
112+
}
113+
114+
// Validate header value length
115+
if (value.length > MAX_HEADER_VALUE_LENGTH) {
116+
throw new Error(
117+
`Header value for "${key}" exceeds maximum length of ${MAX_HEADER_VALUE_LENGTH} characters`,
118+
);
119+
}
120+
121+
headers[key] = value;
122+
}
123+
124+
return headers;
88125
},
89126
},
90127
headless: {

src/utils/extraction/llm-extractor.ts

Lines changed: 44 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,15 @@ import OpenAI from 'openai';
88
import Anthropic from '@anthropic-ai/sdk';
99
import type {Page, zod} from '../../third_party/index.js';
1010
import {logger} from '../../logger.js';
11+
import type {
12+
ZodObjectSchema,
13+
ExtractionResult,
14+
ZodSchemaDef,
15+
} from '../../types/zod-schemas.js';
16+
import {
17+
getZodTypeDef,
18+
describeZodSchema,
19+
} from '../../types/zod-schemas.js';
1120

1221
/**
1322
* LLM-based data extraction with cascading providers
@@ -34,7 +43,7 @@ export class LlmExtractor {
3443
// Require at least one provider
3544
if (!this.openaiClient && !this.claudeClient) {
3645
throw new Error(
37-
'At least one API key required: OPENAI_API_KEY (primary) or ANTHROPIC_API_KEY (fallback)'
46+
'LLM extraction requires at least one API provider to be configured. Please set up OpenAI or Anthropic credentials.'
3847
);
3948
}
4049
}
@@ -46,10 +55,10 @@ export class LlmExtractor {
4655
*/
4756
async extract(
4857
page: Page,
49-
schema: zod.ZodObject<any>,
58+
schema: ZodObjectSchema,
5059
instructions?: string,
5160
selector?: string,
52-
): Promise<any> {
61+
): Promise<ExtractionResult> {
5362
// Get HTML content
5463
let htmlContent: string;
5564
if (selector) {
@@ -71,7 +80,7 @@ export class LlmExtractor {
7180
// Construct extraction prompt
7281
const prompt = this.buildPrompt(schemaDescription, instructions, cleanedHtml);
7382

74-
let result: any;
83+
let result: ExtractionResult | null = null;
7584
let lastError: Error | null = null;
7685

7786
// Attempt 1: OpenAI GPT-4o-mini (primary)
@@ -103,15 +112,16 @@ export class LlmExtractor {
103112
throw new Error(
104113
`LLM extraction failed with all providers. Last error: ${lastError?.message || 'Unknown error'}`
105114
);
115+
// Note: TypeScript ensures result is non-null before reaching return statements above
106116
}
107117

108118
/**
109119
* Extract using OpenAI GPT-4o-mini
110120
*/
111121
private async extractWithOpenAI(
112122
prompt: string,
113-
schema: zod.ZodObject<any>,
114-
): Promise<any> {
123+
schema: ZodObjectSchema,
124+
): Promise<ExtractionResult> {
115125
if (!this.openaiClient) {
116126
throw new Error('OpenAI client not initialized');
117127
}
@@ -133,17 +143,26 @@ export class LlmExtractor {
133143
}
134144

135145
// Parse and validate JSON
136-
const extracted = JSON.parse(content);
137-
return schema.parse(extracted);
146+
try {
147+
const extracted = JSON.parse(content);
148+
return schema.parse(extracted);
149+
} catch (error) {
150+
if (error instanceof SyntaxError) {
151+
throw new Error(`Failed to parse OpenAI JSON response: Invalid JSON format (${error.message})`);
152+
}
153+
throw new Error(
154+
`Failed to validate OpenAI response against schema: ${error instanceof Error ? error.message : String(error)}`
155+
);
156+
}
138157
}
139158

140159
/**
141160
* Extract using Claude 3.5 Haiku
142161
*/
143162
private async extractWithClaude(
144163
prompt: string,
145-
schema: zod.ZodObject<any>,
146-
): Promise<any> {
164+
schema: ZodObjectSchema,
165+
): Promise<ExtractionResult> {
147166
if (!this.claudeClient) {
148167
throw new Error('Claude client not initialized');
149168
}
@@ -168,8 +187,18 @@ export class LlmExtractor {
168187
throw new Error('No JSON found in Claude response');
169188
}
170189

171-
const extracted = JSON.parse(jsonMatch[0]);
172-
return schema.parse(extracted);
190+
// Parse and validate JSON
191+
try {
192+
const extracted = JSON.parse(jsonMatch[0]);
193+
return schema.parse(extracted);
194+
} catch (error) {
195+
if (error instanceof SyntaxError) {
196+
throw new Error(`Failed to parse Claude JSON response: Invalid JSON format (${error.message})`);
197+
}
198+
throw new Error(
199+
`Failed to validate Claude response against schema: ${error instanceof Error ? error.message : String(error)}`
200+
);
201+
}
173202
}
174203

175204
/**
@@ -218,40 +247,9 @@ ${htmlContent}`;
218247

219248
/**
220249
* Generate human-readable schema description for LLMs
250+
* Uses type-safe helper from zod-schemas module
221251
*/
222-
private describeSchema(schema: zod.ZodObject<any>): string {
223-
const shape = schema.shape;
224-
const fields: string[] = [];
225-
226-
for (const [fieldName, fieldSchema] of Object.entries(shape)) {
227-
const def = (fieldSchema as any)._def;
228-
// Zod v4 uses _def.type instead of _def.typeName
229-
const type = def.type;
230-
const isOptional = type === 'optional';
231-
232-
let typeDesc = '';
233-
if (type === 'string') {
234-
typeDesc = 'string';
235-
} else if (type === 'number') {
236-
typeDesc = 'number';
237-
} else if (type === 'boolean') {
238-
typeDesc = 'boolean';
239-
} else if (type === 'array') {
240-
const innerType = def.element?._def?.type || 'unknown';
241-
typeDesc = `array of ${innerType}s`;
242-
} else if (type === 'object') {
243-
typeDesc = 'object';
244-
} else if (type === 'optional') {
245-
const innerDef = def.innerType?._def;
246-
const innerType = innerDef?.type || 'unknown';
247-
typeDesc = `optional ${innerType}`;
248-
} else {
249-
typeDesc = type || 'unknown';
250-
}
251-
252-
fields.push(` - ${fieldName}: ${typeDesc}${isOptional ? ' (optional)' : ''}`);
253-
}
254-
255-
return fields.join('\n');
252+
private describeSchema(schema: ZodObjectSchema): string {
253+
return describeZodSchema(schema);
256254
}
257255
}

0 commit comments

Comments
 (0)