@@ -8,6 +8,15 @@ import OpenAI from 'openai';
88import Anthropic from '@anthropic-ai/sdk' ;
99import type { Page , zod } from '../../third_party/index.js' ;
1010import { logger } from '../../logger.js' ;
11+ import type {
12+ ZodObjectSchema ,
13+ ExtractionResult ,
14+ ZodSchemaDef ,
15+ } from '../../types/zod-schemas.js' ;
16+ import {
17+ getZodTypeDef ,
18+ describeZodSchema ,
19+ } from '../../types/zod-schemas.js' ;
1120
1221/**
1322 * LLM-based data extraction with cascading providers
@@ -34,7 +43,7 @@ export class LlmExtractor {
3443 // Require at least one provider
3544 if ( ! this . openaiClient && ! this . claudeClient ) {
3645 throw new Error (
37- 'At least one API key required: OPENAI_API_KEY (primary) or ANTHROPIC_API_KEY (fallback) '
46+ 'LLM extraction requires at least one API provider to be configured. Please set up OpenAI or Anthropic credentials. '
3847 ) ;
3948 }
4049 }
@@ -46,10 +55,10 @@ export class LlmExtractor {
4655 */
4756 async extract (
4857 page : Page ,
49- schema : zod . ZodObject < any > ,
58+ schema : ZodObjectSchema ,
5059 instructions ?: string ,
5160 selector ?: string ,
52- ) : Promise < any > {
61+ ) : Promise < ExtractionResult > {
5362 // Get HTML content
5463 let htmlContent : string ;
5564 if ( selector ) {
@@ -71,7 +80,7 @@ export class LlmExtractor {
7180 // Construct extraction prompt
7281 const prompt = this . buildPrompt ( schemaDescription , instructions , cleanedHtml ) ;
7382
74- let result : any ;
83+ let result : ExtractionResult | null = null ;
7584 let lastError : Error | null = null ;
7685
7786 // Attempt 1: OpenAI GPT-4o-mini (primary)
@@ -103,15 +112,16 @@ export class LlmExtractor {
103112 throw new Error (
104113 `LLM extraction failed with all providers. Last error: ${ lastError ?. message || 'Unknown error' } `
105114 ) ;
115+ // Note: the throw above is reached only when no provider attempt returned a result; successful attempts exit through the return statements above.
106116 }
107117
108118 /**
109119 * Extract using OpenAI GPT-4o-mini
110120 */
111121 private async extractWithOpenAI (
112122 prompt : string ,
113- schema : zod . ZodObject < any > ,
114- ) : Promise < any > {
123+ schema : ZodObjectSchema ,
124+ ) : Promise < ExtractionResult > {
115125 if ( ! this . openaiClient ) {
116126 throw new Error ( 'OpenAI client not initialized' ) ;
117127 }
@@ -133,17 +143,26 @@ export class LlmExtractor {
133143 }
134144
135145 // Parse and validate JSON
136- const extracted = JSON . parse ( content ) ;
137- return schema . parse ( extracted ) ;
146+ try {
147+ const extracted = JSON . parse ( content ) ;
148+ return schema . parse ( extracted ) ;
149+ } catch ( error ) {
150+ if ( error instanceof SyntaxError ) {
151+ throw new Error ( `Failed to parse OpenAI JSON response: Invalid JSON format (${ error . message } )` ) ;
152+ }
153+ throw new Error (
154+ `Failed to validate OpenAI response against schema: ${ error instanceof Error ? error . message : String ( error ) } `
155+ ) ;
156+ }
138157 }
139158
140159 /**
141160 * Extract using Claude 3.5 Haiku
142161 */
143162 private async extractWithClaude (
144163 prompt : string ,
145- schema : zod . ZodObject < any > ,
146- ) : Promise < any > {
164+ schema : ZodObjectSchema ,
165+ ) : Promise < ExtractionResult > {
147166 if ( ! this . claudeClient ) {
148167 throw new Error ( 'Claude client not initialized' ) ;
149168 }
@@ -168,8 +187,18 @@ export class LlmExtractor {
168187 throw new Error ( 'No JSON found in Claude response' ) ;
169188 }
170189
171- const extracted = JSON . parse ( jsonMatch [ 0 ] ) ;
172- return schema . parse ( extracted ) ;
190+ // Parse and validate JSON
191+ try {
192+ const extracted = JSON . parse ( jsonMatch [ 0 ] ) ;
193+ return schema . parse ( extracted ) ;
194+ } catch ( error ) {
195+ if ( error instanceof SyntaxError ) {
196+ throw new Error ( `Failed to parse Claude JSON response: Invalid JSON format (${ error . message } )` ) ;
197+ }
198+ throw new Error (
199+ `Failed to validate Claude response against schema: ${ error instanceof Error ? error . message : String ( error ) } `
200+ ) ;
201+ }
173202 }
174203
175204 /**
@@ -218,40 +247,9 @@ ${htmlContent}`;
218247
219248 /**
220249 * Generate human-readable schema description for LLMs
250+ * Delegates to describeZodSchema() from the zod-schemas module.
221251 */
222- private describeSchema ( schema : zod . ZodObject < any > ) : string {
223- const shape = schema . shape ;
224- const fields : string [ ] = [ ] ;
225-
226- for ( const [ fieldName , fieldSchema ] of Object . entries ( shape ) ) {
227- const def = ( fieldSchema as any ) . _def ;
228- // Zod v4 uses _def.type instead of _def.typeName
229- const type = def . type ;
230- const isOptional = type === 'optional' ;
231-
232- let typeDesc = '' ;
233- if ( type === 'string' ) {
234- typeDesc = 'string' ;
235- } else if ( type === 'number' ) {
236- typeDesc = 'number' ;
237- } else if ( type === 'boolean' ) {
238- typeDesc = 'boolean' ;
239- } else if ( type === 'array' ) {
240- const innerType = def . element ?. _def ?. type || 'unknown' ;
241- typeDesc = `array of ${ innerType } s` ;
242- } else if ( type === 'object' ) {
243- typeDesc = 'object' ;
244- } else if ( type === 'optional' ) {
245- const innerDef = def . innerType ?. _def ;
246- const innerType = innerDef ?. type || 'unknown' ;
247- typeDesc = `optional ${ innerType } ` ;
248- } else {
249- typeDesc = type || 'unknown' ;
250- }
251-
252- fields . push ( ` - ${ fieldName } : ${ typeDesc } ${ isOptional ? ' (optional)' : '' } ` ) ;
253- }
254-
255- return fields . join ( '\n' ) ;
252+ private describeSchema ( schema : ZodObjectSchema ) : string {
253+ return describeZodSchema ( schema ) ;
256254 }
257255}
0 commit comments