From 71e37907e660f3072989480fd820ac4d250012a1 Mon Sep 17 00:00:00 2001 From: anil-bd Date: Mon, 25 May 2026 10:42:34 +0200 Subject: [PATCH 1/2] fix(output): wire CSV/HTML/MD serializers, reject XLSX with helpful error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before this change, --output paths with .csv, .html, or .md extensions silently wrote pretty-printed JSON to disk. format_from_ext() mapped the extension to the correct Output_format, but serialize() only handled 'pretty', 'json', and string values — every other format fell through to JSON.stringify(data, null, 2). This breaks the documented contract ("Output file format from extension") and corrupts downstream consumers: opening a .csv in Excel or a DataFrame loader fails or yields a single column of JSON. The video creator brief for Scraper Studio explicitly promises "JSON, CSV, or XLSX" output — today only JSON works. Changes - serialize_csv: array-of-objects → RFC 4180 CSV with header row, embedded comma/quote/newline escaping, union of keys across heterogeneous rows - serialize_markdown: array-of-objects → pipe-table; non-tabular data falls back to a fenced JSON block - serialize_html: array-of-objects → minimal <table>; non-tabular data falls back to
<pre>JSON</pre>
; HTML-special chars escaped - format_from_ext: rejects .xlsx / .xls up front with a clear message pointing to (a) --pretty -o file.json + xlsx-cli, or (b) the web UI's download button. Hard fail beats silent corruption. - print(): no behavior change; serialize() now does the right thing for csv/html/markdown. Tests - 17 new tests in src/__tests__/utils/output.test.ts covering CSV escaping, key union across heterogeneous rows, markdown pipe-escaping, HTML entity escaping, the xlsx rejection, and end-to-end print() writes for each extension (regression coverage for the silent-JSON bug). No public-API change; all existing exports preserved. Refs: docs/audit DX issues N1, N2 (see scraper-studio-cli-demo/ISSUES.md) --- src/__tests__/utils/output.test.ts | 158 +++++++++++++++++++++++++++++ src/utils/output.ts | 122 +++++++++++++++++++++- 2 files changed, 279 insertions(+), 1 deletion(-) create mode 100644 src/__tests__/utils/output.test.ts diff --git a/src/__tests__/utils/output.test.ts b/src/__tests__/utils/output.test.ts new file mode 100644 index 0000000..7c55cf9 --- /dev/null +++ b/src/__tests__/utils/output.test.ts @@ -0,0 +1,158 @@ +import {describe, it, expect, vi, beforeEach, afterEach} from 'vitest'; +import fs from 'fs'; +import path from 'path'; +import os from 'os'; +import {serialize, format_from_ext, print} from '../../utils/output'; + +describe('utils/output.serialize csv', ()=>{ + it('serializes array of flat objects as RFC 4180 CSV with header row', ()=>{ + const rows = [ + {url: 'https://a.test/1', title: 'A', price: 1.5}, + {url: 'https://a.test/2', title: 'B', price: 2.0}, + ]; + const out = serialize(rows, 'csv'); + const lines = out.trim().split('\n'); + expect(lines[0]).toBe('url,title,price'); + expect(lines[1]).toBe('https://a.test/1,A,1.5'); + expect(lines[2]).toBe('https://a.test/2,B,2'); + }); + + it('quotes and escapes embedded commas, quotes, and newlines', ()=>{ + const rows = [{name: 'Smith, John', note: 'He said "hi"'}, + {name: 
'multi\nline', note: 'ok'}]; + const out = serialize(rows, 'csv'); + const lines = out.trim().split(/\n/); + // header + expect(lines[0]).toBe('name,note'); + // row 1: both fields need quoting; embedded quote doubled + expect(lines[1]).toBe('"Smith, John","He said ""hi"""'); + }); + + it('unions keys across heterogeneous rows', ()=>{ + const rows = [{a: 1, b: 2}, {a: 3, c: 4}]; + const out = serialize(rows, 'csv'); + const lines = out.trim().split('\n'); + expect(lines[0]).toBe('a,b,c'); + expect(lines[1]).toBe('1,2,'); + expect(lines[2]).toBe('3,,4'); + }); + + it('wraps a single object as one CSV row', ()=>{ + const out = serialize({a: 1, b: 'x'}, 'csv'); + expect(out.trim()).toBe('a,b\n1,x'); + }); + + it('serializes nested values via JSON', ()=>{ + const rows = [{id: 1, meta: {tag: 'x'}}]; + const out = serialize(rows, 'csv'); + const lines = out.trim().split('\n'); + expect(lines[1]).toBe('1,"{""tag"":""x""}"'); + }); +}); + +describe('utils/output.serialize markdown', ()=>{ + it('renders an array of objects as a Markdown table', ()=>{ + const rows = [{a: 1, b: 'x'}, {a: 2, b: 'y'}]; + const out = serialize(rows, 'markdown'); + expect(out).toContain('| a | b |'); + expect(out).toContain('| --- | --- |'); + expect(out).toContain('| 1 | x |'); + expect(out).toContain('| 2 | y |'); + }); + + it('escapes pipes and newlines inside cells', ()=>{ + const rows = [{a: 'a|b', b: 'line1\nline2'}]; + const out = serialize(rows, 'markdown'); + expect(out).toContain('| a\\|b | line1 line2 |'); + }); + + it('falls back to a fenced JSON block for non-tabular data', ()=>{ + const out = serialize([1, 2, 3], 'markdown'); + expect(out.startsWith('```json')).toBe(true); + }); +}); + +describe('utils/output.serialize html', ()=>{ + it('renders an array of objects as an HTML table', ()=>{ + const rows = [{a: 1, b: ''}]; + const out = serialize(rows, 'html'); + expect(out).toContain(''); + expect(out).toContain(''); + }); + + it('escapes HTML in non-tabular fallback', ()=>{ + const 
out = serialize('
ab
1<x>