From 71e37907e660f3072989480fd820ac4d250012a1 Mon Sep 17 00:00:00 2001
From: anil-bd <anil@brightdata.com>
Date: Mon, 25 May 2026 10:42:34 +0200
Subject: [PATCH 1/2] fix(output): wire CSV/HTML/MD serializers, reject XLSX
 with helpful error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before this change, --output paths with .csv, .html, or .md extensions
silently wrote pretty-printed JSON to disk. format_from_ext() mapped the
extension to the correct Output_format, but serialize() only handled
'pretty', 'json', and string values — every other format fell through to
JSON.stringify(data, null, 2).

This breaks the documented contract ("Output file format from extension")
and corrupts downstream consumers: opening a .csv in Excel or a DataFrame
loader fails or yields a single column of JSON. The video creator brief for
Scraper Studio explicitly promises "JSON, CSV, or XLSX" output — today
only JSON works.

Changes
- serialize_csv: array-of-objects → RFC 4180-style CSV (LF line endings)
  with header row, embedded comma/quote/newline escaping, union of keys
  across heterogeneous rows; non-tabular data falls back to pretty-printed
  JSON with a warning
- serialize_markdown: array-of-objects → pipe-table; non-tabular data
  falls back to a fenced JSON block
- serialize_html: array-of-objects → minimal <table>; non-tabular data
  falls back to <pre>JSON</pre>; HTML-special chars escaped
- format_from_ext: rejects .xlsx / .xls up front with a clear message
  pointing to (a) --pretty -o file.json + xlsx-cli, or (b) the web UI's
  download button. Hard fail beats silent corruption.
- print(): no behavior change; serialize() now does the right thing for
  csv/html/markdown.

Tests
- 17 new tests in src/__tests__/utils/output.test.ts covering CSV escaping,
  key union across heterogeneous rows, markdown pipe-escaping, HTML
  entity escaping, the xlsx rejection, and end-to-end print() writes for
  each extension (regression coverage for the silent-JSON bug).

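No export or signature changes; all existing exports preserved. One
behavior change for callers: format_from_ext() now prints an error and
exits with status 1 when given a .xlsx or .xls path.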

Refs: docs/audit DX issues N1, N2 (see scraper-studio-cli-demo/ISSUES.md)
---
 src/__tests__/utils/output.test.ts | 158 +++++++++++++++++++++++++++++
 src/utils/output.ts                | 122 +++++++++++++++++++++-
 2 files changed, 279 insertions(+), 1 deletion(-)
 create mode 100644 src/__tests__/utils/output.test.ts
diff --git a/src/__tests__/utils/output.test.ts b/src/__tests__/utils/output.test.ts
new file mode 100644
index 0000000..7c55cf9
--- /dev/null
+++ b/src/__tests__/utils/output.test.ts
@@ -0,0 +1,158 @@
+import {describe, it, expect, vi, beforeEach, afterEach} from 'vitest';
+import fs from 'fs';
+import path from 'path';
+import os from 'os';
+import {serialize, format_from_ext, print} from '../../utils/output';
+
+describe('utils/output.serialize csv', ()=>{
+    it('serializes array of flat objects as RFC 4180 CSV with header row', ()=>{
+        const rows = [
+            {url: 'https://a.test/1', title: 'A', price: 1.5},
+            {url: 'https://a.test/2', title: 'B', price: 2.0},
+        ];
+        const out = serialize(rows, 'csv');
+        const lines = out.trim().split('\n');
+        expect(lines[0]).toBe('url,title,price');
+        expect(lines[1]).toBe('https://a.test/1,A,1.5');
+        expect(lines[2]).toBe('https://a.test/2,B,2');
+    });
+
+    it('quotes and escapes embedded commas, quotes, and newlines', ()=>{
+        const rows = [{name: 'Smith, John', note: 'He said "hi"'},
+            {name: 'multi\nline', note: 'ok'}];
+        const out = serialize(rows, 'csv');
+        const lines = out.trim().split(/\n/);
+        // header
+        expect(lines[0]).toBe('name,note');
+        // row 1: both fields need quoting; embedded quote doubled
+        expect(lines[1]).toBe('"Smith, John","He said ""hi"""');
+    });
+
+    it('unions keys across heterogeneous rows', ()=>{
+        const rows = [{a: 1, b: 2}, {a: 3, c: 4}];
+        const out = serialize(rows, 'csv');
+        const lines = out.trim().split('\n');
+        expect(lines[0]).toBe('a,b,c');
+        expect(lines[1]).toBe('1,2,');
+        expect(lines[2]).toBe('3,,4');
+    });
+
+    it('wraps a single object as one CSV row', ()=>{
+        const out = serialize({a: 1, b: 'x'}, 'csv');
+        expect(out.trim()).toBe('a,b\n1,x');
+    });
+
+    it('serializes nested values via JSON', ()=>{
+        const rows = [{id: 1, meta: {tag: 'x'}}];
+        const out = serialize(rows, 'csv');
+        const lines = out.trim().split('\n');
+        expect(lines[1]).toBe('1,"{""tag"":""x""}"');
+    });
+});
+
+describe('utils/output.serialize markdown', ()=>{
+    it('renders an array of objects as a Markdown table', ()=>{
+        const rows = [{a: 1, b: 'x'}, {a: 2, b: 'y'}];
+        const out = serialize(rows, 'markdown');
+        expect(out).toContain('| a | b |');
+        expect(out).toContain('| --- | --- |');
+        expect(out).toContain('| 1 | x |');
+        expect(out).toContain('| 2 | y |');
+    });
+
+    it('escapes pipes and newlines inside cells', ()=>{
+        const rows = [{a: 'a|b', b: 'line1\nline2'}];
+        const out = serialize(rows, 'markdown');
+        expect(out).toContain('| a\\|b | line1 line2 |');
+    });
+
+    it('falls back to a fenced JSON block for non-tabular data', ()=>{
+        const out = serialize([1, 2, 3], 'markdown');
+        expect(out.startsWith('```json')).toBe(true);
+    });
+});
+
+describe('utils/output.serialize html', ()=>{
+    it('renders an array of objects as an HTML table', ()=>{
+        const rows = [{a: 1, b: '<x>'}];
+        const out = serialize(rows, 'html');
+        expect(out).toContain('<thead><tr><th>a</th><th>b</th></tr></thead>');
+        expect(out).toContain('<td>1</td><td>&lt;x&gt;</td>');
+    });
+
+    it('escapes HTML in non-tabular fallback', ()=>{
+        const out = serialize(['<script>'], 'html'); // array of non-objects
+        expect(out).toBe('<pre>[\n  &quot;&lt;script&gt;&quot;\n]</pre>\n');
+    });
+});
+
+describe('utils/output.format_from_ext', ()=>{
+    afterEach(()=>{ vi.restoreAllMocks(); }); // runs even if a test fails
+    it('maps known extensions', ()=>{
+        expect(format_from_ext('a.json')).toBe('json');
+        expect(format_from_ext('a.CSV')).toBe('csv');
+        expect(format_from_ext('a.md')).toBe('markdown');
+        expect(format_from_ext('a.html')).toBe('html');
+    });
+
+    it('returns null for unknown extensions', ()=>{
+        expect(format_from_ext('a.txt')).toBeNull();
+        expect(format_from_ext('noext')).toBeNull();
+    });
+
+    it('rejects .xlsx with a helpful message and exits 1', ()=>{
+        const exit = vi.spyOn(process, 'exit').mockImplementation(
+            ((_code?: number)=>{ throw new Error('exit'); }) as never);
+        const err = vi.spyOn(console, 'error').mockImplementation(()=>{});
+        expect(()=>format_from_ext('out.xlsx')).toThrow('exit');
+        expect(exit).toHaveBeenCalledWith(1);
+        const msg = err.mock.calls.map(c=>c.join(' ')).join(' ');
+        expect(msg).toMatch(/XLSX output is not supported/);
+        expect(msg).toMatch(/--pretty -o file\.json/);
+        expect(msg).toMatch(/brightdata\.com\/cp\/scrapers/);
+    });
+});
+
+describe('utils/output.print writes correct format from extension', ()=>{
+    const tmp_files: string[] = [];
+    const make_tmp = (ext: string)=>{
+        const p = path.join(os.tmpdir(),
+            `bdata-output-test-${Date.now()}-${Math.random()}${ext}`);
+        tmp_files.push(p);
+        return p;
+    };
+    beforeEach(()=>{ vi.spyOn(console, 'error').mockImplementation(()=>{}); });
+    afterEach(()=>{
+        vi.restoreAllMocks();
+        for (const f of tmp_files) { try { fs.unlinkSync(f); } catch {} }
+    });
+
+    it('-o file.csv writes CSV (regression: was silently writing JSON)', ()=>{
+        const out = make_tmp('.csv');
+        print([{url: 'https://x.test', title: 'T'}], {output: out});
+        const content = fs.readFileSync(out, 'utf8');
+        expect(content.split('\n')[0]).toBe('url,title');
+        expect(content.split('\n')[1]).toBe('https://x.test,T');
+    });
+
+    it('-o file.html writes HTML (regression: was silently writing JSON)', ()=>{
+        const out = make_tmp('.html');
+        print([{a: 1}], {output: out});
+        const content = fs.readFileSync(out, 'utf8');
+        expect(content).toContain('<table>');
+    });
+
+    it('-o file.md writes Markdown (regression: was silently writing JSON)', ()=>{
+        const out = make_tmp('.md');
+        print([{a: 1}], {output: out});
+        const content = fs.readFileSync(out, 'utf8');
+        expect(content).toContain('| a |');
+    });
+
+    it('-o file.json writes JSON unchanged', ()=>{
+        const out = make_tmp('.json');
+        print([{a: 1}], {output: out});
+        const content = fs.readFileSync(out, 'utf8');
+        expect(JSON.parse(content)).toEqual([{a: 1}]);
+    });
+});
diff --git a/src/utils/output.ts b/src/utils/output.ts
index 65ffe92..7869d02 100644
--- a/src/utils/output.ts
+++ b/src/utils/output.ts
@@ -14,13 +14,24 @@ const dim    = (s: string)=>ansi('2', s);
 const success = (msg: string)=>console.error(green(`✓ ${msg}`));
 const warn    = (msg: string)=>console.error(yellow(`⚠ ${msg}`));
 const info    = (msg: string)=>console.error(dim(msg));
-const fail    = (msg: string)=>{ console.error(red(`✗ ${msg}`)); 
+const fail    = (msg: string)=>{ console.error(red(`✗ ${msg}`));
     process.exit(1); };
 
 type Output_format = 'markdown'|'json'|'pretty'|'html'|'csv'|'raw';
 
+const UNSUPPORTED_EXTS: Record<string, string> = { // ext -> fail() message
+    '.xlsx': 'XLSX output is not supported. Use --pretty -o file.json '
+        +'and convert with a tool like xlsx-cli, or download as XLSX '
+        +'from the Bright Data web UI (https://brightdata.com/cp/scrapers).',
+    '.xls':  'XLS output is not supported. Use --pretty -o file.json '
+        +'and convert with a tool like xlsx-cli, or download from the '
+        +'Bright Data web UI (https://brightdata.com/cp/scrapers).',
+};
+
 const format_from_ext = (file_path: string): Output_format|null=>{
     const ext = path.extname(file_path).toLowerCase();
+    if (UNSUPPORTED_EXTS[ext])
+        fail(UNSUPPORTED_EXTS[ext]);
     if (ext == '.json') return 'json';
     if (ext == '.md')   return 'markdown';
     if (ext == '.html') return 'html';
@@ -35,11 +46,120 @@ type Print_opts = {
     format?: Output_format;
 };
 
+const to_rows = (data: unknown): Record<string, unknown>[]|null=>{
+    // a row is any non-null object that is not itself an array
+    const is_row = (v: unknown): boolean=>
+        !!v && typeof v == 'object' && !Array.isArray(v);
+    // arrays qualify only when non-empty and made up entirely of rows
+    if (Array.isArray(data))
+        return data.length && data.every(is_row) ? data : null;
+    // a lone object becomes a one-row table; anything else is non-tabular
+    return is_row(data) ? [data as Record<string, unknown>] : null;
+};
+
+/**
+ * Collect the union of property names across all rows.
+ *
+ * Names come back in first-seen order: every key of the first row,
+ * then any key that first appears in a later row. A Set iterates in
+ * insertion order, so it both de-duplicates and keeps that order.
+ *
+ * @param rows objects whose own enumerable keys become column names
+ * @returns the de-duplicated key names
+ */
+const collect_keys = (rows: Record<string, unknown>[]): string[]=>{
+    const names = new Set<string>();
+    for (const row of rows)
+        Object.keys(row).forEach(name=>names.add(name));
+    return Array.from(names);
+};
+
+// Text for one cell: null/undefined -> '', strings as-is, numbers and
+// booleans via String(), everything else via JSON.stringify.
+const cell_to_string = (val: unknown): string=>{
+    if (val == null)
+        return '';
+    if (typeof val == 'number' || typeof val == 'boolean')
+        return String(val);
+    return typeof val == 'string' ? val : JSON.stringify(val);
+};
+
+// Quote a CSV field only when it holds a quote, comma, CR or LF;
+// embedded double quotes are doubled inside the quoted form.
+const csv_escape = (val: unknown): string=>{
+    const text = cell_to_string(val);
+    return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
+};
+
+// Strings pass through; rows -> header + LF lines; else JSON + warning.
+const serialize_csv = (data: unknown): string=>{
+    if (typeof data == 'string')
+        return data;
+    const rows = to_rows(data);
+    if (!rows)
+    {
+        warn('CSV requires an object or array of objects; falling back '
+            +'to JSON. Use --json to silence this warning.');
+        return JSON.stringify(data, null, 2);
+    }
+    const cols = collect_keys(rows);
+    const line = (cells: unknown[])=>cells.map(c=>csv_escape(c)).join(',');
+    return [cols, ...rows.map(r=>cols.map(c=>r[c]))].map(line).join('\n')+'\n';
+};
+
+const md_escape = (val: unknown): string=> // keep each cell on one line
+    cell_to_string(val).replace(/\|/g, '\\|').replace(/\r\n?|\n/g, ' ');
+
+const serialize_markdown = (data: unknown): string=>{ // table or fenced JSON
+    if (typeof data == 'string')
+        return data;
+    const rows = to_rows(data);
+    if (!rows)
+        return '```json\n'+JSON.stringify(data, null, 2)+'\n```\n';
+    const keys = collect_keys(rows);
+    const line = (cells: string[])=>'| '+cells.join(' | ')+' |';
+    // escape header cells too: a '|' or newline in a key would break the table
+    const head = [line(keys.map(k=>md_escape(k))), line(keys.map(()=>'---'))];
+    const body = rows.map(r=>line(keys.map(k=>md_escape(r[k]))));
+    return [...head, ...body].join('\n')+'\n';
+};
+
+// Replace &, <, > and " in the cell text with their HTML entities.
+const html_escape = (val: unknown): string=>{
+    const ent: Record<string, string> =
+        {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;'};
+    return cell_to_string(val).replace(/[&<>"]/g, ch=>ent[ch] ?? ch);
+};
+
+// HTML writer: strings pass through; tabular data -> <table> with a
+// <thead> of keys and one <tr> per row; otherwise escaped JSON in <pre>.
+const serialize_html = (data: unknown): string=>{
+    if (typeof data == 'string')
+        return data;
+    const rows = to_rows(data);
+    if (!rows)
+        return '<pre>'+html_escape(JSON.stringify(data, null, 2))+'</pre>\n';
+    const keys = collect_keys(rows);
+    // wrap every item as <tag>escaped</tag> and glue them into one <tr>
+    const tr = (tag: string, cells: unknown[]): string=>'<tr>'
+        +cells.map(c=>'<'+tag+'>'+html_escape(c)+'</'+tag+'>').join('')
+        +'</tr>';
+    const head = '<thead>'+tr('th', keys)+'</thead>';
+    const body = rows.map(r=>tr('td', keys.map(k=>r[k]))).join('');
+    return '<table>'+head+'<tbody>'+body+'</tbody></table>\n';
+};
+
 const serialize = (data: unknown, fmt: Output_format): string=>{
     if (fmt == 'pretty')
         return JSON.stringify(data, null, 2);
     if (fmt == 'json')
         return JSON.stringify(data);
+    if (fmt == 'csv')
+        return serialize_csv(data);
+    if (fmt == 'markdown')
+        return serialize_markdown(data);
+    if (fmt == 'html')
+        return serialize_html(data);
     if (typeof data == 'string')
         return data;
     return JSON.stringify(data, null, 2);

From ec4ab4edb2daf592034f451dc4fca97ece332de7 Mon Sep 17 00:00:00 2001
From: meirk-brd <meirk@brightdata.com>
Date: Tue, 26 May 2026 14:17:04 +0300
Subject: [PATCH 2/2] style(output): drop redundant test comments

---
 src/__tests__/utils/output.test.ts | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/__tests__/utils/output.test.ts b/src/__tests__/utils/output.test.ts
index 7c55cf9..97c4deb 100644
--- a/src/__tests__/utils/output.test.ts
+++ b/src/__tests__/utils/output.test.ts
@@ -22,9 +22,7 @@ describe('utils/output.serialize csv', ()=>{
             {name: 'multi\nline', note: 'ok'}];
         const out = serialize(rows, 'csv');
         const lines = out.trim().split(/\n/);
-        // header
         expect(lines[0]).toBe('name,note');
-        // row 1: both fields need quoting; embedded quote doubled
         expect(lines[1]).toBe('"Smith, John","He said ""hi"""');
     });