almanac.httparchive.org/src/tools/generate/generate_figure_descriptions.js at main · HTTPArchive/almanac.httparchive.org · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
const fs = require('fs-extra');
const Tesseract = require('tesseract.js');
const { find_markdown_files } = require('./shared');

/**
 * Fills in empty figure descriptions of a chapter with OCR text from its images.
 *
 * For every Markdown file whose path matches `chapter_match`, each
 * `{{ figure_markup(...) }}` block that has a non-empty `image="..."` attribute
 * and an empty `description=""` attribute gets its description set to the text
 * Tesseract recognizes in `static/images/<year>/<chapter>/<image>`. The file is
 * then written back. Only chapters whose language segment is `en` are processed.
 * When no argument is given, a usage hint is printed and the process exits
 * with status 1.
 *
 * @param {string} chapter_match - Chapter to process, e.g. `en/2020/performance`.
 *     A leading `content/` and a trailing `.md` are stripped before matching.
 * @returns {Promise<void>}
 */
const generate_descriptions = async (chapter_match) => {
    if (!chapter_match) {
        console.log('Please provide an argument of the form: en/2020/performance');
        process.exit(1);
        return; // Only reachable if process.exit is stubbed out.
    }

    // Escape special regex characters in a user-supplied string so it is treated literally.
    const escapeRegExp = (str) => str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

    // Turn "en/2020/performance" into a regex with one capture group per path
    // segment (language, year, chapter), using the platform's path separator.
    const chapter_path = chapter_match.replace(/\.md$/, '').replace(/^content[/\\]*/, '');
    const separator = (process.platform !== 'win32') ? '/' : '\\\\';
    const groups = escapeRegExp(chapter_path)
        .split('/')
        .map((segment) => `(${segment})`)
        .join(separator);
    // The dot of the extension is escaped so it only matches a literal ".".
    const re = new RegExp(`content${separator}${groups}\\.md`);

    /**
     * Returns the text that should replace one figure block: the block with its
     * empty description filled from OCR, or the block unchanged when it does not
     * qualify, the image is missing, OCR fails, or OCR yields no text.
     */
    const describe_block = async (block, year, chapter) => {
        const imageMatch = block.content.match(/image="([^"]*)"/);
        const descriptionMatch = block.content.match(/description="([^"]*)"/);
        const image_file = imageMatch ? imageMatch[1] : null;

        // Without an explicit empty description="" there is nothing to fill in,
        // so skip the (expensive) OCR entirely.
        if (!image_file || !descriptionMatch || descriptionMatch[1] !== '') {
            return block.full;
        }

        const image_path = `static/images/${year}/${chapter}/${image_file}`;
        if (!fs.existsSync(image_path)) {
            console.log(`  Image not found: ${image_path}`);
            return block.full;
        }

        console.log(`  Processing OCR for ${image_file}...`);
        try {
            const result = await Tesseract.recognize(image_path, 'eng');

            // Basic cleanup: collapse all whitespace (including newlines) to single
            // spaces, then escape backslashes and quotes for the attribute value.
            const description = result.data.text
                .replace(/\s+/g, ' ')
                .trim()
                .replace(/\\/g, '\\\\')
                .replace(/"/g, '\\"');

            if (!description) {
                return block.full;
            }

            // A replacer function is used so that "$&", "$$", "$'" etc. in the OCR
            // text are inserted literally instead of being expanded by replace().
            return block.full.replace(/description=""/, () => `description="${description}"`);
        } catch (err) {
            console.error(`  Error processing OCR for ${image_file}:`, err);
            return block.full;
        }
    };

    const files = await find_markdown_files();

    for (const file of files) {
        const path_match = (typeof file === 'string') ? file.match(re) : null;
        if (!path_match) continue;
        const [, language, year, chapter] = path_match;

        if (language !== "en") {
            console.log("Skipping non-English chapter");
            continue;
        }

        console.log(`Generating descriptions for the ${year} ${chapter} chapter:`);
        const markdown = await fs.readFile(file, 'utf-8');

        // Matches figure_markup and captures the content inside (...)
        const figure_regexp = /{{[\s]*figure_markup\(([\s\S]*?)\)[\s]*}}/g;

        // Collect all blocks (with their offsets) first to avoid regex state
        // issues during async/await.
        const blocks = [];
        let match;
        while ((match = figure_regexp.exec(markdown)) !== null) {
            blocks.push({
                full: match[0],
                content: match[1],
                index: match.index
            });
        }

        // Rebuild the document by splicing each block at its recorded offset, so
        // identical blocks or text inserted by OCR can never be confused with
        // one another.
        let updated = '';
        let cursor = 0;
        for (const block of blocks) {
            const replacement = await describe_block(block, year, chapter);
            updated += markdown.slice(cursor, block.index) + replacement;
            cursor = block.index + block.full.length;
        }
        updated += markdown.slice(cursor);

        await fs.writeFile(file, updated);
        console.log(`Finished updating ${file}`);
    }
};

// Script entry point: hand the first command-line argument to
// generate_descriptions; if it fails, print the error and exit with status 1.
(async () => {
    const [chapter_arg] = process.argv.slice(2);
    await generate_descriptions(chapter_arg);
})().catch((failure) => {
    console.error(failure);
    process.exit(1);
});