-
-
Notifications
You must be signed in to change notification settings - Fork 209
Expand file tree
/
Copy pathgenerate_figure_descriptions.js
More file actions
109 lines (94 loc) · 4.32 KB
/
generate_figure_descriptions.js
File metadata and controls
109 lines (94 loc) · 4.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
const fs = require('fs-extra');
const Tesseract = require('tesseract.js');
const { find_markdown_files } = require('./shared');
/**
 * Escape every regular-expression metacharacter in `str` so the text can be
 * embedded in a pattern and matched literally.
 *
 * @param {string} str - Text to escape.
 * @returns {string} The escaped text.
 */
const escapeRegExp = (str) => str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

/**
 * Build the regular expression that selects the markdown file for a chapter
 * argument such as `en/2020/performance`.
 *
 * A trailing `.md` and a leading `content/` are stripped from the argument.
 * Every slash-separated segment becomes its own capture group; the caller
 * reads the groups back as language, year and chapter. On Windows the pattern
 * expects backslash-separated paths, elsewhere forward slashes.
 *
 * @param {string} chapter_match - Chapter argument supplied by the user.
 * @returns {RegExp} Pattern to test file paths against.
 */
const build_chapter_regexp = (chapter_match) => {
  const chapter_path = chapter_match
    .replace(/\.md$/, '')
    .replace(/^content[/\\]*/, '');
  // Pattern source for one path separator (a backslash must itself be escaped).
  const separator = process.platform !== 'win32' ? '/' : '\\\\';
  const groups = escapeRegExp(chapter_path).replace(/\//g, `)${separator}(`);
  // The dot of the extension is escaped and the pattern anchored at the end so
  // that only a path really ending in "<chapter>.md" is selected.
  return new RegExp(`content${separator}(${groups})\\.md$`);
};

/**
 * Turn raw OCR output into a value that is safe inside `description="..."`.
 *
 * All whitespace runs (including newlines) are collapsed to single spaces, the
 * result is trimmed, and backslashes and double quotes are backslash-escaped.
 *
 * @param {string} text - Text returned by the OCR engine.
 * @returns {string} Single-line, escaped description (may be empty).
 */
const clean_ocr_text = (text) => text
  .replace(/\s+/g, ' ')
  .trim()
  .replace(/\\/g, '\\\\')
  .replace(/"/g, '\\"');

/**
 * Fill in empty figure descriptions of an English chapter using OCR.
 *
 * For each markdown file matching `chapter_match`, every `figure_markup(...)`
 * block that names an image and carries an empty `description=""` attribute
 * gets its description replaced by the text recognised in
 * `static/images/<year>/<chapter>/<image>`. The file is then written back.
 * Non-English chapters are skipped; missing images and OCR failures are logged
 * and the affected figure is left unchanged.
 *
 * If no argument is given, a usage hint is printed and the process exits with
 * status 1.
 *
 * @param {string} chapter_match - Chapter to process, e.g. `en/2020/performance`.
 * @returns {Promise<void>} Resolves once all matching files have been written.
 */
const generate_descriptions = async (chapter_match) => {
  if (!chapter_match) {
    console.log('Please provide an argument of the form: en/2020/performance');
    process.exit(1);
  }

  const re = build_chapter_regexp(chapter_match);
  // Matches figure_markup and captures the content inside (...)
  const figure_regexp = /\{\{\s*figure_markup\(([\s\S]*?)\)\s*\}\}/g;
  // \b keeps a longer attribute name that merely ends in the same word from
  // being mistaken for the attribute itself.
  const image_attr = /\bimage="([^"]*)"/;
  const description_attr = /\bdescription="([^"]*)"/;
  const empty_description_attr = /\bdescription=""/;

  const files = await find_markdown_files();

  for (const file of files) {
    // Skip anything that is not a path string or is not the requested chapter.
    const file_match = typeof file === 'string' ? file.match(re) : null;
    if (!file_match) continue;
    const [, language, year, chapter] = file_match;

    if (language !== 'en') {
      console.log('Skipping non-English chapter');
      continue;
    }

    console.log(`Generating descriptions for the ${year} ${chapter} chapter:`);
    let markdown = await fs.readFile(file, 'utf-8');

    // Collect all blocks up front: `markdown` is rewritten while the blocks are
    // processed, and a /g regex keeps state between exec() calls.
    const blocks = [];
    figure_regexp.lastIndex = 0;
    let match;
    while ((match = figure_regexp.exec(markdown)) !== null) {
      blocks.push({ full: match[0], content: match[1] });
    }

    for (const block of blocks) {
      const image_match = block.content.match(image_attr);
      const description_match = block.content.match(description_attr);
      const image_file = image_match ? image_match[1] : null;

      // Only figures with an explicit, empty description="" can receive the OCR
      // text, so do not spend an OCR run on any other figure.
      if (!image_file || !description_match || description_match[1] !== '') continue;

      const image_path = `static/images/${year}/${chapter}/${image_file}`;
      if (!fs.existsSync(image_path)) {
        console.log(` Image not found: ${image_path}`);
        continue;
      }

      console.log(` Processing OCR for ${image_file}...`);
      try {
        const result = await Tesseract.recognize(image_path, 'eng');
        const description = clean_ocr_text(result.data.text);
        if (!description) continue;

        // Replacer functions are used instead of replacement strings: OCR text
        // may contain "$&", "$$", "$'" etc., which replace() would otherwise
        // expand as special replacement patterns and corrupt the description.
        const updated_block = block.full.replace(
          empty_description_attr,
          () => `description="${description}"`
        );
        // Replaces the first remaining occurrence of this exact block text.
        markdown = markdown.replace(block.full, () => updated_block);
      } catch (err) {
        console.error(` Error processing OCR for ${image_file}:`, err);
      }
    }

    await fs.writeFile(file, markdown);
    console.log(`Finished updating ${file}`);
  }
};
// Script entry point: hand the first command-line argument to
// generate_descriptions; on any failure print the error and exit with status 1.
(async () => {
  const [chapterArg] = process.argv.slice(2);
  await generate_descriptions(chapterArg);
})().catch((failure) => {
  console.error(failure);
  process.exit(1);
});