Skip to content

Commit 2ec80d8

Browse files
committed
Implement text analyzer
1 parent c1ee141 commit 2ec80d8

27 files changed

Lines changed: 4904 additions & 0 deletions

src/core/document.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ import { StreamsSequenceStream } from "./decode_stream.js";
7979
import { StructTreePage } from "./struct_tree.js";
8080
import { XFAFactory } from "./xfa/factory.js";
8181
import { XRef } from "./xref.js";
82+
import { Module } from "./module/module.js";
8283

8384
const LETTER_SIZE_MEDIABOX = [0, 0, 612, 792];
8485

@@ -1041,6 +1042,7 @@ class PDFDocument {
10411042
this.pdfManager = pdfManager;
10421043
this.stream = stream;
10431044
this.xref = new XRef(stream, pdfManager);
1045+
this.module = new Module(this);
10441046

10451047
const idCounters = {
10461048
font: 0,

src/core/evaluator.js

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2483,6 +2483,7 @@ class PartialEvaluator {
24832483
transform: null,
24842484
fontName: null,
24852485
hasEOL: false,
2486+
chars: [],
24862487
};
24872488

24882489
// Use a circular buffer (length === 2) to save the last chars in the
@@ -2723,6 +2724,7 @@ class PartialEvaluator {
27232724
transform: textChunk.transform,
27242725
fontName: textChunk.fontName,
27252726
hasEOL: textChunk.hasEOL,
2727+
chars: textChunk.chars,
27262728
};
27272729
}
27282730

@@ -3073,6 +3075,9 @@ class PartialEvaluator {
30733075
scaledDim = 0;
30743076
}
30753077

3078+
let prevWidth = textChunk.width;
3079+
let m = Util.transform(textState.ctm, textState.textMatrix);
3080+
30763081
if (!font.vertical) {
30773082
scaledDim *= textState.textHScale;
30783083
intersector?.addGlyph(
@@ -3116,6 +3121,180 @@ class PartialEvaluator {
31163121
textChunk.str.push(glyphUnicode);
31173122
}
31183123

3124+
/**
 * Snap an angle to the nearest axis-aligned rotation.
 *
 * The distance is measured around the circle: 360 is treated as an alias
 * of 0, so an angle such as 359 snaps to 0 instead of 270. On an exact tie
 * (45, 135, 225, 315) the lower neighbour wins.
 *
 * @param {number} degrees - Angle in degrees. Values outside [0, 360) are
 *   reduced modulo 360 before snapping.
 * @returns {number} One of 0, 90, 180 or 270. Non-finite input yields 0.
 */
function closestStandardAngle(degrees) {
  const FULL_TURN = 360;
  // The trailing 360 stands for "0, reached by going forwards".
  const candidates = [0, 90, 180, 270, FULL_TURN];

  // Reduce to [0, 360). In-range values pass through `%` unchanged.
  let normalized = degrees % FULL_TURN;
  if (normalized < 0) {
    normalized += FULL_TURN;
  }

  let closestAngle = candidates[0];
  let minDifference = Infinity;
  for (const candidate of candidates) {
    const difference = Math.abs(normalized - candidate);
    // Strict `<` keeps the earliest candidate on ties.
    if (difference < minDifference) {
      minDifference = difference;
      closestAngle = candidate;
    }
  }

  // Fold the 360 alias back onto 0.
  return closestAngle % FULL_TURN;
}
3139+
3140+
/**
 * Derive an axis-aligned rotation from a transform matrix.
 *
 * Only `matrix[0]` and `matrix[1]` are read; their `atan2` gives the
 * direction of the transformed x-axis. The angle is rounded to whole
 * degrees, folded into [0, 360) and then snapped by
 * `closestStandardAngle`.
 *
 * @param {Array<number>} matrix - Transform whose first two entries are
 *   used (presumably a 6-element PDF matrix [a, b, c, d, e, f] — confirm
 *   against the caller).
 * @returns {number} Whatever `closestStandardAngle` returns for the
 *   rounded angle.
 */
function matrixToDegrees(matrix) {
  // atan2 returns a value in [-PI, PI]; shift negatives into [0, 2*PI].
  let radians = Math.atan2(matrix[1], matrix[0]);
  if (radians < 0) {
    radians += 2 * Math.PI;
  }

  // radians is non-negative here, so the rounded value lies in [0, 360]
  // and a single modulo folds 360 back to 0. No negative case can occur.
  const degrees = Math.round(radians * (180 / Math.PI)) % 360;

  return closestStandardAngle(degrees);
}
3153+
3154+
let rotation = matrixToDegrees(m);
3155+
3156+
let ascent = font.ascent;
3157+
let descent = font.descent;
3158+
if (descent > 0) {
3159+
descent = -descent;
3160+
}
3161+
if (ascent && descent) {
3162+
if (ascent > 1) {
3163+
ascent = 0.75;
3164+
}
3165+
if (descent < -0.5) {
3166+
descent = -0.25;
3167+
}
3168+
}
3169+
else {
3170+
ascent = 0.75;
3171+
descent = -0.25;
3172+
}
3173+
3174+
if (font.capHeight && font.capHeight < ascent && font.capHeight > 0) {
3175+
ascent = font.capHeight;
3176+
}
3177+
3178+
let charWidth = textChunk.width - prevWidth;
3179+
let rect = [0, textState.fontSize * descent, charWidth, textState.fontSize * ascent]
3180+
3181+
if (
3182+
font.isType3Font &&
3183+
textState.fontSize <= 1 &&
3184+
!isArrayEqual(textState.fontMatrix, FONT_IDENTITY_MATRIX)
3185+
) {
3186+
const glyphHeight = font.bbox[3] - font.bbox[1];
3187+
if (glyphHeight > 0) {
3188+
rect[1] = font.bbox[1] * textState.fontMatrix[3];
3189+
rect[3] = font.bbox[3] * textState.fontMatrix[3];
3190+
}
3191+
}
3192+
3193+
let rect2 = [Infinity, Infinity, -Infinity, -Infinity];
3194+
Util.axialAlignedBoundingBox(rect, m, rect2);
3195+
rect = rect2;
3196+
3197+
let baselineRect = [Infinity, Infinity, -Infinity, -Infinity];
3198+
Util.axialAlignedBoundingBox([0, 0, 0, 0], m, baselineRect);
3199+
let baseline = 0;
3200+
if (rotation === 0 || rotation === 180) {
3201+
baseline = baselineRect[1];
3202+
}
3203+
else if (rotation === 90 || rotation === 270) {
3204+
baseline = baselineRect[0];
3205+
}
3206+
3207+
let p1 = [0, 0];
3208+
let p2 = [0, 1];
3209+
3210+
Util.applyTransform(p1, getCurrentTextTransform());
3211+
Util.applyTransform(p2, getCurrentTextTransform());
3212+
3213+
let [x1, y1] = p1;
3214+
let [x2, y2] = p2
3215+
3216+
let fontSize = Math.hypot(x1 - x2, y1 - y2);
3217+
3218+
let diagonal = rotation % 90 !== 0;
3219+
3220+
/**
 * Normalize a glyph's text for the per-character list.
 *
 * The input is first put into NFKD form, which splits ligatures and also
 * splits accented letters into base + combining mark. A fixed set of
 * lowercase Latin base + mark pairs is then mapped back to the precomposed
 * letter; every other sequence is returned in its NFKD form.
 *
 * The lookup uses a Map so that strings which happen to match
 * `Object.prototype` members (e.g. "constructor", "__proto__") are not
 * mistaken for table hits.
 *
 * NOTE(review): the table is limited to the pairs listed below (no
 * uppercase letters, no other marks). Whether a general recomposition such
 * as NFKC would be acceptable depends on downstream consumers — confirm
 * before widening it.
 *
 * @param {string} char - Text of a single glyph (may be more than one
 *   UTF-16 unit).
 * @returns {string} The recomposed letter for a known pair, otherwise the
 *   NFKD form of `char`.
 */
function normalizeChar(char) {
  // Decompose ligatures and combined characters.
  const decomposed = char.normalize("NFKD");

  // Known base + combining-mark pairs that should stay precomposed.
  const specialCases = new Map([
    ["e\u0301", "\u00e9"], // e + acute -> é
    ["a\u0301", "\u00e1"], // a + acute -> á
    ["i\u0301", "\u00ed"], // i + acute -> í
    ["o\u0301", "\u00f3"], // o + acute -> ó
    ["u\u0301", "\u00fa"], // u + acute -> ú
    ["e\u0300", "\u00e8"], // e + grave -> è
    ["a\u0300", "\u00e0"], // a + grave -> à
    ["i\u0300", "\u00ec"], // i + grave -> ì
    ["o\u0300", "\u00f2"], // o + grave -> ò
    ["u\u0300", "\u00f9"], // u + grave -> ù
    ["e\u0302", "\u00ea"], // e + circumflex -> ê
    ["a\u0302", "\u00e2"], // a + circumflex -> â
    ["i\u0302", "\u00ee"], // i + circumflex -> î
    ["o\u0302", "\u00f4"], // o + circumflex -> ô
    ["u\u0302", "\u00fb"], // u + circumflex -> û
    ["e\u0308", "\u00eb"], // e + diaeresis -> ë
    ["a\u0308", "\u00e4"], // a + diaeresis -> ä
    ["i\u0308", "\u00ef"], // i + diaeresis -> ï
    ["o\u0308", "\u00f6"], // o + diaeresis -> ö
    ["u\u0308", "\u00fc"], // u + diaeresis -> ü
    ["c\u0327", "\u00e7"], // c + cedilla -> ç
    ["n\u0303", "\u00f1"], // n + tilde -> ñ
    // Add other special cases here
  ]);

  // Map#get yields undefined on a miss; fall back to the NFKD form.
  return specialCases.get(decomposed) ?? decomposed;
}
3258+
3259+
let charCode = glyph.unicode.charCodeAt(0);
3260+
3261+
if (
3262+
glyph.unicode !== ' ' &&
3263+
fontSize !== 0 &&
3264+
// Skip null and other control characters to avoid breaking strings, DOM, and even browsers…
3265+
// TODO: Consider skipping other non-printable characters as well
3266+
// TODO: Determine whether it's better to skip or replace these characters
3267+
// since we may need to keep PDF.js text layer character offsets aligned with
3268+
// Zotero reader text layer character offsets
3269+
!(
3270+
// ASCII control characters
3271+
(charCode >= 0x00 && charCode <= 0x1F) ||
3272+
// Extended control characters
3273+
(charCode >= 0x7F && charCode <= 0x9F)
3274+
)
3275+
) {
3276+
textChunk.chars.push({
3277+
// Decomposed ligatures, normalized Arabic characters
3278+
c: normalizeChar(glyphUnicode),
3279+
// Normalizes Arabic characters and other characters where length remains 1, but preserves
3280+
// ligatures and, more importantly, avoids 'e\u00b4' being converted into 'e \u0301'
3281+
// which is quite common in Spanish author names and because of the space prevents
3282+
// author name recognition
3283+
// NOTE: THIS CAN STILL HAVE DECOMPOSED LIGATURES IF THE FONT HAS ITS OWN CHARACTER MAPPING,
3284+
// THEREFORE CONSIDER DITCHING THIS PROPERTY
3285+
u: glyphUnicode.length === 1 ? glyphUnicode : glyph.unicode,
3286+
rect,
3287+
fontSize,
3288+
fontName: textState.font.name,
3289+
bold: textState.font.bold,
3290+
italic: textState.font.italic,
3291+
glyphWidth,
3292+
baseline,
3293+
rotation,
3294+
diagonal,
3295+
});
3296+
}
3297+
31193298
if (charSpacing) {
31203299
if (!font.vertical) {
31213300
textState.translateTextMatrix(
@@ -3198,6 +3377,7 @@ class PartialEvaluator {
31983377
textContent.items.push(runBidiTransform(textContentItem));
31993378
textContentItem.initialized = false;
32003379
textContentItem.str.length = 0;
3380+
textContentItem.chars = [];
32013381
}
32023382

32033383
function enqueueChunk(batch = false) {

0 commit comments

Comments
 (0)