@@ -2483,6 +2483,7 @@ class PartialEvaluator {
24832483 transform : null ,
24842484 fontName : null ,
24852485 hasEOL : false ,
2486+ chars : [ ] ,
24862487 } ;
24872488
24882489 // Use a circular buffer (length === 2) to save the last chars in the
@@ -2723,6 +2724,7 @@ class PartialEvaluator {
27232724 transform : textChunk . transform ,
27242725 fontName : textChunk . fontName ,
27252726 hasEOL : textChunk . hasEOL ,
2727+ chars : textChunk . chars ,
27262728 } ;
27272729 }
27282730
@@ -3073,6 +3075,9 @@ class PartialEvaluator {
30733075 scaledDim = 0 ;
30743076 }
30753077
3078+ let prevWidth = textChunk . width ;
3079+ let m = Util . transform ( textState . ctm , textState . textMatrix ) ;
3080+
30763081 if ( ! font . vertical ) {
30773082 scaledDim *= textState . textHScale ;
30783083 intersector ?. addGlyph (
@@ -3116,6 +3121,180 @@ class PartialEvaluator {
31163121 textChunk . str . push ( glyphUnicode ) ;
31173122 }
31183123
/**
 * Snap an angle to the nearest axis-aligned rotation.
 *
 * Distance is measured around the circle: 360 is treated as an alias of 0,
 * so an angle such as 359 snaps to 0 instead of 270.
 *
 * @param {number} degrees - Angle in degrees. Values outside [0, 360) are
 *   wrapped into that range first.
 * @returns {number} One of 0, 90, 180 or 270. Exact ties (45, 135, 225, 315)
 *   resolve to the lower neighbour; a non-finite input yields 0.
 */
function closestStandardAngle(degrees) {
  // 360 is listed so that angles just below a full turn can reach 0;
  // it is mapped back to 0 on return.
  const candidates = [0, 90, 180, 270, 360];

  // Wrap negative or over-range inputs into [0, 360).
  const wrapped = ((degrees % 360) + 360) % 360;

  let closestAngle = candidates[0];
  let minDifference = Math.abs(wrapped - closestAngle);

  for (let i = 1; i < candidates.length; i++) {
    const difference = Math.abs(wrapped - candidates[i]);
    // Strict comparison keeps the earlier (lower) candidate on ties.
    if (difference < minDifference) {
      minDifference = difference;
      closestAngle = candidates[i];
    }
  }

  return closestAngle % 360;
}
3139+
/**
 * Derive the rotation of a transform matrix, snapped to a standard angle.
 *
 * The angle is taken from the first two matrix entries via
 * `Math.atan2(matrix[1], matrix[0])`, shifted into the non-negative range,
 * rounded to whole degrees, reduced modulo 360 and finally passed through
 * `closestStandardAngle`.
 *
 * @param {Array<number>} matrix - Transform; only indices 0 and 1 are read.
 * @returns {number} Whatever `closestStandardAngle` returns for the
 *   computed whole-degree angle.
 */
function matrixToDegrees(matrix) {
  const FULL_TURN = 2 * Math.PI;

  const rawAngle = Math.atan2(matrix[1], matrix[0]);
  // atan2 yields (-PI, PI]; move negative angles to their positive equivalent.
  const positiveAngle = rawAngle < 0 ? rawAngle + FULL_TURN : rawAngle;

  // Rounding can produce exactly 360, which the modulo folds back to 0.
  const wholeDegrees = Math.round(positiveAngle * (180 / Math.PI)) % 360;
  const normalized = wholeDegrees < 0 ? wholeDegrees + 360 : wholeDegrees;

  return closestStandardAngle(normalized);
}
3153+
3154+ let rotation = matrixToDegrees ( m ) ;
3155+
3156+ let ascent = font . ascent ;
3157+ let descent = font . descent ;
3158+ if ( descent > 0 ) {
3159+ descent = - descent ;
3160+ }
3161+ if ( ascent && descent ) {
3162+ if ( ascent > 1 ) {
3163+ ascent = 0.75 ;
3164+ }
3165+ if ( descent < - 0.5 ) {
3166+ descent = - 0.25 ;
3167+ }
3168+ }
3169+ else {
3170+ ascent = 0.75 ;
3171+ descent = - 0.25 ;
3172+ }
3173+
3174+ if ( font . capHeight && font . capHeight < ascent && font . capHeight > 0 ) {
3175+ ascent = font . capHeight ;
3176+ }
3177+
3178+ let charWidth = textChunk . width - prevWidth ;
3179+ let rect = [ 0 , textState . fontSize * descent , charWidth , textState . fontSize * ascent ]
3180+
3181+ if (
3182+ font . isType3Font &&
3183+ textState . fontSize <= 1 &&
3184+ ! isArrayEqual ( textState . fontMatrix , FONT_IDENTITY_MATRIX )
3185+ ) {
3186+ const glyphHeight = font . bbox [ 3 ] - font . bbox [ 1 ] ;
3187+ if ( glyphHeight > 0 ) {
3188+ rect [ 1 ] = font . bbox [ 1 ] * textState . fontMatrix [ 3 ] ;
3189+ rect [ 3 ] = font . bbox [ 3 ] * textState . fontMatrix [ 3 ] ;
3190+ }
3191+ }
3192+
3193+ let rect2 = [ Infinity , Infinity , - Infinity , - Infinity ] ;
3194+ Util . axialAlignedBoundingBox ( rect , m , rect2 ) ;
3195+ rect = rect2 ;
3196+
3197+ let baselineRect = [ Infinity , Infinity , - Infinity , - Infinity ] ;
3198+ Util . axialAlignedBoundingBox ( [ 0 , 0 , 0 , 0 ] , m , baselineRect ) ;
3199+ let baseline = 0 ;
3200+ if ( rotation === 0 || rotation === 180 ) {
3201+ baseline = baselineRect [ 1 ] ;
3202+ }
3203+ else if ( rotation === 90 || rotation === 270 ) {
3204+ baseline = baselineRect [ 0 ] ;
3205+ }
3206+
3207+ let p1 = [ 0 , 0 ] ;
3208+ let p2 = [ 0 , 1 ] ;
3209+
3210+ Util . applyTransform ( p1 , getCurrentTextTransform ( ) ) ;
3211+ Util . applyTransform ( p2 , getCurrentTextTransform ( ) ) ;
3212+
3213+ let [ x1 , y1 ] = p1 ;
3214+ let [ x2 , y2 ] = p2
3215+
3216+ let fontSize = Math . hypot ( x1 - x2 , y1 - y2 ) ;
3217+
3218+ let diagonal = rotation % 90 !== 0 ;
3219+
/**
 * Normalize the Unicode text of a single glyph.
 *
 * Uses NFKC, i.e. compatibility decomposition (NFKD) followed by canonical
 * composition. Compatibility characters such as ligatures are expanded
 * ("\uFB01" -> "fi"), while base letter + combining mark sequences are
 * recomposed ("e\u0301" -> "\u00E9").
 *
 * This replaces a hand-maintained recomposition table that covered only a
 * few lowercase letters and was looked up on a plain object, where inherited
 * keys such as "constructor" matched and produced a non-string result.
 *
 * @param {string} char - Text of one glyph; may be longer than one code unit.
 * @returns {string} The NFKC-normalized text.
 */
function normalizeChar(char) {
  return char.normalize('NFKC');
}
3258+
3259+ let charCode = glyph . unicode . charCodeAt ( 0 ) ;
3260+
3261+ if (
3262+ glyph . unicode !== ' ' &&
3263+ fontSize !== 0 &&
3264+ // Skip null and other control characters to avoid breaking strings, DOM, and even browsers…
3265+ // TODO: Consider skipping other non-printable characters as well
3266+ // TODO: Determine whether it's better to skip or replace these characters
3267+ // since we may need to keep PDF.js text layer character offsets aligned with
3268+ // Zotero reader text layer character offsets
3269+ ! (
3270+ // ASCII control characters
3271+ ( charCode >= 0x00 && charCode <= 0x1F ) ||
3272+ // Extended control characters
3273+ ( charCode >= 0x7F && charCode <= 0x9F )
3274+ )
3275+ ) {
3276+ textChunk . chars . push ( {
3277+ // Decomposed ligatures, normalized Arabic characters
3278+ c : normalizeChar ( glyphUnicode ) ,
3279+ // Normalizes Arabic characters and other characters whose length remains 1, but preserves
3280+ // ligatures and, more importantly, avoids 'e\u00b4' being converted into 'e \u0301',
3281+ // which is quite common in Spanish author names and, because of the space, prevents
3282+ // author name recognition
3283+ // NOTE: THIS CAN STILL HAVE DECOMPOSED LIGATURES IF THE FONT HAS ITS OWN CHARACTER MAPPING,
3284+ // THEREFORE CONSIDER DITCHING THIS PROPERTY
3285+ u : glyphUnicode . length === 1 ? glyphUnicode : glyph . unicode ,
3286+ rect,
3287+ fontSize,
3288+ fontName : textState . font . name ,
3289+ bold : textState . font . bold ,
3290+ italic : textState . font . italic ,
3291+ glyphWidth,
3292+ baseline,
3293+ rotation,
3294+ diagonal,
3295+ } ) ;
3296+ }
3297+
31193298 if ( charSpacing ) {
31203299 if ( ! font . vertical ) {
31213300 textState . translateTextMatrix (
@@ -3198,6 +3377,7 @@ class PartialEvaluator {
31983377 textContent . items . push ( runBidiTransform ( textContentItem ) ) ;
31993378 textContentItem . initialized = false ;
32003379 textContentItem . str . length = 0 ;
3380+ textContentItem . chars = [ ] ;
32013381 }
32023382
32033383 function enqueueChunk ( batch = false ) {
0 commit comments