11"""
22Process raw qstr file and output qstr data with length, hash and data bytes.
33
4- This script works with Python 2.7, 3.3 and 3.4.
4+ This script is only regularly tested with the same version of Python used
5+ during CI, typically the latest "3.x". However, incompatibilities with any
6+ supported CPython version are unintended.
57
68For documentation about the format of compressed translated strings, see
79supervisor/shared/translate/translate.h
1618
1719import collections
1820import gettext
19- import os . path
21+ import pathlib
2022
2123if hasattr (sys .stdout , "reconfigure" ):
2224 sys .stdout .reconfigure (encoding = "utf-8" )
2325 sys .stderr .reconfigure (errors = "backslashreplace" )
2426
25- py = os .path .dirname (sys .argv [0 ])
26- top = os .path .dirname (py )
27-
28- sys .path .append (os .path .join (top , "tools/huffman" ))
27+ sys .path .append (str (pathlib .Path (__file__ ).parent .parent / "tools/huffman" ))
2928
3029import huffman
31-
32- # Python 2/3 compatibility:
33- # - iterating through bytes is different
34- # - codepoint2name lives in a different module
35- import platform
36-
37- if platform .python_version_tuple ()[0 ] == "2" :
38- bytes_cons = lambda val , enc = None : bytearray (val )
39- from htmlentitydefs import codepoint2name
40- elif platform .python_version_tuple ()[0 ] == "3" :
41- bytes_cons = bytes
42- from html .entities import codepoint2name
43- # end compatibility code
30+ from html .entities import codepoint2name
4431
4532codepoint2name [ord ("-" )] = "hyphen"
4633
@@ -182,9 +169,15 @@ class EncodingTable:
182169 extractor : object
183170 apply_offset : object
184171 remove_offset : object
172+ translation_qstr_bits : int
173+ qstrs : object
174+ qstrs_inv : object
185175
186176
187- def compute_huffman_coding (translation_name , translations , f ):
177+ def compute_huffman_coding (qstrs , translation_name , translations , f ):
178+ # possible future improvement: some languages are better when consider len(k) > 2. try both?
179+ qstrs = dict ((k , v ) for k , v in qstrs .items () if len (k ) > 3 )
180+ qstr_strs = list (qstrs .keys ())
188181 texts = [t [1 ] for t in translations ]
189182 words = []
190183
@@ -234,10 +227,12 @@ def remove_offset(c):
234227 # if "the" is in words then not only will "the" not be considered
235228 # again, neither will "there" or "wither", since they have "the"
236229 # as substrings.
237- extractor = TextSplitter (words )
230+ extractor = TextSplitter (words + qstr_strs )
238231 counter = collections .Counter ()
239232 for t in texts :
240233 for atom in extractor .iter (t ):
234+ if atom in qstrs :
235+ atom = "\1 "
241236 counter [atom ] += 1
242237 cb = huffman .codebook (counter .items ())
243238 lengths = sorted (dict ((v , len (cb [k ])) for k , v in counter .items ()).items ())
@@ -304,10 +299,14 @@ def est_net_savings(s, occ):
304299 words .append (word )
305300
306301 words .sort (key = len )
307- extractor = TextSplitter (words )
302+ extractor = TextSplitter (words + qstr_strs )
308303 counter = collections .Counter ()
304+ used_qstr = 0
309305 for t in texts :
310306 for atom in extractor .iter (t ):
307+ if atom in qstrs :
308+ used_qstr = max (used_qstr , qstrs [atom ])
309+ atom = "\1 "
311310 counter [atom ] += 1
312311 cb = huffman .codebook (counter .items ())
313312
@@ -322,6 +321,8 @@ def est_net_savings(s, occ):
322321 last_length = None
323322 canonical = {}
324323 for atom , code in sorted (cb .items (), key = lambda x : (len (x [1 ]), x [0 ])):
324+ if atom in qstr_strs :
325+ atom = "\1 "
325326 values .append (atom )
326327 length = len (code )
327328 if length not in length_count :
@@ -359,6 +360,8 @@ def est_net_savings(s, occ):
359360 minlen = len (words [0 ])
360361 wlencount = [len ([None for w in words if len (w ) == l ]) for l in range (minlen , maxlen + 1 )]
361362
363+ translation_qstr_bits = used_qstr .bit_length ()
364+
362365 f .write ("typedef {} mchar_t;\n " .format (values_type ))
363366 f .write ("const uint8_t lengths[] = {{ {} }};\n " .format (", " .join (map (str , lengths ))))
364367 f .write (
@@ -383,34 +386,44 @@ def est_net_savings(s, occ):
383386 f .write ("#define maxlen {}\n " .format (maxlen ))
384387 f .write ("#define translation_offstart {}\n " .format (offstart ))
385388 f .write ("#define translation_offset {}\n " .format (offset ))
386-
387- return EncodingTable (values , lengths , words , canonical , extractor , apply_offset , remove_offset )
389+ f .write ("#define translation_qstr_bits {}\n " .format (translation_qstr_bits ))
390+
391+ qstrs_inv = dict ((v , k ) for k , v in qstrs .items ())
392+ return EncodingTable (
393+ values ,
394+ lengths ,
395+ words ,
396+ canonical ,
397+ extractor ,
398+ apply_offset ,
399+ remove_offset ,
400+ translation_qstr_bits ,
401+ qstrs ,
402+ qstrs_inv ,
403+ )
388404
389405
390406def decompress (encoding_table , encoded , encoded_length_bits ):
407+ qstrs_inv = encoding_table .qstrs_inv
391408 values = encoding_table .values
392409 lengths = encoding_table .lengths
393410 words = encoding_table .words
394411
412+ def bititer ():
413+ for byte in encoded :
414+ for bit in (0x80 , 0x40 , 0x20 , 0x10 , 0x8 , 0x4 , 0x2 , 0x1 ):
415+ yield bool (byte & bit )
416+
417+ nextbit = bititer ().__next__
418+
419+ def getnbits (n ):
420+ bits = 0
421+ for i in range (n ):
422+ bits = (bits << 1 ) | nextbit ()
423+ return bits
424+
395425 dec = []
396- this_byte = 0
397- this_bit = 7
398- b = encoded [this_byte ]
399- bits = 0
400- for i in range (encoded_length_bits ):
401- bits <<= 1
402- if 0x80 & b :
403- bits |= 1
404-
405- b <<= 1
406- if this_bit == 0 :
407- this_bit = 7
408- this_byte += 1
409- if this_byte < len (encoded ):
410- b = encoded [this_byte ]
411- else :
412- this_bit -= 1
413- length = bits
426+ length = getnbits (encoded_length_bits )
414427
415428 i = 0
416429 while i < length :
@@ -419,27 +432,19 @@ def decompress(encoding_table, encoded, encoded_length_bits):
419432 max_code = lengths [0 ]
420433 searched_length = lengths [0 ]
421434 while True :
422- bits <<= 1
423- if 0x80 & b :
424- bits |= 1
425-
426- b <<= 1
435+ bits = (bits << 1 ) | nextbit ()
427436 bit_length += 1
428- if this_bit == 0 :
429- this_bit = 7
430- this_byte += 1
431- if this_byte < len (encoded ):
432- b = encoded [this_byte ]
433- else :
434- this_bit -= 1
435437 if max_code > 0 and bits < max_code :
436438 # print('{0:0{width}b}'.format(bits, width=bit_length))
437439 break
438440 max_code = (max_code << 1 ) + lengths [bit_length ]
439441 searched_length += lengths [bit_length ]
440442
441443 v = values [searched_length + bits - max_code ]
442- if v >= chr (0x80 ) and v < chr (0x80 + len (words )):
444+ if v == chr (1 ):
445+ qstr_idx = getnbits (encoding_table .translation_qstr_bits )
446+ v = qstrs_inv [qstr_idx ]
447+ elif v >= chr (0x80 ) and v < chr (0x80 + len (words )):
443448 v = words [ord (v ) - 0x80 ]
444449 i += len (v .encode ("utf-8" ))
445450 dec .append (v )
@@ -449,36 +454,37 @@ def decompress(encoding_table, encoded, encoded_length_bits):
449454def compress (encoding_table , decompressed , encoded_length_bits , len_translation_encoded ):
450455 if not isinstance (decompressed , str ):
451456 raise TypeError ()
457+ qstrs = encoding_table .qstrs
452458 canonical = encoding_table .canonical
453459 extractor = encoding_table .extractor
454460
455- enc = bytearray (len (decompressed ) * 3 )
456- current_bit = 7
457- current_byte = 0
458-
459- bits = encoded_length_bits + 1
460- for i in range (bits - 1 , 0 , - 1 ):
461- if len_translation_encoded & (1 << (i - 1 )):
462- enc [current_byte ] |= 1 << current_bit
463- if current_bit == 0 :
464- current_bit = 7
465- current_byte += 1
466- else :
467- current_bit -= 1
461+ enc = 1
462+
463+ def put_bit (enc , b ):
464+ return (enc << 1 ) | bool (b )
465+
466+ def put_bits (enc , b , n ):
467+ for i in range (n - 1 , - 1 , - 1 ):
468+ enc = put_bit (enc , b & (1 << i ))
469+ return enc
470+
471+ enc = put_bits (enc , len_translation_encoded , encoded_length_bits )
468472
469473 for atom in extractor .iter (decompressed ):
470- for b in canonical [atom ]:
471- if b == "1" :
472- enc [current_byte ] |= 1 << current_bit
473- if current_bit == 0 :
474- current_bit = 7
475- current_byte += 1
476- else :
477- current_bit -= 1
474+ if atom in qstrs :
475+ can = canonical ["\1 " ]
476+ else :
477+ can = canonical [atom ]
478+ for b in can :
479+ enc = put_bit (enc , b == "1" )
480+ if atom in qstrs :
481+ enc = put_bits (enc , qstrs [atom ], encoding_table .translation_qstr_bits )
482+
483+ while enc .bit_length () % 8 != 1 :
484+ enc = put_bit (enc , 0 )
478485
479- if current_bit != 7 :
480- current_byte += 1
481- return enc [:current_byte ]
486+ r = enc .to_bytes ((enc .bit_length () + 7 ) // 8 , "big" )
487+ return r [1 :]
482488
483489
484490def qstr_escape (qst ):
@@ -493,10 +499,20 @@ def esc_char(m):
493499 return re .sub (r"[^A-Za-z0-9_]" , esc_char , qst )
494500
495501
502+ def parse_qstrs (infile ):
503+ r = {}
504+ rx = re .compile (r'QDEF\([A-Za-z0-9_]+,\s*\d+,\s*\d+,\s*(?P<cstr>"(?:[^"\\\\]*|\\.)")\)' )
505+ content = infile .read ()
506+ for i , mat in enumerate (rx .findall (content , re .M )):
507+ mat = eval (mat )
508+ r [mat ] = i
509+ return r
510+
511+
496512def parse_input_headers (infiles ):
497513 i18ns = set ()
498514
499- # read the qstrs in from the input files
515+ # read the TRANSLATE strings in from the input files
500516 for infile in infiles :
501517 with open (infile , "rt" ) as f :
502518 for line in f :
@@ -516,12 +532,12 @@ def escape_bytes(qstr):
516532 return qstr
517533 else :
518534 # qstr contains non-printable codes so render entire thing as hex pairs
519- qbytes = bytes_cons (qstr , "utf8" )
535+ qbytes = bytes (qstr , "utf8" )
520536 return "" .join (("\\ x%02x" % b ) for b in qbytes )
521537
522538
523539def make_bytes (cfg_bytes_len , cfg_bytes_hash , qstr ):
524- qbytes = bytes_cons (qstr , "utf8" )
540+ qbytes = bytes (qstr , "utf8" )
525541 qlen = len (qbytes )
526542 qhash = compute_hash (qbytes , cfg_bytes_hash )
527543 if qlen >= (1 << (8 * cfg_bytes_len )):
@@ -551,7 +567,7 @@ def output_translation_data(encoding_table, i18ns, out):
551567 )
552568 total_text_compressed_size += len (compressed )
553569 decompressed = decompress (encoding_table , compressed , encoded_length_bits )
554- assert decompressed == translation
570+ assert decompressed == translation , ( decompressed , translation )
555571 for c in C_ESCAPES :
556572 decompressed = decompressed .replace (c , C_ESCAPES [c ])
557573 formatted = ["{:d}" .format (x ) for x in compressed ]
@@ -572,7 +588,7 @@ def output_translation_data(encoding_table, i18ns, out):
572588 import argparse
573589
574590 parser = argparse .ArgumentParser (
575- description = "Process QSTR definitions into headers for compilation"
591+ description = "Process TRANSLATE strings into headers for compilation"
576592 )
577593 parser .add_argument (
578594 "infiles" , metavar = "N" , type = str , nargs = "+" , help = "an integer for the accumulator"
@@ -590,13 +606,19 @@ def output_translation_data(encoding_table, i18ns, out):
590606 type = argparse .FileType ("w" , encoding = "UTF-8" ),
591607 help = "c file for translation data" ,
592608 )
609+ parser .add_argument (
610+ "--qstrdefs_filename" ,
611+ type = argparse .FileType ("r" , encoding = "UTF-8" ),
612+ help = "" ,
613+ )
593614
594615 args = parser .parse_args ()
595616
617+ qstrs = parse_qstrs (args .qstrdefs_filename )
596618 i18ns = parse_input_headers (args .infiles )
597619 i18ns = sorted (i18ns )
598620 translations = translate (args .translation , i18ns )
599621 encoding_table = compute_huffman_coding (
600- args .translation , translations , args .compression_filename
622+ qstrs , args .translation , translations , args .compression_filename
601623 )
602624 output_translation_data (encoding_table , translations , args .translation_filename )
0 commit comments