@@ -174,7 +174,7 @@ class EncodingTable:
174174 qstrs_inv : object
175175
176176
177- def compute_huffman_coding (qstrs , translation_name , translations , f ):
177+ def compute_huffman_coding (qstrs , translation_name , translations , f , compression_level ):
178178 # possible future improvement: some languages are better when considering len(k) > 2. try both?
179179 qstrs = dict ((k , v ) for k , v in qstrs .items () if len (k ) > 3 )
180180 qstr_strs = list (qstrs .keys ())
@@ -209,6 +209,8 @@ def remove_offset(c):
209209 if 0x80 <= ord_c < 0xFF :
210210 end_unused = min (ord_c , end_unused )
211211 max_words = end_unused - 0x80
212+ if compression_level < 5 :
213+ max_words = 0
212214
213215 bits_per_codepoint = 16 if max_ord > 255 else 8
214216 values_type = "uint16_t" if max_ord > 255 else "uint8_t"
@@ -298,8 +300,12 @@ def est_net_savings(s, occ):
298300 word = scores [0 ][0 ]
299301 words .append (word )
300302
303+ splitters = words [:]
304+ if compression_level > 3 :
305+ splitters .extend (qstr_strs )
306+
301307 words .sort (key = len )
302- extractor = TextSplitter (words + qstr_strs )
308+ extractor = TextSplitter (splitters )
303309 counter = collections .Counter ()
304310 used_qstr = 0
305311 for t in texts :
@@ -356,8 +362,8 @@ def est_net_savings(s, occ):
356362 len (translation .encode ("utf-8" )) for (original , translation ) in translations
357363 )
358364
359- maxlen = len (words [- 1 ])
360- minlen = len (words [0 ])
365+ maxlen = len (words [- 1 ]) if words else 0
366+ minlen = len (words [0 ]) if words else 0
361367 wlencount = [len ([None for w in words if len (w ) == l ]) for l in range (minlen , maxlen + 1 )]
362368
363369 translation_qstr_bits = used_qstr .bit_length ()
@@ -596,6 +602,12 @@ def output_translation_data(encoding_table, i18ns, out):
596602 parser .add_argument (
597603 "--translation" , default = None , type = str , help = "translations for i18n() items"
598604 )
605+ parser .add_argument (
606+ "--compression_level" ,
607+ type = int ,
608+ default = 9 ,
609+ help = "degree of compression (>5: construct dictionary; >3: use qstrs)" ,
610+ )
599611 parser .add_argument (
600612 "--compression_filename" ,
601613 type = argparse .FileType ("w" , encoding = "UTF-8" ),
@@ -619,6 +631,6 @@ def output_translation_data(encoding_table, i18ns, out):
619631 i18ns = sorted (i18ns )
620632 translations = translate (args .translation , i18ns )
621633 encoding_table = compute_huffman_coding (
622- qstrs , args .translation , translations , args .compression_filename
634+ qstrs , args .translation , translations , args .compression_filename , args . compression_level
623635 )
624636 output_translation_data (encoding_table , translations , args .translation_filename )
0 commit comments