Merge remote-tracking branch 'origin/main'

weblate · weblate · commit 28e765aabb1b · 2023-09-05T22:12:24.000+02:00
diff --git a/py/maketranslationdata.py b/py/maketranslationdata.py
@@ -1,7 +1,9 @@
 """
 Process raw qstr file and output qstr data with length, hash and data bytes.
 
-This script works with Python 2.7, 3.3 and 3.4.
+This script is only regularly tested with the same version of Python used
+during CI, typically the latest "3.x". However, incompatibilities with any
+supported CPython version are unintended.
 
 For documentation about the format of compressed translated strings, see
 supervisor/shared/translate/translate.h
@@ -16,31 +18,16 @@
 
 import collections
 import gettext
-import os.path
+import pathlib
 
 if hasattr(sys.stdout, "reconfigure"):
     sys.stdout.reconfigure(encoding="utf-8")
     sys.stderr.reconfigure(errors="backslashreplace")
 
-py = os.path.dirname(sys.argv[0])
-top = os.path.dirname(py)
-
-sys.path.append(os.path.join(top, "tools/huffman"))
+sys.path.append(str(pathlib.Path(__file__).parent.parent / "tools/huffman"))
 
 import huffman
-
-# Python 2/3 compatibility:
-#   - iterating through bytes is different
-#   - codepoint2name lives in a different module
-import platform
-
-if platform.python_version_tuple()[0] == "2":
-    bytes_cons = lambda val, enc=None: bytearray(val)
-    from htmlentitydefs import codepoint2name
-elif platform.python_version_tuple()[0] == "3":
-    bytes_cons = bytes
-    from html.entities import codepoint2name
-# end compatibility code
+from html.entities import codepoint2name
 
 codepoint2name[ord("-")] = "hyphen"
 
@@ -182,9 +169,15 @@ class EncodingTable:
     extractor: object
     apply_offset: object
     remove_offset: object
+    translation_qstr_bits: int
+    qstrs: object
+    qstrs_inv: object
 
 
-def compute_huffman_coding(translation_name, translations, f):
+def compute_huffman_coding(qstrs, translation_name, translations, f):
+    # possible future improvement: some languages do better when qstrs with len(k) > 2 are considered. try both?
+    qstrs = dict((k, v) for k, v in qstrs.items() if len(k) > 3)
+    qstr_strs = list(qstrs.keys())
     texts = [t[1] for t in translations]
     words = []
 
@@ -234,10 +227,12 @@ def remove_offset(c):
         # if "the" is in words then not only will "the" not be considered
         # again, neither will "there" or "wither", since they have "the"
         # as substrings.
-        extractor = TextSplitter(words)
+        extractor = TextSplitter(words + qstr_strs)
         counter = collections.Counter()
         for t in texts:
             for atom in extractor.iter(t):
+                if atom in qstrs:
+                    atom = "\1"
                 counter[atom] += 1
         cb = huffman.codebook(counter.items())
         lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())
@@ -304,10 +299,14 @@ def est_net_savings(s, occ):
         words.append(word)
 
     words.sort(key=len)
-    extractor = TextSplitter(words)
+    extractor = TextSplitter(words + qstr_strs)
     counter = collections.Counter()
+    used_qstr = 0
     for t in texts:
         for atom in extractor.iter(t):
+            if atom in qstrs:
+                used_qstr = max(used_qstr, qstrs[atom])
+                atom = "\1"
             counter[atom] += 1
     cb = huffman.codebook(counter.items())
 
@@ -322,6 +321,8 @@ def est_net_savings(s, occ):
     last_length = None
     canonical = {}
     for atom, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
+        if atom in qstr_strs:
+            atom = "\1"
         values.append(atom)
         length = len(code)
         if length not in length_count:
@@ -359,6 +360,8 @@ def est_net_savings(s, occ):
     minlen = len(words[0])
     wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)]
 
+    translation_qstr_bits = used_qstr.bit_length()
+
     f.write("typedef {} mchar_t;\n".format(values_type))
     f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
     f.write(
@@ -383,34 +386,44 @@ def est_net_savings(s, occ):
     f.write("#define maxlen {}\n".format(maxlen))
     f.write("#define translation_offstart {}\n".format(offstart))
     f.write("#define translation_offset {}\n".format(offset))
-
-    return EncodingTable(values, lengths, words, canonical, extractor, apply_offset, remove_offset)
+    f.write("#define translation_qstr_bits {}\n".format(translation_qstr_bits))
+
+    qstrs_inv = dict((v, k) for k, v in qstrs.items())
+    return EncodingTable(
+        values,
+        lengths,
+        words,
+        canonical,
+        extractor,
+        apply_offset,
+        remove_offset,
+        translation_qstr_bits,
+        qstrs,
+        qstrs_inv,
+    )
 
 
 def decompress(encoding_table, encoded, encoded_length_bits):
+    qstrs_inv = encoding_table.qstrs_inv
     values = encoding_table.values
     lengths = encoding_table.lengths
     words = encoding_table.words
 
+    def bititer():
+        for byte in encoded:
+            for bit in (0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1):
+                yield bool(byte & bit)
+
+    nextbit = bititer().__next__
+
+    def getnbits(n):
+        bits = 0
+        for i in range(n):
+            bits = (bits << 1) | nextbit()
+        return bits
+
     dec = []
-    this_byte = 0
-    this_bit = 7
-    b = encoded[this_byte]
-    bits = 0
-    for i in range(encoded_length_bits):
-        bits <<= 1
-        if 0x80 & b:
-            bits |= 1
-
-        b <<= 1
-        if this_bit == 0:
-            this_bit = 7
-            this_byte += 1
-            if this_byte < len(encoded):
-                b = encoded[this_byte]
-        else:
-            this_bit -= 1
-    length = bits
+    length = getnbits(encoded_length_bits)
 
     i = 0
     while i < length:
@@ -419,27 +432,19 @@ def decompress(encoding_table, encoded, encoded_length_bits):
         max_code = lengths[0]
         searched_length = lengths[0]
         while True:
-            bits <<= 1
-            if 0x80 & b:
-                bits |= 1
-
-            b <<= 1
+            bits = (bits << 1) | nextbit()
             bit_length += 1
-            if this_bit == 0:
-                this_bit = 7
-                this_byte += 1
-                if this_byte < len(encoded):
-                    b = encoded[this_byte]
-            else:
-                this_bit -= 1
             if max_code > 0 and bits < max_code:
                 # print('{0:0{width}b}'.format(bits, width=bit_length))
                 break
             max_code = (max_code << 1) + lengths[bit_length]
             searched_length += lengths[bit_length]
 
         v = values[searched_length + bits - max_code]
-        if v >= chr(0x80) and v < chr(0x80 + len(words)):
+        if v == chr(1):
+            qstr_idx = getnbits(encoding_table.translation_qstr_bits)
+            v = qstrs_inv[qstr_idx]
+        elif v >= chr(0x80) and v < chr(0x80 + len(words)):
             v = words[ord(v) - 0x80]
         i += len(v.encode("utf-8"))
         dec.append(v)
@@ -449,36 +454,37 @@ def decompress(encoding_table, encoded, encoded_length_bits):
 def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
     if not isinstance(decompressed, str):
         raise TypeError()
+    qstrs = encoding_table.qstrs
     canonical = encoding_table.canonical
     extractor = encoding_table.extractor
 
-    enc = bytearray(len(decompressed) * 3)
-    current_bit = 7
-    current_byte = 0
-
-    bits = encoded_length_bits + 1
-    for i in range(bits - 1, 0, -1):
-        if len_translation_encoded & (1 << (i - 1)):
-            enc[current_byte] |= 1 << current_bit
-        if current_bit == 0:
-            current_bit = 7
-            current_byte += 1
-        else:
-            current_bit -= 1
+    enc = 1
+
+    def put_bit(enc, b):
+        return (enc << 1) | bool(b)
+
+    def put_bits(enc, b, n):
+        for i in range(n - 1, -1, -1):
+            enc = put_bit(enc, b & (1 << i))
+        return enc
+
+    enc = put_bits(enc, len_translation_encoded, encoded_length_bits)
 
     for atom in extractor.iter(decompressed):
-        for b in canonical[atom]:
-            if b == "1":
-                enc[current_byte] |= 1 << current_bit
-            if current_bit == 0:
-                current_bit = 7
-                current_byte += 1
-            else:
-                current_bit -= 1
+        if atom in qstrs:
+            can = canonical["\1"]
+        else:
+            can = canonical[atom]
+        for b in can:
+            enc = put_bit(enc, b == "1")
+        if atom in qstrs:
+            enc = put_bits(enc, qstrs[atom], encoding_table.translation_qstr_bits)
+
+    while enc.bit_length() % 8 != 1:
+        enc = put_bit(enc, 0)
 
-    if current_bit != 7:
-        current_byte += 1
-    return enc[:current_byte]
+    r = enc.to_bytes((enc.bit_length() + 7) // 8, "big")
+    return r[1:]
 
 
 def qstr_escape(qst):
@@ -493,10 +499,20 @@ def esc_char(m):
     return re.sub(r"[^A-Za-z0-9_]", esc_char, qst)
 
 
+def parse_qstrs(infile):
+    r = {}
+    rx = re.compile(r'QDEF\([A-Za-z0-9_]+,\s*\d+,\s*\d+,\s*(?P<cstr>"(?:[^"\\\\]*|\\.)")\)')
+    content = infile.read()
+    for i, mat in enumerate(rx.findall(content, re.M)):
+        mat = eval(mat)
+        r[mat] = i
+    return r
+
+
 def parse_input_headers(infiles):
     i18ns = set()
 
-    # read the qstrs in from the input files
+    # read the TRANSLATE strings in from the input files
     for infile in infiles:
         with open(infile, "rt") as f:
             for line in f:
@@ -516,12 +532,12 @@ def escape_bytes(qstr):
         return qstr
     else:
         # qstr contains non-printable codes so render entire thing as hex pairs
-        qbytes = bytes_cons(qstr, "utf8")
+        qbytes = bytes(qstr, "utf8")
         return "".join(("\\x%02x" % b) for b in qbytes)
 
 
 def make_bytes(cfg_bytes_len, cfg_bytes_hash, qstr):
-    qbytes = bytes_cons(qstr, "utf8")
+    qbytes = bytes(qstr, "utf8")
     qlen = len(qbytes)
     qhash = compute_hash(qbytes, cfg_bytes_hash)
     if qlen >= (1 << (8 * cfg_bytes_len)):
@@ -551,7 +567,7 @@ def output_translation_data(encoding_table, i18ns, out):
         )
         total_text_compressed_size += len(compressed)
         decompressed = decompress(encoding_table, compressed, encoded_length_bits)
-        assert decompressed == translation
+        assert decompressed == translation, (decompressed, translation)
         for c in C_ESCAPES:
             decompressed = decompressed.replace(c, C_ESCAPES[c])
         formatted = ["{:d}".format(x) for x in compressed]
@@ -572,7 +588,7 @@ def output_translation_data(encoding_table, i18ns, out):
     import argparse
 
     parser = argparse.ArgumentParser(
-        description="Process QSTR definitions into headers for compilation"
+        description="Process TRANSLATE strings into headers for compilation"
     )
     parser.add_argument(
         "infiles", metavar="N", type=str, nargs="+", help="an integer for the accumulator"
@@ -590,13 +606,19 @@ def output_translation_data(encoding_table, i18ns, out):
         type=argparse.FileType("w", encoding="UTF-8"),
         help="c file for translation data",
     )
+    parser.add_argument(
+        "--qstrdefs_filename",
+        type=argparse.FileType("r", encoding="UTF-8"),
+        help="",
+    )
 
     args = parser.parse_args()
 
+    qstrs = parse_qstrs(args.qstrdefs_filename)
     i18ns = parse_input_headers(args.infiles)
     i18ns = sorted(i18ns)
     translations = translate(args.translation, i18ns)
     encoding_table = compute_huffman_coding(
-        args.translation, translations, args.compression_filename
+        qstrs, args.translation, translations, args.compression_filename
     )
     output_translation_data(encoding_table, translations, args.translation_filename)
diff --git a/py/py.mk b/py/py.mk
@@ -270,7 +270,7 @@ $(PY_BUILD)/translations-$(TRANSLATION).c: $(HEADER_BUILD)/compressed_translatio
 $(HEADER_BUILD)/compressed_translations.generated.h: $(PY_SRC)/maketranslationdata.py $(HEADER_BUILD)/$(TRANSLATION).mo $(HEADER_BUILD)/qstrdefs.generated.h
 	$(STEPECHO) "GEN $@"
 	$(Q)mkdir -p $(PY_BUILD)
-	$(Q)$(PYTHON) $(PY_SRC)/maketranslationdata.py --compression_filename $(HEADER_BUILD)/compressed_translations.generated.h --translation $(HEADER_BUILD)/$(TRANSLATION).mo --translation_filename $(PY_BUILD)/translations-$(TRANSLATION).c $(HEADER_BUILD)/qstrdefs.preprocessed.h
+	$(Q)$(PYTHON) $(PY_SRC)/maketranslationdata.py --compression_filename $(HEADER_BUILD)/compressed_translations.generated.h --translation $(HEADER_BUILD)/$(TRANSLATION).mo --translation_filename $(PY_BUILD)/translations-$(TRANSLATION).c --qstrdefs_filename  $(HEADER_BUILD)/qstrdefs.generated.h $(HEADER_BUILD)/qstrdefs.preprocessed.h
 
 PY_CORE_O += $(PY_BUILD)/translations-$(TRANSLATION).o
 
diff --git a/supervisor/shared/translate/compressed_string.h b/supervisor/shared/translate/compressed_string.h
@@ -53,13 +53,28 @@
 //   speaking, words.  They're just spans of code points that frequently
 //   occur together.  They are ordered shortest to longest.
 //
+// - If the translation uses a lot of code points or widely spaced code points,
+//   then the huffman table entries are UTF-16 code points. But if the translation
+//   uses only ASCII 7-bit code points plus a SMALL range of higher code points that
+//   still fit in 8 bits, translation_offset and translation_offstart are used to
+//   renumber the code points so that they still fit within 8 bits. (it's very beneficial
+//   for mchar_t to be 8 bits instead of 16!)
+//
 // - dictionary entries are non-overlapping, and the _ending_ index of each
 //   entry is stored in an array.  A count of words of each length, from
 //   minlen to maxlen, is given in the array called wlencount.  From
 //   this small array, the start and end of the N'th word can be
 //   calculated by an efficient, small loop.  (A bit of time is traded
 //   to reduce the size of this table indicating lengths)
 //
+// - Value 1 ('\1') is used to indicate that a QSTR number follows. The
+//   QSTR is encoded as a fixed number of bits (translation_qstr_bits), e.g.,
+//   10 bits if the highest qstr number used is from 512 to 1023 inclusive.
+//   (maketranslationdata uses a simple heuristic where any qstr more than 3
+//   characters long is encoded in this way; this is simple but probably not
+//   optimal. In fact, a threshold of more than 2 characters is better for SOME
+//   languages on SOME boards.)
+//
 // The "data" / "tail" construct is so that the struct's last member is a
 // "flexible array".  However, the _only_ member is not permitted to be
 // a flexible member, so we have to declare the first byte as a separate
diff --git a/supervisor/shared/translate/translate.c b/supervisor/shared/translate/translate.c