Skip to content

Commit 63f8dc6

Browse files
authored
perf(tokens): cache TokenizerCore per thread (#7547)
Rebuilding TokenizerCore on every `parse_one` call was ~6µs of wasted work; the core's construction is purely a function of the Tokenizer subclass, so caching it per (thread, class) is safe. Also drops two `list[T](...)` subscripted-generic constructions that were pure type-annotation theatre, and narrows `bit_strings` / `hex_strings` to `has_bit_strings` / `has_hex_strings` bools since TokenizerCore only truthy-checks them. ThreadLocalCache lives in tokens.py (not sqlglotc-compiled). Subclassing threading.local inside a mypyc-compiled module causes a segfault because mypyc's fixed-slot attribute access bypasses threading.local's per-thread __dict__ swap, racing all threads on the same C slot.
1 parent 5351ca1 commit 63f8dc6

2 files changed

Lines changed: 32 additions & 14 deletions

File tree

sqlglot/tokenizer_core.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -558,8 +558,8 @@ class TokenizerCore:
558558
"nested_comments",
559559
"hint_start",
560560
"tokens_preceding_hint",
561-
"bit_strings",
562-
"hex_strings",
561+
"has_bit_strings",
562+
"has_hex_strings",
563563
"numeric_literals",
564564
"var_single_tokens",
565565
"string_escapes_allowed_in_raw_strings",
@@ -589,8 +589,8 @@ def __init__(
589589
nested_comments: bool,
590590
hint_start: str,
591591
tokens_preceding_hint: set[TokenType],
592-
bit_strings: list[str | tuple[str, str]],
593-
hex_strings: list[str | tuple[str, str]],
592+
has_bit_strings: bool,
593+
has_hex_strings: bool,
594594
numeric_literals: dict[str, str],
595595
var_single_tokens: set[str],
596596
string_escapes_allowed_in_raw_strings: bool,
@@ -617,8 +617,8 @@ def __init__(
617617
self.nested_comments = nested_comments
618618
self.hint_start = hint_start
619619
self.tokens_preceding_hint = tokens_preceding_hint
620-
self.bit_strings = bit_strings
621-
self.hex_strings = hex_strings
620+
self.has_bit_strings = has_bit_strings
621+
self.has_hex_strings = has_hex_strings
622622
self.numeric_literals = numeric_literals
623623
self.var_single_tokens = var_single_tokens
624624
self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
@@ -917,9 +917,9 @@ def _scan_number(self) -> None:
917917
if self._char == "0":
918918
peek = _CHAR_UPPER.get(self._peek, self._peek)
919919
if peek == "B":
920-
return self._scan_bits() if self.bit_strings else self._add(TokenType.NUMBER)
920+
return self._scan_bits() if self.has_bit_strings else self._add(TokenType.NUMBER)
921921
elif peek == "X":
922-
return self._scan_hex() if self.hex_strings else self._add(TokenType.NUMBER)
922+
return self._scan_hex() if self.has_hex_strings else self._add(TokenType.NUMBER)
923923

924924
decimal = False
925925
scientific = 0

sqlglot/tokens.py

Lines changed: 24 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,26 @@
11
from __future__ import annotations
22

3+
import threading
34
import typing as t
45

56
from sqlglot.trie import new_trie
67

7-
# Import Token and TokenType from tokenizer_core (compiled with mypyc)
8-
from sqlglot.tokenizer_core import Token, TokenType
8+
from sqlglot.tokenizer_core import Token, TokenizerCore, TokenType
9+
10+
T = t.TypeVar("T")
11+
12+
13+
class ThreadLocalCache(threading.local):
14+
"""Per-thread cache. Each thread sees its own dict; safe for caching stateful objects."""
15+
16+
def __init__(self) -> None:
17+
self.cache: dict[type, t.Any] = {}
18+
19+
def get_or_build(self, key: type, build: t.Callable[[], T]) -> T:
20+
if not (obj := self.cache.get(key)):
21+
self.cache[key] = obj = build()
22+
return obj
23+
924

1025
try:
1126
import sqlglotc # noqa: F401
@@ -532,18 +547,21 @@ class Tokenizer(_TokenizerBase):
532547

533548
COMMENTS = ["--", ("/*", "*/")]
534549

550+
_core_cache: t.ClassVar[ThreadLocalCache] = ThreadLocalCache()
551+
535552
__slots__ = (
536553
"dialect",
537554
"_core",
538555
)
539556

540557
def __init__(self, dialect: DialectType = None) -> None:
541558
from sqlglot.dialects.dialect import Dialect
542-
from sqlglot.tokenizer_core import TokenizerCore as _TokenizerCore
543559

544560
self.dialect = Dialect.get_or_raise(dialect)
561+
self._core = self._core_cache.get_or_build(type(self), self._init_core)
545562

546-
self._core = _TokenizerCore(
563+
def _init_core(self) -> TokenizerCore:
564+
return TokenizerCore(
547565
single_tokens=self.SINGLE_TOKENS,
548566
keywords=self.KEYWORDS,
549567
quotes=self._QUOTES,
@@ -559,8 +577,8 @@ def __init__(self, dialect: DialectType = None) -> None:
559577
nested_comments=self.NESTED_COMMENTS,
560578
hint_start=self.HINT_START,
561579
tokens_preceding_hint=self.TOKENS_PRECEDING_HINT,
562-
bit_strings=list[t.Union[str, tuple[str, str]]](self.BIT_STRINGS),
563-
hex_strings=list[t.Union[str, tuple[str, str]]](self.HEX_STRINGS),
580+
has_bit_strings=bool(self.BIT_STRINGS),
581+
has_hex_strings=bool(self.HEX_STRINGS),
564582
numeric_literals=self.NUMERIC_LITERALS,
565583
var_single_tokens=self.VAR_SINGLE_TOKENS,
566584
string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS,

0 commit comments

Comments (0)