Skip to content

Commit 63f8dc6

Browse files
authored
perf(tokens): cache TokenizerCore per thread (#7547)
Rebuilding TokenizerCore on every `parse_one` call was ~6µs of wasted work; the core's construction is purely a function of the Tokenizer subclass, so caching it per (thread, class) is safe. Also drops two `list[T](...)` subscripted-generic constructions that were pure type-annotation theatre, and narrows `bit_strings` / `hex_strings` to `has_bit_strings` / `has_hex_strings` bools since TokenizerCore only truthy-checks them. ThreadLocalCache lives in tokens.py (not sqlglotc-compiled). Subclassing threading.local inside a mypyc-compiled module causes a segfault because mypyc's fixed-slot attribute access bypasses threading.local's per-thread __dict__ swap, racing all threads on the same C slot.
1 parent 5351ca1 commit 63f8dc6

2 files changed

Lines changed: 32 additions & 14 deletions

File tree

sqlglot/tokenizer_core.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -558,8 +558,8 @@ class TokenizerCore:
558558
"nested_comments",
559559
"hint_start",
560560
"tokens_preceding_hint",
561-
"bit_strings",
562-
"hex_strings",
561+
"has_bit_strings",
562+
"has_hex_strings",
563563
"numeric_literals",
564564
"var_single_tokens",
565565
"string_escapes_allowed_in_raw_strings",
@@ -589,8 +589,8 @@ def __init__(
589589
nested_comments: bool,
590590
hint_start: str,
591591
tokens_preceding_hint: set[TokenType],
592-
bit_strings: list[str | tuple[str, str]],
593-
hex_strings: list[str | tuple[str, str]],
592+
has_bit_strings: bool,
593+
has_hex_strings: bool,
594594
numeric_literals: dict[str, str],
595595
var_single_tokens: set[str],
596596
string_escapes_allowed_in_raw_strings: bool,
@@ -617,8 +617,8 @@ def __init__(
617617
self.nested_comments = nested_comments
618618
self.hint_start = hint_start
619619
self.tokens_preceding_hint = tokens_preceding_hint
620-
self.bit_strings = bit_strings
621-
self.hex_strings = hex_strings
620+
self.has_bit_strings = has_bit_strings
621+
self.has_hex_strings = has_hex_strings
622622
self.numeric_literals = numeric_literals
623623
self.var_single_tokens = var_single_tokens
624624
self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
@@ -917,9 +917,9 @@ def _scan_number(self) -> None:
917917
if self._char == "0":
918918
peek = _CHAR_UPPER.get(self._peek, self._peek)
919919
if peek == "B":
920-
return self._scan_bits() if self.bit_strings else self._add(TokenType.NUMBER)
920+
return self._scan_bits() if self.has_bit_strings else self._add(TokenType.NUMBER)
921921
elif peek == "X":
922-
return self._scan_hex() if self.hex_strings else self._add(TokenType.NUMBER)
922+
return self._scan_hex() if self.has_hex_strings else self._add(TokenType.NUMBER)
923923

924924
decimal = False
925925
scientific = 0

sqlglot/tokens.py

Lines changed: 24 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,26 @@
11
from __future__ import annotations
22

3+
import threading
34
import typing as t
45

56
from sqlglot.trie import new_trie
67

7-
# Import Token and TokenType from tokenizer_core (compiled with mypyc)
8-
from sqlglot.tokenizer_core import Token, TokenType
8+
from sqlglot.tokenizer_core import Token, TokenizerCore, TokenType
9+
10+
T = t.TypeVar("T")
11+
12+
13+
class ThreadLocalCache(threading.local):
14+
"""Per-thread cache. Each thread sees its own dict; safe for caching stateful objects."""
15+
16+
def __init__(self) -> None:
17+
self.cache: dict[type, t.Any] = {}
18+
19+
def get_or_build(self, key: type, build: t.Callable[[], T]) -> T:
20+
if not (obj := self.cache.get(key)):
21+
self.cache[key] = obj = build()
22+
return obj
23+
924

1025
try:
1126
import sqlglotc # noqa: F401
@@ -532,18 +547,21 @@ class Tokenizer(_TokenizerBase):
532547

533548
COMMENTS = ["--", ("/*", "*/")]
534549

550+
_core_cache: t.ClassVar[ThreadLocalCache] = ThreadLocalCache()
551+
535552
__slots__ = (
536553
"dialect",
537554
"_core",
538555
)
539556

540557
def __init__(self, dialect: DialectType = None) -> None:
541558
from sqlglot.dialects.dialect import Dialect
542-
from sqlglot.tokenizer_core import TokenizerCore as _TokenizerCore
543559

544560
self.dialect = Dialect.get_or_raise(dialect)
561+
self._core = self._core_cache.get_or_build(type(self), self._init_core)
545562

546-
self._core = _TokenizerCore(
563+
def _init_core(self) -> TokenizerCore:
564+
return TokenizerCore(
547565
single_tokens=self.SINGLE_TOKENS,
548566
keywords=self.KEYWORDS,
549567
quotes=self._QUOTES,
@@ -559,8 +577,8 @@ def __init__(self, dialect: DialectType = None) -> None:
559577
nested_comments=self.NESTED_COMMENTS,
560578
hint_start=self.HINT_START,
561579
tokens_preceding_hint=self.TOKENS_PRECEDING_HINT,
562-
bit_strings=list[t.Union[str, tuple[str, str]]](self.BIT_STRINGS),
563-
hex_strings=list[t.Union[str, tuple[str, str]]](self.HEX_STRINGS),
580+
has_bit_strings=bool(self.BIT_STRINGS),
581+
has_hex_strings=bool(self.HEX_STRINGS),
564582
numeric_literals=self.NUMERIC_LITERALS,
565583
var_single_tokens=self.VAR_SINGLE_TOKENS,
566584
string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS,

0 commit comments

Comments (0)