Skip to content

Commit cf1c9b7

Browse files
authored
feat(tests): add leakcheck harness for sqlglot[c] (#7533)
Detects refcount leaks in the mypyc-compiled extension by running the unit test suite N times against the compiled build and flagging any type whose gc.get_objects count grows monotonically with positive slope. Also reports sys.gettotalrefcount drift (debug Python only) and RSS drift as coarse backstops. Shares the exact same binary footprint as `make testc`: depends on install-devc, imports sqlglot (which resolves to .so ahead of .py), and runs the same TestLoader/TextTestRunner path as `python -m unittest`. The one intentional difference is in-process iteration — required to observe refcount growth across runs, which a subprocess runner would hide by resetting the heap each time. Caught the property-getter leak fixed upstream in sqlglot-mypy v1.20.0.post2: compiled build flagged linear +42/iter Identifier growth on roundtrip workloads while pure-Python stayed flat. Usage: `make leakcheck`; `python -m tests.leakcheck --pattern test_snowflake.py`; `python -m tests.leakcheck --warmup 2 --samples 5 --verbose`.
1 parent daca12a commit cf1c9b7

2 files changed

Lines changed: 369 additions & 1 deletion

File tree

Makefile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.PHONY: install install-dev install-devc install-devc-release install-pre-commit bench bench-parse bench-transpile bench-optimize test test-fast unit testc unitc style check docs docs-serve hidec showc clean resolve-integration-conflicts update-fixtures
1+
.PHONY: install install-dev install-devc install-devc-release install-pre-commit bench bench-parse bench-transpile bench-optimize test test-fast unit testc unitc leakcheck style check docs docs-serve hidec showc clean resolve-integration-conflicts update-fixtures
22

33
ifdef UV
44
PIP := uv pip
@@ -77,6 +77,9 @@ testc: install-devc
7777
unitc: install-devc
7878
SKIP_INTEGRATION=1 python -m unittest
7979

80+
# In-process refleak harness; same compiled-build footprint as `make testc`
# (depends on install-devc so the mypyc extension is what gets imported).
leakcheck: install-devc
	python -m tests.leakcheck
82+
8083
style:
8184
pre-commit run --all-files
8285
@if [ -f sqlglot-integration-tests/Makefile ]; then $(MAKE) -C sqlglot-integration-tests check-submodule; fi

tests/leakcheck.py

Lines changed: 365 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,365 @@
1+
"""Refleak detector for sqlglot[c].
2+
3+
Runs the unit-test suite N times against the compiled build, snapshotting
4+
``gc.get_objects()`` type counts between runs and flagging types whose count
5+
grows monotonically with positive slope. Also reports ``sys.gettotalrefcount``
6+
drift (debug Python only) and RSS drift as coarse backstops.
7+
8+
Why three signals, in order of strength:
9+
10+
1. **Per-type object-count growth**. Works on every Python build. Catches
11+
leaks of gc-tracked objects — the class of leak mypyc refcount bugs
12+
typically produce, because mypyc-generated classes are proper PyObject
13+
types registered with the gc.
14+
2. **``sys.gettotalrefcount`` drift**. Debug Python only (``--with-pydebug``).
15+
Catches leaks of non-gc-tracked objects (small ints, short strings,
16+
interned tuples) that signal #1 misses.
17+
3. **RSS drift**. OS-level backstop for raw-malloc leaks that never show up
18+
in CPython bookkeeping at all (e.g. unreleased arenas, third-party C
19+
libraries).
20+
21+
The workload is the real unit test suite — same as ``make unitc``. Tests are
22+
loaded once and re-run in-process each iteration; running them out-of-process
23+
would reset the heap and hide the growth we're trying to see.
24+
25+
Usage:
26+
python -m tests.leakcheck # discover tests/ pattern test_*.py
27+
python -m tests.leakcheck --top tests/dialects
28+
python -m tests.leakcheck --pattern test_snowflake.py
29+
python -m tests.leakcheck --warmup 2 --samples 5 --verbose
30+
"""
31+
32+
from __future__ import annotations
33+
34+
import argparse
35+
import gc
36+
import os
37+
import resource
38+
import sys
39+
import unittest
40+
from collections import Counter
41+
42+
43+
# Interpreter/unittest bookkeeping types whose live counts swing for reasons
# unrelated to the workload (stack frames come and go, tracebacks are created
# lazily, iterators are ephemeral). Filtering them keeps the "Growing types"
# report focused on domain-meaningful types such as Expression subclasses.
#
# The unittest-machinery entries: each TextTestRunner.run() call allocates a
# fresh result/runner/stream wrapper, and subtest bookkeeping creates new
# _SubTest/_OrderedChainMap objects per test. Those counts grow linearly with
# iteration count purely because we replay the suite, not because sqlglot
# leaks.
_NOISY = frozenset(
    {
        "frame",
        "traceback",
        "cell",
        "function",
        "method",
        "builtin_function_or_method",
        "method_descriptor",
        "wrapper_descriptor",
        "member_descriptor",
        "getset_descriptor",
        "classmethod_descriptor",
        "list_iterator",
        "tuple_iterator",
        "dict_keyiterator",
        "dict_valueiterator",
        "dict_itemiterator",
        "set_iterator",
        # unittest internals re-allocated per run
        "TestResult",
        "TextTestResult",
        "TextTestRunner",
        "_WritelnDecorator",
        "_SubTest",
        "_OrderedChainMap",
        "_GeneratorContextManager",
        "_Outcome",
        "generator",
        "ReferenceType",
    }
)

# Structural containers are load-bearing but also grow legitimately from lazy
# caches (dialect registries, normalized-name caches, etc. keep filling in).
# They get a higher delta threshold so first-run cache warmup doesn't flip the
# verdict; a real leak still wins because its slope is linear and unbounded.
_STRUCTURAL = frozenset({"dict", "list", "tuple", "set", "frozenset"})


# Opened once at import time — not per iteration — to avoid FD churn and to
# keep the stream object itself out of the per-sample object counts.
# Deliberately never closed; it lives for the whole process.
_NULL_STREAM = open(os.devnull, "w")
96+
97+
98+
def _maxrss_kb() -> int:
99+
"""Return current resident-set size in KB, normalized across OSes.
100+
101+
``resource.ru_maxrss`` is specified in KB on Linux but in bytes on macOS
102+
(and BSD). Normalizing here lets the RSS-drift threshold be a single
103+
value regardless of platform.
104+
"""
105+
raw = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
106+
return raw // 1024 if sys.platform == "darwin" else raw
107+
108+
109+
def _snapshot():
    """Capture the live heap: per-type gc counts, total refcount, peak RSS.

    Returns ``(counts, totalrefcount, maxrss_kb)``. ``counts`` maps type
    name -> number of gc-tracked instances alive right now;
    ``totalrefcount`` is ``None`` on release builds, where
    ``sys.gettotalrefcount`` does not exist (it is a ``--with-pydebug``
    feature only).

    Collection runs twice because a single pass can leave dead objects
    behind in two cases:

    * a weakref callback fired during the pass drops the last strong
      reference to an object, which only the *next* pass can free;
    * objects with ``__del__`` finalizers (PEP 442) may be reclaimed only
      in a subsequent cycle.

    One pass would therefore include logically-dead-but-unfreed objects,
    and that residue fluctuates between iterations — non-monotonic noise
    that masks the linear leak signal. A second pass settles the heap so
    sample-to-sample deltas reflect real retention.
    """
    gc.collect()
    gc.collect()
    per_type: Counter[str] = Counter()
    for tracked in gc.get_objects():
        per_type[type(tracked).__name__] += 1
    # Probe rather than call directly: absent on stock CPython builds.
    probe = getattr(sys, "gettotalrefcount", None)
    total_rc = probe() if probe is not None else None
    return dict(per_type), total_rc, _maxrss_kb()
142+
143+
144+
def _slope(values):
145+
"""Least-squares slope of ``values`` against their index.
146+
147+
Used as the primary leak-signal metric rather than ``values[-1] -
148+
values[0]`` because:
149+
150+
* A per-iteration-constant leak produces a straight line (``+k, +k,
151+
+k, ...``); least-squares slope recovers ``k`` exactly.
152+
* Averaging across all samples is robust to a single noisy outlier
153+
that a first-vs-last delta would overweight.
154+
* A plateauing cache (``+k, +k, 0, 0, 0, ...``) produces a lower
155+
slope than a sustained leak of the same magnitude, so the
156+
threshold-based check naturally prefers the latter.
157+
"""
158+
n = len(values)
159+
mean_x = (n - 1) / 2
160+
mean_y = sum(values) / n
161+
num = sum((i - mean_x) * (v - mean_y) for i, v in enumerate(values))
162+
den = sum((i - mean_x) ** 2 for i in range(n))
163+
return num / den if den else 0.0
164+
165+
166+
def check_leaks(
    workload,
    *,
    warmup=1,
    samples=3,
    slope_min=0.5,
    # Fixture noise floor: full-suite runs register a small amount of
    # module-level state per iteration (~3 allocations observed in a clean
    # build); 10 absorbs that while real leaks land far above it.
    delta_min=10,
    # dict/list/tuple/set/frozenset are allocated by the thousands per suite
    # run for legitimate reasons (TestResult fields, collected-test lists),
    # so they get a much stricter bar. Real leaks dwarf it — the post1
    # property-getter bug drove dict growth ~750k over 3 samples.
    delta_min_structural=10000,
    # ru_maxrss is a lifetime peak, never decreasing, and varies wildly
    # across replays (20 MB one run, 470 MB the next) from arena
    # fragmentation and per-test working sets — treat RSS purely as a
    # backstop for pathological growth, not a precise gate.
    rss_threshold_kb=1_000_000,
    refcount_threshold=2000,
):
    """Replay ``workload`` repeatedly and return heap-growth findings.

    Execution model: first ``warmup`` un-sampled runs let lazy module
    state converge (dialect registries, tokenizer tries, normalized-name
    caches) so cache population isn't mistaken for leak growth; then
    ``samples`` more runs, snapshotting the heap before the first and
    after each one.

    A type is flagged only when **all three** hold:

    1. its count never decreases across snapshots — caches sometimes
       shrink briefly under gc, real leaks never do;
    2. its least-squares slope is at least ``slope_min``, the fingerprint
       of a fixed-size per-call leak;
    3. its first-to-last delta clears ``delta_min`` (or
       ``delta_min_structural`` for the builtin container types).

    Also computed: ``rc_delta`` (``sys.gettotalrefcount`` drift, ``None``
    on release builds) and ``rss_delta`` (peak-RSS drift in KB).

    Returns ``(growing, rc_delta, rss_delta_kb, reasons)`` where
    ``growing`` is ``[(type_name, slope, delta, series), ...]`` sorted by
    slope descending and ``reasons`` is a list of short verdict strings —
    empty means clean.
    """
    for _ in range(warmup):
        workload()

    snapshots = [_snapshot()]
    for _ in range(samples):
        workload()
        snapshots.append(_snapshot())

    # A lazily-imported type can first appear mid-run, so the candidate set
    # is the union over every snapshot, not just the first.
    seen: set[str] = set()
    for type_counts, _, _ in snapshots:
        seen.update(type_counts)

    growing = []
    for type_name in sorted(seen - _NOISY):
        series = [counts.get(type_name, 0) for counts, _, _ in snapshots]
        if any(later < earlier for earlier, later in zip(series, series[1:])):
            continue  # any dip ⇒ churn/noise, not a leak
        rate = _slope(series)
        total = series[-1] - series[0]
        floor = delta_min_structural if type_name in _STRUCTURAL else delta_min
        if rate >= slope_min and total >= floor:
            growing.append((type_name, rate, total, series))
    # Steepest first so a truncated report leads with the loudest offenders.
    growing.sort(key=lambda entry: entry[1], reverse=True)

    first_rc, last_rc = snapshots[0][1], snapshots[-1][1]
    rc_delta = None if first_rc is None else last_rc - first_rc
    rss_delta = snapshots[-1][2] - snapshots[0][2]

    reasons = []
    if growing:
        reasons.append(f"{len(growing)} type(s) growing")
    if rc_delta is not None and rc_delta >= refcount_threshold:
        reasons.append(f"totalrefcount +{rc_delta}")
    if rss_delta >= rss_threshold_kb:
        reasons.append(f"RSS +{rss_delta} KB")

    return growing, rc_delta, rss_delta, reasons
272+
273+
274+
def _print_report(name, growing, rc_delta, rss_delta, reasons, *, verbose=False):
275+
"""Format the tuple returned by ``check_leaks`` to stdout.
276+
277+
Default truncates at 15 growing types; ``verbose=True`` prints all of
278+
them. Each row shows the full per-iteration count trail so the reader
279+
can eyeball monotonicity without re-running.
280+
"""
281+
status = "LEAK" if reasons else " OK "
282+
header = f"[{status}] {name}"
283+
if reasons:
284+
header += " — " + "; ".join(reasons)
285+
print(header)
286+
if growing:
287+
print(" Growing object types:")
288+
shown = growing if verbose else growing[:15]
289+
for tname, slope, delta, vals in shown:
290+
trail = " ".join(str(v) for v in vals)
291+
print(f" {tname:<36} slope={slope:+6.2f}/iter delta={delta:+d} [{trail}]")
292+
if not verbose and len(growing) > len(shown):
293+
print(f" ... and {len(growing) - len(shown)} more")
294+
if rc_delta is not None:
295+
print(f" sys.gettotalrefcount delta: {rc_delta:+d}")
296+
print(f" maxrss delta: {rss_delta:+d} KB")
297+
298+
299+
def _make_workload(top, pattern):
    """Build a zero-arg callable that replays the discovered test suite.

    Discovery happens exactly once: re-discovering each iteration would
    mint fresh TestCase ``type`` objects and show up as false-positive
    growth on the ``type`` row.

    ``unittest.BaseTestSuite._cleanup`` defaults to ``True``, which nulls
    each test slot out after it runs (a memory optimisation for one-shot
    suites) — the second replay would then execute zero tests. It is
    disabled class-wide so the same suite object survives N replays.
    """
    unittest.BaseTestSuite._cleanup = False
    tests = unittest.TestLoader().discover(top, pattern=pattern)

    def replay():
        runner = unittest.TextTestRunner(stream=_NULL_STREAM, verbosity=0)
        runner.run(tests)

    return replay
320+
321+
322+
def main():
    """Command-line entry point; returns 1 when a leak was flagged, else 0."""
    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    parser.add_argument("--top", default="tests", help="Test discovery root (default: tests)")
    parser.add_argument("--pattern", default="test_*.py", help="Test file glob (default: test_*.py)")
    parser.add_argument("--warmup", type=int, default=1)
    parser.add_argument("--samples", type=int, default=3)
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()

    # The harness replays the suite many times, so flaky/slow external-DB
    # integration tests are excluded by default. ``setdefault`` (rather than
    # assignment) means a caller who pre-exports SKIP_INTEGRATION=0 still
    # opts in explicitly.
    os.environ.setdefault("SKIP_INTEGRATION", "1")

    # Probe one mypyc-compiled module's __file__ to learn whether the import
    # machinery resolved the compiled extension — the .so wins over the
    # co-located .py when both exist.
    import sqlglot.expressions.core as _ec

    is_compiled = _ec.__file__.endswith(".so")
    print(f"build: {'sqlglotc (compiled)' if is_compiled else 'pure python'}")
    print(f"debug Python: {'yes' if hasattr(sys, 'gettotalrefcount') else 'no'}")
    print(f"workload: unittest discover {args.top} pattern {args.pattern}")
    print(f"warmup={args.warmup} samples={args.samples}")
    print()

    workload = _make_workload(args.top, args.pattern)
    growing, rc_delta, rss_delta, reasons = check_leaks(
        workload, warmup=args.warmup, samples=args.samples
    )
    _print_report(
        f"{args.top} ({args.pattern})",
        growing,
        rc_delta,
        rss_delta,
        reasons,
        verbose=args.verbose,
    )
    return 1 if reasons else 0
362+
363+
364+
if __name__ == "__main__":
    # sys.exit raises SystemExit, propagating main()'s 0/1 verdict as the
    # process exit code — identical to ``raise SystemExit(main())``.
    sys.exit(main())

0 commit comments

Comments
 (0)