|
| 1 | +"""Refleak detector for sqlglot[c]. |
| 2 | +
|
| 3 | +Runs the unit-test suite N times against the compiled build, snapshotting |
| 4 | +``gc.get_objects()`` type counts between runs and flagging types whose count |
| 5 | +grows monotonically with positive slope. Also reports ``sys.gettotalrefcount`` |
| 6 | +drift (debug Python only) and RSS drift as coarse backstops. |
| 7 | +
|
| 8 | +Why three signals, in order of strength: |
| 9 | +
|
| 10 | +1. **Per-type object-count growth**. Works on every Python build. Catches |
| 11 | + leaks of gc-tracked objects — the class of leak mypyc refcount bugs |
| 12 | + typically produce, because mypyc-generated classes are proper PyObject |
| 13 | + types registered with the gc. |
| 14 | +2. **``sys.gettotalrefcount`` drift**. Debug Python only (``--with-pydebug``). |
| 15 | + Catches leaks of non-gc-tracked objects (small ints, short strings, |
| 16 | + interned tuples) that signal #1 misses. |
| 17 | +3. **RSS drift**. OS-level backstop for raw-malloc leaks that never show up |
| 18 | + in CPython bookkeeping at all (e.g. unreleased arenas, third-party C |
| 19 | + libraries). |
| 20 | +
|
| 21 | +The workload is the real unit test suite — same as ``make unitc``. Tests are |
| 22 | +loaded once and re-run in-process each iteration; running them out-of-process |
| 23 | +would reset the heap and hide the growth we're trying to see. |
| 24 | +
|
| 25 | +Usage: |
| 26 | + python -m tests.leakcheck # discover tests/ pattern test_*.py |
| 27 | + python -m tests.leakcheck --top tests/dialects |
| 28 | + python -m tests.leakcheck --pattern test_snowflake.py |
| 29 | + python -m tests.leakcheck --warmup 2 --samples 5 --verbose |
| 30 | +""" |
| 31 | + |
| 32 | +from __future__ import annotations |
| 33 | + |
| 34 | +import argparse |
| 35 | +import gc |
| 36 | +import os |
| 37 | +import resource |
| 38 | +import sys |
| 39 | +import unittest |
| 40 | +from collections import Counter |
| 41 | + |
| 42 | + |
# Built-in bookkeeping types whose counts fluctuate for reasons unrelated to
# the workload (stack frames come and go, tracebacks get created lazily,
# iterators are ephemeral). Ignoring them keeps the "Growing types" report
# focused on domain-meaningful types like Expression subclasses.
#
# unittest-machinery entries here: each TextTestRunner.run() call allocates a
# fresh result/runner/stream wrapper, and the suite's subtest handling creates
# new _SubTest/_OrderedChainMap per test. These grow linearly with iteration
# count purely because we re-run the suite, not because sqlglot leaks.
_NOISY: frozenset[str] = frozenset(
    {
        "frame",
        "traceback",
        "cell",
        "function",
        "method",
        "builtin_function_or_method",
        "method_descriptor",
        "wrapper_descriptor",
        "member_descriptor",
        "getset_descriptor",
        "classmethod_descriptor",
        "list_iterator",
        "tuple_iterator",
        "dict_keyiterator",
        "dict_valueiterator",
        "dict_itemiterator",
        "set_iterator",
        # unittest internals re-allocated per run
        "TestResult",
        "TextTestResult",
        "TextTestRunner",
        "_WritelnDecorator",
        "_SubTest",
        "_OrderedChainMap",
        "_GeneratorContextManager",
        "_Outcome",
        "generator",
        "ReferenceType",
    }
)
| 84 | + |
# Structural containers are load-bearing but legitimately grow from lazy
# caches (dialect registries, normalized-name caches, etc. keep filling).
# Treat them with a higher delta threshold so first-run cache warmup doesn't
# flip the verdict; a real leak still wins because its slope is linear and
# unbounded.
_STRUCTURAL: frozenset[str] = frozenset({"dict", "list", "tuple", "set", "frozenset"})
| 91 | + |
| 92 | + |
# Opened once at import rather than per-iteration to avoid FD churn and to
# keep it out of the per-sample object count. Deliberately never closed:
# the handle lives for the process lifetime and the OS reclaims it at exit.
_NULL_STREAM = open(os.devnull, "w")
| 96 | + |
| 97 | + |
| 98 | +def _maxrss_kb() -> int: |
| 99 | + """Return current resident-set size in KB, normalized across OSes. |
| 100 | +
|
| 101 | + ``resource.ru_maxrss`` is specified in KB on Linux but in bytes on macOS |
| 102 | + (and BSD). Normalizing here lets the RSS-drift threshold be a single |
| 103 | + value regardless of platform. |
| 104 | + """ |
| 105 | + raw = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss |
| 106 | + return raw // 1024 if sys.platform == "darwin" else raw |
| 107 | + |
| 108 | + |
def _snapshot():
    """Capture the live heap: ``(type_counts, totalrefcount, maxrss_kb)``.

    ``type_counts`` maps type name to the number of gc-tracked instances
    alive right now; ``totalrefcount`` is ``None`` on release builds.

    Two ``gc.collect`` passes are needed before counting. A single pass can
    leave logically-dead objects behind: weakref callbacks firing during
    collection may drop the last strong reference to something that is only
    freed on the next pass, and objects with ``__del__`` finalizers (PEP 442)
    may likewise survive until a subsequent cycle. Counting after one pass
    would fold that fluctuating residue into the series and mask the linear
    leak signal; after two passes the heap is stable enough that measured
    growth reflects real retention.

    ``sys.gettotalrefcount`` only exists on ``--with-pydebug`` builds, so it
    is probed with ``getattr`` and reported as ``None`` when absent rather
    than raising on stock CPython.
    """
    for _ in range(2):
        gc.collect()
    tallies: Counter[str] = Counter(type(obj).__name__ for obj in gc.get_objects())
    total_rc = getattr(sys, "gettotalrefcount", lambda: None)()
    return dict(tallies), total_rc, _maxrss_kb()
| 142 | + |
| 143 | + |
| 144 | +def _slope(values): |
| 145 | + """Least-squares slope of ``values`` against their index. |
| 146 | +
|
| 147 | + Used as the primary leak-signal metric rather than ``values[-1] - |
| 148 | + values[0]`` because: |
| 149 | +
|
| 150 | + * A per-iteration-constant leak produces a straight line (``+k, +k, |
| 151 | + +k, ...``); least-squares slope recovers ``k`` exactly. |
| 152 | + * Averaging across all samples is robust to a single noisy outlier |
| 153 | + that a first-vs-last delta would overweight. |
| 154 | + * A plateauing cache (``+k, +k, 0, 0, 0, ...``) produces a lower |
| 155 | + slope than a sustained leak of the same magnitude, so the |
| 156 | + threshold-based check naturally prefers the latter. |
| 157 | + """ |
| 158 | + n = len(values) |
| 159 | + mean_x = (n - 1) / 2 |
| 160 | + mean_y = sum(values) / n |
| 161 | + num = sum((i - mean_x) * (v - mean_y) for i, v in enumerate(values)) |
| 162 | + den = sum((i - mean_x) ** 2 for i in range(n)) |
| 163 | + return num / den if den else 0.0 |
| 164 | + |
| 165 | + |
def check_leaks(
    workload,
    *,
    warmup=1,
    samples=3,
    slope_min=0.5,
    delta_min=10,
    delta_min_structural=10000,
    rss_threshold_kb=1_000_000,
    refcount_threshold=2000,
):
    """Run ``workload`` repeatedly and return growth findings.

    Execution model: ``warmup`` un-sampled runs first, letting lazy
    module-level state converge (dialect registries, tokenizer tries,
    normalized-name caches) so first-use cache population is not mistaken
    for leak growth; then ``samples`` measured runs, with a heap snapshot
    taken before the first measured run and after each one. The per-type
    count series across those snapshots is what the detection operates on.

    A type is flagged only when all three hold:

    1. Its count never decreases across samples — caches sometimes shrink
       briefly under GC, real leaks never do, so any oscillation rules a
       type out as churn.
    2. Its least-squares slope is at least ``slope_min``; a constant slope
       is the fingerprint of a fixed per-call leak.
    3. Its first-to-last delta is at least ``delta_min``, or
       ``delta_min_structural`` for dict/list/tuple/set/frozenset. The
       structural bar is much higher because the full suite legitimately
       allocates thousands of containers per run (TestResult fields,
       collected-tests lists); a real leak still clears it easily — the
       post1 property-getter bug produced a dict delta of ~750k over 3
       samples. ``delta_min=10`` similarly absorbs the ~3 allocations per
       iteration that fixture registration (e.g. module-level Dialect
       subclasses) contributes in a clean build.

    Backstop signals: ``rc_delta`` is the first-to-last
    ``sys.gettotalrefcount`` drift (``None`` on release builds), gated by
    ``refcount_threshold``; ``rss_delta`` is maxrss growth in KB, gated by
    ``rss_threshold_kb``. The RSS gate is deliberately coarse — ru_maxrss
    is a peak high-water mark that never decreases, and repeated full-suite
    runs show huge variance from arena fragmentation (observed 20 MB one
    run, 470 MB the next), so only pathological growth should trip it.

    Returns ``(growing, rc_delta, rss_delta, reasons)``: ``growing`` is
    ``[(type_name, slope, delta, values), ...]`` sorted steepest-slope
    first; ``reasons`` is a list of short strings, empty when clean.
    """
    for _ in range(warmup):
        workload()

    snapshots = [_snapshot()]
    for _ in range(samples):
        workload()
        snapshots.append(_snapshot())

    # A type can first appear mid-run via a lazy import, so the candidate
    # set is the union over every snapshot, not just the first.
    seen: set[str] = set()
    for per_type, _, _ in snapshots:
        seen.update(per_type)

    growing = []
    for type_name in sorted(seen - _NOISY):
        series = [per_type.get(type_name, 0) for per_type, _, _ in snapshots]
        if any(later < earlier for earlier, later in zip(series, series[1:])):
            continue
        rate = _slope(series)
        total = series[-1] - series[0]
        floor = delta_min_structural if type_name in _STRUCTURAL else delta_min
        if rate >= slope_min and total >= floor:
            growing.append((type_name, rate, total, series))
    # Steepest slopes first so truncated reports keep the most-interesting rows.
    growing.sort(key=lambda row: row[1], reverse=True)

    first_rc = snapshots[0][1]
    last_rc = snapshots[-1][1]
    rc_delta = last_rc - first_rc if first_rc is not None else None
    rss_delta = snapshots[-1][2] - snapshots[0][2]

    reasons = []
    if growing:
        reasons.append(f"{len(growing)} type(s) growing")
    if rc_delta is not None and rc_delta >= refcount_threshold:
        reasons.append(f"totalrefcount +{rc_delta}")
    if rss_delta >= rss_threshold_kb:
        reasons.append(f"RSS +{rss_delta} KB")

    return growing, rc_delta, rss_delta, reasons
| 272 | + |
| 273 | + |
| 274 | +def _print_report(name, growing, rc_delta, rss_delta, reasons, *, verbose=False): |
| 275 | + """Format the tuple returned by ``check_leaks`` to stdout. |
| 276 | +
|
| 277 | + Default truncates at 15 growing types; ``verbose=True`` prints all of |
| 278 | + them. Each row shows the full per-iteration count trail so the reader |
| 279 | + can eyeball monotonicity without re-running. |
| 280 | + """ |
| 281 | + status = "LEAK" if reasons else " OK " |
| 282 | + header = f"[{status}] {name}" |
| 283 | + if reasons: |
| 284 | + header += " — " + "; ".join(reasons) |
| 285 | + print(header) |
| 286 | + if growing: |
| 287 | + print(" Growing object types:") |
| 288 | + shown = growing if verbose else growing[:15] |
| 289 | + for tname, slope, delta, vals in shown: |
| 290 | + trail = " ".join(str(v) for v in vals) |
| 291 | + print(f" {tname:<36} slope={slope:+6.2f}/iter delta={delta:+d} [{trail}]") |
| 292 | + if not verbose and len(growing) > len(shown): |
| 293 | + print(f" ... and {len(growing) - len(shown)} more") |
| 294 | + if rc_delta is not None: |
| 295 | + print(f" sys.gettotalrefcount delta: {rc_delta:+d}") |
| 296 | + print(f" maxrss delta: {rss_delta:+d} KB") |
| 297 | + |
| 298 | + |
def _make_workload(top, pattern):
    """Build a reusable zero-arg callable that re-runs the unit test suite.

    Discovery happens exactly once: re-discovering per iteration would
    re-create every TestCase subclass ``type`` object and show up as false
    growth on the ``type`` row of the report.

    ``unittest.BaseTestSuite._cleanup`` is disabled class-wide first. It
    defaults to ``True``, which makes the suite drop each test reference
    (set its slot to ``None``) after running it — a memory optimisation
    for one-shot runs that would leave every iteration after the first
    replaying an empty suite.
    """
    unittest.BaseTestSuite._cleanup = False
    loader = unittest.TestLoader()
    suite = loader.discover(top, pattern=pattern)

    def _run():
        runner = unittest.TextTestRunner(stream=_NULL_STREAM, verbosity=0)
        runner.run(suite)

    return _run
| 320 | + |
| 321 | + |
def main():
    """CLI entry point. Exits 1 if a leak was flagged, else 0."""
    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    parser.add_argument("--top", default="tests", help="Test discovery root (default: tests)")
    parser.add_argument("--pattern", default="test_*.py", help="Test file glob (default: test_*.py)")
    parser.add_argument("--warmup", type=int, default=1)
    parser.add_argument("--samples", type=int, default=3)
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()

    # Integration tests hit external DBs and are flaky/slow — unfit for a
    # harness that replays the suite. ``setdefault`` still lets a caller
    # opt in by pre-exporting ``SKIP_INTEGRATION=0``.
    os.environ.setdefault("SKIP_INTEGRATION", "1")

    # Probe one mypyc-compiled module to see which build we got: the import
    # machinery resolves the .so ahead of the co-located .py, so __file__
    # on any compiled module is sufficient.
    import sqlglot.expressions.core as _ec

    compiled = _ec.__file__.endswith(".so")
    build_label = "sqlglotc (compiled)" if compiled else "pure python"
    debug_label = "yes" if hasattr(sys, "gettotalrefcount") else "no"
    print(f"build: {build_label}")
    print(f"debug Python: {debug_label}")
    print(f"workload: unittest discover {args.top} pattern {args.pattern}")
    print(f"warmup={args.warmup} samples={args.samples}")
    print()

    workload = _make_workload(args.top, args.pattern)
    growing, rc_delta, rss_delta, reasons = check_leaks(
        workload,
        warmup=args.warmup,
        samples=args.samples,
    )
    _print_report(
        f"{args.top} ({args.pattern})",
        growing,
        rc_delta,
        rss_delta,
        reasons,
        verbose=args.verbose,
    )
    return 1 if reasons else 0
| 362 | + |
| 363 | + |
if __name__ == "__main__":
    sys.exit(main())
0 commit comments