diff --git a/problems/linalg/qr_py/eval.py b/problems/linalg/qr_py/eval.py
index c2e9137a..cd2c6bd3 100644
--- a/problems/linalg/qr_py/eval.py
+++ b/problems/linalg/qr_py/eval.py
@@ -241,7 +241,15 @@ def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: l
     logger.log("benchmark-count", len(tests))
     for idx, test in enumerate(tests):
         logger.log(f"benchmark.{idx}.spec", test.spec)
-        result = run_single_benchmark(pool, test, False, 200, 10e9)
+        # recheck=True: re-validate the output of every timed iteration, not just
+        # the pre-timing warmup. Without this, the timed loop (which for the
+        # low-`count` shapes reuses one input object across all repeats) never
+        # re-checks its outputs, so a kernel that diverges only inside the timed
+        # region -- e.g. one that caches and replays an output keyed on the
+        # reused input -- is scored as fast without ever being caught locally.
+        # `leaderboard` mode already rechecks; this brings `benchmark` mode in
+        # line so a wrong timed output fails here too.
+        result = run_single_benchmark(pool, test, True, 200, 10e9)
         if isinstance(result, Stats):
             for field in dataclasses.fields(Stats):
                 logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))