From 557e037c97e934586698c1a877b841047f44eb9b Mon Sep 17 00:00:00 2001
From: Bryce Adelstein Lelbach <brycelelbach@gmail.com>
Date: Sat, 13 Jun 2026 20:33:44 +0000
Subject: [PATCH 1/2] linalg/qr: benchmark heterogeneous & ill-conditioned
 batches

The QR benchmark set ranked only well-conditioned dense batches (cond 1-2),
while the ill-conditioned stress structures (rankdef, clustered, band,
rowscale, nearcollinear, nearrank) appeared only in the correctness tests.
Worse, every batch was conditioning-homogeneous: generate_input applied one
structure uniformly to all `batch` matrices.

Together these let a submission read a few matrices, conclude "the whole batch
is well-conditioned," and route the entire batch to a TF32/Cholesky fast path
that is only numerically valid for well-conditioned inputs -- winning the
ranked (all-dense) cases, while the only cases that could have exposed the
shortcut (the stress structures) were unranked. On a realistic batch
(per-layer or per-block optimizer factors with varying conditioning, in random
positions) such a kernel is either wrong or silently falls back to the
accurate path, but the benchmark never built such a batch.

This change makes conditioning robustness part of the score, not just a gate:

- reference.py: add a `mixed` case that assigns each matrix in the batch an
  independent conditioning profile (well-conditioned dense majority interleaved
  with the ill-conditioned structures) at a random, seeded position. The
  per-case logic is factored into `_apply_case` and reused; existing
  homogeneous cases produce bit-for-bit identical data (verified on CPU), so
  prior leaderboard results are unaffected.
- task.yml: add `mixed` cases to the tests AND the benchmarks, plus fully
  ill-conditioned homogeneous batches (rankdef/clustered/nearrank) at the
  dominant benchmark shapes, so the runtime cost of the accurate path on hard
  inputs is ranked too.

The reference `torch.geqrf` passes the checker on all new cases with wide
margin (scaled factor residual 0.002-0.015 vs gate 20; orthogonality
0.17-0.32 vs gate 100), so the problem stays well-posed. New benchmark cases
reuse existing shapes, so the memory/timeout envelope is unchanged.

Not yet validated on B200 (authored while the target GPU was busy); needs a
benchmark/leaderboard run to confirm timings and timeouts.

Co-Authored-By: Claude <noreply@anthropic.com>
---
 problems/linalg/qr_py/reference.py | 89 ++++++++++++++++++++++++------
 problems/linalg/qr_py/task.yml     | 21 +++++++
 2 files changed, 93 insertions(+), 17 deletions(-)

diff --git a/problems/linalg/qr_py/reference.py b/problems/linalg/qr_py/reference.py
index fc8ace77..696d3d51 100644
--- a/problems/linalg/qr_py/reference.py
+++ b/problems/linalg/qr_py/reference.py
@@ -20,18 +20,22 @@ def _band_mask(n: int, bandwidth: int, device: torch.device) -> torch.Tensor:
     return (idx[:, None] - idx[None, :]).abs() <= bandwidth
 
 
-def generate_input(batch: int, n: int, cond: int, seed: int, case: str = "dense") -> input_t:
-    assert batch > 0, "batch must be positive"
-    assert n > 0, "n must be positive"
-    assert cond >= 0, "cond must be non-negative"
-
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    gen = torch.Generator(device=device)
-    gen.manual_seed(seed)
-
-    case = case.lower()
-    a = torch.randn((batch, n, n), device=device, dtype=torch.float32, generator=gen)
-
+# Per-matrix conditioning profiles drawn for the "mixed" case. "dense" is the
+# well-conditioned majority; the rest are the ill-conditioned stress structures.
+_MIXED_PROFILES = ("dense", "rankdef", "nearrank", "clustered", "band", "rowscale", "nearcollinear")
+# Relative sampling weights (normalized by torch.multinomial); dense ~= 50%.
+_MIXED_WEIGHTS = (6.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)
+
+
+def _apply_case(a: torch.Tensor, case: str, cond: int, gen: torch.Generator) -> torch.Tensor:
+    # Apply one conditioning profile to an already-drawn base batch `a` of shape
+    # (m, n, n), drawing any case-specific extra randomness from `gen`. Factored
+    # out of generate_input so the homogeneous cases and the per-matrix "mixed"
+    # case share a single implementation. The draw order (base first in the
+    # caller, then the case extras here) matches the original code, so every
+    # homogeneous case produces bit-for-bit identical data to before.
+    m, n = a.shape[0], a.shape[-1]
+    device = a.device
     if case == "dense":
         a = _apply_column_scaling(a, cond)
     elif case == "upper":
@@ -40,7 +44,7 @@ def generate_input(batch: int, n: int, cond: int, seed: int, case: str = "dense"
         a.diagonal(dim1=-2, dim2=-1).add_(diag_boost)
         a = _apply_column_scaling(a, cond)
     elif case == "diagonal":
-        diag = torch.randn((batch, n), device=device, dtype=torch.float32, generator=gen)
+        diag = torch.randn((m, n), device=device, dtype=torch.float32, generator=gen)
         diag = diag.sign().clamp(min=0.0).mul(2.0).sub(1.0) * torch.logspace(
             0.0, -float(max(cond, 2)), n, device=device, dtype=torch.float32
         )
@@ -54,7 +58,7 @@ def generate_input(batch: int, n: int, cond: int, seed: int, case: str = "dense"
         tail = n - rank
         if tail > 0:
             noise = torch.randn(
-                (batch, n, tail), device=device, dtype=torch.float32, generator=gen
+                (m, n, tail), device=device, dtype=torch.float32, generator=gen
             )
             a[:, :, rank:] = a[:, :, :tail] + 1.0e-5 * noise
         a = _apply_column_scaling(a, cond)
@@ -73,9 +77,9 @@ def generate_input(batch: int, n: int, cond: int, seed: int, case: str = "dense"
         a.diagonal(dim1=-2, dim2=-1).add_(diag_boost)
         a = _apply_column_scaling(a, cond)
     elif case == "nearcollinear":
-        base = torch.randn((batch, n, 1), device=device, dtype=torch.float32, generator=gen)
-        noise = torch.randn((batch, n, n), device=device, dtype=torch.float32, generator=gen)
-        a = base.expand(batch, n, n) + 1.0e-4 * noise
+        base = torch.randn((m, n, 1), device=device, dtype=torch.float32, generator=gen)
+        noise = torch.randn((m, n, n), device=device, dtype=torch.float32, generator=gen)
+        a = base.expand(m, n, n) + 1.0e-4 * noise
         a = _apply_column_scaling(a, cond)
     elif case == "rowscale":
         row_cond = max(cond, 4)
@@ -83,6 +87,57 @@ def generate_input(batch: int, n: int, cond: int, seed: int, case: str = "dense"
         a = scales.reshape(1, n, 1) * a
     else:
         raise ValueError(f"unknown QR test case: {case}")
+    return a
+
+
+def _generate_mixed(a: torch.Tensor, cond: int, gen: torch.Generator) -> torch.Tensor:
+    # Heterogeneous batch: assign each matrix an independent conditioning profile
+    # at a RANDOM position in the batch (seeded, so still deterministic), so
+    # well- and ill-conditioned matrices are interleaved rather than uniform
+    # across the batch. This matches the real optimizer-statistics regime (the
+    # per-layer / per-block factors have wildly different conditioning) and it
+    # removes the loophole where a kernel samples a few matrices, concludes the
+    # whole batch is well-conditioned, and routes it all to a fast path that is
+    # only numerically valid for well-conditioned inputs. With a mix present,
+    # passing the correctness gate requires handling each matrix on its merits.
+    m = a.shape[0]
+    device = a.device
+    weights = torch.tensor(_MIXED_WEIGHTS, dtype=torch.float32, device=device)
+    labels = torch.multinomial(weights, m, replacement=True, generator=gen)
+    # Guarantee both a well-conditioned and an ill-conditioned matrix are present.
+    # (Only relevant for tiny batches; large batches get both with high prob.)
+    if m >= 2:
+        is_dense = labels == 0
+        if not bool(is_dense.any()):
+            labels[int(torch.randint(0, m, (1,), device=device, generator=gen))] = 0
+        elif bool(is_dense.all()):
+            pos = int(torch.randint(0, m, (1,), device=device, generator=gen))
+            labels[pos] = int(torch.randint(1, len(_MIXED_PROFILES), (1,), device=device, generator=gen))
+    # Process profiles in fixed order over the present labels so the RNG draws
+    # inside _apply_case are deterministic for a given seed.
+    for k, prof in enumerate(_MIXED_PROFILES):
+        mask = labels == k
+        if bool(mask.any()):
+            a[mask] = _apply_case(a[mask], prof, cond, gen)
+    return a
+
+
+def generate_input(batch: int, n: int, cond: int, seed: int, case: str = "dense") -> input_t:
+    assert batch > 0, "batch must be positive"
+    assert n > 0, "n must be positive"
+    assert cond >= 0, "cond must be non-negative"
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    gen = torch.Generator(device=device)
+    gen.manual_seed(seed)
+
+    case = case.lower()
+    a = torch.randn((batch, n, n), device=device, dtype=torch.float32, generator=gen)
+
+    if case == "mixed":
+        a = _generate_mixed(a, cond, gen)
+    else:
+        a = _apply_case(a, case, cond, gen)
 
     return a.contiguous()
 
diff --git a/problems/linalg/qr_py/task.yml b/problems/linalg/qr_py/task.yml
index 8e935eba..bc7d071d 100644
--- a/problems/linalg/qr_py/task.yml
+++ b/problems/linalg/qr_py/task.yml
@@ -39,6 +39,19 @@ description: |
   structure, such as rank-deficient, near-rank-deficient, banded, row-scaled,
   near-collinear, upper-triangular, or clustered-scale inputs.
 
+  The `mixed` case builds a heterogeneous batch: each matrix is independently
+  assigned a conditioning profile (a well-conditioned dense majority interleaved
+  with the ill-conditioned stress structures above) at a random position in the
+  batch. This mirrors the real optimizer-statistics regime, where the per-layer
+  or per-block factors batched into one call have widely varying conditioning,
+  rather than all sharing one structure. The benchmark set (not just the test
+  set) now includes both `mixed` batches and fully ill-conditioned homogeneous
+  batches, so conditioning robustness is ranked, not only gated: an
+  implementation cannot inspect a few matrices, decide the whole batch is
+  well-conditioned, and route it to a path that is only valid for well-conditioned
+  inputs, and the runtime cost of the accurate path on hard inputs is part of the
+  score. Each matrix must be factored correctly on its own merits.
+
   Correctness is a hard gate against the original FP32 input and the FP32
   `torch.geqrf` compact-factor contract. Low-bit FP16, FP8, or NVFP4 work is
   allowed only as an internal implementation strategy: returned factors must
@@ -89,6 +102,9 @@ tests:
   - {"batch": 2, "n": 2048, "cond": 2, "seed": 224466, "case": "dense"}
   - {"batch": 2, "n": 2048, "cond": 0, "seed": 224467, "case": "rankdef"}
   - {"batch": 1, "n": 4096, "cond": 0, "seed": 75343, "case": "upper"}
+  - {"batch": 16, "n": 512, "cond": 2, "seed": 32530, "case": "mixed"}
+  - {"batch": 4, "n": 1024, "cond": 2, "seed": 4332, "case": "mixed"}
+  - {"batch": 2, "n": 2048, "cond": 2, "seed": 224468, "case": "mixed"}
 
 benchmarks:
   - {"batch": 20, "n": 32, "cond": 1, "seed": 43214}
@@ -98,3 +114,8 @@ benchmarks:
   - {"batch": 60, "n": 1024, "cond": 2, "seed": 75342}
   - {"batch": 8, "n": 2048, "cond": 1, "seed": 224466}
   - {"batch": 2, "n": 4096, "cond": 1, "seed": 32412}
+  - {"batch": 640, "n": 512, "cond": 2, "seed": 770001, "case": "mixed"}
+  - {"batch": 60, "n": 1024, "cond": 2, "seed": 770002, "case": "mixed"}
+  - {"batch": 640, "n": 512, "cond": 0, "seed": 770003, "case": "rankdef"}
+  - {"batch": 640, "n": 512, "cond": 0, "seed": 770004, "case": "clustered"}
+  - {"batch": 60, "n": 1024, "cond": 0, "seed": 770005, "case": "nearrank"}

From a8f2143193b47647b24cd1c1edc7141fce375e7a Mon Sep 17 00:00:00 2001
From: Bryce Adelstein Lelbach <brycelelbach@gmail.com>
Date: Mon, 15 Jun 2026 02:06:09 +0000
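Subject: [PATCH 2/2] linalg/qr: check factor residual per matrix

check_implementation reduced both the factor residual
(`_matrix_l1_norm(r_check - projected)`) and the input scale
(`_matrix_l1_norm(a_check)`) to their batch-wide maxima with `.amax()` before
comparing them. The allowance for every matrix was therefore set by the
largest-scale matrix in the batch, so a small-scale matrix with a
comparatively large residual could pass the gate.

Keep both quantities per matrix and fail if any matrix exceeds
`factor_rtol * factor_scale` for its own scale. On failure, report the index
of the matrix with the largest scaled residual together with its residual,
allowance, and scaled residual. The success message now reports the batch
maximum of the scaled factor residual.
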
---
 problems/linalg/qr_py/reference.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/problems/linalg/qr_py/reference.py b/problems/linalg/qr_py/reference.py
index 696d3d51..b4b66e93 100644
--- a/problems/linalg/qr_py/reference.py
+++ b/problems/linalg/qr_py/reference.py
@@ -202,15 +202,18 @@ def check_implementation(data: input_t, output: output_t) -> tuple[bool, str]:
     q_check = q.double()
     r_check = r.double()
     projected = q_check.transpose(-1, -2) @ a_check
-    factor_residual = _matrix_l1_norm(r_check - projected).amax()
-    factor_scale = _matrix_l1_norm(a_check).amax()
+    factor_residual = _matrix_l1_norm(r_check - projected)
+    factor_scale = _matrix_l1_norm(a_check)
     factor_allowed = factor_rtol * factor_scale
     factor_scaled = _scaled_residual(factor_residual, factor_scale, n)
-    if factor_residual.item() > factor_allowed.item():
+    factor_failed = factor_residual > factor_allowed
+    if bool(factor_failed.any().item()):
+        worst = int(factor_scaled.argmax().item())
         return False, (
             "R - Q.T @ A is too large: "
-            f"residual={factor_residual.item():.3g}, allowed={factor_allowed.item():.3g}, "
-            f"scaled={factor_scaled.item():.3g}"
+            f"matrix={worst}, residual={factor_residual[worst].item():.3g}, "
+            f"allowed={factor_allowed[worst].item():.3g}, "
+            f"scaled={factor_scaled[worst].item():.3g}"
         )
 
     eye = torch.eye(n, device=a.device, dtype=torch.float64).expand(batch, n, n)
@@ -239,7 +242,7 @@ def check_implementation(data: input_t, output: output_t) -> tuple[bool, str]:
     return True, (
         f"factor_rtol={factor_rtol:.3g}; "
         f"orth_rtol={orth_rtol:.3g}; "
-        f"scaled_factor_residual={factor_scaled.item():.3g}; "
+        f"scaled_factor_residual={factor_scaled.amax().item():.3g}; "
         f"scaled_reconstruction_residual={recon_scaled.item():.3g}; "
         f"scaled_triangular_residual={tri_scaled.item():.3g}; "
         f"scaled_orthogonality_residual={orth_scaled.item():.3g}; "