From fa1722759b113622871fe69a98489236ddbaa39f Mon Sep 17 00:00:00 2001
From: AztecBot <49558828+AztecBot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 09:37:54 +0000
Subject: [PATCH] fix(bb): size Pippenger MSM arena for the non-GLV mid-band
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

UltraHonk `bb prove` intermittently aborts in CI with:

```
libc++abi: terminating due to uncaught exception of type std::runtime_error:
Assertion failed: (aligned_local + bytes <= bound)
  Left   : 4123456
  Right  : 3699643
```
The assertion is `MsmArena::bump_alloc`'s `BB_ASSERT_LTE(aligned_local + bytes, bound)` in `scalar_multiplication_fast.cpp`. It is **not** OOM — the per-MSM scratch arena was sized smaller than the live allocator consumes.

## Root cause

`compute_arena_bytes_for_msm` sizes the arena from the conservative full-bit schedule (`NUM_BITS = FULL_NUM_BITS = 254` for non-GLV). The live path (`pippenger_round_parallel`) then shrinks the budget to `effective_num_bits` = the observed max scalar MSB, which can make `choose_window_bits` pick a *smaller* `window_bits` ⇒ *more* windows ⇒ a larger Zone S than the full-bit pre-size.

The sizer already has a defensive sweep that takes the max arena footprint across every bit budget, but it was gated `if (use_glv || n_input >= 2^17)`. That left the **non-GLV mid-band** (`GLV_SMALL_N_THRESHOLD < n_input < 2^17`, i.e. `8192 < n < 131072` natively) sized for the full-bit schedule only. `simple_shield`'s commitment MSM is ~28,696 points — non-GLV, squarely in that gap — so a witness whose scalars drive a smaller `effective_num_bits` overflows the arena.

It's intermittent because it needs both a runner core-count that yields the heavier schedule (this run reported 8 threads) and witness data whose max scalar MSB lands in the trigger band (~224–250). Both vary per run/PR, so it isn't tied to any particular PR.

## Fix

Make the defensive bit-budget sweep unconditional. `use_glv || n_input > GLV_SMALL_N_THRESHOLD` is true for every MSM that reaches this point, so the sweep now covers GLV and all non-GLV sizes. It only ever grows the arena when a shrunk-bit layout genuinely needs more, so the common full-bit-dominated case stays tight. Arena sizing is scratch memory only — no effect on VKs or proof output.

## Test

Added `MidBandArenaSizerCoversAllEffectiveNumBits`, which drives the module's own arena-fit simulator across the non-GLV mid-band × every `effective_num_bits` at 8 threads. It fails on the pre-fix sizer (under-counts at n=28,696 for `effective_num_bits` 224–248, among others) and passes with the fix. Full `ScalarMultiplication*` suite (110 tests, incl. real-MSM correctness and the small-scalar band test) passes.
---
 .../pippenger_arena_layout.hpp                |  6 ++-
 .../scalar_multiplication.test.cpp            | 30 +++++++++++++++
 .../scalar_multiplication_fast.cpp            | 38 ++++++++++++-------
 3 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_arena_layout.hpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_arena_layout.hpp
index 22453899e6d4..65c8e02d9b5a 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_arena_layout.hpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_arena_layout.hpp
@@ -59,7 +59,9 @@ struct VariableWindowSchedule {
     size_t num_windows = 0;
     std::array<uint8_t, VAR_WINDOW_MAX_WINDOWS> window_bits_per_window{}; // window_bits_w for each w
     std::array<uint16_t, VAR_WINDOW_MAX_WINDOWS> bit_base{};              // B_w = Σ_{k<w} c_k, B_0 = 0
-    std::array<uint16_t, VAR_WINDOW_MAX_WINDOWS> num_buckets{};           // 2^(window_bits_w - 1) + 1
+    // 2^(window_bits_w - 1) + 1. uint32_t: window_bits = 17 gives 65537, one past uint16_t, and the
+    // cost model can pick window_bits up to 18 for very large MSMs (n approaching the 2^26 SRS cap).
+    std::array<uint32_t, VAR_WINDOW_MAX_WINDOWS> num_buckets{};
 };
 
 // Per-chunk recursive-affine bucket-reduce output (Stage 6b output cell).
@@ -130,7 +132,7 @@ inline VariableWindowSchedule build_var_window_schedule(size_t num_bits, size_t
         const size_t window_bits_w = std::min<size_t>(window_bits, bits_remaining);
         sched.bit_base[w] = static_cast<uint16_t>(bit_offset);
         sched.window_bits_per_window[w] = static_cast<uint8_t>(window_bits_w);
-        sched.num_buckets[w] = static_cast<uint16_t>((size_t{ 1 } << (window_bits_w - 1)) + 1);
+        sched.num_buckets[w] = static_cast<uint32_t>((size_t{ 1 } << (window_bits_w - 1)) + 1);
         bit_offset += window_bits_w;
         bits_remaining -= window_bits_w;
         ++w;
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp
index 3d58a22d5578..cc672c0307b0 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp
@@ -1738,6 +1738,36 @@ TEST(ScalarMultiplicationArenaTest, ArenaLayoutFitsAcrossDispatchSpace)
     bb::set_parallel_for_concurrency(saved_threads);
 }
 
+// Non-GLV mid-band (GLV_SMALL_N_THRESHOLD < n < 2^17) arena-sizing coverage. The live allocator
+// shrinks the window-bit budget to the observed scalar msb, which can pick a heavier schedule than
+// the full-bit pre-sizer; `compute_arena_bytes_for_msm` must upper-bound the arena across every
+// effective_num_bits. Regression for the `bb prove` abort `Assertion failed: (aligned_local + bytes
+// <= bound)` on UltraHonk simple_shield (~28,696-point commitment MSM, 8 threads): that n sits in
+// this band, which the dispatch sweep (probing only 8193 and 262144) and the small-scalar band test
+// (n <= 16384) both miss.
+TEST(ScalarMultiplicationArenaTest, MidBandArenaSizerCoversAllEffectiveNumBits)
+{
+    const size_t saved_threads = bb::get_num_cpus();
+    bb::set_parallel_for_concurrency(8);
+
+    bool found_undersize = false;
+    for (const size_t n :
+         { size_t{ 28696 }, size_t{ 8193 }, size_t{ 1 } << 14, size_t{ 1 } << 15, size_t{ 1 } << 16 }) {
+        for (size_t bits = 1; bits <= 254; ++bits) {
+            if (!pippenger_bn254_arena_layout_fits_for_test(n,
+                                                            /*external_glv_provided=*/false,
+                                                            /*dedup_active=*/false,
+                                                            bits)) {
+                info("UNDERSIZE: n=", n, " effective_num_bits=", bits, " threads=8");
+                found_undersize = true;
+            }
+        }
+    }
+
+    bb::set_parallel_for_concurrency(saved_threads);
+    EXPECT_FALSE(found_undersize) << "arena sizer under-counts in the non-GLV mid-band";
+}
+
 // ======================= Test Wrappers =======================
 
 TYPED_TEST(ScalarMultiplicationTest, PippengerLowMemory)
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication_fast.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication_fast.cpp
index 7115e4d1c4bf..dc1c3413fecc 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication_fast.cpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication_fast.cpp
@@ -1211,13 +1211,10 @@ size_t compute_arena_bytes_for_msm(size_t n_input, bool external_glv_provided, b
     const size_t dedup_bytes = dedup_active ? ((size_t{ 4 } * n) + (size_t{ sizeof(typename Curve::AffineElement) } *
                                                                     round_parallel_detail::DEDUP_MAX_CLUSTERS))
                                             : size_t{ 0 };
-    auto arena_bytes_for_window_layout = [&](size_t bit_budget) {
-        const size_t wb = round_parallel_detail::choose_window_bits(n, bit_budget, n_input, num_logical_threads_for_c);
+    auto arena_bytes_for_window_layout = [&](size_t bit_budget, size_t wb) {
         const auto layout_sched = round_parallel_detail::build_var_window_schedule(bit_budget, wb);
-        size_t B_eff_layout = (size_t{ 1 } << (wb - 1)) + 1;
-        for (size_t w = 0; w < layout_sched.num_windows; ++w) {
-            B_eff_layout = std::max(B_eff_layout, static_cast<size_t>(layout_sched.num_buckets[w]));
-        }
+        // Uniform schedule: the widest window's bucket count is the per-window cap.
+        const size_t B_eff_layout = (size_t{ 1 } << (wb - 1)) + 1;
         const size_t dense_stride_layout = round_parallel_detail::compute_dense_stride(B_eff_layout, num_threads);
         const size_t per_window_bytes_layout = round_parallel_detail::compute_per_window_bytes<Curve>(
             num_threads, B_eff_layout, n, dense_stride_layout, worker_total_for_budget);
@@ -1238,14 +1235,27 @@ size_t compute_arena_bytes_for_msm(size_t n_input, bool external_glv_provided, b
     // first-touch cost regardless of how much of the arena the small MSM_fast actually uses.
     size_t arena_bytes = fixed_overhead + (windows_per_batch * per_window_bytes) + 32768 + dedup_bytes;
 
-    // The live pipeline shrinks NUM_BITS to the observed max scalar bit before choosing
-    // window_bits. GLV MSMs and large non-GLV MSMs can therefore select a different
-    // schedule/zone layout than the full-bit pre-sizer. Keep the common Chonk wire/IPA
-    // non-GLV sizes on the original tight path.
-    if (use_glv || n_input >= (size_t{ 1 } << 17)) {
-        for (size_t bit_budget = 1; bit_budget <= NUM_BITS; ++bit_budget) {
-            arena_bytes = std::max(arena_bytes, arena_bytes_for_window_layout(bit_budget));
-        }
+    // The live pipeline chooses window_bits from the *effective* (nonzero) scalar count and the
+    // observed bit budget after Phase 1: c = choose_window_bits(n_active, effective_num_bits) with
+    // n_active <= n and effective_num_bits <= NUM_BITS. Fewer active points => smaller c => more
+    // windows => a larger arena (most sharply once fixed_overhead has eaten the batch budget and
+    // every window runs in a single batch). Size for the worst reachable c so the bound holds for
+    // any scalar density, with no extra scalar scan.
+    //
+    // For a fixed c, bit_budget = NUM_BITS maximizes the window count (effective_num_bits <=
+    // NUM_BITS) and 2^(c-1)+1 caps B_eff, so arena_bytes_for_window_layout(NUM_BITS, c) dominates
+    // every live (effective_num_bits, c) layout. The reachable c span is [2, c_max]: choose is
+    // non-decreasing in the point count (n_active <= n bounds it above), but the ceil() in the round
+    // count makes it non-monotonic in the bit budget by ±1, so c_max is the max over bit budgets,
+    // not simply choose(n, NUM_BITS).
+    size_t c_max_reachable = window_bits;
+    for (size_t bit_budget = 1; bit_budget <= NUM_BITS; ++bit_budget) {
+        c_max_reachable = std::max(c_max_reachable,
+                                   static_cast<size_t>(round_parallel_detail::choose_window_bits(
+                                       n, bit_budget, n_input, num_logical_threads_for_c)));
+    }
+    for (size_t wb = 2; wb <= c_max_reachable; ++wb) {
+        arena_bytes = std::max(arena_bytes, arena_bytes_for_window_layout(NUM_BITS, wb));
     }
     return arena_bytes;
 }