AztecProtocol · AztecBot · Jun 23, 2026
diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_arena_layout.hpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_arena_layout.hpp
@@ -59,7 +59,9 @@ struct VariableWindowSchedule {
     size_t num_windows = 0;
     std::array<uint8_t, VAR_WINDOW_MAX_WINDOWS> window_bits_per_window{}; // window_bits_w for each w
     std::array<uint16_t, VAR_WINDOW_MAX_WINDOWS> bit_base{};              // B_w = Σ_{k<w} c_k, B_0 = 0
-    std::array<uint16_t, VAR_WINDOW_MAX_WINDOWS> num_buckets{};           // 2^(window_bits_w - 1) + 1
+    // 2^(window_bits_w - 1) + 1. uint32_t: window_bits = 17 gives 65537 > UINT16_MAX (65535), and the
+    // cost model can pick window_bits up to 18 for very large MSMs (n approaching the 2^26 SRS cap).
+    std::array<uint32_t, VAR_WINDOW_MAX_WINDOWS> num_buckets{};
 };
 
 // Per-chunk recursive-affine bucket-reduce output (Stage 6b output cell).
@@ -130,7 +132,7 @@ inline VariableWindowSchedule build_var_window_schedule(size_t num_bits, size_t
         const size_t window_bits_w = std::min<size_t>(window_bits, bits_remaining);
         sched.bit_base[w] = static_cast<uint16_t>(bit_offset);
         sched.window_bits_per_window[w] = static_cast<uint8_t>(window_bits_w);
-        sched.num_buckets[w] = static_cast<uint16_t>((size_t{ 1 } << (window_bits_w - 1)) + 1);
+        sched.num_buckets[w] = static_cast<uint32_t>((size_t{ 1 } << (window_bits_w - 1)) + 1);
         bit_offset += window_bits_w;
         bits_remaining -= window_bits_w;
         ++w;

diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp
@@ -1738,6 +1738,33 @@ TEST(ScalarMultiplicationArenaTest, ArenaLayoutFitsAcrossDispatchSpace)
     bb::set_parallel_for_concurrency(saved_threads);
 }
 
+// Arena-sizing coverage for the non-GLV mid-band (GLV_SMALL_N_THRESHOLD < n < 2^17). The live
+// allocator shrinks the bit budget to the observed scalar msb, which can select a heavier schedule
+// than the full-bit pre-sizer, so `compute_arena_bytes_for_msm` must upper-bound the arena for
+// every effective_num_bits. Regression for the `bb prove` abort `Assertion failed: (aligned_local
+// + bytes <= bound)` on UltraHonk simple_shield (~28,696-point commitment MSM, 8 threads); neither
+// the dispatch sweep (n = 8193, 262144 only) nor the small-scalar band test (n <= 16384) covers
+// that n. Sweeps bits 1..254 per n at 8 threads, logging every undersized pair before failing.
+TEST(ScalarMultiplicationArenaTest, MidBandArenaSizerCoversAllEffectiveNumBits)
+{
+    const size_t saved_threads = bb::get_num_cpus();
+    bb::set_parallel_for_concurrency(8); // same thread count as the reported abort
+
+    bool found_undersize = false; // accumulate so every failing (n, bits) pair gets logged
+    for (const size_t n : { size_t{ 28696 }, size_t{ 8193 }, size_t{ 1 } << 14, size_t{ 1 } << 15, size_t{ 1 } << 16 }) {
+        for (size_t bits = 1; bits <= 254; ++bits) { // presumably 254 == NUM_BITS for bn254 — confirm
+            if (!pippenger_bn254_arena_layout_fits_for_test(n, /*external_glv_provided=*/false,
+                                                            /*dedup_active=*/false, bits)) {
+                info("UNDERSIZE: n=", n, " effective_num_bits=", bits, " threads=8");
+                found_undersize = true;
+            }
+        }
+    }
+
+    bb::set_parallel_for_concurrency(saved_threads);
+    EXPECT_FALSE(found_undersize) << "arena sizer under-counts in the non-GLV mid-band";
+}
+
 // ======================= Test Wrappers =======================
 
 TYPED_TEST(ScalarMultiplicationTest, PippengerLowMemory)

diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication_fast.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication_fast.cpp
@@ -1211,13 +1211,10 @@ size_t compute_arena_bytes_for_msm(size_t n_input, bool external_glv_provided, b
     const size_t dedup_bytes = dedup_active ? ((size_t{ 4 } * n) + (size_t{ sizeof(typename Curve::AffineElement) } *
                                                                     round_parallel_detail::DEDUP_MAX_CLUSTERS))
                                             : size_t{ 0 };
-    auto arena_bytes_for_window_layout = [&](size_t bit_budget) {
-        const size_t wb = round_parallel_detail::choose_window_bits(n, bit_budget, n_input, num_logical_threads_for_c);
+    auto arena_bytes_for_window_layout = [&](size_t bit_budget, size_t wb) {
         const auto layout_sched = round_parallel_detail::build_var_window_schedule(bit_budget, wb);
-        size_t B_eff_layout = (size_t{ 1 } << (wb - 1)) + 1;
-        for (size_t w = 0; w < layout_sched.num_windows; ++w) {
-            B_eff_layout = std::max(B_eff_layout, static_cast<size_t>(layout_sched.num_buckets[w]));
-        }
+        // Uniform schedule: the widest window's bucket count is the per-window cap.
+        const size_t B_eff_layout = (size_t{ 1 } << (wb - 1)) + 1;
         const size_t dense_stride_layout = round_parallel_detail::compute_dense_stride(B_eff_layout, num_threads);
         const size_t per_window_bytes_layout = round_parallel_detail::compute_per_window_bytes<Curve>(
             num_threads, B_eff_layout, n, dense_stride_layout, worker_total_for_budget);
@@ -1238,14 +1235,27 @@ size_t compute_arena_bytes_for_msm(size_t n_input, bool external_glv_provided, b
     // first-touch cost regardless of how much of the arena the small MSM_fast actually uses.
     size_t arena_bytes = fixed_overhead + (windows_per_batch * per_window_bytes) + 32768 + dedup_bytes;
 
-    // The live pipeline shrinks NUM_BITS to the observed max scalar bit before choosing
-    // window_bits. GLV MSMs and large non-GLV MSMs can therefore select a different
-    // schedule/zone layout than the full-bit pre-sizer. Keep the common Chonk wire/IPA
-    // non-GLV sizes on the original tight path.
-    if (use_glv || n_input >= (size_t{ 1 } << 17)) {
-        for (size_t bit_budget = 1; bit_budget <= NUM_BITS; ++bit_budget) {
-            arena_bytes = std::max(arena_bytes, arena_bytes_for_window_layout(bit_budget));
-        }
+    // The live pipeline chooses window_bits from the *effective* (nonzero) scalar count and the
+    // observed bit budget after Phase 1: c = choose_window_bits(n_active, effective_num_bits) with
+    // n_active <= n and effective_num_bits <= NUM_BITS. Fewer active points => smaller c => more
+    // windows => a larger arena (most sharply once fixed_overhead has eaten the batch budget and
+    // every window runs in a single batch). Size for the worst reachable c so the bound holds for
+    // any scalar density, with no extra scalar scan.
+    //
+    // For a fixed c, bit_budget = NUM_BITS maximizes the window count (effective_num_bits <=
+    // NUM_BITS) and 2^(c-1)+1 caps B_eff, so arena_bytes_for_window_layout(NUM_BITS, c) dominates
+    // every live (effective_num_bits, c) layout. The reachable c span is [2, c_max_reachable]:
+    // choose_window_bits is non-decreasing in the point count (n_active <= n bounds it above), but
+    // the ceil() in its round count makes it non-monotonic (by ±1) in the bit budget, so
+    // c_max_reachable is the max over all bit budgets, not simply choose_window_bits(n, NUM_BITS).
+    size_t c_max_reachable = window_bits;
+    for (size_t bit_budget = 1; bit_budget <= NUM_BITS; ++bit_budget) {
+        c_max_reachable = std::max(c_max_reachable,
+                                   static_cast<size_t>(round_parallel_detail::choose_window_bits(
+                                       n, bit_budget, n_input, num_logical_threads_for_c)));
+    }
+    for (size_t wb = 2; wb <= c_max_reachable; ++wb) {
+        arena_bytes = std::max(arena_bytes, arena_bytes_for_window_layout(NUM_BITS, wb));
     }
     return arena_bytes;
 }