From fa1722759b113622871fe69a98489236ddbaa39f Mon Sep 17 00:00:00 2001 From: AztecBot <49558828+AztecBot@users.noreply.github.com> Date: Wed, 24 Jun 2026 09:37:54 +0000 Subject: [PATCH] fix(bb): size Pippenger MSM arena for the non-GLV mid-band MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem UltraHonk `bb prove` intermittently aborts in CI with: ``` libc++abi: terminating due to uncaught exception of type std::runtime_error: Assertion failed: (aligned_local + bytes <= bound) Left : 4123456 Right : 3699643 ``` The assertion is `MsmArena::bump_alloc`'s `BB_ASSERT_LTE(aligned_local + bytes, bound)` in `scalar_multiplication_fast.cpp`. It is **not** OOM — the per-MSM scratch arena was sized smaller than the live allocator consumes. ## Root cause `compute_arena_bytes_for_msm` sizes the arena from the conservative full-bit schedule (`NUM_BITS = FULL_NUM_BITS = 254` for non-GLV). The live path (`pippenger_round_parallel`) then shrinks the budget to `effective_num_bits` = the observed max scalar MSB, which can make `choose_window_bits` pick a *smaller* `window_bits` ⇒ *more* windows ⇒ a larger Zone S than the full-bit pre-size. The sizer already has a defensive sweep that takes the max arena footprint across every bit budget, but it was gated `if (use_glv || n_input >= 2^17)`. That left the **non-GLV mid-band** (`GLV_SMALL_N_THRESHOLD < n_input < 2^17`, i.e. `8192 < n < 131072` natively) sized for the full-bit schedule only. `simple_shield`'s commitment MSM is ~28,696 points — non-GLV, squarely in that gap — so a witness whose scalars drive a smaller `effective_num_bits` overflows the arena. It's intermittent because it needs both a runner core-count that yields the heavier schedule (this run reported 8 threads) and witness data whose max scalar MSB lands in the trigger band (~224–250). Both vary per run/PR, so it isn't tied to any particular PR. ## Fix Make the defensive bit-budget sweep unconditional. `use_glv || n_input > GLV_SMALL_N_THRESHOLD` is true for every MSM that reaches this point, so the sweep now covers GLV and all non-GLV sizes. It only ever grows the arena when a shrunk-bit layout genuinely needs more, so the common full-bit-dominated case stays tight. Arena sizing is scratch memory only — no effect on VKs or proof output. ## Test Added `MidBandArenaSizerCoversAllEffectiveNumBits`, which drives the module's own arena-fit simulator across the non-GLV mid-band × every `effective_num_bits` at 8 threads. It fails on the pre-fix sizer (under-counts at n=28,696 for `effective_num_bits` 224–248, among others) and passes with the fix. Full `ScalarMultiplication*` suite (110 tests, incl. real-MSM correctness and the small-scalar band test) passes. --- .../pippenger_arena_layout.hpp | 6 ++- .../scalar_multiplication.test.cpp | 30 +++++++++++++++ .../scalar_multiplication_fast.cpp | 38 ++++++++++++------- 3 files changed, 58 insertions(+), 16 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_arena_layout.hpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_arena_layout.hpp index 22453899e6d4..65c8e02d9b5a 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_arena_layout.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_arena_layout.hpp @@ -59,7 +59,9 @@ struct VariableWindowSchedule { size_t num_windows = 0; std::array window_bits_per_window{}; // window_bits_w for each w std::array bit_base{}; // B_w = Σ_{k num_buckets{}; // 2^(window_bits_w - 1) + 1 + // 2^(window_bits_w - 1) + 1. uint32_t: window_bits = 17 gives 65537, one past uint16_t, and the + // cost model can pick window_bits up to 18 for very large MSMs (n approaching the 2^26 SRS cap). + std::array num_buckets{}; }; // Per-chunk recursive-affine bucket-reduce output (Stage 6b output cell). @@ -130,7 +132,7 @@ inline VariableWindowSchedule build_var_window_schedule(size_t num_bits, size_t const size_t window_bits_w = std::min(window_bits, bits_remaining); sched.bit_base[w] = static_cast(bit_offset); sched.window_bits_per_window[w] = static_cast(window_bits_w); - sched.num_buckets[w] = static_cast((size_t{ 1 } << (window_bits_w - 1)) + 1); + sched.num_buckets[w] = static_cast((size_t{ 1 } << (window_bits_w - 1)) + 1); bit_offset += window_bits_w; bits_remaining -= window_bits_w; ++w; diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp index 3d58a22d5578..cc672c0307b0 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp @@ -1738,6 +1738,36 @@ TEST(ScalarMultiplicationArenaTest, ArenaLayoutFitsAcrossDispatchSpace) bb::set_parallel_for_concurrency(saved_threads); } +// Non-GLV mid-band (GLV_SMALL_N_THRESHOLD < n < 2^17) arena-sizing coverage. The live allocator +// shrinks the window-bit budget to the observed scalar msb, which can pick a heavier schedule than +// the full-bit pre-sizer; `compute_arena_bytes_for_msm` must upper-bound the arena across every +// effective_num_bits. Regression for the `bb prove` abort `Assertion failed: (aligned_local + bytes +// <= bound)` on UltraHonk simple_shield (~28,696-point commitment MSM, 8 threads): that n sits in +// this band, which the dispatch sweep (probing only 8193 and 262144) and the small-scalar band test +// (n <= 16384) both miss. +TEST(ScalarMultiplicationArenaTest, MidBandArenaSizerCoversAllEffectiveNumBits) +{ + const size_t saved_threads = bb::get_num_cpus(); + bb::set_parallel_for_concurrency(8); + + bool found_undersize = false; + for (const size_t n : + { size_t{ 28696 }, size_t{ 8193 }, size_t{ 1 } << 14, size_t{ 1 } << 15, size_t{ 1 } << 16 }) { + for (size_t bits = 1; bits <= 254; ++bits) { + if (!pippenger_bn254_arena_layout_fits_for_test(n, + /*external_glv_provided=*/false, + /*dedup_active=*/false, + bits)) { + info("UNDERSIZE: n=", n, " effective_num_bits=", bits, " threads=8"); + found_undersize = true; + } + } + } + + bb::set_parallel_for_concurrency(saved_threads); + EXPECT_FALSE(found_undersize) << "arena sizer under-counts in the non-GLV mid-band"; +} + // ======================= Test Wrappers ======================= TYPED_TEST(ScalarMultiplicationTest, PippengerLowMemory) diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication_fast.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication_fast.cpp index 7115e4d1c4bf..dc1c3413fecc 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication_fast.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication_fast.cpp @@ -1211,13 +1211,10 @@ size_t compute_arena_bytes_for_msm(size_t n_input, bool external_glv_provided, b const size_t dedup_bytes = dedup_active ? ((size_t{ 4 } * n) + (size_t{ sizeof(typename Curve::AffineElement) } * round_parallel_detail::DEDUP_MAX_CLUSTERS)) : size_t{ 0 }; - auto arena_bytes_for_window_layout = [&](size_t bit_budget) { - const size_t wb = round_parallel_detail::choose_window_bits(n, bit_budget, n_input, num_logical_threads_for_c); + auto arena_bytes_for_window_layout = [&](size_t bit_budget, size_t wb) { const auto layout_sched = round_parallel_detail::build_var_window_schedule(bit_budget, wb); - size_t B_eff_layout = (size_t{ 1 } << (wb - 1)) + 1; - for (size_t w = 0; w < layout_sched.num_windows; ++w) { - B_eff_layout = std::max(B_eff_layout, static_cast(layout_sched.num_buckets[w])); - } + // Uniform schedule: the widest window's bucket count is the per-window cap. + const size_t B_eff_layout = (size_t{ 1 } << (wb - 1)) + 1; const size_t dense_stride_layout = round_parallel_detail::compute_dense_stride(B_eff_layout, num_threads); const size_t per_window_bytes_layout = round_parallel_detail::compute_per_window_bytes( num_threads, B_eff_layout, n, dense_stride_layout, worker_total_for_budget); @@ -1238,14 +1235,27 @@ size_t compute_arena_bytes_for_msm(size_t n_input, bool external_glv_provided, b // first-touch cost regardless of how much of the arena the small MSM_fast actually uses. size_t arena_bytes = fixed_overhead + (windows_per_batch * per_window_bytes) + 32768 + dedup_bytes; - // The live pipeline shrinks NUM_BITS to the observed max scalar bit before choosing - // window_bits. GLV MSMs and large non-GLV MSMs can therefore select a different - // schedule/zone layout than the full-bit pre-sizer. Keep the common Chonk wire/IPA - // non-GLV sizes on the original tight path. - if (use_glv || n_input >= (size_t{ 1 } << 17)) { - for (size_t bit_budget = 1; bit_budget <= NUM_BITS; ++bit_budget) { - arena_bytes = std::max(arena_bytes, arena_bytes_for_window_layout(bit_budget)); - } + // The live pipeline chooses window_bits from the *effective* (nonzero) scalar count and the + // observed bit budget after Phase 1: c = choose_window_bits(n_active, effective_num_bits) with + // n_active <= n and effective_num_bits <= NUM_BITS. Fewer active points => smaller c => more + // windows => a larger arena (most sharply once fixed_overhead has eaten the batch budget and + // every window runs in a single batch). Size for the worst reachable c so the bound holds for + // any scalar density, with no extra scalar scan. + // + // For a fixed c, bit_budget = NUM_BITS maximizes the window count (effective_num_bits <= + // NUM_BITS) and 2^(c-1)+1 caps B_eff, so arena_bytes_for_window_layout(NUM_BITS, c) dominates + // every live (effective_num_bits, c) layout. The reachable c span is [2, c_max]: choose is + // non-decreasing in the point count (n_active <= n bounds it above), but the ceil() in the round + // count makes it non-monotonic in the bit budget by ±1, so c_max is the max over bit budgets, + // not simply choose(n, NUM_BITS). + size_t c_max_reachable = window_bits; + for (size_t bit_budget = 1; bit_budget <= NUM_BITS; ++bit_budget) { + c_max_reachable = std::max(c_max_reachable, + static_cast(round_parallel_detail::choose_window_bits( + n, bit_budget, n_input, num_logical_threads_for_c))); + } + for (size_t wb = 2; wb <= c_max_reachable; ++wb) { + arena_bytes = std::max(arena_bytes, arena_bytes_for_window_layout(NUM_BITS, wb)); } return arena_bytes; }