diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_arena_layout.hpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_arena_layout.hpp index 22453899e6d4..65c8e02d9b5a 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_arena_layout.hpp +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_arena_layout.hpp @@ -59,7 +59,9 @@ struct VariableWindowSchedule { size_t num_windows = 0; std::array window_bits_per_window{}; // window_bits_w for each w std::array bit_base{}; // B_w = Σ_{k num_buckets{}; // 2^(window_bits_w - 1) + 1 + // 2^(window_bits_w - 1) + 1. uint32_t: window_bits = 17 gives 65537, past uint16_t's max (65535), and the + // cost model can pick window_bits up to 18 for very large MSMs (n approaching the 2^26 SRS cap). + std::array num_buckets{}; }; // Per-chunk recursive-affine bucket-reduce output (Stage 6b output cell). @@ -130,7 +132,7 @@ inline VariableWindowSchedule build_var_window_schedule(size_t num_bits, size_t const size_t window_bits_w = std::min(window_bits, bits_remaining); sched.bit_base[w] = static_cast(bit_offset); sched.window_bits_per_window[w] = static_cast(window_bits_w); - sched.num_buckets[w] = static_cast((size_t{ 1 } << (window_bits_w - 1)) + 1); + sched.num_buckets[w] = static_cast((size_t{ 1 } << (window_bits_w - 1)) + 1); bit_offset += window_bits_w; bits_remaining -= window_bits_w; ++w; diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp index 3d58a22d5578..cc672c0307b0 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp @@ -1738,6 +1738,36 @@ TEST(ScalarMultiplicationArenaTest, ArenaLayoutFitsAcrossDispatchSpace) 
bb::set_parallel_for_concurrency(saved_threads); } +// Non-GLV mid-band (GLV_SMALL_N_THRESHOLD < n < 2^17) arena-sizing coverage. The live allocator +// shrinks the window-bit budget to the observed scalar msb, which can pick a heavier schedule than +// the full-bit pre-sizer; `compute_arena_bytes_for_msm` must upper-bound the arena across every +// effective_num_bits. Regression for the `bb prove` abort `Assertion failed: (aligned_local + bytes +// <= bound)` on UltraHonk simple_shield (~28,696-point commitment MSM, 8 threads): that n sits in +// this band, which the dispatch sweep (probing only 8193 and 262144) and the small-scalar band test +// (n <= 16384) both miss. +TEST(ScalarMultiplicationArenaTest, MidBandArenaSizerCoversAllEffectiveNumBits) +{ + const size_t saved_threads = bb::get_num_cpus(); + bb::set_parallel_for_concurrency(8); + + bool found_undersize = false; + for (const size_t n : + { size_t{ 28696 }, size_t{ 8193 }, size_t{ 1 } << 14, size_t{ 1 } << 15, size_t{ 1 } << 16 }) { + for (size_t bits = 1; bits <= 254; ++bits) { + if (!pippenger_bn254_arena_layout_fits_for_test(n, + /*external_glv_provided=*/false, + /*dedup_active=*/false, + bits)) { + info("UNDERSIZE: n=", n, " effective_num_bits=", bits, " threads=8"); + found_undersize = true; + } + } + } + + bb::set_parallel_for_concurrency(saved_threads); + EXPECT_FALSE(found_undersize) << "arena sizer under-counts in the non-GLV mid-band"; +} + // ======================= Test Wrappers ======================= TYPED_TEST(ScalarMultiplicationTest, PippengerLowMemory) diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication_fast.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication_fast.cpp index 7115e4d1c4bf..dc1c3413fecc 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication_fast.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication_fast.cpp @@ 
-1211,13 +1211,10 @@ size_t compute_arena_bytes_for_msm(size_t n_input, bool external_glv_provided, b const size_t dedup_bytes = dedup_active ? ((size_t{ 4 } * n) + (size_t{ sizeof(typename Curve::AffineElement) } * round_parallel_detail::DEDUP_MAX_CLUSTERS)) : size_t{ 0 }; - auto arena_bytes_for_window_layout = [&](size_t bit_budget) { - const size_t wb = round_parallel_detail::choose_window_bits(n, bit_budget, n_input, num_logical_threads_for_c); + auto arena_bytes_for_window_layout = [&](size_t bit_budget, size_t wb) { const auto layout_sched = round_parallel_detail::build_var_window_schedule(bit_budget, wb); - size_t B_eff_layout = (size_t{ 1 } << (wb - 1)) + 1; - for (size_t w = 0; w < layout_sched.num_windows; ++w) { - B_eff_layout = std::max(B_eff_layout, static_cast(layout_sched.num_buckets[w])); - } + // Uniform schedule: the widest window's bucket count is the per-window cap. + const size_t B_eff_layout = (size_t{ 1 } << (wb - 1)) + 1; const size_t dense_stride_layout = round_parallel_detail::compute_dense_stride(B_eff_layout, num_threads); const size_t per_window_bytes_layout = round_parallel_detail::compute_per_window_bytes( num_threads, B_eff_layout, n, dense_stride_layout, worker_total_for_budget); @@ -1238,14 +1235,27 @@ size_t compute_arena_bytes_for_msm(size_t n_input, bool external_glv_provided, b // first-touch cost regardless of how much of the arena the small MSM_fast actually uses. size_t arena_bytes = fixed_overhead + (windows_per_batch * per_window_bytes) + 32768 + dedup_bytes; - // The live pipeline shrinks NUM_BITS to the observed max scalar bit before choosing - // window_bits. GLV MSMs and large non-GLV MSMs can therefore select a different - // schedule/zone layout than the full-bit pre-sizer. Keep the common Chonk wire/IPA - // non-GLV sizes on the original tight path. 
- if (use_glv || n_input >= (size_t{ 1 } << 17)) { - for (size_t bit_budget = 1; bit_budget <= NUM_BITS; ++bit_budget) { - arena_bytes = std::max(arena_bytes, arena_bytes_for_window_layout(bit_budget)); - } + // The live pipeline chooses window_bits from the *effective* (nonzero) scalar count and the + // observed bit budget after Phase 1: c = choose_window_bits(n_active, effective_num_bits) with + // n_active <= n and effective_num_bits <= NUM_BITS. Fewer active points => smaller c => more + // windows => a larger arena (most sharply once fixed_overhead has eaten the batch budget and + // every window runs in a single batch). Size for the worst reachable c so the bound holds for + // any scalar density, with no extra scalar scan. + // + // For a fixed c, bit_budget = NUM_BITS maximizes the window count (effective_num_bits <= + // NUM_BITS) and 2^(c-1)+1 caps B_eff, so arena_bytes_for_window_layout(NUM_BITS, c) dominates + // every live (effective_num_bits, c) layout. The reachable c span is [2, c_max]: choose is + // non-decreasing in the point count (n_active <= n bounds it above), but the ceil() in the round + // count makes it non-monotonic in the bit budget by ±1, so c_max is the max over bit budgets, + // not simply choose(n, NUM_BITS). + size_t c_max_reachable = window_bits; + for (size_t bit_budget = 1; bit_budget <= NUM_BITS; ++bit_budget) { + c_max_reachable = std::max(c_max_reachable, + static_cast(round_parallel_detail::choose_window_bits( + n, bit_budget, n_input, num_logical_threads_for_c))); + } + for (size_t wb = 2; wb <= c_max_reachable; ++wb) { + arena_bytes = std::max(arena_bytes, arena_bytes_for_window_layout(NUM_BITS, wb)); } return arena_bytes; }