Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,9 @@ struct VariableWindowSchedule {
size_t num_windows = 0;
std::array<uint8_t, VAR_WINDOW_MAX_WINDOWS> window_bits_per_window{}; // window_bits_w for each w
std::array<uint16_t, VAR_WINDOW_MAX_WINDOWS> bit_base{}; // B_w = Σ_{k<w} c_k, B_0 = 0
std::array<uint16_t, VAR_WINDOW_MAX_WINDOWS> num_buckets{}; // 2^(window_bits_w - 1) + 1
// 2^(window_bits_w - 1) + 1. uint32_t: window_bits = 17 gives 65537, which exceeds the uint16_t
// maximum (65535), and the cost model can pick window_bits up to 18 for very large MSMs (n
// approaching the 2^26 SRS cap).
std::array<uint32_t, VAR_WINDOW_MAX_WINDOWS> num_buckets{};
};

// Per-chunk recursive-affine bucket-reduce output (Stage 6b output cell).
Expand Down Expand Up @@ -130,7 +132,7 @@ inline VariableWindowSchedule build_var_window_schedule(size_t num_bits, size_t
const size_t window_bits_w = std::min<size_t>(window_bits, bits_remaining);
sched.bit_base[w] = static_cast<uint16_t>(bit_offset);
sched.window_bits_per_window[w] = static_cast<uint8_t>(window_bits_w);
sched.num_buckets[w] = static_cast<uint16_t>((size_t{ 1 } << (window_bits_w - 1)) + 1);
sched.num_buckets[w] = static_cast<uint32_t>((size_t{ 1 } << (window_bits_w - 1)) + 1);
bit_offset += window_bits_w;
bits_remaining -= window_bits_w;
++w;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1738,6 +1738,33 @@ TEST(ScalarMultiplicationArenaTest, ArenaLayoutFitsAcrossDispatchSpace)
bb::set_parallel_for_concurrency(saved_threads);
}

// Non-GLV mid-band (GLV_SMALL_N_THRESHOLD < n < 2^17) arena-sizing coverage. The live allocator
// shrinks the window-bit budget to the observed scalar msb, which can pick a heavier schedule than
// the full-bit pre-sizer; `compute_arena_bytes_for_msm` must upper-bound the arena across every
// effective_num_bits. Regression for the `bb prove` abort `Assertion failed: (aligned_local + bytes
// <= bound)` on UltraHonk simple_shield (~28,696-point commitment MSM, 8 threads): that n sits in
// this band, which the dispatch sweep (probing only 8193 and 262144) and the small-scalar band test
// (n <= 16384) both miss.
TEST(ScalarMultiplicationArenaTest, MidBandArenaSizerCoversAllEffectiveNumBits)
{
    // Remember the value reported by get_num_cpus() so it can be handed back to
    // set_parallel_for_concurrency() once the sweep is done, then run the sweep with 8 threads.
    const size_t previous_concurrency = bb::get_num_cpus();
    bb::set_parallel_for_concurrency(8);

    // Probe every (n, effective_num_bits) pair. Failures are logged individually and only recorded
    // in a flag, so a single run reports every offending combination instead of just the first.
    bool any_layout_overflow = false;
    for (const size_t num_points :
         { size_t{ 28696 }, size_t{ 8193 }, size_t{ 1 } << 14, size_t{ 1 } << 15, size_t{ 1 } << 16 }) {
        for (size_t effective_bits = 1; effective_bits <= 254; ++effective_bits) {
            const bool layout_fits = pippenger_bn254_arena_layout_fits_for_test(
                num_points, /*external_glv_provided=*/false, /*dedup_active=*/false, effective_bits);
            if (layout_fits) {
                continue;
            }
            info("UNDERSIZE: n=", num_points, " effective_num_bits=", effective_bits, " threads=8");
            any_layout_overflow = true;
        }
    }

    // Put the concurrency setting back before asserting.
    bb::set_parallel_for_concurrency(previous_concurrency);
    EXPECT_FALSE(any_layout_overflow) << "arena sizer under-counts in the non-GLV mid-band";
}

// ======================= Test Wrappers =======================

TYPED_TEST(ScalarMultiplicationTest, PippengerLowMemory)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1211,13 +1211,10 @@ size_t compute_arena_bytes_for_msm(size_t n_input, bool external_glv_provided, b
const size_t dedup_bytes = dedup_active ? ((size_t{ 4 } * n) + (size_t{ sizeof(typename Curve::AffineElement) } *
round_parallel_detail::DEDUP_MAX_CLUSTERS))
: size_t{ 0 };
auto arena_bytes_for_window_layout = [&](size_t bit_budget) {
const size_t wb = round_parallel_detail::choose_window_bits(n, bit_budget, n_input, num_logical_threads_for_c);
auto arena_bytes_for_window_layout = [&](size_t bit_budget, size_t wb) {
const auto layout_sched = round_parallel_detail::build_var_window_schedule(bit_budget, wb);
size_t B_eff_layout = (size_t{ 1 } << (wb - 1)) + 1;
for (size_t w = 0; w < layout_sched.num_windows; ++w) {
B_eff_layout = std::max(B_eff_layout, static_cast<size_t>(layout_sched.num_buckets[w]));
}
// Uniform schedule: the widest window's bucket count is the per-window cap.
const size_t B_eff_layout = (size_t{ 1 } << (wb - 1)) + 1;
const size_t dense_stride_layout = round_parallel_detail::compute_dense_stride(B_eff_layout, num_threads);
const size_t per_window_bytes_layout = round_parallel_detail::compute_per_window_bytes<Curve>(
num_threads, B_eff_layout, n, dense_stride_layout, worker_total_for_budget);
Expand All @@ -1238,14 +1235,27 @@ size_t compute_arena_bytes_for_msm(size_t n_input, bool external_glv_provided, b
// first-touch cost regardless of how much of the arena the small MSM_fast actually uses.
size_t arena_bytes = fixed_overhead + (windows_per_batch * per_window_bytes) + 32768 + dedup_bytes;

// The live pipeline shrinks NUM_BITS to the observed max scalar bit before choosing
// window_bits. GLV MSMs and large non-GLV MSMs can therefore select a different
// schedule/zone layout than the full-bit pre-sizer. Keep the common Chonk wire/IPA
// non-GLV sizes on the original tight path.
if (use_glv || n_input >= (size_t{ 1 } << 17)) {
for (size_t bit_budget = 1; bit_budget <= NUM_BITS; ++bit_budget) {
arena_bytes = std::max(arena_bytes, arena_bytes_for_window_layout(bit_budget));
}
// The live pipeline chooses window_bits from the *effective* (nonzero) scalar count and the
// observed bit budget after Phase 1: c = choose_window_bits(n_active, effective_num_bits) with
// n_active <= n and effective_num_bits <= NUM_BITS. Fewer active points => smaller c => more
// windows => a larger arena (most sharply once fixed_overhead has eaten the batch budget and
// every window runs in a single batch). Size for the worst reachable c so the bound holds for
// any scalar density, with no extra scalar scan.
//
// For a fixed c, bit_budget = NUM_BITS maximizes the window count (effective_num_bits <=
// NUM_BITS) and 2^(c-1)+1 caps B_eff, so arena_bytes_for_window_layout(NUM_BITS, c) dominates
// every live (effective_num_bits, c) layout. The reachable c span is [2, c_max]:
// choose_window_bits is non-decreasing in the point count (n_active <= n bounds it above), but the
// ceil() in the round count makes it non-monotonic in the bit budget by ±1, so c_max is the max
// over bit budgets, not simply choose_window_bits(n, NUM_BITS).
size_t c_max_reachable = window_bits;
for (size_t bit_budget = 1; bit_budget <= NUM_BITS; ++bit_budget) {
c_max_reachable = std::max(c_max_reachable,
static_cast<size_t>(round_parallel_detail::choose_window_bits(
n, bit_budget, n_input, num_logical_threads_for_c)));
}
for (size_t wb = 2; wb <= c_max_reachable; ++wb) {
arena_bytes = std::max(arena_bytes, arena_bytes_for_window_layout(NUM_BITS, wb));
}
return arena_bytes;
}
Expand Down
Loading