-
Notifications
You must be signed in to change notification settings - Fork 109
Add frequency measurement to the stopping criterion #372
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,120 @@ | ||
| /* | ||
| * Copyright 2026 NVIDIA Corporation | ||
| * | ||
| * Licensed under the Apache License, Version 2.0 with the LLVM exception | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. | ||
| * | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * http://llvm.org/foundation/relicensing/LICENSE.txt | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| #include <nvbench/nvbench.cuh> | ||
|
|
||
| // Grab some testing kernels from NVBench: | ||
| #include <nvbench/test_kernels.cuh> | ||
|
|
||
| // Thrust vectors simplify memory management: | ||
| #include <thrust/device_vector.h> | ||
|
|
||
| #include <stdexcept> | ||
|
|
||
| // This example shows how to write a stopping criterion that *requires* the GPU | ||
| // clock frequency that NVBench observes for every cold-measurement sample. | ||
| // | ||
| // In addition to `do_add_measurement()`, a criterion may override | ||
| // `do_add_frequency()` to receive the SM clock rate (in Hz) measured during the | ||
| // sample. NVBench calls `add_frequency()` immediately before `add_measurement()` | ||
| // for the same sample -- but only when it can measure the clock. It is NOT | ||
| // called while profiling (the `--profile` option) or for CPU-only benchmarks | ||
| // (`nvbench::exec_tag::cpu_only` / `no_gpu`). | ||
| // | ||
| // Like the `fixed` criterion in `custom_criterion.cu`, this one simply runs for | ||
| // a fixed number of samples. The difference is that it also collects the | ||
| // per-sample frequency and throws if a sample arrives without one. The thrown | ||
| // exception is caught per-benchmark by NVBench and reported as a failure, so | ||
| // running this benchmark with `--profile` produces a clear error instead of | ||
| // silently ignoring the missing frequency. | ||
|
|
||
| // Inherit from the stopping_criterion_base class: | ||
| class frequency_criterion final : public nvbench::stopping_criterion_base | ||
| { | ||
| nvbench::int64_t m_num_samples{}; | ||
| bool m_has_frequency{false}; | ||
|
|
||
| public: | ||
| frequency_criterion() | ||
| : nvbench::stopping_criterion_base{"frequency", {{"max-samples", nvbench::int64_t{42}}}} | ||
| {} | ||
|
|
||
| protected: | ||
| // Setup the criterion in the `do_initialize()` method: | ||
| virtual void do_initialize() override | ||
| { | ||
| m_num_samples = 0; | ||
| m_has_frequency = false; | ||
| } | ||
|
|
||
| // Collect the GPU clock frequency for the current sample. NVBench calls this | ||
| // before `do_add_measurement()` whenever a frequency is available: | ||
| virtual void do_add_frequency(nvbench::float32_t /* frequency_hz */) override | ||
| { | ||
| m_has_frequency = true; | ||
| } | ||
|
|
||
| // Process new measurements in the `do_add_measurement()` method: | ||
| virtual void do_add_measurement(nvbench::float64_t /* measurement */) override | ||
| { | ||
| // This criterion requires a frequency for every sample. NVBench calls | ||
| // `do_add_frequency()` before `do_add_measurement()` when one is available, | ||
| // so a missing frequency here means none was provided for this sample: | ||
| if (!m_has_frequency) | ||
| { | ||
| throw std::runtime_error( | ||
| "frequency_criterion requires a GPU clock frequency for every sample, but none was " | ||
| "provided. NVBench does not measure the clock frequency when profiling (--profile) or for " | ||
| "CPU-only benchmarks (nvbench::exec_tag::cpu_only / no_gpu)."); | ||
| } | ||
|
|
||
| m_has_frequency = false; // consume it; the next sample must provide its own | ||
| m_num_samples++; | ||
| } | ||
|
|
||
| // Check if the stopping criterion is met in the `do_is_finished()` method: | ||
| virtual bool do_is_finished() override | ||
| { | ||
| return m_num_samples >= m_params.get_int64("max-samples"); | ||
| } | ||
| }; | ||
|
|
||
| // Register the criterion with NVBench: | ||
| NVBENCH_REGISTER_CRITERION(frequency_criterion); | ||
|
|
||
| void throughput_bench(nvbench::state &state) | ||
| { | ||
| // Allocate input data: | ||
| const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t); | ||
| thrust::device_vector<nvbench::int32_t> input(num_values); | ||
| thrust::device_vector<nvbench::int32_t> output(num_values); | ||
|
|
||
| // Provide throughput information: | ||
| state.add_element_count(num_values, "NumElements"); | ||
| state.add_global_memory_reads<nvbench::int32_t>(num_values, "DataSize"); | ||
| state.add_global_memory_writes<nvbench::int32_t>(num_values); | ||
|
|
||
| state.exec([&input, &output, num_values](nvbench::launch &launch) { | ||
| (void)num_values; // clang thinks this is unused... | ||
| nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>( | ||
| thrust::raw_pointer_cast(input.data()), | ||
| thrust::raw_pointer_cast(output.data()), | ||
| num_values); | ||
| }); | ||
| } | ||
| NVBENCH_BENCH(throughput_bench).set_stopping_criterion("frequency"); | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -109,6 +109,15 @@ public: | |
| this->do_initialize(); | ||
| } | ||
|
|
||
| /** | ||
| * Provide the GPU clock frequency (Hz) observed for the current sample. It is not called when | ||
| * doing CPU-only benchmarking (i.e.: using `nvbench::exec_tag::cpu_only` or | ||
| * `nvbench::exec_tag::no_gpu`) or profiling (`--profile` option). When called, it is done before | ||
| * calling `add_measurement` for the same sample. The number of frequency measurements will be | ||
| * either 0 or equal to the number of calls to `add_measurement`. | ||
|
Comment on lines
+116
to
+117
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. important: Documented pairing invariant is incorrect under exception flow. Line 116-Line 117 says frequency-call count is always 0 or equal to |
||
| */ | ||
| void add_frequency(nvbench::float32_t frequency_hz) { this->do_add_frequency(frequency_hz); } | ||
|
Comment on lines
+112
to
+119
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the documentation could be improved as @oleksandr-pavlyk requested, specifying that number of frequency measurements is either 0 or equal to the number of sample measurements
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done! |
||
|
|
||
| /** | ||
| * Add the latest measurement to the criterion | ||
| */ | ||
|
|
@@ -134,6 +143,11 @@ protected: | |
| * Check if the criterion has been met for all measurements processed by `add_measurement` | ||
| */ | ||
| virtual bool do_is_finished() = 0; | ||
|
|
||
| /** | ||
| * Receive the GPU clock frequency (Hz) for the current sample. Default no-op. | ||
| */ | ||
| virtual void do_add_frequency(nvbench::float32_t /*frequency_hz*/) {} | ||
| }; | ||
|
|
||
| } // namespace nvbench | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,6 +6,7 @@ set(test_srcs | |
| cuda_timer.cu | ||
| cuda_stream.cu | ||
| cpu_timer.cu | ||
| criterion_exception.cu | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. important: Gate this GPU-dependent test or ensure runtime skip on no-device environments. Source: Path instructions |
||
| criterion_manager.cu | ||
| criterion_params.cu | ||
| custom_main_custom_args.cu | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,176 @@ | ||
| // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
|
|
||
| #include <nvbench/benchmark.cuh> | ||
| #include <nvbench/callable.cuh> | ||
| #include <nvbench/criterion_manager.cuh> | ||
| #include <nvbench/cuda_call.cuh> | ||
| #include <nvbench/exec_tag.cuh> | ||
| #include <nvbench/launch.cuh> | ||
| #include <nvbench/state.cuh> | ||
| #include <nvbench/stopping_criterion.cuh> | ||
| #include <nvbench/type_list.cuh> | ||
| #include <nvbench/types.cuh> | ||
|
|
||
| #include <cuda_runtime.h> | ||
|
|
||
| #include <fmt/format.h> | ||
|
|
||
| #include <stdexcept> | ||
| #include <string> | ||
|
|
||
| #include "test_asserts.cuh" | ||
|
|
||
| // Verifies that an exception thrown by a stopping criterion aborts the | ||
| // benchmark (the state is marked as failed) instead of being silently swallowed. | ||
|
|
||
| __global__ void spin_kernel(nvbench::uint64_t target_cycles) | ||
| { | ||
| const auto start = static_cast<nvbench::uint64_t>(clock64()); | ||
| while (static_cast<nvbench::uint64_t>(clock64()) - start < target_cycles) | ||
| { | ||
| } | ||
| } | ||
|
|
||
| constexpr nvbench::uint64_t spin_cycles = 100000; | ||
|
|
||
| // Where the criterion should throw from: | ||
| enum class throw_site | ||
| { | ||
| frequency, | ||
| measurement, | ||
| }; | ||
|
|
||
| // Shared probe so the test can observe how many times the criterion was | ||
| // consulted. If the exception were swallowed and sampling continued, these | ||
| // counts would climb well past the single call that throws. | ||
| struct criterion_probe | ||
| { | ||
| throw_site site{throw_site::measurement}; | ||
| int frequency_calls{0}; | ||
| int measurement_calls{0}; | ||
| }; | ||
|
|
||
| criterion_probe g_probe; | ||
|
|
||
| // A stopping criterion that throws on demand. The throw happens on the first | ||
| // sample, which should abort the run before `do_is_finished()` is ever | ||
| // consulted. If `do_is_finished()` *is* reached, the exception must have been | ||
| // swallowed, so it returns true to end the run immediately -- this keeps the | ||
| // regression case from spinning until the benchmark timeout. | ||
| class throwing_criterion final : public nvbench::stopping_criterion_base | ||
| { | ||
| public: | ||
| throwing_criterion() | ||
| : nvbench::stopping_criterion_base{"test_throwing", {}} | ||
| {} | ||
|
|
||
| protected: | ||
| void do_initialize() override {} | ||
|
|
||
| void do_add_frequency(nvbench::float32_t /* frequency_hz */) override | ||
| { | ||
| ++g_probe.frequency_calls; | ||
| if (g_probe.site == throw_site::frequency) | ||
| { | ||
| throw std::runtime_error{"criterion failure from add_frequency"}; | ||
| } | ||
| } | ||
|
|
||
| void do_add_measurement(nvbench::float64_t /* measurement */) override | ||
| { | ||
| ++g_probe.measurement_calls; | ||
| if (g_probe.site == throw_site::measurement) | ||
| { | ||
| throw std::runtime_error{"criterion failure from add_measurement"}; | ||
| } | ||
| } | ||
|
|
||
| bool do_is_finished() override | ||
| { | ||
| // Only reachable if a sample completed without the throw aborting the run, | ||
| // i.e. the exception was swallowed. Finish immediately so the test fails | ||
| // fast on the is_skipped() check instead of sampling until the timeout. | ||
| return true; | ||
| } | ||
| }; | ||
| NVBENCH_REGISTER_CRITERION(throwing_criterion); | ||
|
|
||
| struct spin_generator | ||
| { | ||
| void operator()(nvbench::state &state, nvbench::type_list<>) const | ||
| { | ||
| state.exec(nvbench::exec_tag::impl::cold, [](nvbench::launch &launch) { | ||
| spin_kernel<<<1, 1, 0, launch.get_stream()>>>(spin_cycles); | ||
| }); | ||
| } | ||
| }; | ||
|
|
||
| using benchmark_type = nvbench::benchmark<spin_generator>; | ||
|
|
||
| // Runs a benchmark whose criterion throws from `site`, and asserts that the | ||
| // benchmark failed (state skipped with the criterion's error) rather than | ||
| // completing. | ||
| void run_and_expect_failure(throw_site site) | ||
| { | ||
| g_probe = criterion_probe{}; | ||
| g_probe.site = site; | ||
|
|
||
| benchmark_type bench{spin_generator{}}; | ||
| bench.add_device(0); | ||
| bench.set_stopping_criterion("test_throwing"); | ||
|
|
||
| // Disable throttle detection. Otherwise the unreliable clock reading of this | ||
| // tiny kernel can look like throttling, causing record_measurements() to | ||
| // discard the sample before the criterion is ever consulted -- the throw | ||
| // would never fire and the run would simply time out. | ||
| bench.set_throttle_threshold(0.f); | ||
|
|
||
| bench.run(); | ||
|
|
||
| NVBENCH_CUDA_CALL(cudaDeviceSynchronize()); | ||
|
|
||
| const auto &states = bench.get_states(); | ||
| ASSERT(!states.empty()); | ||
| for (const auto &state : states) | ||
| { | ||
| ASSERT(state.is_skipped()); | ||
| ASSERT(state.get_skip_reason().find("criterion failure") != std::string::npos); | ||
| } | ||
| } | ||
|
|
||
| // A throw from `add_measurement` must stop the run after the first sample. | ||
| void test_add_measurement_exception_stops_benchmark() | ||
| { | ||
| run_and_expect_failure(throw_site::measurement); | ||
|
|
||
| // The skip check above is what proves the run aborted; this confirms the | ||
| // throw happened on the very first measurement, | ||
| // and that a frequency measurement was collected as well. | ||
| ASSERT(g_probe.frequency_calls == 1); | ||
| ASSERT(g_probe.measurement_calls == 1); | ||
| } | ||
|
|
||
| // A throw from `add_frequency` must stop the run before the measurement for that | ||
| // sample is ever recorded. | ||
| void test_add_frequency_exception_stops_benchmark() | ||
| { | ||
| run_and_expect_failure(throw_site::frequency); | ||
|
|
||
| ASSERT(g_probe.frequency_calls == 1); | ||
| ASSERT(g_probe.measurement_calls == 0); | ||
| } | ||
|
|
||
| int main() | ||
| try | ||
| { | ||
| test_add_measurement_exception_stops_benchmark(); | ||
| test_add_frequency_exception_stops_benchmark(); | ||
|
|
||
| return 0; | ||
| } | ||
| catch (std::exception &e) | ||
| { | ||
| fmt::print("{}\n", e.what()); | ||
| return 1; | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
suggestion: Line 82’s failure explanation is incomplete:
add_frequency()is also skipped when throttling checks are disabled, so this criterion can throw outside--profileand CPU-only modes. Update the exception text (or explicitly enforce a nonzero throttle threshold in this example) so users get an accurate cause.As per path instructions, “Check that examples are minimal, buildable, technically correct, use NVBench APIs idiomatically, avoid excessive benchmark runtime, and demonstrate behavior that is useful to users.”
Source: Path instructions