From 3f53f04796eda02fa77df0503c28a4157b78d284 Mon Sep 17 00:00:00 2001 From: Phil Windle Date: Tue, 23 Jun 2026 21:12:49 +0000 Subject: [PATCH] feat(bench): schema v4 + scrape proving-infra and per-role saturation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 (A-1221) + Phase 2 (A-1222) of the nightly benchmarking revamp. Schema v4 (additive over v3, all v3 fields retained): - New optional sections provingInfra and saturation (metricSeriesMap: open slug -> timeSeries map), plus run.sweepId/sweepLabel so a night's 1/5/10 TPS points group as one sweep. - Version gate bumped to "4" in bench_output.schema.json and the network_bench_upload check in bootstrap.sh; index.json entry now carries sweepId/sweepLabel. (Dashboard SUPPORTED_RUN_VERSION lives in AztecProtocol/explorations and must be bumped there separately.) - 10tps-readiness-spec.md: the canonical spec — tx-lifecycle stage waterfall mapped to metrics, headline KPIs + thresholds, sweep/run-group notion. Scraper (bench_scrape.ts): - provingInfra: prover-node hint-gen (public_processor.* + prover_node block/checkpoint processing, scoped to the prover-node pod) and proving-queue series broken down by aztec_proving_job_type (size/active/job_duration/ timed-out/resolved). There is no prover_node.execution.duration metric; the re-execution is the public_processor.* path, mapped accordingly. - saturation: per-role ELU/CPU/memory as max (hottest pod) AND avg, never a single hand-picked pod. ELU/mem from nodejs.*, CPU from host-metrics. - Sections scrape independently so one failure cannot drop the others. CPU and ELU may be telemetry-gated in the bench env (empty series, non-fatal); flagged in the spec for live verification per A-1222 acceptance. 
--- spartan/bootstrap.sh | 6 +- .../bench_10tps/10tps-readiness-spec.md | 69 ++++++ .../bench_10tps/bench_output.schema.json | 222 +++++------------- spartan/scripts/bench_10tps/bench_scrape.ts | 182 +++++++++++++- 4 files changed, 311 insertions(+), 168 deletions(-) create mode 100644 spartan/scripts/bench_10tps/10tps-readiness-spec.md diff --git a/spartan/bootstrap.sh b/spartan/bootstrap.sh index 4a4a19db9d69..fc4e3b322a51 100755 --- a/spartan/bootstrap.sh +++ b/spartan/bootstrap.sh @@ -286,8 +286,8 @@ function network_bench_upload { # Reject anything that's not the schema we've designed the index against. local schema=$(jq -r .schemaVersion "$run_json") - if [[ "$schema" != "3" ]]; then - echo "[network_bench] run JSON has schemaVersion '$schema', expected '3'; skipping upload" + if [[ "$schema" != "4" ]]; then + echo "[network_bench] run JSON has schemaVersion '$schema', expected '4'; skipping upload" return 0 fi @@ -304,6 +304,8 @@ function network_bench_upload { startedAt: .run.startedAt, endedAt: .run.endedAt, targetTps: .run.targetTps, + sweepId: .run.sweepId, + sweepLabel: .run.sweepLabel, workload: .run.workload, testDurationSeconds: .run.testDurationSeconds, namespace: .run.namespace, diff --git a/spartan/scripts/bench_10tps/10tps-readiness-spec.md b/spartan/scripts/bench_10tps/10tps-readiness-spec.md new file mode 100644 index 000000000000..60e620f09d58 --- /dev/null +++ b/spartan/scripts/bench_10tps/10tps-readiness-spec.md @@ -0,0 +1,69 @@ +# 10 TPS readiness benchmark — spec (schema v4) + +Canonical contract for the custom benchmark pipeline: +`bench_scrape.ts` → `gs://aztec-testnet/network_bench/<runId>.json` (+ `index.json`) → `network-dashboard` (in `AztecProtocol/explorations`). + +This doc is the Phase 1 deliverable (Linear A-1221). It defines the tx-lifecycle stage list, the headline KPIs and their thresholds, and the sweep/run-group notion. 
The machine-readable contract is `bench_output.schema.json` (v4); the scraper that produces it is `bench_scrape.ts`. + +## 1. tx-lifecycle stage waterfall + +A tx's journey from client submit to epoch proof, each stage mapped to the Prometheus metric (from `yarn-project/telemetry-client/src/metrics.ts`) and where it lands in the run JSON. "ms" durations are histograms (use `_bucket` for quantiles, `_sum`/`_count` for means); never `sum(rate(...))` a metric every node observes — collapse per role with `avg`/`max` (see `network-dashboard/docs/dashboard-design.md`). + +| # | Stage | Primary metric(s) | Run-JSON location | +|---|---|---|---| +| 1 | Submit / ingest | `aztec.node.receive_tx_count` (RPC only — load hits one node) | `timeSeries.ingressTps` | +| 2 | P2P propagation | `aztec.p2p.gossip.message_latency`, `agg_message_latency_p50/p90`, `tx_received_count` | `timeSeries` (gossip) | +| 3 | Mempool wait | `aztec.mempool.tx_count` (pending depth), `aztec.mempool.tx_mined_delay` | `timeSeries.mempoolSize*`, `mempoolMinedMax` | +| 4 | Block build | `aztec.sequencer.block.build_duration`, `build_mana_per_second` | `timeSeries`, `sequencerStateSlots` | +| 5 | Public processing | `aztec.public_processor.tx_duration`, `phase_duration`, `gas_rate` | `timeSeries.publicProcessorGasRate`; **prover-node copy** in `provingInfra.hintGen*` | +| 6 | Attestation / consensus | `aztec.mempool.attestations_mined_delay`; attestation-collect duration vs slot allowance | `timeSeries.attestationsCollect*` | +| 7 | Checkpoint assemble → broadcast | `aztec.archiver.checkpoint_height`, checkpoint block/tx counts | `timeSeries.checkpoint*` | +| 8 | L1 inclusion | `aztec.archiver.checkpoint_l1_inclusion_delay`, `l1_block_height`, `block_height` | `timeSeries`, `blocks` | +| 9 | Proving (epoch) | `aztec.prover_node.checkpoint_proving.duration`, `aztec.archiver.rollup_proof_delay`, `aztec.proving_queue.*` by `job_type`, prover-node block/checkpoint processing | `provingInfra.*` | + 
+**Authoritative user-perceived latency** is client-observed, not Prometheus: `n_tps_test.tx_inclusion_time` (`timeSeries`, `source: client_observed`) — the wall-clock submit→mined delta for high-value txs, computed in the scraper from `n_tps.test.ts` records. Stages 1–8 explain *where* that latency is spent; stage 9 is the separate proving path. + +## 2. Headline KPIs + pass/fail thresholds + +Two independent verdicts — a run can pass inclusion and fail proving (or vice versa). Thresholds are starting points to refine against baselines; encode them in the dashboard, not the scraper (the scraper stays a faithful recorder). + +| KPI | Definition | Pass threshold | +|---|---|---| +| **Inclusion-TPS achieved / target** | `summary.inclusionTpsMean / run.targetTps` | ≥ 0.95 | +| **User-perceived inclusion latency p50** | p50 of `tx_inclusion_time` | ≤ 1 × `AZTEC_SLOT_DURATION` | +| **User-perceived inclusion latency p99** | p99 of `tx_inclusion_time` | ≤ 3 × `AZTEC_SLOT_DURATION` | +| **Proving headroom** | does each epoch's proof land within `AZTEC_PROOF_SUBMISSION_EPOCHS` of epoch close (no proof-window-expiry reorg)? | every epoch proven in window; `reorgCount` from window-expiry = 0 | +| **Reorgs** | `summary.reorgCount` | 0 | + +"Proving headroom" is read off the proving knee — the highest `targetTps` at which every epoch still proves within its window — as the knee's distance from 10 TPS. Below the knee, headroom is positive; above it, epochs miss the window and the pending chain is pruned (the run #95 failure mode). + +## 3. Sweep / run-group notion + +A night's 1/5/10 TPS points are distinct runs (distinct namespaces — queries are namespace-scoped, one run per namespace) that the dashboard must view together. Grouping fields (schema v4, on `run` + mirrored into `index.json`): + +- `run.sweepId` — shared id across the points of one sweep (e.g. `incl-20260623`). Set via `--sweep-id` / `BENCH_SWEEP_ID`. +- `run.sweepLabel` — human label (e.g. `inclusion-sweep`, `proving-sweep`). 
Set via `--sweep-label` / `BENCH_SWEEP_LABEL`. +- `run.targetTps` — the point within the sweep (already present in v3). + +`index.json` entries carry `sweepId`/`sweepLabel`/`targetTps` so the dashboard can group + order points without fetching every run JSON. + +## 4. schema v4 additions (additive over v3) + +All v3 fields retained; a v3-shaped run re-stamped `"4"` still validates against the schema (the new sections are optional in the schema, although the scraper always emits them). New: + +- `provingInfra` (`metricSeriesMap`): prover-node hint-gen (`public_processor.*` + `prover_node.*_processing.duration` scoped to the prover-node pod) and proving-queue series broken down by `aztec_proving_job_type` (size / active / job_duration p50·p99 / timed-out · resolved rates). **Note:** there is no `aztec.prover_node.execution.duration` metric — hint-gen is the `public_processor.*` re-execution on the prover-node pod, mapped accordingly. +- `saturation` (`metricSeriesMap`): per-role ELU / CPU / memory, each as **max (hottest pod)** and **avg**, for validator / rpc / fullNode / proverNode / broker / agent. Never a single hand-picked pod. ELU = `nodejs_eventloop_utilization`, memory = `nodejs_memory_v8_heap_usage` (both `nodejs.*`, not `aztec_`); CPU = `process_cpu_utilization` (from `@opentelemetry/host-metrics`). +- `run.sweepId` / `run.sweepLabel` (§3). + +### Version gate — three places, must stay in sync + +Bumping the schema version means updating the producer (`bench_scrape.ts`: the emitted `schemaVersion` and its `assertShape` check) plus all three gates below, or v4 runs are silently rejected / mis-rendered: + +1. `bench_output.schema.json` — `schemaVersion.const` (✅ `"4"`). +2. `spartan/bootstrap.sh` — `network_bench_upload` schemaVersion check (✅ `"4"`). +3. **`network-dashboard/data.js` `SUPPORTED_RUN_VERSION`** — in `AztecProtocol/explorations`, **not this repo**. Must be bumped to `"4"` there before v4 runs render. Tracked as dashboard work (Phase 5). + +## 5. Verify-on-live caveats (A-1222 acceptance) + +- **CPU** (`process_cpu_utilization`) and **ELU** (`nodejs_eventloop_utilization`) come from telemetry that may be gated in the bench env. 
The scraper emits empty series (non-fatal) if a metric is absent; confirm both flow on a live bench run and fix the exporter/metric name if not. +- Proving-infra durations are recorded in **ms** by convention; confirm units against the live histograms before trusting absolute values. diff --git a/spartan/scripts/bench_10tps/bench_output.schema.json b/spartan/scripts/bench_10tps/bench_output.schema.json index 3685c72960d8..ca1fd691187e 100644 --- a/spartan/scripts/bench_10tps/bench_output.schema.json +++ b/spartan/scripts/bench_10tps/bench_output.schema.json @@ -16,8 +16,8 @@ "properties": { "schemaVersion": { "type": "string", - "const": "3", - "description": "Bump when breaking the schema. Old JSONs keep their previous version so the dashboard can render them side-by-side. v3: timeSeries entries carry `series: [{labels, points}]` instead of bare `points` to support per-pod / per-label data." + "const": "4", + "description": "Bump when breaking the schema. Old JSONs keep their previous version so the dashboard can render them side-by-side. v3: timeSeries entries carry `series: [{labels, points}]` instead of bare `points`. v4 (additive): adds optional `provingInfra` (hint-gen + proving-queue-by-job_type series) and `saturation` (per-role ELU/CPU/memory, max + avg) sections, plus `run.sweepId`/`run.sweepLabel` so a night's 1/5/10 TPS points group as one sweep. All v3 fields retained; a v3 run is unchanged apart from this version string." }, "run": { "$ref": "#/$defs/runMeta" @@ -30,6 +30,14 @@ "$ref": "#/$defs/timeSeriesSection", "description": "PromQL query_range results. Continuous-sampled metrics keyed by unixEpoch; the dashboard normalises to time-within-run via unixEpoch - run.startedAt at render time so multiple runs can overlay on the same x-axis." }, + "provingInfra": { + "$ref": "#/$defs/metricSeriesMap", + "description": "v4. 
Proving-path series for the proving-infra view: prover-node hint-gen / tx re-execution (public_processor.* + prover_node.*_processing.duration scoped to the prover-node pod) and proving-queue behaviour broken down by job_type (aztec_proving_job_type label). Optional — empty/absent on inclusion-only runs." + }, + "saturation": { + "$ref": "#/$defs/metricSeriesMap", + "description": "v4. Per-role resource saturation: ELU, CPU, and memory for each role (validator / rpc / fullNode / proverNode / broker / agent), each emitted as both the hottest pod (max across pods) and the role average (avg across pods). Never a single hand-picked pod. Optional — absent on older runs." + }, "blocks": { "type": "array", "description": "Per-block records parsed from structured logs (each block emits one `Processed N successful txs and M failed txs ...` info line). Authoritative for per-block facts — Prometheus histograms cannot recover per-block samples.", @@ -63,12 +71,7 @@ "runMeta": { "type": "object", "additionalProperties": false, - "required": [ - "runId", - "startedAt", - "endedAt", - "namespace" - ], + "required": ["runId", "startedAt", "endedAt", "namespace"], "properties": { "runId": { "type": "string", @@ -96,9 +99,7 @@ }, "namespace": { "type": "string", - "examples": [ - "bench-10tps" - ] + "examples": ["bench-10tps"] }, "gcpProject": { "type": "string", @@ -120,15 +121,21 @@ "type": "number", "minimum": 0 }, + "sweepId": { + "type": "string", + "description": "v4. Groups the runs of one sweep (e.g. a night's 1/5/10 TPS points) so the dashboard can plot them together. Shared across the points of a sweep; absent for standalone runs." + }, + "sweepLabel": { + "type": "string", + "description": "v4. Human-readable label for the sweep this run belongs to (e.g. 'inclusion-sweep' or 'proving-sweep'). Optional." 
+ }, "testDurationSeconds": { "type": "integer", "minimum": 0 }, "workload": { "type": "string", - "examples": [ - "sha256_hash_1024" - ] + "examples": ["sha256_hash_1024"] }, "aztecConfig": { "type": "object", @@ -167,11 +174,7 @@ "items": { "type": "object", "additionalProperties": false, - "required": [ - "role", - "podName", - "nodeName" - ], + "required": ["role", "podName", "nodeName"], "properties": { "role": { "type": "string" @@ -205,9 +208,7 @@ }, "profile": { "type": "string", - "examples": [ - "network-requirements" - ] + "examples": ["network-requirements"] } } }, @@ -239,40 +240,25 @@ "description": "Maximum time the scraper was allowed to wait for validator pending TxPool depth to reach zero." }, "pendingAtScrape": { - "type": [ - "number", - "null" - ], + "type": ["number", "null"], "minimum": 0, "description": "Validator pending TxPool depth observed when scraping started, or null when the pending drain gate was disabled." }, "pendingByRoleAtScrape": { - "type": [ - "object", - "null" - ], + "type": ["object", "null"], "description": "Pending TxPool depth by pod role at scrape start. RPC/full-node pending can remain non-zero after validators drain, which indicates load that did not propagate to proposers before expiry.", "additionalProperties": false, "properties": { "rpc": { - "type": [ - "number", - "null" - ], + "type": ["number", "null"], "minimum": 0 }, "validator": { - "type": [ - "number", - "null" - ], + "type": ["number", "null"], "minimum": 0 }, "fullNode": { - "type": [ - "number", - "null" - ], + "type": ["number", "null"], "minimum": 0 } } @@ -288,118 +274,66 @@ "summary": { "type": "object", "additionalProperties": false, - "required": [ - "headlineKpi", - "inclusionTpsMean", - "targetTps" - ], + "required": ["headlineKpi", "inclusionTpsMean", "targetTps"], "properties": { "headlineKpi": { - "type": [ - "number", - "null" - ], + "type": ["number", "null"], "description": "inclusionTpsMean / targetTps. 
The single number on the dashboard top strip." }, "targetTps": { "type": "number" }, "inclusionTpsMean": { - "type": [ - "number", - "null" - ], + "type": ["number", "null"], "description": "Inclusion throughput over the observed inclusion window. Uses exact block-log throughput when block records are available, otherwise falls back to the Prometheus inclusionTps mean." }, "inclusionTpsPeak": { - "type": [ - "number", - "null" - ], + "type": ["number", "null"], "description": "Peak sampled Prometheus rolling inclusion rate over the observed scrape window." }, "inclusionLatencyP50Ms": { - "type": [ - "number", - "null" - ] + "type": ["number", "null"] }, "inclusionLatencyP95Ms": { - "type": [ - "number", - "null" - ] + "type": ["number", "null"] }, "inclusionLatencyP99Ms": { - "type": [ - "number", - "null" - ] + "type": ["number", "null"] }, "blockBuildDurationP50Ms": { - "type": [ - "number", - "null" - ] + "type": ["number", "null"] }, "blockBuildDurationP95Ms": { - "type": [ - "number", - "null" - ] + "type": ["number", "null"] }, "publicProcessorTxDurationP50Ms": { - "type": [ - "number", - "null" - ] + "type": ["number", "null"] }, "publicProcessorTxDurationP95Ms": { - "type": [ - "number", - "null" - ] + "type": ["number", "null"] }, "totalTxsMined": { - "type": [ - "integer", - "null" - ], + "type": ["integer", "null"], "description": "Exact sum from per-block logs. Null when block logs were unavailable and inclusionTpsMean came from Prometheus." }, "totalTxsFailed": { - "type": [ - "integer", - "null" - ], + "type": ["integer", "null"], "description": "Exact sum from per-block logs. Null when block logs were unavailable." }, "totalSilentSkipCount": { - "type": [ - "integer", - "null" - ], + "type": ["integer", "null"], "description": "Sum of per-block silentlySkippedCount. > 0 means the post-process blob-field revert path fired during the run." 
}, "totalSilentSkipDurationMs": { - "type": [ - "integer", - "null" - ], + "type": ["integer", "null"], "description": "Sum of per-block silentlySkippedDurationMs. Wall-clock 'wasted' on silently-skipped txs across the run." }, "reorgCount": { - "type": [ - "integer", - "null" - ], + "type": ["integer", "null"], "description": "Count of `Chain pruned` events during the run." }, "deepestReorgBlocks": { - "type": [ - "integer", - "null" - ], + "type": ["integer", "null"], "description": "Max (fromBlock - toBlock) across reorg events. 0 if no reorgs." } } @@ -510,14 +444,17 @@ } } }, + "metricSeriesMap": { + "type": "object", + "description": "v4. Open map of slug -> timeSeries, same per-series shape as timeSeriesSection but without a fixed slug list. Used for provingInfra and saturation, whose slugs are generated per role / job_type.", + "additionalProperties": { + "$ref": "#/$defs/timeSeries" + } + }, "timeSeries": { "type": "object", "additionalProperties": false, - "required": [ - "metric", - "source", - "series" - ], + "required": ["metric", "source", "series"], "properties": { "metric": { "type": "string", @@ -525,19 +462,11 @@ }, "unit": { "type": "string", - "examples": [ - "ms", - "tps", - "mana/s", - "count" - ] + "examples": ["ms", "tps", "mana/s", "count"] }, "source": { "type": "string", - "enum": [ - "promql", - "client_observed" - ], + "enum": ["promql", "client_observed"], "description": "Provenance: 'promql' = scraped via PromQL from cluster Prometheus; 'client_observed' = computed in this scraper from per-tx records emitted by n_tps.test.ts (e.g. headline tx_mined_delay)." 
}, "query": { @@ -560,10 +489,7 @@ "seriesEntry": { "type": "object", "additionalProperties": false, - "required": [ - "labels", - "points" - ], + "required": ["labels", "points"], "properties": { "labels": { "type": "object", @@ -584,20 +510,14 @@ "tsPoint": { "type": "object", "additionalProperties": false, - "required": [ - "unixEpoch", - "value" - ], + "required": ["unixEpoch", "value"], "properties": { "unixEpoch": { "type": "integer", "description": "Seconds since unix epoch for this sample. Dashboards normalise to time-within-run via unixEpoch - run.startedAt at render time." }, "value": { - "type": [ - "number", - "null" - ], + "type": ["number", "null"], "description": "Metric value. null if Prom returned NaN / no data for this step." } } @@ -605,11 +525,7 @@ "blockRecord": { "type": "object", "additionalProperties": false, - "required": [ - "blockNumber", - "blockNumberInTest", - "minedAt" - ], + "required": ["blockNumber", "blockNumberInTest", "minedAt"], "properties": { "blockNumber": { "type": "integer", @@ -671,10 +587,7 @@ "event": { "type": "object", "additionalProperties": false, - "required": [ - "at", - "type" - ], + "required": ["at", "type"], "properties": { "at": { "type": "string", @@ -682,10 +595,7 @@ }, "type": { "type": "string", - "enum": [ - "chainPruned", - "slotSummary" - ] + "enum": ["chainPruned", "slotSummary"] }, "source": { "type": "string", @@ -761,13 +671,7 @@ "sequencerStateSlot": { "type": "object", "additionalProperties": false, - "required": [ - "slotNumber", - "startedAt", - "endedAt", - "totalMs", - "states" - ], + "required": ["slotNumber", "startedAt", "endedAt", "totalMs", "states"], "properties": { "slotNumber": { "type": "integer", @@ -803,4 +707,4 @@ } } } -} \ No newline at end of file +} diff --git a/spartan/scripts/bench_10tps/bench_scrape.ts b/spartan/scripts/bench_10tps/bench_scrape.ts index 77d1805d2d53..5197aa32e4d6 100755 --- a/spartan/scripts/bench_10tps/bench_scrape.ts +++ 
b/spartan/scripts/bench_10tps/bench_scrape.ts @@ -1,9 +1,15 @@ #!/usr/bin/env -S node --experimental-strip-types --no-warnings // // Scrape a completed bench-10tps run into a schema-conformant JSON payload. -// Contract: bench_output.schema.json (v3). Invoked by the bench_10tps function +// Contract: bench_output.schema.json (v4). Invoked by the bench_10tps function // in spartan/bootstrap.sh after n_tps.test.ts finishes. // +// v4 adds two PromQL sections alongside the inclusion timeSeries: +// - provingInfra: prover-node hint-gen (tx re-execution) + proving-queue +// behaviour broken down by job_type. +// - saturation: per-role ELU/CPU/memory, each as max (hottest pod) + avg. +// Both scrape independently so one failing does not abort the others. +// // Two independent scrape paths so one failing does not abort the other: // 1. Prometheus (port-forward to the cluster-shared metrics-prometheus-server) // 2. gcloud logging read (per-block and discrete-event records) @@ -56,6 +62,8 @@ type Args = { inclusionRecords: string | undefined; waitForPendingZero: boolean; maxPendingWaitSeconds: number; + sweepId: string | undefined; + sweepLabel: string | undefined; }; function parseArgs(): Args { @@ -94,6 +102,8 @@ function parseArgs(): Args { String(DEFAULT_MAX_PENDING_WAIT_SECONDS), ), ), + sweepId: get("--sweep-id", env.BENCH_SWEEP_ID ?? "") || undefined, + sweepLabel: get("--sweep-label", env.BENCH_SWEEP_LABEL ?? "") || undefined, }; } @@ -560,12 +570,145 @@ const TIME_SERIES_DEFS: Record = { }, }; -async function scrapeTimeSeries( +// --- v4: per-role resource saturation (ELU / CPU / memory) --- +// Roles are matched by pod-name prefix within the namespace. The proposer +// rotates, so never hand-pick a pod: emit max() (hottest pod) AND avg() per role. 
+const SATURATION_ROLES: Record<string, string> = {
+  validator: `${NAMESPACE}-validator.*`,
+  rpc: `${NAMESPACE}-rpc.*`,
+  fullNode: `${NAMESPACE}-full-node.*`,
+  proverNode: `${NAMESPACE}-prover-node.*`,
+  broker: `${NAMESPACE}-prover-broker.*`,
+  agent: `${NAMESPACE}-prover-agent.*`,
+};
+
+// OTel metric -> Prometheus name. ELU + heap come from
+// telemetry-client/src/nodejs_metrics_monitor.ts (nodejs.* prefix, NOT aztec_).
+// CPU comes from @opentelemetry/host-metrics (process.cpu.utilization), not the
+// nodejs monitor. NOTE: ELU and especially CPU may be telemetry-gated in the
+// bench env — if so these series come back empty (A-1222 acceptance: verify on
+// the live env and adjust the metric name / enable the exporter as needed).
+const SATURATION_METRICS: { key: string; metric: string; unit: string }[] = [
+  { key: "elu", metric: "nodejs_eventloop_utilization", unit: "ratio" },
+  { key: "cpu", metric: "process_cpu_utilization", unit: "ratio" },
+  { key: "mem", metric: "nodejs_memory_v8_heap_usage", unit: "bytes" },
+];
+// Expand every role x metric pair into `<key><Role>Max` / `<key><Role>Avg` slugs (e.g. eluValidatorMax).
+function buildSaturationDefs(): Record<string, { metric: string; unit: string; query: string }> {
+  const defs: Record<string, { metric: string; unit: string; query: string }> = {};
+  for (const [role, podPattern] of Object.entries(SATURATION_ROLES)) {
+    const sel = `{k8s_namespace_name="${NAMESPACE}",k8s_pod_name=~"${podPattern}"}`;
+    const cap = role.charAt(0).toUpperCase() + role.slice(1);
+    for (const { key, metric, unit } of SATURATION_METRICS) {
+      // max() across pods = hottest pod; avg() = role average. Single series each.
+      defs[`${key}${cap}Max`] = { metric, unit, query: `max(${metric}${sel})` };
+      defs[`${key}${cap}Avg`] = { metric, unit, query: `avg(${metric}${sel})` };
+    }
+  }
+  return defs;
+}
+const SATURATION_DEFS = buildSaturationDefs();
+
+// --- v4: proving-infra (hint-gen on the prover-node + proving-queue by job_type) ---
+// "Hint generation" is the prover node re-executing the epoch's txs. There is no
+// `aztec.prover_node.execution.duration` metric; the re-execution is instrumented
+// as public_processor.* + prover_node.*_processing.duration on the prover-node
+// pod. Proving-queue behaviour is broken down by the aztec_proving_job_type label.
+const PROVER_NODE_SEL = `{k8s_namespace_name="${NAMESPACE}",k8s_pod_name=~"${NAMESPACE}-prover-node.*"}`;
+const JOB_TYPE = "aztec_proving_job_type";
+const proverNodeHist = (q: number, bucket: string) =>
+  `histogram_quantile(${q}, sum by (le)(rate(${bucket}${PROVER_NODE_SEL}[1m])))`;
+const queueByJobType = (metric: string) =>
+  `sum by (${JOB_TYPE})(${metric}${NS})`;
+const queueRateByJobType = (metric: string) =>
+  `sum by (${JOB_TYPE})(rate(${metric}${NS}[1m]))`;
+const queueHistByJobType = (q: number, bucket: string) =>
+  `histogram_quantile(${q}, sum by (le, ${JOB_TYPE})(rate(${bucket}${NS}[1m])))`;
+
+const PROVING_INFRA_DEFS: Record<string, { metric: string; unit: string; query: string }> = {
+  // Hint-gen: prover-node tx re-execution (the proving bottleneck at high TPS).
+  hintGenPublicTxDurationP50: {
+    metric: "aztec_public_processor_tx_duration",
+    unit: "ms",
+    query: proverNodeHist(0.5, "aztec_public_processor_tx_duration_bucket"),
+  },
+  hintGenPublicTxDurationP99: {
+    metric: "aztec_public_processor_tx_duration",
+    unit: "ms",
+    query: proverNodeHist(0.99, "aztec_public_processor_tx_duration_bucket"),
+  },
+  hintGenPublicPhaseDurationP50: {
+    metric: "aztec_public_processor_phase_duration",
+    unit: "ms",
+    query: proverNodeHist(0.5, "aztec_public_processor_phase_duration_bucket"),
+  },
+  hintGenBlockProcessingDurationP50: {
+    metric: "aztec_prover_node_block_processing_duration",
+    unit: "ms",
+    query: proverNodeHist(
+      0.5,
+      "aztec_prover_node_block_processing_duration_bucket",
+    ),
+  },
+  hintGenBlockProcessingDurationP99: {
+    metric: "aztec_prover_node_block_processing_duration",
+    unit: "ms",
+    query: proverNodeHist(
+      0.99,
+      "aztec_prover_node_block_processing_duration_bucket",
+    ),
+  },
+  hintGenCheckpointProcessingDurationP50: {
+    metric: "aztec_prover_node_checkpoint_processing_duration",
+    unit: "ms",
+    query: proverNodeHist(
+      0.5,
+      "aztec_prover_node_checkpoint_processing_duration_bucket",
+    ),
+  },
+  // Proving queue, broken down by job_type (one series per job type).
+  provingQueueSizeByJobType: {
+    metric: "aztec_proving_queue_size",
+    unit: "count",
+    query: queueByJobType("aztec_proving_queue_size"),
+  },
+  provingQueueActiveJobsByJobType: {
+    metric: "aztec_proving_queue_active_jobs_count",
+    unit: "count",
+    query: queueByJobType("aztec_proving_queue_active_jobs_count"),
+  },
+  provingQueueJobDurationP50ByJobType: {
+    metric: "aztec_proving_queue_job_duration",
+    unit: "ms",
+    query: queueHistByJobType(0.5, "aztec_proving_queue_job_duration_bucket"),
+  },
+  provingQueueJobDurationP99ByJobType: {
+    metric: "aztec_proving_queue_job_duration",
+    unit: "ms",
+    query: queueHistByJobType(0.99, "aztec_proving_queue_job_duration_bucket"),
+  },
+  // Per-second rates (rate() over 1m) of terminal job outcomes — the run #95 stall showed up as timeouts.
+  provingQueueTimedOutJobsByJobType: {
+    metric: "aztec_proving_queue_timed_out_jobs_count",
+    unit: "jobs/s",
+    query: queueRateByJobType("aztec_proving_queue_timed_out_jobs_count"),
+  },
+  provingQueueResolvedJobsByJobType: {
+    metric: "aztec_proving_queue_resolved_jobs_count",
+    unit: "jobs/s",
+    query: queueRateByJobType("aztec_proving_queue_resolved_jobs_count"),
+  },
+};
+
+// Scrape a map of slug -> PromQL def via query_range. One failing query emits an
+// empty series for that slug rather than aborting the whole section.
+async function scrapeDefs( + defs: Record, startedAtEpoch: number, endedAtEpoch: number, ): Promise> { const out: Record = {}; - for (const [slug, def] of Object.entries(TIME_SERIES_DEFS)) { + for (const [slug, def] of Object.entries(defs)) { try { const series = await queryRange(def.query, startedAtEpoch, endedAtEpoch); out[slug] = { @@ -577,7 +720,7 @@ async function scrapeTimeSeries( series, }; } catch (err) { - log(`timeSeries.${slug} scrape failed, emitting empty series`, { + log(`series.${slug} scrape failed, emitting empty series`, { err: err instanceof Error ? err.message : String(err), }); out[slug] = { @@ -593,6 +736,9 @@ async function scrapeTimeSeries( return out; } +const scrapeTimeSeries = (startedAtEpoch: number, endedAtEpoch: number) => + scrapeDefs(TIME_SERIES_DEFS, startedAtEpoch, endedAtEpoch); + // --- gcloud log scrape --- type GcloudEntry = { @@ -1668,6 +1814,8 @@ function assertShape(payload: Record): void { "run", "summary", "timeSeries", + "provingInfra", + "saturation", "blocks", "events", ] as const; @@ -1676,9 +1824,9 @@ function assertShape(payload: Record): void { throw new Error(`output missing required top-level key: ${key}`); } } - if (payload.schemaVersion !== "3") { + if (payload.schemaVersion !== "4") { throw new Error( - `schemaVersion must be "3", got ${String(payload.schemaVersion)}`, + `schemaVersion must be "4", got ${String(payload.schemaVersion)}`, ); } const run = payload.run as Record; @@ -1913,6 +2061,22 @@ async function main(): Promise { log("Scraping Prometheus time-series"); const timeSeries = await scrapeTimeSeries(startedAtEpoch, promEndEpoch); + // v4: proving-infra (hint-gen + queue by job_type) and per-role saturation. + // Independent of the inclusion timeSeries scrape so a failure here cannot + // drop inclusion data, and vice versa. 
+ log("Scraping proving-infra series (hint-gen + queue by job_type)"); + const provingInfra = await scrapeDefs( + PROVING_INFRA_DEFS, + startedAtEpoch, + promEndEpoch, + ); + log("Scraping per-role saturation series (ELU/CPU/memory, max + avg)"); + const saturation = await scrapeDefs( + SATURATION_DEFS, + startedAtEpoch, + promEndEpoch, + ); + log("Loading client-observed inclusion records"); const inclusionRecords = await loadInclusionRecords(args.inclusionRecords); // Compute the headline inclusion-latency time series from per-tx records @@ -1999,7 +2163,7 @@ async function main(): Promise { }); const payload = { - schemaVersion: "3", + schemaVersion: "4", run: { runId: args.runId, startedAt: args.startedAt, @@ -2014,6 +2178,8 @@ async function main(): Promise { gkeCluster: GKE_CLUSTER, ...(image !== undefined && { image }), targetTps: args.targetTps, + ...(args.sweepId !== undefined && { sweepId: args.sweepId }), + ...(args.sweepLabel !== undefined && { sweepLabel: args.sweepLabel }), testDurationSeconds: windowSec, workload: args.workload, ...(Object.keys(aztecConfig).length > 0 && { aztecConfig }), @@ -2031,6 +2197,8 @@ async function main(): Promise { }, summary, timeSeries, + provingInfra, + saturation, blocks, events, sequencerStateSlots,