From 3f53f04796eda02fa77df0503c28a4157b78d284 Mon Sep 17 00:00:00 2001
From: Phil Windle <philip.windle@gmail.com>
Date: Tue, 23 Jun 2026 21:12:49 +0000
Subject: [PATCH] feat(bench): schema v4 + scrape proving-infra and per-role
 saturation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 1 (A-1221) + Phase 2 (A-1222) of the nightly benchmarking revamp.

Schema v4 (additive over v3, all v3 fields retained):
- New optional sections provingInfra and saturation (metricSeriesMap: open
  slug -> timeSeries map), plus run.sweepId/sweepLabel so a night's 1/5/10 TPS
  points group as one sweep.
- Version gate bumped to "4" in bench_output.schema.json and the
  network_bench_upload check in bootstrap.sh; index.json entry now carries
  sweepId/sweepLabel. (Dashboard SUPPORTED_RUN_VERSION lives in
  AztecProtocol/explorations and must be bumped there separately.)
- 10tps-readiness-spec.md: the canonical spec — tx-lifecycle stage waterfall
  mapped to metrics, headline KPIs + thresholds, sweep/run-group notion.

Scraper (bench_scrape.ts):
- provingInfra: prover-node hint-gen (public_processor.* + prover_node
  block/checkpoint processing, scoped to the prover-node pod) and proving-queue
  series broken down by aztec_proving_job_type (size/active/job_duration/
  timed-out/resolved). There is no prover_node.execution.duration metric; the
  re-execution is the public_processor.* path, mapped accordingly.
- saturation: per-role ELU/CPU/memory as max (hottest pod) AND avg, never a
  single hand-picked pod. ELU/mem from nodejs.*, CPU from host-metrics.
- Sections scrape independently so one failure cannot drop the others.

CPU and ELU may be telemetry-gated in the bench env (empty series, non-fatal);
flagged in the spec for live verification per A-1222 acceptance.
---
 spartan/bootstrap.sh                          |   6 +-
 .../bench_10tps/10tps-readiness-spec.md       |  69 ++++++
 .../bench_10tps/bench_output.schema.json      | 222 +++++-------------
 spartan/scripts/bench_10tps/bench_scrape.ts   | 182 +++++++++++++-
 4 files changed, 311 insertions(+), 168 deletions(-)
 create mode 100644 spartan/scripts/bench_10tps/10tps-readiness-spec.md
diff --git a/spartan/bootstrap.sh b/spartan/bootstrap.sh
index 4a4a19db9d69..fc4e3b322a51 100755
--- a/spartan/bootstrap.sh
+++ b/spartan/bootstrap.sh
@@ -286,8 +286,8 @@ function network_bench_upload {
 
   # Reject anything that's not the schema we've designed the index against.
   local schema=$(jq -r .schemaVersion "$run_json")
-  if [[ "$schema" != "3" ]]; then
-    echo "[network_bench] run JSON has schemaVersion '$schema', expected '3'; skipping upload"
+  if [[ "$schema" != "4" ]]; then
+    echo "[network_bench] run JSON has schemaVersion '$schema', expected '4'; skipping upload"
     return 0
   fi
 
@@ -304,6 +304,8 @@ function network_bench_upload {
     startedAt: .run.startedAt,
     endedAt: .run.endedAt,
     targetTps: .run.targetTps,
+    sweepId: .run.sweepId,
+    sweepLabel: .run.sweepLabel,
     workload: .run.workload,
     testDurationSeconds: .run.testDurationSeconds,
     namespace: .run.namespace,
diff --git a/spartan/scripts/bench_10tps/10tps-readiness-spec.md b/spartan/scripts/bench_10tps/10tps-readiness-spec.md
new file mode 100644
index 000000000000..60e620f09d58
--- /dev/null
+++ b/spartan/scripts/bench_10tps/10tps-readiness-spec.md
@@ -0,0 +1,69 @@
+# 10 TPS readiness benchmark — spec (schema v4)
+
+Canonical contract for the custom benchmark pipeline:
+`bench_scrape.ts` → `gs://aztec-testnet/network_bench/<runId>.json` (+ `index.json`) → `network-dashboard` (in `AztecProtocol/explorations`).
+
+This doc is the Phase 1 deliverable (Linear A-1221). It defines the tx-lifecycle stage list, the headline KPIs and their thresholds, and the sweep/run-group notion. The machine-readable contract is `bench_output.schema.json` (v4); the scraper that produces it is `bench_scrape.ts`.
+
+## 1. tx-lifecycle stage waterfall
+
+A tx's journey from client submit to epoch proof, each stage mapped to the Prometheus metric (from `yarn-project/telemetry-client/src/metrics.ts`) and where it lands in the run JSON. "ms" durations are histograms (use `_bucket` for quantiles, `_sum`/`_count` for means); never `sum(rate(...))` a metric every node observes — collapse per role with `avg`/`max` (see `network-dashboard/docs/dashboard-design.md`).
+
+| # | Stage | Primary metric(s) | Run-JSON location |
+|---|---|---|---|
+| 1 | Submit / ingest | `aztec.node.receive_tx_count` (RPC only — load hits one node) | `timeSeries.ingressTps` |
+| 2 | P2P propagation | `aztec.p2p.gossip.message_latency`, `agg_message_latency_p50/p90`, `tx_received_count` | `timeSeries` (gossip) |
+| 3 | Mempool wait | `aztec.mempool.tx_count` (pending depth), `aztec.mempool.tx_mined_delay` | `timeSeries.mempoolSize*`, `mempoolMinedMax` |
+| 4 | Block build | `aztec.sequencer.block.build_duration`, `build_mana_per_second` | `timeSeries`, `sequencerStateSlots` |
+| 5 | Public processing | `aztec.public_processor.tx_duration`, `phase_duration`, `gas_rate` | `timeSeries.publicProcessorGasRate`; **prover-node copy** in `provingInfra.hintGen*` |
+| 6 | Attestation / consensus | `aztec.mempool.attestations_mined_delay`; attestation-collect duration vs slot allowance | `timeSeries.attestationsCollect*` |
+| 7 | Checkpoint assemble → broadcast | `aztec.archiver.checkpoint_height`, checkpoint block/tx counts | `timeSeries.checkpoint*` |
+| 8 | L1 inclusion | `aztec.archiver.checkpoint_l1_inclusion_delay`, `l1_block_height`, `block_height` | `timeSeries`, `blocks` |
+| 9 | Proving (epoch) | `aztec.prover_node.checkpoint_proving.duration`, `aztec.archiver.rollup_proof_delay`, `aztec.proving_queue.*` by `job_type`, prover-node block/checkpoint processing | `provingInfra.*` |
+
+**Authoritative user-perceived latency** is client-observed, not Prometheus: `n_tps_test.tx_inclusion_time` (`timeSeries`, `source: client_observed`) — the wall-clock submit→mined delta for high-value txs, computed in the scraper from `n_tps.test.ts` records. Stages 1–8 explain *where* that latency is spent; stage 9 is the separate proving path.
+
+## 2. Headline KPIs + pass/fail thresholds
+
+Two independent verdicts — a run can pass inclusion and fail proving (or vice versa). Thresholds are starting points to refine against baselines; encode them in the dashboard, not the scraper (the scraper stays a faithful recorder).
+
+| KPI | Definition | Pass threshold |
+|---|---|---|
+| **Inclusion-TPS achieved / target** | `summary.inclusionTpsMean / run.targetTps` | ≥ 0.95 |
+| **User-perceived inclusion latency p50** | p50 of `tx_inclusion_time` | ≤ 1 × `AZTEC_SLOT_DURATION` |
+| **User-perceived inclusion latency p99** | p99 of `tx_inclusion_time` | ≤ 3 × `AZTEC_SLOT_DURATION` |
+| **Proving headroom** | does each epoch's proof land within `AZTEC_PROOF_SUBMISSION_EPOCHS` of epoch close (no proof-window-expiry reorg)? | every epoch proven in window; `reorgCount` from window-expiry = 0 |
+| **Reorgs** | `summary.reorgCount` | 0 |
+
+"Proving headroom" is expressed through the proving knee: the highest `targetTps` at which every epoch still proves within its window, i.e. how far the system is from sustaining 10 TPS. Below the knee, headroom is positive; above it, epochs miss the window and the pending chain is pruned (the run #95 failure mode).
+
+## 3. Sweep / run-group notion
+
+A night's 1/5/10 TPS points are distinct runs (distinct namespaces — queries are namespace-scoped, one run per namespace) that the dashboard must view together. Grouping fields (schema v4, on `run` + mirrored into `index.json`):
+
+- `run.sweepId` — shared id across the points of one sweep (e.g. `incl-20260623`). Set via `--sweep-id` / `BENCH_SWEEP_ID`.
+- `run.sweepLabel` — human label (e.g. `inclusion-sweep`, `proving-sweep`). `--sweep-label` / `BENCH_SWEEP_LABEL`.
+- `run.targetTps` — the point within the sweep (already present in v3).
+
+`index.json` entries carry `sweepId`/`sweepLabel`/`targetTps` so the dashboard can group + order points without fetching every run JSON.
+
+## 4. schema v4 additions (additive over v3)
+
+All v3 fields retained; a v3-shaped run re-stamped `"4"` still validates (the new sections are optional). New:
+
+- `provingInfra` (`metricSeriesMap`): prover-node hint-gen (`public_processor.*` + `prover_node.*_processing.duration` scoped to the prover-node pod) and proving-queue series broken down by `aztec_proving_job_type` (size / active / job_duration p50·p99 / timed-out · resolved rates). **Note:** there is no `aztec.prover_node.execution.duration` metric — hint-gen is the `public_processor.*` re-execution on the prover-node pod, mapped accordingly.
+- `saturation` (`metricSeriesMap`): per-role ELU / CPU / memory, each as **max (hottest pod)** and **avg**, for validator / rpc / fullNode / proverNode / broker / agent. Never a single hand-picked pod. ELU = `nodejs_eventloop_utilization`, memory = `nodejs_memory_v8_heap_usage` (both `nodejs.*`, not `aztec_`); CPU = `process_cpu_utilization` (from `@opentelemetry/host-metrics`).
+- `run.sweepId` / `run.sweepLabel` (§3).
+
+### Version gate — three places, must stay in sync
+
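+Bumping the schema version requires updating all three gates below or v4 runs are silently rejected / mis-rendered (the scraper's `assertShape` in `bench_scrape.ts` hard-codes the same version and throws on a mismatch, so keep it in step too):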
+
+1. `bench_output.schema.json` — `schemaVersion.const` (✅ `"4"`).
+2. `spartan/bootstrap.sh` — `network_bench_upload` schemaVersion check (✅ `"4"`).
+3. **`network-dashboard/data.js` `SUPPORTED_RUN_VERSION`** — in `AztecProtocol/explorations`, **not this repo**. Must be bumped to `"4"` there before v4 runs render. Tracked as dashboard work (Phase 5).
+
+## 5. Verify-on-live caveats (A-1222 acceptance)
+
+- **CPU** (`process_cpu_utilization`) and **ELU** (`nodejs_eventloop_utilization`) come from telemetry that may be gated in the bench env. The scraper emits empty series (non-fatal) if a metric is absent; confirm both flow on a live bench run and fix the exporter/metric name if not.
+- Proving-infra durations are recorded in **ms** by convention; confirm units against the live histograms before trusting absolute values.
diff --git a/spartan/scripts/bench_10tps/bench_output.schema.json b/spartan/scripts/bench_10tps/bench_output.schema.json
index 3685c72960d8..ca1fd691187e 100644
--- a/spartan/scripts/bench_10tps/bench_output.schema.json
+++ b/spartan/scripts/bench_10tps/bench_output.schema.json
@@ -16,8 +16,8 @@
   "properties": {
     "schemaVersion": {
       "type": "string",
-      "const": "3",
-      "description": "Bump when breaking the schema. Old JSONs keep their previous version so the dashboard can render them side-by-side. v3: timeSeries entries carry `series: [{labels, points}]` instead of bare `points` to support per-pod / per-label data."
+      "const": "4",
+      "description": "Bump when breaking the schema. Old JSONs keep their previous version so the dashboard can render them side-by-side. v3: timeSeries entries carry `series: [{labels, points}]` instead of bare `points`. v4 (additive): adds optional `provingInfra` (hint-gen + proving-queue-by-job_type series) and `saturation` (per-role ELU/CPU/memory, max + avg) sections, plus `run.sweepId`/`run.sweepLabel` so a night's 1/5/10 TPS points group as one sweep. All v3 fields retained; a v3 run is unchanged apart from this version string."
     },
     "run": {
       "$ref": "#/$defs/runMeta"
@@ -30,6 +30,14 @@
       "$ref": "#/$defs/timeSeriesSection",
       "description": "PromQL query_range results. Continuous-sampled metrics keyed by unixEpoch; the dashboard normalises to time-within-run via unixEpoch - run.startedAt at render time so multiple runs can overlay on the same x-axis."
     },
+    "provingInfra": {
+      "$ref": "#/$defs/metricSeriesMap",
+      "description": "v4. Proving-path series for the proving-infra view: prover-node hint-gen / tx re-execution (public_processor.* + prover_node.*_processing.duration scoped to the prover-node pod) and proving-queue behaviour broken down by job_type (aztec_proving_job_type label). Optional — empty/absent on inclusion-only runs."
+    },
+    "saturation": {
+      "$ref": "#/$defs/metricSeriesMap",
+      "description": "v4. Per-role resource saturation: ELU, CPU, and memory for each role (validator / rpc / fullNode / proverNode / broker / agent), each emitted as both the hottest pod (max across pods) and the role average (avg across pods). Never a single hand-picked pod. Optional — absent on older runs."
+    },
     "blocks": {
       "type": "array",
       "description": "Per-block records parsed from structured logs (each block emits one `Processed N successful txs and M failed txs ...` info line). Authoritative for per-block facts — Prometheus histograms cannot recover per-block samples.",
@@ -63,12 +71,7 @@
     "runMeta": {
       "type": "object",
       "additionalProperties": false,
-      "required": [
-        "runId",
-        "startedAt",
-        "endedAt",
-        "namespace"
-      ],
+      "required": ["runId", "startedAt", "endedAt", "namespace"],
       "properties": {
         "runId": {
           "type": "string",
@@ -96,9 +99,7 @@
         },
         "namespace": {
           "type": "string",
-          "examples": [
-            "bench-10tps"
-          ]
+          "examples": ["bench-10tps"]
         },
         "gcpProject": {
           "type": "string",
@@ -120,15 +121,21 @@
           "type": "number",
           "minimum": 0
         },
+        "sweepId": {
+          "type": "string",
+          "description": "v4. Groups the runs of one sweep (e.g. a night's 1/5/10 TPS points) so the dashboard can plot them together. Shared across the points of a sweep; absent for standalone runs."
+        },
+        "sweepLabel": {
+          "type": "string",
+          "description": "v4. Human-readable label for the sweep this run belongs to (e.g. 'inclusion-sweep' or 'proving-sweep'). Optional."
+        },
         "testDurationSeconds": {
           "type": "integer",
           "minimum": 0
         },
         "workload": {
           "type": "string",
-          "examples": [
-            "sha256_hash_1024"
-          ]
+          "examples": ["sha256_hash_1024"]
         },
         "aztecConfig": {
           "type": "object",
@@ -167,11 +174,7 @@
                     "items": {
                       "type": "object",
                       "additionalProperties": false,
-                      "required": [
-                        "role",
-                        "podName",
-                        "nodeName"
-                      ],
+                      "required": ["role", "podName", "nodeName"],
                       "properties": {
                         "role": {
                           "type": "string"
@@ -205,9 +208,7 @@
             },
             "profile": {
               "type": "string",
-              "examples": [
-                "network-requirements"
-              ]
+              "examples": ["network-requirements"]
             }
           }
         },
@@ -239,40 +240,25 @@
               "description": "Maximum time the scraper was allowed to wait for validator pending TxPool depth to reach zero."
             },
             "pendingAtScrape": {
-              "type": [
-                "number",
-                "null"
-              ],
+              "type": ["number", "null"],
               "minimum": 0,
               "description": "Validator pending TxPool depth observed when scraping started, or null when the pending drain gate was disabled."
             },
             "pendingByRoleAtScrape": {
-              "type": [
-                "object",
-                "null"
-              ],
+              "type": ["object", "null"],
               "description": "Pending TxPool depth by pod role at scrape start. RPC/full-node pending can remain non-zero after validators drain, which indicates load that did not propagate to proposers before expiry.",
               "additionalProperties": false,
               "properties": {
                 "rpc": {
-                  "type": [
-                    "number",
-                    "null"
-                  ],
+                  "type": ["number", "null"],
                   "minimum": 0
                 },
                 "validator": {
-                  "type": [
-                    "number",
-                    "null"
-                  ],
+                  "type": ["number", "null"],
                   "minimum": 0
                 },
                 "fullNode": {
-                  "type": [
-                    "number",
-                    "null"
-                  ],
+                  "type": ["number", "null"],
                   "minimum": 0
                 }
               }
@@ -288,118 +274,66 @@
     "summary": {
       "type": "object",
       "additionalProperties": false,
-      "required": [
-        "headlineKpi",
-        "inclusionTpsMean",
-        "targetTps"
-      ],
+      "required": ["headlineKpi", "inclusionTpsMean", "targetTps"],
       "properties": {
         "headlineKpi": {
-          "type": [
-            "number",
-            "null"
-          ],
+          "type": ["number", "null"],
           "description": "inclusionTpsMean / targetTps. The single number on the dashboard top strip."
         },
         "targetTps": {
           "type": "number"
         },
         "inclusionTpsMean": {
-          "type": [
-            "number",
-            "null"
-          ],
+          "type": ["number", "null"],
           "description": "Inclusion throughput over the observed inclusion window. Uses exact block-log throughput when block records are available, otherwise falls back to the Prometheus inclusionTps mean."
         },
         "inclusionTpsPeak": {
-          "type": [
-            "number",
-            "null"
-          ],
+          "type": ["number", "null"],
           "description": "Peak sampled Prometheus rolling inclusion rate over the observed scrape window."
         },
         "inclusionLatencyP50Ms": {
-          "type": [
-            "number",
-            "null"
-          ]
+          "type": ["number", "null"]
         },
         "inclusionLatencyP95Ms": {
-          "type": [
-            "number",
-            "null"
-          ]
+          "type": ["number", "null"]
         },
         "inclusionLatencyP99Ms": {
-          "type": [
-            "number",
-            "null"
-          ]
+          "type": ["number", "null"]
         },
         "blockBuildDurationP50Ms": {
-          "type": [
-            "number",
-            "null"
-          ]
+          "type": ["number", "null"]
         },
         "blockBuildDurationP95Ms": {
-          "type": [
-            "number",
-            "null"
-          ]
+          "type": ["number", "null"]
         },
         "publicProcessorTxDurationP50Ms": {
-          "type": [
-            "number",
-            "null"
-          ]
+          "type": ["number", "null"]
         },
         "publicProcessorTxDurationP95Ms": {
-          "type": [
-            "number",
-            "null"
-          ]
+          "type": ["number", "null"]
         },
         "totalTxsMined": {
-          "type": [
-            "integer",
-            "null"
-          ],
+          "type": ["integer", "null"],
           "description": "Exact sum from per-block logs. Null when block logs were unavailable and inclusionTpsMean came from Prometheus."
         },
         "totalTxsFailed": {
-          "type": [
-            "integer",
-            "null"
-          ],
+          "type": ["integer", "null"],
           "description": "Exact sum from per-block logs. Null when block logs were unavailable."
         },
         "totalSilentSkipCount": {
-          "type": [
-            "integer",
-            "null"
-          ],
+          "type": ["integer", "null"],
           "description": "Sum of per-block silentlySkippedCount. > 0 means the post-process blob-field revert path fired during the run."
         },
         "totalSilentSkipDurationMs": {
-          "type": [
-            "integer",
-            "null"
-          ],
+          "type": ["integer", "null"],
           "description": "Sum of per-block silentlySkippedDurationMs. Wall-clock 'wasted' on silently-skipped txs across the run."
         },
         "reorgCount": {
-          "type": [
-            "integer",
-            "null"
-          ],
+          "type": ["integer", "null"],
           "description": "Count of `Chain pruned` events during the run."
         },
         "deepestReorgBlocks": {
-          "type": [
-            "integer",
-            "null"
-          ],
+          "type": ["integer", "null"],
           "description": "Max (fromBlock - toBlock) across reorg events. 0 if no reorgs."
         }
       }
@@ -510,14 +444,17 @@
         }
       }
     },
+    "metricSeriesMap": {
+      "type": "object",
+      "description": "v4. Open map of slug -> timeSeries, same per-series shape as timeSeriesSection but without a fixed slug list. Used for provingInfra and saturation, whose slugs are generated per role / job_type.",
+      "additionalProperties": {
+        "$ref": "#/$defs/timeSeries"
+      }
+    },
     "timeSeries": {
       "type": "object",
       "additionalProperties": false,
-      "required": [
-        "metric",
-        "source",
-        "series"
-      ],
+      "required": ["metric", "source", "series"],
       "properties": {
         "metric": {
           "type": "string",
@@ -525,19 +462,11 @@
         },
         "unit": {
           "type": "string",
-          "examples": [
-            "ms",
-            "tps",
-            "mana/s",
-            "count"
-          ]
+          "examples": ["ms", "tps", "mana/s", "count"]
         },
         "source": {
           "type": "string",
-          "enum": [
-            "promql",
-            "client_observed"
-          ],
+          "enum": ["promql", "client_observed"],
           "description": "Provenance: 'promql' = scraped via PromQL from cluster Prometheus; 'client_observed' = computed in this scraper from per-tx records emitted by n_tps.test.ts (e.g. headline tx_mined_delay)."
         },
         "query": {
@@ -560,10 +489,7 @@
     "seriesEntry": {
       "type": "object",
       "additionalProperties": false,
-      "required": [
-        "labels",
-        "points"
-      ],
+      "required": ["labels", "points"],
       "properties": {
         "labels": {
           "type": "object",
@@ -584,20 +510,14 @@
     "tsPoint": {
       "type": "object",
       "additionalProperties": false,
-      "required": [
-        "unixEpoch",
-        "value"
-      ],
+      "required": ["unixEpoch", "value"],
       "properties": {
         "unixEpoch": {
           "type": "integer",
           "description": "Seconds since unix epoch for this sample. Dashboards normalise to time-within-run via unixEpoch - run.startedAt at render time."
         },
         "value": {
-          "type": [
-            "number",
-            "null"
-          ],
+          "type": ["number", "null"],
           "description": "Metric value. null if Prom returned NaN / no data for this step."
         }
       }
@@ -605,11 +525,7 @@
     "blockRecord": {
       "type": "object",
       "additionalProperties": false,
-      "required": [
-        "blockNumber",
-        "blockNumberInTest",
-        "minedAt"
-      ],
+      "required": ["blockNumber", "blockNumberInTest", "minedAt"],
       "properties": {
         "blockNumber": {
           "type": "integer",
@@ -671,10 +587,7 @@
     "event": {
       "type": "object",
       "additionalProperties": false,
-      "required": [
-        "at",
-        "type"
-      ],
+      "required": ["at", "type"],
       "properties": {
         "at": {
           "type": "string",
@@ -682,10 +595,7 @@
         },
         "type": {
           "type": "string",
-          "enum": [
-            "chainPruned",
-            "slotSummary"
-          ]
+          "enum": ["chainPruned", "slotSummary"]
         },
         "source": {
           "type": "string",
@@ -761,13 +671,7 @@
     "sequencerStateSlot": {
       "type": "object",
       "additionalProperties": false,
-      "required": [
-        "slotNumber",
-        "startedAt",
-        "endedAt",
-        "totalMs",
-        "states"
-      ],
+      "required": ["slotNumber", "startedAt", "endedAt", "totalMs", "states"],
       "properties": {
         "slotNumber": {
           "type": "integer",
@@ -803,4 +707,4 @@
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/spartan/scripts/bench_10tps/bench_scrape.ts b/spartan/scripts/bench_10tps/bench_scrape.ts
index 77d1805d2d53..5197aa32e4d6 100755
--- a/spartan/scripts/bench_10tps/bench_scrape.ts
+++ b/spartan/scripts/bench_10tps/bench_scrape.ts
@@ -1,9 +1,15 @@
 #!/usr/bin/env -S node --experimental-strip-types --no-warnings
 //
 // Scrape a completed bench-10tps run into a schema-conformant JSON payload.
-// Contract: bench_output.schema.json (v3). Invoked by the bench_10tps function
+// Contract: bench_output.schema.json (v4). Invoked by the bench_10tps function
 // in spartan/bootstrap.sh after n_tps.test.ts finishes.
 //
+// v4 adds two PromQL sections alongside the inclusion timeSeries:
+//   - provingInfra: prover-node hint-gen (tx re-execution) + proving-queue
+//     behaviour broken down by job_type.
+//   - saturation:   per-role ELU/CPU/memory, each as max (hottest pod) + avg.
+// Each section scrapes independently so one failing does not abort the others.
+//
 // Two independent scrape paths so one failing does not abort the other:
 //   1. Prometheus (port-forward to the cluster-shared metrics-prometheus-server)
 //   2. gcloud logging read (per-block and discrete-event records)
@@ -56,6 +62,8 @@ type Args = {
   inclusionRecords: string | undefined;
   waitForPendingZero: boolean;
   maxPendingWaitSeconds: number;
+  sweepId: string | undefined;
+  sweepLabel: string | undefined;
 };
 
 function parseArgs(): Args {
@@ -94,6 +102,8 @@ function parseArgs(): Args {
           String(DEFAULT_MAX_PENDING_WAIT_SECONDS),
       ),
     ),
+    sweepId: get("--sweep-id", env.BENCH_SWEEP_ID ?? "") || undefined,
+    sweepLabel: get("--sweep-label", env.BENCH_SWEEP_LABEL ?? "") || undefined,
   };
 }
 
@@ -560,12 +570,145 @@ const TIME_SERIES_DEFS: Record<string, TimeSeriesDef> = {
   },
 };
 
-async function scrapeTimeSeries(
+// --- v4: per-role resource saturation (ELU / CPU / memory) ---
+// Role key (also the slug infix, e.g. eluValidatorMax) -> k8s_pod_name regex. The
+// proposer rotates, so never hand-pick a pod: emit max() (hottest pod) AND avg() per role.
+const SATURATION_ROLES: Record<string, string> = {
+  validator: `${NAMESPACE}-validator.*`,
+  rpc: `${NAMESPACE}-rpc.*`,
+  fullNode: `${NAMESPACE}-full-node.*`,
+  proverNode: `${NAMESPACE}-prover-node.*`,
+  broker: `${NAMESPACE}-prover-broker.*`,
+  agent: `${NAMESPACE}-prover-agent.*`,
+};
+
+// Saturation metrics as Prometheus names; `key` is the slug prefix (elu/cpu/mem).
+// ELU + heap come from telemetry-client/src/nodejs_metrics_monitor.ts (nodejs.*
+// prefix, NOT aztec_). CPU comes from @opentelemetry/host-metrics
+// (process.cpu.utilization), not the nodejs monitor. NOTE: ELU and especially CPU
+// may be telemetry-gated in the bench env — if so these series come back empty
+// (A-1222 acceptance: verify on the live env; fix the metric name / exporter).
+const SATURATION_METRICS: { key: string; metric: string; unit: string }[] = [
+  { key: "elu", metric: "nodejs_eventloop_utilization", unit: "ratio" },
+  { key: "cpu", metric: "process_cpu_utilization", unit: "ratio" },
+  { key: "mem", metric: "nodejs_memory_v8_heap_usage", unit: "bytes" },
+];
+
+function buildSaturationDefs(): Record<string, TimeSeriesDef> {
+  const defs: Record<string, TimeSeriesDef> = {};
+  for (const [role, podPattern] of Object.entries(SATURATION_ROLES)) {
+    const sel = `{k8s_namespace_name="${NAMESPACE}",k8s_pod_name=~"${podPattern}"}`;
+    const cap = role.charAt(0).toUpperCase() + role.slice(1);
+    for (const { key, metric, unit } of SATURATION_METRICS) {
+      // max() = hottest pod; avg() = role average. NOTE(review): assumes one input series per pod — confirm for CPU.
+      defs[`${key}${cap}Max`] = { metric, unit, query: `max(${metric}${sel})` };
+      defs[`${key}${cap}Avg`] = { metric, unit, query: `avg(${metric}${sel})` };
+    }
+  }
+  return defs;
+}
+const SATURATION_DEFS = buildSaturationDefs();
+
+// --- v4: proving-infra (hint-gen on the prover-node + proving-queue by job_type) ---
+// "Hint generation" = the prover node re-executing the epoch's txs. There is no
+// `aztec.prover_node.execution.duration` metric; it is instrumented as
+// public_processor.* + prover_node.*_processing.duration on the prover-node pod
+// (PROVER_NODE_SEL). Queue series use the NS selector, split by aztec_proving_job_type.
+const PROVER_NODE_SEL = `{k8s_namespace_name="${NAMESPACE}",k8s_pod_name=~"${NAMESPACE}-prover-node.*"}`;
+const JOB_TYPE = "aztec_proving_job_type";
+const proverNodeHist = (q: number, bucket: string) =>
+  `histogram_quantile(${q}, sum by (le)(rate(${bucket}${PROVER_NODE_SEL}[1m])))`;
+const queueByJobType = (metric: string) =>
+  `sum by (${JOB_TYPE})(${metric}${NS})`;
+const queueRateByJobType = (metric: string) =>
+  `sum by (${JOB_TYPE})(rate(${metric}${NS}[1m]))`;
+const queueHistByJobType = (q: number, bucket: string) =>
+  `histogram_quantile(${q}, sum by (le, ${JOB_TYPE})(rate(${bucket}${NS}[1m])))`;
+
+const PROVING_INFRA_DEFS: Record<string, TimeSeriesDef> = {
+  // Slug -> def. Hint-gen: prover-node tx re-execution (the proving bottleneck at high TPS).
+  hintGenPublicTxDurationP50: {
+    metric: "aztec_public_processor_tx_duration",
+    unit: "ms",
+    query: proverNodeHist(0.5, "aztec_public_processor_tx_duration_bucket"),
+  },
+  hintGenPublicTxDurationP99: {
+    metric: "aztec_public_processor_tx_duration",
+    unit: "ms",
+    query: proverNodeHist(0.99, "aztec_public_processor_tx_duration_bucket"),
+  },
+  hintGenPublicPhaseDurationP50: {
+    metric: "aztec_public_processor_phase_duration",
+    unit: "ms",
+    query: proverNodeHist(0.5, "aztec_public_processor_phase_duration_bucket"),
+  },
+  hintGenBlockProcessingDurationP50: {
+    metric: "aztec_prover_node_block_processing_duration",
+    unit: "ms",
+    query: proverNodeHist(
+      0.5,
+      "aztec_prover_node_block_processing_duration_bucket",
+    ),
+  },
+  hintGenBlockProcessingDurationP99: {
+    metric: "aztec_prover_node_block_processing_duration",
+    unit: "ms",
+    query: proverNodeHist(
+      0.99,
+      "aztec_prover_node_block_processing_duration_bucket",
+    ),
+  },
+  hintGenCheckpointProcessingDurationP50: {
+    metric: "aztec_prover_node_checkpoint_processing_duration",
+    unit: "ms",
+    query: proverNodeHist(
+      0.5,
+      "aztec_prover_node_checkpoint_processing_duration_bucket",
+    ),
+  },
+  // Proving queue, broken down by job_type (one series per job type).
+  provingQueueSizeByJobType: {
+    metric: "aztec_proving_queue_size",
+    unit: "count",
+    query: queueByJobType("aztec_proving_queue_size"),
+  },
+  provingQueueActiveJobsByJobType: {
+    metric: "aztec_proving_queue_active_jobs_count",
+    unit: "count",
+    query: queueByJobType("aztec_proving_queue_active_jobs_count"),
+  },
+  provingQueueJobDurationP50ByJobType: {
+    metric: "aztec_proving_queue_job_duration",
+    unit: "ms",
+    query: queueHistByJobType(0.5, "aztec_proving_queue_job_duration_bucket"),
+  },
+  provingQueueJobDurationP99ByJobType: {
+    metric: "aztec_proving_queue_job_duration",
+    unit: "ms",
+    query: queueHistByJobType(0.99, "aztec_proving_queue_job_duration_bucket"),
+  },
+  // Per-second rates (hence count/s) of terminal job outcomes — the run #95 stall showed up as timeouts.
+  provingQueueTimedOutJobsByJobType: {
+    metric: "aztec_proving_queue_timed_out_jobs_count",
+    unit: "count/s",
+    query: queueRateByJobType("aztec_proving_queue_timed_out_jobs_count"),
+  },
+  provingQueueResolvedJobsByJobType: {
+    metric: "aztec_proving_queue_resolved_jobs_count",
+    unit: "count/s",
+    query: queueRateByJobType("aztec_proving_queue_resolved_jobs_count"),
+  },
+};
+
+// Scrape a map of slug -> PromQL def via query_range. One failing query emits an
+// empty series for that slug rather than aborting the whole section.
+async function scrapeDefs(
+  defs: Record<string, TimeSeriesDef>,
   startedAtEpoch: number,
   endedAtEpoch: number,
 ): Promise<Record<string, unknown>> {
   const out: Record<string, unknown> = {};
-  for (const [slug, def] of Object.entries(TIME_SERIES_DEFS)) {
+  for (const [slug, def] of Object.entries(defs)) {
     try {
       const series = await queryRange(def.query, startedAtEpoch, endedAtEpoch);
       out[slug] = {
@@ -577,7 +720,7 @@ async function scrapeTimeSeries(
         series,
       };
     } catch (err) {
-      log(`timeSeries.${slug} scrape failed, emitting empty series`, {
+      log(`series.${slug} scrape failed, emitting empty series`, {
         err: err instanceof Error ? err.message : String(err),
       });
       out[slug] = {
@@ -593,6 +736,9 @@ async function scrapeTimeSeries(
   return out;
 }
 
+const scrapeTimeSeries = (startedAtEpoch: number, endedAtEpoch: number) =>
+  scrapeDefs(TIME_SERIES_DEFS, startedAtEpoch, endedAtEpoch); // thin wrapper: existing call site keeps its (start, end) signature
+
 // --- gcloud log scrape ---
 
 type GcloudEntry = {
@@ -1668,6 +1814,8 @@ function assertShape(payload: Record<string, unknown>): void {
     "run",
     "summary",
     "timeSeries",
+    "provingInfra",
+    "saturation",
     "blocks",
     "events",
   ] as const;
@@ -1676,9 +1824,9 @@ function assertShape(payload: Record<string, unknown>): void {
       throw new Error(`output missing required top-level key: ${key}`);
     }
   }
-  if (payload.schemaVersion !== "3") {
+  if (payload.schemaVersion !== "4") {
     throw new Error(
-      `schemaVersion must be "3", got ${String(payload.schemaVersion)}`,
+      `schemaVersion must be "4", got ${String(payload.schemaVersion)}`,
     );
   }
   const run = payload.run as Record<string, unknown>;
@@ -1913,6 +2061,22 @@ async function main(): Promise<void> {
     log("Scraping Prometheus time-series");
     const timeSeries = await scrapeTimeSeries(startedAtEpoch, promEndEpoch);
 
+    // v4: proving-infra (hint-gen + queue by job_type) and per-role saturation.
+    // Independent of the inclusion timeSeries scrape so a failure here cannot
+    // drop inclusion data, and vice versa.
+    log("Scraping proving-infra series (hint-gen + queue by job_type)");
+    const provingInfra = await scrapeDefs(
+      PROVING_INFRA_DEFS,
+      startedAtEpoch,
+      promEndEpoch,
+    );
+    log("Scraping per-role saturation series (ELU/CPU/memory, max + avg)");
+    const saturation = await scrapeDefs(
+      SATURATION_DEFS,
+      startedAtEpoch,
+      promEndEpoch,
+    );
+
     log("Loading client-observed inclusion records");
     const inclusionRecords = await loadInclusionRecords(args.inclusionRecords);
     // Compute the headline inclusion-latency time series from per-tx records
@@ -1999,7 +2163,7 @@ async function main(): Promise<void> {
     });
 
     const payload = {
-      schemaVersion: "3",
+      schemaVersion: "4",
       run: {
         runId: args.runId,
         startedAt: args.startedAt,
@@ -2014,6 +2178,8 @@ async function main(): Promise<void> {
         gkeCluster: GKE_CLUSTER,
         ...(image !== undefined && { image }),
         targetTps: args.targetTps,
+        ...(args.sweepId !== undefined && { sweepId: args.sweepId }),
+        ...(args.sweepLabel !== undefined && { sweepLabel: args.sweepLabel }),
         testDurationSeconds: windowSec,
         workload: args.workload,
         ...(Object.keys(aztecConfig).length > 0 && { aztecConfig }),
@@ -2031,6 +2197,8 @@ async function main(): Promise<void> {
       },
       summary,
       timeSeries,
+      provingInfra,
+      saturation,
       blocks,
       events,
       sequencerStateSlots,