diff --git a/VX_config.toml b/VX_config.toml
index 1072c1250..a8cd1875e 100644
--- a/VX_config.toml
+++ b/VX_config.toml
@@ -185,7 +185,8 @@ VX_CFG_DCACHE_WRITEBACK = 0
 VX_CFG_DCACHE_DIRTYBYTES = "expr: $VX_CFG_DCACHE_WRITEBACK"
 VX_CFG_DCACHE_REPL_POLICY = "expr: $__cache_repl_fifo"
 VX_CFG_DCACHE_MSHR_SIZE = 16
-VX_CFG_DCACHE_MREQ_SIZE = "expr: 4 + $VX_CFG_DCACHE_WRITEBACK * ($VX_CFG_DCACHE_MSHR_SIZE - 4)"
+VX_CFG_DCACHE_LATENCY = 2
+VX_CFG_DCACHE_MREQ_SIZE = "expr: 2 * $VX_CFG_DCACHE_LATENCY + $VX_CFG_DCACHE_WRITEBACK * ($VX_CFG_DCACHE_MSHR_SIZE - 2 * $VX_CFG_DCACHE_LATENCY)"
 VX_CFG_DCACHE_MRSQ_SIZE = 4
 VX_CFG_DCACHE_CRSQ_SIZE = 2
 
@@ -200,11 +201,12 @@ VX_CFG_L1_MEM_PORTS = "expr: min($VX_CFG_DCACHE_NUM_BANKS, $VX_CFG_PLATFORM_MEMO
 [l2cache]
 VX_CFG_L2_CACHE_SIZE = 1048576
 VX_CFG_L2_NUM_WAYS = 8
-VX_CFG_L2_WRITEBACK = 1
+VX_CFG_L2_WRITEBACK = 0
 VX_CFG_L2_DIRTYBYTES = "expr: $VX_CFG_L2_WRITEBACK"
 VX_CFG_L2_REPL_POLICY = "expr: $__cache_repl_fifo"
 VX_CFG_L2_MSHR_SIZE = 16
-VX_CFG_L2_MREQ_SIZE = "expr: 4 + $VX_CFG_L2_WRITEBACK * ($VX_CFG_L2_MSHR_SIZE - 4)"
+VX_CFG_L2_LATENCY = 4
+VX_CFG_L2_MREQ_SIZE = "expr: 2 * $VX_CFG_L2_LATENCY + $VX_CFG_L2_WRITEBACK * ($VX_CFG_L2_MSHR_SIZE - 2 * $VX_CFG_L2_LATENCY)"
 VX_CFG_L2_MRSQ_SIZE = 4
 VX_CFG_L2_CRSQ_SIZE = 2
 
@@ -214,11 +216,12 @@ VX_CFG_L2_MEM_PORTS = "expr: min($VX_CFG_L2_NUM_BANKS, $VX_CFG_PLATFORM_MEMORY_N
 [l3cache]
 VX_CFG_L3_CACHE_SIZE = 2097152
 VX_CFG_L3_NUM_WAYS = 8
-VX_CFG_L3_WRITEBACK = 1
+VX_CFG_L3_WRITEBACK = 0
 VX_CFG_L3_DIRTYBYTES = "expr: $VX_CFG_L3_WRITEBACK"
 VX_CFG_L3_REPL_POLICY = "expr: $__cache_repl_fifo"
 VX_CFG_L3_MSHR_SIZE = 16
-VX_CFG_L3_MREQ_SIZE = "expr: 4 + $VX_CFG_L3_WRITEBACK * ($VX_CFG_L3_MSHR_SIZE - 4)"
+VX_CFG_L3_LATENCY = 4
+VX_CFG_L3_MREQ_SIZE = "expr: 2 * $VX_CFG_L3_LATENCY + $VX_CFG_L3_WRITEBACK * ($VX_CFG_L3_MSHR_SIZE - 2 * $VX_CFG_L3_LATENCY)"
 VX_CFG_L3_MRSQ_SIZE = 4
 VX_CFG_L3_CRSQ_SIZE = 2
 
diff --git a/docs/proposals/cache_elastic_pipeline_proposal.md b/docs/proposals/cache_elastic_pipeline_proposal.md
new file mode 100644
index 000000000..fefd93c11
--- /dev/null
+++ b/docs/proposals/cache_elastic_pipeline_proposal.md
@@ -0,0 +1,401 @@
+# Elastic Cache-Bank Pipeline (configurable AMAT) Proposal
+
+## Summary
+
+`VX_cache_bank` is hardwired to a 2-stage lookup/commit pipeline. That depth is
+correct for a small, latency-critical L1, but it cannot close timing at 300 MHz
+on a large last-level cache: the tag-array read, the way-resolving compare, and
+the data-array access are forced into a single clock cycle, producing a
+BRAM-to-BRAM critical path whose delay is dominated by routing and cannot be
+retimed away.
+
+This proposal refactors the bank into an **elastic pipeline** whose depth is a
+per-cache parameter (`VX_CFG_<CACHE>_LATENCY`, default 2 = current behavior).
+Larger caches raise the knob to insert register stages on the long paths,
+trading a few cycles of hit latency — which a non-blocking, MSHR-backed cache
+hides — for the Fmax needed to run the whole device at 300 MHz. We propose
+`LATENCY = 4` for any L2/L3 larger than 64 KB.
+
+This is the architecture real GPU L2/L3 caches use: deep, fully pipelined,
+latency-tolerant behind a large miss-handling pool, rather than a shallow
+single-cycle-lookup structure.
+
+## Motivation
+
+On the U55C at the 300 MHz platform clock (period 3.333 ns), a 2-core build with
+the 1 MB 8-way L2 fails timing. Measured post-route WNS on the standalone
+`Vortex` DUT (`xcu55c`, post-route, after the dirty-mask LUTRAM fix):
+
+| Config | WNS @300 MHz | Implied Fmax | Worst path |
+|--------|-------------:|-------------:|------------|
+| L2 write-back | **-1.380 ns** | ~212 MHz | `cache_tags/tag_store` → `cache_data/.../data_store` (EN/WE) |
+| L2 write-through | **-1.008 ns** | ~230 MHz | same structure |
+
+Because this path sits in the L2, the *entire* device is capped at ~210–230 MHz.
+Once integrated into the full XRT platform (HBM + PCIe + SLR crossings) the
+slack erodes further. No amount of placement or logic restructuring closes a
+single-cycle BRAM→BRAM dependency whose delay is ~78% routing — the cycle
+boundary has to move.
+
+## Current timing bottlenecks in `VX_cache_bank`
+
+The bank runs a fixed two-stage pipe (`sel → S0 → S1`, two `VX_pipe_register`s).
+Tag and data arrays are read at issue; the hit/way is resolved combinationally
+in S0 and immediately drives the data array in the same cycle. The bottlenecks,
+in order of severity:
+
+1. **[PRIMARY] Tag-compare → data-array write-enable (S0).**
+   `tag_store` (RAMB) clk-to-out → per-way tag compare (XNOR/AND tree) →
+   `hit_any = |tag_matches` → `slice_write = fill || (write && hit_any &&
+   word_en)` → `data_store` `ENARDEN`/`ENBWREN`/`WEA`. The failing endpoints are
+   the data-array **enable/write-enable pins**, not the address pins. BRAM→BRAM,
+   ~78% routing. This is the −1.38 ns path.
+
+2. **Tag-compare → data-array address.** The way-folded array is addressed
+   `data_addr = {hit_way, line_idx}`, so `hit_way` (from the same S0 compare)
+   feeds the data BRAM address pins. Currently meets with thin slack; becomes
+   the next wall the instant bottleneck (1) is broken.
+
+3. **[ALREADY RESOLVED — prerequisite] Per-byte dirty mask (`byteen_store`).**
+   Was the #1 path at −3.762 ns (xrt, 300 MHz). The mask needs 1-bit write
+   granularity (`WRENW = LINE_SIZE`), which block RAM cannot do, so a
+   `LUTRAM=1` instance was silently inferred as *shattered* BRAM. Fixed by making
+   `VX_sp_ram`/`VX_dp_ram` honor `LUTRAM=1` via the portable `USE_FAST_BRAM`
+   (`ram_style="distributed"`) attribute; the mask now maps to distributed RAM
+   (LUTRAM 216 → 16,600, RAMB ≈ unchanged) and leaves the critical path. This
+   refactor assumes that fix is in place.
+
+4. **Replacement state (`cache_repl`) → data/tag.** FIFO/PLRU victim select and
+   state update. At 250 MHz the worst path was `cache_repl` FIFO → `byteen_store`;
+   the lookup/update feedback (`lookup_valid`/`repl_valid`) is a second-tier path
+   that benefits from extra slack.
+
+5. **MSHR probe/allocate (`cache_mshr`).** `probe_addr` is compared (CAM-style)
+   against in-flight entries to produce `probe_pending_*`, which gates admission
+   and AMO ordering. The compare fanout over `MSHR_SIZE` entries is a control path
+   that tightens as MSHR grows.
+
+6. **AMO read-modify-write (LLC, S1).** `read_word_st1` → AMO ALU
+   (add/min/max/swap/compare) → writeback register → re-inject as a synthetic
+   write. Only synthesized for the AMO-capable LLC bank, but it is a genuine S1
+   compute path.
+
+7. **Read-data → response / writeback formatting.** `read_data_st1` → `crsp`
+   word select, and `evict_byteen`/`is_dirty` → `mem_req_queue`. Registered and
+   comfortably met today, listed for completeness.
+
+The elastic pipeline targets (1) and (2) directly (deferring the data access to a
+later, register-fed stage) and relaxes (4)–(6) by giving each its own stage
+budget instead of cramming lookup+commit into two cycles.
+
+## Proposed design: elastic pipeline
+
+### Single knob, distributed internally
+
+Expose one parameter per cache, `LATENCY` (carried from
+`VX_CFG_<CACHE>_LATENCY`), with `LATENCY = 2` reproducing today's behavior
+bit-for-bit. The bank derives internal stage placement from it:
+
+| Internal budget | Cuts | Implementation |
+|-----------------|------|----------------|
+| `TAG_RD_LAT` | sel→tag routing + tag BRAM clk-to-out | tag RAM output pipeline registers |
+| **hit→data register** | bottleneck (1)/(2): compare → data EN/addr | one pipe stage (the key cut) |
+| `DATA_RD_LAT` | data BRAM clk-to-out → way mux | data RAM output pipeline registers |
+| response register | read-data → crsp/mreq | one pipe stage |
+
+Extra RAM output registers retime into the BRAM/cascade and cost almost nothing
+in fabric while buying most of the Fmax. The single new *logical* stage is the
+hit→data register that moves the data access off the same cycle as the compare.
+
+### Spine refactor (readability + elasticity)
+
+Replace the ~40 parallel `_sel`/`_st0`/`_st1` wires and two hand-instantiated
+pipe registers with:
+
+1. **A packed payload struct** carrying all per-request control/data:
+   ```systemverilog
+   typedef struct packed {
+       logic                          valid;
+       logic [`CS_LINE_ADDR_WIDTH-1:0] addr;
+       logic                          rw;
+       logic [WORD_SIZE-1:0]          byteen;
+       logic [`CS_WORD_WIDTH-1:0]     word;
+       logic [`CS_WAY_SEL_WIDTH-1:0]  way;
+       logic                          hit;
+       // tag, idx, mshr_id, is_fill/flush/replay/dirty, amo, ...
+   } pipe_t;
+   ```
+
+2. **A generate-loop register chain** of depth `LATENCY`:
+   ```systemverilog
+   pipe_t stg [0:LATENCY-1];          // stg[0] = arbitrated/selected request
+   for (genvar i = 1; i < LATENCY; ++i) begin : g_pipe
+       VX_pipe_register #(.DATAW($bits(pipe_t)), .RESETW(1)) reg_i (
+           .clk, .reset, .enable(~pipe_stall),
+           .data_in(stg[i-1]), .data_out(stg[i]));
+   end
+   ```
+   Adding depth just lengthens the array — no `if (LATENCY==2) … else if (==3)` ladder.
+
+3. **Control anchored to symbolic stage indices**, not literal `st0/st1`:
+   ```systemverilog
+   localparam HIT_ST  = TAG_RD_LAT;        // tag compare consumes stg[HIT_ST]
+   localparam DATA_ST = HIT_ST + 1;        // data access uses *registered* way
+   localparam RESP_ST = LATENCY - 1;       // crsp / mem-req fire here
+   ```
+   `cache_repl` lookup/update, `cache_mshr` allocate/finalize, tag write, and the
+   response all key off these names, so the feedback loops stay
+   one-request-per-cycle at any depth.
+
+### Deferred whole-array access — no hazard logic required (implemented)
+
+The implemented design is **simpler than the 1R1W split originally sketched**.
+The data array stays a single-port `VX_sp_ram`; the *entire* access (read **and**
+write, plus fill/flush) is deferred together by `PIPE_EX = LATENCY-2` register
+stages. Two consequences:
+
+- **The tag→data critical path is cut.** The data array is driven by *registered*
+  `tag_matches` (and the registered way/line/word/byteen), so neither the write
+  enable (bottleneck 1) nor the address (bottleneck 2) carries the combinational
+  tag-compare result. Path becomes register→BRAM, intra-stage.
+- **No store→load hazard, no forwarding, no stall scoreboard.** Because the
+  array's read and write move to the *same* deferred stage, pipeline order is
+  preserved: a younger same-line read always reaches the array *after* an older
+  write, so store→load ordering holds by construction. (The 1R1W/forwarding scheme
+  is unnecessary — keeping read+write co-located is strictly simpler and lower-risk.)
+
+The tag array is left entirely at S0/S1, so its existing read-during-write
+bypasses (`rdw_fill`/`rdw_write`) are unchanged.
+
+### Decoupled pipeline — the MSHR must NOT be deferred (critical constraint)
+
+`VX_cache_mshr` is **strongly coupled** to the bank pipeline: its coalescing
+chain requires `allocate` (S0) and `finalize` (S1) to remain **exactly one cycle
+apart**. The tail-find (`prev_idx`) only sees a predecessor's link once that
+predecessor finalizes; deferring finalize makes 3+ coalesced same-line misses
+(e.g. sequential icache fetches to one line) all link to the same predecessor,
+orphaning intermediate entries → they never replay → **bank deadlock**. This was
+confirmed empirically (a first "defer everything" attempt hung at both LATENCY=3
+and 4).
+
+So the implemented pipeline is **decoupled**:
+
+- **S0 / S1 (fixed, 1 cycle apart):** tag compare, replacement victim-select,
+  MSHR allocate **and finalize**, replacement update. Untouched.
+- **stD = S0 + PIPE_EX:** the data-array access (read+write) — a pass-through
+  register chain off S0 (`pipe_bubble_data`).
+- **stC = S1 + PIPE_EX:** the core response and the memory request — a
+  pass-through register chain off S1 (`pipe_bubble_commit`), aligned with the
+  deferred data output `read_data_stC`.
+
+`PIPE_EX=0` collapses stD→S0 and stC→S1, reproducing the classic 2-stage bank
+bit-for-bit (verified: LATENCY=2 gives identical cycle counts).
+
+### Memory-request queue sizing (constraint)
+
+The mem-request push now fires `LATENCY` stages after admission, so the queue's
+almost-full margin must reserve `LATENCY` slots (`PIPELINE_STAGES = LATENCY`).
+This requires **`MREQ_SIZE > LATENCY`** (else `ALM_FULL ≤ 0` → permanent
+almost-full → admission deadlock). Default small-cache `MREQ_SIZE = 4` is fine
+for `LATENCY ≤ 3`; enabling `LATENCY = 4` on L2/L3 requires bumping their
+`MREQ_SIZE` (see config section).
+
+## Atomics (`AMO_ENABLE`) under elastic latency
+
+`VX_cache_amo` is the most stage-coupled block in the bank and the part most
+affected by changing depth, so it is called out separately. Today it reaches
+*directly* into the fixed two-stage structure: it consumes lookup-stage signals
+(`valid_st0`, `is_hit_st0`, `is_creq_st0`, `word_idx_st0`, `addr_st0`) and
+commit-stage signals (`is_hit_st1`, `read_word_st1`, `do_write_st1`,
+`byteen_st1`, `write_word_st1`, `addr_st1`, `mshr_id_st1`), performs the
+read-modify-write, and re-injects the result as a synthetic writeback through the
+admit path. Three mechanisms encode the assumption that commit is exactly one
+cycle behind lookup.
+
+**1. The RMW datapath becomes a stage budget, not a single-cycle path.**
+The LLC atomic reads the line word at the data-output stage, runs the ALU
+(add/min/max/swap/compare), and writes it back — bottleneck (6). At `LATENCY=2`
+this is one S1 cycle. Under the elastic pipe it maps to the same symbolic stage
+indices as the data path: read at the data-output stage, ALU in the following
+stage, writeback at the commit stage. So deepening *relaxes* the AMO ALU path
+(it gets its own stage) rather than complicating it — the engine must be
+re-parameterized on `HIT_ST`/`DATA_ST`/`RESP_ST` instead of literal `st0`/`st1`.
+
+**2. Same-line AMO chaining is the tightest interaction.** A chained atomic to a
+line with an in-flight commit must observe the *previous* atomic's result. Today
+`chain_stall` paces the follower by one cycle so the prior result reaches the
+writeback register; `commit_busy` holds new admits while a single LLC commit is
+outstanding. With depth `L`, the commit→visible round trip is `L-1` cycles, so
+both pacing windows scale with `LATENCY`. The same-line stall scoreboard proposed
+for general RAW hazards **covers AMO chains by construction** (a chained atomic
+targets a line the scoreboard already marks in-flight); `chain_stall`/
+`commit_busy` collapse into that one mechanism, sized to `L`, rather than a
+separate hand-tuned 1-cycle pacer.
+
+**3. Non-LLC forward / passthru-replay ordering is latency-agnostic.** A non-LLC
+AMO forwards downstream, invalidates its local copy, and returns via a passthru
+replay (`is_amo_fwd_*`, `is_amo_replay_st1`, `req_input_defer`). These are
+event-ordered, not cycle-counted, so they carry over unchanged once they key off
+the stage constants instead of `st0`/`st1`.
+
+**LR/SC reservations** (`VX_CFG_AMO_RS_SIZE`) track line addresses, not pipeline
+cycles, and are unaffected by depth beyond keeping the reservation-clear (any
+intervening write to the line) anchored to the commit stage.
+
+Net: `AMO_ENABLE` requires the engine's stage anchors to be re-expressed in terms
+of the elastic stage constants and its chain pacing to be folded into the
+depth-sized same-line scoreboard. At `LATENCY=2` the behavior is identical to
+today (chain window = 1). The atomics regression (LR/SC, same-line AMO chains,
+mixed AMO/load ordering) is part of the rtlsim sweep across `LATENCY` values.
+
+## Proposed latency configuration
+
+Add a per-cache knob in `VX_config.toml` (default 2):
+
+```
+VX_CFG_DCACHE_LATENCY = 2
+VX_CFG_L2_LATENCY = "expr: 4 if $VX_CFG_L2_CACHE_SIZE > 65536 else 2"
+VX_CFG_L3_LATENCY = "expr: 4 if $VX_CFG_L3_CACHE_SIZE > 65536 else 2"
+
+# MREQ_SIZE must exceed LATENCY (margin); grow it with the deferral depth:
+VX_CFG_L2_MREQ_SIZE = "expr: 4 + ($VX_CFG_L2_LATENCY - 2) + $VX_CFG_L2_WRITEBACK * ($VX_CFG_L2_MSHR_SIZE - 4)"
+VX_CFG_L3_MREQ_SIZE = "expr: 4 + ($VX_CFG_L3_LATENCY - 2) + $VX_CFG_L3_WRITEBACK * ($VX_CFG_L3_MSHR_SIZE - 4)"
+```
+
+Rationale for the 64 KB threshold: below it the tag/data arrays fit in a few
+BRAMs placed adjacently and the single-cycle path closes; above it (the 1 MB L2,
+the 2 MB L3) the arrays span many BRAM columns and the cross-array route cannot
+meet 3.333 ns.
+
+The `MREQ_SIZE` expr adds `(LATENCY-2)` to the base so the almost-full margin
+(`MREQ_SIZE - LATENCY`) stays constant as depth grows — `LATENCY=4` ⇒ base 6,
+margin 2 (same as today's `LATENCY=2` margin). Without this, `LATENCY=4` with the
+default `MREQ_SIZE=4` deadlocks (margin 0).
+
+The bank parameter `LATENCY` is threaded from these macros through
+`VX_cache`/`VX_cache_cluster` to each `VX_cache_bank` instance.
+
+## How it resolves the timing violations
+
+| Path | Today (2-stage) | Elastic (`LATENCY=4`) |
+|------|-----------------|------------------------|
+| (1) tag-compare → data EN/WE | single cycle, −1.38 ns | compare registered at `HIT_ST`; write driven by registers at `DATA_ST` — path is reg→reg, intra-stage |
+| (2) hit_way → data address | single cycle, marginal | read addr still speculative but tag-read is itself registered (`TAG_RD_LAT`), so the source is a BRAM output reg, not a cross-array combinational chain |
+| (4) repl, (5) mshr, (6) amo | share the 2 cycles | each gets its own stage slack |
+
+The −1.38 ns path is replaced by register-to-register hops within a stage, each
+comfortably under 3.333 ns. The tag and data BRAMs no longer have a same-cycle
+dependency, so their placement is decoupled and the dominant routing term is
+removed. Target: **WNS ≥ 0 at 300 MHz** for the 1 MB L2 in the full build.
+
+## Area cost estimate
+
+Per L2 bank (1 MB, 8-way, data array 16384 × 512 b), going `LATENCY` 2 → 4:
+
+- **Flip-flops:** two extra payload stages. The wide field is the 512 b write
+  word; with control (~70 b) the payload is ~590 b → ~1,180 FF/bank for the two
+  added stages, plus the BRAM output pipeline regs (absorbed into the BRAM).
+  Against the measured 117 k FF for the 2-core build, that is **~+1%**.
+- **Block RAM:** unchanged. Data stays in the same BRAMs; the data array remains
+  a single-port `VX_sp_ram` (the whole access is deferred, with no 1R1W split).
+- **LUTRAM / LUT:** the deferred-access muxing adds at most a few hundred LUTs
+  per bank; no same-line stall scoreboard is needed. (The 16,600 LUTRAM for the
+  dirty mask is the already-landed write-back cost, not part of this refactor.)
+
+Net: **~+1% FF, ~0 BRAM, small LUT per large-cache bank** — cheap relative to
+the ~40% clock gain (212 → 300 MHz).
+
+## AMAT impact
+
+`LATENCY = 4` raises the L2 **hit** latency by 2 cycles. The bank stays fully
+pipelined (one request/cycle throughput is unchanged), and the cache is
+non-blocking (16-entry MSHR), so the added cycles overlap with in-flight misses.
+
+Average-memory-access-time effect:
+
+```
+AMAT_overall ≈ t_L1 + m_L1 · (t_L2 + m_L2 · t_mem)
+Δ(t_L2) = +2 cycles ⇒ ΔAMAT_overall = m_L1 · 2 cycles
+```
+
+For a typical L1 miss rate `m_L1 ≈ 0.10–0.20`, that is **+0.2–0.4 cycle** of
+average access time — against a `t_mem` of hundreds of cycles, it is in the noise.
+
+The decisive comparison is absolute wall-clock, because today the *whole device*
+is stuck at the L2's Fmax:
+
+| | 2-stage @ 212 MHz | 4-stage @ 300 MHz |
+|---|---|---|
+| L2 hit latency | 2 cyc = 9.4 ns | 4 cyc = 13.3 ns |
+| Device clock | 212 MHz | **300 MHz (+42%)** |
+
+A single L2 hit is ~3.9 ns slower, but every cycle everywhere else is 42%
+faster, and that latency is hidden by the MSHR. Throughput-bound GPU workloads
+win decisively.
+
+## SimX model (cycle parity)
+
+The elastic depth must be reflected in SimX or the SimX↔RTL cycle-parity target
+drifts. No structural SimX work is needed: the bank model already carries a
+configurable depth — `Cache::Config::latency` ("pipeline latency") sizes the
+per-bank request pipe (`pipe_req_ = TFifo<bank_req_t>::Create("",
+config.latency)` in `sim/simx/mem/cache.cpp`), so SimX already simulates a
+`latency`-deep pipelined bank.
+
+The gap is only that the value is **hardcoded** at construction instead of
+sourced from config. This proposal targets the large caches, so only those are
+rewired; the others keep their current literals and are out of scope:
+
+| Cache | SimX site | Today | This proposal |
+|-------|-----------|------:|---------------|
+| **L2** | `sim/simx/cluster.cpp:82` | `2` | `VX_CFG_L2_LATENCY` (→ 4 when >64 KB) |
+| **L3** | `sim/simx/processor.cpp:77` | `2` | `VX_CFG_L3_LATENCY` (→ 4 when >64 KB) |
+| L1 D$/I$ | `sim/simx/socket.cpp:47,67` | `1` | unchanged (separate, pre-calibrated) |
+| T$/O$/R$ | `sim/simx/cluster.cpp:196,266,320` | `2` | unchanged |
+
+Because the `VX_CFG_L2_LATENCY`/`VX_CFG_L3_LATENCY` macros are emitted from the
+same `VX_config.toml`, replacing those two literals with their macro makes the
+**one config value drive both the RTL bank parameter and the SimX pipe depth**,
+so they cannot diverge. The RTL bank's existing 2-cycle floor and SimX's L1
+`latency=1` modeling are a pre-existing parity calibration this change does not
+touch; the knob raises only L2/L3, where both sides read 2 today.
+
+Two parity details to keep honest:
+- **Same-line hazard stall.** The RTL adds a same-line in-flight stall at higher
+  depth. SimX already accounts bank occupancy/contention (`bank_stalls`); the
+  same-line RAW stall must be modeled in the SimX bank as well (a marked-line
+  check on the `pipe_req_` occupancy) so the throughput effect matches, not just
+  the latency. If same-line conflicts are rare for a workload the residual sits
+  inside the <5% parity budget, but the mechanism should be present.
+- **AMO chain pacing.** The SimX LLC atomic path must pace same-line chains over
+  the same `LATENCY`-sized window (it collapses into the same marked-line check),
+  matching the RTL `chain_stall`/`commit_busy` behavior at depth.
+
+Parity is then re-confirmed by the existing SimX↔RTL trace-diff methodology at
+each `LATENCY` value (default 2 must be unchanged from today).
+
+## Validation plan / status
+
+1. **[DONE]** `LATENCY = 2` bit-identical — rtlsim 2-core+L2 vecadd: `cycles=2164`,
+   identical to the pre-refactor baseline.
+2. **[DONE]** `LATENCY = 3` functional — vecadd `cycles=2239` (+3.5%, the one
+   deferred stage, mostly MSHR-hidden) and sgemm (RAW-heavy reuse, exercises
+   store→load across the deferral) both PASS.
+3. **[pending]** `LATENCY = 4` once L2/L3 `MREQ_SIZE` is bumped (margin), plus the
+   atomics-enabled sweep for the AMO path (`LATENCY ∈ {2,3,4}`).
+4. **[pending]** SimX parity update (L2/L3 latency from `VX_CFG_*_LATENCY`) and
+   trace-diff at each depth.
+5. **[pending]** DUT synth of the 1 MB L2 bank at `LATENCY = 4`; confirm WNS ≥ 0
+   @300 MHz and the new worst path is outside the cache.
+6. **[pending]** Full 2-core `xrt` build at 300 MHz; on-card validation (#364).
+
+## Risk / compatibility
+
+- Correctness-sensitive (cache data path); gated on the rtlsim sweep above
+  before any synthesis.
+- L1 and all small caches default to `LATENCY = 2` (`PIPE_EX = 0`, no deferred
+  stages), so their behavior, latency, and area are unchanged.
+- The spine refactor (struct + generate pipe + stage-indexed control) is a
+  net readability improvement over the current parallel-wire style.
+- Depends on the `VX_sp_ram`/`VX_dp_ram` `LUTRAM`/`USE_FAST_BRAM` fix (dirty-mask
+  bottleneck #3) already being present.
diff --git a/hw/rtl/VX_cluster.sv b/hw/rtl/VX_cluster.sv
index 20fc96ba2..a7a49d21f 100644
--- a/hw/rtl/VX_cluster.sv
+++ b/hw/rtl/VX_cluster.sv
@@ -205,6 +205,7 @@ module VX_cluster import VX_gpu_pkg::*;
         .MSHR_SIZE      (`VX_CFG_L2_MSHR_SIZE),
         .MRSQ_SIZE      (`VX_CFG_L2_MRSQ_SIZE),
         .MREQ_SIZE      (`VX_CFG_L2_MREQ_SIZE),
+        .LATENCY        (`VX_CFG_L2_LATENCY),
         .TAG_WIDTH      (L2_TAG_WIDTH),
         .WRITE_ENABLE   (1),
         .WRITEBACK      (`VX_CFG_L2_WRITEBACK),
diff --git a/hw/rtl/VX_socket.sv b/hw/rtl/VX_socket.sv
index 6da8634bd..37261448e 100644
--- a/hw/rtl/VX_socket.sv
+++ b/hw/rtl/VX_socket.sv
@@ -85,7 +85,7 @@ module VX_socket import VX_gpu_pkg::*;
         .clk        (clk),
         .reset      (reset),
         .bus_in_if  (kmu_bus_if),
-        .bus_out_if (per_core_kmu_bus_if[`VX_CFG_SOCKET_SIZE-1:0])
+        .bus_out_if (per_core_kmu_bus_if)
     );
 
     VX_gbar_bus_if per_core_gbar_bus_if[`VX_CFG_SOCKET_SIZE]();
@@ -185,6 +185,7 @@ module VX_socket import VX_gpu_pkg::*;
         .MSHR_SIZE      (`VX_CFG_DCACHE_MSHR_SIZE),
         .MRSQ_SIZE      (`VX_CFG_DCACHE_MRSQ_SIZE),
         .MREQ_SIZE      (`VX_CFG_DCACHE_MREQ_SIZE),
+        .LATENCY        (`VX_CFG_DCACHE_LATENCY),
         .TAG_WIDTH      (DCACHE_TAG_WIDTH),
         .WRITE_ENABLE   (1),
         .WRITEBACK      (`VX_CFG_DCACHE_WRITEBACK),
diff --git a/hw/rtl/Vortex.sv b/hw/rtl/Vortex.sv
index cfadfeb2a..52f874f07 100644
--- a/hw/rtl/Vortex.sv
+++ b/hw/rtl/Vortex.sv
@@ -131,6 +131,7 @@ module Vortex import VX_gpu_pkg::*, VX_trace_pkg::*; (
         .MSHR_SIZE      (`VX_CFG_L3_MSHR_SIZE),
         .MRSQ_SIZE      (`VX_CFG_L3_MRSQ_SIZE),
         .MREQ_SIZE      (`VX_CFG_L3_MREQ_SIZE),
+        .LATENCY        (`VX_CFG_L3_LATENCY),
         .TAG_WIDTH      (L3_TAG_WIDTH),
         .WRITE_ENABLE   (1),
         .WRITEBACK      (`VX_CFG_L3_WRITEBACK),
@@ -180,7 +181,7 @@ module Vortex import VX_gpu_pkg::*, VX_trace_pkg::*; (
         .clk        (clk),
         .reset      (reset),
         .bus_in_if  (kmu_bus_in),
-        .bus_out_if (per_cluster_kmu_bus_if[`VX_CFG_NUM_CLUSTERS-1:0])
+        .bus_out_if (per_cluster_kmu_bus_if)
     );
 
     VX_dcr_bus_if per_cluster_dcr_bus_if[`VX_CFG_NUM_CLUSTERS]();
diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv
index 759237bdc..d096c9662 100644
--- a/hw/rtl/cache/VX_cache.sv
+++ b/hw/rtl/cache/VX_cache.sv
@@ -42,6 +42,9 @@ module VX_cache import VX_gpu_pkg::*; #(
     // Memory Request Queue Size
     parameter MREQ_SIZE             = 4,
 
+    // Bank pipeline depth (2 = classic lookup+commit; larger defers the data array)
+    parameter LATENCY               = 2,
+
     // Enable cache writeable
     parameter WRITE_ENABLE          = 1,
 
@@ -390,6 +393,7 @@ module VX_cache import VX_gpu_pkg::*; #(
             .MSHR_SIZE    (MSHR_SIZE),
             .MRSQ_SIZE    (MRSQ_SIZE),
             .MREQ_SIZE    (MREQ_SIZE),
+            .LATENCY      (LATENCY),
             .TAG_WIDTH    (TAG_WIDTH),
             .CORE_OUT_BUF (CORE_RSP_BUF_ENABLE ? 2 : 0),
             .MEM_OUT_BUF  (MEM_REQ_BUF_ENABLE ? 2 : 0),
diff --git a/hw/rtl/cache/VX_cache_amo.sv b/hw/rtl/cache/VX_cache_amo.sv
index d45d4de05..302d4cc45 100644
--- a/hw/rtl/cache/VX_cache_amo.sv
+++ b/hw/rtl/cache/VX_cache_amo.sv
@@ -39,7 +39,11 @@ module VX_cache_amo import VX_gpu_pkg::*; #(
     parameter ATTR_WIDTH      = 1,
     parameter MSHR_SIZE       = 1,
     parameter MSHR_ADDR_WIDTH = 1,
-    parameter WORDS_PER_LINE  = 1
+    parameter WORDS_PER_LINE  = 1,
+    // Deferred-commit depth: the commit ports (_st1) are fed from the bank's
+    // stC stage, which sits PIPE_EX+1 cycles behind the S0 lookup. 0 = classic
+    // 2-stage bank (stC == S1).
+    parameter PIPE_EX         = 0
 ) (
     input  wire                          clk,
     input  wire                          reset,
@@ -340,20 +344,51 @@ module VX_cache_amo import VX_gpu_pkg::*; #(
             end
         end
 
-        // response (fired at S1): SC -> 0/1; other -> old value (LSU sexts).
-        // The old value is available at S1 directly, no ALU needed.
-        wire [63:0] rsp_word = (amo_st1.amo_op == AMO_OP_SC) ? {63'h0, sc_fail_st1} : old_st1;
-        if (WORD_WIDTH < 64) begin : g_rsp_upper_unused
-            `UNUSED_VAR (rsp_word[63:WORD_WIDTH])
+        // Response (fired at S1; in-place, no ALU): the requester extracts its
+        // target word by byte offset, so the old value can stay where it sits in
+        // the line with the other bytes masked off -- this avoids a full-width
+        // barrel shift on the hot read->response path (read_word -> rsp_data was
+        // the critical path: a >>bit_off then <<bit_off round-trip just to mask).
+        // The byte mask comes straight from byteen (one line bit per set byte);
+        // masking is bit-identical to (old_st1 << bit_off) for the consumed bytes.
+        // SC returns 0/1 placed at the offset (rare path, 1-bit shift input).
+        wire [WORD_WIDTH-1:0] rsp_byte_mask;
+        for (genvar b = 0; b < WORD_SIZE; ++b) begin : g_rsp_mask
+            assign rsp_byte_mask[b*8 +: 8] = {8{byteen_st1[b]}};
         end
+        wire [WORD_WIDTH-1:0] amo_old_inplace = line_word_st1 & rsp_byte_mask;
+        wire [WORD_WIDTH-1:0] sc_rsp_inplace  = WORD_WIDTH'(sc_fail_st1) << bit_off_st1;
 
         assign amo_hit_st1 = amo_hit_w;
-        assign rsp_data    = WORD_WIDTH'(rsp_word) << bit_off_st1;
+        assign rsp_data    = (amo_st1.amo_op == AMO_OP_SC) ? sc_rsp_inplace : amo_old_inplace;
+        // Bridge the S0 prediction across the deferred lookup->commit window:
+        // with PIPE_EX>0 the AMO sits in the commit bubble for PIPE_EX cycles
+        // between do_store_st0 (S0) and do_store_st1 (stC), so commit_busy would
+        // gap and let a same-line request race the writeback. A PIPE_EX-deep
+        // shift of do_store_st0 fills the gap (continuous S0..stC hold).
+        wire amo_inflight;
+        if (PIPE_EX == 0) begin : g_no_bridge
+            assign amo_inflight = 1'b0;
+        end else begin : g_bridge
+            reg [PIPE_EX-1:0] store_inflight;
+            always @(posedge clk) begin
+                if (reset) begin
+                    store_inflight <= '0;
+                end else if (~pipe_stall) begin
+                    store_inflight[0] <= do_store_st0;
+                    for (int i = 1; i < PIPE_EX; ++i) begin
+                        store_inflight[i] <= store_inflight[i-1];
+                    end
+                end
+            end
+            assign amo_inflight = (| store_inflight);
+        end
+
         // Commit in flight: holds off new core-request admission from the S0
-        // prediction through the compute stage and the writeback. Replays are
-        // NOT blocked (the MSHR streams coalesced same-line AMOs back to back);
-        // those are paced instead by chain_stall.
-        assign commit_busy = do_store_st0 || do_store_st1 || cmp_valid || wb_pending_r;
+        // prediction through the deferred bubble, the compute stage and the
+        // writeback. Replays are NOT blocked (the MSHR streams coalesced
+        // same-line AMOs back to back); those are paced instead by chain_stall.
+        assign commit_busy = do_store_st0 || amo_inflight || do_store_st1 || cmp_valid || wb_pending_r;
         // Pace any same-line request sitting behind an in-flight compute by one
         // cycle, so the result lands in wb_data_r and forwards cleanly. Gated on
         // cmp_valid (an AMO is computing), so it never fires for baseline traffic.
diff --git a/hw/rtl/cache/VX_cache_bank.sv b/hw/rtl/cache/VX_cache_bank.sv
index 8a6f75b32..180da7494 100644
--- a/hw/rtl/cache/VX_cache_bank.sv
+++ b/hw/rtl/cache/VX_cache_bank.sv
@@ -16,58 +16,30 @@
 module VX_cache_bank import VX_gpu_pkg::*; #(
     parameter `STRING INSTANCE_ID= "",
     parameter BANK_ID           = 0,
-
-    // Number of Word requests per cycle
     parameter NUM_REQS          = 1,
-
-    // Size of cache in bytes
-    parameter CACHE_SIZE        = 1024,
-    // Size of line inside a bank in bytes
-    parameter LINE_SIZE         = 16,
-    // Number of banks
+    parameter CACHE_SIZE        = 1024,     // cache size in bytes
+    parameter LINE_SIZE         = 16,       // line size in bytes
     parameter NUM_BANKS         = 1,
-    // Number of associative ways
     parameter NUM_WAYS          = 1,
-    // Size of a word in bytes
-    parameter WORD_SIZE         = 4,
-
-    // Core Response Queue Size
-    parameter CRSQ_SIZE         = 1,
-    // Miss Reserv Queue Knob
-    parameter MSHR_SIZE         = 1,
-    // Memory Response Queue Size (sized at the cache wrapper; bank
-    // currently flows responses straight through, so unused locally.)
-    parameter MRSQ_SIZE         = 1,
-    // Memory Request Queue Size
-    parameter MREQ_SIZE         = 1,
-
-    // Enable cache writeable
+    parameter WORD_SIZE         = 4,        // word size in bytes
+    parameter CRSQ_SIZE         = 1,        // core response queue size
+    parameter MSHR_SIZE         = 1,        // miss reservation queue size
+    parameter MRSQ_SIZE         = 1,        // memory response queue size (sized at wrapper)
+    parameter MREQ_SIZE         = 1,        // memory request queue size
     parameter WRITE_ENABLE      = 1,
-
-    // Enable cache writeback
     parameter WRITEBACK         = 0,
-
-    // Enable dirty bytes on writeback
     parameter DIRTY_BYTES       = 0,
-
-    // Replacement policy
     parameter REPL_POLICY       = `CS_REPL_FIFO,
-
-    // core request tag size
     parameter TAG_WIDTH         = UUID_WIDTH + 1,
-
-    // Core response output buffer (TO_OUT_BUF_* encoding)
     parameter CORE_OUT_BUF      = 0,
-
-    // Memory request output buffer (TO_OUT_BUF_* encoding)
     parameter MEM_OUT_BUF       = 0,
-
-    // This bank is the last-level cache (AMOs commit locally here).
-    parameter IS_LLC            = 0,
-
-    // This bank supports atomic ops (AMO logic synthesizes only when 1).
-    parameter AMO_ENABLE        = 0,
-
+    parameter IS_LLC            = 0,        // last-level cache: AMOs commit locally here
+    parameter AMO_ENABLE        = 0,        // synthesize atomic-op logic
+    // Bank pipeline depth in register stages from request-select to commit; must
+    // be >= 2 (statically asserted). 2 is the classic lookup(S0)+commit(S1) pipe;
+    // larger values defer the data array by (LATENCY-2) stages to break the
+    // tag->data critical path on large caches (tags/replacement/MSHR stay at S0/S1).
+    parameter LATENCY           = 2,
     parameter MSHR_ADDR_WIDTH   = `LOG2UP(MSHR_SIZE),
     parameter MEM_TAG_WIDTH     = UUID_WIDTH + MSHR_ADDR_WIDTH,
     parameter REQ_SEL_WIDTH     = `UP(`CS_REQ_SEL_BITS),
@@ -83,19 +55,19 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
     output wire perf_mshr_stall,
 `endif
 
-    // Core Request
+    // Core request
     input wire                          core_req_valid,
     input wire [`CS_LINE_ADDR_WIDTH-1:0] core_req_addr,
-    input wire                          core_req_rw,    // write enable
-    input wire [WORD_SEL_WIDTH-1:0]     core_req_wsel,  // select the word in a cacheline, e.g. word size = 4 bytes, cacheline size = 64 bytes, it should have log(64/4)= 4 bits
-    input wire [WORD_SIZE-1:0]          core_req_byteen,// which bytes in data to write
-    input wire [`CS_WORD_WIDTH-1:0]     core_req_data,  // data to be written
-    input wire [TAG_WIDTH-1:0]          core_req_tag,   // identifier of the request (request id)
-    input wire [REQ_SEL_WIDTH-1:0]      core_req_idx,   // index of the request in the core request array
+    input wire                          core_req_rw,
+    input wire [WORD_SEL_WIDTH-1:0]     core_req_wsel,
+    input wire [WORD_SIZE-1:0]          core_req_byteen,
+    input wire [`CS_WORD_WIDTH-1:0]     core_req_data,
+    input wire [TAG_WIDTH-1:0]          core_req_tag,
+    input wire [REQ_SEL_WIDTH-1:0]      core_req_idx,
     input wire [`UP(MEM_ATTR_WIDTH)-1:0] core_req_attr,
     output wire                         core_req_ready,
 
-    // Core Response
+    // Core response
     output wire                         core_rsp_valid,
     output wire [`CS_WORD_WIDTH-1:0]    core_rsp_data,
     output wire [TAG_WIDTH-1:0]         core_rsp_tag,
@@ -118,113 +90,161 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
     input wire [MEM_TAG_WIDTH-1:0]      mem_rsp_tag,
     output wire                         mem_rsp_ready,
 
-    // flush
+    // Flush
     input wire                          flush_begin,
     input wire [`UP(UUID_WIDTH)-1:0]    flush_uuid,
     output wire                         flush_end
 );
-
-    localparam PIPELINE_STAGES = 2;
-
-    // MRSQ_SIZE is sized at the cache wrapper; bank flows responses
-    // straight through, so it is unused locally.
+    localparam PIPELINE_STAGES = LATENCY;   // total bank pipeline depth, taken directly from LATENCY
+    localparam PIPE_EX = LATENCY - 2;       // extra data-deferral stages (0 = classic 2-stage)
+    `STATIC_ASSERT(LATENCY >= 2, ("invalid parameter: cache bank LATENCY must be >= 2"))
     `UNUSED_PARAM (MRSQ_SIZE)
 
-    // AMO sideband, extracted from the attr field (gated by AMO_ENABLE).
-    amo_req_t core_req_amo;
-    assign core_req_amo = AMO_ENABLE ?
-        amo_req_t'(core_req_attr[MEM_ATTR_AMO_OFFS +: AMO_REQ_BITS])
-      : amo_req_t'('0);
-
-`IGNORE_UNUSED_BEGIN
-    wire [`UP(UUID_WIDTH)-1:0] req_uuid_sel, req_uuid_st0, req_uuid_st1;
-`IGNORE_UNUSED_END
-
-    wire                            crsp_queue_stall;
-    wire                            mshr_alm_full;
-    wire                            mshr_probe_pending_ld;
-    wire                            mshr_probe_pending_amo;
-    wire                            mreq_queue_empty;
-    wire                            mreq_queue_alm_full;
-
-    wire [`CS_LINE_ADDR_WIDTH-1:0]  mem_rsp_addr;
-
-    wire                            replay_valid;
-    wire [`CS_LINE_ADDR_WIDTH-1:0]  replay_addr;
-    wire                            replay_rw;
-    wire [WORD_SEL_WIDTH-1:0]       replay_wsel;
-    wire [WORD_SIZE-1:0]            replay_byteen;
-    wire [`CS_WORD_WIDTH-1:0]       replay_data;
-    wire [TAG_WIDTH-1:0]            replay_tag;
-    wire [REQ_SEL_WIDTH-1:0]        replay_idx;
-    wire [MSHR_ADDR_WIDTH-1:0]      replay_id;
-    wire                            replay_ready;
-    amo_req_t                       replay_amo;
-
-
-    wire                            valid_sel, valid_st0, valid_st1;
-    wire                            is_init_st0;
-    wire                            is_creq_st0, is_creq_st1;
-    wire                            is_fill_st0, is_fill_st1;
-    wire                            is_flush_st0, is_flush_st1;
-    wire [`CS_WAY_SEL_WIDTH-1:0]    flush_way_st0, evict_way_st0;
-    wire [`CS_WAY_SEL_WIDTH-1:0]    way_idx_st0, way_idx_st1;
-
-    wire [`CS_LINE_ADDR_WIDTH-1:0]  addr_sel, addr_st0, addr_st1;
-    wire [`CS_LINE_SEL_BITS-1:0]    line_idx_sel, line_idx_st0, line_idx_st1;
-    wire [`CS_TAG_SEL_BITS-1:0]     line_tag_st0, line_tag_st1;
-    wire [`CS_TAG_SEL_BITS-1:0]     evict_tag_st0, evict_tag_st1;
-    wire                            rw_sel, rw_st0, rw_st1;
-    wire [WORD_SEL_WIDTH-1:0]       word_idx_sel, word_idx_st0, word_idx_st1;
-    wire [WORD_SIZE-1:0]            byteen_sel, byteen_st0, byteen_st1;
-    wire [REQ_SEL_WIDTH-1:0]        req_idx_sel, req_idx_st0, req_idx_st1;
-    wire [TAG_WIDTH-1:0]            tag_sel, tag_st0, tag_st1;
-    wire [`CS_WORD_WIDTH-1:0]       write_word_st0, write_word_st1;
-    wire [`CS_LINE_WIDTH-1:0]       data_sel, data_st0;
-    wire [MSHR_ADDR_WIDTH-1:0]      mshr_id_st0, mshr_id_st1;
-    wire [MSHR_ADDR_WIDTH-1:0]      replay_id_st0;
-    wire                            is_dirty_st0, is_dirty_st1;
-    wire                            is_replay_st0, is_replay_st1;
-    wire                            is_hit_st0, is_hit_st1;
-    wire [`UP(MEM_ATTR_WIDTH)-1:0] attr_sel, attr_st0, attr_st1;
-    amo_req_t                       amo_sel, amo_st0, amo_st1;
-
-    // AMO interconnect (driven by the VX_cache_amo engine, tied off when the
-    // bank carries no AMO logic). Declared here because the input arbitration
-    // and sel mux consume them ahead of the instantiation.
-    wire                            amo_hit_st1;       // AMO commits locally at S1 (LLC)
-    wire                            amo_commit_busy;   // LLC commit in flight
-    wire                            amo_chain_stall;   // pace same-line chained AMO
-    wire                            amo_wb_pending;    // synthetic writeback request live
-    wire [`CS_WORD_WIDTH-1:0]       amo_rsp_data;      // LLC AMO response word
-    wire [`CS_LINE_ADDR_WIDTH-1:0]  amo_wb_addr;
-    wire [WORD_SEL_WIDTH-1:0]       amo_wb_word_idx;
-    wire [WORD_SIZE-1:0]            amo_wb_byteen;
-    wire [`CS_WORD_WIDTH-1:0]       amo_wb_data;
-    wire [TAG_WIDTH-1:0]            amo_wb_tag;
-    wire [REQ_SEL_WIDTH-1:0]        amo_wb_idx;
-    wire [`UP(MEM_ATTR_WIDTH)-1:0]  amo_wb_attr;
-    wire                            is_amo_fwd_st0;    // non-LLC AMO first pass (S0)
-    wire                            is_amo_fwd_st1;    // non-LLC AMO first pass (S1)
-    wire                            is_amo_replay_st1; // non-LLC AMO result replay
-    wire                            is_passthru_fill_sel;
-    wire [`CS_WORD_WIDTH-1:0]       amo_ptw_word_st1;
-    wire                            req_input_defer;   // non-LLC age-ordering hold
-
-    wire                            mshr_pending_raw_st0;
-    wire                            mshr_pending_st0, mshr_pending_st1;
-    wire [MSHR_ADDR_WIDTH-1:0]      mshr_previd_st0, mshr_previd_st1;
-    wire                            mshr_empty;
-    wire                            is_passthru_fill_st0; // fill targets a passthru entry
-
-    wire flush_valid;
-    wire init_valid;
+    // ========================================================================
+    // Pipeline payload types
+    //
+    // The request travels as a struct and the S0-computed lookup results are a
+    // separate `lookup_t` delta, composed into `commit_t` for the response /
+    // memory-request stage. The wide fill `data` line and `tag_matches` ride
+    // only the data-array path (`data_t`), never the commit path, so the deeper
+    // commit pipeline stays narrow.
+    //   sel -> S0     : data_t  (st0)            -- request + fill line
+    //   S0  -> stD    : data_t  (stD)            -- drives the data array
+    //   S0  -> S1->stC: commit_t (st1, stC)      -- request + lookup delta
+    // `way_idx` and `mshr_id` are reused across stages (flush_way/replay_id at
+    // select; resolved way / allocated id at commit). PIPE_EX=0 collapses
+    // stD->S0 and stC->S1: the classic 2-stage bank.
+    // ========================================================================
+    typedef struct packed {            // arbitrated request, carried by both data_t and commit_t
+        logic                           valid, is_init, is_fill, is_flush, is_creq, is_replay, is_passthru_fill, rw;
+        logic [`UP(MEM_ATTR_WIDTH)-1:0] attr;        // core_req_attr, or amo_wb_attr for the synthetic AMO writeback
+        logic [`CS_WAY_SEL_WIDTH-1:0]   way_idx;     // flush_way @sel, resolved way @S1
+        logic [`CS_LINE_ADDR_WIDTH-1:0] addr;        // line address
+        logic [WORD_SIZE-1:0]           byteen;      // which bytes of the word to write
+        logic [WORD_SEL_WIDTH-1:0]      word_idx;    // word select within the cache line
+        logic [REQ_SEL_WIDTH-1:0]       req_idx;     // index of the request in the core request array
+        logic [TAG_WIDTH-1:0]           tag;         // request tag (top UUID_WIDTH bits are the UUID when UUID_WIDTH != 0)
+        logic [MSHR_ADDR_WIDTH-1:0]     mshr_id;     // replay_id @sel, alloc/replay id @S1
+        amo_req_t                       amo;         // AMO sideband; zeroed for the synthetic AMO writeback
+    } req_t;
+
+    typedef struct packed {            // S0-computed lookup delta (commit side)
+        logic                          is_hit, is_dirty, mshr_pending;
+        logic [`CS_TAG_SEL_BITS-1:0]   evict_tag;
+        logic [`CS_WORD_WIDTH-1:0]      write_word;
+        logic [MSHR_ADDR_WIDTH-1:0]    mshr_previd;
+    } lookup_t;
+
+    typedef struct packed {            // data-array drive (S0 -> stD)
+        req_t                          req;
+        logic [`CS_LINE_WIDTH-1:0]     data;         // fill line; bits [CS_WORD_WIDTH-1:0] double as the write word
+        logic [NUM_WAYS-1:0]           tag_matches;  // left 0 by the input mux (sel_req is defaulted to '0)
+    } data_t;
+
+    typedef struct packed {            // response + memory request (S0 -> S1 -> stC)
+        req_t                          req;
+        lookup_t                       lk;
+    } commit_t;
+
+    data_t   sel_req, st0, dat_in, stD;   // request + fill line: sel -> S0 -> stD
+    commit_t cmt_in, st1, stC;            // request + lookup delta: S0 -> S1 -> stC
+    lookup_t lk_st0;                      // S0 lookup results
+
+    // ------------------------------------------------------------------------
+    // Shared signals
+    // ------------------------------------------------------------------------
+    wire crsp_queue_stall, mshr_alm_full, mshr_empty;
+    wire mshr_probe_pending_ld, mshr_probe_pending_amo;
+    wire mreq_queue_empty, mreq_queue_alm_full;
+    wire [`CS_LINE_ADDR_WIDTH-1:0] mem_rsp_addr;
+    wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id, mshr_previd;
+    wire mshr_pending_raw;
+
+    // MSHR replay (dequeue) sideband
+    wire                           replay_valid, replay_ready, replay_rw;
+    wire [`CS_LINE_ADDR_WIDTH-1:0] replay_addr;
+    wire [WORD_SEL_WIDTH-1:0]      replay_wsel;
+    wire [WORD_SIZE-1:0]           replay_byteen;
+    wire [`CS_WORD_WIDTH-1:0]      replay_data;
+    wire [TAG_WIDTH-1:0]           replay_tag;
+    wire [REQ_SEL_WIDTH-1:0]       replay_idx;
+    wire [MSHR_ADDR_WIDTH-1:0]     replay_id;
+    amo_req_t                      replay_amo;
+
+    // AMO engine interconnect (tied to 0 when the bank carries no AMO logic).
+    wire                          amo_hit_st1, amo_commit_busy, amo_chain_stall, amo_wb_pending;
+    wire [`CS_WORD_WIDTH-1:0]     amo_rsp_data;
+    wire [`CS_LINE_ADDR_WIDTH-1:0] amo_wb_addr;
+    wire [WORD_SEL_WIDTH-1:0]     amo_wb_word_idx;
+    wire [WORD_SIZE-1:0]          amo_wb_byteen;
+    wire [`CS_WORD_WIDTH-1:0]     amo_wb_data;
+    wire [TAG_WIDTH-1:0]          amo_wb_tag;
+    wire [REQ_SEL_WIDTH-1:0]      amo_wb_idx;
+    wire [`UP(MEM_ATTR_WIDTH)-1:0] amo_wb_attr;
+    wire                          is_amo_fwd_st0, is_amo_fwd_st1, is_amo_replay_st1;
+    wire                          is_passthru_fill_sel, req_input_defer;
+    wire [`CS_WORD_WIDTH-1:0]     amo_ptw_word_st1;
+
+    wire flush_valid, flush_ready, init_valid;
     wire [`CS_LINE_SEL_BITS-1:0] flush_sel;
     wire [`CS_WAY_SEL_WIDTH-1:0] flush_way;
-    wire flush_ready;
 
-    // ensure we have no pending memory request in the bank
-    wire no_pending_req = ~valid_st0 && ~valid_st1 && mreq_queue_empty;
+    // AMO sideband, extracted from the attr field (gated by AMO_ENABLE).
+    amo_req_t core_req_amo;
+    assign core_req_amo = AMO_ENABLE ? amo_req_t'(core_req_attr[MEM_ATTR_AMO_OFFS +: AMO_REQ_BITS])
+                                     : amo_req_t'('0);
+
+    // ------------------------------------------------------------------------
+    // Per-stage decoded operations
+    // ------------------------------------------------------------------------
+    wire do_init_st0  = st0.req.valid && st0.req.is_init;
+    wire do_flush_st0 = st0.req.valid && st0.req.is_flush;
+    wire do_read_st0  = st0.req.valid && st0.req.is_creq && ~st0.req.rw;
+    wire do_write_st0 = st0.req.valid && st0.req.is_creq && st0.req.rw;
+    wire do_fill_st0  = st0.req.valid && st0.req.is_fill;
+    wire do_lookup_st0 = do_read_st0 || do_write_st0;
+
+    wire do_read_st1  = st1.req.valid && st1.req.is_creq && ~st1.req.rw;
+    wire do_write_st1 = st1.req.valid && st1.req.is_creq && st1.req.rw;
+    wire do_lookup_st1 = do_read_st1 || do_write_st1;
+
+    wire do_read_stc  = stC.req.valid && stC.req.is_creq && ~stC.req.rw;
+    wire do_write_stc = stC.req.valid && stC.req.is_creq && stC.req.rw;
+
+    wire do_init_std  = stD.req.valid && stD.req.is_init;
+    wire do_fill_std  = stD.req.valid && stD.req.is_fill;
+    wire do_flush_std = stD.req.valid && stD.req.is_flush;
+    wire do_read_std  = stD.req.valid && stD.req.is_creq && ~stD.req.rw;
+    wire do_write_std = stD.req.valid && stD.req.is_creq && stD.req.rw;
+
+    wire [`CS_LINE_SEL_BITS-1:0] line_idx_st0 = st0.req.addr[`CS_LINE_SEL_BITS-1:0];
+    wire [`CS_TAG_SEL_BITS-1:0]  line_tag_st0 = `CS_LINE_ADDR_TAG(st0.req.addr);
+    wire [`CS_WORD_WIDTH-1:0]    write_word_st0 = st0.data[`CS_WORD_WIDTH-1:0];
+    wire [`CS_LINE_ADDR_WIDTH-1:0] addr_stc = stC.req.addr;
+
+    // ------------------------------------------------------------------------
+    // Bank-empty detection (gates flush). A request occupies S0, S1 and the
+    // PIPE_EX commit-bubble stages (st1.req.valid delayed 1..PIPE_EX); the
+    // parallel data bubble S0->stD is subsumed by this window.
+    // ------------------------------------------------------------------------
+    wire pipe_inflight;  // 1 while any request is still somewhere in the bank pipeline
+    if (PIPE_EX == 0) begin : g_no_bubble_occ
+        assign pipe_inflight = st0.req.valid || st1.req.valid;  // classic 2-stage: only S0 and S1 can be occupied
+    end else begin : g_bubble_occ
+        reg [PIPE_EX-1:0] commit_valid;  // st1.req.valid delayed by 1..PIPE_EX enabled cycles
+        always @(posedge clk) begin
+            if (reset) begin
+                commit_valid <= '0;
+            end else if (~pipe_stall) begin  // NOTE(review): pipe_stall is declared later in this module -- confirm every target tool accepts the forward reference
+                commit_valid[0] <= st1.req.valid;
+                for (int i = 1; i < PIPE_EX; ++i) begin
+                    commit_valid[i] <= commit_valid[i-1];  // age each sample by one stage
+                end
+            end
+        end
+        assign pipe_inflight = st0.req.valid || st1.req.valid || (| commit_valid);
+    end
+    wire no_pending_req = ~pipe_inflight && mreq_queue_empty;  // pipeline drained and no queued memory request
 
     VX_cache_flush #(
         .BANK_ID    (BANK_ID),
@@ -248,67 +268,50 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
     );
 
     // amo_chain_stall paces a same-line AMO behind an in-flight commit by one
-    // cycle so the prior result reaches the writeback register; it is 0 for all
-    // non-AMO traffic, so the baseline pipe is unaffected.
+    // cycle; it is 0 for non-AMO traffic, so the baseline pipe is unaffected.
     wire pipe_stall = crsp_queue_stall || amo_chain_stall;
 
-    // inputs arbitration:
-    // mshr replay has highest priority to maximize utilization since there is no miss.
-    // handle memory responses next to prevent deadlock with potential memory request from a miss.
-    // flush has precedence over core requests to ensure that the cache is in a consistent state.
-    wire replay_grant = ~init_valid;
+    // ========================================================================
+    // Input arbitration
+    //   priority: init > replay > fill(mem_rsp) > flush > core-req
+    // replay maximizes utilization (guaranteed hit); fill precedes flush/creq to
+    // avoid deadlock on a miss; flush precedes creq for consistency.
+    // ========================================================================
+    wire replay_grant  = ~init_valid;
     wire replay_enable = replay_grant && replay_valid;
-
-    wire fill_grant  = ~init_valid && ~replay_enable;
-    wire fill_enable = fill_grant && mem_rsp_valid;
-
-    wire flush_grant  = ~init_valid && ~replay_enable && ~fill_enable;
-    wire flush_enable = flush_grant && flush_valid;
-
-    wire creq_grant  = ~init_valid && ~replay_enable && ~fill_enable && ~flush_enable;
-    // creq fires from a real core_req or from a pending LLC AMO writeback
-    // (the synthetic write injected after a commit); the two are mutually
-    // exclusive. amo_commit_busy holds off new admits while a single-
-    // outstanding LLC commit is in flight; req_input_defer enforces non-LLC
-    // age-ordering. Both, plus amo_wb_pending/amo_hit_st1, are driven by the
-    // AMO engine below and tie to 0 when the bank carries no AMO logic.
+    wire fill_grant    = replay_grant && ~replay_enable;  // i.e. ~init_valid && ~replay_enable
+    wire fill_enable   = fill_grant && mem_rsp_valid;
+    wire flush_grant   = fill_grant && ~fill_enable;      // no init, replay or fill selected this cycle
+    wire flush_enable  = flush_grant && flush_valid;
+    wire creq_grant    = flush_grant && ~flush_enable;    // lowest priority: nothing above is selected
+
+    // A core-request slot fires from a real core_req or a pending LLC AMO
+    // writeback (synthetic write injected after a commit); mutually exclusive.
+    // amo_commit_busy/req_input_defer enforce AMO ordering (0 for non-AMO banks).
     wire amo_creq_path = core_req_valid && ~amo_commit_busy && ~req_input_defer;
     wire amo_wb_path   = amo_wb_pending && ~amo_hit_st1;
     wire creq_enable   = creq_grant && (amo_creq_path || amo_wb_path);
 
-    assign replay_ready = replay_grant
-                       && ~(!WRITEBACK && replay_rw && mreq_queue_alm_full) // needed for writethrough
-                       && ~pipe_stall;
-
-    assign mem_rsp_ready = fill_grant
-                        && ~(WRITEBACK && mreq_queue_alm_full) // needed for writeback
-                        && ~pipe_stall;
-
-    assign flush_ready = flush_grant
-                      && ~(WRITEBACK && mreq_queue_alm_full) // needed for writeback
-                      && ~pipe_stall;
-
-    assign core_req_ready = creq_grant
-                         && ~mreq_queue_alm_full // needed for fill requests
-                         && ~mshr_alm_full // needed for mshr allocation
-                         && ~pipe_stall
-                         && ~amo_commit_busy    // hold off core_req while an LLC AMO commit is in flight
-                         && ~req_input_defer    // age-order AMO/load vs in-flight entry
-                         ;
-
-    wire init_fire     = init_valid;
-    wire replay_fire   = replay_valid && replay_ready;
-    wire mem_rsp_fire  = mem_rsp_valid && mem_rsp_ready;
-    wire flush_fire    = flush_valid && flush_ready;
-    // amo_wb_path already excludes the cycle a fresh AMO commits at S1
-    // (amo_hit_st1), so the writeback never races the chain update.
-    wire amo_wb_fire   = amo_wb_path && creq_grant
-                      && ~mreq_queue_alm_full && ~mshr_alm_full && ~pipe_stall;
+    assign replay_ready   = replay_grant && ~(!WRITEBACK && replay_rw && mreq_queue_alm_full) && ~pipe_stall;  // mreq room needed for writethrough
+    assign mem_rsp_ready  = fill_grant && ~(WRITEBACK && mreq_queue_alm_full) && ~pipe_stall;   // mreq room needed for writeback
+    assign flush_ready    = flush_grant && ~(WRITEBACK && mreq_queue_alm_full) && ~pipe_stall;  // mreq room needed for writeback
+    assign core_req_ready = creq_grant && ~mreq_queue_alm_full && ~mshr_alm_full && ~pipe_stall  // mreq room for fill requests, MSHR room for allocation
+                         && ~amo_commit_busy && ~req_input_defer;  // LLC AMO commit in flight / age-ordering hold
+
+    wire init_fire    = init_valid;
+    wire replay_fire  = replay_valid && replay_ready;
+    wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
+    wire flush_fire   = flush_valid && flush_ready;
+    // amo_wb_path already excludes the cycle a fresh AMO commits at S1, so the
+    // writeback never races the chain update.
+    wire amo_wb_fire   = amo_wb_path && creq_grant && ~mreq_queue_alm_full && ~mshr_alm_full && ~pipe_stall;
     wire core_req_fire = (amo_creq_path || amo_wb_path) && creq_grant
                        && ~mreq_queue_alm_full && ~mshr_alm_full && ~pipe_stall;
 
     wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id = mem_rsp_tag[MSHR_ADDR_WIDTH-1:0];
 
+    // Generate-guarded width adaptation: each slice sits in its own generate
+    // branch so the branch not taken never elaborates an out-of-range select.
     wire [TAG_WIDTH-1:0] mem_rsp_tag_s;
     if (TAG_WIDTH > MEM_TAG_WIDTH) begin : g_mem_rsp_tag_s_pad
         assign mem_rsp_tag_s = {mem_rsp_tag, (TAG_WIDTH-MEM_TAG_WIDTH)'(1'b0)};
@@ -329,105 +332,90 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
         assign flush_tag = '0;
     end
 
-    // Input arbitration mux. The AMO writeback fields tie to 0 when no LLC
-    // commit engine is present, so the wb arms prune away for non-AMO banks.
-    assign valid_sel   = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire;
-    assign rw_sel      = replay_valid ? replay_rw
-                       : (amo_wb_pending ? 1'b1 : core_req_rw);
-    assign byteen_sel  = replay_valid ? replay_byteen
-                       : (amo_wb_pending ? amo_wb_byteen : core_req_byteen);
-    assign addr_sel    = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel) :
-                            (replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr :
-                                (amo_wb_pending ? amo_wb_addr : core_req_addr)));
-    assign word_idx_sel= replay_valid ? replay_wsel
-                       : (amo_wb_pending ? amo_wb_word_idx : core_req_wsel);
-    assign req_idx_sel = replay_valid ? replay_idx
-                       : (amo_wb_pending ? amo_wb_idx : core_req_idx);
-    assign tag_sel     = (init_valid | flush_valid) ? (flush_valid ? flush_tag : '0) :
-                            (replay_valid ? replay_tag : (mem_rsp_valid ? mem_rsp_tag_s :
-                                (amo_wb_pending ? amo_wb_tag : core_req_tag)));
-    assign attr_sel   = amo_wb_pending ? amo_wb_attr
-                       : (core_req_valid ? core_req_attr : '0);
-    // AMO sideband priority must match the sel mux (replay > wb > core_req):
-    // a replay can fire during a pending wb (chained AMO replays from MSHR
-    // after a fill), so it must not be cleared by amo_wb_pending. The
-    // synthetic writeback carries amo.valid=0 so it never re-commits at S1.
-    assign amo_sel = replay_valid  ? replay_amo
-                   : (amo_wb_pending ? amo_req_t'('0)
-                   : (core_req_valid ? core_req_amo : amo_req_t'('0)));
-
+    // Per-bit fill/write data mux. AMO writeback fields tie to 0 for non-AMO
+    // banks, so the wb arms prune away.
+    wire [`CS_LINE_WIDTH-1:0] data_sel;
     if (WRITE_ENABLE) begin : g_data_sel
         for (genvar i = 0; i < `CS_LINE_WIDTH; ++i) begin : g_i
             if (i < `CS_WORD_WIDTH) begin : g_lo
-                assign data_sel[i] = replay_valid ? replay_data[i] :
-                                      (mem_rsp_valid ? mem_rsp_data[i] :
-                                       (amo_wb_pending ? amo_wb_data[i] : core_req_data[i]));
+                assign data_sel[i] = replay_valid ? replay_data[i]
+                                   : (mem_rsp_valid ? mem_rsp_data[i]
+                                   : (amo_wb_pending ? amo_wb_data[i] : core_req_data[i]));
             end else begin : g_hi
-                assign data_sel[i] = mem_rsp_data[i]; // only the memory response fills the upper words of data_sel
+                assign data_sel[i] = mem_rsp_data[i]; // only the fill carries upper words
             end
         end
     end else begin : g_data_sel_ro
         assign data_sel = mem_rsp_data;
-        `UNUSED_VAR (core_req_data)
-        `UNUSED_VAR (replay_data)
-        `UNUSED_VAR (amo_wb_data) // read-only banks have no writeback data
+        `UNUSED_VAR ({core_req_data, replay_data, amo_wb_data})
     end
 
-    if (UUID_WIDTH != 0) begin : g_req_uuid_sel
-        assign req_uuid_sel = tag_sel[TAG_WIDTH-1 -: UUID_WIDTH];
-    end else begin : g_req_uuid_sel_0
-        assign req_uuid_sel = '0;
+    // Input mux -> arbitrated request (whole-struct populate). AMO priority
+    // matches the mux (replay > wb > core_req): a replay can fire during a
+    // pending wb (chained AMO replays from MSHR after a fill) and must not be
+    // cleared by amo_wb_pending; the synthetic writeback carries amo.valid=0 so
+    // it never re-commits at S1.
+    always @(*) begin
+        sel_req = '0;  // default every field (including tag_matches) to 0
+        sel_req.req.valid    = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire;  // set only when a source fires
+        sel_req.req.is_init  = init_valid;
+        sel_req.req.is_fill  = fill_enable;
+        sel_req.req.is_flush = flush_enable;
+        sel_req.req.is_creq  = creq_enable || replay_enable;  // a replay is also flagged as a core request
+        sel_req.req.is_replay = replay_enable;
+        sel_req.req.is_passthru_fill = is_passthru_fill_sel;
+        sel_req.req.rw       = replay_valid ? replay_rw : (amo_wb_pending ? 1'b1 : core_req_rw);  // payload muxes key off raw *_valid; the is_* flags carry the arbitration result
+        sel_req.req.attr     = amo_wb_pending ? amo_wb_attr : (core_req_valid ? core_req_attr : '0);
+        sel_req.req.way_idx  = flush_way;  // always the flush way at select
+        sel_req.req.addr     = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel)
+                             : (replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr
+                             : (amo_wb_pending ? amo_wb_addr : core_req_addr)));
+        sel_req.req.byteen   = replay_valid ? replay_byteen : (amo_wb_pending ? amo_wb_byteen : core_req_byteen);
+        sel_req.req.word_idx = replay_valid ? replay_wsel : (amo_wb_pending ? amo_wb_word_idx : core_req_wsel);
+        sel_req.req.req_idx  = replay_valid ? replay_idx : (amo_wb_pending ? amo_wb_idx : core_req_idx);
+        sel_req.req.tag      = (init_valid | flush_valid) ? (flush_valid ? flush_tag : '0)
+                             : (replay_valid ? replay_tag : (mem_rsp_valid ? mem_rsp_tag_s
+                             : (amo_wb_pending ? amo_wb_tag : core_req_tag)));
+        sel_req.req.mshr_id  = replay_id;  // always the replay id at select
+        sel_req.req.amo      = replay_valid ? replay_amo : (amo_wb_pending ? amo_req_t'('0)
+                             : (core_req_valid ? core_req_amo : amo_req_t'('0)));
+        sel_req.data         = data_sel;
+        // tag_matches is computed at S0; left 0 here (overridden at the data bubble).
+    end
 
-    wire is_init_sel   = init_valid;
-    wire is_creq_sel   = creq_enable || replay_enable;
-    wire is_fill_sel   = fill_enable;
-    wire is_flush_sel  = flush_enable;
-    wire is_replay_sel = replay_enable;
+    // UUID extraction (debug + MSHR ordering): per stage, from the carried tag.
+    wire [`UP(UUID_WIDTH)-1:0] req_uuid_sel, req_uuid_st0, req_uuid_st1, req_uuid_stc;
+    if (UUID_WIDTH != 0) begin : g_req_uuid
+        assign req_uuid_sel = sel_req.req.tag[TAG_WIDTH-1 -: UUID_WIDTH];  // UUID is the top UUID_WIDTH bits of the tag
+        assign req_uuid_st0 = st0.req.tag[TAG_WIDTH-1 -: UUID_WIDTH];
+        assign req_uuid_st1 = st1.req.tag[TAG_WIDTH-1 -: UUID_WIDTH];
+        assign req_uuid_stc = stC.req.tag[TAG_WIDTH-1 -: UUID_WIDTH];
+    end else begin : g_req_uuid_0
+        assign {req_uuid_sel, req_uuid_st0, req_uuid_st1, req_uuid_stc} = '0;  // no UUID bits: tie every stage copy to 0
+    end
+    `UNUSED_VAR ({req_uuid_st0, req_uuid_st1})
 
+    // S0 register
     VX_pipe_register #(
-        .DATAW  (1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(MEM_ATTR_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + AMO_REQ_BITS),
+        .DATAW  ($bits(data_t)),
         .RESETW (1)
-    ) pipe_reg0 (
+    ) reg_s0 (
         .clk      (clk),
         .reset    (reset),
         .enable   (~pipe_stall),
-        .data_in  ({valid_sel, is_init_sel, is_fill_sel, is_flush_sel, is_creq_sel, is_replay_sel, is_passthru_fill_sel, attr_sel, flush_way,     addr_sel, data_sel, rw_sel, byteen_sel, word_idx_sel, req_idx_sel, tag_sel, replay_id,     amo_sel}),
-        .data_out ({valid_st0, is_init_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, is_passthru_fill_st0, attr_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, replay_id_st0, amo_st0})
+        .data_in  (sel_req),
+        .data_out (st0)
     );
 
-    if (UUID_WIDTH != 0) begin : g_req_uuid_st0
-        assign req_uuid_st0 = tag_st0[TAG_WIDTH-1 -: UUID_WIDTH];
-    end else begin : g_req_uuid_st0_0
-        assign req_uuid_st0 = '0;
-    end
-
-    wire is_read_st0  = is_creq_st0 && ~rw_st0;
-    wire is_write_st0 = is_creq_st0 && rw_st0;
-
-    wire do_init_st0  = valid_st0 && is_init_st0;
-    wire do_flush_st0 = valid_st0 && is_flush_st0;
-    wire do_read_st0  = valid_st0 && is_read_st0;
-    wire do_write_st0 = valid_st0 && is_write_st0;
-    wire do_fill_st0  = valid_st0 && is_fill_st0;
-
-    wire is_read_st1  = is_creq_st1 && ~rw_st1;
-    wire is_write_st1 = is_creq_st1 && rw_st1;
-
-    wire do_read_st1  = valid_st1 && is_read_st1;
-    wire do_write_st1 = valid_st1 && is_write_st1;
-
-    assign line_idx_sel = addr_sel[`CS_LINE_SEL_BITS-1:0];
-    assign line_idx_st0 = addr_st0[`CS_LINE_SEL_BITS-1:0];
-    assign line_tag_st0 = `CS_LINE_ADDR_TAG(addr_st0);
-
-    assign write_word_st0 = data_st0[`CS_WORD_WIDTH-1:0];
-
-    wire do_lookup_st0 = do_read_st0 || do_write_st0;
-    wire do_lookup_st1 = do_read_st1 || do_write_st1;
-
-    wire [`CS_WAY_SEL_WIDTH-1:0] victim_way_st0;
+    // ========================================================================
+    // S0 lookup: replacement + tags + way-encode + MSHR allocate
+    // ========================================================================
+    wire [`CS_WAY_SEL_WIDTH-1:0] victim_way;  // driven by cache_repl (repl_way)
+    wire [`CS_WAY_SEL_WIDTH-1:0] evict_way_st0 = st0.req.is_fill ? victim_way : st0.req.way_idx;  // fill: replacement victim; otherwise the way carried from select (flush_way)
     wire [NUM_WAYS-1:0] tag_matches_st0;
+    wire [`CS_WAY_SEL_WIDTH-1:0] hit_idx_st0;
+    wire evict_dirty_st0;
+    wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0;
 
     VX_cache_repl #(
         .CACHE_SIZE  (CACHE_SIZE),
@@ -436,21 +424,19 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
         .NUM_WAYS    (NUM_WAYS),
         .REPL_POLICY (REPL_POLICY)
     ) cache_repl (
-        .clk        (clk),
-        .reset      (reset),
-        .stall      (pipe_stall),
-        .init       (do_init_st0),
-        .lookup_valid(do_lookup_st1 && ~pipe_stall),
-        .lookup_hit (is_hit_st1),
-        .lookup_line(line_idx_st1),
-        .lookup_way (way_idx_st1),
-        .repl_valid (do_fill_st0 && ~is_passthru_fill_st0 && ~pipe_stall),
-        .repl_line  (line_idx_st0),
-        .repl_way   (victim_way_st0)
+        .clk          (clk),
+        .reset        (reset),
+        .stall        (pipe_stall),
+        .init         (do_init_st0),
+        .lookup_valid (do_lookup_st1 && ~pipe_stall),
+        .lookup_hit   (st1.lk.is_hit),
+        .lookup_line  (st1.req.addr[`CS_LINE_SEL_BITS-1:0]),
+        .lookup_way   (st1.req.way_idx),
+        .repl_valid   (do_fill_st0 && ~st0.req.is_passthru_fill && ~pipe_stall),
+        .repl_line    (line_idx_st0),
+        .repl_way     (victim_way)
     );
 
-    assign evict_way_st0 = is_fill_st0 ? victim_way_st0 : flush_way_st0;
-
     VX_cache_tags #(
         .CACHE_SIZE (CACHE_SIZE),
         .LINE_SIZE  (LINE_SIZE),
@@ -460,29 +446,26 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
         .WRITEBACK  (WRITEBACK),
         .AMO_ENABLE ((AMO_ENABLE != 0) && (IS_LLC == 0))
     ) cache_tags (
-        .clk        (clk),
-        .reset      (reset),
-        // inputs
-        .stall      (pipe_stall),
-        .init       (do_init_st0),
-        .flush      (do_flush_st0 && ~pipe_stall),
-        .fill       (do_fill_st0 && ~is_passthru_fill_st0 && ~pipe_stall),
-        .read       (do_read_st0 && ~pipe_stall),
-        .write      (do_write_st0 && ~pipe_stall),
-        // non-LLC AMO forwards downstream and invalidates its own cached
-        // copy so the issuer's later plain load refetches the new value.
-        .invalidate (is_amo_fwd_st0 && is_hit_st0 && ~pipe_stall),
-        .line_idx   (line_idx_st0),
-        .line_idx_n (line_idx_sel),
-        .line_tag   (line_tag_st0),
-        .evict_way  (evict_way_st0),
-        // outputs
-        .tag_matches(tag_matches_st0),
-        .evict_dirty(is_dirty_st0),
-        .evict_tag  (evict_tag_st0)
+        .clk         (clk),
+        .reset       (reset),
+        .stall       (pipe_stall),
+        .init        (do_init_st0),
+        .flush       (do_flush_st0 && ~pipe_stall),
+        .fill        (do_fill_st0 && ~st0.req.is_passthru_fill && ~pipe_stall),
+        .read        (do_read_st0 && ~pipe_stall),
+        .write       (do_write_st0 && ~pipe_stall),
+        // non-LLC AMO forwards downstream and invalidates its own copy so the
+        // issuer's later plain load refetches the new value.
+        .invalidate  (is_amo_fwd_st0 && lk_st0.is_hit && ~pipe_stall),
+        .line_idx    (line_idx_st0),
+        .line_idx_n  (sel_req.req.addr[`CS_LINE_SEL_BITS-1:0]),
+        .line_tag    (line_tag_st0),
+        .evict_way   (evict_way_st0),
+        .tag_matches (tag_matches_st0),
+        .evict_dirty (evict_dirty_st0),
+        .evict_tag   (evict_tag_st0)
     );
 
-    wire [`CS_WAY_SEL_WIDTH-1:0] hit_idx_st0;
     VX_onehot_encoder #(
         .N (NUM_WAYS)
     ) way_idx_enc (
@@ -491,37 +474,94 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
         `UNUSED_PIN (valid_out)
     );
 
-    assign way_idx_st0 = is_creq_st0 ? hit_idx_st0 : evict_way_st0;
-    assign is_hit_st0 = (| tag_matches_st0);
+    // S0 lookup delta (single combinational driver). The AMO requester is forced
+    // non-pending so it never coalesces onto a prior same-line entry.
+    always @(*) begin
+        lk_st0 = '0;
+        lk_st0.is_hit       = (| tag_matches_st0);
+        lk_st0.is_dirty     = evict_dirty_st0;
+        lk_st0.evict_tag    = evict_tag_st0;
+        lk_st0.write_word   = write_word_st0;
+        lk_st0.mshr_previd  = mshr_previd;
+        lk_st0.mshr_pending = mshr_pending_raw && ~is_amo_fwd_st0;
+    end
+
+    // ========================================================================
+    // Pipeline registration
+    //
+    // Tags / replacement / MSHR (allocate AND finalize) stay at S0/S1: the MSHR
+    // coalescing chain requires allocate(S0)->finalize(S1) exactly one cycle
+    // apart (deferring it orphans coalesced same-line entries -> deadlock). Only
+    // the data array (stD) and the commit consumers (stC) defer by PIPE_EX, so
+    // the array is driven by *registered* tag-compare results — breaking the
+    // tag->data critical path. Read and write both move to the same deferred
+    // stage, so pipeline order is preserved (no store->load hazard logic).
+    // ========================================================================
+
+    // data path: carry the request + fill line, with way_idx overridden by the
+    // evict way (victim way on a fill, else the request's own way_idx).
+    always @(*) begin
+        dat_in = st0;
+        dat_in.req.way_idx = evict_way_st0;
+        dat_in.tag_matches = tag_matches_st0; // S0 tag compare rides along to stD
+    end
 
-    wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0;
-    assign mshr_id_st0 = is_replay_st0 ? replay_id_st0 : mshr_alloc_id_st0;
+    // commit path: the request (with its way and MSHR id resolved at S0) plus
+    // the lookup delta. Only st0.req is copied, so the fill line is not carried.
+    always @(*) begin
+        cmt_in.req = st0.req;
+        cmt_in.req.way_idx = st0.req.is_creq ? hit_idx_st0 : evict_way_st0; // core request: hit_idx_st0; otherwise the evict way
+        cmt_in.req.mshr_id = st0.req.is_replay ? st0.req.mshr_id : mshr_alloc_id; // a replay keeps the id it carries; otherwise the S0 allocate id
+        cmt_in.lk = lk_st0;
+    end
 
     VX_pipe_register #(
-        .DATAW  (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(MEM_ATTR_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_TAG_SEL_BITS + `CS_TAG_SEL_BITS + `CS_LINE_SEL_BITS + `CS_WORD_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + AMO_REQ_BITS),
-        .RESETW (1)
-    ) pipe_reg1 (
+        .DATAW  ($bits(data_t)),
+        .RESETW (1),
+        .DEPTH  (PIPE_EX)
+    ) reg_dat (
         .clk      (clk),
         .reset    (reset),
         .enable   (~pipe_stall),
-        .data_in  ({valid_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, is_dirty_st0, is_hit_st0, rw_st0, attr_st0, way_idx_st0, evict_tag_st0, line_tag_st0, line_idx_st0, write_word_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_previd_st0, mshr_pending_st0, amo_st0}),
-        .data_out ({valid_st1, is_fill_st1, is_flush_st1, is_creq_st1, is_replay_st1, is_dirty_st1, is_hit_st1, rw_st1, attr_st1, way_idx_st1, evict_tag_st1, line_tag_st1, line_idx_st1, write_word_st1, byteen_st1, word_idx_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_previd_st1, mshr_pending_st1, amo_st1})
+        .data_in  (dat_in),
+        .data_out (stD)
     );
 
-    if (UUID_WIDTH != 0) begin : g_req_uuid_st1
-        assign req_uuid_st1 = tag_st1[TAG_WIDTH-1 -: UUID_WIDTH];
-    end else begin : g_req_uuid_st1_0
-        assign req_uuid_st1 = '0;
-    end
+    VX_pipe_register #(
+        .DATAW  ($bits(commit_t)),
+        .RESETW (1)
+    ) reg_s1 (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (~pipe_stall),
+        .data_in  (cmt_in),
+        .data_out (st1)
+    );
 
-    assign addr_st1 = {line_tag_st1, line_idx_st1};
+    VX_pipe_register #(
+        .DATAW  ($bits(commit_t)),
+        .RESETW (1),
+        .DEPTH  (PIPE_EX)
+    ) reg_cmt (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (~pipe_stall),
+        .data_in  (st1),
+        .data_out (stC)
+    );
 
-    // ensure mshr replay always get a hit (a passthru-AMO replay carries
-    // its result word instead of an installed line, so it counts as a hit)
-    `RUNTIME_ASSERT (~(valid_st1 && is_replay_st1 && ~eff_hit_st1), ("missed mshr replay"))
+    // a passthru-AMO replay carries its result word instead of an installed line,
+    // so it counts as a hit. NOTE(review): one is_amo_replay_st1 feeds both stages.
+    wire eff_hit_st1 = st1.lk.is_hit || is_amo_replay_st1; // S1 view, used by the replay assert below
+    wire eff_hit_stc = stC.lk.is_hit || is_amo_replay_st1; // stC view; presumably a different request than S1 when PIPE_EX>0 -- confirm alignment
+    `RUNTIME_ASSERT (~(st1.req.valid && st1.req.is_replay && ~eff_hit_st1), ("missed mshr replay"))
 
-    wire[`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] read_data_st1;
-    wire [LINE_SIZE-1:0] evict_byteen_st1;
+    // ========================================================================
+    // Data array (driven at stD; outputs land at stC)
+    // ========================================================================
+    wire[`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] read_data_stc;
+    wire [LINE_SIZE-1:0] evict_byteen_stc;
+    wire [`CS_WORD_WIDTH-1:0] read_word_stc = read_data_stc[stC.req.word_idx];
 
     VX_cache_data #(
         .CACHE_SIZE   (CACHE_SIZE),
@@ -533,52 +573,48 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
         .WRITEBACK    (WRITEBACK),
         .DIRTY_BYTES  (DIRTY_BYTES)
     ) cache_data (
-        .clk        (clk),
-        .reset      (reset),
-        // inputs
-        .init       (do_init_st0),
-        .fill       (do_fill_st0 && ~is_passthru_fill_st0 && ~pipe_stall),
-        .flush      (do_flush_st0 && ~pipe_stall),
-        .read       (do_read_st0 && ~pipe_stall),
-        .write      (do_write_st0 && ~pipe_stall),
-        .evict_way  (evict_way_st0),
-        .tag_matches(tag_matches_st0),
-        .line_idx   (line_idx_st0),
-        .fill_data  (data_st0),
-        .write_word (write_word_st0),
-        .word_idx   (word_idx_st0),
-        .write_byteen(byteen_st0),
-        .way_idx_r  (way_idx_st1),
-        // outputs
-        .read_data  (read_data_st1),
-        .evict_byteen(evict_byteen_st1)
+        .clk          (clk),
+        .reset        (reset),
+        .init         (do_init_std),
+        .fill         (do_fill_std && ~stD.req.is_passthru_fill && ~pipe_stall),
+        .flush        (do_flush_std && ~pipe_stall),
+        .read         (do_read_std && ~pipe_stall),
+        .write        (do_write_std && ~pipe_stall),
+        .evict_way    (stD.req.way_idx),
+        .tag_matches  (stD.tag_matches),
+        .line_idx     (stD.req.addr[`CS_LINE_SEL_BITS-1:0]),
+        .fill_data    (stD.data),
+        .write_word   (stD.data[`CS_WORD_WIDTH-1:0]),
+        .word_idx     (stD.req.word_idx),
+        .write_byteen (stD.req.byteen),
+        .way_idx_r    (stC.req.way_idx),
+        .read_data    (read_data_stc),
+        .evict_byteen (evict_byteen_stc)
     );
 
-    // only allocate MSHR entries for non-replay core requests
-    wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~is_replay_st0;
-    wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~is_replay_st1;
+    // ========================================================================
+    // MSHR (allocate at S0, finalize at S1)
+    // ========================================================================
+    wire mshr_allocate_st0 = st0.req.valid && st0.req.is_creq && ~st0.req.is_replay;
+    wire mshr_finalize_st1 = st1.req.valid && st1.req.is_creq && ~st1.req.is_replay;
 
-    // release allocated mshr entry if we had a hit
+    // release the entry on a hit. A forwarded AMO keeps its entry until its
+    // downstream response returns (fill/dequeue frees it), so never release it.
     wire mshr_release_st1;
-    // A forwarded AMO keeps its entry allocated until its downstream
-    // response returns (the fill/dequeue frees it), so never release it
-    // at finalize even on a local hit.
     if (WRITEBACK) begin : g_mshr_release
-        assign mshr_release_st1 = is_hit_st1 && ~is_amo_fwd_st1;
+        assign mshr_release_st1 = st1.lk.is_hit && ~is_amo_fwd_st1;
     end else begin : g_mshr_release_ro
-        // we need to keep missed write requests in MSHR if there is already a pending entry to the same address.
-        // this ensures that missed write requests are replayed locally in case a pending fill arrives without the write content.
-        // this can happen when writes are sent to memory late, when a related fill was already in flight.
-        assign mshr_release_st1 = (is_hit_st1 || (rw_st1 && ~mshr_pending_st1)) && ~is_amo_fwd_st1;
+        // keep missed writes in MSHR if a pending entry exists for the line, so a
+        // pending fill arriving without the write content replays them locally.
+        assign mshr_release_st1 = (st1.lk.is_hit || (st1.req.rw && ~st1.lk.mshr_pending)) && ~is_amo_fwd_st1;
     end
-
     wire mshr_release_fire = mshr_finalize_st1 && mshr_release_st1 && ~pipe_stall;
 
     wire [1:0] mshr_dequeue;
     `POP_COUNT(mshr_dequeue, {replay_fire, mshr_release_fire});
 
     VX_pending_size #(
-        .SIZE (MSHR_SIZE),
+        .SIZE  (MSHR_SIZE),
         .DECRW (2)
     ) mshr_pending_size (
         .clk   (clk),
@@ -602,58 +638,47 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
         .AMO_ENABLE  ((AMO_ENABLE != 0) && (IS_LLC == 0)),
         .DATA_WIDTH  (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH + AMO_REQ_BITS)
     ) cache_mshr (
-        .clk            (clk),
-        .reset          (reset),
-
-        .deq_req_uuid   (req_uuid_sel),
-        .alc_req_uuid   (req_uuid_st0),
-        .fin_req_uuid   (req_uuid_st1),
-
-        // memory fill
-        .fill_valid     (mem_rsp_fire),
-        .fill_id        (mem_rsp_id),
-        .fill_addr      (mem_rsp_addr),
-
-        // probe: pending entries for the incoming request's line, by type.
-        .probe_addr     (core_req_addr),
-        .probe_pending_ld (mshr_probe_pending_ld),
-        .probe_pending_amo (mshr_probe_pending_amo),
-
-        // dequeue
-        .dequeue_valid  (replay_valid),
-        .dequeue_addr   (replay_addr),
-        .dequeue_rw     (replay_rw),
-        .dequeue_data   ({replay_wsel, replay_byteen, replay_data, replay_tag, replay_idx, replay_amo}),
-        .dequeue_id     (replay_id),
-        .dequeue_ready  (replay_ready),
-
-        // allocate
-        .allocate_valid (mshr_allocate_st0 && ~pipe_stall),
-        .allocate_addr  (addr_st0),
-        .allocate_rw    (rw_st0),
-        // Only non-LLC AMOs must not coalesce (each forwards its own
-        // round-trip). At the LLC, same-line AMOs coalesce and serialize
-        // their commits on the single filled line.
-        .allocate_is_amo((AMO_ENABLE && !IS_LLC) ? amo_st0.amo_valid : 1'b0),
-        .allocate_data  ({word_idx_st0, byteen_st0, write_word_st0, tag_st0, req_idx_st0, amo_st0}),
-        .allocate_id    (mshr_alloc_id_st0),
-        .allocate_pending(mshr_pending_raw_st0),
-        .allocate_previd(mshr_previd_st0),
-        `UNUSED_PIN     (allocate_ready),
-
-        // finalize
-        .finalize_valid (mshr_finalize_st1 && ~pipe_stall),
-        .finalize_is_release(mshr_release_st1),
-        .finalize_is_pending(mshr_pending_st1),
-        .finalize_id    (mshr_id_st1),
-        .finalize_previd(mshr_previd_st1)
+        .clk                 (clk),
+        .reset               (reset),
+        .deq_req_uuid        (req_uuid_sel),
+        .alc_req_uuid        (req_uuid_st0),
+        .fin_req_uuid        (req_uuid_st1),
+        .fill_valid          (mem_rsp_fire),
+        .fill_id             (mem_rsp_id),
+        .fill_addr           (mem_rsp_addr),
+        .probe_addr          (core_req_addr),
+        .probe_pending_ld    (mshr_probe_pending_ld),
+        .probe_pending_amo   (mshr_probe_pending_amo),
+        .dequeue_valid       (replay_valid),
+        .dequeue_addr        (replay_addr),
+        .dequeue_rw          (replay_rw),
+        .dequeue_data        ({replay_wsel, replay_byteen, replay_data, replay_tag, replay_idx, replay_amo}),
+        .dequeue_id          (replay_id),
+        .dequeue_ready       (replay_ready),
+        .allocate_valid      (mshr_allocate_st0 && ~pipe_stall),
+        .allocate_addr       (st0.req.addr),
+        .allocate_rw         (st0.req.rw),
+        // Only non-LLC AMOs must not coalesce; at the LLC same-line AMOs coalesce
+        // and serialize their commits on the single filled line.
+        .allocate_is_amo     ((AMO_ENABLE && !IS_LLC) ? st0.req.amo.amo_valid : 1'b0),
+        .allocate_data       ({st0.req.word_idx, st0.req.byteen, write_word_st0, st0.req.tag, st0.req.req_idx, st0.req.amo}),
+        .allocate_id         (mshr_alloc_id),
+        .allocate_pending    (mshr_pending_raw),
+        .allocate_previd     (mshr_previd),
+        `UNUSED_PIN (allocate_ready),
+        .finalize_valid      (mshr_finalize_st1 && ~pipe_stall),
+        .finalize_is_release (mshr_release_st1),
+        .finalize_is_pending (st1.lk.mshr_pending),
+        .finalize_id         (st1.req.mshr_id),
+        .finalize_previd     (st1.lk.mshr_previd)
     );
 
-    // ============================================================
+    // ========================================================================
     // AMO engine
-    // ============================================================
-    wire [`CS_WORD_WIDTH-1:0] read_word_st1 = read_data_st1[word_idx_st1];
-
+    //
+    // The read word lands at the deferred commit stage stC, so the engine's *_st1
+    // commit ports are wired to stC (== S1 when PIPE_EX=0, the validated case).
+    // ========================================================================
     if (AMO_ENABLE) begin : g_amo
         VX_cache_amo #(
             .IS_LLC          (IS_LLC),
@@ -667,65 +692,69 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
             .ATTR_WIDTH      (`UP(MEM_ATTR_WIDTH)),
             .MSHR_SIZE       (MSHR_SIZE),
             .MSHR_ADDR_WIDTH (MSHR_ADDR_WIDTH),
-            .WORDS_PER_LINE  (`CS_WORDS_PER_LINE)
+            .WORDS_PER_LINE  (`CS_WORDS_PER_LINE),
+            .PIPE_EX         (PIPE_EX)
         ) amo (
-            .clk             (clk),
-            .reset           (reset),
-            .pipe_stall      (pipe_stall),
-            .amo_st0         (amo_st0),
-            .valid_st0       (valid_st0),
-            .is_creq_st0     (is_creq_st0),
-            .is_hit_st0      (is_hit_st0),
-            .is_replay_st0   (is_replay_st0),
-            .amo_st1         (amo_st1),
-            .valid_st1       (valid_st1),
-            .is_creq_st1     (is_creq_st1),
-            .is_hit_st1      (is_hit_st1),
-            .is_replay_st1   (is_replay_st1),
-            .do_write_st1    (do_write_st1),
-            .read_word_st1   (read_word_st1),
-            .byteen_st1      (byteen_st1),
-            .write_word_st1  (write_word_st1),
-            .word_idx_st0    (word_idx_st0),
-            .word_idx_st1    (word_idx_st1),
-            .addr_st0        (addr_st0),
-            .addr_st1        (addr_st1),
-            .tag_st1         (tag_st1),
-            .req_idx_st1     (req_idx_st1),
-            .attr_st1        (attr_st1),
-            .wb_fire         (amo_wb_fire),
-            .mshr_allocate_st0 (mshr_allocate_st0),
-            .mshr_alloc_id_st0 (mshr_alloc_id_st0),
-            .mshr_id_st1     (mshr_id_st1),
-            .mem_rsp_fire    (mem_rsp_fire),
-            .mem_rsp_id      (mem_rsp_id),
-            .mem_rsp_data    (mem_rsp_data),
-            .is_fill_sel     (is_fill_sel),
-            .core_req_valid  (core_req_valid),
-            .core_req_is_amo (core_req_amo.amo_valid),
-            .core_req_rw     (core_req_rw),
-            .core_req_addr   (core_req_addr),
-            .rw_st0          (rw_st0),
-            .mshr_probe_pending_ld (mshr_probe_pending_ld),
+            .clk                    (clk),
+            .reset                  (reset),
+            .pipe_stall             (pipe_stall),
+            .amo_st0                (st0.req.amo),
+            .valid_st0              (st0.req.valid),
+            .is_creq_st0            (st0.req.is_creq),
+            .is_hit_st0             (lk_st0.is_hit),
+            .is_replay_st0          (st0.req.is_replay),
+            // Commit ports are fed from stC (the deferred data-output stage), so
+            // the AMO RMW operands and the read word align at PIPE_EX>0. At
+            // PIPE_EX=0, stC == S1 and this is identical to the classic bank.
+            .amo_st1                (stC.req.amo),
+            .valid_st1              (stC.req.valid),
+            .is_creq_st1            (stC.req.is_creq),
+            .is_hit_st1             (stC.lk.is_hit),
+            .is_replay_st1          (stC.req.is_replay),
+            .do_write_st1           (do_write_stc),
+            .read_word_st1          (read_word_stc),
+            .byteen_st1             (stC.req.byteen),
+            .write_word_st1         (stC.lk.write_word),
+            .word_idx_st0           (st0.req.word_idx),
+            .word_idx_st1           (stC.req.word_idx),
+            .addr_st0               (st0.req.addr),
+            .addr_st1               (addr_stc),
+            .tag_st1                (stC.req.tag),
+            .req_idx_st1            (stC.req.req_idx),
+            .attr_st1               (stC.req.attr),
+            .wb_fire                (amo_wb_fire),
+            .mshr_allocate_st0      (mshr_allocate_st0),
+            .mshr_alloc_id_st0      (mshr_alloc_id),
+            .mshr_id_st1            (stC.req.mshr_id),
+            .mem_rsp_fire           (mem_rsp_fire),
+            .mem_rsp_id             (mem_rsp_id),
+            .mem_rsp_data           (mem_rsp_data),
+            .is_fill_sel            (fill_enable),
+            .core_req_valid         (core_req_valid),
+            .core_req_is_amo        (core_req_amo.amo_valid),
+            .core_req_rw            (core_req_rw),
+            .core_req_addr          (core_req_addr),
+            .rw_st0                 (st0.req.rw),
+            .mshr_probe_pending_ld  (mshr_probe_pending_ld),
             .mshr_probe_pending_amo (mshr_probe_pending_amo),
-            .amo_hit_st1     (amo_hit_st1),
-            .commit_busy     (amo_commit_busy),
-            .chain_stall     (amo_chain_stall),
-            .wb_pending      (amo_wb_pending),
-            .rsp_data        (amo_rsp_data),
-            .wb_addr         (amo_wb_addr),
-            .wb_word_idx     (amo_wb_word_idx),
-            .wb_byteen       (amo_wb_byteen),
-            .wb_data         (amo_wb_data),
-            .wb_tag          (amo_wb_tag),
-            .wb_idx          (amo_wb_idx),
-            .wb_attr         (amo_wb_attr),
-            .is_amo_fwd_st0  (is_amo_fwd_st0),
-            .is_amo_fwd_st1  (is_amo_fwd_st1),
-            .is_amo_replay_st1 (is_amo_replay_st1),
-            .is_passthru_fill_sel (is_passthru_fill_sel),
-            .amo_ptw_word_st1 (amo_ptw_word_st1),
-            .req_input_defer (req_input_defer)
+            .amo_hit_st1            (amo_hit_st1),
+            .commit_busy            (amo_commit_busy),
+            .chain_stall            (amo_chain_stall),
+            .wb_pending             (amo_wb_pending),
+            .rsp_data               (amo_rsp_data),
+            .wb_addr                (amo_wb_addr),
+            .wb_word_idx            (amo_wb_word_idx),
+            .wb_byteen              (amo_wb_byteen),
+            .wb_data                (amo_wb_data),
+            .wb_tag                 (amo_wb_tag),
+            .wb_idx                 (amo_wb_idx),
+            .wb_attr                (amo_wb_attr),
+            .is_amo_fwd_st0         (is_amo_fwd_st0),
+            .is_amo_fwd_st1         (is_amo_fwd_st1),
+            .is_amo_replay_st1      (is_amo_replay_st1),
+            .is_passthru_fill_sel   (is_passthru_fill_sel),
+            .amo_ptw_word_st1       (amo_ptw_word_st1),
+            .req_input_defer        (req_input_defer)
         );
     end else begin : g_no_amo
         assign {amo_hit_st1, amo_commit_busy, amo_wb_pending, amo_chain_stall} = '0;
@@ -733,42 +762,22 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
         assign {amo_wb_data, amo_wb_tag, amo_wb_idx, amo_wb_attr} = '0;
         assign {is_amo_fwd_st0, is_amo_fwd_st1, is_amo_replay_st1} = '0;
         assign {is_passthru_fill_sel, amo_ptw_word_st1, req_input_defer} = '0;
-        `UNUSED_VAR (amo_st1)
-        `UNUSED_VAR (amo_wb_fire)
-        `UNUSED_VAR (mshr_probe_pending_ld)
-        `UNUSED_VAR (mshr_probe_pending_amo)
+        // Marked unused when the AMO engine is absent. NOTE(review): the engine's ports take stC.*, not st1.* -- confirm the st1.* entries.
+        `UNUSED_VAR ({amo_wb_fire, mshr_probe_pending_ld, mshr_probe_pending_amo, st1.req.amo, st1.req.attr, st1.req.req_idx, st1.req.word_idx, st1.req.byteen, st1.lk.write_word})
     end
 
-    // Force the AMO requester non-pending so it never coalesces onto a prior
-    // entry for the same line — each atomic takes its own downstream trip.
-    assign mshr_pending_st0 = mshr_pending_raw_st0 && ~is_amo_fwd_st0;
-
-    // Passthru replay counts as a hit (its line was never installed): fires
-    // the core response, allocates no mreq, releases the MSHR entry.
-    wire eff_hit_st1 = is_hit_st1 || is_amo_replay_st1;
-
-    // schedule core response
-
-    wire crsp_queue_valid, crsp_queue_ready;
-    wire [`CS_WORD_WIDTH-1:0] crsp_queue_data;
-    wire [REQ_SEL_WIDTH-1:0] crsp_queue_idx;
-    wire [TAG_WIDTH-1:0] crsp_queue_tag;
-
-    // crsp_queue fires for reads and AMO commits at S1 on hit, but not
-    // for the synthetic writeback write (rw=1). A non-LLC AMO's first
-    // pass forwards downstream and must NOT respond locally; its result
-    // returns later via the passthru replay (eff_hit covers that replay).
-    // Suppress the response while a same-line AMO is chain-stalled at S1, so a
-    // read held for the extra pacing cycle enqueues its response exactly once
-    // (it fires when the op advances). amo_chain_stall is 0 for non-AMO traffic.
-    assign crsp_queue_valid = do_read_st1 && eff_hit_st1 && ~is_amo_fwd_st1 && ~amo_chain_stall;
-    assign crsp_queue_idx   = req_idx_st1;
-    // Response data: passthru replay returns the latched downstream result,
-    // an LLC AMO commit returns its formatted result word, else plain load.
-    assign crsp_queue_data  = is_amo_replay_st1 ? amo_ptw_word_st1
-                            : (amo_hit_st1 ? amo_rsp_data
-                            : read_word_st1);
-    assign crsp_queue_tag   = tag_st1;
+    // ========================================================================
+    // Core response (stC)
+    //
+    // Fires for reads (and LLC AMO commits) on hit, never for the synthetic
+    // writeback (rw=1). A non-LLC AMO's first pass forwards downstream and must
+    // not respond locally (its result returns via the passthru replay). Suppress
+    // while a same-line AMO is chain-stalled so a held read enqueues once.
+    // ========================================================================
+    wire crsp_queue_valid = do_read_stc && eff_hit_stc && ~is_amo_fwd_st1 && ~amo_chain_stall;
+    wire crsp_queue_ready;
+    wire [`CS_WORD_WIDTH-1:0] crsp_queue_data = is_amo_replay_st1 ? amo_ptw_word_st1
+                                              : (amo_hit_st1 ? amo_rsp_data : read_word_stc);
 
     VX_elastic_buffer #(
         .DATAW   (TAG_WIDTH + `CS_WORD_WIDTH + REQ_SEL_WIDTH),
@@ -779,98 +788,81 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
         .reset     (reset),
         .valid_in  (crsp_queue_valid),
         .ready_in  (crsp_queue_ready),
-        .data_in   ({crsp_queue_tag, crsp_queue_data, crsp_queue_idx}),
+        .data_in   ({stC.req.tag, crsp_queue_data, stC.req.req_idx}),
         .data_out  ({core_rsp_tag, core_rsp_data, core_rsp_idx}),
         .valid_out (core_rsp_valid),
         .ready_out (core_rsp_ready)
     );
-
     assign crsp_queue_stall = crsp_queue_valid && ~crsp_queue_ready;
 
-    // schedule memory request
-
+    // ========================================================================
+    // Memory request (stC)
+    // ========================================================================
     wire mreq_queue_push, mreq_queue_pop;
     wire [`CS_LINE_WIDTH-1:0] mreq_queue_data;
     wire [LINE_SIZE-1:0] mreq_queue_byteen;
     wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_queue_addr;
     wire [MEM_TAG_WIDTH-1:0] mreq_queue_tag;
     wire mreq_queue_rw;
-    wire [`UP(MEM_ATTR_WIDTH)-1:0] mreq_queue_attr;
 
-    wire is_fill_or_flush_st1 = is_fill_st1 || (is_flush_st1 && WRITEBACK);
-    wire do_fill_or_flush_st1 = valid_st1 && is_fill_or_flush_st1;
-    wire do_writeback_st1 = do_fill_or_flush_st1 && is_dirty_st1;
-    wire [`CS_LINE_ADDR_WIDTH-1:0] evict_addr_st1 = {evict_tag_st1, line_idx_st1};
+    wire is_fill_or_flush_stc = stC.req.is_fill || (stC.req.is_flush && WRITEBACK);
+    wire do_fill_or_flush_stc = stC.req.valid && is_fill_or_flush_stc;
+    wire do_writeback_stc = do_fill_or_flush_stc && stC.lk.is_dirty;
+    wire [`CS_LINE_ADDR_WIDTH-1:0] evict_addr_stc = {stC.lk.evict_tag, stC.req.addr[`CS_LINE_SEL_BITS-1:0]};
 
     if (WRITE_ENABLE) begin : g_mreq_queue
         if (WRITEBACK) begin : g_wb
             if (DIRTY_BYTES) begin : g_dirty_bytes
-                // ensure dirty bytes match the tag info
-                wire has_dirty_bytes = (| evict_byteen_st1);
-                `RUNTIME_ASSERT (~do_fill_or_flush_st1 || (is_dirty_st1 == has_dirty_bytes), ("missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", is_dirty_st1, has_dirty_bytes, `CS_BANK_TO_FULL_ADDR(addr_st1, BANK_ID)))
+                wire has_dirty_bytes = (| evict_byteen_stc);
+                `RUNTIME_ASSERT (~do_fill_or_flush_stc || (stC.lk.is_dirty == has_dirty_bytes), ("missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", stC.lk.is_dirty, has_dirty_bytes, `CS_BANK_TO_FULL_ADDR(addr_stc, BANK_ID)))
             end
-            // issue a fill request on a read/write miss
-            // issue a writeback on a dirty line eviction
-            assign mreq_queue_push = ((do_lookup_st1 && ~is_hit_st1 && ~mshr_pending_st1)
-                                   || do_writeback_st1)
-                                  && ~pipe_stall;
-            assign mreq_queue_addr = is_fill_or_flush_st1 ? evict_addr_st1 : addr_st1;
-            assign mreq_queue_rw = is_fill_or_flush_st1;
-            assign mreq_queue_data = read_data_st1;
-            assign mreq_queue_byteen = is_fill_or_flush_st1 ? evict_byteen_st1 : '1;
-            `UNUSED_VAR (write_word_st1)
-            `UNUSED_VAR (byteen_st1)
+            // fill on a read/write miss; writeback on a dirty-line eviction.
+            assign mreq_queue_push = (((do_read_stc || do_write_stc) && ~stC.lk.is_hit && ~stC.lk.mshr_pending)
+                                   || do_writeback_stc) && ~pipe_stall;
+            assign mreq_queue_addr = is_fill_or_flush_stc ? evict_addr_stc : addr_stc;
+            assign mreq_queue_rw = is_fill_or_flush_stc;
+            assign mreq_queue_data = read_data_stc;
+            assign mreq_queue_byteen = is_fill_or_flush_stc ? evict_byteen_stc : '1;
+            `UNUSED_VAR ({stC.lk.write_word, stC.req.byteen, stC.req.is_replay})
         end else begin : g_wt
             wire [LINE_SIZE-1:0] line_byteen;
             VX_demux #(
                 .DATAW (WORD_SIZE),
-                .N (`CS_WORDS_PER_LINE)
+                .N     (`CS_WORDS_PER_LINE)
             ) byteen_demux (
-                .sel_in   (word_idx_st1),
-                .data_in  (byteen_st1),
+                .sel_in   (stC.req.word_idx),
+                .data_in  (stC.req.byteen),
                 .data_out (line_byteen)
             );
-            // issue a fill request on a read miss
-            // issue a memory write on a write request (ensure write replays don't send again)
-            // forward a non-LLC AMO downstream (always, even on a local hit);
-            // its passthru replay (eff_hit) must NOT re-issue a fill.
-            assign mreq_queue_push = ((do_read_st1 && ~eff_hit_st1 && ~mshr_pending_st1)
-                                  || (do_write_st1 && ~is_replay_st1)
-                                  || is_amo_fwd_st1)
-                                  && ~pipe_stall;
-            assign mreq_queue_addr = addr_st1;
-            assign mreq_queue_rw = rw_st1;
-            assign mreq_queue_data = {`CS_WORDS_PER_LINE{write_word_st1}};
-            // an AMO forward carries its single word's byteen (rw=0 but the
-            // downstream LLC reads it via the AMO sideband, not as a write).
-            assign mreq_queue_byteen = (rw_st1 || is_amo_fwd_st1) ? line_byteen : '1;
-            `UNUSED_VAR (is_fill_or_flush_st1)
-            `UNUSED_VAR (do_writeback_st1)
-            `UNUSED_VAR (evict_addr_st1)
-            `UNUSED_VAR (evict_byteen_st1)
+            // fill on a read miss; memory write on a write (don't resend replays);
+            // forward a non-LLC AMO downstream (its passthru replay must not refill).
+            assign mreq_queue_push = ((do_read_stc && ~eff_hit_stc && ~stC.lk.mshr_pending)
+                                  || (do_write_stc && ~stC.req.is_replay)
+                                  || is_amo_fwd_st1) && ~pipe_stall;
+            assign mreq_queue_addr = addr_stc;
+            assign mreq_queue_rw = stC.req.rw;
+            assign mreq_queue_data = {`CS_WORDS_PER_LINE{stC.lk.write_word}};
+            // an AMO forward carries its single word's byteen (read downstream via
+            // the AMO sideband, not as a write).
+            assign mreq_queue_byteen = (stC.req.rw || is_amo_fwd_st1) ? line_byteen : '1;
+            `UNUSED_VAR ({is_fill_or_flush_stc, do_writeback_stc, evict_addr_stc, evict_byteen_stc, stC.lk.evict_tag, stC.lk.is_dirty})
         end
     end else begin : g_mreq_queue_ro
-        // issue a fill request on a read miss
-        assign mreq_queue_push = (do_read_st1 && ~is_hit_st1 && ~mshr_pending_st1) && ~pipe_stall;
-        assign mreq_queue_addr = addr_st1;
+        assign mreq_queue_push = (do_read_stc && ~stC.lk.is_hit && ~stC.lk.mshr_pending) && ~pipe_stall;
+        assign mreq_queue_addr = addr_stc;
         assign mreq_queue_rw = 0;
         assign mreq_queue_data = '0;
         assign mreq_queue_byteen = '1;
-        `UNUSED_VAR (do_writeback_st1)
-        `UNUSED_VAR (evict_addr_st1)
-        `UNUSED_VAR (evict_byteen_st1)
-        `UNUSED_VAR (write_word_st1)
-        `UNUSED_VAR (byteen_st1)
+        `UNUSED_VAR ({do_writeback_stc, evict_addr_stc, evict_byteen_stc, stC.lk.write_word, stC.lk.evict_tag, stC.lk.is_dirty, stC.req.byteen, stC.req.word_idx, stC.req.is_replay, do_write_stc})
     end
 
     if (UUID_WIDTH != 0) begin : g_mreq_queue_tag_uuid
-        assign mreq_queue_tag = {req_uuid_st1, mshr_id_st1};
+        assign mreq_queue_tag = {req_uuid_stc, stC.req.mshr_id};
     end else begin : g_mreq_queue_tag
-        assign mreq_queue_tag = mshr_id_st1;
+        assign mreq_queue_tag = stC.req.mshr_id;
     end
 
     assign mreq_queue_pop = mem_req_valid && mem_req_ready;
-    assign mreq_queue_attr = attr_st1;
 
     VX_fifo_queue #(
         .DATAW    (1 + `CS_LINE_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(MEM_ATTR_WIDTH)),
@@ -878,19 +870,18 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
         .ALM_FULL (MREQ_SIZE - PIPELINE_STAGES),
         .OUT_REG  (`TO_OUT_BUF_REG(MEM_OUT_BUF))
     ) mem_req_queue (
-        .clk        (clk),
-        .reset      (reset),
-        .push       (mreq_queue_push),
-        .pop        (mreq_queue_pop),
-        .data_in    ({mreq_queue_rw, mreq_queue_addr, mreq_queue_byteen, mreq_queue_data, mreq_queue_tag, mreq_queue_attr}),
-        .data_out   ({mem_req_rw,    mem_req_addr,    mem_req_byteen,    mem_req_data,    mem_req_tag,    mem_req_attr}),
-        .empty      (mreq_queue_empty),
-        .alm_full   (mreq_queue_alm_full),
+        .clk      (clk),
+        .reset    (reset),
+        .push     (mreq_queue_push),
+        .pop      (mreq_queue_pop),
+        .data_in  ({mreq_queue_rw, mreq_queue_addr, mreq_queue_byteen, mreq_queue_data, mreq_queue_tag, stC.req.attr}),
+        .data_out ({mem_req_rw, mem_req_addr, mem_req_byteen, mem_req_data, mem_req_tag, mem_req_attr}),
+        .empty    (mreq_queue_empty),
+        .alm_full (mreq_queue_alm_full),
         `UNUSED_PIN (full),
         `UNUSED_PIN (alm_empty),
         `UNUSED_PIN (size)
     );
-
     assign mem_req_valid = ~mreq_queue_empty;
 
     `UNUSED_VAR (do_lookup_st0)
@@ -898,9 +889,9 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
 ///////////////////////////////////////////////////////////////////////////////
 
 `ifdef PERF_ENABLE
-    assign perf_read_miss  = do_read_st1 && ~is_hit_st1;
-    assign perf_write_miss = do_write_st1 && ~is_hit_st1;
-    assign perf_evictions  = do_writeback_st1; // dirty-line writeback eviction
+    assign perf_read_miss  = do_read_st1 && ~st1.lk.is_hit;
+    assign perf_write_miss = do_write_st1 && ~st1.lk.is_hit;
+    assign perf_evictions  = do_writeback_stc;
     assign perf_mshr_stall = mshr_alm_full;
 `endif
 
@@ -912,8 +903,8 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
     wire [`VX_CFG_XLEN-1:0] mem_rsp_full_addr = `CS_BANK_TO_FULL_ADDR(mem_rsp_addr, BANK_ID);
     wire [`VX_CFG_XLEN-1:0] replay_full_addr = `CS_BANK_TO_FULL_ADDR(replay_addr, BANK_ID);
     wire [`VX_CFG_XLEN-1:0] core_req_full_addr = `CS_BANK_TO_FULL_ADDR(core_req_addr, BANK_ID);
-    wire [`VX_CFG_XLEN-1:0] full_addr_st0 = `CS_BANK_TO_FULL_ADDR(addr_st0, BANK_ID);
-    wire [`VX_CFG_XLEN-1:0] full_addr_st1 = `CS_BANK_TO_FULL_ADDR(addr_st1, BANK_ID);
+    wire [`VX_CFG_XLEN-1:0] full_addr_st0 = `CS_BANK_TO_FULL_ADDR(st0.req.addr, BANK_ID);
+    wire [`VX_CFG_XLEN-1:0] full_addr_st1 = `CS_BANK_TO_FULL_ADDR(st1.req.addr, BANK_ID);
     wire [`VX_CFG_XLEN-1:0] mreq_queue_full_addr = `CS_BANK_TO_FULL_ADDR(mreq_queue_addr, BANK_ID);
 
     always @(posedge clk) begin
@@ -943,51 +934,51 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
         end
         if (do_fill_st0 && ~pipe_stall) begin
             `TRACE(3, ("%t: %s tags-fill: addr=0x%0h, way=%0d, line=%0d, dirty=%b (#%0d)\n", $time, INSTANCE_ID,
-                full_addr_st0, evict_way_st0, line_idx_st0, is_dirty_st0, req_uuid_st0))
+                full_addr_st0, evict_way_st0, line_idx_st0, lk_st0.is_dirty, req_uuid_st0))
         end
         if (do_flush_st0 && ~pipe_stall) begin
             `TRACE(3, ("%t: %s tags-flush: addr=0x%0h, way=%0d, line=%0d, dirty=%b (#%0d)\n", $time, INSTANCE_ID,
-                full_addr_st0, evict_way_st0, line_idx_st0, is_dirty_st0, req_uuid_st0))
+                full_addr_st0, evict_way_st0, line_idx_st0, lk_st0.is_dirty, req_uuid_st0))
         end
         if (do_lookup_st0 && ~pipe_stall) begin
-            if (is_hit_st0) begin
+            if (lk_st0.is_hit) begin
                 `TRACE(3, ("%t: %s tags-hit: addr=0x%0h, rw=%b, way=%0d, line=%0d, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID,
-                    full_addr_st0, rw_st0, way_idx_st0, line_idx_st0, line_tag_st0, req_uuid_st0))
+                    full_addr_st0, st0.req.rw, hit_idx_st0, line_idx_st0, line_tag_st0, req_uuid_st0))
             end else begin
                 `TRACE(3, ("%t: %s tags-miss: addr=0x%0h, rw=%b, way=%0d, line=%0d, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID,
-                    full_addr_st0, rw_st0, way_idx_st0, line_idx_st0, line_tag_st0, req_uuid_st0))
+                    full_addr_st0, st0.req.rw, hit_idx_st0, line_idx_st0, line_tag_st0, req_uuid_st0))
             end
         end
         if (do_fill_st0 && ~pipe_stall) begin
             `TRACE(3, ("%t: %s data-fill: addr=0x%0h, way=%0d, line=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
-                full_addr_st0, way_idx_st0, line_idx_st0, data_st0, req_uuid_st0))
+                full_addr_st0, evict_way_st0, line_idx_st0, st0.data, req_uuid_st0))
         end
         if (do_flush_st0 && ~pipe_stall) begin
             `TRACE(3, ("%t: %s data-flush: addr=0x%0h, way=%0d, line=%0d (#%0d)\n", $time, INSTANCE_ID,
-                full_addr_st0, way_idx_st0, line_idx_st0, req_uuid_st0))
+                full_addr_st0, evict_way_st0, line_idx_st0, req_uuid_st0))
         end
-        if (do_read_st1 && is_hit_st1 && ~pipe_stall) begin
-            `TRACE(3, ("%t: %s data-read: addr=0x%0h, way=%0d, line=%0d, wsel=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
-                full_addr_st1, way_idx_st1, line_idx_st1, word_idx_st1, crsp_queue_data, req_uuid_st1))
+        if (do_read_st1 && st1.lk.is_hit && ~pipe_stall) begin
+            `TRACE(3, ("%t: %s data-read: addr=0x%0h, way=%0d, line=%0d, wsel=%0d (#%0d)\n", $time, INSTANCE_ID,
+                full_addr_st1, st1.req.way_idx, st1.req.addr[`CS_LINE_SEL_BITS-1:0], st1.req.word_idx, req_uuid_st1))
         end
-        if (do_write_st1 && is_hit_st1 && ~pipe_stall) begin
+        if (do_write_st1 && st1.lk.is_hit && ~pipe_stall) begin
             `TRACE(3, ("%t: %s data-write: addr=0x%0h, way=%0d, line=%0d, wsel=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
-                full_addr_st1, way_idx_st1, line_idx_st1, word_idx_st1, byteen_st1, write_word_st1, req_uuid_st1))
+                full_addr_st1, st1.req.way_idx, st1.req.addr[`CS_LINE_SEL_BITS-1:0], st1.req.word_idx, st1.req.byteen, st1.lk.write_word, req_uuid_st1))
         end
         if (crsp_queue_fire) begin
             `TRACE(2, ("%t: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
-                full_addr_st1, crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1))
+                `CS_BANK_TO_FULL_ADDR(addr_stc, BANK_ID), stC.req.tag, stC.req.req_idx, crsp_queue_data, req_uuid_stc))
         end
         if (mreq_queue_push) begin
-            if (!WRITEBACK && do_write_st1) begin
+            if (!WRITEBACK && do_write_stc) begin
                 `TRACE(2, ("%t: %s writethrough: addr=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
-                    mreq_queue_full_addr, mreq_queue_byteen, mreq_queue_data, req_uuid_st1))
-            end else if (WRITEBACK && do_writeback_st1) begin
+                    mreq_queue_full_addr, mreq_queue_byteen, mreq_queue_data, req_uuid_stc))
+            end else if (WRITEBACK && do_writeback_stc) begin
                 `TRACE(2, ("%t: %s writeback: addr=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
-                    mreq_queue_full_addr, mreq_queue_byteen, mreq_queue_data, req_uuid_st1))
+                    mreq_queue_full_addr, mreq_queue_byteen, mreq_queue_data, req_uuid_stc))
             end else begin
                 `TRACE(2, ("%t: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID,
-                    mreq_queue_full_addr, mshr_id_st1, req_uuid_st1))
+                    mreq_queue_full_addr, stC.req.mshr_id, req_uuid_stc))
             end
         end
     end
diff --git a/hw/rtl/cache/VX_cache_cluster.sv b/hw/rtl/cache/VX_cache_cluster.sv
index 981b69ec3..6971eb4a5 100644
--- a/hw/rtl/cache/VX_cache_cluster.sv
+++ b/hw/rtl/cache/VX_cache_cluster.sv
@@ -46,6 +46,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
     // Memory Request Queue Size
     parameter MREQ_SIZE             = 4,
 
+    // Bank pipeline depth (2 = classic lookup+commit; larger defers the data array)
+    parameter LATENCY               = 2,
+
     // Enable cache writeable
     parameter WRITE_ENABLE          = 1,
 
@@ -167,6 +170,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
             .MSHR_SIZE    (MSHR_SIZE),
             .MRSQ_SIZE    (MRSQ_SIZE),
             .MREQ_SIZE    (MREQ_SIZE),
+            .LATENCY      (LATENCY),
             .TAG_WIDTH    (ARB_TAG_WIDTH),
             .TAG_SEL_IDX  (TAG_SEL_IDX),
             .CORE_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : CORE_OUT_BUF),
diff --git a/hw/rtl/cache/VX_cache_wrap.sv b/hw/rtl/cache/VX_cache_wrap.sv
index f5511cc2f..b4e310e9f 100644
--- a/hw/rtl/cache/VX_cache_wrap.sv
+++ b/hw/rtl/cache/VX_cache_wrap.sv
@@ -44,6 +44,9 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
     // Memory Request Queue Size
     parameter MREQ_SIZE             = 4,
 
+    // Bank pipeline depth (2 = classic lookup+commit; larger defers the data array)
+    parameter LATENCY               = 2,
+
     // Enable cache writeable
     parameter WRITE_ENABLE          = 1,
 
@@ -187,6 +190,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
             .MSHR_SIZE    (MSHR_SIZE),
             .MRSQ_SIZE    (MRSQ_SIZE),
             .MREQ_SIZE    (MREQ_SIZE),
+            .LATENCY      (LATENCY),
             .TAG_WIDTH    (TAG_WIDTH),
             .CORE_OUT_BUF (BYPASS_ENABLE ? 1 : CORE_OUT_BUF),
             .MEM_OUT_BUF  (BYPASS_ENABLE ? 1 : MEM_OUT_BUF),
diff --git a/hw/unittest/cache/VX_cache_top.sv b/hw/unittest/cache/VX_cache_top.sv
index dc51ae59c..3c8be3588 100644
--- a/hw/unittest/cache/VX_cache_top.sv
+++ b/hw/unittest/cache/VX_cache_top.sv
@@ -22,16 +22,16 @@ module VX_cache_top import VX_gpu_pkg::*; #(
     // Number of memory ports
     parameter MEM_PORTS             = 1,
 
-    // Size of cache in bytes
-    parameter CACHE_SIZE            = 65536,
+    // Size of cache in bytes (L2 config: reproduces the 1MB 8-way data array)
+    parameter CACHE_SIZE            = `VX_CFG_L2_CACHE_SIZE,
     // Size of line inside a bank in bytes
-    parameter LINE_SIZE             = 64,
+    parameter LINE_SIZE             = `VX_CFG_L2_LINE_SIZE,
     // Number of banks
-    parameter NUM_BANKS             = 4,
+    parameter NUM_BANKS             = 8,
     // Number of associative ways
-    parameter NUM_WAYS              = 4,
-    // Size of a word in bytes
-    parameter WORD_SIZE             = 16,
+    parameter NUM_WAYS              = `VX_CFG_L2_NUM_WAYS,
+    // Size of a word in bytes (L2 word = L1 line = 512-bit data-array slice)
+    parameter WORD_SIZE             = `VX_CFG_L2_LINE_SIZE,
 
     // Core Response Queue Size
     parameter CRSQ_SIZE             = 8,
@@ -45,11 +45,14 @@ module VX_cache_top import VX_gpu_pkg::*; #(
     // Enable cache writeable
     parameter WRITE_ENABLE          = 1,
 
-    // Enable cache writeback
-    parameter WRITEBACK             = 1,
+    // Enable cache writeback (L2 ships writethrough)
+    parameter WRITEBACK             = `VX_CFG_L2_WRITEBACK,
 
     // Enable dirty bytes on writeback
-    parameter DIRTY_BYTES           = 1,
+    parameter DIRTY_BYTES           = `VX_CFG_L2_DIRTYBYTES,
+
+    // Bank pipeline depth (L2 deferral: 4 above 64KB)
+    parameter LATENCY               = `VX_CFG_L2_LATENCY,
 
     // core request tag size
     parameter TAG_WIDTH             = 16 + UUID_WIDTH,
@@ -57,8 +60,12 @@ module VX_cache_top import VX_gpu_pkg::*; #(
     // Core response output buffer
     parameter CORE_OUT_BUF          = 3,
 
-    // Enable AMO support (tracks the A extension by default)
-    parameter AMO_ENABLE            = `VX_CFG_EXT_A_ENABLED,
+    // Enable AMO support (1 = AMO timing run; set to 0 to reproduce synth #1)
+    parameter AMO_ENABLE            = 1,
+
+    // LLC role: 1 exercises the local AMO read-modify-write commit (g_commit),
+    // the path that interacts with the deferred data pipeline.
+    parameter IS_LLC                = 1,
 
     // Memory request output buffer
     parameter MEM_OUT_BUF           = 3,
@@ -166,11 +173,13 @@ module VX_cache_top import VX_gpu_pkg::*; #(
         .MSHR_SIZE      (MSHR_SIZE),
         .MRSQ_SIZE      (MRSQ_SIZE),
         .MREQ_SIZE      (MREQ_SIZE),
+        .LATENCY        (LATENCY),
         .TAG_WIDTH      (TAG_WIDTH),
         .WRITE_ENABLE   (WRITE_ENABLE),
         .WRITEBACK      (WRITEBACK),
         .DIRTY_BYTES    (DIRTY_BYTES),
         .AMO_ENABLE     (AMO_ENABLE),
+        .IS_LLC         (IS_LLC),
         .CORE_OUT_BUF   (CORE_OUT_BUF),
         .MEM_OUT_BUF    (MEM_OUT_BUF)
     ) cache (
diff --git a/sim/simx/cluster.cpp b/sim/simx/cluster.cpp
index cd19d6216..968460f1e 100644
--- a/sim/simx/cluster.cpp
+++ b/sim/simx/cluster.cpp
@@ -79,7 +79,7 @@ class Cluster::Impl {
       VX_CFG_L2_WRITEBACK,           // write-back
       false,                  // write response
       VX_CFG_L2_MSHR_SIZE,           // mshr size
-      2,                      // pipeline latency
+      VX_CFG_L2_LATENCY,             // pipeline latency
       VX_CFG_L2_REPL_POLICY,         // replacement policy
       (VX_CFG_L2_ENABLED != 0) && (VX_CFG_L3_ENABLED == 0), // is_llc
     });
diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp
index 68480fa16..aa9d58a98 100644
--- a/sim/simx/processor.cpp
+++ b/sim/simx/processor.cpp
@@ -74,7 +74,7 @@ ProcessorImpl::ProcessorImpl()
     VX_CFG_L3_WRITEBACK,             // write-back
     false,                    // write response
     VX_CFG_L3_MSHR_SIZE,             // mshr size
-    2,                        // pipeline latency
+    VX_CFG_L3_LATENCY,               // pipeline latency
     VX_CFG_L3_REPL_POLICY,           // replacement policy
     VX_CFG_L3_ENABLED != 0,          // is_llc when L3 is the LLC
     }