diff --git a/VX_config.toml b/VX_config.toml index 1072c1250..a8cd1875e 100644 --- a/VX_config.toml +++ b/VX_config.toml @@ -185,7 +185,8 @@ VX_CFG_DCACHE_WRITEBACK = 0 VX_CFG_DCACHE_DIRTYBYTES = "expr: $VX_CFG_DCACHE_WRITEBACK" VX_CFG_DCACHE_REPL_POLICY = "expr: $__cache_repl_fifo" VX_CFG_DCACHE_MSHR_SIZE = 16 -VX_CFG_DCACHE_MREQ_SIZE = "expr: 4 + $VX_CFG_DCACHE_WRITEBACK * ($VX_CFG_DCACHE_MSHR_SIZE - 4)" +VX_CFG_DCACHE_LATENCY = 2 +VX_CFG_DCACHE_MREQ_SIZE = "expr: 2 * $VX_CFG_DCACHE_LATENCY + $VX_CFG_DCACHE_WRITEBACK * ($VX_CFG_DCACHE_MSHR_SIZE - 2 * $VX_CFG_DCACHE_LATENCY)" VX_CFG_DCACHE_MRSQ_SIZE = 4 VX_CFG_DCACHE_CRSQ_SIZE = 2 @@ -200,11 +201,12 @@ VX_CFG_L1_MEM_PORTS = "expr: min($VX_CFG_DCACHE_NUM_BANKS, $VX_CFG_PLATFORM_MEMO [l2cache] VX_CFG_L2_CACHE_SIZE = 1048576 VX_CFG_L2_NUM_WAYS = 8 -VX_CFG_L2_WRITEBACK = 1 +VX_CFG_L2_WRITEBACK = 0 VX_CFG_L2_DIRTYBYTES = "expr: $VX_CFG_L2_WRITEBACK" VX_CFG_L2_REPL_POLICY = "expr: $__cache_repl_fifo" VX_CFG_L2_MSHR_SIZE = 16 -VX_CFG_L2_MREQ_SIZE = "expr: 4 + $VX_CFG_L2_WRITEBACK * ($VX_CFG_L2_MSHR_SIZE - 4)" +VX_CFG_L2_LATENCY = 4 +VX_CFG_L2_MREQ_SIZE = "expr: 2 * $VX_CFG_L2_LATENCY + $VX_CFG_L2_WRITEBACK * ($VX_CFG_L2_MSHR_SIZE - 2 * $VX_CFG_L2_LATENCY)" VX_CFG_L2_MRSQ_SIZE = 4 VX_CFG_L2_CRSQ_SIZE = 2 @@ -214,11 +216,12 @@ VX_CFG_L2_MEM_PORTS = "expr: min($VX_CFG_L2_NUM_BANKS, $VX_CFG_PLATFORM_MEMORY_N [l3cache] VX_CFG_L3_CACHE_SIZE = 2097152 VX_CFG_L3_NUM_WAYS = 8 -VX_CFG_L3_WRITEBACK = 1 +VX_CFG_L3_WRITEBACK = 0 VX_CFG_L3_DIRTYBYTES = "expr: $VX_CFG_L3_WRITEBACK" VX_CFG_L3_REPL_POLICY = "expr: $__cache_repl_fifo" VX_CFG_L3_MSHR_SIZE = 16 -VX_CFG_L3_MREQ_SIZE = "expr: 4 + $VX_CFG_L3_WRITEBACK * ($VX_CFG_L3_MSHR_SIZE - 4)" +VX_CFG_L3_LATENCY = 4 +VX_CFG_L3_MREQ_SIZE = "expr: 2 * $VX_CFG_L3_LATENCY + $VX_CFG_L3_WRITEBACK * ($VX_CFG_L3_MSHR_SIZE - 2 * $VX_CFG_L3_LATENCY)" VX_CFG_L3_MRSQ_SIZE = 4 VX_CFG_L3_CRSQ_SIZE = 2 diff --git a/docs/proposals/cache_elastic_pipeline_proposal.md 
b/docs/proposals/cache_elastic_pipeline_proposal.md new file mode 100644 index 000000000..fefd93c11 --- /dev/null +++ b/docs/proposals/cache_elastic_pipeline_proposal.md @@ -0,0 +1,401 @@ +# Elastic Cache-Bank Pipeline (configurable AMAT) Proposal + +## Summary + +`VX_cache_bank` is hardwired to a 2-stage lookup/commit pipeline. That depth is +correct for a small, latency-critical L1, but it cannot close timing at 300 MHz +on a large last-level cache: the tag-array read, the way-resolving compare, and +the data-array access are forced into a single clock cycle, producing a +BRAM-to-BRAM critical path whose delay is dominated by routing and cannot be +retimed away. + +This proposal refactors the bank into an **elastic pipeline** whose depth is a +per-cache parameter (`VX_CFG_<cache>_LATENCY`, default 2 = current behavior). +Larger caches raise the knob to insert register stages on the long paths, +trading a few cycles of hit latency — which a non-blocking, MSHR-backed cache +hides — for the Fmax needed to run the whole device at 300 MHz. We propose +`LATENCY = 4` for any L2/L3 larger than 64 KB. + +This is the architecture real GPU L2/L3 caches use: deep, fully pipelined, +latency-tolerant behind a large miss-handling pool, rather than a shallow +single-cycle-lookup structure. + +## Motivation + +On the U55C at the 300 MHz platform clock (period 3.333 ns), a 2-core build with +the 1 MB 8-way L2 fails timing. Measured post-route WNS on the standalone +`Vortex` DUT (`xcu55c`, post-route, after the dirty-mask LUTRAM fix): + +| Config | WNS @300 MHz | Implied Fmax | Worst path | +|--------|-------------:|-------------:|------------| +| L2 write-back | **-1.380 ns** | ~212 MHz | `cache_tags/tag_store` → `cache_data/.../data_store` (EN/WE) | +| L2 write-through | **-1.008 ns** | ~230 MHz | same structure | + +Because this path sits in the L2, the *entire* device is capped at ~210–230 MHz. 
+Once integrated into the full XRT platform (HBM + PCIe + SLR crossings) the +slack erodes further. No amount of placement or logic restructuring closes a +single-cycle BRAM→BRAM dependency whose delay is ~78% routing — the cycle +boundary has to move. + +## Current timing bottlenecks in `VX_cache_bank` + +The bank runs a fixed two-stage pipe (`sel → S0 → S1`, two `VX_pipe_register`s). +Tag and data arrays are read at issue; the hit/way is resolved combinationally +in S0 and immediately drives the data array in the same cycle. The bottlenecks, +in order of severity: + +1. **[PRIMARY] Tag-compare → data-array write-enable (S0).** + `tag_store` (RAMB) clk-to-out → per-way tag compare (XNOR/AND tree) → + `hit_any = |tag_matches` → `slice_write = fill || (write && hit_any && + word_en)` → `data_store` `ENARDEN`/`ENBWREN`/`WEA`. The failing endpoints are + the data-array **enable/write-enable pins**, not the address pins. BRAM→BRAM, + ~78% routing. This is the −1.38 ns path. + +2. **Tag-compare → data-array address.** The way-folded array is addressed + `data_addr = {hit_way, line_idx}`, so `hit_way` (from the same S0 compare) + feeds the data BRAM address pins. Currently meets with thin slack; becomes + the next wall the instant bottleneck (1) is broken. + +3. **[ALREADY RESOLVED — prerequisite] Per-byte dirty mask (`byteen_store`).** + Was the #1 path at −3.762 ns (xrt, 300 MHz). The mask needs 1-bit write + granularity (`WRENW = LINE_SIZE`), which block RAM cannot do, so a + `LUTRAM=1` instance was silently inferred as *shattered* BRAM. Fixed by making + `VX_sp_ram`/`VX_dp_ram` honor `LUTRAM=1` via the portable `USE_FAST_BRAM` + (`ram_style="distributed"`) attribute; the mask now maps to distributed RAM + (LUTRAM 216 → 16,600, RAMB ≈ unchanged) and leaves the critical path. This + refactor assumes that fix is in place. + +4. **Replacement state (`cache_repl`) → data/tag.** FIFO/PLRU victim select and + state update. 
At 250 MHz the worst path was `cache_repl` FIFO → `byteen_store`; + the lookup/update feedback (`lookup_valid`/`repl_valid`) is a second-tier path + that benefits from extra slack. + +5. **MSHR probe/allocate (`cache_mshr`).** `probe_addr` is compared (CAM-style) + against in-flight entries to produce `probe_pending_*`, which gates admission + and AMO ordering. The compare fanout over `MSHR_SIZE` entries is a control path + that tightens as MSHR grows. + +6. **AMO read-modify-write (LLC, S1).** `read_word_st1` → AMO ALU + (add/min/max/swap/compare) → writeback register → re-inject as a synthetic + write. Only synthesized for the AMO-capable LLC bank, but it is a genuine S1 + compute path. + +7. **Read-data → response / writeback formatting.** `read_data_st1` → `crsp` + word select, and `evict_byteen`/`is_dirty` → `mem_req_queue`. Registered and + comfortably met today, listed for completeness. + +The elastic pipeline targets (1) and (2) directly (deferring the data access to a +later, register-fed stage) and relaxes (4)–(6) by giving each its own stage +budget instead of cramming lookup+commit into two cycles. + +## Proposed design: elastic pipeline + +### Single knob, distributed internally + +Expose one parameter per cache, `LATENCY` (carried from +`VX_CFG_<cache>_LATENCY`), with `LATENCY = 2` reproducing today's behavior +bit-for-bit. The bank derives internal stage placement from it: + +| Internal budget | Cuts | Implementation | +|-----------------|------|----------------| +| `TAG_RD_LAT` | sel→tag routing + tag BRAM clk-to-out | tag RAM output pipeline registers | +| **hit→data register** | bottleneck (1)/(2): compare → data EN/addr | one pipe stage (the key cut) | +| `DATA_RD_LAT` | data BRAM clk-to-out → way mux | data RAM output pipeline registers | +| response register | read-data → crsp/mreq | one pipe stage | + +Extra RAM output registers retime into the BRAM/cascade and cost almost nothing +in fabric while buying most of the Fmax. 
The single new *logical* stage is the +hit→data register that moves the data access off the same cycle as the compare. + +### Spine refactor (readability + elasticity) + +Replace the ~40 parallel `_sel`/`_st0`/`_st1` wires and two hand-instantiated +pipe registers with: + +1. **A packed payload struct** carrying all per-request control/data: + ```systemverilog + typedef struct packed { + logic valid; + logic [`CS_LINE_ADDR_WIDTH-1:0] addr; + logic rw; + logic [WORD_SIZE-1:0] byteen; + logic [`CS_WORD_WIDTH-1:0] word; + logic [`CS_WAY_SEL_WIDTH-1:0] way; + logic hit; + // tag, idx, mshr_id, is_fill/flush/replay/dirty, amo, ... + } pipe_t; + ``` + +2. **A generate-loop register chain** of depth `LATENCY`: + ```systemverilog + pipe_t stg [0:LATENCY-1]; // stg[0] = arbitrated/selected request + for (genvar i = 1; i < LATENCY; ++i) begin : g_pipe + VX_pipe_register #(.DATAW($bits(pipe_t)), .RESETW(1)) reg_i ( + .clk, .reset, .enable(~pipe_stall), + .data_in(stg[i-1]), .data_out(stg[i])); + end + ``` + Adding depth is a longer array — no `if (LATENCY==2) … else if (==3)` ladder. + +3. **Control anchored to symbolic stage indices**, not literal `st0/st1`: + ```systemverilog + localparam HIT_ST = TAG_RD_LAT; // tag compare consumes stg[HIT_ST] + localparam DATA_ST = HIT_ST + 1; // data access uses *registered* way + localparam RESP_ST = LATENCY - 1; // crsp / mem-req fire here + ``` + `cache_repl` lookup/update, `cache_mshr` allocate/finalize, tag write, and the + response all key off these names, so the feedback loops stay one-request-per- + cycle at any depth. + +### Deferred whole-array access — no hazard logic required (implemented) + +The implemented design is **simpler than the 1R1W split originally sketched**. +The data array stays a single-port `VX_sp_ram`; the *entire* access (read **and** +write, plus fill/flush) is deferred together by `PIPE_EX = LATENCY-2` register +stages. 
Two consequences: + +- **The tag→data critical path is cut.** The data array is driven by *registered* + `tag_matches` (and the registered way/line/word/byteen), so neither the write + enable (bottleneck 1) nor the address (bottleneck 2) carries the combinational + tag-compare result. Path becomes register→BRAM, intra-stage. +- **No store→load hazard, no forwarding, no stall scoreboard.** Because the + array's read and write move to the *same* deferred stage, pipeline order is + preserved: a younger same-line read always reaches the array *after* an older + write, so store→load forwarding is automatic. (The 1R1W/forwarding scheme is + unnecessary — keeping read+write co-located is strictly simpler and lower-risk.) + +The tag array is left entirely at S0/S1, so its existing read-during-write +bypasses (`rdw_fill`/`rdw_write`) are unchanged. + +### Decoupled pipeline — the MSHR must NOT be deferred (critical constraint) + +`VX_cache_mshr` is **strongly coupled** to the bank pipeline: its coalescing +chain requires `allocate` (S0) and `finalize` (S1) to remain **exactly one cycle +apart**. The tail-find (`prev_idx`) only sees a predecessor's link once that +predecessor finalizes; deferring finalize makes 3+ coalesced same-line misses +(e.g. sequential icache fetches to one line) all link to the same predecessor, +orphaning intermediate entries → they never replay → **bank deadlock**. This was +confirmed empirically (a first "defer everything" attempt hung at both LATENCY=3 +and 4). + +So the implemented pipeline is **decoupled**: + +- **S0 / S1 (fixed, 1 cycle apart):** tag compare, replacement victim-select, + MSHR allocate **and finalize**, replacement update. Untouched. +- **stD = S0 + PIPE_EX:** the data-array access (read+write) — a pass-through + register chain off S0 (`pipe_bubble_data`). 
+- **stC = S1 + PIPE_EX:** the core response and the memory request — a + pass-through register chain off S1 (`pipe_bubble_commit`), aligned with the + deferred data output `read_data_stC`. + +`PIPE_EX=0` collapses stD→S0 and stC→S1, reproducing the classic 2-stage bank +bit-for-bit (verified: LATENCY=2 gives identical cycle counts). + +### Memory-request queue sizing (constraint) + +The mem-request push now fires `LATENCY` stages after admission, so the queue's +almost-full margin must reserve `LATENCY` slots (`PIPELINE_STAGES = LATENCY`). +This requires **`MREQ_SIZE > LATENCY`** (else `ALM_FULL ≤ 0` → permanent +almost-full → admission deadlock). Default small-cache `MREQ_SIZE = 4` is fine +for `LATENCY ≤ 3`; enabling `LATENCY = 4` on L2/L3 requires bumping their +`MREQ_SIZE` (see config section). + +## Atomics (`AMO_ENABLE`) under elastic latency + +`VX_cache_amo` is the most stage-coupled block in the bank and the part most +affected by changing depth, so it is called out separately. Today it reaches +*directly* into the fixed two-stage structure: it consumes lookup-stage signals +(`valid_st0`, `is_hit_st0`, `is_creq_st0`, `word_idx_st0`, `addr_st0`) and +commit-stage signals (`is_hit_st1`, `read_word_st1`, `do_write_st1`, +`byteen_st1`, `write_word_st1`, `addr_st1`, `mshr_id_st1`), performs the +read-modify-write, and re-injects the result as a synthetic writeback through the +admit path. Three mechanisms encode the assumption that commit is exactly one +cycle behind lookup. + +**1. The RMW datapath becomes a stage budget, not a single-cycle path.** +The LLC atomic reads the line word at the data-output stage, runs the ALU +(add/min/max/swap/compare), and writes it back — bottleneck (6). At `LATENCY=2` +this is one S1 cycle. Under the elastic pipe it maps to the same symbolic stage +indices as the data path: read at the data-output stage, ALU in the following +stage, writeback at the commit stage. 
So deepening *relaxes* the AMO ALU path +(it gets its own stage) rather than complicating it — the engine must be +re-parameterized on `HIT_ST`/`DATA_ST`/`RESP_ST` instead of literal `st0`/`st1`. + +**2. Same-line AMO chaining is the tightest interaction.** A chained atomic to a +line with an in-flight commit must observe the *previous* atomic's result. Today +`chain_stall` paces the follower by one cycle so the prior result reaches the +writeback register; `commit_busy` holds new admits while a single LLC commit is +outstanding. With depth `L`, the commit→visible round trip is `L-1` cycles, so +both pacing windows scale with `LATENCY`. The same-line stall scoreboard proposed +for general RAW hazards **covers AMO chains by construction** (a chained atomic +targets a line the scoreboard already marks in-flight); `chain_stall`/ +`commit_busy` collapse into that one mechanism, sized to `L`, rather than a +separate hand-tuned 1-cycle pacer. + +**3. Non-LLC forward / passthru-replay ordering is latency-agnostic.** A non-LLC +AMO forwards downstream, invalidates its local copy, and returns via a passthru +replay (`is_amo_fwd_*`, `is_amo_replay_st1`, `req_input_defer`). These are +event-ordered, not cycle-counted, so they carry over unchanged once they key off +the stage constants instead of `st0`/`st1`. + +**LR/SC reservations** (`VX_CFG_AMO_RS_SIZE`) track line addresses, not pipeline +cycles, and are unaffected by depth beyond keeping the reservation-clear (any +intervening write to the line) anchored to the commit stage. + +Net: `AMO_ENABLE` requires the engine's stage anchors to be re-expressed in terms +of the elastic stage constants and its chain pacing to be folded into the +depth-sized same-line scoreboard. At `LATENCY=2` the behavior is identical to +today (chain window = 1). The atomics regression (LR/SC, same-line AMO chains, +mixed AMO/load ordering) is part of the rtlsim sweep across `LATENCY` values. 
+ +## Proposed latency configuration + +Add a per-cache knob in `VX_config.toml` (default 2): + +``` +VX_CFG_DCACHE_LATENCY = 2 +VX_CFG_L2_LATENCY = "expr: 4 if $VX_CFG_L2_CACHE_SIZE > 65536 else 2" +VX_CFG_L3_LATENCY = "expr: 4 if $VX_CFG_L3_CACHE_SIZE > 65536 else 2" + +# MREQ_SIZE must exceed LATENCY (margin); grow it with the deferral depth: +VX_CFG_L2_MREQ_SIZE = "expr: 4 + ($VX_CFG_L2_LATENCY - 2) + $VX_CFG_L2_WRITEBACK * ($VX_CFG_L2_MSHR_SIZE - 4)" +VX_CFG_L3_MREQ_SIZE = "expr: 4 + ($VX_CFG_L3_LATENCY - 2) + $VX_CFG_L3_WRITEBACK * ($VX_CFG_L3_MSHR_SIZE - 4)" +``` + +Rationale for the 64 KB threshold: below it the tag/data arrays fit in a few +BRAMs placed adjacently and the single-cycle path closes; above it (the 1 MB L2, +the 2 MB L3) the arrays span many BRAM columns and the cross-array route cannot +meet 3.333 ns. + +The `MREQ_SIZE` expr adds `(LATENCY-2)` to the base so the almost-full margin +(`MREQ_SIZE - LATENCY`) stays constant as depth grows — `LATENCY=4` ⇒ base 6, +margin 2 (same as today's `LATENCY=2` margin). Without this, `LATENCY=4` with the +default `MREQ_SIZE=4` deadlocks (margin 0). + +The bank parameter `LATENCY` is threaded from these macros through +`VX_cache`/`VX_cache_cluster` to each `VX_cache_bank` instance. + +## How it resolves the timing violations + +| Path | Today (2-stage) | Elastic (`LATENCY=4`) | +|------|-----------------|------------------------| +| (1) tag-compare → data EN/WE | single cycle, −1.38 ns | compare registered at `HIT_ST`; write driven by registers at `DATA_ST` — path is reg→reg, intra-stage | +| (2) hit_way → data address | single cycle, marginal | read addr still speculative but tag-read is itself registered (`TAG_RD_LAT`), so the source is a BRAM output reg, not a cross-array combinational chain | +| (4) repl, (5) mshr, (6) amo | share the 2 cycles | each gets its own stage slack | + +The −1.38 ns path is replaced by register-to-register hops within a stage, each +comfortably under 3.333 ns. 
The tag and data BRAMs no longer have a same-cycle +dependency, so their placement is decoupled and the dominant routing term is +removed. Target: **WNS ≥ 0 at 300 MHz** for the 1 MB L2 in the full build. + +## Area cost estimate + +Per L2 bank (1 MB, 8-way, data array 16384 × 512 b), going `LATENCY` 2 → 4: + +- **Flip-flops:** two extra payload stages. The wide field is the 512 b write + word; with control (~70 b) the payload is ~590 b → ~1,180 FF/bank for the two + added stages, plus the BRAM output pipeline regs (absorbed into the BRAM). + Against the measured 117 k FF for the 2-core build, that is **~+1%**. +- **Block RAM:** unchanged. Data stays in the same BRAMs; the read/write split is + BRAM-native dual-port. +- **LUTRAM / LUT:** the deferred-write mux + the same-line stall scoreboard add a + few hundred LUTs per bank. (The 16,600 LUTRAM for the dirty mask is the + separate, already-landed write-back cost, not attributable to this refactor.) + +Net: **~+1% FF, ~0 BRAM, small LUT per large-cache bank** — cheap relative to a ++40% clock. + +## AMAT impact + +`LATENCY = 4` raises the L2 **hit** latency by 2 cycles. The bank stays fully +pipelined (one request/cycle throughput is unchanged), and the cache is +non-blocking (16-entry MSHR), so the added cycles overlap with in-flight misses. + +Average-memory-access-time effect: + +``` +AMAT_overall ≈ t_L1 + m_L1 · (t_L2 + m_L2 · t_mem) +Δ(t_L2) = +2 cycles ⇒ ΔAMAT_overall = m_L1 · 2 cycles +``` + +For a typical L1 miss rate `m_L1 ≈ 0.10–0.20`, that is **+0.2–0.4 cycle** of +average access time — against a `t_mem` of hundreds of cycles, it is in the noise. 
+ +The decisive comparison is absolute wall-clock, because today the *whole device* +is stuck at the L2's Fmax: + +| | 2-stage @ 212 MHz | 4-stage @ 300 MHz | +|---|---|---| +| L2 hit latency | 2 cyc = 9.4 ns | 4 cyc = 13.3 ns | +| Device clock | 212 MHz | **300 MHz (+42%)** | + +A single L2 hit is ~3.9 ns slower, but every cycle everywhere else is 42% +faster, and that latency is hidden by the MSHR. Throughput-bound GPU workloads +win decisively. + +## SimX model (cycle parity) + +The elastic depth must be reflected in SimX or the SimX↔RTL cycle-parity target +drifts. No structural SimX work is needed: the bank model already carries a +configurable depth — `Cache::Config::latency` ("pipeline latency") sizes the +per-bank request pipe (`pipe_req_ = TFifo::Create("", +config.latency)` in `sim/simx/mem/cache.cpp`), so SimX already simulates a +`latency`-deep pipelined bank. + +The gap is only that the value is **hardcoded** at construction instead of +sourced from config. This proposal targets the large caches, so only those are +rewired; the others keep their current literals and are out of scope: + +| Cache | SimX site | Today | This proposal | +|-------|-----------|------:|---------------| +| **L2** | `sim/simx/cluster.cpp:82` | `2` | `VX_CFG_L2_LATENCY` (→ 4 when >64 KB) | +| **L3** | `sim/simx/processor.cpp:77` | `2` | `VX_CFG_L3_LATENCY` (→ 4 when >64 KB) | +| L1 D$/I$ | `sim/simx/socket.cpp:47,67` | `1` | unchanged (separate, pre-calibrated) | +| T$/O$/R$ | `sim/simx/cluster.cpp:196,266,320` | `2` | unchanged | + +Because the `VX_CFG_L2_LATENCY`/`VX_CFG_L3_LATENCY` macros are emitted from the +same `VX_config.toml`, replacing those two literals with their macro makes the +**one config value drive both the RTL bank parameter and the SimX pipe depth**, +so they cannot diverge. 
The RTL bank's existing 2-cycle floor and SimX's L1 +`latency=1` modeling are a pre-existing parity calibration this change does not +touch; the knob raises only L2/L3, where both sides read 2 today. + +Two parity details to keep honest: +- **Same-line hazard stall.** The RTL adds a same-line in-flight stall at higher + depth. SimX already accounts bank occupancy/contention (`bank_stalls`); the + same-line RAW stall must be modeled in the SimX bank as well (a marked-line + check on the `pipe_req_` occupancy) so the throughput effect matches, not just + the latency. If same-line conflicts are rare for a workload the residual sits + inside the <5% parity budget, but the mechanism should be present. +- **AMO chain pacing.** The SimX LLC atomic path must pace same-line chains over + the same `LATENCY`-sized window (it collapses into the same marked-line check), + matching the RTL `chain_stall`/`commit_busy` behavior at depth. + +Parity is then re-confirmed by the existing SimX↔RTL trace-diff methodology at +each `LATENCY` value (default 2 must be unchanged from today). + +## Validation plan / status + +1. **[DONE]** `LATENCY = 2` bit-identical — rtlsim 2-core+L2 vecadd: `cycles=2164`, + identical to the pre-refactor baseline. +2. **[DONE]** `LATENCY = 3` functional — vecadd `cycles=2239` (+3.5%, the one + deferred stage, mostly MSHR-hidden) and sgemm (RAW-heavy reuse, exercises + store→load across the deferral) both PASS. +3. **[pending]** `LATENCY = 4` once L2/L3 `MREQ_SIZE` is bumped (margin), plus the + atomics-enabled sweep for the AMO path (`LATENCY ∈ {2,3,4}`). +4. **[pending]** SimX parity update (L2/L3 latency from `VX_CFG_*_LATENCY`) and + trace-diff at each depth. +5. **[pending]** DUT synth of the 1 MB L2 bank at `LATENCY = 4`; confirm WNS ≥ 0 + @300 MHz and the new worst path is outside the cache. +6. **[pending]** Full 2-core `xrt` build at 300 MHz; on-card validation (#364). 
+ +## Risk / compatibility + +- Correctness-sensitive (cache data path); gated on the rtlsim sweep above + before any synthesis. +- L1 and all small caches default to `LATENCY = 2` and the 1-deep forward, so + their behavior, latency, and area are unchanged. +- The spine refactor (struct + generate pipe + stage-indexed control) is a + net readability improvement over the current parallel-wire style. +- Depends on the `VX_sp_ram`/`VX_dp_ram` `LUTRAM`/`USE_FAST_BRAM` fix (dirty-mask + bottleneck #3) already being present. diff --git a/hw/rtl/VX_cluster.sv b/hw/rtl/VX_cluster.sv index 20fc96ba2..a7a49d21f 100644 --- a/hw/rtl/VX_cluster.sv +++ b/hw/rtl/VX_cluster.sv @@ -205,6 +205,7 @@ module VX_cluster import VX_gpu_pkg::*; .MSHR_SIZE (`VX_CFG_L2_MSHR_SIZE), .MRSQ_SIZE (`VX_CFG_L2_MRSQ_SIZE), .MREQ_SIZE (`VX_CFG_L2_MREQ_SIZE), + .LATENCY (`VX_CFG_L2_LATENCY), .TAG_WIDTH (L2_TAG_WIDTH), .WRITE_ENABLE (1), .WRITEBACK (`VX_CFG_L2_WRITEBACK), diff --git a/hw/rtl/VX_socket.sv b/hw/rtl/VX_socket.sv index 6da8634bd..37261448e 100644 --- a/hw/rtl/VX_socket.sv +++ b/hw/rtl/VX_socket.sv @@ -85,7 +85,7 @@ module VX_socket import VX_gpu_pkg::*; .clk (clk), .reset (reset), .bus_in_if (kmu_bus_if), - .bus_out_if (per_core_kmu_bus_if[`VX_CFG_SOCKET_SIZE-1:0]) + .bus_out_if (per_core_kmu_bus_if) ); VX_gbar_bus_if per_core_gbar_bus_if[`VX_CFG_SOCKET_SIZE](); @@ -185,6 +185,7 @@ module VX_socket import VX_gpu_pkg::*; .MSHR_SIZE (`VX_CFG_DCACHE_MSHR_SIZE), .MRSQ_SIZE (`VX_CFG_DCACHE_MRSQ_SIZE), .MREQ_SIZE (`VX_CFG_DCACHE_MREQ_SIZE), + .LATENCY (`VX_CFG_DCACHE_LATENCY), .TAG_WIDTH (DCACHE_TAG_WIDTH), .WRITE_ENABLE (1), .WRITEBACK (`VX_CFG_DCACHE_WRITEBACK), diff --git a/hw/rtl/Vortex.sv b/hw/rtl/Vortex.sv index cfadfeb2a..52f874f07 100644 --- a/hw/rtl/Vortex.sv +++ b/hw/rtl/Vortex.sv @@ -131,6 +131,7 @@ module Vortex import VX_gpu_pkg::*, VX_trace_pkg::*; ( .MSHR_SIZE (`VX_CFG_L3_MSHR_SIZE), .MRSQ_SIZE (`VX_CFG_L3_MRSQ_SIZE), .MREQ_SIZE (`VX_CFG_L3_MREQ_SIZE), + .LATENCY 
(`VX_CFG_L3_LATENCY), .TAG_WIDTH (L3_TAG_WIDTH), .WRITE_ENABLE (1), .WRITEBACK (`VX_CFG_L3_WRITEBACK), @@ -180,7 +181,7 @@ module Vortex import VX_gpu_pkg::*, VX_trace_pkg::*; ( .clk (clk), .reset (reset), .bus_in_if (kmu_bus_in), - .bus_out_if (per_cluster_kmu_bus_if[`VX_CFG_NUM_CLUSTERS-1:0]) + .bus_out_if (per_cluster_kmu_bus_if) ); VX_dcr_bus_if per_cluster_dcr_bus_if[`VX_CFG_NUM_CLUSTERS](); diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index 759237bdc..d096c9662 100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -42,6 +42,9 @@ module VX_cache import VX_gpu_pkg::*; #( // Memory Request Queue Size parameter MREQ_SIZE = 4, + // Bank pipeline depth (2 = classic lookup+commit; larger defers the data array) + parameter LATENCY = 2, + // Enable cache writeable parameter WRITE_ENABLE = 1, @@ -390,6 +393,7 @@ module VX_cache import VX_gpu_pkg::*; #( .MSHR_SIZE (MSHR_SIZE), .MRSQ_SIZE (MRSQ_SIZE), .MREQ_SIZE (MREQ_SIZE), + .LATENCY (LATENCY), .TAG_WIDTH (TAG_WIDTH), .CORE_OUT_BUF (CORE_RSP_BUF_ENABLE ? 2 : 0), .MEM_OUT_BUF (MEM_REQ_BUF_ENABLE ? 2 : 0), diff --git a/hw/rtl/cache/VX_cache_amo.sv b/hw/rtl/cache/VX_cache_amo.sv index d45d4de05..302d4cc45 100644 --- a/hw/rtl/cache/VX_cache_amo.sv +++ b/hw/rtl/cache/VX_cache_amo.sv @@ -39,7 +39,11 @@ module VX_cache_amo import VX_gpu_pkg::*; #( parameter ATTR_WIDTH = 1, parameter MSHR_SIZE = 1, parameter MSHR_ADDR_WIDTH = 1, - parameter WORDS_PER_LINE = 1 + parameter WORDS_PER_LINE = 1, + // Deferred-commit depth: the commit ports (_st1) are fed from the bank's + // stC stage, which sits PIPE_EX+1 cycles behind the S0 lookup. 0 = classic + // 2-stage bank (stC == S1). + parameter PIPE_EX = 0 ) ( input wire clk, input wire reset, @@ -340,20 +344,51 @@ module VX_cache_amo import VX_gpu_pkg::*; #( end end - // response (fired at S1): SC -> 0/1; other -> old value (LSU sexts). - // The old value is available at S1 directly, no ALU needed. 
- wire [63:0] rsp_word = (amo_st1.amo_op == AMO_OP_SC) ? {63'h0, sc_fail_st1} : old_st1; - if (WORD_WIDTH < 64) begin : g_rsp_upper_unused - `UNUSED_VAR (rsp_word[63:WORD_WIDTH]) + // Response (fired at S1; in-place, no ALU): the requester extracts its + // target word by byte offset, so the old value can stay where it sits in + // the line with the other bytes masked off -- this avoids a full-width + // barrel shift on the hot read->response path (read_word -> rsp_data was + // the critical path: a >>bit_off then <commit window: + // with PIPE_EX>0 the AMO sits in the commit bubble for PIPE_EX cycles + // between do_store_st0 (S0) and do_store_st1 (stC), so commit_busy would + // gap and let a same-line request race the writeback. A PIPE_EX-deep + // shift of do_store_st0 fills the gap (continuous S0..stC hold). + wire amo_inflight; + if (PIPE_EX == 0) begin : g_no_bridge + assign amo_inflight = 1'b0; + end else begin : g_bridge + reg [PIPE_EX-1:0] store_inflight; + always @(posedge clk) begin + if (reset) begin + store_inflight <= '0; + end else if (~pipe_stall) begin + store_inflight[0] <= do_store_st0; + for (int i = 1; i < PIPE_EX; ++i) begin + store_inflight[i] <= store_inflight[i-1]; + end + end + end + assign amo_inflight = (| store_inflight); + end + // Commit in flight: holds off new core-request admission from the S0 - // prediction through the compute stage and the writeback. Replays are - // NOT blocked (the MSHR streams coalesced same-line AMOs back to back); - // those are paced instead by chain_stall. - assign commit_busy = do_store_st0 || do_store_st1 || cmp_valid || wb_pending_r; + // prediction through the deferred bubble, the compute stage and the + // writeback. Replays are NOT blocked (the MSHR streams coalesced same- + // line AMOs back to back); those are paced instead by chain_stall. 
+ assign commit_busy = do_store_st0 || amo_inflight || do_store_st1 || cmp_valid || wb_pending_r; // Pace any same-line request sitting behind an in-flight compute by one // cycle, so the result lands in wb_data_r and forwards cleanly. Gated on // cmp_valid (an AMO is computing), so it never fires for baseline traffic. diff --git a/hw/rtl/cache/VX_cache_bank.sv b/hw/rtl/cache/VX_cache_bank.sv index 8a6f75b32..180da7494 100644 --- a/hw/rtl/cache/VX_cache_bank.sv +++ b/hw/rtl/cache/VX_cache_bank.sv @@ -16,58 +16,30 @@ module VX_cache_bank import VX_gpu_pkg::*; #( parameter `STRING INSTANCE_ID= "", parameter BANK_ID = 0, - - // Number of Word requests per cycle parameter NUM_REQS = 1, - - // Size of cache in bytes - parameter CACHE_SIZE = 1024, - // Size of line inside a bank in bytes - parameter LINE_SIZE = 16, - // Number of banks + parameter CACHE_SIZE = 1024, // cache size in bytes + parameter LINE_SIZE = 16, // line size in bytes parameter NUM_BANKS = 1, - // Number of associative ways parameter NUM_WAYS = 1, - // Size of a word in bytes - parameter WORD_SIZE = 4, - - // Core Response Queue Size - parameter CRSQ_SIZE = 1, - // Miss Reserv Queue Knob - parameter MSHR_SIZE = 1, - // Memory Response Queue Size (sized at the cache wrapper; bank - // currently flows responses straight through, so unused locally.) 
- parameter MRSQ_SIZE = 1, - // Memory Request Queue Size - parameter MREQ_SIZE = 1, - - // Enable cache writeable + parameter WORD_SIZE = 4, // word size in bytes + parameter CRSQ_SIZE = 1, // core response queue size + parameter MSHR_SIZE = 1, // miss reservation queue size + parameter MRSQ_SIZE = 1, // memory response queue size (sized at wrapper) + parameter MREQ_SIZE = 1, // memory request queue size parameter WRITE_ENABLE = 1, - - // Enable cache writeback parameter WRITEBACK = 0, - - // Enable dirty bytes on writeback parameter DIRTY_BYTES = 0, - - // Replacement policy parameter REPL_POLICY = `CS_REPL_FIFO, - - // core request tag size parameter TAG_WIDTH = UUID_WIDTH + 1, - - // Core response output buffer (TO_OUT_BUF_* encoding) parameter CORE_OUT_BUF = 0, - - // Memory request output buffer (TO_OUT_BUF_* encoding) parameter MEM_OUT_BUF = 0, - - // This bank is the last-level cache (AMOs commit locally here). - parameter IS_LLC = 0, - - // This bank supports atomic ops (AMO logic synthesizes only when 1). - parameter AMO_ENABLE = 0, - + parameter IS_LLC = 0, // last-level cache: AMOs commit locally here + parameter AMO_ENABLE = 0, // synthesize atomic-op logic + // Bank pipeline depth (register stages from request-select to commit). 2 is + // the classic lookup(S0)+commit(S1) pipeline; larger values defer the data + // array by (LATENCY-2) stages to break the tag->data critical path on large + // caches (tags/replacement/MSHR stay at S0/S1). 
+ parameter LATENCY = 2, parameter MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE), parameter MEM_TAG_WIDTH = UUID_WIDTH + MSHR_ADDR_WIDTH, parameter REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS), @@ -83,19 +55,19 @@ module VX_cache_bank import VX_gpu_pkg::*; #( output wire perf_mshr_stall, `endif - // Core Request + // Core request input wire core_req_valid, input wire [`CS_LINE_ADDR_WIDTH-1:0] core_req_addr, - input wire core_req_rw, // write enable - input wire [WORD_SEL_WIDTH-1:0] core_req_wsel, // select the word in a cacheline, e.g. word size = 4 bytes, cacheline size = 64 bytes, it should have log(64/4)= 4 bits - input wire [WORD_SIZE-1:0] core_req_byteen,// which bytes in data to write - input wire [`CS_WORD_WIDTH-1:0] core_req_data, // data to be written - input wire [TAG_WIDTH-1:0] core_req_tag, // identifier of the request (request id) - input wire [REQ_SEL_WIDTH-1:0] core_req_idx, // index of the request in the core request array + input wire core_req_rw, + input wire [WORD_SEL_WIDTH-1:0] core_req_wsel, + input wire [WORD_SIZE-1:0] core_req_byteen, + input wire [`CS_WORD_WIDTH-1:0] core_req_data, + input wire [TAG_WIDTH-1:0] core_req_tag, + input wire [REQ_SEL_WIDTH-1:0] core_req_idx, input wire [`UP(MEM_ATTR_WIDTH)-1:0] core_req_attr, output wire core_req_ready, - // Core Response + // Core response output wire core_rsp_valid, output wire [`CS_WORD_WIDTH-1:0] core_rsp_data, output wire [TAG_WIDTH-1:0] core_rsp_tag, @@ -118,113 +90,161 @@ module VX_cache_bank import VX_gpu_pkg::*; #( input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag, output wire mem_rsp_ready, - // flush + // Flush input wire flush_begin, input wire [`UP(UUID_WIDTH)-1:0] flush_uuid, output wire flush_end ); - - localparam PIPELINE_STAGES = 2; - - // MRSQ_SIZE is sized at the cache wrapper; bank flows responses - // straight through, so it is unused locally. 
+ localparam PIPELINE_STAGES = LATENCY; + localparam PIPE_EX = LATENCY - 2; // extra data-deferral stages (0 = classic 2-stage) + `STATIC_ASSERT(LATENCY >= 2, ("invalid parameter: cache bank LATENCY must be >= 2")) `UNUSED_PARAM (MRSQ_SIZE) - // AMO sideband, extracted from the attr field (gated by AMO_ENABLE). - amo_req_t core_req_amo; - assign core_req_amo = AMO_ENABLE ? - amo_req_t'(core_req_attr[MEM_ATTR_AMO_OFFS +: AMO_REQ_BITS]) - : amo_req_t'('0); - -`IGNORE_UNUSED_BEGIN - wire [`UP(UUID_WIDTH)-1:0] req_uuid_sel, req_uuid_st0, req_uuid_st1; -`IGNORE_UNUSED_END - - wire crsp_queue_stall; - wire mshr_alm_full; - wire mshr_probe_pending_ld; - wire mshr_probe_pending_amo; - wire mreq_queue_empty; - wire mreq_queue_alm_full; - - wire [`CS_LINE_ADDR_WIDTH-1:0] mem_rsp_addr; - - wire replay_valid; - wire [`CS_LINE_ADDR_WIDTH-1:0] replay_addr; - wire replay_rw; - wire [WORD_SEL_WIDTH-1:0] replay_wsel; - wire [WORD_SIZE-1:0] replay_byteen; - wire [`CS_WORD_WIDTH-1:0] replay_data; - wire [TAG_WIDTH-1:0] replay_tag; - wire [REQ_SEL_WIDTH-1:0] replay_idx; - wire [MSHR_ADDR_WIDTH-1:0] replay_id; - wire replay_ready; - amo_req_t replay_amo; - - - wire valid_sel, valid_st0, valid_st1; - wire is_init_st0; - wire is_creq_st0, is_creq_st1; - wire is_fill_st0, is_fill_st1; - wire is_flush_st0, is_flush_st1; - wire [`CS_WAY_SEL_WIDTH-1:0] flush_way_st0, evict_way_st0; - wire [`CS_WAY_SEL_WIDTH-1:0] way_idx_st0, way_idx_st1; - - wire [`CS_LINE_ADDR_WIDTH-1:0] addr_sel, addr_st0, addr_st1; - wire [`CS_LINE_SEL_BITS-1:0] line_idx_sel, line_idx_st0, line_idx_st1; - wire [`CS_TAG_SEL_BITS-1:0] line_tag_st0, line_tag_st1; - wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0, evict_tag_st1; - wire rw_sel, rw_st0, rw_st1; - wire [WORD_SEL_WIDTH-1:0] word_idx_sel, word_idx_st0, word_idx_st1; - wire [WORD_SIZE-1:0] byteen_sel, byteen_st0, byteen_st1; - wire [REQ_SEL_WIDTH-1:0] req_idx_sel, req_idx_st0, req_idx_st1; - wire [TAG_WIDTH-1:0] tag_sel, tag_st0, tag_st1; - wire [`CS_WORD_WIDTH-1:0] 
write_word_st0, write_word_st1; - wire [`CS_LINE_WIDTH-1:0] data_sel, data_st0; - wire [MSHR_ADDR_WIDTH-1:0] mshr_id_st0, mshr_id_st1; - wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0; - wire is_dirty_st0, is_dirty_st1; - wire is_replay_st0, is_replay_st1; - wire is_hit_st0, is_hit_st1; - wire [`UP(MEM_ATTR_WIDTH)-1:0] attr_sel, attr_st0, attr_st1; - amo_req_t amo_sel, amo_st0, amo_st1; - - // AMO interconnect (driven by the VX_cache_amo engine, tied off when the - // bank carries no AMO logic). Declared here because the input arbitration - // and sel mux consume them ahead of the instantiation. - wire amo_hit_st1; // AMO commits locally at S1 (LLC) - wire amo_commit_busy; // LLC commit in flight - wire amo_chain_stall; // pace same-line chained AMO - wire amo_wb_pending; // synthetic writeback request live - wire [`CS_WORD_WIDTH-1:0] amo_rsp_data; // LLC AMO response word - wire [`CS_LINE_ADDR_WIDTH-1:0] amo_wb_addr; - wire [WORD_SEL_WIDTH-1:0] amo_wb_word_idx; - wire [WORD_SIZE-1:0] amo_wb_byteen; - wire [`CS_WORD_WIDTH-1:0] amo_wb_data; - wire [TAG_WIDTH-1:0] amo_wb_tag; - wire [REQ_SEL_WIDTH-1:0] amo_wb_idx; - wire [`UP(MEM_ATTR_WIDTH)-1:0] amo_wb_attr; - wire is_amo_fwd_st0; // non-LLC AMO first pass (S0) - wire is_amo_fwd_st1; // non-LLC AMO first pass (S1) - wire is_amo_replay_st1; // non-LLC AMO result replay - wire is_passthru_fill_sel; - wire [`CS_WORD_WIDTH-1:0] amo_ptw_word_st1; - wire req_input_defer; // non-LLC age-ordering hold - - wire mshr_pending_raw_st0; - wire mshr_pending_st0, mshr_pending_st1; - wire [MSHR_ADDR_WIDTH-1:0] mshr_previd_st0, mshr_previd_st1; - wire mshr_empty; - wire is_passthru_fill_st0; // fill targets a passthru entry - - wire flush_valid; - wire init_valid; + // ======================================================================== + // Pipeline payload types + // + // The request travels as a struct and the S0-computed lookup results are a + // separate `lookup_t` delta, composed into `commit_t` for the response / + // 
memory-request stage. The wide fill `data` line and `tag_matches` ride + // only the data-array path (`data_t`), never the commit path, so the deeper + // commit pipeline stays narrow. + // sel -> S0 : data_t (st0) -- request + fill line + // S0 -> stD : data_t (stD) -- drives the data array + // S0 -> S1->stC: commit_t (st1, stC) -- request + lookup delta + // `way_idx` and `mshr_id` are reused across stages (flush_way/replay_id at + // select; resolved way / allocated id at commit). PIPE_EX=0 collapses + // stD->S0 and stC->S1: the classic 2-stage bank. + // ======================================================================== + typedef struct packed { + logic valid, is_init, is_fill, is_flush, is_creq, is_replay, is_passthru_fill, rw; + logic [`UP(MEM_ATTR_WIDTH)-1:0] attr; + logic [`CS_WAY_SEL_WIDTH-1:0] way_idx; // flush_way @sel, resolved way @S1 + logic [`CS_LINE_ADDR_WIDTH-1:0] addr; + logic [WORD_SIZE-1:0] byteen; + logic [WORD_SEL_WIDTH-1:0] word_idx; + logic [REQ_SEL_WIDTH-1:0] req_idx; + logic [TAG_WIDTH-1:0] tag; + logic [MSHR_ADDR_WIDTH-1:0] mshr_id; // replay_id @sel, alloc/replay id @S1 + amo_req_t amo; + } req_t; + + typedef struct packed { // S0-computed lookup delta (commit side) + logic is_hit, is_dirty, mshr_pending; + logic [`CS_TAG_SEL_BITS-1:0] evict_tag; + logic [`CS_WORD_WIDTH-1:0] write_word; + logic [MSHR_ADDR_WIDTH-1:0] mshr_previd; + } lookup_t; + + typedef struct packed { // data-array drive (S0 -> stD) + req_t req; + logic [`CS_LINE_WIDTH-1:0] data; + logic [NUM_WAYS-1:0] tag_matches; + } data_t; + + typedef struct packed { // response + memory request (S0 -> S1 -> stC) + req_t req; + lookup_t lk; + } commit_t; + + data_t sel_req, st0, dat_in, stD; // request + fill line: sel -> S0 -> stD + commit_t cmt_in, st1, stC; // request + lookup delta: S0 -> S1 -> stC + lookup_t lk_st0; // S0 lookup results + + // ------------------------------------------------------------------------ + // Shared signals + // 
------------------------------------------------------------------------ + wire crsp_queue_stall, mshr_alm_full, mshr_empty; + wire mshr_probe_pending_ld, mshr_probe_pending_amo; + wire mreq_queue_empty, mreq_queue_alm_full; + wire [`CS_LINE_ADDR_WIDTH-1:0] mem_rsp_addr; + wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id, mshr_previd; + wire mshr_pending_raw; + + // MSHR replay (dequeue) sideband + wire replay_valid, replay_ready, replay_rw; + wire [`CS_LINE_ADDR_WIDTH-1:0] replay_addr; + wire [WORD_SEL_WIDTH-1:0] replay_wsel; + wire [WORD_SIZE-1:0] replay_byteen; + wire [`CS_WORD_WIDTH-1:0] replay_data; + wire [TAG_WIDTH-1:0] replay_tag; + wire [REQ_SEL_WIDTH-1:0] replay_idx; + wire [MSHR_ADDR_WIDTH-1:0] replay_id; + amo_req_t replay_amo; + + // AMO engine interconnect (tied to 0 when the bank carries no AMO logic). + wire amo_hit_st1, amo_commit_busy, amo_chain_stall, amo_wb_pending; + wire [`CS_WORD_WIDTH-1:0] amo_rsp_data; + wire [`CS_LINE_ADDR_WIDTH-1:0] amo_wb_addr; + wire [WORD_SEL_WIDTH-1:0] amo_wb_word_idx; + wire [WORD_SIZE-1:0] amo_wb_byteen; + wire [`CS_WORD_WIDTH-1:0] amo_wb_data; + wire [TAG_WIDTH-1:0] amo_wb_tag; + wire [REQ_SEL_WIDTH-1:0] amo_wb_idx; + wire [`UP(MEM_ATTR_WIDTH)-1:0] amo_wb_attr; + wire is_amo_fwd_st0, is_amo_fwd_st1, is_amo_replay_st1; + wire is_passthru_fill_sel, req_input_defer; + wire [`CS_WORD_WIDTH-1:0] amo_ptw_word_st1; + + wire flush_valid, flush_ready, init_valid; wire [`CS_LINE_SEL_BITS-1:0] flush_sel; wire [`CS_WAY_SEL_WIDTH-1:0] flush_way; - wire flush_ready; - // ensure we have no pending memory request in the bank - wire no_pending_req = ~valid_st0 && ~valid_st1 && mreq_queue_empty; + // AMO sideband, extracted from the attr field (gated by AMO_ENABLE). + amo_req_t core_req_amo; + assign core_req_amo = AMO_ENABLE ? 
amo_req_t'(core_req_attr[MEM_ATTR_AMO_OFFS +: AMO_REQ_BITS]) + : amo_req_t'('0); + + // ------------------------------------------------------------------------ + // Per-stage decoded operations + // ------------------------------------------------------------------------ + wire do_init_st0 = st0.req.valid && st0.req.is_init; + wire do_flush_st0 = st0.req.valid && st0.req.is_flush; + wire do_read_st0 = st0.req.valid && st0.req.is_creq && ~st0.req.rw; + wire do_write_st0 = st0.req.valid && st0.req.is_creq && st0.req.rw; + wire do_fill_st0 = st0.req.valid && st0.req.is_fill; + wire do_lookup_st0 = do_read_st0 || do_write_st0; + + wire do_read_st1 = st1.req.valid && st1.req.is_creq && ~st1.req.rw; + wire do_write_st1 = st1.req.valid && st1.req.is_creq && st1.req.rw; + wire do_lookup_st1 = do_read_st1 || do_write_st1; + + wire do_read_stc = stC.req.valid && stC.req.is_creq && ~stC.req.rw; + wire do_write_stc = stC.req.valid && stC.req.is_creq && stC.req.rw; + + wire do_init_std = stD.req.valid && stD.req.is_init; + wire do_fill_std = stD.req.valid && stD.req.is_fill; + wire do_flush_std = stD.req.valid && stD.req.is_flush; + wire do_read_std = stD.req.valid && stD.req.is_creq && ~stD.req.rw; + wire do_write_std = stD.req.valid && stD.req.is_creq && stD.req.rw; + + wire [`CS_LINE_SEL_BITS-1:0] line_idx_st0 = st0.req.addr[`CS_LINE_SEL_BITS-1:0]; + wire [`CS_TAG_SEL_BITS-1:0] line_tag_st0 = `CS_LINE_ADDR_TAG(st0.req.addr); + wire [`CS_WORD_WIDTH-1:0] write_word_st0 = st0.data[`CS_WORD_WIDTH-1:0]; + wire [`CS_LINE_ADDR_WIDTH-1:0] addr_stc = stC.req.addr; + + // ------------------------------------------------------------------------ + // Bank-empty detection (gates flush). A request occupies S0, S1 and the + // PIPE_EX commit-bubble stages (st1.req.valid delayed 1..PIPE_EX); the parallel + // data bubble S0->stD is subsumed by this window.
+ // ------------------------------------------------------------------------ + wire pipe_inflight; + if (PIPE_EX == 0) begin : g_no_bubble_occ + assign pipe_inflight = st0.req.valid || st1.req.valid; + end else begin : g_bubble_occ + reg [PIPE_EX-1:0] commit_valid; + always @(posedge clk) begin + if (reset) begin + commit_valid <= '0; + end else if (~pipe_stall) begin + commit_valid[0] <= st1.req.valid; + for (int i = 1; i < PIPE_EX; ++i) begin + commit_valid[i] <= commit_valid[i-1]; + end + end + end + assign pipe_inflight = st0.req.valid || st1.req.valid || (| commit_valid); + end + wire no_pending_req = ~pipe_inflight && mreq_queue_empty; VX_cache_flush #( .BANK_ID (BANK_ID), @@ -248,67 +268,50 @@ module VX_cache_bank import VX_gpu_pkg::*; #( ); // amo_chain_stall paces a same-line AMO behind an in-flight commit by one - // cycle so the prior result reaches the writeback register; it is 0 for all - // non-AMO traffic, so the baseline pipe is unaffected. + // cycle; it is 0 for non-AMO traffic, so the baseline pipe is unaffected. wire pipe_stall = crsp_queue_stall || amo_chain_stall; - // inputs arbitration: - // mshr replay has highest priority to maximize utilization since there is no miss. - // handle memory responses next to prevent deadlock with potential memory request from a miss. - // flush has precedence over core requests to ensure that the cache is in a consistent state. - wire replay_grant = ~init_valid; + // ======================================================================== + // Input arbitration + // priority: init > replay > fill(mem_rsp) > flush > core-req + // replay maximizes utilization (guaranteed hit); fill precedes flush/creq to + // avoid deadlock on a miss; flush precedes creq for consistency. 
+ // ======================================================================== + wire replay_grant = ~init_valid; wire replay_enable = replay_grant && replay_valid; - - wire fill_grant = ~init_valid && ~replay_enable; - wire fill_enable = fill_grant && mem_rsp_valid; - - wire flush_grant = ~init_valid && ~replay_enable && ~fill_enable; - wire flush_enable = flush_grant && flush_valid; - - wire creq_grant = ~init_valid && ~replay_enable && ~fill_enable && ~flush_enable; - // creq fires from a real core_req or from a pending LLC AMO writeback - // (the synthetic write injected after a commit); the two are mutually - // exclusive. amo_commit_busy holds off new admits while a single- - // outstanding LLC commit is in flight; req_input_defer enforces non-LLC - // age-ordering. Both, plus amo_wb_pending/amo_hit_st1, are driven by the - // AMO engine below and tie to 0 when the bank carries no AMO logic. + wire fill_grant = replay_grant && ~replay_enable; + wire fill_enable = fill_grant && mem_rsp_valid; + wire flush_grant = fill_grant && ~fill_enable; + wire flush_enable = flush_grant && flush_valid; + wire creq_grant = flush_grant && ~flush_enable; + + // A core-request slot fires from a real core_req or a pending LLC AMO + // writeback (synthetic write injected after a commit); mutually exclusive. + // amo_commit_busy/req_input_defer enforce AMO ordering (0 for non-AMO banks). 
wire amo_creq_path = core_req_valid && ~amo_commit_busy && ~req_input_defer; wire amo_wb_path = amo_wb_pending && ~amo_hit_st1; wire creq_enable = creq_grant && (amo_creq_path || amo_wb_path); - assign replay_ready = replay_grant - && ~(!WRITEBACK && replay_rw && mreq_queue_alm_full) // needed for writethrough - && ~pipe_stall; - - assign mem_rsp_ready = fill_grant - && ~(WRITEBACK && mreq_queue_alm_full) // needed for writeback - && ~pipe_stall; - - assign flush_ready = flush_grant - && ~(WRITEBACK && mreq_queue_alm_full) // needed for writeback - && ~pipe_stall; - - assign core_req_ready = creq_grant - && ~mreq_queue_alm_full // needed for fill requests - && ~mshr_alm_full // needed for mshr allocation - && ~pipe_stall - && ~amo_commit_busy // hold off core_req while an LLC AMO commit is in flight - && ~req_input_defer // age-order AMO/load vs in-flight entry - ; - - wire init_fire = init_valid; - wire replay_fire = replay_valid && replay_ready; - wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; - wire flush_fire = flush_valid && flush_ready; - // amo_wb_path already excludes the cycle a fresh AMO commits at S1 - // (amo_hit_st1), so the writeback never races the chain update. 
- wire amo_wb_fire = amo_wb_path && creq_grant - && ~mreq_queue_alm_full && ~mshr_alm_full && ~pipe_stall; + assign replay_ready = replay_grant && ~(!WRITEBACK && replay_rw && mreq_queue_alm_full) && ~pipe_stall; + assign mem_rsp_ready = fill_grant && ~(WRITEBACK && mreq_queue_alm_full) && ~pipe_stall; + assign flush_ready = flush_grant && ~(WRITEBACK && mreq_queue_alm_full) && ~pipe_stall; + assign core_req_ready = creq_grant && ~mreq_queue_alm_full && ~mshr_alm_full && ~pipe_stall + && ~amo_commit_busy && ~req_input_defer; + + wire init_fire = init_valid; + wire replay_fire = replay_valid && replay_ready; + wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; + wire flush_fire = flush_valid && flush_ready; + // amo_wb_path already excludes the cycle a fresh AMO commits at S1, so the + // writeback never races the chain update. + wire amo_wb_fire = amo_wb_path && creq_grant && ~mreq_queue_alm_full && ~mshr_alm_full && ~pipe_stall; wire core_req_fire = (amo_creq_path || amo_wb_path) && creq_grant && ~mreq_queue_alm_full && ~mshr_alm_full && ~pipe_stall; wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id = mem_rsp_tag[MSHR_ADDR_WIDTH-1:0]; + // generate-guarded width selects (the dead branch must not elaborate an + // out-of-range slice when the other width path is taken). wire [TAG_WIDTH-1:0] mem_rsp_tag_s; if (TAG_WIDTH > MEM_TAG_WIDTH) begin : g_mem_rsp_tag_s_pad assign mem_rsp_tag_s = {mem_rsp_tag, (TAG_WIDTH-MEM_TAG_WIDTH)'(1'b0)}; @@ -329,105 +332,90 @@ module VX_cache_bank import VX_gpu_pkg::*; #( assign flush_tag = '0; end - // Input arbitration mux. The AMO writeback fields tie to 0 when no LLC - // commit engine is present, so the wb arms prune away for non-AMO banks. - assign valid_sel = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire; - assign rw_sel = replay_valid ? replay_rw - : (amo_wb_pending ? 1'b1 : core_req_rw); - assign byteen_sel = replay_valid ? replay_byteen - : (amo_wb_pending ? 
amo_wb_byteen : core_req_byteen); - assign addr_sel = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel) : - (replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr : - (amo_wb_pending ? amo_wb_addr : core_req_addr))); - assign word_idx_sel= replay_valid ? replay_wsel - : (amo_wb_pending ? amo_wb_word_idx : core_req_wsel); - assign req_idx_sel = replay_valid ? replay_idx - : (amo_wb_pending ? amo_wb_idx : core_req_idx); - assign tag_sel = (init_valid | flush_valid) ? (flush_valid ? flush_tag : '0) : - (replay_valid ? replay_tag : (mem_rsp_valid ? mem_rsp_tag_s : - (amo_wb_pending ? amo_wb_tag : core_req_tag))); - assign attr_sel = amo_wb_pending ? amo_wb_attr - : (core_req_valid ? core_req_attr : '0); - // AMO sideband priority must match the sel mux (replay > wb > core_req): - // a replay can fire during a pending wb (chained AMO replays from MSHR - // after a fill), so it must not be cleared by amo_wb_pending. The - // synthetic writeback carries amo.valid=0 so it never re-commits at S1. - assign amo_sel = replay_valid ? replay_amo - : (amo_wb_pending ? amo_req_t'('0) - : (core_req_valid ? core_req_amo : amo_req_t'('0))); - + // Per-bit fill/write data mux. AMO writeback fields tie to 0 for non-AMO + // banks, so the wb arms prune away. + wire [`CS_LINE_WIDTH-1:0] data_sel; if (WRITE_ENABLE) begin : g_data_sel for (genvar i = 0; i < `CS_LINE_WIDTH; ++i) begin : g_i if (i < `CS_WORD_WIDTH) begin : g_lo - assign data_sel[i] = replay_valid ? replay_data[i] : - (mem_rsp_valid ? mem_rsp_data[i] : - (amo_wb_pending ? amo_wb_data[i] : core_req_data[i])); + assign data_sel[i] = replay_valid ? replay_data[i] + : (mem_rsp_valid ? mem_rsp_data[i] + : (amo_wb_pending ? 
amo_wb_data[i] : core_req_data[i])); end else begin : g_hi - assign data_sel[i] = mem_rsp_data[i]; // only the memory response fills the upper words of data_sel + assign data_sel[i] = mem_rsp_data[i]; // only the fill carries upper words end end end else begin : g_data_sel_ro assign data_sel = mem_rsp_data; - `UNUSED_VAR (core_req_data) - `UNUSED_VAR (replay_data) - `UNUSED_VAR (amo_wb_data) // read-only banks have no writeback data + `UNUSED_VAR ({core_req_data, replay_data, amo_wb_data}) end - if (UUID_WIDTH != 0) begin : g_req_uuid_sel - assign req_uuid_sel = tag_sel[TAG_WIDTH-1 -: UUID_WIDTH]; - end else begin : g_req_uuid_sel_0 - assign req_uuid_sel = '0; + // Input mux -> arbitrated request (whole-struct populate). AMO priority + // matches the mux (replay > wb > core_req): a replay can fire during a + // pending wb (chained AMO replays from MSHR after a fill) and must not be + // cleared by amo_wb_pending; the synthetic writeback carries amo.valid=0 so + // it never re-commits at S1. + always @(*) begin + sel_req = '0; + sel_req.req.valid = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire; + sel_req.req.is_init = init_valid; + sel_req.req.is_fill = fill_enable; + sel_req.req.is_flush = flush_enable; + sel_req.req.is_creq = creq_enable || replay_enable; + sel_req.req.is_replay = replay_enable; + sel_req.req.is_passthru_fill = is_passthru_fill_sel; + sel_req.req.rw = replay_valid ? replay_rw : (amo_wb_pending ? 1'b1 : core_req_rw); + sel_req.req.attr = amo_wb_pending ? amo_wb_attr : (core_req_valid ? core_req_attr : '0); + sel_req.req.way_idx = flush_way; + sel_req.req.addr = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel) + : (replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr + : (amo_wb_pending ? amo_wb_addr : core_req_addr))); + sel_req.req.byteen = replay_valid ? replay_byteen : (amo_wb_pending ? amo_wb_byteen : core_req_byteen); + sel_req.req.word_idx = replay_valid ? replay_wsel : (amo_wb_pending ? 
amo_wb_word_idx : core_req_wsel); + sel_req.req.req_idx = replay_valid ? replay_idx : (amo_wb_pending ? amo_wb_idx : core_req_idx); + sel_req.req.tag = (init_valid | flush_valid) ? (flush_valid ? flush_tag : '0) + : (replay_valid ? replay_tag : (mem_rsp_valid ? mem_rsp_tag_s + : (amo_wb_pending ? amo_wb_tag : core_req_tag))); + sel_req.req.mshr_id = replay_id; + sel_req.req.amo = replay_valid ? replay_amo : (amo_wb_pending ? amo_req_t'('0) + : (core_req_valid ? core_req_amo : amo_req_t'('0))); + sel_req.data = data_sel; + // tag_matches is computed at S0; left 0 here (overridden at the data bubble). end - wire is_init_sel = init_valid; - wire is_creq_sel = creq_enable || replay_enable; - wire is_fill_sel = fill_enable; - wire is_flush_sel = flush_enable; - wire is_replay_sel = replay_enable; + // UUID extraction (debug + MSHR ordering): per stage, from the carried tag. + wire [`UP(UUID_WIDTH)-1:0] req_uuid_sel, req_uuid_st0, req_uuid_st1, req_uuid_stc; + if (UUID_WIDTH != 0) begin : g_req_uuid + assign req_uuid_sel = sel_req.req.tag[TAG_WIDTH-1 -: UUID_WIDTH]; + assign req_uuid_st0 = st0.req.tag[TAG_WIDTH-1 -: UUID_WIDTH]; + assign req_uuid_st1 = st1.req.tag[TAG_WIDTH-1 -: UUID_WIDTH]; + assign req_uuid_stc = stC.req.tag[TAG_WIDTH-1 -: UUID_WIDTH]; + end else begin : g_req_uuid_0 + assign {req_uuid_sel, req_uuid_st0, req_uuid_st1, req_uuid_stc} = '0; + end + `UNUSED_VAR ({req_uuid_st0, req_uuid_st1}) + // S0 register VX_pipe_register #( - .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(MEM_ATTR_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + AMO_REQ_BITS), + .DATAW ($bits(data_t)), .RESETW (1) - ) pipe_reg0 ( + ) reg_s0 ( .clk (clk), .reset (reset), .enable (~pipe_stall), - .data_in ({valid_sel, is_init_sel, is_fill_sel, is_flush_sel, is_creq_sel, is_replay_sel, is_passthru_fill_sel, attr_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, word_idx_sel, req_idx_sel, 
tag_sel, replay_id, amo_sel}), - .data_out ({valid_st0, is_init_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, is_passthru_fill_st0, attr_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, replay_id_st0, amo_st0}) + .data_in (sel_req), + .data_out (st0) ); - if (UUID_WIDTH != 0) begin : g_req_uuid_st0 - assign req_uuid_st0 = tag_st0[TAG_WIDTH-1 -: UUID_WIDTH]; - end else begin : g_req_uuid_st0_0 - assign req_uuid_st0 = '0; - end - - wire is_read_st0 = is_creq_st0 && ~rw_st0; - wire is_write_st0 = is_creq_st0 && rw_st0; - - wire do_init_st0 = valid_st0 && is_init_st0; - wire do_flush_st0 = valid_st0 && is_flush_st0; - wire do_read_st0 = valid_st0 && is_read_st0; - wire do_write_st0 = valid_st0 && is_write_st0; - wire do_fill_st0 = valid_st0 && is_fill_st0; - - wire is_read_st1 = is_creq_st1 && ~rw_st1; - wire is_write_st1 = is_creq_st1 && rw_st1; - - wire do_read_st1 = valid_st1 && is_read_st1; - wire do_write_st1 = valid_st1 && is_write_st1; - - assign line_idx_sel = addr_sel[`CS_LINE_SEL_BITS-1:0]; - assign line_idx_st0 = addr_st0[`CS_LINE_SEL_BITS-1:0]; - assign line_tag_st0 = `CS_LINE_ADDR_TAG(addr_st0); - - assign write_word_st0 = data_st0[`CS_WORD_WIDTH-1:0]; - - wire do_lookup_st0 = do_read_st0 || do_write_st0; - wire do_lookup_st1 = do_read_st1 || do_write_st1; - - wire [`CS_WAY_SEL_WIDTH-1:0] victim_way_st0; + // ======================================================================== + // S0 lookup: replacement + tags + way-encode + MSHR allocate + // ======================================================================== + wire [`CS_WAY_SEL_WIDTH-1:0] victim_way; + wire [`CS_WAY_SEL_WIDTH-1:0] evict_way_st0 = st0.req.is_fill ? 
victim_way : st0.req.way_idx; wire [NUM_WAYS-1:0] tag_matches_st0; + wire [`CS_WAY_SEL_WIDTH-1:0] hit_idx_st0; + wire evict_dirty_st0; + wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0; VX_cache_repl #( .CACHE_SIZE (CACHE_SIZE), @@ -436,21 +424,19 @@ module VX_cache_bank import VX_gpu_pkg::*; #( .NUM_WAYS (NUM_WAYS), .REPL_POLICY (REPL_POLICY) ) cache_repl ( - .clk (clk), - .reset (reset), - .stall (pipe_stall), - .init (do_init_st0), - .lookup_valid(do_lookup_st1 && ~pipe_stall), - .lookup_hit (is_hit_st1), - .lookup_line(line_idx_st1), - .lookup_way (way_idx_st1), - .repl_valid (do_fill_st0 && ~is_passthru_fill_st0 && ~pipe_stall), - .repl_line (line_idx_st0), - .repl_way (victim_way_st0) + .clk (clk), + .reset (reset), + .stall (pipe_stall), + .init (do_init_st0), + .lookup_valid (do_lookup_st1 && ~pipe_stall), + .lookup_hit (st1.lk.is_hit), + .lookup_line (st1.req.addr[`CS_LINE_SEL_BITS-1:0]), + .lookup_way (st1.req.way_idx), + .repl_valid (do_fill_st0 && ~st0.req.is_passthru_fill && ~pipe_stall), + .repl_line (line_idx_st0), + .repl_way (victim_way) ); - assign evict_way_st0 = is_fill_st0 ? victim_way_st0 : flush_way_st0; - VX_cache_tags #( .CACHE_SIZE (CACHE_SIZE), .LINE_SIZE (LINE_SIZE), @@ -460,29 +446,26 @@ module VX_cache_bank import VX_gpu_pkg::*; #( .WRITEBACK (WRITEBACK), .AMO_ENABLE ((AMO_ENABLE != 0) && (IS_LLC == 0)) ) cache_tags ( - .clk (clk), - .reset (reset), - // inputs - .stall (pipe_stall), - .init (do_init_st0), - .flush (do_flush_st0 && ~pipe_stall), - .fill (do_fill_st0 && ~is_passthru_fill_st0 && ~pipe_stall), - .read (do_read_st0 && ~pipe_stall), - .write (do_write_st0 && ~pipe_stall), - // non-LLC AMO forwards downstream and invalidates its own cached - // copy so the issuer's later plain load refetches the new value. 
- .invalidate (is_amo_fwd_st0 && is_hit_st0 && ~pipe_stall), - .line_idx (line_idx_st0), - .line_idx_n (line_idx_sel), - .line_tag (line_tag_st0), - .evict_way (evict_way_st0), - // outputs - .tag_matches(tag_matches_st0), - .evict_dirty(is_dirty_st0), - .evict_tag (evict_tag_st0) + .clk (clk), + .reset (reset), + .stall (pipe_stall), + .init (do_init_st0), + .flush (do_flush_st0 && ~pipe_stall), + .fill (do_fill_st0 && ~st0.req.is_passthru_fill && ~pipe_stall), + .read (do_read_st0 && ~pipe_stall), + .write (do_write_st0 && ~pipe_stall), + // non-LLC AMO forwards downstream and invalidates its own copy so the + // issuer's later plain load refetches the new value. + .invalidate (is_amo_fwd_st0 && lk_st0.is_hit && ~pipe_stall), + .line_idx (line_idx_st0), + .line_idx_n (sel_req.req.addr[`CS_LINE_SEL_BITS-1:0]), + .line_tag (line_tag_st0), + .evict_way (evict_way_st0), + .tag_matches (tag_matches_st0), + .evict_dirty (evict_dirty_st0), + .evict_tag (evict_tag_st0) ); - wire [`CS_WAY_SEL_WIDTH-1:0] hit_idx_st0; VX_onehot_encoder #( .N (NUM_WAYS) ) way_idx_enc ( @@ -491,37 +474,94 @@ module VX_cache_bank import VX_gpu_pkg::*; #( `UNUSED_PIN (valid_out) ); - assign way_idx_st0 = is_creq_st0 ? hit_idx_st0 : evict_way_st0; - assign is_hit_st0 = (| tag_matches_st0); + // S0 lookup delta (single combinational driver). The AMO requester is forced + // non-pending so it never coalesces onto a prior same-line entry. 
+ always @(*) begin + lk_st0 = '0; + lk_st0.is_hit = (| tag_matches_st0); + lk_st0.is_dirty = evict_dirty_st0; + lk_st0.evict_tag = evict_tag_st0; + lk_st0.write_word = write_word_st0; + lk_st0.mshr_previd = mshr_previd; + lk_st0.mshr_pending = mshr_pending_raw && ~is_amo_fwd_st0; + end + + // ======================================================================== + // Pipeline registration + // + // Tags / replacement / MSHR (allocate AND finalize) stay at S0/S1: the MSHR + // coalescing chain requires allocate(S0)->finalize(S1) exactly one cycle + // apart (deferring it orphans coalesced same-line entries -> deadlock). Only + // the data array (stD) and the commit consumers (stC) defer by PIPE_EX, so + // the array is driven by *registered* tag-compare results — breaking the + // tag->data critical path. Read and write both move to the same deferred + // stage, so pipeline order is preserved (no store->load hazard logic). + // ======================================================================== + + // data path: carry the request + fill line + tag compare, resolving the way + // for the data array (victim way for fill, carried flush way otherwise; the tag compare rides in tag_matches). + always @(*) begin + dat_in = st0; + dat_in.req.way_idx = evict_way_st0; + dat_in.tag_matches = tag_matches_st0; + end - wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0; - assign mshr_id_st0 = is_replay_st0 ? replay_id_st0 : mshr_alloc_id_st0; + // commit path: the request (with the resolved hit/victim way and MSHR id) + // plus the lookup delta. The wide fill line is dropped here. + always @(*) begin + cmt_in.req = st0.req; + cmt_in.req.way_idx = st0.req.is_creq ? hit_idx_st0 : evict_way_st0; + cmt_in.req.mshr_id = st0.req.is_replay ? 
st0.req.mshr_id : mshr_alloc_id; + cmt_in.lk = lk_st0; + end VX_pipe_register #( - .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(MEM_ATTR_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_TAG_SEL_BITS + `CS_TAG_SEL_BITS + `CS_LINE_SEL_BITS + `CS_WORD_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + AMO_REQ_BITS), - .RESETW (1) - ) pipe_reg1 ( + .DATAW ($bits(data_t)), + .RESETW (1), + .DEPTH (PIPE_EX) + ) reg_dat ( .clk (clk), .reset (reset), .enable (~pipe_stall), - .data_in ({valid_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, is_dirty_st0, is_hit_st0, rw_st0, attr_st0, way_idx_st0, evict_tag_st0, line_tag_st0, line_idx_st0, write_word_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_previd_st0, mshr_pending_st0, amo_st0}), - .data_out ({valid_st1, is_fill_st1, is_flush_st1, is_creq_st1, is_replay_st1, is_dirty_st1, is_hit_st1, rw_st1, attr_st1, way_idx_st1, evict_tag_st1, line_tag_st1, line_idx_st1, write_word_st1, byteen_st1, word_idx_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_previd_st1, mshr_pending_st1, amo_st1}) + .data_in (dat_in), + .data_out (stD) ); - if (UUID_WIDTH != 0) begin : g_req_uuid_st1 - assign req_uuid_st1 = tag_st1[TAG_WIDTH-1 -: UUID_WIDTH]; - end else begin : g_req_uuid_st1_0 - assign req_uuid_st1 = '0; - end + VX_pipe_register #( + .DATAW ($bits(commit_t)), + .RESETW (1) + ) reg_s1 ( + .clk (clk), + .reset (reset), + .enable (~pipe_stall), + .data_in (cmt_in), + .data_out (st1) + ); - assign addr_st1 = {line_tag_st1, line_idx_st1}; + VX_pipe_register #( + .DATAW ($bits(commit_t)), + .RESETW (1), + .DEPTH (PIPE_EX) + ) reg_cmt ( + .clk (clk), + .reset (reset), + .enable (~pipe_stall), + .data_in (st1), + .data_out (stC) + ); - // ensure mshr replay always get a hit (a passthru-AMO replay carries - // its result word instead of an installed line, so it counts as a hit) - `RUNTIME_ASSERT (~(valid_st1 && is_replay_st1 && ~eff_hit_st1), ("missed mshr replay")) + // a 
passthru-AMO replay carries its result word instead of an installed + // line, so it counts as a hit at the commit stage. + wire eff_hit_st1 = st1.lk.is_hit || is_amo_replay_st1; + wire eff_hit_stc = stC.lk.is_hit || is_amo_replay_st1; + `RUNTIME_ASSERT (~(st1.req.valid && st1.req.is_replay && ~eff_hit_st1), ("missed mshr replay")) - wire[`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] read_data_st1; - wire [LINE_SIZE-1:0] evict_byteen_st1; + // ======================================================================== + // Data array (driven at stD; outputs land at stC) + // ======================================================================== + wire[`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] read_data_stc; + wire [LINE_SIZE-1:0] evict_byteen_stc; + wire [`CS_WORD_WIDTH-1:0] read_word_stc = read_data_stc[stC.req.word_idx]; VX_cache_data #( .CACHE_SIZE (CACHE_SIZE), @@ -533,52 +573,48 @@ module VX_cache_bank import VX_gpu_pkg::*; #( .WRITEBACK (WRITEBACK), .DIRTY_BYTES (DIRTY_BYTES) ) cache_data ( - .clk (clk), - .reset (reset), - // inputs - .init (do_init_st0), - .fill (do_fill_st0 && ~is_passthru_fill_st0 && ~pipe_stall), - .flush (do_flush_st0 && ~pipe_stall), - .read (do_read_st0 && ~pipe_stall), - .write (do_write_st0 && ~pipe_stall), - .evict_way (evict_way_st0), - .tag_matches(tag_matches_st0), - .line_idx (line_idx_st0), - .fill_data (data_st0), - .write_word (write_word_st0), - .word_idx (word_idx_st0), - .write_byteen(byteen_st0), - .way_idx_r (way_idx_st1), - // outputs - .read_data (read_data_st1), - .evict_byteen(evict_byteen_st1) + .clk (clk), + .reset (reset), + .init (do_init_std), + .fill (do_fill_std && ~stD.req.is_passthru_fill && ~pipe_stall), + .flush (do_flush_std && ~pipe_stall), + .read (do_read_std && ~pipe_stall), + .write (do_write_std && ~pipe_stall), + .evict_way (stD.req.way_idx), + .tag_matches (stD.tag_matches), + .line_idx (stD.req.addr[`CS_LINE_SEL_BITS-1:0]), + .fill_data (stD.data), + .write_word (stD.data[`CS_WORD_WIDTH-1:0]), + 
.word_idx (stD.req.word_idx), + .write_byteen (stD.req.byteen), + .way_idx_r (stC.req.way_idx), + .read_data (read_data_stc), + .evict_byteen (evict_byteen_stc) ); - // only allocate MSHR entries for non-replay core requests - wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~is_replay_st0; - wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~is_replay_st1; + // ======================================================================== + // MSHR (allocate at S0, finalize at S1) + // ======================================================================== + wire mshr_allocate_st0 = st0.req.valid && st0.req.is_creq && ~st0.req.is_replay; + wire mshr_finalize_st1 = st1.req.valid && st1.req.is_creq && ~st1.req.is_replay; - // release allocated mshr entry if we had a hit + // release the entry on a hit. A forwarded AMO keeps its entry until its + // downstream response returns (fill/dequeue frees it), so never release it. wire mshr_release_st1; - // A forwarded AMO keeps its entry allocated until its downstream - // response returns (the fill/dequeue frees it), so never release it - // at finalize even on a local hit. if (WRITEBACK) begin : g_mshr_release - assign mshr_release_st1 = is_hit_st1 && ~is_amo_fwd_st1; + assign mshr_release_st1 = st1.lk.is_hit && ~is_amo_fwd_st1; end else begin : g_mshr_release_ro - // we need to keep missed write requests in MSHR if there is already a pending entry to the same address. - // this ensures that missed write requests are replayed locally in case a pending fill arrives without the write content. - // this can happen when writes are sent to memory late, when a related fill was already in flight. - assign mshr_release_st1 = (is_hit_st1 || (rw_st1 && ~mshr_pending_st1)) && ~is_amo_fwd_st1; + // keep missed writes in MSHR if a pending entry exists for the line, so a + // pending fill arriving without the write content replays them locally. 
+ assign mshr_release_st1 = (st1.lk.is_hit || (st1.req.rw && ~st1.lk.mshr_pending)) && ~is_amo_fwd_st1; end - wire mshr_release_fire = mshr_finalize_st1 && mshr_release_st1 && ~pipe_stall; wire [1:0] mshr_dequeue; `POP_COUNT(mshr_dequeue, {replay_fire, mshr_release_fire}); VX_pending_size #( - .SIZE (MSHR_SIZE), + .SIZE (MSHR_SIZE), .DECRW (2) ) mshr_pending_size ( .clk (clk), @@ -602,58 +638,47 @@ module VX_cache_bank import VX_gpu_pkg::*; #( .AMO_ENABLE ((AMO_ENABLE != 0) && (IS_LLC == 0)), .DATA_WIDTH (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH + AMO_REQ_BITS) ) cache_mshr ( - .clk (clk), - .reset (reset), - - .deq_req_uuid (req_uuid_sel), - .alc_req_uuid (req_uuid_st0), - .fin_req_uuid (req_uuid_st1), - - // memory fill - .fill_valid (mem_rsp_fire), - .fill_id (mem_rsp_id), - .fill_addr (mem_rsp_addr), - - // probe: pending entries for the incoming request's line, by type. - .probe_addr (core_req_addr), - .probe_pending_ld (mshr_probe_pending_ld), - .probe_pending_amo (mshr_probe_pending_amo), - - // dequeue - .dequeue_valid (replay_valid), - .dequeue_addr (replay_addr), - .dequeue_rw (replay_rw), - .dequeue_data ({replay_wsel, replay_byteen, replay_data, replay_tag, replay_idx, replay_amo}), - .dequeue_id (replay_id), - .dequeue_ready (replay_ready), - - // allocate - .allocate_valid (mshr_allocate_st0 && ~pipe_stall), - .allocate_addr (addr_st0), - .allocate_rw (rw_st0), - // Only non-LLC AMOs must not coalesce (each forwards its own - // round-trip). At the LLC, same-line AMOs coalesce and serialize - // their commits on the single filled line. - .allocate_is_amo((AMO_ENABLE && !IS_LLC) ? 
amo_st0.amo_valid : 1'b0), - .allocate_data ({word_idx_st0, byteen_st0, write_word_st0, tag_st0, req_idx_st0, amo_st0}), - .allocate_id (mshr_alloc_id_st0), - .allocate_pending(mshr_pending_raw_st0), - .allocate_previd(mshr_previd_st0), - `UNUSED_PIN (allocate_ready), - - // finalize - .finalize_valid (mshr_finalize_st1 && ~pipe_stall), - .finalize_is_release(mshr_release_st1), - .finalize_is_pending(mshr_pending_st1), - .finalize_id (mshr_id_st1), - .finalize_previd(mshr_previd_st1) + .clk (clk), + .reset (reset), + .deq_req_uuid (req_uuid_sel), + .alc_req_uuid (req_uuid_st0), + .fin_req_uuid (req_uuid_st1), + .fill_valid (mem_rsp_fire), + .fill_id (mem_rsp_id), + .fill_addr (mem_rsp_addr), + .probe_addr (core_req_addr), + .probe_pending_ld (mshr_probe_pending_ld), + .probe_pending_amo (mshr_probe_pending_amo), + .dequeue_valid (replay_valid), + .dequeue_addr (replay_addr), + .dequeue_rw (replay_rw), + .dequeue_data ({replay_wsel, replay_byteen, replay_data, replay_tag, replay_idx, replay_amo}), + .dequeue_id (replay_id), + .dequeue_ready (replay_ready), + .allocate_valid (mshr_allocate_st0 && ~pipe_stall), + .allocate_addr (st0.req.addr), + .allocate_rw (st0.req.rw), + // Only non-LLC AMOs must not coalesce; at the LLC same-line AMOs coalesce + // and serialize their commits on the single filled line. + .allocate_is_amo ((AMO_ENABLE && !IS_LLC) ? 
st0.req.amo.amo_valid : 1'b0), + .allocate_data ({st0.req.word_idx, st0.req.byteen, write_word_st0, st0.req.tag, st0.req.req_idx, st0.req.amo}), + .allocate_id (mshr_alloc_id), + .allocate_pending (mshr_pending_raw), + .allocate_previd (mshr_previd), + `UNUSED_PIN (allocate_ready), + .finalize_valid (mshr_finalize_st1 && ~pipe_stall), + .finalize_is_release (mshr_release_st1), + .finalize_is_pending (st1.lk.mshr_pending), + .finalize_id (st1.req.mshr_id), + .finalize_previd (st1.lk.mshr_previd) ); - // ============================================================ + // ======================================================================== // AMO engine - // ============================================================ - wire [`CS_WORD_WIDTH-1:0] read_word_st1 = read_data_st1[word_idx_st1]; - + // + // The read word lands at the deferred commit stage stC; the engine consumes + // it at S1 (== stC when PIPE_EX=0, the validated case). + // ======================================================================== if (AMO_ENABLE) begin : g_amo VX_cache_amo #( .IS_LLC (IS_LLC), @@ -667,65 +692,69 @@ module VX_cache_bank import VX_gpu_pkg::*; #( .ATTR_WIDTH (`UP(MEM_ATTR_WIDTH)), .MSHR_SIZE (MSHR_SIZE), .MSHR_ADDR_WIDTH (MSHR_ADDR_WIDTH), - .WORDS_PER_LINE (`CS_WORDS_PER_LINE) + .WORDS_PER_LINE (`CS_WORDS_PER_LINE), + .PIPE_EX (PIPE_EX) ) amo ( - .clk (clk), - .reset (reset), - .pipe_stall (pipe_stall), - .amo_st0 (amo_st0), - .valid_st0 (valid_st0), - .is_creq_st0 (is_creq_st0), - .is_hit_st0 (is_hit_st0), - .is_replay_st0 (is_replay_st0), - .amo_st1 (amo_st1), - .valid_st1 (valid_st1), - .is_creq_st1 (is_creq_st1), - .is_hit_st1 (is_hit_st1), - .is_replay_st1 (is_replay_st1), - .do_write_st1 (do_write_st1), - .read_word_st1 (read_word_st1), - .byteen_st1 (byteen_st1), - .write_word_st1 (write_word_st1), - .word_idx_st0 (word_idx_st0), - .word_idx_st1 (word_idx_st1), - .addr_st0 (addr_st0), - .addr_st1 (addr_st1), - .tag_st1 (tag_st1), - .req_idx_st1 (req_idx_st1), - 
.attr_st1 (attr_st1), - .wb_fire (amo_wb_fire), - .mshr_allocate_st0 (mshr_allocate_st0), - .mshr_alloc_id_st0 (mshr_alloc_id_st0), - .mshr_id_st1 (mshr_id_st1), - .mem_rsp_fire (mem_rsp_fire), - .mem_rsp_id (mem_rsp_id), - .mem_rsp_data (mem_rsp_data), - .is_fill_sel (is_fill_sel), - .core_req_valid (core_req_valid), - .core_req_is_amo (core_req_amo.amo_valid), - .core_req_rw (core_req_rw), - .core_req_addr (core_req_addr), - .rw_st0 (rw_st0), - .mshr_probe_pending_ld (mshr_probe_pending_ld), + .clk (clk), + .reset (reset), + .pipe_stall (pipe_stall), + .amo_st0 (st0.req.amo), + .valid_st0 (st0.req.valid), + .is_creq_st0 (st0.req.is_creq), + .is_hit_st0 (lk_st0.is_hit), + .is_replay_st0 (st0.req.is_replay), + // Commit ports are fed from stC (the deferred data-output stage), so + // the AMO RMW operands and the read word align at PIPE_EX>0. At + // PIPE_EX=0, stC == S1 and this is identical to the classic bank. + .amo_st1 (stC.req.amo), + .valid_st1 (stC.req.valid), + .is_creq_st1 (stC.req.is_creq), + .is_hit_st1 (stC.lk.is_hit), + .is_replay_st1 (stC.req.is_replay), + .do_write_st1 (do_write_stc), + .read_word_st1 (read_word_stc), + .byteen_st1 (stC.req.byteen), + .write_word_st1 (stC.lk.write_word), + .word_idx_st0 (st0.req.word_idx), + .word_idx_st1 (stC.req.word_idx), + .addr_st0 (st0.req.addr), + .addr_st1 (addr_stc), + .tag_st1 (stC.req.tag), + .req_idx_st1 (stC.req.req_idx), + .attr_st1 (stC.req.attr), + .wb_fire (amo_wb_fire), + .mshr_allocate_st0 (mshr_allocate_st0), + .mshr_alloc_id_st0 (mshr_alloc_id), + .mshr_id_st1 (stC.req.mshr_id), + .mem_rsp_fire (mem_rsp_fire), + .mem_rsp_id (mem_rsp_id), + .mem_rsp_data (mem_rsp_data), + .is_fill_sel (fill_enable), + .core_req_valid (core_req_valid), + .core_req_is_amo (core_req_amo.amo_valid), + .core_req_rw (core_req_rw), + .core_req_addr (core_req_addr), + .rw_st0 (st0.req.rw), + .mshr_probe_pending_ld (mshr_probe_pending_ld), .mshr_probe_pending_amo (mshr_probe_pending_amo), - .amo_hit_st1 (amo_hit_st1), - 
.commit_busy (amo_commit_busy), - .chain_stall (amo_chain_stall), - .wb_pending (amo_wb_pending), - .rsp_data (amo_rsp_data), - .wb_addr (amo_wb_addr), - .wb_word_idx (amo_wb_word_idx), - .wb_byteen (amo_wb_byteen), - .wb_data (amo_wb_data), - .wb_tag (amo_wb_tag), - .wb_idx (amo_wb_idx), - .wb_attr (amo_wb_attr), - .is_amo_fwd_st0 (is_amo_fwd_st0), - .is_amo_fwd_st1 (is_amo_fwd_st1), - .is_amo_replay_st1 (is_amo_replay_st1), - .is_passthru_fill_sel (is_passthru_fill_sel), - .amo_ptw_word_st1 (amo_ptw_word_st1), - .req_input_defer (req_input_defer) + .amo_hit_st1 (amo_hit_st1), + .commit_busy (amo_commit_busy), + .chain_stall (amo_chain_stall), + .wb_pending (amo_wb_pending), + .rsp_data (amo_rsp_data), + .wb_addr (amo_wb_addr), + .wb_word_idx (amo_wb_word_idx), + .wb_byteen (amo_wb_byteen), + .wb_data (amo_wb_data), + .wb_tag (amo_wb_tag), + .wb_idx (amo_wb_idx), + .wb_attr (amo_wb_attr), + .is_amo_fwd_st0 (is_amo_fwd_st0), + .is_amo_fwd_st1 (is_amo_fwd_st1), + .is_amo_replay_st1 (is_amo_replay_st1), + .is_passthru_fill_sel (is_passthru_fill_sel), + .amo_ptw_word_st1 (amo_ptw_word_st1), + .req_input_defer (req_input_defer) ); end else begin : g_no_amo assign {amo_hit_st1, amo_commit_busy, amo_wb_pending, amo_chain_stall} = '0; @@ -733,42 +762,22 @@ module VX_cache_bank import VX_gpu_pkg::*; #( assign {amo_wb_data, amo_wb_tag, amo_wb_idx, amo_wb_attr} = '0; assign {is_amo_fwd_st0, is_amo_fwd_st1, is_amo_replay_st1} = '0; assign {is_passthru_fill_sel, amo_ptw_word_st1, req_input_defer} = '0; - `UNUSED_VAR (amo_st1) - `UNUSED_VAR (amo_wb_fire) - `UNUSED_VAR (mshr_probe_pending_ld) - `UNUSED_VAR (mshr_probe_pending_amo) + // S1-only signals consumed solely by the AMO engine. 
+ `UNUSED_VAR ({amo_wb_fire, mshr_probe_pending_ld, mshr_probe_pending_amo, st1.req.amo, st1.req.attr, st1.req.req_idx, st1.req.word_idx, st1.req.byteen, st1.lk.write_word}) end - // Force the AMO requester non-pending so it never coalesces onto a prior - // entry for the same line — each atomic takes its own downstream trip. - assign mshr_pending_st0 = mshr_pending_raw_st0 && ~is_amo_fwd_st0; - - // Passthru replay counts as a hit (its line was never installed): fires - // the core response, allocates no mreq, releases the MSHR entry. - wire eff_hit_st1 = is_hit_st1 || is_amo_replay_st1; - - // schedule core response - - wire crsp_queue_valid, crsp_queue_ready; - wire [`CS_WORD_WIDTH-1:0] crsp_queue_data; - wire [REQ_SEL_WIDTH-1:0] crsp_queue_idx; - wire [TAG_WIDTH-1:0] crsp_queue_tag; - - // crsp_queue fires for reads and AMO commits at S1 on hit, but not - // for the synthetic writeback write (rw=1). A non-LLC AMO's first - // pass forwards downstream and must NOT respond locally; its result - // returns later via the passthru replay (eff_hit covers that replay). - // Suppress the response while a same-line AMO is chain-stalled at S1, so a - // read held for the extra pacing cycle enqueues its response exactly once - // (it fires when the op advances). amo_chain_stall is 0 for non-AMO traffic. - assign crsp_queue_valid = do_read_st1 && eff_hit_st1 && ~is_amo_fwd_st1 && ~amo_chain_stall; - assign crsp_queue_idx = req_idx_st1; - // Response data: passthru replay returns the latched downstream result, - // an LLC AMO commit returns its formatted result word, else plain load. - assign crsp_queue_data = is_amo_replay_st1 ? amo_ptw_word_st1 - : (amo_hit_st1 ? amo_rsp_data - : read_word_st1); - assign crsp_queue_tag = tag_st1; + // ======================================================================== + // Core response (stC) + // + // Fires for reads (and LLC AMO commits) on hit, never for the synthetic + // writeback (rw=1). 
A non-LLC AMO's first pass forwards downstream and must + // not respond locally (its result returns via the passthru replay). Suppress + // while a same-line AMO is chain-stalled so a held read enqueues once. + // ======================================================================== + wire crsp_queue_valid = do_read_stc && eff_hit_stc && ~is_amo_fwd_st1 && ~amo_chain_stall; + wire crsp_queue_ready; + wire [`CS_WORD_WIDTH-1:0] crsp_queue_data = is_amo_replay_st1 ? amo_ptw_word_st1 + : (amo_hit_st1 ? amo_rsp_data : read_word_stc); VX_elastic_buffer #( .DATAW (TAG_WIDTH + `CS_WORD_WIDTH + REQ_SEL_WIDTH), @@ -779,98 +788,81 @@ module VX_cache_bank import VX_gpu_pkg::*; #( .reset (reset), .valid_in (crsp_queue_valid), .ready_in (crsp_queue_ready), - .data_in ({crsp_queue_tag, crsp_queue_data, crsp_queue_idx}), + .data_in ({stC.req.tag, crsp_queue_data, stC.req.req_idx}), .data_out ({core_rsp_tag, core_rsp_data, core_rsp_idx}), .valid_out (core_rsp_valid), .ready_out (core_rsp_ready) ); - assign crsp_queue_stall = crsp_queue_valid && ~crsp_queue_ready; - // schedule memory request - + // ======================================================================== + // Memory request (stC) + // ======================================================================== wire mreq_queue_push, mreq_queue_pop; wire [`CS_LINE_WIDTH-1:0] mreq_queue_data; wire [LINE_SIZE-1:0] mreq_queue_byteen; wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_queue_addr; wire [MEM_TAG_WIDTH-1:0] mreq_queue_tag; wire mreq_queue_rw; - wire [`UP(MEM_ATTR_WIDTH)-1:0] mreq_queue_attr; - wire is_fill_or_flush_st1 = is_fill_st1 || (is_flush_st1 && WRITEBACK); - wire do_fill_or_flush_st1 = valid_st1 && is_fill_or_flush_st1; - wire do_writeback_st1 = do_fill_or_flush_st1 && is_dirty_st1; - wire [`CS_LINE_ADDR_WIDTH-1:0] evict_addr_st1 = {evict_tag_st1, line_idx_st1}; + wire is_fill_or_flush_stc = stC.req.is_fill || (stC.req.is_flush && WRITEBACK); + wire do_fill_or_flush_stc = stC.req.valid && is_fill_or_flush_stc; + 
wire do_writeback_stc = do_fill_or_flush_stc && stC.lk.is_dirty; + wire [`CS_LINE_ADDR_WIDTH-1:0] evict_addr_stc = {stC.lk.evict_tag, stC.req.addr[`CS_LINE_SEL_BITS-1:0]}; if (WRITE_ENABLE) begin : g_mreq_queue if (WRITEBACK) begin : g_wb if (DIRTY_BYTES) begin : g_dirty_bytes - // ensure dirty bytes match the tag info - wire has_dirty_bytes = (| evict_byteen_st1); - `RUNTIME_ASSERT (~do_fill_or_flush_st1 || (is_dirty_st1 == has_dirty_bytes), ("missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", is_dirty_st1, has_dirty_bytes, `CS_BANK_TO_FULL_ADDR(addr_st1, BANK_ID))) + wire has_dirty_bytes = (| evict_byteen_stc); + `RUNTIME_ASSERT (~do_fill_or_flush_stc || (stC.lk.is_dirty == has_dirty_bytes), ("missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", stC.lk.is_dirty, has_dirty_bytes, `CS_BANK_TO_FULL_ADDR(addr_stc, BANK_ID))) end - // issue a fill request on a read/write miss - // issue a writeback on a dirty line eviction - assign mreq_queue_push = ((do_lookup_st1 && ~is_hit_st1 && ~mshr_pending_st1) - || do_writeback_st1) - && ~pipe_stall; - assign mreq_queue_addr = is_fill_or_flush_st1 ? evict_addr_st1 : addr_st1; - assign mreq_queue_rw = is_fill_or_flush_st1; - assign mreq_queue_data = read_data_st1; - assign mreq_queue_byteen = is_fill_or_flush_st1 ? evict_byteen_st1 : '1; - `UNUSED_VAR (write_word_st1) - `UNUSED_VAR (byteen_st1) + // fill on a read/write miss; writeback on a dirty-line eviction. + assign mreq_queue_push = (((do_read_stc || do_write_stc) && ~stC.lk.is_hit && ~stC.lk.mshr_pending) + || do_writeback_stc) && ~pipe_stall; + assign mreq_queue_addr = is_fill_or_flush_stc ? evict_addr_stc : addr_stc; + assign mreq_queue_rw = is_fill_or_flush_stc; + assign mreq_queue_data = read_data_stc; + assign mreq_queue_byteen = is_fill_or_flush_stc ? 
evict_byteen_stc : '1; + `UNUSED_VAR ({stC.lk.write_word, stC.req.byteen, stC.req.is_replay}) end else begin : g_wt wire [LINE_SIZE-1:0] line_byteen; VX_demux #( .DATAW (WORD_SIZE), - .N (`CS_WORDS_PER_LINE) + .N (`CS_WORDS_PER_LINE) ) byteen_demux ( - .sel_in (word_idx_st1), - .data_in (byteen_st1), + .sel_in (stC.req.word_idx), + .data_in (stC.req.byteen), .data_out (line_byteen) ); - // issue a fill request on a read miss - // issue a memory write on a write request (ensure write replays don't send again) - // forward a non-LLC AMO downstream (always, even on a local hit); - // its passthru replay (eff_hit) must NOT re-issue a fill. - assign mreq_queue_push = ((do_read_st1 && ~eff_hit_st1 && ~mshr_pending_st1) - || (do_write_st1 && ~is_replay_st1) - || is_amo_fwd_st1) - && ~pipe_stall; - assign mreq_queue_addr = addr_st1; - assign mreq_queue_rw = rw_st1; - assign mreq_queue_data = {`CS_WORDS_PER_LINE{write_word_st1}}; - // an AMO forward carries its single word's byteen (rw=0 but the - // downstream LLC reads it via the AMO sideband, not as a write). - assign mreq_queue_byteen = (rw_st1 || is_amo_fwd_st1) ? line_byteen : '1; - `UNUSED_VAR (is_fill_or_flush_st1) - `UNUSED_VAR (do_writeback_st1) - `UNUSED_VAR (evict_addr_st1) - `UNUSED_VAR (evict_byteen_st1) + // fill on a read miss; memory write on a write (don't resend replays); + // forward a non-LLC AMO downstream (its passthru replay must not refill). + assign mreq_queue_push = ((do_read_stc && ~eff_hit_stc && ~stC.lk.mshr_pending) + || (do_write_stc && ~stC.req.is_replay) + || is_amo_fwd_st1) && ~pipe_stall; + assign mreq_queue_addr = addr_stc; + assign mreq_queue_rw = stC.req.rw; + assign mreq_queue_data = {`CS_WORDS_PER_LINE{stC.lk.write_word}}; + // an AMO forward carries its single word's byteen (read downstream via + // the AMO sideband, not as a write). + assign mreq_queue_byteen = (stC.req.rw || is_amo_fwd_st1) ? 
line_byteen : '1; + `UNUSED_VAR ({is_fill_or_flush_stc, do_writeback_stc, evict_addr_stc, evict_byteen_stc, stC.lk.evict_tag, stC.lk.is_dirty}) end end else begin : g_mreq_queue_ro - // issue a fill request on a read miss - assign mreq_queue_push = (do_read_st1 && ~is_hit_st1 && ~mshr_pending_st1) && ~pipe_stall; - assign mreq_queue_addr = addr_st1; + assign mreq_queue_push = (do_read_stc && ~stC.lk.is_hit && ~stC.lk.mshr_pending) && ~pipe_stall; + assign mreq_queue_addr = addr_stc; assign mreq_queue_rw = 0; assign mreq_queue_data = '0; assign mreq_queue_byteen = '1; - `UNUSED_VAR (do_writeback_st1) - `UNUSED_VAR (evict_addr_st1) - `UNUSED_VAR (evict_byteen_st1) - `UNUSED_VAR (write_word_st1) - `UNUSED_VAR (byteen_st1) + `UNUSED_VAR ({do_writeback_stc, evict_addr_stc, evict_byteen_stc, stC.lk.write_word, stC.lk.evict_tag, stC.lk.is_dirty, stC.req.byteen, stC.req.word_idx, stC.req.is_replay, do_write_stc}) end if (UUID_WIDTH != 0) begin : g_mreq_queue_tag_uuid - assign mreq_queue_tag = {req_uuid_st1, mshr_id_st1}; + assign mreq_queue_tag = {req_uuid_stc, stC.req.mshr_id}; end else begin : g_mreq_queue_tag - assign mreq_queue_tag = mshr_id_st1; + assign mreq_queue_tag = stC.req.mshr_id; end assign mreq_queue_pop = mem_req_valid && mem_req_ready; - assign mreq_queue_attr = attr_st1; VX_fifo_queue #( .DATAW (1 + `CS_LINE_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(MEM_ATTR_WIDTH)), @@ -878,19 +870,18 @@ module VX_cache_bank import VX_gpu_pkg::*; #( .ALM_FULL (MREQ_SIZE - PIPELINE_STAGES), .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF)) ) mem_req_queue ( - .clk (clk), - .reset (reset), - .push (mreq_queue_push), - .pop (mreq_queue_pop), - .data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_byteen, mreq_queue_data, mreq_queue_tag, mreq_queue_attr}), - .data_out ({mem_req_rw, mem_req_addr, mem_req_byteen, mem_req_data, mem_req_tag, mem_req_attr}), - .empty (mreq_queue_empty), - .alm_full (mreq_queue_alm_full), + .clk (clk), + .reset (reset), + .push 
(mreq_queue_push), + .pop (mreq_queue_pop), + .data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_byteen, mreq_queue_data, mreq_queue_tag, stC.req.attr}), + .data_out ({mem_req_rw, mem_req_addr, mem_req_byteen, mem_req_data, mem_req_tag, mem_req_attr}), + .empty (mreq_queue_empty), + .alm_full (mreq_queue_alm_full), `UNUSED_PIN (full), `UNUSED_PIN (alm_empty), `UNUSED_PIN (size) ); - assign mem_req_valid = ~mreq_queue_empty; `UNUSED_VAR (do_lookup_st0) @@ -898,9 +889,9 @@ module VX_cache_bank import VX_gpu_pkg::*; #( /////////////////////////////////////////////////////////////////////////////// `ifdef PERF_ENABLE - assign perf_read_miss = do_read_st1 && ~is_hit_st1; - assign perf_write_miss = do_write_st1 && ~is_hit_st1; - assign perf_evictions = do_writeback_st1; // dirty-line writeback eviction + assign perf_read_miss = do_read_st1 && ~st1.lk.is_hit; + assign perf_write_miss = do_write_st1 && ~st1.lk.is_hit; + assign perf_evictions = do_writeback_stc; assign perf_mshr_stall = mshr_alm_full; `endif @@ -912,8 +903,8 @@ module VX_cache_bank import VX_gpu_pkg::*; #( wire [`VX_CFG_XLEN-1:0] mem_rsp_full_addr = `CS_BANK_TO_FULL_ADDR(mem_rsp_addr, BANK_ID); wire [`VX_CFG_XLEN-1:0] replay_full_addr = `CS_BANK_TO_FULL_ADDR(replay_addr, BANK_ID); wire [`VX_CFG_XLEN-1:0] core_req_full_addr = `CS_BANK_TO_FULL_ADDR(core_req_addr, BANK_ID); - wire [`VX_CFG_XLEN-1:0] full_addr_st0 = `CS_BANK_TO_FULL_ADDR(addr_st0, BANK_ID); - wire [`VX_CFG_XLEN-1:0] full_addr_st1 = `CS_BANK_TO_FULL_ADDR(addr_st1, BANK_ID); + wire [`VX_CFG_XLEN-1:0] full_addr_st0 = `CS_BANK_TO_FULL_ADDR(st0.req.addr, BANK_ID); + wire [`VX_CFG_XLEN-1:0] full_addr_st1 = `CS_BANK_TO_FULL_ADDR(st1.req.addr, BANK_ID); wire [`VX_CFG_XLEN-1:0] mreq_queue_full_addr = `CS_BANK_TO_FULL_ADDR(mreq_queue_addr, BANK_ID); always @(posedge clk) begin @@ -943,51 +934,51 @@ module VX_cache_bank import VX_gpu_pkg::*; #( end if (do_fill_st0 && ~pipe_stall) begin `TRACE(3, ("%t: %s tags-fill: addr=0x%0h, way=%0d, line=%0d, 
dirty=%b (#%0d)\n", $time, INSTANCE_ID, - full_addr_st0, evict_way_st0, line_idx_st0, is_dirty_st0, req_uuid_st0)) + full_addr_st0, evict_way_st0, line_idx_st0, lk_st0.is_dirty, req_uuid_st0)) end if (do_flush_st0 && ~pipe_stall) begin `TRACE(3, ("%t: %s tags-flush: addr=0x%0h, way=%0d, line=%0d, dirty=%b (#%0d)\n", $time, INSTANCE_ID, - full_addr_st0, evict_way_st0, line_idx_st0, is_dirty_st0, req_uuid_st0)) + full_addr_st0, evict_way_st0, line_idx_st0, lk_st0.is_dirty, req_uuid_st0)) end if (do_lookup_st0 && ~pipe_stall) begin - if (is_hit_st0) begin + if (lk_st0.is_hit) begin `TRACE(3, ("%t: %s tags-hit: addr=0x%0h, rw=%b, way=%0d, line=%0d, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, - full_addr_st0, rw_st0, way_idx_st0, line_idx_st0, line_tag_st0, req_uuid_st0)) + full_addr_st0, st0.req.rw, hit_idx_st0, line_idx_st0, line_tag_st0, req_uuid_st0)) end else begin `TRACE(3, ("%t: %s tags-miss: addr=0x%0h, rw=%b, way=%0d, line=%0d, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, - full_addr_st0, rw_st0, way_idx_st0, line_idx_st0, line_tag_st0, req_uuid_st0)) + full_addr_st0, st0.req.rw, hit_idx_st0, line_idx_st0, line_tag_st0, req_uuid_st0)) end end if (do_fill_st0 && ~pipe_stall) begin `TRACE(3, ("%t: %s data-fill: addr=0x%0h, way=%0d, line=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, - full_addr_st0, way_idx_st0, line_idx_st0, data_st0, req_uuid_st0)) + full_addr_st0, evict_way_st0, line_idx_st0, st0.data, req_uuid_st0)) end if (do_flush_st0 && ~pipe_stall) begin `TRACE(3, ("%t: %s data-flush: addr=0x%0h, way=%0d, line=%0d (#%0d)\n", $time, INSTANCE_ID, - full_addr_st0, way_idx_st0, line_idx_st0, req_uuid_st0)) + full_addr_st0, evict_way_st0, line_idx_st0, req_uuid_st0)) end - if (do_read_st1 && is_hit_st1 && ~pipe_stall) begin - `TRACE(3, ("%t: %s data-read: addr=0x%0h, way=%0d, line=%0d, wsel=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, - full_addr_st1, way_idx_st1, line_idx_st1, word_idx_st1, crsp_queue_data, req_uuid_st1)) + if (do_read_st1 && st1.lk.is_hit && 
~pipe_stall) begin + `TRACE(3, ("%t: %s data-read: addr=0x%0h, way=%0d, line=%0d, wsel=%0d (#%0d)\n", $time, INSTANCE_ID, + full_addr_st1, st1.req.way_idx, st1.req.addr[`CS_LINE_SEL_BITS-1:0], st1.req.word_idx, req_uuid_st1)) end - if (do_write_st1 && is_hit_st1 && ~pipe_stall) begin + if (do_write_st1 && st1.lk.is_hit && ~pipe_stall) begin `TRACE(3, ("%t: %s data-write: addr=0x%0h, way=%0d, line=%0d, wsel=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, - full_addr_st1, way_idx_st1, line_idx_st1, word_idx_st1, byteen_st1, write_word_st1, req_uuid_st1)) + full_addr_st1, st1.req.way_idx, st1.req.addr[`CS_LINE_SEL_BITS-1:0], st1.req.word_idx, st1.req.byteen, st1.lk.write_word, req_uuid_st1)) end if (crsp_queue_fire) begin `TRACE(2, ("%t: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, - full_addr_st1, crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1)) + addr_stc, stC.req.tag, stC.req.req_idx, crsp_queue_data, req_uuid_stc)) end if (mreq_queue_push) begin - if (!WRITEBACK && do_write_st1) begin + if (!WRITEBACK && do_write_stc) begin `TRACE(2, ("%t: %s writethrough: addr=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, - mreq_queue_full_addr, mreq_queue_byteen, mreq_queue_data, req_uuid_st1)) - end else if (WRITEBACK && do_writeback_st1) begin + mreq_queue_full_addr, mreq_queue_byteen, mreq_queue_data, req_uuid_stc)) + end else if (WRITEBACK && do_writeback_stc) begin `TRACE(2, ("%t: %s writeback: addr=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, - mreq_queue_full_addr, mreq_queue_byteen, mreq_queue_data, req_uuid_st1)) + mreq_queue_full_addr, mreq_queue_byteen, mreq_queue_data, req_uuid_stc)) end else begin `TRACE(2, ("%t: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, - mreq_queue_full_addr, mshr_id_st1, req_uuid_st1)) + mreq_queue_full_addr, stC.req.mshr_id, req_uuid_stc)) end end end diff --git a/hw/rtl/cache/VX_cache_cluster.sv 
b/hw/rtl/cache/VX_cache_cluster.sv index 981b69ec3..6971eb4a5 100644 --- a/hw/rtl/cache/VX_cache_cluster.sv +++ b/hw/rtl/cache/VX_cache_cluster.sv @@ -46,6 +46,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( // Memory Request Queue Size parameter MREQ_SIZE = 4, + // Bank pipeline depth (2 = classic lookup+commit; larger defers the data array) + parameter LATENCY = 2, + // Enable cache writeable parameter WRITE_ENABLE = 1, @@ -167,6 +170,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( .MSHR_SIZE (MSHR_SIZE), .MRSQ_SIZE (MRSQ_SIZE), .MREQ_SIZE (MREQ_SIZE), + .LATENCY (LATENCY), .TAG_WIDTH (ARB_TAG_WIDTH), .TAG_SEL_IDX (TAG_SEL_IDX), .CORE_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : CORE_OUT_BUF), diff --git a/hw/rtl/cache/VX_cache_wrap.sv b/hw/rtl/cache/VX_cache_wrap.sv index f5511cc2f..b4e310e9f 100644 --- a/hw/rtl/cache/VX_cache_wrap.sv +++ b/hw/rtl/cache/VX_cache_wrap.sv @@ -44,6 +44,9 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( // Memory Request Queue Size parameter MREQ_SIZE = 4, + // Bank pipeline depth (2 = classic lookup+commit; larger defers the data array) + parameter LATENCY = 2, + // Enable cache writeable parameter WRITE_ENABLE = 1, @@ -187,6 +190,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( .MSHR_SIZE (MSHR_SIZE), .MRSQ_SIZE (MRSQ_SIZE), .MREQ_SIZE (MREQ_SIZE), + .LATENCY (LATENCY), .TAG_WIDTH (TAG_WIDTH), .CORE_OUT_BUF (BYPASS_ENABLE ? 1 : CORE_OUT_BUF), .MEM_OUT_BUF (BYPASS_ENABLE ? 
1 : MEM_OUT_BUF), diff --git a/hw/unittest/cache/VX_cache_top.sv b/hw/unittest/cache/VX_cache_top.sv index dc51ae59c..3c8be3588 100644 --- a/hw/unittest/cache/VX_cache_top.sv +++ b/hw/unittest/cache/VX_cache_top.sv @@ -22,16 +22,16 @@ module VX_cache_top import VX_gpu_pkg::*; #( // Number of memory ports parameter MEM_PORTS = 1, - // Size of cache in bytes - parameter CACHE_SIZE = 65536, + // Size of cache in bytes (L2 config: reproduces the 1MB 8-way data array) + parameter CACHE_SIZE = `VX_CFG_L2_CACHE_SIZE, // Size of line inside a bank in bytes - parameter LINE_SIZE = 64, + parameter LINE_SIZE = `VX_CFG_L2_LINE_SIZE, // Number of banks - parameter NUM_BANKS = 4, + parameter NUM_BANKS = 8, // Number of associative ways - parameter NUM_WAYS = 4, - // Size of a word in bytes - parameter WORD_SIZE = 16, + parameter NUM_WAYS = `VX_CFG_L2_NUM_WAYS, + // Size of a word in bytes (L2 word = L1 line = 512-bit data-array slice) + parameter WORD_SIZE = `VX_CFG_L2_LINE_SIZE, // Core Response Queue Size parameter CRSQ_SIZE = 8, @@ -45,11 +45,14 @@ module VX_cache_top import VX_gpu_pkg::*; #( // Enable cache writeable parameter WRITE_ENABLE = 1, - // Enable cache writeback - parameter WRITEBACK = 1, + // Enable cache writeback (L2 ships writethrough) + parameter WRITEBACK = `VX_CFG_L2_WRITEBACK, // Enable dirty bytes on writeback - parameter DIRTY_BYTES = 1, + parameter DIRTY_BYTES = `VX_CFG_L2_DIRTYBYTES, + + // Bank pipeline depth (L2 deferral: 4 above 64KB) + parameter LATENCY = `VX_CFG_L2_LATENCY, // core request tag size parameter TAG_WIDTH = 16 + UUID_WIDTH, @@ -57,8 +60,12 @@ module VX_cache_top import VX_gpu_pkg::*; #( // Core response output buffer parameter CORE_OUT_BUF = 3, - // Enable AMO support (tracks the A extension by default) - parameter AMO_ENABLE = `VX_CFG_EXT_A_ENABLED, + // Enable AMO support (synth #1 = 0; flip to 1 for the AMO timing run) + parameter AMO_ENABLE = 1, + + // LLC role: 1 exercises the local AMO read-modify-write commit (g_commit), + // 
the path that interacts with the deferred data pipeline. + parameter IS_LLC = 1, // Memory request output buffer parameter MEM_OUT_BUF = 3, @@ -166,11 +173,13 @@ module VX_cache_top import VX_gpu_pkg::*; #( .MSHR_SIZE (MSHR_SIZE), .MRSQ_SIZE (MRSQ_SIZE), .MREQ_SIZE (MREQ_SIZE), + .LATENCY (LATENCY), .TAG_WIDTH (TAG_WIDTH), .WRITE_ENABLE (WRITE_ENABLE), .WRITEBACK (WRITEBACK), .DIRTY_BYTES (DIRTY_BYTES), .AMO_ENABLE (AMO_ENABLE), + .IS_LLC (IS_LLC), .CORE_OUT_BUF (CORE_OUT_BUF), .MEM_OUT_BUF (MEM_OUT_BUF) ) cache ( diff --git a/sim/simx/cluster.cpp b/sim/simx/cluster.cpp index cd19d6216..968460f1e 100644 --- a/sim/simx/cluster.cpp +++ b/sim/simx/cluster.cpp @@ -79,7 +79,7 @@ class Cluster::Impl { VX_CFG_L2_WRITEBACK, // write-back false, // write response VX_CFG_L2_MSHR_SIZE, // mshr size - 2, // pipeline latency + VX_CFG_L2_LATENCY, // pipeline latency VX_CFG_L2_REPL_POLICY, // replacement policy (VX_CFG_L2_ENABLED != 0) && (VX_CFG_L3_ENABLED == 0), // is_llc }); diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp index 68480fa16..aa9d58a98 100644 --- a/sim/simx/processor.cpp +++ b/sim/simx/processor.cpp @@ -74,7 +74,7 @@ ProcessorImpl::ProcessorImpl() VX_CFG_L3_WRITEBACK, // write-back false, // write response VX_CFG_L3_MSHR_SIZE, // mshr size - 2, // pipeline latency + VX_CFG_L3_LATENCY, // pipeline latency VX_CFG_L3_REPL_POLICY, // replacement policy VX_CFG_L3_ENABLED != 0, // is_llc when L3 is the LLC }