vortexgpgpu · tinebp · Jun 18, 2026 · Jun 16, 2026 · Jun 18, 2026
diff --git a/VX_config.toml b/VX_config.toml
@@ -185,7 +185,8 @@ VX_CFG_DCACHE_WRITEBACK = 0
 VX_CFG_DCACHE_DIRTYBYTES = "expr: $VX_CFG_DCACHE_WRITEBACK"
 VX_CFG_DCACHE_REPL_POLICY = "expr: $__cache_repl_fifo"
 VX_CFG_DCACHE_MSHR_SIZE = 16
-VX_CFG_DCACHE_MREQ_SIZE = "expr: 4 + $VX_CFG_DCACHE_WRITEBACK * ($VX_CFG_DCACHE_MSHR_SIZE - 4)"
+VX_CFG_DCACHE_LATENCY = 2
+VX_CFG_DCACHE_MREQ_SIZE = "expr: 2 * $VX_CFG_DCACHE_LATENCY + $VX_CFG_DCACHE_WRITEBACK * ($VX_CFG_DCACHE_MSHR_SIZE - 2 * $VX_CFG_DCACHE_LATENCY)"
 VX_CFG_DCACHE_MRSQ_SIZE = 4
 VX_CFG_DCACHE_CRSQ_SIZE = 2
 
@@ -200,11 +201,12 @@ VX_CFG_L1_MEM_PORTS = "expr: min($VX_CFG_DCACHE_NUM_BANKS, $VX_CFG_PLATFORM_MEMO
 [l2cache]
 VX_CFG_L2_CACHE_SIZE = 1048576
 VX_CFG_L2_NUM_WAYS = 8
-VX_CFG_L2_WRITEBACK = 1
+VX_CFG_L2_WRITEBACK = 0
 VX_CFG_L2_DIRTYBYTES = "expr: $VX_CFG_L2_WRITEBACK"
 VX_CFG_L2_REPL_POLICY = "expr: $__cache_repl_fifo"
 VX_CFG_L2_MSHR_SIZE = 16
-VX_CFG_L2_MREQ_SIZE = "expr: 4 + $VX_CFG_L2_WRITEBACK * ($VX_CFG_L2_MSHR_SIZE - 4)"
+VX_CFG_L2_LATENCY = 4
+VX_CFG_L2_MREQ_SIZE = "expr: 2 * $VX_CFG_L2_LATENCY + $VX_CFG_L2_WRITEBACK * ($VX_CFG_L2_MSHR_SIZE - 2 * $VX_CFG_L2_LATENCY)"
 VX_CFG_L2_MRSQ_SIZE = 4
 VX_CFG_L2_CRSQ_SIZE = 2
 
@@ -214,11 +216,12 @@ VX_CFG_L2_MEM_PORTS = "expr: min($VX_CFG_L2_NUM_BANKS, $VX_CFG_PLATFORM_MEMORY_N
 [l3cache]
 VX_CFG_L3_CACHE_SIZE = 2097152
 VX_CFG_L3_NUM_WAYS = 8
-VX_CFG_L3_WRITEBACK = 1
+VX_CFG_L3_WRITEBACK = 0
 VX_CFG_L3_DIRTYBYTES = "expr: $VX_CFG_L3_WRITEBACK"
 VX_CFG_L3_REPL_POLICY = "expr: $__cache_repl_fifo"
 VX_CFG_L3_MSHR_SIZE = 16
-VX_CFG_L3_MREQ_SIZE = "expr: 4 + $VX_CFG_L3_WRITEBACK * ($VX_CFG_L3_MSHR_SIZE - 4)"
+VX_CFG_L3_LATENCY = 4
+VX_CFG_L3_MREQ_SIZE = "expr: 2 * $VX_CFG_L3_LATENCY + $VX_CFG_L3_WRITEBACK * ($VX_CFG_L3_MSHR_SIZE - 2 * $VX_CFG_L3_LATENCY)"
 VX_CFG_L3_MRSQ_SIZE = 4
 VX_CFG_L3_CRSQ_SIZE = 2
 

diff --git a/docs/proposals/cache_elastic_pipeline_proposal.md b/docs/proposals/cache_elastic_pipeline_proposal.md
diff --git a/hw/rtl/VX_cluster.sv b/hw/rtl/VX_cluster.sv
@@ -205,6 +205,7 @@ module VX_cluster import VX_gpu_pkg::*;
         .MSHR_SIZE      (`VX_CFG_L2_MSHR_SIZE),
         .MRSQ_SIZE      (`VX_CFG_L2_MRSQ_SIZE),
         .MREQ_SIZE      (`VX_CFG_L2_MREQ_SIZE),
+        .LATENCY        (`VX_CFG_L2_LATENCY),
         .TAG_WIDTH      (L2_TAG_WIDTH),
         .WRITE_ENABLE   (1),
         .WRITEBACK      (`VX_CFG_L2_WRITEBACK),

diff --git a/hw/rtl/VX_socket.sv b/hw/rtl/VX_socket.sv
@@ -85,7 +85,7 @@ module VX_socket import VX_gpu_pkg::*;
         .clk        (clk),
         .reset      (reset),
         .bus_in_if  (kmu_bus_if),
-        .bus_out_if (per_core_kmu_bus_if[`VX_CFG_SOCKET_SIZE-1:0])
+        .bus_out_if (per_core_kmu_bus_if)
     );
 
     VX_gbar_bus_if per_core_gbar_bus_if[`VX_CFG_SOCKET_SIZE]();
@@ -185,6 +185,7 @@ module VX_socket import VX_gpu_pkg::*;
         .MSHR_SIZE      (`VX_CFG_DCACHE_MSHR_SIZE),
         .MRSQ_SIZE      (`VX_CFG_DCACHE_MRSQ_SIZE),
         .MREQ_SIZE      (`VX_CFG_DCACHE_MREQ_SIZE),
+        .LATENCY        (`VX_CFG_DCACHE_LATENCY),
         .TAG_WIDTH      (DCACHE_TAG_WIDTH),
         .WRITE_ENABLE   (1),
         .WRITEBACK      (`VX_CFG_DCACHE_WRITEBACK),

diff --git a/hw/rtl/Vortex.sv b/hw/rtl/Vortex.sv
@@ -131,6 +131,7 @@ module Vortex import VX_gpu_pkg::*, VX_trace_pkg::*; (
         .MSHR_SIZE      (`VX_CFG_L3_MSHR_SIZE),
         .MRSQ_SIZE      (`VX_CFG_L3_MRSQ_SIZE),
         .MREQ_SIZE      (`VX_CFG_L3_MREQ_SIZE),
+        .LATENCY        (`VX_CFG_L3_LATENCY),
         .TAG_WIDTH      (L3_TAG_WIDTH),
         .WRITE_ENABLE   (1),
         .WRITEBACK      (`VX_CFG_L3_WRITEBACK),
@@ -180,7 +181,7 @@ module Vortex import VX_gpu_pkg::*, VX_trace_pkg::*; (
         .clk        (clk),
         .reset      (reset),
         .bus_in_if  (kmu_bus_in),
-        .bus_out_if (per_cluster_kmu_bus_if[`VX_CFG_NUM_CLUSTERS-1:0])
+        .bus_out_if (per_cluster_kmu_bus_if)
     );
 
     VX_dcr_bus_if per_cluster_dcr_bus_if[`VX_CFG_NUM_CLUSTERS]();

diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv
@@ -42,6 +42,9 @@ module VX_cache import VX_gpu_pkg::*; #(
     // Memory Request Queue Size
     parameter MREQ_SIZE             = 4,
 
+    // Bank pipeline depth (2 = classic lookup+commit; larger defers the data array)
+    parameter LATENCY               = 2,
+
     // Enable cache writeable
     parameter WRITE_ENABLE          = 1,
 
@@ -390,6 +393,7 @@ module VX_cache import VX_gpu_pkg::*; #(
             .MSHR_SIZE    (MSHR_SIZE),
             .MRSQ_SIZE    (MRSQ_SIZE),
             .MREQ_SIZE    (MREQ_SIZE),
+            .LATENCY      (LATENCY),
             .TAG_WIDTH    (TAG_WIDTH),
             .CORE_OUT_BUF (CORE_RSP_BUF_ENABLE ? 2 : 0),
             .MEM_OUT_BUF  (MEM_REQ_BUF_ENABLE ? 2 : 0),

diff --git a/hw/rtl/cache/VX_cache_amo.sv b/hw/rtl/cache/VX_cache_amo.sv
@@ -39,7 +39,11 @@ module VX_cache_amo import VX_gpu_pkg::*; #(
     parameter ATTR_WIDTH      = 1,
     parameter MSHR_SIZE       = 1,
     parameter MSHR_ADDR_WIDTH = 1,
-    parameter WORDS_PER_LINE  = 1
+    parameter WORDS_PER_LINE  = 1,
+    // Deferred-commit depth: the commit ports (_st1) are fed from the bank's
+    // stC stage, which sits PIPE_EX+1 cycles behind the S0 lookup. 0 = classic
+    // 2-stage bank (stC == S1).
+    parameter PIPE_EX         = 0
 ) (
     input  wire                          clk,
     input  wire                          reset,
@@ -340,20 +344,51 @@ module VX_cache_amo import VX_gpu_pkg::*; #(
             end
         end
 
-        // response (fired at S1): SC -> 0/1; other -> old value (LSU sexts).
-        // The old value is available at S1 directly, no ALU needed.
-        wire [63:0] rsp_word = (amo_st1.amo_op == AMO_OP_SC) ? {63'h0, sc_fail_st1} : old_st1;
-        if (WORD_WIDTH < 64) begin : g_rsp_upper_unused
-            `UNUSED_VAR (rsp_word[63:WORD_WIDTH])
+        // Response (fired at S1; in-place, no ALU): the requester extracts its
+        // target word by byte offset, so the old value can stay where it sits in
+        // the line with the other bytes masked off -- this avoids a full-width
+        // barrel shift on the hot read->response path (read_word -> rsp_data was
+        // the critical path: a >>bit_off then <<bit_off round-trip just to mask).
+        // The byte mask comes straight from byteen (each set bit enables 8 data bits);
+        // masking is bit-identical to (old_st1 << bit_off) for the consumed bytes.
+        // SC returns 0/1 placed at the offset (rare path, 1-bit shift input).
+        wire [WORD_WIDTH-1:0] rsp_byte_mask;
+        for (genvar b = 0; b < WORD_SIZE; ++b) begin : g_rsp_mask
+            assign rsp_byte_mask[b*8 +: 8] = {8{byteen_st1[b]}};
         end
+        wire [WORD_WIDTH-1:0] amo_old_inplace = line_word_st1 & rsp_byte_mask;
+        wire [WORD_WIDTH-1:0] sc_rsp_inplace  = WORD_WIDTH'(sc_fail_st1) << bit_off_st1;
 
         assign amo_hit_st1 = amo_hit_w;
-        assign rsp_data    = WORD_WIDTH'(rsp_word) << bit_off_st1;
+        assign rsp_data    = (amo_st1.amo_op == AMO_OP_SC) ? sc_rsp_inplace : amo_old_inplace;
+        // Bridge the S0 prediction across the deferred lookup->commit window:
+        // with PIPE_EX>0 the AMO sits in the commit bubble for PIPE_EX cycles
+        // between do_store_st0 (S0) and do_store_st1 (stC), so commit_busy would
+        // have a gap and let a same-line request race the writeback. A PIPE_EX-deep
+        // shift of do_store_st0 fills the gap (continuous S0..stC hold).
+        wire amo_inflight; // high while a predicted store sits between S0 and the commit stage
+        if (PIPE_EX == 0) begin : g_no_bridge
+            assign amo_inflight = 1'b0; // no deferred stages, so there is nothing to bridge
+        end else begin : g_bridge
+            reg [PIPE_EX-1:0] store_inflight; // shift register, one bit per deferred stage
+            always @(posedge clk) begin
+                if (reset) begin
+                    store_inflight <= '0;
+                end else if (~pipe_stall) begin // holds its value while pipe_stall is asserted
+                    store_inflight[0] <= do_store_st0; // capture the S0 store prediction
+                    for (int i = 1; i < PIPE_EX; ++i) begin
+                        store_inflight[i] <= store_inflight[i-1];
+                    end
+                end
+            end
+            assign amo_inflight = (| store_inflight); // set if any tracked stage holds a store
+        end
+
         // Commit in flight: holds off new core-request admission from the S0
-        // prediction through the compute stage and the writeback. Replays are
-        // NOT blocked (the MSHR streams coalesced same-line AMOs back to back);
-        // those are paced instead by chain_stall.
-        assign commit_busy = do_store_st0 || do_store_st1 || cmp_valid || wb_pending_r;
+        // prediction through the deferred bubble, the compute stage and the
+        // writeback. Replays are NOT blocked (the MSHR streams coalesced same-
+        // line AMOs back to back); those are paced instead by chain_stall.
+        assign commit_busy = do_store_st0 || amo_inflight || do_store_st1 || cmp_valid || wb_pending_r;
         // Pace any same-line request sitting behind an in-flight compute by one
         // cycle, so the result lands in wb_data_r and forwards cleanly. Gated on
         // cmp_valid (an AMO is computing), so it never fires for baseline traffic.