Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions VX_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,8 @@ VX_CFG_DCACHE_WRITEBACK = 0
VX_CFG_DCACHE_DIRTYBYTES = "expr: $VX_CFG_DCACHE_WRITEBACK"
VX_CFG_DCACHE_REPL_POLICY = "expr: $__cache_repl_fifo"
VX_CFG_DCACHE_MSHR_SIZE = 16
VX_CFG_DCACHE_MREQ_SIZE = "expr: 4 + $VX_CFG_DCACHE_WRITEBACK * ($VX_CFG_DCACHE_MSHR_SIZE - 4)"
VX_CFG_DCACHE_LATENCY = 2
VX_CFG_DCACHE_MREQ_SIZE = "expr: 2 * $VX_CFG_DCACHE_LATENCY + $VX_CFG_DCACHE_WRITEBACK * ($VX_CFG_DCACHE_MSHR_SIZE - 2 * $VX_CFG_DCACHE_LATENCY)"
VX_CFG_DCACHE_MRSQ_SIZE = 4
VX_CFG_DCACHE_CRSQ_SIZE = 2

Expand All @@ -200,11 +201,12 @@ VX_CFG_L1_MEM_PORTS = "expr: min($VX_CFG_DCACHE_NUM_BANKS, $VX_CFG_PLATFORM_MEMO
[l2cache]
VX_CFG_L2_CACHE_SIZE = 1048576
VX_CFG_L2_NUM_WAYS = 8
VX_CFG_L2_WRITEBACK = 1
VX_CFG_L2_WRITEBACK = 0
VX_CFG_L2_DIRTYBYTES = "expr: $VX_CFG_L2_WRITEBACK"
VX_CFG_L2_REPL_POLICY = "expr: $__cache_repl_fifo"
VX_CFG_L2_MSHR_SIZE = 16
VX_CFG_L2_MREQ_SIZE = "expr: 4 + $VX_CFG_L2_WRITEBACK * ($VX_CFG_L2_MSHR_SIZE - 4)"
VX_CFG_L2_LATENCY = 4
VX_CFG_L2_MREQ_SIZE = "expr: 2 * $VX_CFG_L2_LATENCY + $VX_CFG_L2_WRITEBACK * ($VX_CFG_L2_MSHR_SIZE - 2 * $VX_CFG_L2_LATENCY)"
VX_CFG_L2_MRSQ_SIZE = 4
VX_CFG_L2_CRSQ_SIZE = 2

Expand All @@ -214,11 +216,12 @@ VX_CFG_L2_MEM_PORTS = "expr: min($VX_CFG_L2_NUM_BANKS, $VX_CFG_PLATFORM_MEMORY_N
[l3cache]
VX_CFG_L3_CACHE_SIZE = 2097152
VX_CFG_L3_NUM_WAYS = 8
VX_CFG_L3_WRITEBACK = 1
VX_CFG_L3_WRITEBACK = 0
VX_CFG_L3_DIRTYBYTES = "expr: $VX_CFG_L3_WRITEBACK"
VX_CFG_L3_REPL_POLICY = "expr: $__cache_repl_fifo"
VX_CFG_L3_MSHR_SIZE = 16
VX_CFG_L3_MREQ_SIZE = "expr: 4 + $VX_CFG_L3_WRITEBACK * ($VX_CFG_L3_MSHR_SIZE - 4)"
VX_CFG_L3_LATENCY = 4
VX_CFG_L3_MREQ_SIZE = "expr: 2 * $VX_CFG_L3_LATENCY + $VX_CFG_L3_WRITEBACK * ($VX_CFG_L3_MSHR_SIZE - 2 * $VX_CFG_L3_LATENCY)"
VX_CFG_L3_MRSQ_SIZE = 4
VX_CFG_L3_CRSQ_SIZE = 2

Expand Down
401 changes: 401 additions & 0 deletions docs/proposals/cache_elastic_pipeline_proposal.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions hw/rtl/VX_cluster.sv
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ module VX_cluster import VX_gpu_pkg::*;
.MSHR_SIZE (`VX_CFG_L2_MSHR_SIZE),
.MRSQ_SIZE (`VX_CFG_L2_MRSQ_SIZE),
.MREQ_SIZE (`VX_CFG_L2_MREQ_SIZE),
.LATENCY (`VX_CFG_L2_LATENCY),
.TAG_WIDTH (L2_TAG_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`VX_CFG_L2_WRITEBACK),
Expand Down
3 changes: 2 additions & 1 deletion hw/rtl/VX_socket.sv
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ module VX_socket import VX_gpu_pkg::*;
.clk (clk),
.reset (reset),
.bus_in_if (kmu_bus_if),
.bus_out_if (per_core_kmu_bus_if[`VX_CFG_SOCKET_SIZE-1:0])
.bus_out_if (per_core_kmu_bus_if)
);

VX_gbar_bus_if per_core_gbar_bus_if[`VX_CFG_SOCKET_SIZE]();
Expand Down Expand Up @@ -185,6 +185,7 @@ module VX_socket import VX_gpu_pkg::*;
.MSHR_SIZE (`VX_CFG_DCACHE_MSHR_SIZE),
.MRSQ_SIZE (`VX_CFG_DCACHE_MRSQ_SIZE),
.MREQ_SIZE (`VX_CFG_DCACHE_MREQ_SIZE),
.LATENCY (`VX_CFG_DCACHE_LATENCY),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`VX_CFG_DCACHE_WRITEBACK),
Expand Down
3 changes: 2 additions & 1 deletion hw/rtl/Vortex.sv
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ module Vortex import VX_gpu_pkg::*, VX_trace_pkg::*; (
.MSHR_SIZE (`VX_CFG_L3_MSHR_SIZE),
.MRSQ_SIZE (`VX_CFG_L3_MRSQ_SIZE),
.MREQ_SIZE (`VX_CFG_L3_MREQ_SIZE),
.LATENCY (`VX_CFG_L3_LATENCY),
.TAG_WIDTH (L3_TAG_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`VX_CFG_L3_WRITEBACK),
Expand Down Expand Up @@ -180,7 +181,7 @@ module Vortex import VX_gpu_pkg::*, VX_trace_pkg::*; (
.clk (clk),
.reset (reset),
.bus_in_if (kmu_bus_in),
.bus_out_if (per_cluster_kmu_bus_if[`VX_CFG_NUM_CLUSTERS-1:0])
.bus_out_if (per_cluster_kmu_bus_if)
);

VX_dcr_bus_if per_cluster_dcr_bus_if[`VX_CFG_NUM_CLUSTERS]();
Expand Down
4 changes: 4 additions & 0 deletions hw/rtl/cache/VX_cache.sv
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ module VX_cache import VX_gpu_pkg::*; #(
// Memory Request Queue Size
parameter MREQ_SIZE = 4,

// Bank pipeline depth (2 = classic lookup+commit; larger defers the data array)
parameter LATENCY = 2,

// Enable cache writeable
parameter WRITE_ENABLE = 1,

Expand Down Expand Up @@ -390,6 +393,7 @@ module VX_cache import VX_gpu_pkg::*; #(
.MSHR_SIZE (MSHR_SIZE),
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.LATENCY (LATENCY),
.TAG_WIDTH (TAG_WIDTH),
.CORE_OUT_BUF (CORE_RSP_BUF_ENABLE ? 2 : 0),
.MEM_OUT_BUF (MEM_REQ_BUF_ENABLE ? 2 : 0),
Expand Down
57 changes: 46 additions & 11 deletions hw/rtl/cache/VX_cache_amo.sv
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,11 @@ module VX_cache_amo import VX_gpu_pkg::*; #(
parameter ATTR_WIDTH = 1,
parameter MSHR_SIZE = 1,
parameter MSHR_ADDR_WIDTH = 1,
parameter WORDS_PER_LINE = 1
parameter WORDS_PER_LINE = 1,
// Deferred-commit depth: the commit ports (_st1) are fed from the bank's
// stC stage, which sits PIPE_EX+1 cycles behind the S0 lookup. 0 = classic
// 2-stage bank (stC == S1).
parameter PIPE_EX = 0
) (
input wire clk,
input wire reset,
Expand Down Expand Up @@ -340,20 +344,51 @@ module VX_cache_amo import VX_gpu_pkg::*; #(
end
end

// response (fired at S1): SC -> 0/1; other -> old value (LSU sexts).
// The old value is available at S1 directly, no ALU needed.
wire [63:0] rsp_word = (amo_st1.amo_op == AMO_OP_SC) ? {63'h0, sc_fail_st1} : old_st1;
if (WORD_WIDTH < 64) begin : g_rsp_upper_unused
`UNUSED_VAR (rsp_word[63:WORD_WIDTH])
// Response (fired at S1; in-place, no ALU): the requester extracts its
// target word by byte offset, so the old value can stay where it sits in
// the line with the other bytes masked off -- this avoids a full-width
// barrel shift on the hot read->response path (read_word -> rsp_data was
// the critical path: a >>bit_off then <<bit_off round-trip just to mask).
// The byte mask comes straight from byteen (one line bit per set byte);
// masking is bit-identical to (old_st1 << bit_off) for the consumed bytes.
// SC returns 0/1 placed at the offset (rare path, 1-bit shift input).
wire [WORD_WIDTH-1:0] rsp_byte_mask;
for (genvar b = 0; b < WORD_SIZE; ++b) begin : g_rsp_mask
assign rsp_byte_mask[b*8 +: 8] = {8{byteen_st1[b]}};
end
wire [WORD_WIDTH-1:0] amo_old_inplace = line_word_st1 & rsp_byte_mask;
wire [WORD_WIDTH-1:0] sc_rsp_inplace = WORD_WIDTH'(sc_fail_st1) << bit_off_st1;

assign amo_hit_st1 = amo_hit_w;
assign rsp_data = WORD_WIDTH'(rsp_word) << bit_off_st1;
assign rsp_data = (amo_st1.amo_op == AMO_OP_SC) ? sc_rsp_inplace : amo_old_inplace;
// Bridge the S0 prediction across the deferred lookup->commit window:
// with PIPE_EX>0 the AMO sits in the commit bubble for PIPE_EX cycles
// between do_store_st0 (S0) and do_store_st1 (stC), so commit_busy would
// gap and let a same-line request race the writeback. A PIPE_EX-deep
// shift of do_store_st0 fills the gap (continuous S0..stC hold).
wire amo_inflight;
if (PIPE_EX == 0) begin : g_no_bridge
assign amo_inflight = 1'b0;
end else begin : g_bridge
reg [PIPE_EX-1:0] store_inflight;
always @(posedge clk) begin
if (reset) begin
store_inflight <= '0;
end else if (~pipe_stall) begin
store_inflight[0] <= do_store_st0;
for (int i = 1; i < PIPE_EX; ++i) begin
store_inflight[i] <= store_inflight[i-1];
end
end
end
assign amo_inflight = (| store_inflight);
end

// Commit in flight: holds off new core-request admission from the S0
// prediction through the compute stage and the writeback. Replays are
// NOT blocked (the MSHR streams coalesced same-line AMOs back to back);
// those are paced instead by chain_stall.
assign commit_busy = do_store_st0 || do_store_st1 || cmp_valid || wb_pending_r;
// prediction through the deferred bubble, the compute stage and the
// writeback. Replays are NOT blocked (the MSHR streams coalesced same-
// line AMOs back to back); those are paced instead by chain_stall.
assign commit_busy = do_store_st0 || amo_inflight || do_store_st1 || cmp_valid || wb_pending_r;
// Pace any same-line request sitting behind an in-flight compute by one
// cycle, so the result lands in wb_data_r and forwards cleanly. Gated on
// cmp_valid (an AMO is computing), so it never fires for baseline traffic.
Expand Down
Loading