From da0b130aa9a6d9587d5942cfb8188ac82eb3ac32 Mon Sep 17 00:00:00 2001
From: tinebp <tinebp@yahoo.com>
Date: Wed, 17 Jun 2026 20:03:24 -0700
Subject: [PATCH 1/2] issue: shorten scoreboard issue-stage critical path

Fold per-FU dispatch-queue back-pressure (fu_goingfull) and the FU-lock
serialization into the registered operands_ready, so the arbiter request is
a pure flop; fu_locked_n consults the next-state so it stays bit-exact.
Share in_use_mask between the busy check and operand trace, and replace the
selection with a cyclic arbiter. Add the VX_CFG_DISPATCH_QUEUE_SIZE credit
knob (default 4) with dispatcher/issue-slice fu_release plumbing.

Also: DUT synth-flow fixes (project.tcl/xdc clock knob, scoped
async_bram_patch, xrt Makefile) and coding-guideline doc tweaks.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 VX_config.toml                         |   1 +
 docs/coding_guidelines_cpp.md          |   2 +-
 docs/coding_guidelines_verilog.md      |   2 +-
 hw/rtl/VX_gpu_pkg.sv                   |   2 +
 hw/rtl/core/VX_dispatcher.sv           |  12 ++-
 hw/rtl/core/VX_issue_slice.sv          |   6 +-
 hw/rtl/core/VX_scoreboard.sv           | 128 +++++++++++++++----------
 hw/scripts/xilinx_async_bram_patch.tcl |  20 +++-
 hw/syn/xilinx/dut/project.tcl          |  19 ++++
 hw/syn/xilinx/dut/project.xdc          |  13 ++-
 hw/syn/xilinx/xrt/Makefile             |   8 +-
 11 files changed, 143 insertions(+), 70 deletions(-)

diff --git a/VX_config.toml b/VX_config.toml
index 1072c1250d..0b28c6a4a0 100644
--- a/VX_config.toml
+++ b/VX_config.toml
@@ -68,6 +68,7 @@ VX_CFG_NUM_BARRIERS = 8
 VX_CFG_MAX_BAR_EVENTS = 32
 
 VX_CFG_IBUF_SIZE = 4
+VX_CFG_DISPATCH_QUEUE_SIZE = 4
 VX_CFG_ISSUE_WIDTH = "expr: up($VX_CFG_NUM_WARPS / 16)"
 VX_CFG_SIMD_WIDTH = "expr: $VX_CFG_NUM_THREADS"
 VX_CFG_NUM_OPCS = "expr: up($VX_CFG_NUM_WARPS / (4 * $VX_CFG_ISSUE_WIDTH))"
diff --git a/docs/coding_guidelines_cpp.md b/docs/coding_guidelines_cpp.md
index fabbc15ac7..09113da34b 100644
--- a/docs/coding_guidelines_cpp.md
+++ b/docs/coding_guidelines_cpp.md
@@ -211,4 +211,4 @@ DT(2, "req: wid=" << wid << " pc=0x" << std::hex << pc << std::endl);
 
 ## 10. Comment Content & Intent
 
-Comments describe what the adjacent code does and why, not the process that produced it. Prefer self-documenting code — good abstractions and consistent naming — and drop comments on code whose intent is already obvious; keep the rest brief, one or two lines per block as the norm (longer only where genuinely warranted, at the author's discretion), since over-detailed comments obscure the code and drift out of sync with later changes. Never embed development metadata or history (phase/step/version/part/feature/bug numbers, "proposal", "spec"), debugging or change narration ("fixing bug…", "was broken because…" — that is what commit messages are for), or references to design documents. SimX and other host-side models must not reference RTL details in comments or naming: the two evolve independently, so any such reference silently goes stale. These rules apply to every source file and script.
\ No newline at end of file
+Comments describe what the adjacent code does and why, not the process that produced it. Prefer self-documenting code — good abstractions and consistent naming — and drop comments on code whose intent is already obvious; keep the rest brief, one or two lines per block as the norm (longer only where genuinely warranted, at the author's discretion), since over-detailed comments obscure the code and drift out of sync with later changes. Never embed development metadata or history (phase/step/version/part/feature/bug numbers, "proposal", "spec"), debugging or change narration ("fixing bug…", "was broken because…" — that is what commit messages are for), or references to design documents. Comments and names must not reference the other implementation layer's internals: host-side models (SimX, runtime, drivers) must not name RTL signals or parameters, and RTL must not name host-side/SimX details. The layers evolve independently, so any such reference silently goes stale. These rules apply to every source file and script.
\ No newline at end of file
diff --git a/docs/coding_guidelines_verilog.md b/docs/coding_guidelines_verilog.md
index 55fee0f903..240eeaca48 100644
--- a/docs/coding_guidelines_verilog.md
+++ b/docs/coding_guidelines_verilog.md
@@ -221,7 +221,7 @@ Incorrect (space-separated entries):
 
 ## 9. Comment Content & Intent
 
-Comments describe what the adjacent code does and why, not the process that produced it. Prefer self-documenting code — good abstractions and consistent naming — and drop comments on code whose intent is already obvious; keep the rest brief, one or two lines per block as the norm (longer only where genuinely warranted, at the author's discretion), since over-detailed comments obscure the code and drift out of sync with later changes. Never embed development metadata or history (phase/step/version/part/feature/bug numbers, "proposal", "spec"), debugging or change narration ("fixing bug…", "was broken because…" — that is what commit messages are for), or references to design documents. Do not anchor comments to volatile cross-component details (e.g., the SimX/software model) that evolve independently and will silently go stale. These rules apply to every source file and script.
+Comments describe what the adjacent code does and why, not the process that produced it. Prefer self-documenting code — good abstractions and consistent naming — and drop comments on code whose intent is already obvious; keep the rest brief, one or two lines per block as the norm (longer only where genuinely warranted, at the author's discretion), since over-detailed comments obscure the code and drift out of sync with later changes. Never embed development metadata or history (phase/step/version/part/feature/bug numbers, "proposal", "spec"), debugging or change narration ("fixing bug…", "was broken because…" — that is what commit messages are for), or references to design documents. Comments and names must not reference the other implementation layer's internals: host-side models (SimX, runtime, drivers) must not name RTL signals or parameters, and RTL must not name host-side/SimX details. The layers evolve independently, so any such reference silently goes stale. These rules apply to every source file and script.
 
 ## 10. Combinational Logic Depth & Timing Closure
 
diff --git a/hw/rtl/VX_gpu_pkg.sv b/hw/rtl/VX_gpu_pkg.sv
index 3131a99c72..79a2f06808 100644
--- a/hw/rtl/VX_gpu_pkg.sv
+++ b/hw/rtl/VX_gpu_pkg.sv
@@ -542,6 +542,8 @@ package VX_gpu_pkg;
     localparam ISSUE_WIS_BITS = `CLOG2(PER_ISSUE_WARPS);
     localparam ISSUE_WIS_W = `UP(ISSUE_WIS_BITS);
 
+    localparam DISPATCH_QSIZE = `VX_CFG_DISPATCH_QUEUE_SIZE;
+
     localparam PER_OPC_WARPS = PER_ISSUE_WARPS / `VX_CFG_NUM_OPCS;
     localparam PER_OPC_NW_BITS = `CLOG2(PER_OPC_WARPS);
     localparam PER_OPC_NW_W = `UP(PER_OPC_NW_BITS);
diff --git a/hw/rtl/core/VX_dispatcher.sv b/hw/rtl/core/VX_dispatcher.sv
index 3d314f176d..4326a5e47d 100644
--- a/hw/rtl/core/VX_dispatcher.sv
+++ b/hw/rtl/core/VX_dispatcher.sv
@@ -28,7 +28,7 @@ module VX_dispatcher import VX_gpu_pkg::*; #(
     VX_operands_if.slave    operands_if,
 
     // outputs
-    output wire [NUM_EX_UNITS-1:0] dispatch_ready,
+    output wire [NUM_EX_UNITS-1:0] fu_release,
     VX_dispatch_if.master   dispatch_if [NUM_EX_UNITS]
 );
     `UNUSED_SPARAM (INSTANCE_ID)
@@ -39,15 +39,17 @@ module VX_dispatcher import VX_gpu_pkg::*; #(
     wire [NUM_EX_UNITS-1:0] operands_ready_in;
     assign operands_if.ready = operands_ready_in[operands_if.data.ex_type];
 
-    // FU-availability feedback to scoreboard to avoid head-of-line blocking
-    assign dispatch_ready = operands_ready_in;
+    // Per-FU dispatch credit returned to the scoreboard on FU accept.
+    for (genvar i = 0; i < NUM_EX_UNITS; ++i) begin : g_fu_release
+        assign fu_release[i] = dispatch_if[i].valid && dispatch_if[i].ready;
+    end
 
     // Non-LSU execution units: pass operand data straight through
     for (genvar i = 0; i < NUM_EX_UNITS; ++i) begin : g_buffers
         if (i != EX_LSU) begin : g_non_lsu
             VX_elastic_buffer #(
                 .DATAW   (OUT_DATAW),
-                .SIZE    (2),
+                .SIZE    (DISPATCH_QSIZE),
                 .OUT_REG (1)
             ) buffer (
                 .clk        (clk),
@@ -108,7 +110,7 @@ module VX_dispatcher import VX_gpu_pkg::*; #(
     // LSU: substitute effective base address and cleared offset for bulk ops
     VX_elastic_buffer #(
         .DATAW   (OUT_DATAW),
-        .SIZE    (2),
+        .SIZE    (DISPATCH_QSIZE),
         .OUT_REG (1)
     ) lsu_buffer (
         .clk        (clk),
diff --git a/hw/rtl/core/VX_issue_slice.sv b/hw/rtl/core/VX_issue_slice.sv
index 12e49584ea..b827b1e0ee 100644
--- a/hw/rtl/core/VX_issue_slice.sv
+++ b/hw/rtl/core/VX_issue_slice.sv
@@ -36,7 +36,7 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
     VX_scoreboard_if scoreboard_if();
     VX_operands_if operands_if();
 
-    wire [NUM_EX_UNITS-1:0] dispatch_ready;
+    wire [NUM_EX_UNITS-1:0] fu_release;
 
     VX_ibuffer #(
         .INSTANCE_ID (`SFORMATF(("%s-ibuffer", INSTANCE_ID))),
@@ -60,7 +60,7 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
     `ifdef PERF_ENABLE
         .perf_stalls    (issue_perf.scb_stalls),
     `endif
-        .dispatch_ready (dispatch_ready),
+        .fu_release     (fu_release),
         .writeback_if   (writeback_if),
         .ibuffer_if     (ibuffer_if),
         .scoreboard_if  (scoreboard_if)
@@ -91,7 +91,7 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
         .perf_instrs    (issue_perf.dispatch_instrs),
     `endif
         .operands_if    (operands_if),
-        .dispatch_ready (dispatch_ready),
+        .fu_release     (fu_release),
         .dispatch_if    (dispatch_if)
     );
 
diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv
index 5b2b2c3011..7d611ecd07 100644
--- a/hw/rtl/core/VX_scoreboard.sv
+++ b/hw/rtl/core/VX_scoreboard.sv
@@ -24,7 +24,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
     output reg [PERF_CTR_BITS-1:0] perf_stalls,
 `endif
 
-    input wire [NUM_EX_UNITS-1:0] dispatch_ready,
+    input wire [NUM_EX_UNITS-1:0] fu_release,
     VX_writeback_if.slave   writeback_if,
     VX_ibuffer_if.slave     ibuffer_if [PER_ISSUE_WARPS],
     VX_scoreboard_if.master scoreboard_if
@@ -38,6 +38,30 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
     localparam OUT_DATAW = $bits(scoreboard_t) - ISSUE_WIS_W;
     localparam OUT_BUF   = 3; // Use skid buffer (SIZE=2, OUT_REG=1)
 
+    // Per-FU dispatch credits: spent at issue, reclaimed on FU accept, so a
+    // credit covers ops still in operand collection (not yet at the queue).
+    wire [NUM_EX_UNITS-1:0] fu_issue;
+    wire [NUM_EX_UNITS-1:0] fu_goingfull;
+
+    // going-full (not full): a 1-slot guard band keeps outstanding <= queue
+    // depth despite the registered suppress lag, so an issued op never stalls
+    // in the shared operand-collection path and HoL-blocks another FU.
+    for (genvar e = 0; e < NUM_EX_UNITS; ++e) begin : g_fu_goingfull
+        VX_pending_size #(
+            .SIZE (DISPATCH_QSIZE)
+        ) fu_pending (
+            .clk        (clk),
+            .reset      (reset),
+            .incr       (fu_issue[e]),
+            .decr       (fu_release[e]),
+            `UNUSED_PIN (empty),
+            `UNUSED_PIN (alm_empty),
+            `UNUSED_PIN (full),
+            .alm_full   (fu_goingfull[e]),
+            `UNUSED_PIN (size)
+        );
+    end
+
     VX_ibuffer_if staging_if [PER_ISSUE_WARPS]();
     wire [PER_ISSUE_WARPS-1:0] operands_ready;
 
@@ -105,10 +129,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
             end
         end
 
-        // Operand busy-check uses inuse after writeback release only, keeping the
-        // arbiter grant off the deep cone. The staging reserve goes into
-        // inuse_regs_n (which drives the registers) and is re-added to the busy
-        // check as a shallow conflict gate below.
+        // Writeback release feeds wb_inuse_regs; the staging reserve is added on
+        // top to form inuse_regs_n, which the busy check reads directly.
         reg [NUM_REGS-1:0]  wb_inuse_regs;
         reg [NUM_XREGS-1:0] wb_inuse_xregs;
         always @(*) begin
@@ -133,46 +155,50 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
             end
         end
 
+        // in_use_mask = inuse_regs_n masked by the operand-dependency set
+        // (the ibuffer instr on a fire, else the staging instr), shared by the
+        // regs_busy reduction and the per-operand operands_busy check.
         wire [REG_TYPES-1:0][RV_REGS-1:0] in_use_mask;
-        wire [REG_TYPES-1:0] rd_resv_hit; // staged rd vs operand set
         for (genvar i = 0; i < REG_TYPES; ++i) begin : g_in_use_mask
             wire [RV_REGS-1:0] ibf_reg_mask = ibf_opd_mask[0][i] | ibf_opd_mask[1][i] | ibf_opd_mask[2][i] | ibf_opd_mask[3][i];
             wire [RV_REGS-1:0] stg_reg_mask = stg_opd_mask[0][i] | stg_opd_mask[1][i] | stg_opd_mask[2][i] | stg_opd_mask[3][i];
             wire [RV_REGS-1:0] regs_mask = ibuffer_fire ? ibf_reg_mask : stg_reg_mask;
-            assign in_use_mask[i] = wb_inuse_regs[i * RV_REGS +: RV_REGS] & regs_mask;
-            assign rd_resv_hit[i] = |(stg_opd_mask[0][i] & regs_mask);
+            assign in_use_mask[i] = inuse_regs_n[i * RV_REGS +: RV_REGS] & regs_mask;
         end
 
         wire [REG_TYPES-1:0] regs_busy;
         for (genvar i = 0; i < REG_TYPES; ++i) begin : g_regs_busy
-            assign regs_busy[i] = (in_use_mask[i] != 0);
+            assign regs_busy[i] = (| in_use_mask[i]);
         end
 
         for (genvar i = 0; i < NUM_OPDS; ++i) begin : g_operands_busy
             wire [REG_TYPE_BITS-1:0] rtype = get_reg_type(stg_opds[i]);
-            assign operands_busy[i] = (in_use_mask[rtype] & stg_opd_mask[i][rtype]) != 0;
+            assign operands_busy[i] = | (in_use_mask[rtype] & stg_opd_mask[i][rtype]);
         end
 
-
         wire [NUM_XREGS-1:0] xregs_mask = ibuffer_fire ? ibf_xregs_mask : stg_xregs_mask;
-        wire [NUM_XREGS-1:0] xregs_busy = wb_inuse_xregs & xregs_mask;
-
-        // Shallow gates equivalent to reserving rd / wr_xregs in the busy reduce.
-        wire rd_resv_conflict = staging_fire && staging_if[w].data.wb && (|rd_resv_hit);
-        wire x_resv_conflict  = staging_fire && (|(staging_if[w].data.wr_xregs & xregs_mask));
+        wire xregs_busy = | (inuse_xregs_n & xregs_mask);
 
+        wire [EX_BITS-1:0] ex_sel = ibuffer_fire ? ibuffer_if[w].data.ex_type : staging_if[w].data.ex_type;
         reg operands_ready_r;
 
+        // Readiness folds data hazards, FU-congestion and FU-lock into one flop.
+        // fu_locked_n is the next-state, look-ahead-aligned with ex_sel.
+        wire fu_lock_sel = ibuffer_fire ? ibuffer_if[w].data.fu_lock : staging_if[w].data.fu_lock;
+        wire data_ready  = ~((|regs_busy) || xregs_busy);
+        wire operands_ready_n = data_ready
+                             && ~fu_goingfull[ex_sel]
+                             && ~(fu_locked_n[ex_sel] && fu_lock_sel);
+
         always @(posedge clk) begin
             if (reset) begin
                 inuse_regs  <= '0;
                 inuse_xregs <= '0;
             end else begin
-                inuse_regs <= inuse_regs_n;
+                inuse_regs  <= inuse_regs_n;
                 inuse_xregs <= inuse_xregs_n;
             end
-            operands_ready_r <= (regs_busy == 0) && !rd_resv_conflict
-                             && (xregs_busy == 0) && !x_resv_conflict;
+            operands_ready_r <= operands_ready_n;
         end
 
         assign operands_ready[w] = operands_ready_r;
@@ -214,24 +240,15 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
     end
 
     wire [PER_ISSUE_WARPS-1:0] arb_valid_in;
-    wire [PER_ISSUE_WARPS-1:0] arb_suppress;
     wire [PER_ISSUE_WARPS-1:0][OUT_DATAW-1:0] arb_data_in;
     wire [PER_ISSUE_WARPS-1:0] arb_ready_in;
 
-    reg  [NUM_EX_UNITS-1:0] fu_locked;
-
-    wire [PER_ISSUE_WARPS-1:0] fu_lock_block;
-    for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_fu_lock_block
-        wire [EX_BITS-1:0] w_ex = staging_if[w].data.ex_type;
-        // Block warps when FU is locked. fu_lock=1 means acquire request.
-        assign fu_lock_block[w] = fu_locked[w_ex] && staging_if[w].data.fu_lock;
-    end
+    reg  [NUM_EX_UNITS-1:0] fu_locked, fu_locked_n;
 
     for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_arb_data_in
-        // valid: data-hazard + FU-lock check (drives age tracking in GTO)
-        assign arb_valid_in[w] = staging_if[w].valid && operands_ready[w] && ~fu_lock_block[w];
-        // suppress: FU-full check (skips selection without resetting age)
-        assign arb_suppress[w] = ~dispatch_ready[staging_if[w].data.ex_type];
+        // operands_ready carries data-hazard + FU-congestion + FU-lock (all folded
+        // into operands_ready_r), so the request is just valid & operands_ready.
+        assign arb_valid_in[w] = staging_if[w].valid && operands_ready[w];
 
         assign arb_data_in[w] = {
             staging_if[w].data.uuid,
@@ -253,28 +270,24 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
         assign staging_if[w].ready = arb_ready_in[w] && operands_ready[w];
     end
 
-    // GTO arbiter with suppress: FU-stalled warps keep aging but are
-    // skipped for selection, preserving their priority for when the FU drains.
-    // Only suppress when at least one warp can issue to a free FU; otherwise
-    // let all warps through so the pipeline buffers absorb transient stalls.
 
-    localparam LOG_NUM_REQS = `LOG2UP(PER_ISSUE_WARPS);
+    // Cyclic arbiter; STICKY holds the grant under backpressure. requests is a
+    // pure flop (suppress already folded into operands_ready).
 
-    wire any_unsuppressed = |(arb_valid_in & ~arb_suppress);
-    wire [PER_ISSUE_WARPS-1:0] eff_suppress = any_unsuppressed ? arb_suppress : '0;
+    localparam LOG_NUM_REQS = `LOG2UP(PER_ISSUE_WARPS);
 
     wire                    arb_valid;
     wire [LOG_NUM_REQS-1:0] arb_index;
     wire [PER_ISSUE_WARPS-1:0] arb_onehot;
     wire                    arb_ready;
 
-    VX_gto_arbiter #(
-        .NUM_REQS (PER_ISSUE_WARPS)
+    VX_cyclic_arbiter #(
+        .NUM_REQS (PER_ISSUE_WARPS),
+        .STICKY   (1) // Greedy
     ) out_arb (
         .clk          (clk),
         .reset        (reset),
         .requests     (arb_valid_in),
-        .suppress     (eff_suppress),
         .grant_valid  (arb_valid),
         .grant_index  (arb_index),
         .grant_onehot (arb_onehot),
@@ -308,19 +321,34 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
         assign staging_fu_unlock_vec[w] = staging_if[w].data.fu_unlock;
     end
 
-    wire issue_fu_lock   = staging_fu_lock_vec[arb_index];
-    wire issue_fu_unlock = staging_fu_unlock_vec[arb_index];
     wire [EX_BITS-1:0] issue_ex = staging_ex_vec[arb_index];
 
+    for (genvar e = 0; e < NUM_EX_UNITS; ++e) begin : g_fu_issue
+        assign fu_issue[e] = issue_fire && (issue_ex == EX_BITS'(e));
+    end
+
+    // fu_locked next-state: the granted warp (arb_onehot is one-hot) acquires or
+    // releases its FU. Built from arb_onehot -- not issue_ex -- so its arbiter
+    // dependence matches ibuffer_fire's depth, keeping the fu_locked_n term folded
+    // into operands_ready_n off the critical path (parallel to the look-ahead select).
+    always @(*) begin
+        fu_locked_n = fu_locked;
+        for (integer i = 0; i < PER_ISSUE_WARPS; i = i + 1) begin
+            if (arb_onehot[i] && ready_out_w) begin
+                if (staging_fu_lock_vec[i] && ~staging_fu_unlock_vec[i]) begin
+                    fu_locked_n[staging_ex_vec[i]] = 1'b1;
+                end else if (~staging_fu_lock_vec[i] && staging_fu_unlock_vec[i]) begin
+                    fu_locked_n[staging_ex_vec[i]] = 1'b0;
+                end
+            end
+        end
+    end
+
     always @(posedge clk) begin
         if (reset) begin
             fu_locked <= '0;
-        end else if (issue_fire) begin
-            if (issue_fu_lock && ~issue_fu_unlock) begin
-                fu_locked[issue_ex] <= 1'b1;
-            end else if (~issue_fu_lock && issue_fu_unlock) begin
-                fu_locked[issue_ex] <= 1'b0;
-            end
+        end else begin
+            fu_locked <= fu_locked_n;
         end
     end
 
diff --git a/hw/scripts/xilinx_async_bram_patch.tcl b/hw/scripts/xilinx_async_bram_patch.tcl
index e017ea0579..f145035899 100644
--- a/hw/scripts/xilinx_async_bram_patch.tcl
+++ b/hw/scripts/xilinx_async_bram_patch.tcl
@@ -127,9 +127,22 @@ proc unique_net_name {name} {
 
 # --- Net Finding Utilities ---
 
+# Fast retrieval of the nets one level inside $cell (i.e. PARENT_CELL == $cell).
+# Scoping with current_instance is O(nets-in-cell); the equivalent
+# 'get_nets -hierarchical -filter {PARENT_CELL == $cell}' rescans the entire
+# design's net database on every call (O(total_nets)), which dominated runtime
+# on large multi-core designs. The NAME property is still the full hierarchical
+# name, so all downstream regexp/name matching is unchanged.
+proc cell_nets {cell} {
+  current_instance -quiet $cell
+  set nets [get_nets -quiet *]
+  current_instance -quiet
+  return $nets
+}
+
 proc find_cell_nets {cell name_match {required 1}} {
   set matching_nets {}
-  foreach net [get_nets -hierarchical -filter "PARENT_CELL == $cell"] {
+  foreach net [cell_nets $cell] {
     set name [get_property NAME $net]
     if {[regexp $name_match $name]} {
       lappend matching_nets $net
@@ -154,7 +167,10 @@ proc find_cell_net {cell name_match {required 1}} {
 }
 
 proc get_cell_net {cell name} {
-  set net [get_nets -hierarchical -filter "PARENT_CELL == $cell && NAME == $name"]
+  set net {}
+  foreach n [cell_nets $cell] {
+    if {[get_property NAME $n] == $name} { set net $n; break }
+  }
   if {[llength $net] == 0} {
     log_error "No matching net found for '$cell' matching '$name'."
   }
diff --git a/hw/syn/xilinx/dut/project.tcl b/hw/syn/xilinx/dut/project.tcl
index 162777085c..0752ee1b6f 100644
--- a/hw/syn/xilinx/dut/project.tcl
+++ b/hw/syn/xilinx/dut/project.tcl
@@ -35,6 +35,14 @@ puts "Using xdc_file=$xdc_file"
 puts "Using tool_dir=$tool_dir"
 puts "Using script_dir=$script_dir"
 
+# Default the target clock frequency (MHz) when CLK_FREQ_MHZ is not set in the
+# environment. Done here (real Tcl) because the XDC constraint parser rejects
+# 'if'; child synth/impl runs inherit this env var and project.xdc reads it.
+if {![info exists ::env(CLK_FREQ_MHZ)]} {
+  set ::env(CLK_FREQ_MHZ) 300
+}
+puts "Using CLK_FREQ_MHZ=$::env(CLK_FREQ_MHZ)"
+
 # Set the number of jobs based on MAX_JOBS environment variable
 if {[info exists ::env(MAX_JOBS)]} {
   set num_jobs $::env(MAX_JOBS)
@@ -107,6 +115,17 @@ proc run_setup {} {
   # Add constrains file
   read_xdc $xdc_file
 
+  # Clock constraint: generated here with a literal period because the XDC
+  # constraint parser sandbox rejects both 'if' and $::env(...) access. The
+  # frequency is resolved above (CLK_FREQ_MHZ env, default 300) and baked in.
+  set clk_period [expr {1000.0 / $::env(CLK_FREQ_MHZ)}]
+  set clk_xdc "${project_name}/clock.xdc"
+  set fh [open $clk_xdc w]
+  puts $fh "create_clock -name core_clock -period $clk_period \[get_ports clk\]"
+  close $fh
+  read_xdc $clk_xdc
+  puts "Clock constraint: core_clock period = ${clk_period} ns ($::env(CLK_FREQ_MHZ) MHz)"
+
   # Add the design sources
   add_files -norecurse -verbose $vsources_list
 
diff --git a/hw/syn/xilinx/dut/project.xdc b/hw/syn/xilinx/dut/project.xdc
index f786e78373..d09d544be0 100644
--- a/hw/syn/xilinx/dut/project.xdc
+++ b/hw/syn/xilinx/dut/project.xdc
@@ -1,4 +1,9 @@
-set CLK_FREQ_MHZ 300
-set clk_port_name clk
-set clk_port [get_ports $clk_port_name]
-create_clock -name core_clock -period [expr 1000.0 / $CLK_FREQ_MHZ] $clk_port
\ No newline at end of file
+# Design constraints for the DUT synthesis flow.
+#
+# The core clock (core_clock on port 'clk') is generated by project.tcl with a
+# literal period resolved from the CLK_FREQ_MHZ environment variable (default
+# 300). It is NOT defined here because the XDC constraint parser sandbox rejects
+# both 'if' and $::env(...) access. Override the frequency via:
+#   CLK_FREQ_MHZ=<n> make <target>
+#
+# Add any additional (non-clock) design constraints below.
diff --git a/hw/syn/xilinx/xrt/Makefile b/hw/syn/xilinx/xrt/Makefile
index b3cda2c0f0..95b26acc7d 100644
--- a/hw/syn/xilinx/xrt/Makefile
+++ b/hw/syn/xilinx/xrt/Makefile
@@ -158,10 +158,10 @@ ifneq (,$(filter -DVX_CFG_EXT_OM_ENABLE, $(XCONFIGS)))
 	RTL_INCLUDE += -I$(RTL_DIR)/om
 endif
 
-# Kernel clock target frequency (MHz). The U55C platform default is 300;
-# lowered to 250 for timing-closure margin. Override on the command line
-# (e.g. KERNEL_FREQ=300) to retarget.
-KERNEL_FREQ ?= 250
+# Kernel clock target frequency (MHz). Defaults to the U55C platform native
+# 300 MHz. Override on the command line (e.g. KERNEL_FREQ=250) for extra
+# timing-closure margin.
+KERNEL_FREQ ?= 300
 
 # Kernel compiler global settings
 VPP_FLAGS += --link --target $(TARGET) --platform $(PLATFORM) --save-temps --no_ip_cache

From dcccba457688aeadda0880f3c5e7ca6c8aa888d0 Mon Sep 17 00:00:00 2001
From: tinebp <tinebp@yahoo.com>
Date: Thu, 18 Jun 2026 14:47:47 -0700
Subject: [PATCH 2/2] issue: close scoreboard issue-stage timing at NT16

Select the issue arbiter by requester count via VX_generic_arbiter (matrix above 8, round-robin at or below) so grant logic stays shallow at NW16, and serialize multi-uop FU sequences with a single registered lock mask gating the arbiter requests instead of the per-FU locked state. Remove the unused suppress port from VX_gto_arbiter and its tie-off in VX_generic_arbiter.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 hw/rtl/core/VX_scoreboard.sv      | 64 +++++++++++++------------------
 hw/rtl/libs/VX_generic_arbiter.sv |  1 -
 hw/rtl/libs/VX_gto_arbiter.sv     | 54 +++++++++-----------------
 3 files changed, 45 insertions(+), 74 deletions(-)

diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv
index 7d611ecd07..d9433482a0 100644
--- a/hw/rtl/core/VX_scoreboard.sv
+++ b/hw/rtl/core/VX_scoreboard.sv
@@ -182,13 +182,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
         wire [EX_BITS-1:0] ex_sel = ibuffer_fire ? ibuffer_if[w].data.ex_type : staging_if[w].data.ex_type;
         reg operands_ready_r;
 
-        // Readiness folds data hazards, FU-congestion and FU-lock into one flop.
-        // fu_locked_n is the next-state, look-ahead-aligned with ex_sel.
-        wire fu_lock_sel = ibuffer_fire ? ibuffer_if[w].data.fu_lock : staging_if[w].data.fu_lock;
-        wire data_ready  = ~((|regs_busy) || xregs_busy);
-        wire operands_ready_n = data_ready
-                             && ~fu_goingfull[ex_sel]
-                             && ~(fu_locked_n[ex_sel] && fu_lock_sel);
+        // Readiness folds data hazards and FU-congestion into one flop; FU-lock
+        // is enforced downstream by masking the arbiter requests.
+        wire data_ready = ~((|regs_busy) || xregs_busy);
+        wire operands_ready_n = data_ready && ~fu_goingfull[ex_sel];
 
         always @(posedge clk) begin
             if (reset) begin
@@ -243,12 +240,15 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
     wire [PER_ISSUE_WARPS-1:0][OUT_DATAW-1:0] arb_data_in;
     wire [PER_ISSUE_WARPS-1:0] arb_ready_in;
 
-    reg  [NUM_EX_UNITS-1:0] fu_locked, fu_locked_n;
+    // FU lock: a sequence must not interleave with another warp at the same FU.
+    // fu_locked ('1 = open, one-hot = locked) gates arb_valid_in so only the lock
+    // holder is requested while it holds the lock.
+    reg [PER_ISSUE_WARPS-1:0] fu_locked;
 
     for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_arb_data_in
-        // operands_ready carries data-hazard + FU-congestion + FU-lock (all folded
-        // into operands_ready_r), so the request is just valid & operands_ready.
-        assign arb_valid_in[w] = staging_if[w].valid && operands_ready[w];
+        // operands_ready carries data-hazard + FU-congestion; fu_locked adds the
+        // FU-lock gate so only the lock holder is requested during a sequence.
+        assign arb_valid_in[w] = staging_if[w].valid && operands_ready[w] && fu_locked[w];
 
         assign arb_data_in[w] = {
             staging_if[w].data.uuid,
@@ -270,10 +270,6 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
         assign staging_if[w].ready = arb_ready_in[w] && operands_ready[w];
     end
 
-
-    // Cyclic arbiter; STICKY holds the grant under backpressure. requests is a
-    // pure flop (suppress already folded into operands_ready).
-
     localparam LOG_NUM_REQS = `LOG2UP(PER_ISSUE_WARPS);
 
     wire                    arb_valid;
@@ -281,8 +277,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
     wire [PER_ISSUE_WARPS-1:0] arb_onehot;
     wire                    arb_ready;
 
-    VX_cyclic_arbiter #(
+    // Matrix arbiter scales better past 8 requesters; RR is cheaper below that.
+    VX_generic_arbiter #(
         .NUM_REQS (PER_ISSUE_WARPS),
+        .TYPE     ((PER_ISSUE_WARPS > 8) ? "M" : "R"),
         .STICKY   (1) // Greedy
     ) out_arb (
         .clk          (clk),
@@ -307,8 +305,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
 
     assign arb_ready = ready_out_w;
 
-    // FU lock: prevent warp interleaving during multi-uop sequences.
-    // 10=acquire (first uop), 00=middle, 01=release (last uop), 11=default.
+    // A sequence carries fu_lock on its first uop (acquire) and fu_unlock on its
+    // last (release); 11 = single-uop default.
 
     wire issue_fire = valid_out_w && ready_out_w;
 
@@ -327,28 +325,20 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
         assign fu_issue[e] = issue_fire && (issue_ex == EX_BITS'(e));
     end
 
-    // fu_locked next-state: the granted warp (arb_onehot is one-hot) acquires or
-    // releases its FU. Built from arb_onehot -- not issue_ex -- so its arbiter
-    // dependence matches ibuffer_fire's depth, keeping the fu_locked_n term folded
-    // into operands_ready_n off the critical path (parallel to the look-ahead select).
-    always @(*) begin
-        fu_locked_n = fu_locked;
-        for (integer i = 0; i < PER_ISSUE_WARPS; i = i + 1) begin
-            if (arb_onehot[i] && ready_out_w) begin
-                if (staging_fu_lock_vec[i] && ~staging_fu_unlock_vec[i]) begin
-                    fu_locked_n[staging_ex_vec[i]] = 1'b1;
-                end else if (~staging_fu_lock_vec[i] && staging_fu_unlock_vec[i]) begin
-                    fu_locked_n[staging_ex_vec[i]] = 1'b0;
-                end
-            end
-        end
-    end
+    // Lock to the granted warp on acquire, hold across its sequence, reopen on
+    // release. arb_onehot is the granted warp, registered into fu_locked.
+    wire issue_fu_lock   = staging_fu_lock_vec[arb_index];
+    wire issue_fu_unlock = staging_fu_unlock_vec[arb_index];
 
     always @(posedge clk) begin
         if (reset) begin
-            fu_locked <= '0;
-        end else begin
-            fu_locked <= fu_locked_n;
+            fu_locked <= '1;
+        end else if (issue_fire) begin
+            if (issue_fu_lock && ~issue_fu_unlock) begin
+                fu_locked <= arb_onehot;
+            end else if (issue_fu_unlock) begin
+                fu_locked <= '1;
+            end
         end
     end
 
diff --git a/hw/rtl/libs/VX_generic_arbiter.sv b/hw/rtl/libs/VX_generic_arbiter.sv
index 1501372a6c..6a8ed28ad2 100644
--- a/hw/rtl/libs/VX_generic_arbiter.sv
+++ b/hw/rtl/libs/VX_generic_arbiter.sv
@@ -104,7 +104,6 @@ module VX_generic_arbiter #(
             .clk          (clk),
             .reset        (reset),
             .requests     (requests),
-            .suppress     ({NUM_REQS{1'b0}}),
             .grant_valid  (grant_valid),
             .grant_index  (grant_index),
             .grant_onehot (grant_onehot),
diff --git a/hw/rtl/libs/VX_gto_arbiter.sv b/hw/rtl/libs/VX_gto_arbiter.sv
index b6a1489134..e9064fb6cc 100644
--- a/hw/rtl/libs/VX_gto_arbiter.sv
+++ b/hw/rtl/libs/VX_gto_arbiter.sv
@@ -13,16 +13,10 @@
 
 `include "VX_platform.vh"
 
-// Greedy-Then-Oldest (GTO) arbiter.
-//
-// Greedy: keep granting the same requester as long as it is active.
-// Oldest: when the current requester deasserts, grant the least-recently-
-//         granted requester.
-//
-// Selection priority is held in a dominance matrix: the granted requester is
-// demoted below all others, so "least recently granted" approximates "oldest"
-// and selection has constant logic depth (a one-hot dominance check) rather
-// than a serial age-counter max-reduction. Initial order is by index.
+// Greedy-Then-Oldest (GTO) arbiter: hold the current grantee while active,
+// else pick the least-recently-granted requester. Order is held in a dominance
+// matrix (granted requester demoted below all others), giving constant-depth
+// selection instead of a serial age-counter max-reduction. Initial order by index.
 
 `TRACING_OFF
 module VX_gto_arbiter #(
@@ -32,7 +26,6 @@ module VX_gto_arbiter #(
     input  wire                     clk,
     input  wire                     reset,
     input  wire [NUM_REQS-1:0]      requests,
-    input  wire [NUM_REQS-1:0]      suppress,
     output wire [LOG_NUM_REQS-1:0]  grant_index,
     output wire [NUM_REQS-1:0]      grant_onehot,
     output wire                     grant_valid,
@@ -42,19 +35,14 @@ module VX_gto_arbiter #(
 
         `UNUSED_VAR (clk)
         `UNUSED_VAR (reset)
-        `UNUSED_VAR (suppress)
         `UNUSED_VAR (grant_ready)
 
         assign grant_index  = '0;
         assign grant_onehot = requests;
-        assign grant_valid  = requests[0] && ~suppress[0];
+        assign grant_valid  = requests[0];
 
     end else begin : g_arbiter
 
-        // Effective requests: eligible for selection (not suppressed).
-        // Age tracking still uses `requests` so suppressed warps keep aging.
-        wire [NUM_REQS-1:0] eff_requests = requests & ~suppress;
-
         // -- Greedy: sticky grant ------------------------------------------
 
         reg [NUM_REQS-1:0] prev_grant;
@@ -66,19 +54,15 @@ module VX_gto_arbiter #(
             end
         end
 
-        wire retain_grant = |(prev_grant & eff_requests);
+        wire retain_grant = |(prev_grant & requests);
 
         wire grant_fire = grant_valid && grant_ready;
 
-        // -- Oldest: least-recently-granted via a priority matrix ----------
-        // rank_matrix[i][j] (upper triangle, i<j) = 1 means requester i
-        // outranks j. The reconstructed `outranks` relation is antisymmetric.
-        // On each grant the winner is demoted below every other requester,
-        // which preserves a valid total order, so `oldest_onehot` is always
-        // exactly one-hot. This replaces the serial age-counter max-reduction
-        // with a constant-depth dominance check (one AND-fanin level over the
-        // requesters). The matrix moves only on grant, so a suppressed-but-
-        // pending requester keeps its rank and wins once unsuppressed.
+        // -- Oldest: least-recently-granted via a dominance matrix ---------
+        // rank_matrix[i][j] (i<j) = 1 means i outranks j; `outranks` reconstructs
+        // the full antisymmetric relation. Demoting the winner below all others
+        // keeps a valid total order, so `oldest_onehot` is exactly one-hot. The
+        // matrix moves only on grant, so a pending requester keeps its rank.
 
         reg [NUM_REQS-1:0] rank_matrix [NUM_REQS];
 
@@ -96,9 +80,9 @@ module VX_gto_arbiter #(
             wire [NUM_REQS-1:0] dominates;
             for (genvar j = 0; j < NUM_REQS; ++j) begin : g_dom
                 // i is not blocked by j when it outranks j or j is not requesting.
-                assign dominates[j] = (i == j) ? 1'b1 : (outranks[i][j] | ~eff_requests[j]);
+                assign dominates[j] = (i == j) ? 1'b1 : (outranks[i][j] | ~requests[j]);
             end
-            assign oldest_onehot[i] = eff_requests[i] && (&dominates);
+            assign oldest_onehot[i] = requests[i] && (&dominates);
         end
 
         integer mr, mc;
@@ -117,15 +101,13 @@ module VX_gto_arbiter #(
             end
         end
 
-        // Greedy wins if the current grantee is still active; otherwise the
-        // matrix-selected oldest. prev_grant is already one-hot, so the greedy
-        // path needs no encoder.
-        assign grant_valid  = |eff_requests;
+        // Greedy wins if the current grantee is still active, else the oldest.
+        // prev_grant is already one-hot, so the greedy path needs no encoder.
+        assign grant_valid  = |requests;
         assign grant_onehot = retain_grant ? prev_grant : oldest_onehot;
 
-        // grant_onehot is one-hot (or zero when idle, gated by grant_valid), so
-        // grant_index is a flat per-bit OR-reduce: exact for a one-hot input,
-        // with no priority cascade and no separate encoders.
+        // grant_onehot is one-hot (zero when idle, gated by grant_valid), so
+        // grant_index is a flat per-bit OR-reduce with no priority cascade.
         for (genvar k = 0; k < LOG_NUM_REQS; ++k) begin : g_grant_index
             wire [NUM_REQS-1:0] kbits;
             for (genvar i = 0; i < NUM_REQS; ++i) begin : g_kbit