Skip to content

Commit ccf8a4a

Browse files
Switch to CSR-based composite shaders
1 parent 0d6f935 commit ccf8a4a

9 files changed

Lines changed: 272 additions & 1683 deletions

src/ImageSharp.Drawing.WebGPU/Shaders/PreparedCompositeFineComputeShader.cs renamed to src/ImageSharp.Drawing.WebGPU/Shaders/CompositeComputeShader.cs

Lines changed: 43 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ namespace SixLabors.ImageSharp.Drawing.Processing.Backends;
1212
/// <see cref="DefaultRasterizer"/>, operating per-tile with workgroup shared memory.
1313
/// Shader source is generated per texture format to match sampling/output requirements.
1414
/// </summary>
15-
internal static class PreparedCompositeFineComputeShader
15+
internal static class CompositeComputeShader
1616
{
1717
private static readonly object CacheSync = new();
1818
private static readonly Dictionary<TextureFormat, byte[]> ShaderCache = [];
@@ -81,12 +81,9 @@ struct DispatchConfig {
8181
@group(0) @binding(2) var brush_texture: texture_2d<__BACKDROP_TEXEL_TYPE__>;
8282
@group(0) @binding(3) var output_texture: texture_storage_2d<__OUTPUT_FORMAT__, write>;
8383
@group(0) @binding(4) var<storage, read> commands: array<Params>;
84-
@group(0) @binding(5) var<storage, read> tile_starts: array<u32>;
85-
@group(0) @binding(6) var<storage, read_write> tile_counts: array<atomic<u32>>;
86-
@group(0) @binding(7) var<storage, read> tile_command_indices: array<u32>;
87-
@group(0) @binding(8) var<uniform> dispatch_config: DispatchConfig;
88-
@group(0) @binding(9) var<storage, read> csr_offsets: array<u32>;
89-
@group(0) @binding(10) var<storage, read> csr_indices: array<u32>;
84+
@group(0) @binding(5) var<uniform> dispatch_config: DispatchConfig;
85+
@group(0) @binding(6) var<storage, read> csr_offsets: array<u32>;
86+
@group(0) @binding(7) var<storage, read> csr_indices: array<u32>;
9087
9188
// Workgroup shared memory for per-tile coverage accumulation.
9289
// Layout: 16 rows x 16 columns. Index = row * 16 + col.
@@ -798,26 +795,27 @@ fn cs_main(
798795
799796
let dest_x_i32 = i32(dest_x);
800797
let dest_y_i32 = i32(dest_y);
798+
let tile_min_x = i32(tile_x * 16u);
799+
let tile_min_y = i32(tile_y * 16u);
800+
let tile_max_x = tile_min_x + 16;
801+
let tile_max_y = tile_min_y + 16;
801802
802-
let tile_command_start = tile_starts[tile_index];
803-
let tile_command_count = atomicLoad(&tile_counts[tile_index]);
804-
805-
for (var tile_cmd_offset = 0u; tile_cmd_offset < tile_command_count; tile_cmd_offset++) {
806-
let command_index = tile_command_indices[tile_command_start + tile_cmd_offset];
803+
for (var command_index = 0u; command_index < dispatch_config.command_count; command_index++) {
807804
let command = commands[command_index];
808805
809-
// Clear shared coverage memory.
810-
atomicStore(&tile_cover[thread_id], 0);
811-
atomicStore(&tile_area[thread_id], 0);
812-
if px == 0u {
813-
atomicStore(&tile_start_cover[py], 0);
806+
// Tile vs command bounding box check (uniform across workgroup).
807+
let cmd_min_x = bitcast<i32>(command.destination_x);
808+
let cmd_min_y = bitcast<i32>(command.destination_y);
809+
let cmd_max_x = cmd_min_x + i32(command.destination_width);
810+
let cmd_max_y = cmd_min_y + i32(command.destination_height);
811+
if tile_max_x <= cmd_min_x || tile_min_x >= cmd_max_x || tile_max_y <= cmd_min_y || tile_min_y >= cmd_max_y {
812+
continue;
814813
}
815-
workgroupBarrier();
816814
817815
// Determine this tile's position in coverage-local space.
818-
let band_top = i32(tile_y * 16u) - command.edge_origin_y;
816+
let band_top = tile_min_y - command.edge_origin_y;
819817
let band_bottom = band_top + 16;
820-
let band_left_fixed = (i32(tile_x * 16u) - command.edge_origin_x) << FIXED_SHIFT;
818+
let band_left_fixed = (tile_min_x - command.edge_origin_x) << FIXED_SHIFT;
821819
822820
// CSR band lookup: which 16-row bands overlap this tile?
823821
var first_band = band_top / 16;
@@ -831,15 +829,38 @@ fn cs_main(
831829
}
832830
last_band = min(last_band, i32(command.csr_band_count) - 1);
833831
832+
// Early exit: skip if no CSR bands have edges for this tile (uniform).
833+
if first_band > last_band {
834+
continue;
835+
}
836+
var tile_has_edges = false;
837+
for (var b = first_band; b <= last_band; b++) {
838+
let s = csr_offsets[command.csr_offsets_start + u32(b)];
839+
let e = csr_offsets[command.csr_offsets_start + u32(b) + 1u];
840+
if e > s {
841+
tile_has_edges = true;
842+
break;
843+
}
844+
}
845+
if !tile_has_edges {
846+
continue;
847+
}
848+
849+
// Clear shared coverage memory.
850+
atomicStore(&tile_cover[thread_id], 0);
851+
atomicStore(&tile_area[thread_id], 0);
852+
if px == 0u {
853+
atomicStore(&tile_start_cover[py], 0);
854+
}
855+
workgroupBarrier();
856+
834857
// Cooperatively rasterize edges from the relevant CSR bands.
835858
let tile_top_fixed = band_top << FIXED_SHIFT;
836859
let tile_bottom_fixed = tile_top_fixed + (i32(16) << FIXED_SHIFT);
837860
for (var band = first_band; band <= last_band; band++) {
838861
let csr_start = csr_offsets[command.csr_offsets_start + u32(band)];
839862
let csr_end = csr_offsets[command.csr_offsets_start + u32(band) + 1u];
840863
let band_edge_count = csr_end - csr_start;
841-
// Clip to intersection of tile window and CSR band window
842-
// to avoid double-counting edges that span multiple CSR bands.
843864
let csr_band_top_fixed = (band * 16) << FIXED_SHIFT;
844865
let csr_band_bottom_fixed = csr_band_top_fixed + (i32(16) << FIXED_SHIFT);
845866
let clip_top = max(tile_top_fixed, csr_band_top_fixed);
@@ -859,12 +880,7 @@ fn cs_main(
859880
860881
// Compute coverage and compose for this pixel.
861882
if in_bounds {
862-
let cmd_min_x = bitcast<i32>(command.destination_x);
863-
let cmd_min_y = bitcast<i32>(command.destination_y);
864-
let cmd_max_x = cmd_min_x + i32(command.destination_width);
865-
let cmd_max_y = cmd_min_y + i32(command.destination_height);
866883
if dest_x_i32 >= cmd_min_x && dest_x_i32 < cmd_max_x && dest_y_i32 >= cmd_min_y && dest_y_i32 < cmd_max_y {
867-
// Prefix sum of cover deltas for this row.
868884
var cover = atomicLoad(&tile_start_cover[py]);
869885
for (var col = 0u; col < px; col++) {
870886
cover += atomicLoad(&tile_cover[py * 16u + col]);

src/ImageSharp.Drawing.WebGPU/Shaders/PreparedCompositeTilePrefixBlockScanComputeShader.cs renamed to src/ImageSharp.Drawing.WebGPU/Shaders/CsrPrefixBlockScanComputeShader.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
namespace SixLabors.ImageSharp.Drawing.Processing.Backends;
55

66
/// <summary>
7-
/// Phase 2 of the parallel tile prefix sum: a single workgroup performs an
7+
/// Phase 2 of the parallel CSR prefix sum: a single workgroup performs an
88
/// in-place exclusive prefix sum over the block_sums array from phase 1.
9-
/// Supports up to 65536 blocks (256 * 256 = 16M tiles).
9+
/// Supports up to 65536 blocks (65536 blocks * 256 bands per block = 16M bands).
1010
/// </summary>
11-
internal static class PreparedCompositeTilePrefixBlockScanComputeShader
11+
internal static class CsrPrefixBlockScanComputeShader
1212
{
1313
private static readonly byte[] CodeBytes =
1414
[

src/ImageSharp.Drawing.WebGPU/Shaders/PreparedCompositeTilePrefixLocalComputeShader.cs renamed to src/ImageSharp.Drawing.WebGPU/Shaders/CsrPrefixLocalComputeShader.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
namespace SixLabors.ImageSharp.Drawing.Processing.Backends;
55

66
/// <summary>
7-
/// Phase 1 of the parallel tile prefix sum: each workgroup computes a local
8-
/// exclusive prefix sum over 256 tile counts, writes per-tile starts, and
7+
/// Phase 1 of the parallel CSR prefix sum: each workgroup computes a local
8+
/// exclusive prefix sum over 256 band counts, writes per-band offsets, and
99
/// stores the workgroup total into a block_sums buffer.
1010
/// </summary>
11-
internal static class PreparedCompositeTilePrefixLocalComputeShader
11+
internal static class CsrPrefixLocalComputeShader
1212
{
1313
/// <summary>
1414
/// The number of bands processed by each workgroup.

src/ImageSharp.Drawing.WebGPU/Shaders/PreparedCompositeTilePrefixPropagateComputeShader.cs renamed to src/ImageSharp.Drawing.WebGPU/Shaders/CsrPrefixPropagateComputeShader.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
namespace SixLabors.ImageSharp.Drawing.Processing.Backends;
55

66
/// <summary>
7-
/// Phase 3 of the parallel tile prefix sum: each workgroup adds its
8-
/// block prefix from block_sums to all tile_starts in its range.
7+
/// Phase 3 of the parallel CSR prefix sum: each workgroup adds its
8+
/// block prefix from block_sums to all CSR offsets in its range.
99
/// Workgroup 0 is skipped (its prefix is 0).
1010
/// </summary>
11-
internal static class PreparedCompositeTilePrefixPropagateComputeShader
11+
internal static class CsrPrefixPropagateComputeShader
1212
{
1313
private static readonly byte[] CodeBytes =
1414
[

src/ImageSharp.Drawing.WebGPU/Shaders/PreparedCompositeBinningComputeShader.cs

Lines changed: 0 additions & 176 deletions
This file was deleted.

0 commit comments

Comments
 (0)