@@ -12,7 +12,7 @@ namespace SixLabors.ImageSharp.Drawing.Processing.Backends;
1212/// <see cref="DefaultRasterizer"/>, operating per-tile with workgroup shared memory.
1313/// Shader source is generated per texture format to match sampling/output requirements.
1414/// </summary>
15- internal static class PreparedCompositeFineComputeShader
15+ internal static class CompositeComputeShader
1616{
1717 private static readonly object CacheSync = new ( ) ;
1818 private static readonly Dictionary < TextureFormat , byte [ ] > ShaderCache = [ ] ;
@@ -81,12 +81,9 @@ struct DispatchConfig {
8181 @group(0) @binding(2) var brush_texture: texture_2d<__BACKDROP_TEXEL_TYPE__>;
8282 @group(0) @binding(3) var output_texture: texture_storage_2d<__OUTPUT_FORMAT__, write>;
8383 @group(0) @binding(4) var<storage, read> commands: array<Params>;
84- @group(0) @binding(5) var<storage, read> tile_starts: array<u32>;
85- @group(0) @binding(6) var<storage, read_write> tile_counts: array<atomic<u32>>;
86- @group(0) @binding(7) var<storage, read> tile_command_indices: array<u32>;
87- @group(0) @binding(8) var<uniform> dispatch_config: DispatchConfig;
88- @group(0) @binding(9) var<storage, read> csr_offsets: array<u32>;
89- @group(0) @binding(10) var<storage, read> csr_indices: array<u32>;
84+ @group(0) @binding(5) var<uniform> dispatch_config: DispatchConfig;
85+ @group(0) @binding(6) var<storage, read> csr_offsets: array<u32>;
86+ @group(0) @binding(7) var<storage, read> csr_indices: array<u32>;
9087
9188 // Workgroup shared memory for per-tile coverage accumulation.
9289 // Layout: 16 rows x 16 columns. Index = row * 16 + col.
@@ -798,26 +795,27 @@ fn cs_main(
798795
799796 let dest_x_i32 = i32(dest_x);
800797 let dest_y_i32 = i32(dest_y);
798+ let tile_min_x = i32(tile_x * 16u);
799+ let tile_min_y = i32(tile_y * 16u);
800+ let tile_max_x = tile_min_x + 16;
801+ let tile_max_y = tile_min_y + 16;
801802
802- let tile_command_start = tile_starts[tile_index];
803- let tile_command_count = atomicLoad(&tile_counts[tile_index]);
804-
805- for (var tile_cmd_offset = 0u; tile_cmd_offset < tile_command_count; tile_cmd_offset++) {
806- let command_index = tile_command_indices[tile_command_start + tile_cmd_offset];
803+ for (var command_index = 0u; command_index < dispatch_config.command_count; command_index++) {
807804 let command = commands[command_index];
808805
809- // Clear shared coverage memory.
810- atomicStore(&tile_cover[thread_id], 0);
811- atomicStore(&tile_area[thread_id], 0);
812- if px == 0u {
813- atomicStore(&tile_start_cover[py], 0);
806+ // Tile vs command bounding box check (uniform across workgroup).
807+ let cmd_min_x = bitcast<i32>(command.destination_x);
808+ let cmd_min_y = bitcast<i32>(command.destination_y);
809+ let cmd_max_x = cmd_min_x + i32(command.destination_width);
810+ let cmd_max_y = cmd_min_y + i32(command.destination_height);
811+ if tile_max_x <= cmd_min_x || tile_min_x >= cmd_max_x || tile_max_y <= cmd_min_y || tile_min_y >= cmd_max_y {
812+ continue;
814813 }
815- workgroupBarrier();
816814
817815 // Determine this tile's position in coverage-local space.
818- let band_top = i32(tile_y * 16u) - command.edge_origin_y;
816+ let band_top = tile_min_y - command.edge_origin_y;
819817 let band_bottom = band_top + 16;
820- let band_left_fixed = (i32(tile_x * 16u) - command.edge_origin_x) << FIXED_SHIFT;
818+ let band_left_fixed = (tile_min_x - command.edge_origin_x) << FIXED_SHIFT;
821819
822820 // CSR band lookup: which 16-row bands overlap this tile?
823821 var first_band = band_top / 16;
@@ -831,15 +829,38 @@ fn cs_main(
831829 }
832830 last_band = min(last_band, i32(command.csr_band_count) - 1);
833831
832+ // Early exit: skip if no CSR bands have edges for this tile (uniform).
833+ if first_band > last_band {
834+ continue;
835+ }
836+ var tile_has_edges = false;
837+ for (var b = first_band; b <= last_band; b++) {
838+ let s = csr_offsets[command.csr_offsets_start + u32(b)];
839+ let e = csr_offsets[command.csr_offsets_start + u32(b) + 1u];
840+ if e > s {
841+ tile_has_edges = true;
842+ break;
843+ }
844+ }
845+ if !tile_has_edges {
846+ continue;
847+ }
848+
849+ // Clear shared coverage memory.
850+ atomicStore(&tile_cover[thread_id], 0);
851+ atomicStore(&tile_area[thread_id], 0);
852+ if px == 0u {
853+ atomicStore(&tile_start_cover[py], 0);
854+ }
855+ workgroupBarrier();
856+
834857 // Cooperatively rasterize edges from the relevant CSR bands.
835858 let tile_top_fixed = band_top << FIXED_SHIFT;
836859 let tile_bottom_fixed = tile_top_fixed + (i32(16) << FIXED_SHIFT);
837860 for (var band = first_band; band <= last_band; band++) {
838861 let csr_start = csr_offsets[command.csr_offsets_start + u32(band)];
839862 let csr_end = csr_offsets[command.csr_offsets_start + u32(band) + 1u];
840863 let band_edge_count = csr_end - csr_start;
841- // Clip to intersection of tile window and CSR band window
842- // to avoid double-counting edges that span multiple CSR bands.
843864 let csr_band_top_fixed = (band * 16) << FIXED_SHIFT;
844865 let csr_band_bottom_fixed = csr_band_top_fixed + (i32(16) << FIXED_SHIFT);
845866 let clip_top = max(tile_top_fixed, csr_band_top_fixed);
@@ -859,12 +880,7 @@ fn cs_main(
859880
860881 // Compute coverage and compose for this pixel.
861882 if in_bounds {
862- let cmd_min_x = bitcast<i32>(command.destination_x);
863- let cmd_min_y = bitcast<i32>(command.destination_y);
864- let cmd_max_x = cmd_min_x + i32(command.destination_width);
865- let cmd_max_y = cmd_min_y + i32(command.destination_height);
866883 if dest_x_i32 >= cmd_min_x && dest_x_i32 < cmd_max_x && dest_y_i32 >= cmd_min_y && dest_y_i32 < cmd_max_y {
867- // Prefix sum of cover deltas for this row.
868884 var cover = atomicLoad(&tile_start_cover[py]);
869885 for (var col = 0u; col < px; col++) {
870886 cover += atomicLoad(&tile_cover[py * 16u + col]);
0 commit comments