@@ -13,33 +13,38 @@ internal static partial class SimdUtils
1313{
1414 public static class HwIntrinsics
1515 {
16- public static ReadOnlySpan < byte > PermuteMaskDeinterleave8x32 => new byte [ ] { 0 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 7 , 0 , 0 , 0 } ;
16+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ] // the 32-argument Create call below is too much IL for the JIT to inline on its own, so give it a hint
17+ public static Vector256 < int > PermuteMaskDeinterleave8x32 ( ) => Vector256 . Create ( 0 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsInt32 ( ) ; // each group of 4 literals is one 32-bit element (little-endian): 0, 4, 1, 5, 2, 6, 3, 7
1718
18- public static ReadOnlySpan < byte > PermuteMaskEvenOdd8x32 => new byte [ ] { 0 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0 } ;
19+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ] // explicit inlining hint for the 32-argument Create call below
20+ public static Vector256 < uint > PermuteMaskEvenOdd8x32 ( ) => Vector256 . Create ( 0 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsUInt32 ( ) ; // each group of 4 literals is one 32-bit element (little-endian): 0, 2, 4, 6, 1, 3, 5, 7
1921
20- public static ReadOnlySpan < byte > PermuteMaskSwitchInnerDWords8x32 => new byte [ ] { 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 7 , 0 , 0 , 0 } ;
22+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ] // explicit inlining hint for the 32-argument Create call below
23+ public static Vector256 < uint > PermuteMaskSwitchInnerDWords8x32 ( ) => Vector256 . Create ( 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsUInt32 ( ) ; // each group of 4 literals is one 32-bit element (little-endian): 0, 1, 4, 5, 2, 3, 6, 7
2124
22- private static ReadOnlySpan < byte > MoveFirst24BytesToSeparateLanes => new byte [ ] { 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0 } ;
25+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ] // explicit inlining hint for the 32-argument Create call below
26+ private static Vector256 < uint > MoveFirst24BytesToSeparateLanes ( ) => Vector256 . Create ( 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsUInt32 ( ) ; // each group of 4 literals is one 32-bit element (little-endian): 0, 1, 2, 6, 3, 4, 5, 7
2327
24- internal static ReadOnlySpan < byte > ExtractRgb => new byte [ ] { 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF , 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF } ;
28+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ] // explicit inlining hint for the 32-argument Create call below
29+ internal static Vector256 < byte > ExtractRgb ( ) => Vector256 . Create ( 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF , 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF ) ; // the same 16 entries twice: 0,3,6,9 / 1,4,7,10 / 2,5,8,11 followed by four 0xFF entries
2530
26- private static ReadOnlySpan < byte > ShuffleMaskPad4Nx16 => new byte [ ] { 0 , 1 , 2 , 0x80 , 3 , 4 , 5 , 0x80 , 6 , 7 , 8 , 0x80 , 9 , 10 , 11 , 0x80 } ;
31+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ] // explicit inlining hint for the 16-argument Create call below
32+ private static Vector128 < byte > ShuffleMaskPad4Nx16 ( ) => Vector128 . Create ( 0 , 1 , 2 , 0x80 , 3 , 4 , 5 , 0x80 , 6 , 7 , 8 , 0x80 , 9 , 10 , 11 , 0x80 ) ; // indices 0..11 in groups of three, each group followed by a 0x80 entry
2733
28- private static ReadOnlySpan < byte > ShuffleMaskSlice4Nx16 => new byte [ ] { 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 0x80 , 0x80 , 0x80 , 0x80 } ;
34+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ] // explicit inlining hint for the 16-argument Create call below
35+ private static Vector128 < byte > ShuffleMaskSlice4Nx16 ( ) => Vector128 . Create ( 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 0x80 , 0x80 , 0x80 , 0x80 ) ; // indices 0..14 skipping 3, 7 and 11, then four 0x80 entries
2936
30- private static ReadOnlySpan < byte > ShuffleMaskShiftAlpha =>
31- new byte [ ]
32- {
33- 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 3 , 7 , 11 , 15 ,
34- 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 3 , 7 , 11 , 15
35- } ;
37+ #pragma warning disable SA1003 , SA1116 , SA1117 // StyleCop layout rules (e.g. parameters should be on same line or separate lines) are suppressed so the literals below can stay in two rows of 16
38+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ] // explicit inlining hint for the 32-argument Create call below
39+ private static Vector256 < byte > ShuffleMaskShiftAlpha ( ) => Vector256 . Create ( ( byte )
40+ 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 3 , 7 , 11 , 15 ,
41+ 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 3 , 7 , 11 , 15 ) ; // the two rows are identical; within each, indices 3, 7, 11 and 15 are listed last
3642
37- public static ReadOnlySpan < byte > PermuteMaskShiftAlpha8x32 =>
38- new byte [ ]
39- {
40- 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 4 , 0 , 0 , 0 ,
41- 5 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 7 , 0 , 0 , 0
42- } ;
43+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ] // explicit inlining hint for the 32-argument Create call below
44+ public static Vector256 < uint > PermuteMaskShiftAlpha8x32 ( ) => Vector256 . Create (
45+ 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 4 , 0 , 0 , 0 ,
46+ 5 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsUInt32 ( ) ; // each group of 4 literals is one 32-bit element (little-endian): 0, 1, 2, 4, 5, 6, 3, 7
47+ #pragma warning restore SA1003 , SA1116 , SA1117 // end of the two-row literal layout; the StyleCop rules disabled above apply again
4348
4449 /// <summary>
4550 /// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
@@ -189,7 +194,7 @@ public static void Shuffle4Slice3Reduce(
189194 {
190195 if ( Ssse3 . IsSupported )
191196 {
192- int remainder = source . Length % ( Vector128 < byte > . Count * 4 ) ;
197+ int remainder = source . Length & ( Vector128 < byte > . Count * 4 - 1 ) ; // bit-hack for modulo
193198
194199 int sourceCount = source . Length - remainder ;
195200 int destCount = ( int ) ( ( uint ) sourceCount * 3 / 4 ) ;
@@ -221,7 +226,7 @@ private static void Shuffle4(
221226 ref Vector256 < float > destBase =
222227 ref Unsafe . As < float , Vector256 < float > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
223228
224- nint n = ( nint ) ( uint ) ( dest . Length / Vector256 < float > . Count ) ;
229+ nint n = ( nint ) ( ( uint ) dest . Length / ( uint ) Vector256 < float > . Count ) ;
225230 nint m = Numerics . Modulo4 ( n ) ;
226231 nint u = n - m ;
227232
@@ -253,7 +258,7 @@ private static void Shuffle4(
253258 ref Vector128 < float > destBase =
254259 ref Unsafe . As < float , Vector128 < float > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
255260
256- nint n = ( nint ) ( uint ) dest . Length / Vector128 < float > . Count ;
261+ nint n = ( nint ) ( ( uint ) dest . Length / ( uint ) Vector128 < float > . Count ) ;
257262 nint m = Numerics . Modulo4 ( n ) ;
258263 nint u = n - m ;
259264
@@ -306,7 +311,7 @@ private static void Shuffle4(
306311 ref Vector256 < byte > destBase =
307312 ref Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
308313
309- nint n = ( nint ) ( uint ) dest . Length / Vector256 < byte > . Count ;
314+ nint n = ( nint ) ( ( uint ) dest . Length / ( uint ) Vector256 < byte > . Count ) ;
310315 nint m = Numerics . Modulo4 ( n ) ;
311316 nint u = n - m ;
312317
@@ -342,7 +347,7 @@ private static void Shuffle4(
342347 ref Vector128 < byte > destBase =
343348 ref Unsafe . As < byte , Vector128 < byte > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
344349
345- nint n = ( nint ) ( uint ) dest . Length / Vector128 < byte > . Count ;
350+ nint n = ( nint ) ( ( uint ) dest . Length / ( uint ) Vector128 < byte > . Count ) ;
346351 nint m = Numerics . Modulo4 ( n ) ;
347352 nint u = n - m ;
348353
@@ -375,10 +380,8 @@ private static void Shuffle3(
375380 {
376381 if ( Ssse3 . IsSupported )
377382 {
378- ref byte vmaskBase = ref MemoryMarshal . GetReference ( ShuffleMaskPad4Nx16 ) ;
379- Vector128 < byte > vmask = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskBase ) ;
380- ref byte vmaskoBase = ref MemoryMarshal . GetReference ( ShuffleMaskSlice4Nx16 ) ;
381- Vector128 < byte > vmasko = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskoBase ) ;
383+ Vector128 < byte > vmask = ShuffleMaskPad4Nx16 ( ) ;
384+ Vector128 < byte > vmasko = ShuffleMaskSlice4Nx16 ( ) ;
382385 Vector128 < byte > vmaske = Ssse3 . AlignRight ( vmasko , vmasko , 12 ) ;
383386
384387 Span < byte > bytes = stackalloc byte [ Vector128 < byte > . Count ] ;
@@ -440,8 +443,7 @@ private static void Pad3Shuffle4(
440443 {
441444 if ( Ssse3 . IsSupported )
442445 {
443- ref byte vmaskBase = ref MemoryMarshal . GetReference ( ShuffleMaskPad4Nx16 ) ;
444- Vector128 < byte > vmask = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskBase ) ;
446+ Vector128 < byte > vmask = ShuffleMaskPad4Nx16 ( ) ;
445447 Vector128 < byte > vfill = Vector128 . Create ( 0xff000000ff000000ul ) . AsByte ( ) ;
446448
447449 Span < byte > bytes = stackalloc byte [ Vector128 < byte > . Count ] ;
@@ -484,8 +486,7 @@ private static void Shuffle4Slice3(
484486 {
485487 if ( Ssse3 . IsSupported )
486488 {
487- ref byte vmaskoBase = ref MemoryMarshal . GetReference ( ShuffleMaskSlice4Nx16 ) ;
488- Vector128 < byte > vmasko = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskoBase ) ;
489+ Vector128 < byte > vmasko = ShuffleMaskSlice4Nx16 ( ) ;
489490 Vector128 < byte > vmaske = Ssse3 . AlignRight ( vmasko , vmasko , 12 ) ;
490491
491492 Span < byte > bytes = stackalloc byte [ Vector128 < byte > . Count ] ;
@@ -542,9 +543,9 @@ private static void Shuffle4Slice3(
542543 /// <returns>The <see cref="Vector256{T}"/>.</returns>
543544 [ MethodImpl ( InliningOptions . AlwaysInline ) ]
544545 public static Vector256 < float > MultiplyAdd (
545- in Vector256 < float > va ,
546- in Vector256 < float > vm0 ,
547- in Vector256 < float > vm1 )
546+ Vector256 < float > va ,
547+ Vector256 < float > vm0 ,
548+ Vector256 < float > vm1 )
548549 {
549550 if ( Fma . IsSupported )
550551 {
@@ -565,9 +566,9 @@ public static Vector256<float> MultiplyAdd(
565566 /// <returns>The <see cref="Vector256{T}"/>.</returns>
566567 [ MethodImpl ( InliningOptions . ShortMethod ) ]
567568 public static Vector256 < float > MultiplySubtract (
568- in Vector256 < float > vs ,
569- in Vector256 < float > vm0 ,
570- in Vector256 < float > vm1 )
569+ Vector256 < float > vs ,
570+ Vector256 < float > vm0 ,
571+ Vector256 < float > vm1 )
571572 {
572573 if ( Fma . IsSupported )
573574 {
@@ -587,9 +588,9 @@ public static Vector256<float> MultiplySubtract(
587588 /// <returns>The <see cref="Vector256{T}"/>.</returns>
588589 [ MethodImpl ( InliningOptions . ShortMethod ) ]
589590 public static Vector256 < float > MultiplyAddNegated (
590- in Vector256 < float > a ,
591- in Vector256 < float > b ,
592- in Vector256 < float > c )
591+ Vector256 < float > a ,
592+ Vector256 < float > b ,
593+ Vector256 < float > c )
593594 {
594595 if ( Fma . IsSupported )
595596 {
@@ -655,7 +656,7 @@ internal static unsafe void ByteToNormalizedFloat(
655656 ref Vector256 < float > destBase =
656657 ref Unsafe . As < float , Vector256 < float > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
657658
658- var scale = Vector256 . Create ( 1 / ( float ) byte . MaxValue ) ;
659+ Vector256 < float > scale = Vector256 . Create ( 1 / ( float ) byte . MaxValue ) ;
659660
660661 for ( nuint i = 0 ; i < n ; i ++ )
661662 {
@@ -688,7 +689,7 @@ internal static unsafe void ByteToNormalizedFloat(
688689 ref Vector128 < float > destBase =
689690 ref Unsafe . As < float , Vector128 < float > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
690691
691- var scale = Vector128 . Create ( 1 / ( float ) byte . MaxValue ) ;
692+ Vector128 < float > scale = Vector128 . Create ( 1 / ( float ) byte . MaxValue ) ;
692693 Vector128 < byte > zero = Vector128 < byte > . Zero ;
693694
694695 for ( nuint i = 0 ; i < n ; i ++ )
@@ -790,9 +791,8 @@ internal static void NormalizedFloatToByteSaturate(
790791 ref Vector256 < byte > destBase =
791792 ref Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
792793
793- var scale = Vector256 . Create ( ( float ) byte . MaxValue ) ;
794- ref byte maskBase = ref MemoryMarshal . GetReference ( PermuteMaskDeinterleave8x32 ) ;
795- Vector256 < int > mask = Unsafe . As < byte , Vector256 < int > > ( ref maskBase ) ;
794+ Vector256 < float > scale = Vector256 . Create ( ( float ) byte . MaxValue ) ;
795+ Vector256 < int > mask = PermuteMaskDeinterleave8x32 ( ) ;
796796
797797 for ( nuint i = 0 ; i < n ; i ++ )
798798 {
@@ -829,7 +829,7 @@ internal static void NormalizedFloatToByteSaturate(
829829 ref Vector128 < byte > destBase =
830830 ref Unsafe . As < byte , Vector128 < byte > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
831831
832- var scale = Vector128 . Create ( ( float ) byte . MaxValue ) ;
832+ Vector128 < float > scale = Vector128 . Create ( ( float ) byte . MaxValue ) ;
833833
834834 for ( nuint i = 0 ; i < n ; i ++ )
835835 {
@@ -866,14 +866,12 @@ internal static void PackFromRgbPlanesAvx2Reduce(
866866
867867 nuint count = ( uint ) redChannel . Length / ( uint ) Vector256 < byte > . Count ;
868868
869- ref byte control1Bytes = ref MemoryMarshal . GetReference ( PermuteMaskEvenOdd8x32 ) ;
870- Vector256 < uint > control1 = Unsafe . As < byte , Vector256 < uint > > ( ref control1Bytes ) ;
869+ Vector256 < uint > control1 = PermuteMaskEvenOdd8x32 ( ) ;
871870
872- ref byte control2Bytes = ref MemoryMarshal . GetReference ( PermuteMaskShiftAlpha8x32 ) ;
873- Vector256 < uint > control2 = Unsafe . As < byte , Vector256 < uint > > ( ref control2Bytes ) ;
874- var a = Vector256 . Create ( ( byte ) 255 ) ;
871+ Vector256 < uint > control2 = PermuteMaskShiftAlpha8x32 ( ) ;
872+ Vector256 < byte > a = Vector256 . Create ( ( byte ) 255 ) ;
875873
876- Vector256 < byte > shuffleAlpha = Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( ShuffleMaskShiftAlpha ) ) ;
874+ Vector256 < byte > shuffleAlpha = ShuffleMaskShiftAlpha ( ) ;
877875
878876 for ( nuint i = 0 ; i < count ; i ++ )
879877 {
@@ -937,9 +935,8 @@ internal static void PackFromRgbPlanesAvx2Reduce(
937935 ref Vector256 < byte > dBase = ref Unsafe . As < Rgba32 , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( destination ) ) ;
938936
939937 nuint count = ( uint ) redChannel . Length / ( uint ) Vector256 < byte > . Count ;
940- ref byte control1Bytes = ref MemoryMarshal . GetReference ( PermuteMaskEvenOdd8x32 ) ;
941- Vector256 < uint > control1 = Unsafe . As < byte , Vector256 < uint > > ( ref control1Bytes ) ;
942- var a = Vector256 . Create ( ( byte ) 255 ) ;
938+ Vector256 < uint > control1 = PermuteMaskEvenOdd8x32 ( ) ;
939+ Vector256 < byte > a = Vector256 . Create ( ( byte ) 255 ) ;
943940
944941 for ( nuint i = 0 ; i < count ; i ++ )
945942 {
@@ -988,8 +985,8 @@ internal static void UnpackToRgbPlanesAvx2Reduce(
988985 ref Vector256 < float > destGRef = ref Unsafe . As < float , Vector256 < float > > ( ref MemoryMarshal . GetReference ( greenChannel ) ) ;
989986 ref Vector256 < float > destBRef = ref Unsafe . As < float , Vector256 < float > > ( ref MemoryMarshal . GetReference ( blueChannel ) ) ;
990987
991- Vector256 < uint > extractToLanesMask = Unsafe . As < byte , Vector256 < uint > > ( ref MemoryMarshal . GetReference ( MoveFirst24BytesToSeparateLanes ) ) ;
992- Vector256 < byte > extractRgbMask = Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( ExtractRgb ) ) ;
988+ Vector256 < uint > extractToLanesMask = MoveFirst24BytesToSeparateLanes ( ) ;
989+ Vector256 < byte > extractRgbMask = ExtractRgb ( ) ;
993990 Vector256 < byte > rgb , rg , bx ;
994991 Vector256 < float > r , g , b ;
995992
0 commit comments