11use std:: arch:: x86_64:: {
2- __m256i,
3- _mm256_and_si256,
4- _mm256_cmpeq_epi8,
5- _mm256_extract_epi64,
6- _mm256_loadu_si256,
7- _mm256_sad_epu8,
8- _mm256_set1_epi8,
9- _mm256_setzero_si256,
10- _mm256_sub_epi8,
11- _mm256_xor_si256,
2+ __m256i, _mm256_and_si256, _mm256_cmpeq_epi8, _mm256_extract_epi64, _mm256_loadu_si256,
3+ _mm256_sad_epu8, _mm256_set1_epi8, _mm256_setzero_si256, _mm256_sub_epi8, _mm256_xor_si256,
124} ;
135
146#[ target_feature( enable = "avx2" ) ]
@@ -22,10 +14,9 @@ pub unsafe fn mm256_cmpneq_epi8(a: __m256i, b: __m256i) -> __m256i {
2214}
2315
/// Sliding tail mask: 32 bytes of `0x00` followed by 32 bytes of `0xFF`.
///
/// The counting routines load a 32-byte window from this table starting at
/// `len % 32` and AND it with the comparison result for the final 32 bytes of
/// the input. Assuming `mm256_from_offset` is a plain 32-byte load at the given
/// offset (its body is not shown here -- confirm), a window starting at `r`
/// holds `32 - r` zero bytes followed by `r` bytes of `0xFF`, so only the last
/// `r` lanes survive the AND.
const MASK: [u8; 64] = [
    // Bytes 0..32: cleared lanes.
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    // Bytes 32..64: lanes that are kept.
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
];
3021
3122#[ target_feature( enable = "avx2" ) ]
@@ -36,10 +27,10 @@ unsafe fn mm256_from_offset(slice: &[u8], offset: usize) -> __m256i {
/// Horizontally adds the 32 unsigned bytes of `u8s` into a single `usize`.
///
/// `_mm256_sad_epu8` against an all-zero vector leaves, in each of the four
/// 64-bit lanes, the sum of the eight unsigned bytes belonging to that lane.
/// The four lane totals are then extracted and added together.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[target_feature(enable = "avx2")]
unsafe fn sum(u8s: &__m256i) -> usize {
    let zero = _mm256_setzero_si256();
    // |byte - 0| summed per 64-bit lane, i.e. the per-lane byte total.
    let lane_totals = _mm256_sad_epu8(*u8s, zero);

    let lane0 = _mm256_extract_epi64(lane_totals, 0);
    let lane1 = _mm256_extract_epi64(lane_totals, 1);
    let lane2 = _mm256_extract_epi64(lane_totals, 2);
    let lane3 = _mm256_extract_epi64(lane_totals, 3);

    (lane0 + lane1 + lane2 + lane3) as usize
}
4435
4536#[ target_feature( enable = "avx2" ) ]
@@ -57,7 +48,7 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
5748 for _ in 0 ..255 {
5849 counts = _mm256_sub_epi8 (
5950 counts,
60- _mm256_cmpeq_epi8 ( mm256_from_offset ( haystack, offset) , needles)
51+ _mm256_cmpeq_epi8 ( mm256_from_offset ( haystack, offset) , needles) ,
6152 ) ;
6253 offset += 32 ;
6354 }
@@ -70,7 +61,7 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
7061 for _ in 0 ..128 {
7162 counts = _mm256_sub_epi8 (
7263 counts,
73- _mm256_cmpeq_epi8 ( mm256_from_offset ( haystack, offset) , needles)
64+ _mm256_cmpeq_epi8 ( mm256_from_offset ( haystack, offset) , needles) ,
7465 ) ;
7566 offset += 32 ;
7667 }
@@ -82,16 +73,16 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
8273 for i in 0 ..( haystack. len ( ) - offset) / 32 {
8374 counts = _mm256_sub_epi8 (
8475 counts,
85- _mm256_cmpeq_epi8 ( mm256_from_offset ( haystack, offset + i * 32 ) , needles)
76+ _mm256_cmpeq_epi8 ( mm256_from_offset ( haystack, offset + i * 32 ) , needles) ,
8677 ) ;
8778 }
8879 if haystack. len ( ) % 32 != 0 {
8980 counts = _mm256_sub_epi8 (
9081 counts,
9182 _mm256_and_si256 (
9283 _mm256_cmpeq_epi8 ( mm256_from_offset ( haystack, haystack. len ( ) - 32 ) , needles) ,
93- mm256_from_offset ( & MASK , haystack. len ( ) % 32 )
94- )
84+ mm256_from_offset ( & MASK , haystack. len ( ) % 32 ) ,
85+ ) ,
9586 ) ;
9687 }
9788 count += sum ( & counts) ;
@@ -101,7 +92,10 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
10192
10293#[ target_feature( enable = "avx2" ) ]
10394unsafe fn is_leading_utf8_byte ( u8s : __m256i ) -> __m256i {
104- mm256_cmpneq_epi8 ( _mm256_and_si256 ( u8s, _mm256_set1_epu8 ( 0b1100_0000 ) ) , _mm256_set1_epu8 ( 0b1000_0000 ) )
95+ mm256_cmpneq_epi8 (
96+ _mm256_and_si256 ( u8s, _mm256_set1_epu8 ( 0b1100_0000 ) ) ,
97+ _mm256_set1_epu8 ( 0b1000_0000 ) ,
98+ )
10599}
106100
107101#[ target_feature( enable = "avx2" ) ]
@@ -118,7 +112,7 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
118112 for _ in 0 ..255 {
119113 counts = _mm256_sub_epi8 (
120114 counts,
121- is_leading_utf8_byte ( mm256_from_offset ( utf8_chars, offset) )
115+ is_leading_utf8_byte ( mm256_from_offset ( utf8_chars, offset) ) ,
122116 ) ;
123117 offset += 32 ;
124118 }
@@ -131,7 +125,7 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
131125 for _ in 0 ..128 {
132126 counts = _mm256_sub_epi8 (
133127 counts,
134- is_leading_utf8_byte ( mm256_from_offset ( utf8_chars, offset) )
128+ is_leading_utf8_byte ( mm256_from_offset ( utf8_chars, offset) ) ,
135129 ) ;
136130 offset += 32 ;
137131 }
@@ -143,16 +137,16 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
143137 for i in 0 ..( utf8_chars. len ( ) - offset) / 32 {
144138 counts = _mm256_sub_epi8 (
145139 counts,
146- is_leading_utf8_byte ( mm256_from_offset ( utf8_chars, offset + i * 32 ) )
140+ is_leading_utf8_byte ( mm256_from_offset ( utf8_chars, offset + i * 32 ) ) ,
147141 ) ;
148142 }
149143 if utf8_chars. len ( ) % 32 != 0 {
150144 counts = _mm256_sub_epi8 (
151145 counts,
152146 _mm256_and_si256 (
153147 is_leading_utf8_byte ( mm256_from_offset ( utf8_chars, utf8_chars. len ( ) - 32 ) ) ,
154- mm256_from_offset ( & MASK , utf8_chars. len ( ) % 32 )
155- )
148+ mm256_from_offset ( & MASK , utf8_chars. len ( ) % 32 ) ,
149+ ) ,
156150 ) ;
157151 }
158152 count += sum ( & counts) ;
0 commit comments