@@ -51,15 +51,21 @@ unsafe fn sum(u8s: v128) -> usize {
5151unsafe fn sum4 ( u1 : v128 , u2 : v128 , u3 : v128 , u4 : v128 ) -> usize {
5252 // sum < (2^2 * 2^3 * 2^8 = 2^13) < 2^16, therefore no overflow here
5353 let u16s = u16x8_add (
54- u16x8_add ( u16x8_extadd_pairwise_u8x16 ( u1) , u16x8_extadd_pairwise_u8x16 ( u2) ) ,
55- u16x8_add ( u16x8_extadd_pairwise_u8x16 ( u3) , u16x8_extadd_pairwise_u8x16 ( u4) ) ,
54+ u16x8_add (
55+ u16x8_extadd_pairwise_u8x16 ( u1) ,
56+ u16x8_extadd_pairwise_u8x16 ( u2) ,
57+ ) ,
58+ u16x8_add (
59+ u16x8_extadd_pairwise_u8x16 ( u3) ,
60+ u16x8_extadd_pairwise_u8x16 ( u4) ,
61+ ) ,
5662 ) ;
5763 let u32s = u32x4_extadd_pairwise_u16x8 ( u16s) ;
5864 let ( u1, u2, u3, u4) = (
65+ u32x4_extract_lane :: < 0 > ( u32s) ,
5966 u32x4_extract_lane :: < 1 > ( u32s) ,
6067 u32x4_extract_lane :: < 2 > ( u32s) ,
6168 u32x4_extract_lane :: < 3 > ( u32s) ,
62- u32x4_extract_lane :: < 4 > ( u32s) ,
6369 ) ;
6470 ( ( u1 + u2) + ( u3 + u4) ) as usize
6571}
@@ -69,10 +75,14 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
6975 let needles = u8x16_splat ( needle) ;
7076 let mut count = 0 ;
7177 let mut offset = 0 ;
72-
78+
7379 while haystack. len ( ) >= offset + 16 * 255 {
74- let ( mut count1, mut count2, mut count3, mut count4) =
75- ( u8x16_splat ( 0 ) , u8x16_splat ( 0 ) , u8x16_splat ( 0 ) , u8x16_splat ( 0 ) ) ;
80+ let ( mut count1, mut count2, mut count3, mut count4) = (
81+ u8x16_splat ( 0 ) ,
82+ u8x16_splat ( 0 ) ,
83+ u8x16_splat ( 0 ) ,
84+ u8x16_splat ( 0 ) ,
85+ ) ;
7686 for _ in 0 ..255 {
7787 let ( h1, h2, h3, h4) = u8x16x4_from_offset ( haystack, offset) ;
7888 count1 = u8x16_sub ( count1, u8x16_eq ( h1, needles) ) ;
@@ -83,10 +93,14 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
8393 }
8494 count += sum4 ( count1, count2, count3, count4) ;
8595 }
86-
96+
8797 // 64
88- let ( mut count1, mut count2, mut count3, mut count4) =
89- ( u8x16_splat ( 0 ) , u8x16_splat ( 0 ) , u8x16_splat ( 0 ) , u8x16_splat ( 0 ) ) ;
98+ let ( mut count1, mut count2, mut count3, mut count4) = (
99+ u8x16_splat ( 0 ) ,
100+ u8x16_splat ( 0 ) ,
101+ u8x16_splat ( 0 ) ,
102+ u8x16_splat ( 0 ) ,
103+ ) ;
90104 for _ in 0 ..( haystack. len ( ) - offset) / 64 {
91105 let ( h1, h2, h3, h4) = u8x16x4_from_offset ( haystack, offset) ;
92106 count1 = u8x16_sub ( count1, u8x16_eq ( h1, needles) ) ;
@@ -114,7 +128,7 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
114128 ) ,
115129 ) ;
116130 }
117- count + sum ( counts)
131+ count + sum ( counts)
118132}
119133
120134#[ target_feature( enable = "simd128" ) ]
@@ -134,23 +148,31 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
134148
135149 // 4080
136150 while utf8_chars. len ( ) >= offset + 64 * 255 {
137- let ( mut count1, mut count2, mut count3, mut count4) =
138- ( u8x16_splat ( 0 ) , u8x16_splat ( 0 ) , u8x16_splat ( 0 ) , u8x16_splat ( 0 ) ) ;
151+ let ( mut count1, mut count2, mut count3, mut count4) = (
152+ u8x16_splat ( 0 ) ,
153+ u8x16_splat ( 0 ) ,
154+ u8x16_splat ( 0 ) ,
155+ u8x16_splat ( 0 ) ,
156+ ) ;
139157
140158 for _ in 0 ..255 {
141159 let ( h1, h2, h3, h4) = u8x16x4_from_offset ( utf8_chars, offset) ;
142- count1 = u8x16_sub ( count1, is_leading_utf8_byte ( h1) ) ;
143- count2 = u8x16_sub ( count2, is_leading_utf8_byte ( h2) ) ;
144- count3 = u8x16_sub ( count3, is_leading_utf8_byte ( h3) ) ;
145- count4 = u8x16_sub ( count4, is_leading_utf8_byte ( h4) ) ;
160+ count1 = u8x16_sub ( count1, is_leading_utf8_byte ( h1) ) ;
161+ count2 = u8x16_sub ( count2, is_leading_utf8_byte ( h2) ) ;
162+ count3 = u8x16_sub ( count3, is_leading_utf8_byte ( h3) ) ;
163+ count4 = u8x16_sub ( count4, is_leading_utf8_byte ( h4) ) ;
146164 offset += 64 ;
147165 }
148166 count += sum4 ( count1, count2, count3, count4) ;
149167 }
150168
151169 // 4080
152- let ( mut count1, mut count2, mut count3, mut count4) =
153- ( u8x16_splat ( 0 ) , u8x16_splat ( 0 ) , u8x16_splat ( 0 ) , u8x16_splat ( 0 ) ) ;
170+ let ( mut count1, mut count2, mut count3, mut count4) = (
171+ u8x16_splat ( 0 ) ,
172+ u8x16_splat ( 0 ) ,
173+ u8x16_splat ( 0 ) ,
174+ u8x16_splat ( 0 ) ,
175+ ) ;
154176 for _ in 0 ..( utf8_chars. len ( ) - offset) / 64 {
155177 let ( h1, h2, h3, h4) = u8x16x4_from_offset ( utf8_chars, offset) ;
156178 count1 = u8x16_sub ( count1, is_leading_utf8_byte ( h1) ) ;
0 commit comments