11use core:: arch:: aarch64:: {
2- uint8x16_t, vaddlvq_u8, vandq_u8, vceqq_u8, vcgtq_u8, vdupq_n_u8, vld1q_u8, vmvnq_u8, vsubq_u8,
2+ uint8x16_t, uint8x16x4_t, vaddlvq_u8, vandq_u8, vceqq_u8, vdupq_n_u8, vld1q_u8, vld1q_u8_x4,
3+ vmvnq_u8, vsubq_u8,
34} ;
45
56const MASK : [ u8 ; 32 ] = [
@@ -9,12 +10,29 @@ const MASK: [u8; 32] = [
910
1011#[ target_feature( enable = "neon" ) ]
1112unsafe fn u8x16_from_offset ( slice : & [ u8 ] , offset : usize ) -> uint8x16_t {
13+ debug_assert ! (
14+ offset + 16 <= slice. len( ) ,
15+ "{} + 16 ≥ {}" ,
16+ offset,
17+ slice. len( )
18+ ) ;
1219 vld1q_u8 ( slice. as_ptr ( ) . add ( offset) as * const _ ) // TODO: does this need to be aligned?
1320}
1421
1522#[ target_feature( enable = "neon" ) ]
16- unsafe fn sum ( u8s : & uint8x16_t ) -> usize {
17- vaddlvq_u8 ( * u8s) as usize
23+ unsafe fn u8x16_x4_from_offset ( slice : & [ u8 ] , offset : usize ) -> uint8x16x4_t {
24+ debug_assert ! (
25+ offset + 64 <= slice. len( ) ,
26+ "{} + 64 ≥ {}" ,
27+ offset,
28+ slice. len( )
29+ ) ;
30+ vld1q_u8_x4 ( slice. as_ptr ( ) . add ( offset) as * const _ )
31+ }
32+
33+ #[ target_feature( enable = "neon" ) ]
34+ unsafe fn sum ( u8s : uint8x16_t ) -> usize {
35+ vaddlvq_u8 ( u8s) as usize
1836}
1937
2038#[ target_feature( enable = "neon" ) ]
@@ -26,38 +44,40 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
2644
2745 let needles = vdupq_n_u8 ( needle) ;
2846
29- // 4080
30- while haystack. len ( ) >= offset + 16 * 255 {
31- let mut counts = vdupq_n_u8 ( 0 ) ;
47+ // 16320
48+ while haystack. len ( ) >= offset + 64 * 255 {
49+ let ( mut count1, mut count2, mut count3, mut count4) =
50+ ( vdupq_n_u8 ( 0 ) , vdupq_n_u8 ( 0 ) , vdupq_n_u8 ( 0 ) , vdupq_n_u8 ( 0 ) ) ;
3251 for _ in 0 ..255 {
33- counts = vsubq_u8 (
34- counts,
35- vceqq_u8 ( u8x16_from_offset ( haystack, offset) , needles) ,
36- ) ;
37- offset += 16 ;
52+ let uint8x16x4_t( h1, h2, h3, h4) = u8x16_x4_from_offset ( haystack, offset) ;
53+ count1 = vsubq_u8 ( count1, vceqq_u8 ( h1, needles) ) ;
54+ count2 = vsubq_u8 ( count2, vceqq_u8 ( h2, needles) ) ;
55+ count3 = vsubq_u8 ( count3, vceqq_u8 ( h3, needles) ) ;
56+ count4 = vsubq_u8 ( count4, vceqq_u8 ( h4, needles) ) ;
57+ offset += 64 ;
3858 }
39- count += sum ( & counts ) ;
59+ count += sum ( count1 ) + sum ( count2 ) + sum ( count3 ) + sum ( count4 ) ;
4060 }
4161
42- // 2048
43- if haystack. len ( ) >= offset + 16 * 128 {
44- let mut counts = vdupq_n_u8 ( 0 ) ;
45- for _ in 0 ..128 {
46- counts = vsubq_u8 (
47- counts,
48- vceqq_u8 ( u8x16_from_offset ( haystack, offset) , needles) ,
49- ) ;
50- offset += 16 ;
51- }
52- count += sum ( & counts) ;
62+ // 64
63+ let ( mut count1, mut count2, mut count3, mut count4) =
64+ ( vdupq_n_u8 ( 0 ) , vdupq_n_u8 ( 0 ) , vdupq_n_u8 ( 0 ) , vdupq_n_u8 ( 0 ) ) ;
65+ for _ in 0 ..( haystack. len ( ) - offset) / 64 {
66+ let uint8x16x4_t( h1, h2, h3, h4) = u8x16_x4_from_offset ( haystack, offset) ;
67+ count1 = vsubq_u8 ( count1, vceqq_u8 ( h1, needles) ) ;
68+ count2 = vsubq_u8 ( count2, vceqq_u8 ( h2, needles) ) ;
69+ count3 = vsubq_u8 ( count3, vceqq_u8 ( h3, needles) ) ;
70+ count4 = vsubq_u8 ( count4, vceqq_u8 ( h4, needles) ) ;
71+ offset += 64 ;
5372 }
73+ count += sum ( count1) + sum ( count2) + sum ( count3) + sum ( count4) ;
5474
55- // 16
5675 let mut counts = vdupq_n_u8 ( 0 ) ;
76+ // 16
5777 for i in 0 ..( haystack. len ( ) - offset) / 16 {
5878 counts = vsubq_u8 (
5979 counts,
60- vcgtq_u8 ( u8x16_from_offset ( haystack, offset + i * 32 ) , needles) ,
80+ vceqq_u8 ( u8x16_from_offset ( haystack, offset + i * 16 ) , needles) ,
6181 ) ;
6282 }
6383 if haystack. len ( ) % 16 != 0 {
@@ -69,9 +89,7 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
6989 ) ,
7090 ) ;
7191 }
72- count += sum ( & counts) ;
73-
74- count
92+ count + sum ( counts)
7593}
7694
7795#[ target_feature( enable = "neon" ) ]
@@ -100,7 +118,7 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
100118 ) ;
101119 offset += 16 ;
102120 }
103- count += sum ( & counts) ;
121+ count += sum ( counts) ;
104122 }
105123
106124 // 2048
@@ -113,15 +131,15 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
113131 ) ;
114132 offset += 16 ;
115133 }
116- count += sum ( & counts) ;
134+ count += sum ( counts) ;
117135 }
118136
119137 // 16
120138 let mut counts = vdupq_n_u8 ( 0 ) ;
121139 for i in 0 ..( utf8_chars. len ( ) - offset) / 16 {
122140 counts = vsubq_u8 (
123141 counts,
124- is_leading_utf8_byte ( u8x16_from_offset ( utf8_chars, offset + i * 32 ) ) ,
142+ is_leading_utf8_byte ( u8x16_from_offset ( utf8_chars, offset + i * 16 ) ) ,
125143 ) ;
126144 }
127145 if utf8_chars. len ( ) % 16 != 0 {
@@ -133,7 +151,7 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
133151 ) ,
134152 ) ;
135153 }
136- count += sum ( & counts) ;
154+ count += sum ( counts) ;
137155
138156 count
139157}
0 commit comments