Skip to content

Commit ffd810a

Browse files
committed
add aarch64 to CI matrix
1 parent 5dc85ad commit ffd810a

2 files changed

Lines changed: 51 additions & 32 deletions

File tree

.github/workflows/ci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ jobs:
2222
arch:
2323
- i686
2424
- x86_64
25+
- aarch64
2526
features:
2627
- default
2728
- runtime-dispatch-simd

src/simd/aarch64.rs

Lines changed: 50 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use core::arch::aarch64::{
2-
uint8x16_t, vaddlvq_u8, vandq_u8, vceqq_u8, vcgtq_u8, vdupq_n_u8, vld1q_u8, vmvnq_u8, vsubq_u8,
2+
uint8x16_t, uint8x16x4_t, vaddlvq_u8, vandq_u8, vceqq_u8, vdupq_n_u8, vld1q_u8, vld1q_u8_x4,
3+
vmvnq_u8, vsubq_u8,
34
};
45

56
const MASK: [u8; 32] = [
@@ -9,12 +10,29 @@ const MASK: [u8; 32] = [
910

1011
#[target_feature(enable = "neon")]
1112
unsafe fn u8x16_from_offset(slice: &[u8], offset: usize) -> uint8x16_t {
13+
debug_assert!(
14+
offset + 16 <= slice.len(),
15+
"{} + 16 ≥ {}",
16+
offset,
17+
slice.len()
18+
);
1219
vld1q_u8(slice.as_ptr().add(offset) as *const _) // TODO: does this need to be aligned?
1320
}
1421

1522
#[target_feature(enable = "neon")]
16-
unsafe fn sum(u8s: &uint8x16_t) -> usize {
17-
vaddlvq_u8(*u8s) as usize
23+
unsafe fn u8x16_x4_from_offset(slice: &[u8], offset: usize) -> uint8x16x4_t {
24+
debug_assert!(
25+
offset + 64 <= slice.len(),
26+
"{} + 64 ≥ {}",
27+
offset,
28+
slice.len()
29+
);
30+
vld1q_u8_x4(slice.as_ptr().add(offset) as *const _)
31+
}
32+
33+
#[target_feature(enable = "neon")]
34+
unsafe fn sum(u8s: uint8x16_t) -> usize {
35+
vaddlvq_u8(u8s) as usize
1836
}
1937

2038
#[target_feature(enable = "neon")]
@@ -26,38 +44,40 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
2644

2745
let needles = vdupq_n_u8(needle);
2846

29-
// 4080
30-
while haystack.len() >= offset + 16 * 255 {
31-
let mut counts = vdupq_n_u8(0);
47+
// 16320
48+
while haystack.len() >= offset + 64 * 255 {
49+
let (mut count1, mut count2, mut count3, mut count4) =
50+
(vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0));
3251
for _ in 0..255 {
33-
counts = vsubq_u8(
34-
counts,
35-
vceqq_u8(u8x16_from_offset(haystack, offset), needles),
36-
);
37-
offset += 16;
52+
let uint8x16x4_t(h1, h2, h3, h4) = u8x16_x4_from_offset(haystack, offset);
53+
count1 = vsubq_u8(count1, vceqq_u8(h1, needles));
54+
count2 = vsubq_u8(count2, vceqq_u8(h2, needles));
55+
count3 = vsubq_u8(count3, vceqq_u8(h3, needles));
56+
count4 = vsubq_u8(count4, vceqq_u8(h4, needles));
57+
offset += 64;
3858
}
39-
count += sum(&counts);
59+
count += sum(count1) + sum(count2) + sum(count3) + sum(count4);
4060
}
4161

42-
// 2048
43-
if haystack.len() >= offset + 16 * 128 {
44-
let mut counts = vdupq_n_u8(0);
45-
for _ in 0..128 {
46-
counts = vsubq_u8(
47-
counts,
48-
vceqq_u8(u8x16_from_offset(haystack, offset), needles),
49-
);
50-
offset += 16;
51-
}
52-
count += sum(&counts);
62+
// 64
63+
let (mut count1, mut count2, mut count3, mut count4) =
64+
(vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0));
65+
for _ in 0..(haystack.len() - offset) / 64 {
66+
let uint8x16x4_t(h1, h2, h3, h4) = u8x16_x4_from_offset(haystack, offset);
67+
count1 = vsubq_u8(count1, vceqq_u8(h1, needles));
68+
count2 = vsubq_u8(count2, vceqq_u8(h2, needles));
69+
count3 = vsubq_u8(count3, vceqq_u8(h3, needles));
70+
count4 = vsubq_u8(count4, vceqq_u8(h4, needles));
71+
offset += 64;
5372
}
73+
count += sum(count1) + sum(count2) + sum(count3) + sum(count4);
5474

55-
// 16
5675
let mut counts = vdupq_n_u8(0);
76+
// 16
5777
for i in 0..(haystack.len() - offset) / 16 {
5878
counts = vsubq_u8(
5979
counts,
60-
vcgtq_u8(u8x16_from_offset(haystack, offset + i * 32), needles),
80+
vceqq_u8(u8x16_from_offset(haystack, offset + i * 16), needles),
6181
);
6282
}
6383
if haystack.len() % 16 != 0 {
@@ -69,9 +89,7 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
6989
),
7090
);
7191
}
72-
count += sum(&counts);
73-
74-
count
92+
count + sum(counts)
7593
}
7694

7795
#[target_feature(enable = "neon")]
@@ -100,7 +118,7 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
100118
);
101119
offset += 16;
102120
}
103-
count += sum(&counts);
121+
count += sum(counts);
104122
}
105123

106124
// 2048
@@ -113,15 +131,15 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
113131
);
114132
offset += 16;
115133
}
116-
count += sum(&counts);
134+
count += sum(counts);
117135
}
118136

119137
// 16
120138
let mut counts = vdupq_n_u8(0);
121139
for i in 0..(utf8_chars.len() - offset) / 16 {
122140
counts = vsubq_u8(
123141
counts,
124-
is_leading_utf8_byte(u8x16_from_offset(utf8_chars, offset + i * 32)),
142+
is_leading_utf8_byte(u8x16_from_offset(utf8_chars, offset + i * 16)),
125143
);
126144
}
127145
if utf8_chars.len() % 16 != 0 {
@@ -133,7 +151,7 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
133151
),
134152
);
135153
}
136-
count += sum(&counts);
154+
count += sum(counts);
137155

138156
count
139157
}

0 commit comments

Comments
 (0)