Skip to content

ARM v7 NEON support #105

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
Oct 20, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 28 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -138,7 +138,7 @@ jobs:
- uses: dtolnay/rust-toolchain@master
with:
toolchain: ${{ matrix.toolchain }}
targets: "aarch64-apple-darwin aarch64-unknown-linux-gnu"
targets: "aarch64-apple-darwin aarch64-unknown-linux-gnu armv7-unknown-linux-gnueabihf"
components: "llvm-tools-preview"
if: ${{ matrix.toolchain != '1.38.0' }}
- name: Check aarch64 inlining
@@ -150,6 +150,13 @@ jobs:
./check-inlining.sh $target expected-methods-aarch64-neon.txt "--features public_imp"
done
if: ${{ matrix.toolchain != '1.38.0' }}
- name: Check armv7 neon inlining
run: |
./check-inlining.sh armv7-unknown-linux-gnueabihf expected-methods-armv7-std.txt "--features armv7_neon"
./check-inlining.sh armv7-unknown-linux-gnueabihf expected-methods-armv7-std.txt "--features armv7_neon,public_imp"
RUSTFLAGS="-C target-feature=+neon" ./check-inlining.sh armv7-unknown-linux-gnueabihf expected-methods-armv7-neon.txt "--features armv7_neon"
RUSTFLAGS="-C target-feature=+neon" ./check-inlining.sh armv7-unknown-linux-gnueabihf expected-methods-armv7-neon.txt "--no-default-features --features armv7_neon"
if: ${{ matrix.toolchain == 'nightly' }}

test-doc:
runs-on: ubuntu-latest
@@ -189,3 +196,23 @@ jobs:
run: cargo clippy --all-targets --all-features
- name: Clippy - all features disabled
run: cargo clippy --all-targets --no-default-features

clippy_check_arm:
runs-on: ubuntu-latest
env:
RUSTFLAGS: "-D warnings"
steps:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@nightly
with:
components: clippy
targets: "thumbv7neon-unknown-linux-gnueabihf, armv7-unknown-linux-gnueabihf, armv7-linux-androideabi"
- name: Run Clippy
run: |
for target in thumbv7neon-unknown-linux-gnueabihf armv7-unknown-linux-gnueabihf armv7-linux-androideabi; do
cargo clippy --target $target --all-targets
cargo clippy --target $target --all-targets --all-features
cargo clippy --target $target --all-targets --no-default-features
cargo clippy --target $target --all-targets --no-default-features --features armv7_neon
cargo clippy --target $target --all-targets --no-default-features --features armv7_neon,public_imp
done
32 changes: 32 additions & 0 deletions .github/workflows/cross.yml
Original file line number Diff line number Diff line change
@@ -16,6 +16,8 @@ jobs:
strategy:
matrix:
toolchain: [stable, beta, nightly]
env:
RUSTFLAGS: "-D warnings"
steps:
- uses: actions/checkout@v4
- name: Install cross
@@ -62,3 +64,33 @@ jobs:
run: |
cargo clean; /home/runner/work/_temp/cross +${{ matrix.toolchain }} test --no-default-features --verbose --target i686-unknown-linux-gnu --all-features
if: ${{ matrix.toolchain == 'nightly' }}

cross-test-armv7:
runs-on: ubuntu-latest
strategy:
matrix:
toolchain: [stable, beta, nightly]
steps:
- uses: actions/checkout@v4
- name: Cross test
uses: houseabsolute/actions-rust-cross@v0
with:
command: build
toolchain: ${{ matrix.toolchain }}
target: armv7-unknown-linux-gnueabihf
args: "--no-default-features --verbose"
- name: test with cross
run: |
# Run the feature matrix once per RUSTFLAGS variant (warnings-as-errors, with and without
# NEON enabled at compile time). RUSTFLAGS has to be set on the cross invocation itself;
# otherwise the loop variable is unused and both outer iterations run the same build.
for rustflags in "-D warnings" "-D warnings -Ctarget-feature=+neon"; do
for features in "" "--features std" "--features public_imp" "--features std,public_imp"; do
cargo clean; RUSTFLAGS="$rustflags" /home/runner/work/_temp/cross +${{ matrix.toolchain }} test --no-default-features --verbose --target armv7-unknown-linux-gnueabihf $features
done
done
- name: test with cross with armv7_neon enabled
run: |
# Same RUSTFLAGS variants as above, but for feature sets that include `armv7_neon`.
# RUSTFLAGS has to be set on the cross invocation itself; otherwise the loop variable is
# unused and both outer iterations run the same build.
for rustflags in "-D warnings" "-D warnings -Ctarget-feature=+neon"; do
for features in "--features armv7_neon" "--features armv7_neon,std" "--features armv7_neon,public_imp" "--features armv7_neon,std,public_imp" "--all-features"; do
cargo clean; RUSTFLAGS="$rustflags" /home/runner/work/_temp/cross +${{ matrix.toolchain }} test --no-default-features --verbose --target armv7-unknown-linux-gnueabihf $features
done
done
if: ${{ matrix.toolchain == 'nightly' }}
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -37,6 +37,8 @@ aarch64_neon = []
# enable aarch64 prefetching for minor speedup - requires nightly
aarch64_neon_prefetch = []

# enable the armv7 NEON SIMD implementation - requires nightly
armv7_neon = []

# make the portable SIMD public implementation available (experimental, nightly only)
portable_public_imp = ["public_imp"]

7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -14,10 +14,11 @@ This library has been thoroughly tested with sample data as well as fuzzing and
* `basic` API for the fastest validation, optimized for valid UTF-8
* `compat` API as a fully compatible replacement for `std::str::from_utf8()`
* Supports AVX 2 and SSE 4.2 implementations on x86 and x86-64
* 🆕 ARM64 (aarch64) SIMD is supported since Rust 1.61
* 🆕 WASM (wasm32) SIMD is supported
* x86-64: Up to 23 times faster than the std library on valid non-ASCII, up to four times faster on ASCII
* ARM64 (aarch64) SIMD is supported since Rust 1.61
* WASM (wasm32) SIMD is supported
* x86-64: Up to 23 times faster than the std library on valid non-ASCII, up to four times faster on ASCII
* aarch64: Up to eleven times faster than the std library on valid non-ASCII, up to four times faster on ASCII (Apple Silicon)
* 🆕 armv7 NEON support with the `armv7_neon` feature on nightly Rust
* Faster than the original simdjson implementation
* Selects the fastest implementation at runtime based on CPU support (on x86)
* Falls back to the excellent std implementation if SIMD extensions are not supported
5 changes: 4 additions & 1 deletion inlining/check-inlining.sh
Original file line number Diff line number Diff line change
@@ -11,9 +11,12 @@ nm_output=$($LLVM_NM --defined-only ../target/$target/release/libsimdutf8.rlib)
# Choose, per target, the egrep pattern that selects text-section symbols (type `t`/`T`)
# from the nm output and the 1-based column from which `cut -c` keeps the symbol name.
if [[ $target == *darwin* ]]; then
# darwin: the pattern also matches a leading `_`, and the cut column is one further right.
pattern=" (t|T) _"
cut_arg=21
elif [[ $target == *armv7* ]]; then
# armv7: symbol names start at column 12 (presumably because 32-bit addresses are shorter
# in the nm output - TODO confirm).
pattern=" (t|T) "
cut_arg=12
else
# all other targets: symbol names start at column 20.
pattern=" (t|T) "
cut_arg=20
fi
inline_ignore_pattern='drop_in_place|::fmt::|^\$x\.|^<T as core::convert::From<T>>::from$'
inline_ignore_pattern='drop_in_place|::fmt::|^\$x\.|^<T as core::convert::From<T>>::from$|^core::result::Result<T,E>::map_err$'
echo "$nm_output" | rustfilt | egrep "$pattern" | cut -c "$cut_arg"- | grep -Ev "$inline_ignore_pattern" | sort | diff -u $expected_fns -
5 changes: 5 additions & 0 deletions inlining/expected-methods-armv7-neon.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
simdutf8::implementation::armv7::validate_utf8_basic
simdutf8::implementation::armv7::validate_utf8_basic_neon
simdutf8::implementation::armv7::validate_utf8_compat
simdutf8::implementation::armv7::validate_utf8_compat_neon
simdutf8::implementation::helpers::get_compat_error
7 changes: 7 additions & 0 deletions inlining/expected-methods-armv7-std.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
simdutf8::implementation::armv7::neon::validate_utf8_basic
simdutf8::implementation::armv7::neon::validate_utf8_compat
simdutf8::implementation::armv7::validate_utf8_basic::get_fastest
simdutf8::implementation::armv7::validate_utf8_compat::get_fastest
simdutf8::implementation::helpers::get_compat_error
simdutf8::implementation::validate_utf8_basic_fallback
simdutf8::implementation::validate_utf8_compat_fallback
19 changes: 19 additions & 0 deletions src/basic.rs
Original file line number Diff line number Diff line change
@@ -231,6 +231,25 @@ pub mod imp {
}
}

/// Includes the armv7 NEON SIMD implementations.
///
/// This module is only compiled for little-endian `arm` targets that have the `v7`
/// target feature, and only when the `armv7_neon` crate feature is enabled.
#[cfg(all(
target_arch = "arm",
target_feature = "v7",
target_endian = "little",
feature = "armv7_neon",
))]
pub mod armv7 {
/// Includes the Neon-based validation implementation for armv7 neon-compatible CPUs.
///
/// Using the provided functionality on CPUs which do not support neon is undefined
/// behavior and will very likely cause a crash.
pub mod neon {
// The basic-API validation function is re-exported under the name `validate_utf8`.
pub use crate::implementation::armv7::neon::validate_utf8_basic as validate_utf8;
// The validator implementation types are re-exported under their original names.
pub use crate::implementation::armv7::neon::ChunkedUtf8ValidatorImp;
pub use crate::implementation::armv7::neon::Utf8ValidatorImp;
}
}

/// Includes the wasm32 SIMD implementations.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
pub mod wasm32 {
17 changes: 17 additions & 0 deletions src/compat.rs
Original file line number Diff line number Diff line change
@@ -124,6 +124,23 @@ pub mod imp {
}
}

/// Includes the armv7 NEON SIMD implementations.
///
/// This module is only compiled for little-endian `arm` targets that have the `v7`
/// target feature, and only when the `armv7_neon` crate feature is enabled.
#[cfg(all(
target_arch = "arm",
target_feature = "v7",
target_endian = "little",
feature = "armv7_neon",
))]
pub mod armv7 {
/// Includes the Neon-based validation implementation for armv7 neon-compatible CPUs.
///
/// Using the provided functionality on CPUs which do not support neon is undefined
/// behavior and will very likely cause a crash.
pub mod neon {
// The compat-API validation function is re-exported under the name `validate_utf8`.
pub use crate::implementation::armv7::neon::validate_utf8_compat as validate_utf8;
}
}

/// Includes the wasm32 SIMD implementations.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
pub mod wasm32 {
52 changes: 26 additions & 26 deletions src/implementation/algorithm.rs
Original file line number Diff line number Diff line change
@@ -183,15 +183,15 @@ macro_rules! algorithm_simd {
// WORKAROUND
// necessary because the for loop is not unrolled on ARM64
if input.vals.len() == 2 {
self.check_bytes(*input.vals.get_unchecked(0));
self.check_bytes(*input.vals.get_unchecked(1));
self.incomplete = Self::is_incomplete(*input.vals.get_unchecked(1));
self.check_bytes(*input.vals.as_ptr());
self.check_bytes(*input.vals.as_ptr().add(1));
self.incomplete = Self::is_incomplete(*input.vals.as_ptr().add(1));
} else if input.vals.len() == 4 {
self.check_bytes(*input.vals.get_unchecked(0));
self.check_bytes(*input.vals.get_unchecked(1));
self.check_bytes(*input.vals.get_unchecked(2));
self.check_bytes(*input.vals.get_unchecked(3));
self.incomplete = Self::is_incomplete(*input.vals.get_unchecked(3));
self.check_bytes(*input.vals.as_ptr());
self.check_bytes(*input.vals.as_ptr().add(1));
self.check_bytes(*input.vals.as_ptr().add(2));
self.check_bytes(*input.vals.as_ptr().add(3));
self.incomplete = Self::is_incomplete(*input.vals.as_ptr().add(3));
} else {
panic!("Unsupported number of chunks");
}
@@ -219,7 +219,7 @@ macro_rules! algorithm_simd {
let iter_lim = len - (len % SIMD_CHUNK_SIZE);

while idx < iter_lim {
let simd_input = SimdInput::new(input.get_unchecked(idx as usize..));
let simd_input = SimdInput::new(input.as_ptr().add(idx as usize));
idx += SIMD_CHUNK_SIZE;
if !simd_input.is_ascii() {
algorithm.check_block(simd_input);
@@ -231,7 +231,7 @@ macro_rules! algorithm_simd {
if PREFETCH {
simd_prefetch(input.as_ptr().add(idx + SIMD_CHUNK_SIZE * 2));
}
let input = SimdInput::new(input.get_unchecked(idx as usize..));
let input = SimdInput::new(input.as_ptr().add(idx as usize));
algorithm.check_utf8(input);
idx += SIMD_CHUNK_SIZE;
}
@@ -243,7 +243,7 @@ macro_rules! algorithm_simd {
tmpbuf.0.as_mut_ptr(),
len - idx,
);
let simd_input = SimdInput::new(&tmpbuf.0);
let simd_input = SimdInput::new(tmpbuf.0.as_ptr());
algorithm.check_utf8(simd_input);
}
algorithm.check_incomplete_pending();
@@ -286,7 +286,7 @@ macro_rules! algorithm_simd {
'outer: loop {
if only_ascii {
while idx < iter_lim {
let simd_input = SimdInput::new(input.get_unchecked(idx as usize..));
let simd_input = SimdInput::new(input.as_ptr().add(idx as usize));
if !simd_input.is_ascii() {
algorithm.check_block(simd_input);
if algorithm.has_error() {
@@ -304,7 +304,7 @@ macro_rules! algorithm_simd {
if PREFETCH {
simd_prefetch(input.as_ptr().add(idx + SIMD_CHUNK_SIZE * 2));
}
let simd_input = SimdInput::new(input.get_unchecked(idx as usize..));
let simd_input = SimdInput::new(input.as_ptr().add(idx as usize));
if simd_input.is_ascii() {
algorithm.check_incomplete_pending();
if algorithm.has_error() {
@@ -333,7 +333,7 @@ macro_rules! algorithm_simd {
tmpbuf.0.as_mut_ptr(),
len - idx,
);
let simd_input = SimdInput::new(&tmpbuf.0);
let simd_input = SimdInput::new(tmpbuf.0.as_ptr());

algorithm.check_utf8(simd_input);
}
@@ -362,7 +362,7 @@ macro_rules! algorithm_simd {
$(#[$feat])*
#[inline]
unsafe fn update_from_incomplete_data(&mut self) {
let simd_input = SimdInput::new(&self.incomplete_data);
let simd_input = SimdInput::new(self.incomplete_data.as_ptr());
self.algorithm.check_utf8(simd_input);
self.incomplete_len = 0;
}
@@ -407,7 +407,7 @@ macro_rules! algorithm_simd {
let mut idx: usize = 0;
let iter_lim = len - (len % SIMD_CHUNK_SIZE);
while idx < iter_lim {
let input = SimdInput::new(input.get_unchecked(idx as usize..));
let input = SimdInput::new(input.as_ptr().add(idx as usize));
self.algorithm.check_utf8(input);
idx += SIMD_CHUNK_SIZE;
}
@@ -469,7 +469,7 @@ macro_rules! algorithm_simd {
"Input size must be a multiple of 64."
);
for chunk in input.chunks_exact(SIMD_CHUNK_SIZE) {
let input = SimdInput::new(chunk);
let input = SimdInput::new(chunk.as_ptr());
self.algorithm.check_utf8(input);
}
}
@@ -497,7 +497,7 @@ macro_rules! algorithm_simd {
remaining_input.as_ptr(),
remaining_input.len(),
);
let simd_input = SimdInput::new(&tmpbuf.0);
let simd_input = SimdInput::new(tmpbuf.0.as_ptr());
self.algorithm.check_utf8(simd_input);
}
}
@@ -523,13 +523,13 @@ macro_rules! simd_input_128_bit {
impl SimdInput {
$(#[$feat])*
#[inline]
unsafe fn new(ptr: &[u8]) -> Self {
unsafe fn new(ptr: *const u8) -> Self {
Self {
vals: [
SimdU8Value::load_from(ptr.as_ptr()),
SimdU8Value::load_from(ptr.as_ptr().add(16)),
SimdU8Value::load_from(ptr.as_ptr().add(32)),
SimdU8Value::load_from(ptr.as_ptr().add(48)),
SimdU8Value::load_from(ptr),
SimdU8Value::load_from(ptr.add(16)),
SimdU8Value::load_from(ptr.add(32)),
SimdU8Value::load_from(ptr.add(48)),
],
}
}
@@ -556,11 +556,11 @@ macro_rules! simd_input_256_bit {
impl SimdInput {
$(#[$feat])*
#[inline]
unsafe fn new(ptr: &[u8]) -> Self {
unsafe fn new(ptr: *const u8) -> Self {
Self {
vals: [
SimdU8Value::load_from(ptr.as_ptr()),
SimdU8Value::load_from(ptr.as_ptr().add(32)),
SimdU8Value::load_from(ptr),
SimdU8Value::load_from(ptr.add(32)),
],
}
}
Loading