diff --git a/Cargo.toml b/Cargo.toml
index 030b621..5711b1d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,6 +22,7 @@ rand = "0.4"
 [features]
 default = ["std"]
 std = []
+nightly = []
 
 [[bench]]
 name = "bench"
diff --git a/README.md b/README.md
index c696ee6..d984d70 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,7 @@ This crate contains multiple CRC32 implementations:
 
 - A fast baseline implementation which processes up to 16 bytes per iteration
 - An optimized implementation for modern `x86` using `sse` and `pclmulqdq` instructions
+- An optimized implementation for `aarch64` using `crc32` instructions
 
 Calling the `Hasher::new` constructor at runtime will perform a feature detection to select the most
 optimal implementation for the current CPU feature set.
diff --git a/src/lib.rs b/src/lib.rs
index b66023d..bba0378 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -19,6 +19,7 @@
 //! optimal implementation for the current CPU feature set.
 
 #![cfg_attr(not(feature = "std"), no_std)]
+#![cfg_attr(all(feature = "nightly", target_arch = "aarch64"), feature(stdsimd, aarch64_target_feature))]
 
 #[deny(missing_docs)]
 #[cfg(test)]
diff --git a/src/specialized/aarch64.rs b/src/specialized/aarch64.rs
new file mode 100644
index 0000000..6ce2f5a
--- /dev/null
+++ b/src/specialized/aarch64.rs
@@ -0,0 +1,85 @@
+use std::arch::aarch64 as arch;
+
+#[derive(Clone)]
+pub struct State {
+    state: u32,
+}
+
+impl State {
+    pub fn new() -> Option<Self> {
+        if is_aarch64_feature_detected!("crc") {
+            // SAFETY: The conditions above ensure that all
+            // required instructions are supported by the CPU.
+            Some(Self { state: 0 })
+        } else {
+            None
+        }
+    }
+
+    pub fn update(&mut self, buf: &[u8]) {
+        // SAFETY: The `State::new` constructor ensures that all
+        // required instructions are supported by the CPU.
+        self.state = unsafe { calculate(self.state, buf) }
+    }
+
+    pub fn finalize(self) -> u32 {
+        self.state
+    }
+
+    pub fn reset(&mut self) {
+        self.state = 0;
+    }
+
+    pub fn combine(&mut self, other: u32, amount: u64) {
+        self.state = ::combine::combine(self.state, other, amount);
+    }
+}
+
+// target_feature is necessary to allow rustc to inline the crc32* wrappers
+#[target_feature(enable = "crc")]
+pub unsafe fn calculate(crc: u32, data: &[u8]) -> u32 {
+    let mut c32 = !crc;
+    let (pre_quad, quads, post_quad) = data.align_to::<u64>();
+
+    c32 = pre_quad.iter().fold(c32, |acc, &b| arch::__crc32b(acc, b));
+
+    // unrolling increases performance by a lot
+    let mut quad_iter = quads.chunks_exact(8);
+    for chunk in &mut quad_iter {
+        c32 = arch::__crc32d(c32, chunk[0]);
+        c32 = arch::__crc32d(c32, chunk[1]);
+        c32 = arch::__crc32d(c32, chunk[2]);
+        c32 = arch::__crc32d(c32, chunk[3]);
+        c32 = arch::__crc32d(c32, chunk[4]);
+        c32 = arch::__crc32d(c32, chunk[5]);
+        c32 = arch::__crc32d(c32, chunk[6]);
+        c32 = arch::__crc32d(c32, chunk[7]);
+    }
+    c32 = quad_iter.remainder().iter().fold(c32, |acc, &q| arch::__crc32d(acc, q));
+
+    c32 = post_quad.iter().fold(c32, |acc, &b| arch::__crc32b(acc, b));
+
+    !c32
+}
+
+#[cfg(test)]
+mod test {
+    quickcheck! {
+        fn check_against_baseline(chunks: Vec<(Vec<u8>, usize)>) -> bool {
+            let mut baseline = super::super::super::baseline::State::new();
+            let mut aarch64 = super::State::new().expect("not supported");
+            for (chunk, mut offset) in chunks {
+                // simulate random alignments by offsetting the slice by up to 15 bytes
+                offset = offset & 0xF;
+                if chunk.len() <= offset {
+                    baseline.update(&chunk);
+                    aarch64.update(&chunk);
+                } else {
+                    baseline.update(&chunk[offset..]);
+                    aarch64.update(&chunk[offset..]);
+                }
+            }
+            aarch64.finalize() == baseline.finalize()
+        }
+    }
+}
diff --git a/src/specialized/mod.rs b/src/specialized/mod.rs
index 59e1be6..24f8912 100644
--- a/src/specialized/mod.rs
+++ b/src/specialized/mod.rs
@@ -5,6 +5,9 @@ cfg_if! {
     ))] {
         mod pclmulqdq;
        pub use self::pclmulqdq::State;
+    } else if #[cfg(all(feature = "nightly", target_arch = "aarch64"))] {
+        mod aarch64;
+        pub use self::aarch64::State;
     } else {
         #[derive(Clone)]
         pub enum State {}
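
For context, here is a minimal usage sketch of the runtime dispatch described in the README hunk above. It assumes only the crate's existing public `Hasher` API (`new`/`update`/`finalize`), which this patch does not change; with the `nightly` feature enabled on an `aarch64` CPU that reports the `crc` extension, `Hasher::new` would pick the specialized `State` added here.

```rust
extern crate crc32fast;

use crc32fast::Hasher;

fn main() {
    // Runtime feature detection: selects the baseline, pclmulqdq (x86),
    // or aarch64 crc32 implementation depending on the current CPU.
    let mut hasher = Hasher::new();
    hasher.update(b"hello ");
    hasher.update(b"world");
    let checksum: u32 = hasher.finalize();
    println!("crc32: {:08x}", checksum);
}
```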