diff --git a/Cargo.toml b/Cargo.toml
index b6c229d..aa1637a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,6 +23,7 @@ criterion = "0.5"
 hashbrown = "0.14"
 uuid = "1.8"
 rand = "0.8"
+rapidhash = "3.0"
 ahash = "0.8"
 fxhash = "0.2"
 chrono = "0.4"
@@ -42,3 +43,6 @@ harness = false
 
 [profile.release]
 lto = "thin"
+codegen-units = 1
+incremental = false
+debug-assertions = false
\ No newline at end of file
diff --git a/benches/bench.rs b/benches/bench.rs
index 81cc2e1..6a3a1a4 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -147,24 +147,28 @@ fn profile_distr(distr: D, map_size: usize, c: &mut Criterion)
     profile_hashonly::("foldhash-fast", distr.clone(), c);
     profile_hashonly::("foldhash-quality", distr.clone(), c);
+    profile_hashonly::("rapidhash-fast", distr.clone(), c);
     profile_hashonly::("fxhash", distr.clone(), c);
     profile_hashonly::("ahash", distr.clone(), c);
     profile_hashonly::("siphash", distr.clone(), c);
 
     profile_lookup_miss::("foldhash-fast", distr.clone(), map_size, c);
     profile_lookup_miss::("foldhash-quality", distr.clone(), map_size, c);
+    profile_lookup_miss::("rapidhash-fast", distr.clone(), map_size, c);
     profile_lookup_miss::("fxhash", distr.clone(), map_size, c);
     profile_lookup_miss::("ahash", distr.clone(), map_size, c);
     profile_lookup_miss::("siphash", distr.clone(), map_size, c);
 
     profile_lookup_hit::("foldhash-fast", distr.clone(), map_size, c);
     profile_lookup_hit::("foldhash-quality", distr.clone(), map_size, c);
+    profile_lookup_hit::("rapidhash-fast", distr.clone(), map_size, c);
     profile_lookup_hit::("fxhash", distr.clone(), map_size, c);
     profile_lookup_hit::("ahash", distr.clone(), map_size, c);
     profile_lookup_hit::("siphash", distr.clone(), map_size, c);
 
     profile_set_build::("foldhash-fast", distr.clone(), map_size, c);
     profile_set_build::("foldhash-quality", distr.clone(), map_size, c);
+    profile_set_build::("rapidhash-fast", distr.clone(), map_size, c);
     profile_set_build::("fxhash", distr.clone(), map_size, c);
     profile_set_build::("ahash", distr.clone(), map_size, c);
     profile_set_build::("siphash", distr.clone(), map_size, c);
diff --git a/src/fast.rs b/src/fast.rs
index a6f0f1e..8c98e1e 100644
--- a/src/fast.rs
+++ b/src/fast.rs
@@ -3,7 +3,7 @@
 use core::hash::{BuildHasher, Hasher};
 
 use crate::seed::{gen_per_hasher_seed, GlobalSeed, SharedSeed};
-use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rotate_right, ARBITRARY3};
+use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rapidhash_core_16_288, read_u32, read_u64, rotate_right, ARBITRARY3};
 
 /// A [`Hasher`] instance implementing foldhash, optimized for speed.
 ///
@@ -12,28 +12,22 @@ use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rotate_right, A
 /// [`FixedState`] to create [`FoldHasher`]s.
 #[derive(Clone)]
 pub struct FoldHasher {
-    accumulator: u64,
+    pub(crate) accumulator: u64,
+    pub(crate) seeds: &'static [u64; 4],
     sponge: u128,
     sponge_len: u8,
-    fold_seed: u64,
-    expand_seed: u64,
-    expand_seed2: u64,
-    expand_seed3: u64,
 }
 
 impl FoldHasher {
     /// Initializes this [`FoldHasher`] with the given per-hasher seed and
     /// [`SharedSeed`].
     #[inline]
-    pub fn with_seed(per_hasher_seed: u64, shared_seed: &SharedSeed) -> FoldHasher {
+    pub fn with_seed(per_hasher_seed: u64, shared_seed: &'static SharedSeed) -> FoldHasher {
         FoldHasher {
             accumulator: per_hasher_seed,
+            seeds: &shared_seed.seeds,
             sponge: 0,
             sponge_len: 0,
-            fold_seed: shared_seed.seeds[0],
-            expand_seed: shared_seed.seeds[1],
-            expand_seed2: shared_seed.seeds[2],
-            expand_seed3: shared_seed.seeds[3],
         }
     }
 
@@ -43,7 +37,7 @@ impl FoldHasher {
         if self.sponge_len as usize + bits > 128 {
             let lo = self.sponge as u64;
             let hi = (self.sponge >> 64) as u64;
-            self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed);
+            self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.seeds[0]);
             self.sponge = x.into();
             self.sponge_len = bits as u8;
         } else {
@@ -56,47 +50,44 @@ impl Hasher for FoldHasher {
 
     #[inline(always)]
     fn write(&mut self, bytes: &[u8]) {
-        // We perform overlapping reads in the byte hash which could lead to
-        // trivial length-extension attacks. These should be defeated by
-        // adding a length-dependent rotation on our unpredictable seed
-        // which costs only a single cycle (or none if executed with
-        // instruction-level parallelism).
-        let len = bytes.len();
-        let base_seed = rotate_right(self.accumulator, len as u32);
-        if len <= 16 {
-            let mut s0 = base_seed;
-            let mut s1 = self.expand_seed;
+        let accumulator = self.accumulator;
+        let seeds = self.seeds;
+
+        // Moving the write to self.accumulator outside of this if block improves
+        // performance; I'm surprised the compiler can't do this automatically.
+        self.accumulator = if bytes.len() <= 16 {
+            let mut s0 = 0;
+            let mut s1 = 0;
+
             // XOR the input into s0, s1, then multiply and fold.
-            if len >= 8 {
-                s0 ^= u64::from_ne_bytes(bytes[0..8].try_into().unwrap());
-                s1 ^= u64::from_ne_bytes(bytes[len - 8..].try_into().unwrap());
-            } else if len >= 4 {
-                s0 ^= u32::from_ne_bytes(bytes[0..4].try_into().unwrap()) as u64;
-                s1 ^= u32::from_ne_bytes(bytes[len - 4..].try_into().unwrap()) as u64;
-            } else if len > 0 {
+            if bytes.len() >= 8 {
+                s0 = read_u64(bytes, 0);
+                s1 = read_u64(bytes, bytes.len() - 8);
+            } else if bytes.len() >= 4 {
+                s0 = read_u32(bytes, 0) as u64;
+                s1 = read_u32(bytes, bytes.len() - 4) as u64;
+            } else if bytes.len() > 0 {
                 let lo = bytes[0];
-                let mid = bytes[len / 2];
-                let hi = bytes[len - 1];
-                s0 ^= lo as u64;
-                s1 ^= ((hi as u64) << 8) | mid as u64;
+                let mid = bytes[bytes.len() / 2];
+                let hi = bytes[bytes.len() - 1];
+                s0 = hi as u64;
+                s1 = ((lo as u64) << 45) | mid as u64;
             }
-            self.accumulator = folded_multiply(s0, s1);
-        } else if len < 256 {
-            self.accumulator = hash_bytes_medium(
-                bytes,
-                base_seed,
-                base_seed.wrapping_add(self.expand_seed),
-                self.fold_seed,
-            );
+
+            // I prefer to wrapping-add the length here, as not all platforms have a rotate
+            // instruction, and although it has a smaller impact on the output hash,
+            // rapidhash's output quality and collision studies suggest this or an XOR is
+            // sufficient. Moving this to the bottom of the function appears to improve
+            // performance.
+            s0 ^= seeds[0];
+            s1 ^= rotate_right(accumulator, bytes.len() as u32);
+
+            folded_multiply(s0, s1)
+        } else if bytes.len() <= 288 {
+            // Ideally we would minimise the number of arguments, but self.accumulator and
+            // self.seeds can already be loaded into registers in this function, so passing
+            // them directly appears faster.
+            rapidhash_core_16_288(accumulator, seeds, bytes)
         } else {
-            self.accumulator = hash_bytes_long(
-                bytes,
-                base_seed,
-                base_seed.wrapping_add(self.expand_seed),
-                base_seed.wrapping_add(self.expand_seed2),
-                base_seed.wrapping_add(self.expand_seed3),
-                self.fold_seed,
-            );
+            hash_bytes_long(accumulator, seeds, bytes)
         }
     }
 
@@ -124,7 +115,7 @@ impl Hasher for FoldHasher {
     fn write_u128(&mut self, i: u128) {
         let lo = i as u64;
         let hi = (i >> 64) as u64;
-        self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed);
+        self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.seeds[0]);
     }
 
     #[inline(always)]
@@ -141,7 +132,7 @@ impl Hasher for FoldHasher {
         if self.sponge_len > 0 {
             let lo = self.sponge as u64;
             let hi = (self.sponge >> 64) as u64;
-            folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed)
+            folded_multiply(lo ^ self.accumulator, hi ^ self.seeds[0])
         } else {
             self.accumulator
         }
diff --git a/src/lib.rs b/src/lib.rs
index ab04556..d636b85 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -114,6 +114,7 @@ pub use seed::SharedSeed;
 mod convenience;
 #[cfg(feature = "std")]
 pub use convenience::*;
+use crate::fast::FoldHasher;
 
 // Arbitrary constants with high entropy. Hexadecimal digits of pi were used.
 const ARBITRARY0: u64 = 0x243f6a8885a308d3;
@@ -220,6 +221,46 @@ const fn rotate_right(x: u64, r: u32) -> u64 {
     }
 }
 
+/// A helper method for doing an unaligned 32-bit read from a byte slice.
+#[inline(always)]
+fn read_u32(slice: &[u8], offset: usize) -> u32 {
+    // Uncomment the following to explicitly omit bounds checks for debugging:
+    // debug_assert!(offset as isize >= 0);
+    // debug_assert!(slice.len() >= 4 + offset);
+    // unsafe { core::ptr::read_unaligned(slice.as_ptr().offset(offset as isize) as *const u32) }
+
+    // Equivalent to slice[offset..offset+4].try_into().unwrap(), but const-friendly.
+    let maybe_buf = slice.split_at(offset).1.first_chunk::<4>();
+    let buf = match maybe_buf {
+        Some(buf) => *buf,
+        None => panic!("read_u32: slice too short"),
+    };
+    u32::from_ne_bytes(buf)
+}
+
+/// A helper method for doing an unaligned 64-bit read from a byte slice.
+///
+/// This function is specifically implemented this way to allow the compiler
+/// to optimise away the bounds checks. The traditional approach of using
+/// `u64::from_ne_bytes(slice[offset..offset + 8].try_into().unwrap())` does
+/// not allow the compiler to fully optimise out the bounds checks for
+/// unknown reasons.
+#[inline(always)]
+fn read_u64(slice: &[u8], offset: usize) -> u64 {
+    // Uncomment the following to explicitly omit bounds checks for debugging:
+    // debug_assert!(offset as isize >= 0);
+    // debug_assert!(slice.len() >= 8 + offset);
+    // unsafe { core::ptr::read_unaligned(slice.as_ptr().offset(offset as isize) as *const u64) }
+
+    // Equivalent to slice[offset..offset+8].try_into().unwrap(), but const-friendly.
+    let maybe_buf = slice.split_at(offset).1.first_chunk::<8>();
+    let buf = match maybe_buf {
+        Some(buf) => *buf,
+        None => panic!("read_u64: slice too short"),
+    };
+    u64::from_ne_bytes(buf)
+}
+
 /// Hashes strings >= 16 bytes, has unspecified behavior when bytes.len() < 16.
 fn hash_bytes_medium(bytes: &[u8], mut s0: u64, mut s1: u64, fold_seed: u64) -> u64 {
     // Process 32 bytes per iteration, 16 bytes from the start, 16 bytes from
@@ -246,17 +287,57 @@ fn hash_bytes_medium(bytes: &[u8], mut s0: u64, mut s1: u64, fold_seed: u64) ->
     s0 ^ s1
 }
 
+#[inline(never)]
+fn rapidhash_core_16_288(accumulator: u64, seeds: &[u64; 4], data: &[u8]) -> u64 {
+    let mut seed = accumulator;
+    let mut slice = data;
+
+    if slice.len() > 48 {
+        let mut see1 = seed;
+        let mut see2 = seed;
+
+        while slice.len() >= 48 {
+            seed = folded_multiply(read_u64(slice, 0) ^ seeds[1], read_u64(slice, 8) ^ seed);
+            see1 = folded_multiply(read_u64(slice, 16) ^ seeds[2], read_u64(slice, 24) ^ see1);
+            see2 = folded_multiply(read_u64(slice, 32) ^ seeds[3], read_u64(slice, 40) ^ see2);
+            let (_, split) = slice.split_at(48);
+            slice = split;
+        }
+
+        seed ^= see1 ^ see2;
+    }
+
+    if slice.len() > 16 {
+        seed = folded_multiply(read_u64(slice, 0) ^ seeds[1], read_u64(slice, 8) ^ seed);
+        if slice.len() > 32 {
+            seed = folded_multiply(read_u64(slice, 16) ^ seeds[2], read_u64(slice, 24) ^ seed);
+        }
+    }
+
+    let mut a = read_u64(data, data.len() - 16);
+    let mut b = read_u64(data, data.len() - 8);
+
+    seed = rotate_right(seed, data.len() as u32);
+    a ^= seeds[2];
+    b ^= seed;
+    folded_multiply(a, b)
+}
+
 /// Hashes strings >= 16 bytes, has unspecified behavior when bytes.len() < 16.
 #[cold]
 #[inline(never)]
 fn hash_bytes_long(
+    accumulator: u64,
+    seeds: &[u64; 4],
     bytes: &[u8],
-    mut s0: u64,
-    mut s1: u64,
-    mut s2: u64,
-    mut s3: u64,
-    fold_seed: u64,
 ) -> u64 {
+    let base_seed = rotate_right(accumulator, bytes.len() as u32);
+    let fold_seed = seeds[0];
+    let mut s0 = base_seed;
+    let mut s1 = base_seed.wrapping_add(seeds[1]);
+    let mut s2 = base_seed.wrapping_add(seeds[2]);
+    let mut s3 = base_seed.wrapping_add(seeds[3]);
+
     let chunks = bytes.chunks_exact(64);
     let remainder = chunks.remainder().len();
     for chunk in chunks {
diff --git a/src/quality.rs b/src/quality.rs
index 939b60e..ce1dd5a 100644
--- a/src/quality.rs
+++ b/src/quality.rs
@@ -20,7 +20,7 @@ impl FoldHasher {
     /// Initializes this [`FoldHasher`] with the given per-hasher seed and
     /// [`SharedSeed`].
     #[inline(always)]
-    pub fn with_seed(per_hasher_seed: u64, shared_seed: &SharedSeed) -> FoldHasher {
+    pub fn with_seed(per_hasher_seed: u64, shared_seed: &'static SharedSeed) -> FoldHasher {
         FoldHasher {
             inner: fast::FoldHasher::with_seed(per_hasher_seed, shared_seed),
         }
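
A quick usage sketch, not part of the patch, for the changed constructor signature: `with_seed` now takes a `&'static SharedSeed`, so callers need a seed with static lifetime. This assumes `SharedSeed::global_fixed()` is available to produce one; if your foldhash version exposes a different `&'static SharedSeed` source, substitute it.

use core::hash::Hasher;
use foldhash::{fast::FoldHasher, SharedSeed};

fn main() {
    // Assumption: SharedSeed::global_fixed() returns a &'static SharedSeed,
    // which satisfies the new `&'static` bound on with_seed.
    let shared: &'static SharedSeed = SharedSeed::global_fixed();

    // The per-hasher seed is arbitrary here; real callers derive one per hasher.
    let mut hasher = FoldHasher::with_seed(0x0123_4567_89ab_cdef, shared);
    hasher.write(b"an example input that is longer than sixteen bytes");
    println!("hash = {:016x}", hasher.finish());
}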
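
The `read_u64` helper avoids the usual `try_into().unwrap()` slice read so the compiler can elide bounds checks while remaining const-friendly. Below is a small self-contained sketch, with illustrative names not taken from the patch, that exercises the same `split_at` plus `first_chunk` pattern against the naive formulation over every valid offset, as a sanity check that the two reads agree.

/// Naive formulation: panics via the slice index if out of range.
fn read_u64_naive(slice: &[u8], offset: usize) -> u64 {
    u64::from_ne_bytes(slice[offset..offset + 8].try_into().unwrap())
}

/// The split_at + first_chunk pattern used by the patch.
fn read_u64(slice: &[u8], offset: usize) -> u64 {
    let buf = match slice.split_at(offset).1.first_chunk::<8>() {
        Some(buf) => *buf,
        None => panic!("read_u64: slice too short"),
    };
    u64::from_ne_bytes(buf)
}

fn main() {
    let data: Vec<u8> = (0u8..64).collect();
    // Compare both reads at every offset that leaves at least 8 bytes.
    for offset in 0..=data.len() - 8 {
        assert_eq!(read_u64(&data, offset), read_u64_naive(&data, offset));
    }
    println!("read_u64 matches the naive read for all offsets");
}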