4 changes: 4 additions & 0 deletions Cargo.toml
@@ -23,6 +23,7 @@ criterion = "0.5"
hashbrown = "0.14"
uuid = "1.8"
rand = "0.8"
rapidhash = "3.0"
ahash = "0.8"
fxhash = "0.2"
chrono = "0.4"
@@ -42,3 +43,6 @@ harness = false

[profile.release]
lto = "thin"
codegen-units = 1
incremental = false
debug-assertions = false
4 changes: 4 additions & 0 deletions benches/bench.rs
@@ -147,24 +147,28 @@ fn profile_distr<D: Distribution>(distr: D, map_size: usize, c: &mut Criterion)

profile_hashonly::<foldhash::fast::RandomState, _>("foldhash-fast", distr.clone(), c);
profile_hashonly::<foldhash::quality::RandomState, _>("foldhash-quality", distr.clone(), c);
profile_hashonly::<rapidhash::fast::RandomState, _>("rapidhash-fast", distr.clone(), c);
profile_hashonly::<fxhash::FxBuildHasher, _>("fxhash", distr.clone(), c);
profile_hashonly::<ahash::RandomState, _>("ahash", distr.clone(), c);
profile_hashonly::<std::hash::RandomState, _>("siphash", distr.clone(), c);

profile_lookup_miss::<foldhash::fast::RandomState, _>("foldhash-fast", distr.clone(), map_size, c);
profile_lookup_miss::<foldhash::quality::RandomState, _>("foldhash-quality", distr.clone(), map_size, c);
profile_lookup_miss::<rapidhash::fast::RandomState, _>("rapidhash-fast", distr.clone(), map_size, c);
profile_lookup_miss::<fxhash::FxBuildHasher, _>("fxhash", distr.clone(), map_size, c);
profile_lookup_miss::<ahash::RandomState, _>("ahash", distr.clone(), map_size, c);
profile_lookup_miss::<std::hash::RandomState, _>("siphash", distr.clone(), map_size, c);

profile_lookup_hit::<foldhash::fast::RandomState, _>("foldhash-fast", distr.clone(), map_size, c);
profile_lookup_hit::<foldhash::quality::RandomState, _>("foldhash-quality", distr.clone(), map_size, c);
profile_lookup_hit::<rapidhash::fast::RandomState, _>("rapidhash-fast", distr.clone(), map_size, c);
profile_lookup_hit::<fxhash::FxBuildHasher, _>("fxhash", distr.clone(), map_size, c);
profile_lookup_hit::<ahash::RandomState, _>("ahash", distr.clone(), map_size, c);
profile_lookup_hit::<std::hash::RandomState, _>("siphash", distr.clone(), map_size, c);

profile_set_build::<foldhash::fast::RandomState, _>("foldhash-fast", distr.clone(), map_size, c);
profile_set_build::<foldhash::quality::RandomState, _>("foldhash-quality", distr.clone(), map_size, c);
profile_set_build::<rapidhash::fast::RandomState, _>("rapidhash-fast", distr.clone(), map_size, c);
profile_set_build::<fxhash::FxBuildHasher, _>("fxhash", distr.clone(), map_size, c);
profile_set_build::<ahash::RandomState, _>("ahash", distr.clone(), map_size, c);
profile_set_build::<std::hash::RandomState, _>("siphash", distr.clone(), map_size, c);
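
Each profile_* call above ultimately times the same core operation: build a hasher from the BuildHasher, feed it a value, and finish. A minimal sketch of that operation for reference (hash_one here is a hypothetical standalone helper; std's BuildHasher::hash_one, stable since Rust 1.71, does the same thing):

use std::hash::{BuildHasher, Hash, Hasher};

/// Hash one value with any BuildHasher -- the operation the benchmarks
/// above repeat in a hot loop for each hasher under test.
fn hash_one<S: BuildHasher, T: Hash>(build_hasher: &S, value: &T) -> u64 {
    let mut hasher = build_hasher.build_hasher();
    value.hash(&mut hasher);
    hasher.finish()
}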
93 changes: 42 additions & 51 deletions src/fast.rs
@@ -3,7 +3,7 @@
use core::hash::{BuildHasher, Hasher};

use crate::seed::{gen_per_hasher_seed, GlobalSeed, SharedSeed};
use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rotate_right, ARBITRARY3};
use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rapidhash_core_16_288, read_u32, read_u64, rotate_right, ARBITRARY3};

/// A [`Hasher`] instance implementing foldhash, optimized for speed.
///
@@ -12,28 +12,22 @@ use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rotate_right, ARBITRARY3};
/// [`FixedState`] to create [`FoldHasher`]s.
#[derive(Clone)]
pub struct FoldHasher {
accumulator: u64,
pub(crate) accumulator: u64,
pub(crate) seeds: &'static [u64; 4],
sponge: u128,
sponge_len: u8,
fold_seed: u64,
expand_seed: u64,
expand_seed2: u64,
expand_seed3: u64,
}

impl FoldHasher {
/// Initializes this [`FoldHasher`] with the given per-hasher seed and
/// [`SharedSeed`].
#[inline]
pub fn with_seed(per_hasher_seed: u64, shared_seed: &SharedSeed) -> FoldHasher {
pub fn with_seed(per_hasher_seed: u64, shared_seed: &'static SharedSeed) -> FoldHasher {
FoldHasher {
accumulator: per_hasher_seed,
seeds: &shared_seed.seeds,
sponge: 0,
sponge_len: 0,
fold_seed: shared_seed.seeds[0],
expand_seed: shared_seed.seeds[1],
expand_seed2: shared_seed.seeds[2],
expand_seed3: shared_seed.seeds[3],
}
}

@@ -43,7 +37,7 @@ impl FoldHasher {
if self.sponge_len as usize + bits > 128 {
let lo = self.sponge as u64;
let hi = (self.sponge >> 64) as u64;
self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed);
self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.seeds[0]);
self.sponge = x.into();
self.sponge_len = bits as u8;
} else {
@@ -56,47 +50,44 @@ impl Hasher for FoldHasher {
impl Hasher for FoldHasher {
#[inline(always)]
fn write(&mut self, bytes: &[u8]) {
// We perform overlapping reads in the byte hash which could lead to
// trivial length-extension attacks. These should be defeated by
// adding a length-dependent rotation on our unpredictable seed
// which costs only a single cycle (or none if executed with
// instruction-level parallelism).
let len = bytes.len();
let base_seed = rotate_right(self.accumulator, len as u32);
if len <= 16 {
let mut s0 = base_seed;
let mut s1 = self.expand_seed;
let accumulator = self.accumulator;
let seeds = self.seeds;

// Moving the store to self.accumulator out of the branches below improves
// performance; surprisingly, the compiler doesn't do this automatically.
self.accumulator = if bytes.len() <= 16 {
let mut s0 = 0;
let mut s1 = 0;

// XOR the input into s0, s1, then multiply and fold.
if len >= 8 {
s0 ^= u64::from_ne_bytes(bytes[0..8].try_into().unwrap());
s1 ^= u64::from_ne_bytes(bytes[len - 8..].try_into().unwrap());
} else if len >= 4 {
s0 ^= u32::from_ne_bytes(bytes[0..4].try_into().unwrap()) as u64;
s1 ^= u32::from_ne_bytes(bytes[len - 4..].try_into().unwrap()) as u64;
} else if len > 0 {
if bytes.len() >= 8 {
s0 = read_u64(bytes, 0);
s1 = read_u64(bytes, bytes.len() - 8);
} else if bytes.len() >= 4 {
s0 = read_u32(bytes, 0) as u64;
s1 = read_u32(bytes, bytes.len() - 4) as u64;
} else if bytes.len() > 0 {
let lo = bytes[0];
let mid = bytes[len / 2];
let hi = bytes[len - 1];
s0 ^= lo as u64;
s1 ^= ((hi as u64) << 8) | mid as u64;
let mid = bytes[bytes.len() / 2];
let hi = bytes[bytes.len() - 1];
s0 = hi as u64;
s1 = ((lo as u64) << 45) | mid as u64;
}
self.accumulator = folded_multiply(s0, s1);
} else if len < 256 {
self.accumulator = hash_bytes_medium(
bytes,
base_seed,
base_seed.wrapping_add(self.expand_seed),
self.fold_seed,
);

// I'd prefer to wrapping-add the length here, since not all platforms have a
// rotate instruction; although it mixes the length in less thoroughly,
// rapidhash's output-quality and collision studies suggested that a wrapping
// add or an XOR is sufficient. Moving this step to the bottom of the function
// appears to improve performance.
s0 ^= seeds[0];
s1 ^= rotate_right(accumulator, bytes.len() as u32);

folded_multiply(s0, s1)
} else if bytes.len() <= 288 {
// We could minimise the number of arguments here, but self.accumulator and
// self.seeds are already loaded into registers in this function, so passing
// them directly appears faster.
rapidhash_core_16_288(accumulator, seeds, bytes)
} else {
self.accumulator = hash_bytes_long(
bytes,
base_seed,
base_seed.wrapping_add(self.expand_seed),
base_seed.wrapping_add(self.expand_seed2),
base_seed.wrapping_add(self.expand_seed3),
self.fold_seed,
);
hash_bytes_long(accumulator, seeds, bytes)
}
}
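
// For reference: folded_multiply, used on every path above, is foldhash's core
// mixing primitive -- a full 64x64 -> 128-bit multiply whose two halves are
// XORed together. A sketch equivalent to the crate's implementation on targets
// with native 128-bit multiplication (the real function lives in lib.rs):
fn folded_multiply_sketch(x: u64, y: u64) -> u64 {
    let full = (x as u128) * (y as u128);
    (full as u64) ^ ((full >> 64) as u64)
}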

@@ -124,7 +115,7 @@ impl Hasher for FoldHasher {
fn write_u128(&mut self, i: u128) {
let lo = i as u64;
let hi = (i >> 64) as u64;
self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed);
self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.seeds[0]);
}

#[inline(always)]
@@ -141,7 +132,7 @@ impl Hasher for FoldHasher {
if self.sponge_len > 0 {
let lo = self.sponge as u64;
let hi = (self.sponge >> 64) as u64;
folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed)
folded_multiply(lo ^ self.accumulator, hi ^ self.seeds[0])
} else {
self.accumulator
}
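
// A hypothetical downstream usage sketch (not part of the diff): hash_one
// builds a FoldHasher via the BuildHasher machinery, feeds the bytes through
// write(), and returns finish(), exercising the paths changed in this file.
// Assumes foldhash::fast::RandomState implements Default, as the benchmarks
// rely on.
fn usage_sketch() -> u64 {
    use std::hash::BuildHasher;
    let state = foldhash::fast::RandomState::default();
    state.hash_one("hello world")
}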
91 changes: 86 additions & 5 deletions src/lib.rs
@@ -114,6 +114,7 @@ pub use seed::SharedSeed;
mod convenience;
#[cfg(feature = "std")]
pub use convenience::*;
use crate::fast::FoldHasher;

// Arbitrary constants with high entropy. Hexadecimal digits of pi were used.
const ARBITRARY0: u64 = 0x243f6a8885a308d3;
@@ -220,6 +221,46 @@ const fn rotate_right(x: u64, r: u32) -> u64 {
}
}

/// A helper method for doing an unaligned 32-bit read from a byte slice.
#[inline(always)]
fn read_u32(slice: &[u8], offset: usize) -> u32 {
// Uncomment the following to skip bounds checks entirely via an unchecked
// read (the debug_asserts still catch bad offsets in debug builds):
// debug_assert!(offset as isize >= 0);
// debug_assert!(slice.len() >= 4 + offset);
// unsafe { core::ptr::read_unaligned(slice.as_ptr().offset(offset as isize) as *const u32) }

// Equivalent to slice[offset..offset+4].try_into().unwrap(), but const-friendly
let maybe_buf = slice.split_at(offset).1.first_chunk::<4>();
let buf = match maybe_buf {
Some(buf) => *buf,
None => panic!("read_u32: slice too short"),
};
u32::from_ne_bytes(buf)
}

/// A helper method for doing an unaligned 64-bit read from a byte slice.
///
/// This function is specifically implemented this way to allow the compiler
/// to optimise away the bounds checks. The traditional approach of using
/// `u64::from_ne_bytes(slice[offset..offset + 8].try_into().unwrap())` does
/// not allow the compiler to fully optimise out the bounds checks for
/// unknown reasons.
#[inline(always)]
fn read_u64(slice: &[u8], offset: usize) -> u64 {
// Uncomment the following to skip bounds checks entirely via an unchecked
// read (the debug_asserts still catch bad offsets in debug builds):
// debug_assert!(offset as isize >= 0);
// debug_assert!(slice.len() >= 8 + offset);
// unsafe { core::ptr::read_unaligned(slice.as_ptr().offset(offset as isize) as *const u64) }

// Equivalent to slice[offset..offset+8].try_into().unwrap(), but const-friendly
let maybe_buf = slice.split_at(offset).1.first_chunk::<8>();
let buf = match maybe_buf {
Some(buf) => *buf,
None => panic!("read_u64: slice too short"),
};
u64::from_ne_bytes(buf)
}
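
// A hypothetical sanity check (not part of the diff) that both read helpers
// agree with the straightforward slice conversions they replace:
#[cfg(test)]
mod read_helper_tests {
    use super::{read_u32, read_u64};

    #[test]
    fn unaligned_reads_match_from_ne_bytes() {
        let buf: Vec<u8> = (0u8..32).collect();
        for offset in 0..=buf.len() - 8 {
            let expected = u64::from_ne_bytes(buf[offset..offset + 8].try_into().unwrap());
            assert_eq!(read_u64(&buf, offset), expected);
        }
        for offset in 0..=buf.len() - 4 {
            let expected = u32::from_ne_bytes(buf[offset..offset + 4].try_into().unwrap());
            assert_eq!(read_u32(&buf, offset), expected);
        }
    }
}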

/// Hashes strings >= 16 bytes; has unspecified behavior when bytes.len() < 16.
fn hash_bytes_medium(bytes: &[u8], mut s0: u64, mut s1: u64, fold_seed: u64) -> u64 {
// Process 32 bytes per iteration, 16 bytes from the start, 16 bytes from
@@ -246,17 +287,57 @@ fn hash_bytes_medium(bytes: &[u8], mut s0: u64, mut s1: u64, fold_seed: u64) -> u64 {
s0 ^ s1
}

#[inline(never)]
fn rapidhash_core_16_288(accumulator: u64, seeds: &[u64; 4], data: &[u8]) -> u64 {
let mut seed = accumulator;
let mut slice = data;

if slice.len() > 48 {
let mut see1 = seed;
let mut see2 = seed;

while slice.len() >= 48 {
seed = folded_multiply(read_u64(slice, 0) ^ seeds[1], read_u64(slice, 8) ^ seed);
see1 = folded_multiply(read_u64(slice, 16) ^ seeds[2], read_u64(slice, 24) ^ see1);
see2 = folded_multiply(read_u64(slice, 32) ^ seeds[3], read_u64(slice, 40) ^ see2);
let (_, split) = slice.split_at(48);
slice = split;
}

seed ^= see1 ^ see2;
}

if slice.len() > 16 {
seed = folded_multiply(read_u64(slice, 0) ^ seeds[1], read_u64(slice, 8) ^ seed);
if slice.len() > 32 {
seed = folded_multiply(read_u64(slice, 16) ^ seeds[2], read_u64(slice, 24) ^ seed);
}
}

let mut a = read_u64(data, data.len() - 16);
let mut b = read_u64(data, data.len() - 8);

seed = rotate_right(seed, data.len() as u32);
a ^= seeds[2];
b ^= seed;
folded_multiply(a, b)
}
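
// A hypothetical exhaustive-length check (not part of the diff): the length
// guards above keep every read_u64 offset in bounds for the whole supported
// range, which this test would confirm by panicking inside read_u64 otherwise.
#[cfg(test)]
mod rapidhash_core_tests {
    use super::rapidhash_core_16_288;

    #[test]
    fn all_supported_lengths_stay_in_bounds() {
        let data = [0xabu8; 288];
        for len in 16..=288 {
            let _ = rapidhash_core_16_288(1, &[2, 3, 5, 7], &data[..len]);
        }
    }
}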

/// Hashes strings >= 16 bytes; has unspecified behavior when bytes.len() < 16.
#[cold]
#[inline(never)]
fn hash_bytes_long(
accumulator: u64,
seeds: &[u64; 4],
bytes: &[u8],
mut s0: u64,
mut s1: u64,
mut s2: u64,
mut s3: u64,
fold_seed: u64,
) -> u64 {
let base_seed = rotate_right(accumulator, bytes.len() as u32);
let fold_seed = seeds[0];
let mut s0 = base_seed;
let mut s1 = base_seed.wrapping_add(seeds[1]);
let mut s2 = base_seed.wrapping_add(seeds[2]);
let mut s3 = base_seed.wrapping_add(seeds[3]);

let chunks = bytes.chunks_exact(64);
let remainder = chunks.remainder().len();
for chunk in chunks {
2 changes: 1 addition & 1 deletion src/quality.rs
@@ -20,7 +20,7 @@ impl FoldHasher {
/// Initializes this [`FoldHasher`] with the given per-hasher seed and
/// [`SharedSeed`].
#[inline(always)]
pub fn with_seed(per_hasher_seed: u64, shared_seed: &SharedSeed) -> FoldHasher {
pub fn with_seed(per_hasher_seed: u64, shared_seed: &'static SharedSeed) -> FoldHasher {
FoldHasher {
inner: fast::FoldHasher::with_seed(per_hasher_seed, shared_seed),
}
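// Note: the new `'static` bound is required because fast::FoldHasher now
// stores `seeds: &'static [u64; 4]` borrowed from the SharedSeed, rather
// than copying the four seed words into each hasher.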