diff --git a/Cargo.toml b/Cargo.toml
index b6c229d..aa1637a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,6 +23,7 @@ criterion = "0.5"
 hashbrown = "0.14"
 uuid = "1.8"
 rand = "0.8"
+rapidhash = "3.0"
 ahash = "0.8"
 fxhash = "0.2"
 chrono = "0.4"
@@ -42,3 +43,6 @@ harness = false
 
 [profile.release]
 lto = "thin"
+codegen-units = 1
+incremental = false
+debug-assertions = false
\ No newline at end of file
diff --git a/benches/bench.rs b/benches/bench.rs
index 81cc2e1..6a3a1a4 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -147,24 +147,28 @@ fn profile_distr(distr: D, map_size: usize, c: &mut Criterion)
     profile_hashonly::("foldhash-fast", distr.clone(), c);
     profile_hashonly::("foldhash-quality", distr.clone(), c);
+    profile_hashonly::("rapidhash-fast", distr.clone(), c);
     profile_hashonly::("fxhash", distr.clone(), c);
     profile_hashonly::("ahash", distr.clone(), c);
     profile_hashonly::("siphash", distr.clone(), c);
 
     profile_lookup_miss::("foldhash-fast", distr.clone(), map_size, c);
     profile_lookup_miss::("foldhash-quality", distr.clone(), map_size, c);
+    profile_lookup_miss::("rapidhash-fast", distr.clone(), map_size, c);
     profile_lookup_miss::("fxhash", distr.clone(), map_size, c);
     profile_lookup_miss::("ahash", distr.clone(), map_size, c);
     profile_lookup_miss::("siphash", distr.clone(), map_size, c);
 
     profile_lookup_hit::("foldhash-fast", distr.clone(), map_size, c);
     profile_lookup_hit::("foldhash-quality", distr.clone(), map_size, c);
+    profile_lookup_hit::("rapidhash-fast", distr.clone(), map_size, c);
     profile_lookup_hit::("fxhash", distr.clone(), map_size, c);
     profile_lookup_hit::("ahash", distr.clone(), map_size, c);
     profile_lookup_hit::("siphash", distr.clone(), map_size, c);
 
     profile_set_build::("foldhash-fast", distr.clone(), map_size, c);
     profile_set_build::("foldhash-quality", distr.clone(), map_size, c);
+    profile_set_build::("rapidhash-fast", distr.clone(), map_size, c);
     profile_set_build::("fxhash", distr.clone(), map_size, c);
     profile_set_build::("ahash", distr.clone(), map_size, c);
     profile_set_build::("siphash", distr.clone(), map_size, c);
diff --git a/src/fast.rs b/src/fast.rs
index a6f0f1e..8c98e1e 100644
--- a/src/fast.rs
+++ b/src/fast.rs
@@ -3,7 +3,7 @@
 use core::hash::{BuildHasher, Hasher};
 
 use crate::seed::{gen_per_hasher_seed, GlobalSeed, SharedSeed};
-use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rotate_right, ARBITRARY3};
+use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rapidhash_core_16_288, read_u32, read_u64, rotate_right, ARBITRARY3};
 
 /// A [`Hasher`] instance implementing foldhash, optimized for speed.
 ///
@@ -12,28 +12,22 @@ use crate::{folded_multiply, hash_bytes_long, hash_bytes_medium, rotate_right, A
 /// [`FixedState`] to create [`FoldHasher`]s.
 #[derive(Clone)]
 pub struct FoldHasher {
-    accumulator: u64,
+    pub(crate) accumulator: u64,
+    pub(crate) seeds: &'static [u64; 4],
     sponge: u128,
     sponge_len: u8,
-    fold_seed: u64,
-    expand_seed: u64,
-    expand_seed2: u64,
-    expand_seed3: u64,
 }
 
 impl FoldHasher {
     /// Initializes this [`FoldHasher`] with the given per-hasher seed and
     /// [`SharedSeed`].
     #[inline]
-    pub fn with_seed(per_hasher_seed: u64, shared_seed: &SharedSeed) -> FoldHasher {
+    pub fn with_seed(per_hasher_seed: u64, shared_seed: &'static SharedSeed) -> FoldHasher {
         FoldHasher {
             accumulator: per_hasher_seed,
+            seeds: &shared_seed.seeds,
             sponge: 0,
             sponge_len: 0,
-            fold_seed: shared_seed.seeds[0],
-            expand_seed: shared_seed.seeds[1],
-            expand_seed2: shared_seed.seeds[2],
-            expand_seed3: shared_seed.seeds[3],
         }
     }
 
@@ -43,7 +37,7 @@ impl FoldHasher {
         if self.sponge_len as usize + bits > 128 {
             let lo = self.sponge as u64;
             let hi = (self.sponge >> 64) as u64;
-            self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed);
+            self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.seeds[0]);
             self.sponge = x.into();
             self.sponge_len = bits as u8;
         } else {
@@ -56,47 +50,44 @@ impl Hasher for FoldHasher {
 
     #[inline(always)]
     fn write(&mut self, bytes: &[u8]) {
-        // We perform overlapping reads in the byte hash which could lead to
-        // trivial length-extension attacks. These should be defeated by
-        // adding a length-dependent rotation on our unpredictable seed
-        // which costs only a single cycle (or none if executed with
-        // instruction-level parallelism).
-        let len = bytes.len();
-        let base_seed = rotate_right(self.accumulator, len as u32);
-        if len <= 16 {
-            let mut s0 = base_seed;
-            let mut s1 = self.expand_seed;
+        let accumulator = self.accumulator;
+        let seeds = self.seeds;
+
+        // Moving the write to self.accumulator outside of this if block improves
+        // performance; I'm surprised the compiler can't do this automatically.
+        self.accumulator = if bytes.len() <= 16 {
+            let mut s0 = 0;
+            let mut s1 = 0;
+
             // XOR the input into s0, s1, then multiply and fold.
-            if len >= 8 {
-                s0 ^= u64::from_ne_bytes(bytes[0..8].try_into().unwrap());
-                s1 ^= u64::from_ne_bytes(bytes[len - 8..].try_into().unwrap());
-            } else if len >= 4 {
-                s0 ^= u32::from_ne_bytes(bytes[0..4].try_into().unwrap()) as u64;
-                s1 ^= u32::from_ne_bytes(bytes[len - 4..].try_into().unwrap()) as u64;
-            } else if len > 0 {
+            if bytes.len() >= 8 {
+                s0 = read_u64(bytes, 0);
+                s1 = read_u64(bytes, bytes.len() - 8);
+            } else if bytes.len() >= 4 {
+                s0 = read_u32(bytes, 0) as u64;
+                s1 = read_u32(bytes, bytes.len() - 4) as u64;
+            } else if bytes.len() > 0 {
                 let lo = bytes[0];
-                let mid = bytes[len / 2];
-                let hi = bytes[len - 1];
-                s0 ^= lo as u64;
-                s1 ^= ((hi as u64) << 8) | mid as u64;
+                let mid = bytes[bytes.len() / 2];
+                let hi = bytes[bytes.len() - 1];
+                s0 = hi as u64;
+                s1 = ((lo as u64) << 45) | mid as u64;
             }
-            self.accumulator = folded_multiply(s0, s1);
-        } else if len < 256 {
-            self.accumulator = hash_bytes_medium(
-                bytes,
-                base_seed,
-                base_seed.wrapping_add(self.expand_seed),
-                self.fold_seed,
-            );
+
+            // I prefer to wrapping-add the length here, as not all platforms have a rotate
+            // instruction, and although it has a smaller impact on the output hash,
+            // rapidhash's output quality and collision studies suggest this or an XOR is
+            // sufficient. Moving this to the bottom of the function appears to improve
+            // performance.
+            s0 ^= seeds[0];
+            s1 ^= rotate_right(accumulator, bytes.len() as u32);
+
+            folded_multiply(s0, s1)
+        } else if bytes.len() <= 288 {
+            // Ideally we would minimise the number of arguments, but self.accumulator and
+            // self.seeds can already be loaded into registers in this function, so passing
+            // them directly appears faster.
+            rapidhash_core_16_288(accumulator, seeds, bytes)
         } else {
-            self.accumulator = hash_bytes_long(
-                bytes,
-                base_seed,
-                base_seed.wrapping_add(self.expand_seed),
-                base_seed.wrapping_add(self.expand_seed2),
-                base_seed.wrapping_add(self.expand_seed3),
-                self.fold_seed,
-            );
+            hash_bytes_long(accumulator, seeds, bytes)
         }
     }
 
@@ -124,7 +115,7 @@ impl Hasher for FoldHasher {
     fn write_u128(&mut self, i: u128) {
         let lo = i as u64;
         let hi = (i >> 64) as u64;
-        self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed);
+        self.accumulator = folded_multiply(lo ^ self.accumulator, hi ^ self.seeds[0]);
     }
 
     #[inline(always)]
@@ -141,7 +132,7 @@ impl Hasher for FoldHasher {
         if self.sponge_len > 0 {
             let lo = self.sponge as u64;
             let hi = (self.sponge >> 64) as u64;
-            folded_multiply(lo ^ self.accumulator, hi ^ self.fold_seed)
+            folded_multiply(lo ^ self.accumulator, hi ^ self.seeds[0])
         } else {
             self.accumulator
         }
diff --git a/src/lib.rs b/src/lib.rs
index ab04556..d636b85 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -114,6 +114,7 @@ pub use seed::SharedSeed;
 mod convenience;
 #[cfg(feature = "std")]
 pub use convenience::*;
+use crate::fast::FoldHasher;
 
 // Arbitrary constants with high entropy. Hexadecimal digits of pi were used.
 const ARBITRARY0: u64 = 0x243f6a8885a308d3;
@@ -220,6 +221,46 @@ const fn rotate_right(x: u64, r: u32) -> u64 {
     }
 }
 
+/// A helper method for doing an unaligned 32-bit read from a byte slice.
+#[inline(always)]
+fn read_u32(slice: &[u8], offset: usize) -> u32 {
+    // Uncomment the following to explicitly omit bounds checks for debugging:
+    // debug_assert!(offset as isize >= 0);
+    // debug_assert!(slice.len() >= 4 + offset);
+    // unsafe { core::ptr::read_unaligned(slice.as_ptr().offset(offset as isize) as *const u32) }
+
+    // Equivalent to slice[offset..offset+4].try_into().unwrap(), but const-friendly.
+    let maybe_buf = slice.split_at(offset).1.first_chunk::<4>();
+    let buf = match maybe_buf {
+        Some(buf) => *buf,
+        None => panic!("read_u32: slice too short"),
+    };
+    u32::from_ne_bytes(buf)
+}
+
+/// A helper method for doing an unaligned 64-bit read from a byte slice.
+///
+/// This function is specifically implemented this way to allow the compiler
+/// to optimise away the bounds checks. The traditional approach of using
+/// `u64::from_ne_bytes(slice[offset..offset + 8].try_into().unwrap())` does
+/// not allow the compiler to fully optimise out the bounds checks for
+/// unknown reasons.
+#[inline(always)]
+fn read_u64(slice: &[u8], offset: usize) -> u64 {
+    // Uncomment the following to explicitly omit bounds checks for debugging:
+    // debug_assert!(offset as isize >= 0);
+    // debug_assert!(slice.len() >= 8 + offset);
+    // unsafe { core::ptr::read_unaligned(slice.as_ptr().offset(offset as isize) as *const u64) }
+
+    // Equivalent to slice[offset..offset+8].try_into().unwrap(), but const-friendly.
+    let maybe_buf = slice.split_at(offset).1.first_chunk::<8>();
+    let buf = match maybe_buf {
+        Some(buf) => *buf,
+        None => panic!("read_u64: slice too short"),
+    };
+    u64::from_ne_bytes(buf)
+}
+
 /// Hashes strings >= 16 bytes, has unspecified behavior when bytes.len() < 16.
 fn hash_bytes_medium(bytes: &[u8], mut s0: u64, mut s1: u64, fold_seed: u64) -> u64 {
     // Process 32 bytes per iteration, 16 bytes from the start, 16 bytes from
@@ -246,17 +287,57 @@ fn hash_bytes_medium(bytes: &[u8], mut s0: u64, mut s1: u64, fold_seed: u64) ->
     s0 ^ s1
 }
 
+#[inline(never)]
+fn rapidhash_core_16_288(accumulator: u64, seeds: &[u64; 4], data: &[u8]) -> u64 {
+    let mut seed = accumulator;
+    let mut slice = data;
+
+    if slice.len() > 48 {
+        let mut see1 = seed;
+        let mut see2 = seed;
+
+        while slice.len() >= 48 {
+            seed = folded_multiply(read_u64(slice, 0) ^ seeds[1], read_u64(slice, 8) ^ seed);
+            see1 = folded_multiply(read_u64(slice, 16) ^ seeds[2], read_u64(slice, 24) ^ see1);
+            see2 = folded_multiply(read_u64(slice, 32) ^ seeds[3], read_u64(slice, 40) ^ see2);
+            let (_, split) = slice.split_at(48);
+            slice = split;
+        }
+
+        seed ^= see1 ^ see2;
+    }
+
+    if slice.len() > 16 {
+        seed = folded_multiply(read_u64(slice, 0) ^ seeds[1], read_u64(slice, 8) ^ seed);
+        if slice.len() > 32 {
+            seed = folded_multiply(read_u64(slice, 16) ^ seeds[2], read_u64(slice, 24) ^ seed);
+        }
+    }
+
+    let mut a = read_u64(data, data.len() - 16);
+    let mut b = read_u64(data, data.len() - 8);
+
+    seed = rotate_right(seed, data.len() as u32);
+    a ^= seeds[2];
+    b ^= seed;
+    folded_multiply(a, b)
+}
+
 /// Hashes strings >= 16 bytes, has unspecified behavior when bytes.len() < 16.
 #[cold]
 #[inline(never)]
 fn hash_bytes_long(
+    accumulator: u64,
+    seeds: &[u64; 4],
     bytes: &[u8],
-    mut s0: u64,
-    mut s1: u64,
-    mut s2: u64,
-    mut s3: u64,
-    fold_seed: u64,
 ) -> u64 {
+    let base_seed = rotate_right(accumulator, bytes.len() as u32);
+    let fold_seed = seeds[0];
+    let mut s0 = base_seed;
+    let mut s1 = base_seed.wrapping_add(seeds[1]);
+    let mut s2 = base_seed.wrapping_add(seeds[2]);
+    let mut s3 = base_seed.wrapping_add(seeds[3]);
+
     let chunks = bytes.chunks_exact(64);
     let remainder = chunks.remainder().len();
     for chunk in chunks {
diff --git a/src/quality.rs b/src/quality.rs
index 939b60e..ce1dd5a 100644
--- a/src/quality.rs
+++ b/src/quality.rs
@@ -20,7 +20,7 @@ impl FoldHasher {
     /// Initializes this [`FoldHasher`] with the given per-hasher seed and
     /// [`SharedSeed`].
     #[inline(always)]
-    pub fn with_seed(per_hasher_seed: u64, shared_seed: &SharedSeed) -> FoldHasher {
+    pub fn with_seed(per_hasher_seed: u64, shared_seed: &'static SharedSeed) -> FoldHasher {
         FoldHasher {
             inner: fast::FoldHasher::with_seed(per_hasher_seed, shared_seed),
         }
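
A quick usage sketch, not part of the patch, for the changed constructor signature: `with_seed` now takes a `&'static SharedSeed`, so callers need a seed with static lifetime. This assumes `SharedSeed::global_fixed()` is available to produce one; if your foldhash version exposes a different `&'static SharedSeed` source, substitute it.

use core::hash::Hasher;
use foldhash::{fast::FoldHasher, SharedSeed};

fn main() {
    // Assumption: SharedSeed::global_fixed() returns a &'static SharedSeed,
    // which satisfies the new `&'static` bound on with_seed.
    let shared: &'static SharedSeed = SharedSeed::global_fixed();

    // The per-hasher seed is arbitrary here; real callers derive one per hasher.
    let mut hasher = FoldHasher::with_seed(0x0123_4567_89ab_cdef, shared);
    hasher.write(b"an example input that is longer than sixteen bytes");
    println!("hash = {:016x}", hasher.finish());
}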
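
The `read_u64` helper avoids the usual `try_into().unwrap()` slice read so the compiler can elide bounds checks while remaining const-friendly. Below is a small self-contained sketch, with illustrative names not taken from the patch, that exercises the same `split_at` plus `first_chunk` pattern against the naive formulation over every valid offset, as a sanity check that the two reads agree.

/// Naive formulation: panics via the slice index if out of range.
fn read_u64_naive(slice: &[u8], offset: usize) -> u64 {
    u64::from_ne_bytes(slice[offset..offset + 8].try_into().unwrap())
}

/// The split_at + first_chunk pattern used by the patch.
fn read_u64(slice: &[u8], offset: usize) -> u64 {
    let buf = match slice.split_at(offset).1.first_chunk::<8>() {
        Some(buf) => *buf,
        None => panic!("read_u64: slice too short"),
    };
    u64::from_ne_bytes(buf)
}

fn main() {
    let data: Vec<u8> = (0u8..64).collect();
    // Compare both reads at every offset that leaves at least 8 bytes.
    for offset in 0..=data.len() - 8 {
        assert_eq!(read_u64(&data, offset), read_u64_naive(&data, offset));
    }
    println!("read_u64 matches the naive read for all offsets");
}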