207 changes: 53 additions & 154 deletions zlib-rs/src/inflate/writer.rs
@@ -88,9 +88,7 @@ impl<'a> Writer<'a> {
) {
match FEATURES {
#[cfg(target_arch = "x86_64")]
- CpuFeatures::AVX2 => {
- self.extend_from_window_help::<core::arch::x86_64::__m256i>(window, range)
- }
+ CpuFeatures::AVX2 => self.extend_from_window_help::<32>(window, range),
_ => self.extend_from_window_runtime_dispatch(window, range),
}
}
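
The match above dispatches on a compile-time constant. A minimal sketch of that shape, with hypothetical names standing in for the crate's `cpu_features` machinery: when the target feature is known at build time, the optimizer resolves the match statically and drops the dead arms; otherwise the `_` arm defers to runtime detection.

// Sketch with hypothetical names; the real crate derives FEATURES from
// cfg(target_feature) in its cpu_features module.
#[derive(Clone, Copy)]
enum CpuFeatures {
    Avx2,
    Other,
}

#[cfg(target_feature = "avx2")]
const FEATURES: CpuFeatures = CpuFeatures::Avx2;
#[cfg(not(target_feature = "avx2"))]
const FEATURES: CpuFeatures = CpuFeatures::Other;

fn copy(dst: &mut Vec<u8>, src: &[u8]) {
    match FEATURES {
        // AVX2 known at compile time: monomorphize the 32-byte chunk copy.
        CpuFeatures::Avx2 => copy_help::<32>(dst, src),
        // Feature set not known statically: fall back to runtime checks.
        _ => copy_help::<8>(dst, src),
    }
}

fn copy_help<const N: usize>(dst: &mut Vec<u8>, src: &[u8]) {
    // Stand-in body; the real helper copies in N-byte chunks.
    dst.extend_from_slice(src);
}
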
@@ -104,49 +102,49 @@ impl<'a> Writer<'a> {
//
// #[cfg(target_arch = "x86_64")]
// if crate::cpu_features::is_enabled_avx512() {
- // return self.extend_from_window_help::<core::arch::x86_64::__m512i>(window, range);
+ // return self.extend_from_window_help::<64>(window, range);
// }

#[cfg(target_arch = "x86_64")]
if crate::cpu_features::is_enabled_avx2() {
- return self.extend_from_window_help::<core::arch::x86_64::__m256i>(window, range);
+ return self.extend_from_window_help::<32>(window, range);
}

#[cfg(target_arch = "x86_64")]
if crate::cpu_features::is_enabled_sse() {
- return self.extend_from_window_help::<core::arch::x86_64::__m128i>(window, range);
+ return self.extend_from_window_help::<16>(window, range);
}

#[cfg(target_arch = "aarch64")]
if crate::cpu_features::is_enabled_neon() {
- return self.extend_from_window_help::<core::arch::aarch64::uint8x16_t>(window, range);
+ return self.extend_from_window_help::<16>(window, range);
}

#[cfg(target_arch = "wasm32")]
if crate::cpu_features::is_enabled_simd128() {
- return self.extend_from_window_help::<core::arch::wasm32::v128>(window, range);
+ return self.extend_from_window_help::<16>(window, range);
}

- self.extend_from_window_help::<u64>(window, range)
+ self.extend_from_window_help::<8>(window, range)
}

#[inline(always)]
- fn extend_from_window_help<C: Chunk>(
+ fn extend_from_window_help<const N: usize>(
&mut self,
window: &super::window::Window,
range: Range<usize>,
) {
let len = range.end - range.start;

- if self.remaining() >= len + core::mem::size_of::<C>() {
+ if self.remaining() >= len + N {
// SAFETY: we know that our window has at least N extra bytes
// at the end, making it always safe to perform an (unaligned) N-byte chunk
// read anywhere in the window slice.
//
// The calling function checks the CPU feature requirements for this value of N.
unsafe {
let src = window.as_ptr();
- Self::copy_chunk_unchecked::<C>(
+ Self::copy_chunk_unchecked::<N>(
src.wrapping_add(range.start).cast(),
self.next_out(),
len,
@@ -174,9 +172,7 @@ impl<'a> Writer<'a> {
) {
match FEATURES {
#[cfg(target_arch = "x86_64")]
- CpuFeatures::AVX2 => {
- self.copy_match_help::<core::arch::x86_64::__m256i>(offset_from_end, length)
- }
+ CpuFeatures::AVX2 => self.copy_match_help::<32>(offset_from_end, length),
_ => self.copy_match_runtime_dispatch(offset_from_end, length),
}
}
@@ -191,32 +187,31 @@ impl<'a> Writer<'a> {

#[cfg(target_arch = "x86_64")]
if crate::cpu_features::is_enabled_avx2() {
- return self.copy_match_help::<core::arch::x86_64::__m256i>(offset_from_end, length);
+ return self.copy_match_help::<32>(offset_from_end, length);
}

#[cfg(target_arch = "x86_64")]
if crate::cpu_features::is_enabled_sse() {
- return self.copy_match_help::<core::arch::x86_64::__m128i>(offset_from_end, length);
+ return self.copy_match_help::<16>(offset_from_end, length);
}

#[cfg(target_arch = "aarch64")]
if crate::cpu_features::is_enabled_neon() {
- return self
- .copy_match_help::<core::arch::aarch64::uint8x16_t>(offset_from_end, length);
+ return self.copy_match_help::<16>(offset_from_end, length);
}

#[cfg(target_arch = "wasm32")]
if crate::cpu_features::is_enabled_simd128() {
- return self.copy_match_help::<core::arch::wasm32::v128>(offset_from_end, length);
+ return self.copy_match_help::<16>(offset_from_end, length);
}

- self.copy_match_help::<u64>(offset_from_end, length)
+ self.copy_match_help::<8>(offset_from_end, length)
}

#[inline(always)]
- fn copy_match_help<C: Chunk>(&mut self, offset_from_end: usize, length: usize) {
+ fn copy_match_help<const N: usize>(&mut self, offset_from_end: usize, length: usize) {
let capacity = self.buf.len();
- let len = Ord::min(self.filled + length + core::mem::size_of::<C>(), capacity);
+ let len = Ord::min(self.filled + length + N, capacity);
let buf = &mut self.buf.as_mut_slice()[..len];

let current = self.filled;
Expand Down Expand Up @@ -244,12 +239,12 @@ impl<'a> Writer<'a> {
}
}
} else {
- Self::copy_chunked_within::<C>(buf, capacity, current, offset_from_end, length)
+ Self::copy_chunked_within::<N>(buf, capacity, current, offset_from_end, length)
}
}

#[inline(always)]
- fn copy_chunked_within<C: Chunk>(
+ fn copy_chunked_within<const N: usize>(
buf: &mut [MaybeUninit<u8>],
capacity: usize,
current: usize,
@@ -258,10 +253,10 @@ impl<'a> Writer<'a> {
) {
let start = current.checked_sub(offset_from_end).expect("in bounds");

- if current + length + core::mem::size_of::<C>() < capacity {
+ if current + length + N < capacity {
let ptr = buf.as_mut_ptr();
// SAFETY: if statement and checked_sub ensures we stay in bounds.
- unsafe { Self::copy_chunk_unchecked::<C>(ptr.add(start), ptr.add(current), length) }
+ unsafe { Self::copy_chunk_unchecked::<N>(ptr.add(start), ptr.add(current), length) }
} else {
// a full simd copy does not fit in the output buffer
buf.copy_within(start..start + length, current);
@@ -273,29 +268,45 @@ impl<'a> Writer<'a> {
/// `src` must be safe to perform unaligned reads in `N`-byte chunks until
/// `end` is reached. `dst` must be safe to (unaligned) write that number of chunks.
#[inline(always)]
- unsafe fn copy_chunk_unchecked<C: Chunk>(
+ unsafe fn copy_chunk_unchecked<const N: usize>(
mut src: *const MaybeUninit<u8>,
mut dst: *mut MaybeUninit<u8>,
length: usize,
) {
let end = src.add(length);

- let chunk = C::load_chunk(src);
- C::store_chunk(dst, chunk);
+ let chunk = load_chunk::<N>(src);
+ store_chunk::<N>(dst, chunk);

- src = src.add(core::mem::size_of::<C>());
- dst = dst.add(core::mem::size_of::<C>());
+ src = src.add(N);
+ dst = dst.add(N);

while src < end {
- let chunk = C::load_chunk(src);
- C::store_chunk(dst, chunk);
+ let chunk = load_chunk::<N>(src);
+ store_chunk::<N>(dst, chunk);

- src = src.add(core::mem::size_of::<C>());
- dst = dst.add(core::mem::size_of::<C>());
+ src = src.add(N);
+ dst = dst.add(N);
}
}
}

+ /// # Safety
+ ///
+ /// Must be valid to read a `[u8; N]` value from `from` with an unaligned read.
+ #[inline(always)]
+ unsafe fn load_chunk<const N: usize>(from: *const MaybeUninit<u8>) -> [u8; N] {
+ core::ptr::read_unaligned(from.cast::<[u8; N]>())
+ }
+
+ /// # Safety
+ ///
+ /// Must be valid to write a `[u8; N]` value to `out` with an unaligned write.
+ #[inline(always)]
+ unsafe fn store_chunk<const N: usize>(out: *mut MaybeUninit<u8>, chunk: [u8; N]) {
+ core::ptr::write_unaligned(out.cast(), chunk)
+ }
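
As a sanity check on the slack these helpers require (my own sketch, not part of the diff): `copy_chunk_unchecked` writes one unconditional chunk and then whole `N`-byte chunks while `src < end`, so for a copy of `length` bytes it writes at most `length + N` bytes. That is exactly the `+ N` slack the callers reserve.

// Hypothetical helper mirroring the loop structure of copy_chunk_unchecked:
// one unconditional chunk, then whole chunks while the cursor is below
// src + length.
fn bytes_written<const N: usize>(length: usize) -> usize {
    usize::max(1, length.div_ceil(N)) * N
}

fn main() {
    assert_eq!(bytes_written::<8>(0), 8); // the first chunk is unconditional
    assert_eq!(bytes_written::<8>(8), 8); // exact fit
    assert_eq!(bytes_written::<8>(9), 16); // overshoot of up to N - 1 bytes
    // The N bytes of slack that extend_from_window_help and copy_match_help
    // reserve are always enough:
    for len in 0..1024 {
        assert!(bytes_written::<32>(len) <= len + 32);
    }
}
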

impl fmt::Debug for Writer<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("Writer")
@@ -309,109 +320,6 @@ fn slice_to_uninit(slice: &[u8]) -> &[MaybeUninit<u8>] {
unsafe { &*(slice as *const [u8] as *const [MaybeUninit<u8>]) }
}

- trait Chunk {
- /// # Safety
- ///
- /// Must be valid to read a `Self::Chunk` value from `from` with an unaligned read.
- ///
- /// Implementations may have CPU feature specific requirements depending on the type.
- unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self;
-
- /// # Safety
- ///
- /// Must be valid to write a `Self::Chunk` value to `out` with an unaligned write.
- ///
- /// Implementations may have CPU feature specific requirements depending on the type.
- unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self);
- }
-
- impl Chunk for u64 {
- unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self {
- u64::to_le(core::ptr::read_unaligned(from.cast()))
- }
-
- unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self) {
- core::ptr::copy_nonoverlapping(
- chunk.to_le_bytes().as_ptr().cast(),
- out,
- core::mem::size_of::<Self>(),
- )
- }
- }
-
- #[cfg(target_arch = "x86_64")]
- impl Chunk for core::arch::x86_64::__m128i {
- #[inline(always)]
- unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self {
- core::arch::x86_64::_mm_loadu_si128(from.cast())
- }
-
- #[inline(always)]
- unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self) {
- core::arch::x86_64::_mm_storeu_si128(out as *mut Self, chunk);
- }
- }
-
- #[cfg(target_arch = "x86_64")]
- impl Chunk for core::arch::x86_64::__m256i {
- #[inline(always)]
- unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self {
- core::arch::x86_64::_mm256_loadu_si256(from.cast())
- }
-
- #[inline(always)]
- unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self) {
- core::arch::x86_64::_mm256_storeu_si256(out as *mut Self, chunk);
- }
- }
-
- #[cfg(target_arch = "x86_64")]
- impl Chunk for core::arch::x86_64::__m512i {
- #[inline(always)]
- unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self {
- // TODO AVX-512 is effectively unstable.
- // We cross our fingers that LLVM optimizes this into a vmovdqu32
- //
- // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_si512&expand=3420&ig_expand=4110
- core::ptr::read_unaligned(from.cast())
- }
-
- #[inline(always)]
- unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self) {
- // TODO AVX-512 is effectively unstable.
- // We cross our fingers that LLVM optimizes this into a vmovdqu32
- //
- // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_si512&expand=3420&ig_expand=4110,6550
- core::ptr::write_unaligned(out.cast(), chunk)
- }
- }
-
- #[cfg(target_arch = "aarch64")]
- impl Chunk for core::arch::aarch64::uint8x16_t {
- #[inline(always)]
- unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self {
- core::arch::aarch64::vld1q_u8(from.cast())
- }
-
- #[inline(always)]
- unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self) {
- core::arch::aarch64::vst1q_u8(out.cast(), chunk)
- }
- }
-
- #[cfg(target_arch = "wasm32")]
- impl Chunk for core::arch::wasm32::v128 {
- #[inline(always)]
- unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self {
- core::arch::wasm32::v128_load(from.cast())
- }
-
- #[inline(always)]
- unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self) {
- core::arch::wasm32::v128_store(out as *mut Self, chunk)
- }
- }
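
The block deleted above is what the const-generic `load_chunk`/`store_chunk` pair replaces. The underlying assumption (mine; the compiled output should be verified) is that an unaligned `[u8; N]` read/write inside a function compiled with the right target feature lowers to the same unaligned vector moves the intrinsics produced, along these lines:

// Sketch, not from the PR: with AVX2 enabled for this function, LLVM is
// expected to lower the pair below to unaligned 32-byte vector moves
// (vmovdqu), matching what _mm256_loadu_si256/_mm256_storeu_si256 emitted.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn copy_chunk_32(src: *const u8, dst: *mut u8) {
    let chunk = core::ptr::read_unaligned(src.cast::<[u8; 32]>());
    core::ptr::write_unaligned(dst.cast::<[u8; 32]>(), chunk);
}
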

#[cfg(test)]
mod test {
use super::*;
@@ -453,15 +361,6 @@ mod test {
let offset_from_end = 17;
let length = 17;

- #[cfg(target_arch = "x86_64")]
- use core::arch::x86_64::{__m128i, __m256i, __m512i};
-
- #[cfg(target_arch = "aarch64")]
- use core::arch::aarch64::uint8x16_t;
-
- #[cfg(target_arch = "wasm32")]
- use core::arch::wasm32::v128;

macro_rules! helper {
($func:expr) => {
let mut buf = test_array();
@@ -476,30 +375,30 @@

#[cfg(target_arch = "x86_64")]
if crate::cpu_features::is_enabled_avx512() {
- helper!(Writer::copy_match_help::<__m512i>);
+ helper!(Writer::copy_match_help::<64>);
Review comment on lines 376 to +378 (PR author); a concrete sketch of the point follows the test below:
Given our changes to where the target feature is applied, this will no longer test a SIMD read/write. But those should be semantically equivalent to an array copy of the corresponding size.

}

#[cfg(target_arch = "x86_64")]
if crate::cpu_features::is_enabled_avx2() {
- helper!(Writer::copy_match_help::<__m256i>);
+ helper!(Writer::copy_match_help::<32>);
}

#[cfg(target_arch = "x86_64")]
if crate::cpu_features::is_enabled_sse() {
- helper!(Writer::copy_match_help::<__m128i>);
+ helper!(Writer::copy_match_help::<16>);
}

#[cfg(target_arch = "aarch64")]
if crate::cpu_features::is_enabled_neon() {
- helper!(Writer::copy_match_help::<uint8x16_t>);
+ helper!(Writer::copy_match_help::<16>);
}

#[cfg(target_arch = "wasm32")]
if crate::cpu_features::is_enabled_simd128() {
- helper!(Writer::copy_match_help::<v128>);
+ helper!(Writer::copy_match_help::<16>);
}

- helper!(Writer::copy_match_help::<u64>);
+ helper!(Writer::copy_match_help::<8>);
}
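
A sketch of the reviewer's equivalence argument as a standalone test (hypothetical, not part of the PR): a chunked `[u8; N]` copy must produce byte-for-byte the same output as a plain slice copy, which is all the SIMD path ever guaranteed.

#[test]
fn array_copy_matches_slice_copy() {
    let src: [u8; 80] = core::array::from_fn(|i| i as u8);
    let mut via_chunks = [0u8; 80];
    let mut via_slice = [0u8; 80];

    // SAFETY: 80 is a multiple of 16, so every 16-byte read and write
    // below stays in bounds.
    unsafe {
        for offset in (0..80).step_by(16) {
            let chunk =
                core::ptr::read_unaligned(src.as_ptr().add(offset).cast::<[u8; 16]>());
            core::ptr::write_unaligned(
                via_chunks.as_mut_ptr().add(offset).cast::<[u8; 16]>(),
                chunk,
            );
        }
    }
    via_slice.copy_from_slice(&src);
    assert_eq!(via_chunks, via_slice);
}
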

#[test]