diff --git a/zlib-rs/src/inflate/writer.rs b/zlib-rs/src/inflate/writer.rs
index d3c3823a..44271e48 100644
--- a/zlib-rs/src/inflate/writer.rs
+++ b/zlib-rs/src/inflate/writer.rs
@@ -88,9 +88,7 @@ impl<'a> Writer<'a> {
     ) {
         match FEATURES {
             #[cfg(target_arch = "x86_64")]
-            CpuFeatures::AVX2 => {
-                self.extend_from_window_help::<core::arch::x86_64::__m256i>(window, range)
-            }
+            CpuFeatures::AVX2 => self.extend_from_window_help::<32>(window, range),
             _ => self.extend_from_window_runtime_dispatch(window, range),
         }
     }
@@ -104,41 +102,41 @@ impl<'a> Writer<'a> {
         //
         // #[cfg(target_arch = "x86_64")]
         // if crate::cpu_features::is_enabled_avx512() {
-        //     return self.extend_from_window_help::<core::arch::x86_64::__m512i>(window, range);
+        //     return self.extend_from_window_help::<64>(window, range);
         // }
 
         #[cfg(target_arch = "x86_64")]
         if crate::cpu_features::is_enabled_avx2() {
-            return self.extend_from_window_help::<core::arch::x86_64::__m256i>(window, range);
+            return self.extend_from_window_help::<32>(window, range);
         }
 
         #[cfg(target_arch = "x86_64")]
         if crate::cpu_features::is_enabled_sse() {
-            return self.extend_from_window_help::<core::arch::x86_64::__m128i>(window, range);
+            return self.extend_from_window_help::<16>(window, range);
        }
 
         #[cfg(target_arch = "aarch64")]
         if crate::cpu_features::is_enabled_neon() {
-            return self.extend_from_window_help::<core::arch::aarch64::uint8x16_t>(window, range);
+            return self.extend_from_window_help::<16>(window, range);
         }
 
         #[cfg(target_arch = "wasm32")]
         if crate::cpu_features::is_enabled_simd128() {
-            return self.extend_from_window_help::<core::arch::wasm32::v128>(window, range);
+            return self.extend_from_window_help::<16>(window, range);
         }
 
-        self.extend_from_window_help::<u64>(window, range)
+        self.extend_from_window_help::<8>(window, range)
     }
 
     #[inline(always)]
-    fn extend_from_window_help<C: Chunk>(
+    fn extend_from_window_help<const N: usize>(
         &mut self,
         window: &super::window::Window,
         range: Range<usize>,
     ) {
         let len = range.end - range.start;
 
-        if self.remaining() >= len + core::mem::size_of::<C>() {
+        if self.remaining() >= len + N {
             // SAFETY: we know that our window has at least a core::mem::size_of::<C>() extra bytes
             // at the end, making it always safe to perform an (unaligned) Chunk read anywhere in
             // the window slice.
@@ -146,7 +144,7 @@ impl<'a> Writer<'a> {
             // The calling function checks for CPU features requirements for C.
             unsafe {
                 let src = window.as_ptr();
-                Self::copy_chunk_unchecked::<C>(
+                Self::copy_chunk_unchecked::<N>(
                     src.wrapping_add(range.start).cast(),
                     self.next_out(),
                     len,
@@ -174,9 +172,7 @@ impl<'a> Writer<'a> {
     ) {
         match FEATURES {
             #[cfg(target_arch = "x86_64")]
-            CpuFeatures::AVX2 => {
-                self.copy_match_help::<core::arch::x86_64::__m256i>(offset_from_end, length)
-            }
+            CpuFeatures::AVX2 => self.copy_match_help::<32>(offset_from_end, length),
             _ => self.copy_match_runtime_dispatch(offset_from_end, length),
         }
     }
@@ -191,32 +187,31 @@
         #[cfg(target_arch = "x86_64")]
         if crate::cpu_features::is_enabled_avx2() {
-            return self.copy_match_help::<core::arch::x86_64::__m256i>(offset_from_end, length);
+            return self.copy_match_help::<32>(offset_from_end, length);
         }
 
         #[cfg(target_arch = "x86_64")]
         if crate::cpu_features::is_enabled_sse() {
-            return self.copy_match_help::<core::arch::x86_64::__m128i>(offset_from_end, length);
+            return self.copy_match_help::<16>(offset_from_end, length);
         }
 
         #[cfg(target_arch = "aarch64")]
         if crate::cpu_features::is_enabled_neon() {
-            return self
-                .copy_match_help::<core::arch::aarch64::uint8x16_t>(offset_from_end, length);
+            return self.copy_match_help::<16>(offset_from_end, length);
         }
 
         #[cfg(target_arch = "wasm32")]
         if crate::cpu_features::is_enabled_simd128() {
-            return self.copy_match_help::<core::arch::wasm32::v128>(offset_from_end, length);
+            return self.copy_match_help::<16>(offset_from_end, length);
         }
 
-        self.copy_match_help::<u64>(offset_from_end, length)
+        self.copy_match_help::<8>(offset_from_end, length)
     }
 
     #[inline(always)]
-    fn copy_match_help<C: Chunk>(&mut self, offset_from_end: usize, length: usize) {
+    fn copy_match_help<const N: usize>(&mut self, offset_from_end: usize, length: usize) {
         let capacity = self.buf.len();
 
-        let len = Ord::min(self.filled + length + core::mem::size_of::<C>(), capacity);
+        let len = Ord::min(self.filled + length + N, capacity);
         let buf = &mut self.buf.as_mut_slice()[..len];
 
         let current = self.filled;
@@ -244,12 +239,12 @@ impl<'a> Writer<'a> {
                 }
             }
         } else {
-            Self::copy_chunked_within::<C>(buf, capacity, current, offset_from_end, length)
+            Self::copy_chunked_within::<N>(buf, capacity, current, offset_from_end, length)
         }
     }
 
     #[inline(always)]
-    fn copy_chunked_within<C: Chunk>(
+    fn copy_chunked_within<const N: usize>(
         buf: &mut [MaybeUninit<u8>],
         capacity: usize,
         current: usize,
@@ -258,10 +253,10 @@ impl<'a> Writer<'a> {
     ) {
         let start = current.checked_sub(offset_from_end).expect("in bounds");
 
-        if current + length + core::mem::size_of::<C>() < capacity {
+        if current + length + N < capacity {
             let ptr = buf.as_mut_ptr();
             // SAFETY: if statement and checked_sub ensures we stay in bounds.
-            unsafe { Self::copy_chunk_unchecked::<C>(ptr.add(start), ptr.add(current), length) }
+            unsafe { Self::copy_chunk_unchecked::<N>(ptr.add(start), ptr.add(current), length) }
         } else {
             // a full simd copy does not fit in the output buffer
             buf.copy_within(start..start + length, current);
@@ -273,29 +268,45 @@ impl<'a> Writer<'a> {
     /// `src` must be safe to perform unaligned reads in `core::mem::size_of::<C>()` chunks until
     /// `end` is reached. `dst` must be safe to (unaligned) write that number of chunks.
     #[inline(always)]
-    unsafe fn copy_chunk_unchecked<C: Chunk>(
+    unsafe fn copy_chunk_unchecked<const N: usize>(
         mut src: *const MaybeUninit<u8>,
         mut dst: *mut MaybeUninit<u8>,
         length: usize,
     ) {
         let end = src.add(length);
 
-        let chunk = C::load_chunk(src);
-        C::store_chunk(dst, chunk);
+        let chunk = load_chunk::<N>(src);
+        store_chunk::<N>(dst, chunk);
 
-        src = src.add(core::mem::size_of::<C>());
-        dst = dst.add(core::mem::size_of::<C>());
+        src = src.add(N);
+        dst = dst.add(N);
 
         while src < end {
-            let chunk = C::load_chunk(src);
-            C::store_chunk(dst, chunk);
+            let chunk = load_chunk::<N>(src);
+            store_chunk::<N>(dst, chunk);
 
-            src = src.add(core::mem::size_of::<C>());
-            dst = dst.add(core::mem::size_of::<C>());
+            src = src.add(N);
+            dst = dst.add(N);
         }
     }
 }
 
+/// # Safety
+///
+/// Must be valid to read a `[u8; N]` value from `from` with an unaligned read.
+#[inline(always)]
+unsafe fn load_chunk<const N: usize>(from: *const MaybeUninit<u8>) -> [u8; N] {
+    core::ptr::read_unaligned(from.cast::<[u8; N]>())
+}
+
+/// # Safety
+///
+/// Must be valid to write a `[u8; N]` value to `out` with an unaligned write.
+#[inline(always)]
+unsafe fn store_chunk<const N: usize>(out: *mut MaybeUninit<u8>, chunk: [u8; N]) {
+    core::ptr::write_unaligned(out.cast(), chunk)
+}
+
 impl fmt::Debug for Writer<'_> {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.debug_struct("Writer")
@@ -309,109 +320,6 @@ fn slice_to_uninit(slice: &[u8]) -> &[MaybeUninit<u8>] {
     unsafe { &*(slice as *const [u8] as *const [MaybeUninit<u8>]) }
 }
 
-trait Chunk {
-    /// # Safety
-    ///
-    /// Must be valid to read a `Self::Chunk` value from `from` with an unaligned read.
-    ///
-    /// Implementations may have CPU feature specific requirements depending on the type.
-    unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self;
-
-    /// # Safety
-    ///
-    /// Must be valid to write a `Self::Chunk` value to `out` with an unaligned write.
-    ///
-    /// Implementations may have CPU feature specific requirements depending on the type.
-    unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self);
-}
-
-impl Chunk for u64 {
-    unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self {
-        u64::to_le(core::ptr::read_unaligned(from.cast()))
-    }
-
-    unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self) {
-        core::ptr::copy_nonoverlapping(
-            chunk.to_le_bytes().as_ptr().cast(),
-            out,
-            core::mem::size_of::<u64>(),
-        )
-    }
-}
-
-#[cfg(target_arch = "x86_64")]
-impl Chunk for core::arch::x86_64::__m128i {
-    #[inline(always)]
-    unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self {
-        core::arch::x86_64::_mm_loadu_si128(from.cast())
-    }
-
-    #[inline(always)]
-    unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self) {
-        core::arch::x86_64::_mm_storeu_si128(out as *mut Self, chunk);
-    }
-}
-
-#[cfg(target_arch = "x86_64")]
-impl Chunk for core::arch::x86_64::__m256i {
-    #[inline(always)]
-    unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self {
-        core::arch::x86_64::_mm256_loadu_si256(from.cast())
-    }
-
-    #[inline(always)]
-    unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self) {
-        core::arch::x86_64::_mm256_storeu_si256(out as *mut Self, chunk);
-    }
-}
-
-#[cfg(target_arch = "x86_64")]
-impl Chunk for core::arch::x86_64::__m512i {
-    #[inline(always)]
-    unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self {
-        // TODO AVX-512 is effectively unstable.
-        // We cross our fingers that LLVM optimizes this into a vmovdqu32
-        //
-        // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_si512&expand=3420&ig_expand=4110
-        core::ptr::read_unaligned(from.cast())
-    }
-
-    #[inline(always)]
-    unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self) {
-        // TODO AVX-512 is effectively unstable.
-        // We cross our fingers that LLVM optimizes this into a vmovdqu32
-        //
-        // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_si512&expand=3420&ig_expand=4110,6550
-        core::ptr::write_unaligned(out.cast(), chunk)
-    }
-}
-
-#[cfg(target_arch = "aarch64")]
-impl Chunk for core::arch::aarch64::uint8x16_t {
-    #[inline(always)]
-    unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self {
-        core::arch::aarch64::vld1q_u8(from.cast())
-    }
-
-    #[inline(always)]
-    unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self) {
-        core::arch::aarch64::vst1q_u8(out.cast(), chunk)
-    }
-}
-
-#[cfg(target_arch = "wasm32")]
-impl Chunk for core::arch::wasm32::v128 {
-    #[inline(always)]
-    unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self {
-        core::arch::wasm32::v128_load(from.cast())
-    }
-
-    #[inline(always)]
-    unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self) {
-        core::arch::wasm32::v128_store(out as *mut Self, chunk)
-    }
-}
-
 #[cfg(test)]
 mod test {
     use super::*;
@@ -453,15 +361,6 @@ mod test {
         let offset_from_end = 17;
         let length = 17;
 
-        #[cfg(target_arch = "x86_64")]
-        use core::arch::x86_64::{__m128i, __m256i, __m512i};
-
-        #[cfg(target_arch = "aarch64")]
-        use core::arch::aarch64::uint8x16_t;
-
-        #[cfg(target_arch = "wasm32")]
-        use core::arch::wasm32::v128;
-
         macro_rules! helper {
             ($func:expr) => {
                 let mut buf = test_array();
@@ -476,30 +375,30 @@
 
         #[cfg(target_arch = "x86_64")]
         if crate::cpu_features::is_enabled_avx512() {
-            helper!(Writer::copy_match_help::<__m512i>);
+            helper!(Writer::copy_match_help::<64>);
         }
 
         #[cfg(target_arch = "x86_64")]
         if crate::cpu_features::is_enabled_avx2() {
-            helper!(Writer::copy_match_help::<__m256i>);
+            helper!(Writer::copy_match_help::<32>);
         }
 
         #[cfg(target_arch = "x86_64")]
         if crate::cpu_features::is_enabled_sse() {
-            helper!(Writer::copy_match_help::<__m128i>);
+            helper!(Writer::copy_match_help::<16>);
         }
 
         #[cfg(target_arch = "aarch64")]
         if crate::cpu_features::is_enabled_neon() {
-            helper!(Writer::copy_match_help::<uint8x16_t>);
+            helper!(Writer::copy_match_help::<16>);
         }
 
         #[cfg(target_arch = "wasm32")]
         if crate::cpu_features::is_enabled_simd128() {
-            helper!(Writer::copy_match_help::<v128>);
+            helper!(Writer::copy_match_help::<16>);
         }
 
-        helper!(Writer::copy_match_help::<u64>);
+        helper!(Writer::copy_match_help::<8>);
     }
 
     #[test]
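Note (not part of the patch): the diff swaps the trait-dispatched SIMD chunk types (`u64`, `__m128i`, `__m256i`, ...) for a single `const N: usize` parameter, on the expectation that an unaligned `read_unaligned`/`write_unaligned` of a `[u8; N]` lowers to the same wide moves the intrinsics produced (e.g. `vmovdqu` for `N = 32` when the caller has verified AVX2), with no per-type `Chunk` impls to maintain. Below is a minimal standalone sketch of the chunked-copy idea on plain `u8` pointers; `copy_chunked` and the buffer sizes are illustrative, not taken from the patch:

```rust
/// Copy `length` bytes from `src` to `dst` in `N`-byte steps, rounding the
/// amount copied up to a multiple of `N` (this mirrors the patch's
/// `copy_chunk_unchecked`, minus its peeled first iteration).
///
/// # Safety
///
/// Both regions must remain in bounds for `length` rounded up to a multiple
/// of `N`; this is why the patch checks `self.remaining() >= len + N` and
/// `current + length + N < capacity` before calling in -- the extra slack
/// bytes absorb the overshoot of the final chunk.
unsafe fn copy_chunked<const N: usize>(mut src: *const u8, mut dst: *mut u8, length: usize) {
    let end = src.add(length);
    while src < end {
        // With a constant N and the right target features enabled, these
        // unaligned array reads/writes compile down to single wide loads
        // and stores rather than byte loops.
        let chunk: [u8; N] = core::ptr::read_unaligned(src.cast());
        core::ptr::write_unaligned(dst.cast(), chunk);
        src = src.add(N);
        dst = dst.add(N);
    }
}

fn main() {
    // 21 payload bytes plus 3 bytes of padding, so that reads rounded up to
    // a multiple of 8 stay in bounds on the source side as well.
    let src = *b"hello, chunked world!\0\0\0";
    // 21 rounded up to a multiple of 8: the copy writes offsets 0..24.
    let mut dst = [0u8; 24];
    unsafe { copy_chunked::<8>(src.as_ptr(), dst.as_mut_ptr(), 21) };
    assert_eq!(&dst[..21], &src[..21]);
}
```

This also explains why the runtime dispatchers keep their `is_enabled_avx2()` etc. checks after the rewrite: the width alone does not guarantee vectorization, so the `#[inline(always)]` helpers still rely on the caller having confirmed the corresponding CPU feature, exactly as the old comment ("The calling function checks for CPU features requirements") describes.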