207 changes: 53 additions & 154 deletions zlib-rs/src/inflate/writer.rs
@@ -88,9 +88,7 @@ impl<'a> Writer<'a> {
) {
match FEATURES {
#[cfg(target_arch = "x86_64")]
- CpuFeatures::AVX2 => {
- self.extend_from_window_help::<core::arch::x86_64::__m256i>(window, range)
- }
+ CpuFeatures::AVX2 => self.extend_from_window_help::<32>(window, range),
_ => self.extend_from_window_runtime_dispatch(window, range),
}
}
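
The match above dispatches on a compile-time constant. A minimal sketch of that shape, with hypothetical names standing in for the crate's `cpu_features` machinery: when the target feature is known at build time, the optimizer resolves the match statically and drops the dead arms; otherwise the `_` arm defers to runtime detection.

// Sketch with hypothetical names; the real crate derives FEATURES from
// cfg(target_feature) in its cpu_features module.
#[derive(Clone, Copy)]
enum CpuFeatures {
    Avx2,
    Other,
}

#[cfg(target_feature = "avx2")]
const FEATURES: CpuFeatures = CpuFeatures::Avx2;
#[cfg(not(target_feature = "avx2"))]
const FEATURES: CpuFeatures = CpuFeatures::Other;

fn copy(dst: &mut Vec<u8>, src: &[u8]) {
    match FEATURES {
        // AVX2 known at compile time: monomorphize the 32-byte chunk copy.
        CpuFeatures::Avx2 => copy_help::<32>(dst, src),
        // Feature set not known statically: fall back to runtime checks.
        _ => copy_help::<8>(dst, src),
    }
}

fn copy_help<const N: usize>(dst: &mut Vec<u8>, src: &[u8]) {
    // Stand-in body; the real helper copies in N-byte chunks.
    dst.extend_from_slice(src);
}
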
@@ -104,49 +102,49 @@ impl<'a> Writer<'a> {
//
// #[cfg(target_arch = "x86_64")]
// if crate::cpu_features::is_enabled_avx512() {
- // return self.extend_from_window_help::<core::arch::x86_64::__m512i>(window, range);
+ // return self.extend_from_window_help::<64>(window, range);
// }

#[cfg(target_arch = "x86_64")]
if crate::cpu_features::is_enabled_avx2() {
- return self.extend_from_window_help::<core::arch::x86_64::__m256i>(window, range);
+ return self.extend_from_window_help::<32>(window, range);
}

#[cfg(target_arch = "x86_64")]
if crate::cpu_features::is_enabled_sse() {
- return self.extend_from_window_help::<core::arch::x86_64::__m128i>(window, range);
+ return self.extend_from_window_help::<16>(window, range);
}

#[cfg(target_arch = "aarch64")]
if crate::cpu_features::is_enabled_neon() {
- return self.extend_from_window_help::<core::arch::aarch64::uint8x16_t>(window, range);
+ return self.extend_from_window_help::<16>(window, range);
}

#[cfg(target_arch = "wasm32")]
if crate::cpu_features::is_enabled_simd128() {
- return self.extend_from_window_help::<core::arch::wasm32::v128>(window, range);
+ return self.extend_from_window_help::<16>(window, range);
}

- self.extend_from_window_help::<u64>(window, range)
+ self.extend_from_window_help::<8>(window, range)
}

#[inline(always)]
- fn extend_from_window_help<C: Chunk>(
+ fn extend_from_window_help<const N: usize>(
&mut self,
window: &super::window::Window,
range: Range<usize>,
) {
let len = range.end - range.start;

- if self.remaining() >= len + core::mem::size_of::<C>() {
+ if self.remaining() >= len + N {
// SAFETY: we know that our window has at least N extra bytes
// at the end, making it always safe to perform an (unaligned) N-byte chunk
// read anywhere in the window slice.
//
// The calling function checks the CPU feature requirements for this value of N.
unsafe {
let src = window.as_ptr();
- Self::copy_chunk_unchecked::<C>(
+ Self::copy_chunk_unchecked::<N>(
src.wrapping_add(range.start).cast(),
self.next_out(),
len,
@@ -174,9 +172,7 @@ impl<'a> Writer<'a> {
) {
match FEATURES {
#[cfg(target_arch = "x86_64")]
- CpuFeatures::AVX2 => {
- self.copy_match_help::<core::arch::x86_64::__m256i>(offset_from_end, length)
- }
+ CpuFeatures::AVX2 => self.copy_match_help::<32>(offset_from_end, length),
_ => self.copy_match_runtime_dispatch(offset_from_end, length),
}
}
@@ -191,32 +187,31 @@ impl<'a> Writer<'a> {

#[cfg(target_arch = "x86_64")]
if crate::cpu_features::is_enabled_avx2() {
- return self.copy_match_help::<core::arch::x86_64::__m256i>(offset_from_end, length);
+ return self.copy_match_help::<32>(offset_from_end, length);
}

#[cfg(target_arch = "x86_64")]
if crate::cpu_features::is_enabled_sse() {
- return self.copy_match_help::<core::arch::x86_64::__m128i>(offset_from_end, length);
+ return self.copy_match_help::<16>(offset_from_end, length);
}

#[cfg(target_arch = "aarch64")]
if crate::cpu_features::is_enabled_neon() {
- return self
- .copy_match_help::<core::arch::aarch64::uint8x16_t>(offset_from_end, length);
+ return self.copy_match_help::<16>(offset_from_end, length);
}

#[cfg(target_arch = "wasm32")]
if crate::cpu_features::is_enabled_simd128() {
- return self.copy_match_help::<core::arch::wasm32::v128>(offset_from_end, length);
+ return self.copy_match_help::<16>(offset_from_end, length);
}

- self.copy_match_help::<u64>(offset_from_end, length)
+ self.copy_match_help::<8>(offset_from_end, length)
}

#[inline(always)]
- fn copy_match_help<C: Chunk>(&mut self, offset_from_end: usize, length: usize) {
+ fn copy_match_help<const N: usize>(&mut self, offset_from_end: usize, length: usize) {
let capacity = self.buf.len();
- let len = Ord::min(self.filled + length + core::mem::size_of::<C>(), capacity);
+ let len = Ord::min(self.filled + length + N, capacity);
let buf = &mut self.buf.as_mut_slice()[..len];

let current = self.filled;
Expand Down Expand Up @@ -244,12 +239,12 @@ impl<'a> Writer<'a> {
}
}
} else {
- Self::copy_chunked_within::<C>(buf, capacity, current, offset_from_end, length)
+ Self::copy_chunked_within::<N>(buf, capacity, current, offset_from_end, length)
}
}

#[inline(always)]
- fn copy_chunked_within<C: Chunk>(
+ fn copy_chunked_within<const N: usize>(
buf: &mut [MaybeUninit<u8>],
capacity: usize,
current: usize,
@@ -258,10 +253,10 @@ impl<'a> Writer<'a> {
) {
let start = current.checked_sub(offset_from_end).expect("in bounds");

- if current + length + core::mem::size_of::<C>() < capacity {
+ if current + length + N < capacity {
let ptr = buf.as_mut_ptr();
// SAFETY: if statement and checked_sub ensures we stay in bounds.
- unsafe { Self::copy_chunk_unchecked::<C>(ptr.add(start), ptr.add(current), length) }
+ unsafe { Self::copy_chunk_unchecked::<N>(ptr.add(start), ptr.add(current), length) }
} else {
// a full simd copy does not fit in the output buffer
buf.copy_within(start..start + length, current);
@@ -273,29 +268,45 @@ impl<'a> Writer<'a> {
/// `src` must be safe to perform unaligned reads in `N`-byte chunks until
/// `end` is reached. `dst` must be safe to (unaligned) write that number of chunks.
#[inline(always)]
- unsafe fn copy_chunk_unchecked<C: Chunk>(
+ unsafe fn copy_chunk_unchecked<const N: usize>(
mut src: *const MaybeUninit<u8>,
mut dst: *mut MaybeUninit<u8>,
length: usize,
) {
let end = src.add(length);

- let chunk = C::load_chunk(src);
- C::store_chunk(dst, chunk);
+ let chunk = load_chunk::<N>(src);
+ store_chunk::<N>(dst, chunk);

- src = src.add(core::mem::size_of::<C>());
- dst = dst.add(core::mem::size_of::<C>());
+ src = src.add(N);
+ dst = dst.add(N);

while src < end {
- let chunk = C::load_chunk(src);
- C::store_chunk(dst, chunk);
+ let chunk = load_chunk::<N>(src);
+ store_chunk::<N>(dst, chunk);

- src = src.add(core::mem::size_of::<C>());
- dst = dst.add(core::mem::size_of::<C>());
+ src = src.add(N);
+ dst = dst.add(N);
}
}
}

+ /// # Safety
+ ///
+ /// Must be valid to read a `[u8; N]` value from `from` with an unaligned read.
+ #[inline(always)]
+ unsafe fn load_chunk<const N: usize>(from: *const MaybeUninit<u8>) -> [u8; N] {
+ core::ptr::read_unaligned(from.cast::<[u8; N]>())
+ }
+
+ /// # Safety
+ ///
+ /// Must be valid to write a `[u8; N]` value to `out` with an unaligned write.
+ #[inline(always)]
+ unsafe fn store_chunk<const N: usize>(out: *mut MaybeUninit<u8>, chunk: [u8; N]) {
+ core::ptr::write_unaligned(out.cast(), chunk)
+ }
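
As a sanity check on the slack these helpers require (my own sketch, not part of the diff): `copy_chunk_unchecked` writes one unconditional chunk and then whole `N`-byte chunks while `src < end`, so for a copy of `length` bytes it writes at most `length + N` bytes. That is exactly the `+ N` slack the callers reserve.

// Hypothetical helper mirroring the loop structure of copy_chunk_unchecked:
// one unconditional chunk, then whole chunks while the cursor is below
// src + length.
fn bytes_written<const N: usize>(length: usize) -> usize {
    usize::max(1, length.div_ceil(N)) * N
}

fn main() {
    assert_eq!(bytes_written::<8>(0), 8); // the first chunk is unconditional
    assert_eq!(bytes_written::<8>(8), 8); // exact fit
    assert_eq!(bytes_written::<8>(9), 16); // overshoot of up to N - 1 bytes
    // The N bytes of slack that extend_from_window_help and copy_match_help
    // reserve are always enough:
    for len in 0..1024 {
        assert!(bytes_written::<32>(len) <= len + 32);
    }
}
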

impl fmt::Debug for Writer<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("Writer")
@@ -309,109 +320,6 @@ fn slice_to_uninit(slice: &[u8]) -> &[MaybeUninit<u8>] {
unsafe { &*(slice as *const [u8] as *const [MaybeUninit<u8>]) }
}

- trait Chunk {
- /// # Safety
- ///
- /// Must be valid to read a `Self::Chunk` value from `from` with an unaligned read.
- ///
- /// Implementations may have CPU feature specific requirements depending on the type.
- unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self;
-
- /// # Safety
- ///
- /// Must be valid to write a `Self::Chunk` value to `out` with an unaligned write.
- ///
- /// Implementations may have CPU feature specific requirements depending on the type.
- unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self);
- }
-
- impl Chunk for u64 {
- unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self {
- u64::to_le(core::ptr::read_unaligned(from.cast()))
- }
-
- unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self) {
- core::ptr::copy_nonoverlapping(
- chunk.to_le_bytes().as_ptr().cast(),
- out,
- core::mem::size_of::<Self>(),
- )
- }
- }
-
- #[cfg(target_arch = "x86_64")]
- impl Chunk for core::arch::x86_64::__m128i {
- #[inline(always)]
- unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self {
- core::arch::x86_64::_mm_loadu_si128(from.cast())
- }
-
- #[inline(always)]
- unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self) {
- core::arch::x86_64::_mm_storeu_si128(out as *mut Self, chunk);
- }
- }
-
- #[cfg(target_arch = "x86_64")]
- impl Chunk for core::arch::x86_64::__m256i {
- #[inline(always)]
- unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self {
- core::arch::x86_64::_mm256_loadu_si256(from.cast())
- }
-
- #[inline(always)]
- unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self) {
- core::arch::x86_64::_mm256_storeu_si256(out as *mut Self, chunk);
- }
- }
-
- #[cfg(target_arch = "x86_64")]
- impl Chunk for core::arch::x86_64::__m512i {
- #[inline(always)]
- unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self {
- // TODO AVX-512 is effectively unstable.
- // We cross our fingers that LLVM optimizes this into a vmovdqu32
- //
- // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_si512&expand=3420&ig_expand=4110
- core::ptr::read_unaligned(from.cast())
- }
-
- #[inline(always)]
- unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self) {
- // TODO AVX-512 is effectively unstable.
- // We cross our fingers that LLVM optimizes this into a vmovdqu32
- //
- // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_si512&expand=3420&ig_expand=4110,6550
- core::ptr::write_unaligned(out.cast(), chunk)
- }
- }
-
- #[cfg(target_arch = "aarch64")]
- impl Chunk for core::arch::aarch64::uint8x16_t {
- #[inline(always)]
- unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self {
- core::arch::aarch64::vld1q_u8(from.cast())
- }
-
- #[inline(always)]
- unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self) {
- core::arch::aarch64::vst1q_u8(out.cast(), chunk)
- }
- }
-
- #[cfg(target_arch = "wasm32")]
- impl Chunk for core::arch::wasm32::v128 {
- #[inline(always)]
- unsafe fn load_chunk(from: *const MaybeUninit<u8>) -> Self {
- core::arch::wasm32::v128_load(from.cast())
- }
-
- #[inline(always)]
- unsafe fn store_chunk(out: *mut MaybeUninit<u8>, chunk: Self) {
- core::arch::wasm32::v128_store(out as *mut Self, chunk)
- }
- }
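
The block deleted above is what the const-generic `load_chunk`/`store_chunk` pair replaces. The underlying assumption (mine; the compiled output should be verified) is that an unaligned `[u8; N]` read/write inside a function compiled with the right target feature lowers to the same unaligned vector moves the intrinsics produced, along these lines:

// Sketch, not from the PR: with AVX2 enabled for this function, LLVM is
// expected to lower the pair below to unaligned 32-byte vector moves
// (vmovdqu), matching what _mm256_loadu_si256/_mm256_storeu_si256 emitted.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn copy_chunk_32(src: *const u8, dst: *mut u8) {
    let chunk = core::ptr::read_unaligned(src.cast::<[u8; 32]>());
    core::ptr::write_unaligned(dst.cast::<[u8; 32]>(), chunk);
}
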

#[cfg(test)]
mod test {
use super::*;
@@ -453,15 +361,6 @@ mod test {
let offset_from_end = 17;
let length = 17;

- #[cfg(target_arch = "x86_64")]
- use core::arch::x86_64::{__m128i, __m256i, __m512i};
-
- #[cfg(target_arch = "aarch64")]
- use core::arch::aarch64::uint8x16_t;
-
- #[cfg(target_arch = "wasm32")]
- use core::arch::wasm32::v128;

macro_rules! helper {
($func:expr) => {
let mut buf = test_array();
@@ -476,30 +375,30 @@

#[cfg(target_arch = "x86_64")]
if crate::cpu_features::is_enabled_avx512() {
- helper!(Writer::copy_match_help::<__m512i>);
+ helper!(Writer::copy_match_help::<64>);
Review comment on lines 376 to +378 (PR author); a concrete sketch of the point follows the test below:
Given our changes to where the target feature is applied, this will no longer test a SIMD read/write. But those should be semantically equivalent to an array copy of the corresponding size.

}

#[cfg(target_arch = "x86_64")]
if crate::cpu_features::is_enabled_avx2() {
- helper!(Writer::copy_match_help::<__m256i>);
+ helper!(Writer::copy_match_help::<32>);
}

#[cfg(target_arch = "x86_64")]
if crate::cpu_features::is_enabled_sse() {
- helper!(Writer::copy_match_help::<__m128i>);
+ helper!(Writer::copy_match_help::<16>);
}

#[cfg(target_arch = "aarch64")]
if crate::cpu_features::is_enabled_neon() {
- helper!(Writer::copy_match_help::<uint8x16_t>);
+ helper!(Writer::copy_match_help::<16>);
}

#[cfg(target_arch = "wasm32")]
if crate::cpu_features::is_enabled_simd128() {
- helper!(Writer::copy_match_help::<v128>);
+ helper!(Writer::copy_match_help::<16>);
}

- helper!(Writer::copy_match_help::<u64>);
+ helper!(Writer::copy_match_help::<8>);
}
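
A sketch of the reviewer's equivalence argument as a standalone test (hypothetical, not part of the PR): a chunked `[u8; N]` copy must produce byte-for-byte the same output as a plain slice copy, which is all the SIMD path ever guaranteed.

#[test]
fn array_copy_matches_slice_copy() {
    let src: [u8; 80] = core::array::from_fn(|i| i as u8);
    let mut via_chunks = [0u8; 80];
    let mut via_slice = [0u8; 80];

    // SAFETY: 80 is a multiple of 16, so every 16-byte read and write
    // below stays in bounds.
    unsafe {
        for offset in (0..80).step_by(16) {
            let chunk =
                core::ptr::read_unaligned(src.as_ptr().add(offset).cast::<[u8; 16]>());
            core::ptr::write_unaligned(
                via_chunks.as_mut_ptr().add(offset).cast::<[u8; 16]>(),
                chunk,
            );
        }
    }
    via_slice.copy_from_slice(&src);
    assert_eq!(via_chunks, via_slice);
}
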

#[test]