From 06a7d64ad22b8e8db805ade95416d0b64f9c6440 Mon Sep 17 00:00:00 2001 From: Mahmut Bulut Date: Sun, 15 Nov 2020 05:26:59 +0100 Subject: [PATCH] ARROW-10589 - Implement AVX-512 bit and operation --- rust/arrow/Cargo.toml | 1 + rust/arrow/benches/buffer_bit_ops.rs | 4 +- rust/arrow/src/buffer.rs | 107 ++++++++++++++++++++++--- rust/arrow/src/compute/kernels/cast.rs | 6 +- rust/arrow/src/lib.rs | 3 + rust/arrow/src/util/bit_util.rs | 16 ++++ 6 files changed, 123 insertions(+), 14 deletions(-) diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml index 71445768207..b4f15001e56 100644 --- a/rust/arrow/Cargo.toml +++ b/rust/arrow/Cargo.toml @@ -53,6 +53,7 @@ prettytable-rs = { version = "0.8.0", optional = true } [features] default = [] +avx512 = [] simd = ["packed_simd"] prettyprint = ["prettytable-rs"] diff --git a/rust/arrow/benches/buffer_bit_ops.rs b/rust/arrow/benches/buffer_bit_ops.rs index f905a0cf78c..6dd3b0d5ca7 100644 --- a/rust/arrow/benches/buffer_bit_ops.rs +++ b/rust/arrow/benches/buffer_bit_ops.rs @@ -39,8 +39,8 @@ fn bench_buffer_and(left: &Buffer, right: &Buffer) { } fn bit_ops_benchmark(c: &mut Criterion) { - let left = create_buffer(512); - let right = create_buffer(512); + let left = create_buffer(512 * 10); + let right = create_buffer(512 * 10); c.bench_function("buffer_bit_ops and", |b| { b.iter(|| bench_buffer_and(&left, &right)) }); diff --git a/rust/arrow/src/buffer.rs b/rust/arrow/src/buffer.rs index d5b824e2992..899119696f1 100644 --- a/rust/arrow/src/buffer.rs +++ b/rust/arrow/src/buffer.rs @@ -36,7 +36,7 @@ use crate::memory; use crate::util::bit_chunk_iterator::BitChunks; use crate::util::bit_util; use crate::util::bit_util::ceil; -#[cfg(feature = "simd")] +#[cfg(any(feature = "simd", feature = "avx512"))] use std::borrow::BorrowMut; /// Buffer is a contiguous memory region of fixed size and is aligned at a 64-byte @@ -453,6 +453,23 @@ where result.freeze() } +#[cfg(all(target_arch = "x86_64", feature = "avx512"))] +const AVX512_U8X64_LANES: usize = 64; + +#[cfg(all(target_arch = "x86_64", feature = "avx512"))] +#[target_feature(enable = "avx512f")] +pub(super) unsafe fn avx512_bin_and(left: &[u8], right: &[u8], res: &mut [u8]) { + use core::arch::x86_64::{__m512i, _mm512_and_si512, _mm512_loadu_ps}; + + let l: __m512i = std::mem::transmute(_mm512_loadu_ps(left.as_ptr() as *const _)); + let r: __m512i = std::mem::transmute(_mm512_loadu_ps(right.as_ptr() as *const _)); + let f = _mm512_and_si512(l, r); + let s = &f as *const __m512i as *const u8; + let d = res.get_unchecked_mut(0) as *mut _ as *mut u8; + std::ptr::copy_nonoverlapping(s, d, std::mem::size_of::<__m512i>()); +} + +#[cfg(all(target_arch = "x86_64", feature = "avx512"))] pub(super) fn buffer_bin_and( left: &Buffer, left_offset_in_bits: usize, @@ -460,13 +477,67 @@ pub(super) fn buffer_bin_and( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - // SIMD implementation if available and byte-aligned - #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))] if left_offset_in_bits % 8 == 0 && right_offset_in_bits % 8 == 0 && len_in_bits % 8 == 0 { - return bitwise_bin_op_simd_helper( + let len = len_in_bits / 8; + let left_offset = left_offset_in_bits / 8; + let right_offset = right_offset_in_bits / 8; + + let mut result = MutableBuffer::new(len).with_bitset(len, false); + + let mut left_chunks = left.data()[left_offset..].chunks_exact(AVX512_U8X64_LANES); + let mut right_chunks = + right.data()[right_offset..].chunks_exact(AVX512_U8X64_LANES); + let mut result_chunks = result.data_mut().chunks_exact_mut(AVX512_U8X64_LANES); + + result_chunks + .borrow_mut() + .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut())) + .for_each(|(res, (left, right))| unsafe { + avx512_bin_and(left, right, res); + }); + + result_chunks + .into_remainder() + .iter_mut() + .zip( + left_chunks + .remainder() + .iter() + .zip(right_chunks.remainder().iter()), + ) + .for_each(|(res, (left, right))| { + *res = *left & *right; + }); + + result.freeze() + } else { + bitwise_bin_op_helper( + &left, + left_offset_in_bits, + right, + right_offset_in_bits, + len_in_bits, + |a, b| a & b, + ) + } +} + +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))] +pub(super) fn buffer_bin_and( + left: &Buffer, + left_offset_in_bits: usize, + right: &Buffer, + right_offset_in_bits: usize, + len_in_bits: usize, +) -> Buffer { + if left_offset_in_bits % 8 == 0 + && right_offset_in_bits % 8 == 0 + && len_in_bits % 8 == 0 + { + bitwise_bin_op_simd_helper( &left, left_offset_in_bits / 8, &right, @@ -474,11 +545,8 @@ pub(super) fn buffer_bin_and( len_in_bits / 8, |a, b| a & b, |a, b| a & b, - ); - } - // Default implementation - #[allow(unreachable_code)] - { + ) + } else { bitwise_bin_op_helper( &left, left_offset_in_bits, @@ -490,6 +558,27 @@ pub(super) fn buffer_bin_and( } } +#[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + not(any(feature = "simd", feature = "avx512")) +))] +pub(super) fn buffer_bin_and( + left: &Buffer, + left_offset_in_bits: usize, + right: &Buffer, + right_offset_in_bits: usize, + len_in_bits: usize, +) -> Buffer { + bitwise_bin_op_helper( + &left, + left_offset_in_bits, + right, + right_offset_in_bits, + len_in_bits, + |a, b| a & b, + ) +} + pub(super) fn buffer_bin_or( left: &Buffer, left_offset_in_bits: usize, diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs index f054542b079..bd9340e0e2a 100644 --- a/rust/arrow/src/compute/kernels/cast.rs +++ b/rust/arrow/src/compute/kernels/cast.rs @@ -2742,14 +2742,14 @@ mod tests { (Ok(_), false) => { panic!("Was able to cast array from {:?} to {:?} but can_cast_types reported false", array.data_type(), to_type) - }, + } (Err(e), true) => { panic!("Was not able to cast array from {:?} to {:?} but can_cast_types reported true. \ Error was {:?}", array.data_type(), to_type, e) - }, + } // otherwise it was a match - _=> {}, + _ => {} }; } } diff --git a/rust/arrow/src/lib.rs b/rust/arrow/src/lib.rs index 09c04be4b2c..c1cc7b8449a 100644 --- a/rust/arrow/src/lib.rs +++ b/rust/arrow/src/lib.rs @@ -124,6 +124,9 @@ //! //! The parquet implementation is on a [separate crate](https://crates.io/crates/parquet) +#![cfg_attr(feature = "avx512", feature(stdsimd))] +#![cfg_attr(feature = "avx512", feature(repr_simd))] +#![cfg_attr(feature = "avx512", feature(avx512_target_feature))] #![allow(dead_code)] #![allow(non_camel_case_types)] #![allow(bare_trait_objects)] diff --git a/rust/arrow/src/util/bit_util.rs b/rust/arrow/src/util/bit_util.rs index 269eceb8be2..a7b3cebb382 100644 --- a/rust/arrow/src/util/bit_util.rs +++ b/rust/arrow/src/util/bit_util.rs @@ -364,6 +364,22 @@ mod tests { assert_eq!(ceil(10000000000, 1000000000), 10); } + #[test] + #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "avx512"))] + fn test_bitwise_and_avx512() { + use crate::buffer::avx512_bin_and; + + let buf1 = [0b00110011u8; 64]; + let buf2 = [0b11110000u8; 64]; + let mut buf3 = [0b00000000; 64]; + unsafe { + avx512_bin_and(&buf1, &buf2, &mut buf3); + }; + for i in buf3.iter() { + assert_eq!(&0b00110000u8, i); + } + } + #[test] #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))] fn test_bitwise_and_simd() {