diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 548401ed4201..46a332cc8504 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -253,6 +253,107 @@ impl BooleanBuffer { Some(BooleanBuffer::new(buffer, 0, len_in_bits)) } + /// Create a new [`BooleanBuffer`] by applying the bitwise operation `op` to + /// the relevant bits from two input buffers. + /// + /// This function is faster than applying the operation bit by bit as + /// it processes input buffers in chunks of 64 bits (8 bytes) at a time + /// + /// # Notes: + /// See notes on [Self::from_bitwise_unary_op] + /// + /// # See Also + /// - [`BooleanBuffer::from_bitwise_unary_op`] for unary operations on a single input buffer. + /// - [`apply_bitwise_binary_op`](bit_util::apply_bitwise_binary_op) for in-place binary bitwise operations + /// + /// # Example: Create new [`BooleanBuffer`] from bitwise `AND` of two [`Buffer`]s + /// ``` + /// # use arrow_buffer::{Buffer, BooleanBuffer}; + /// let left = Buffer::from(vec![0b11001100u8, 0b10111010u8]); // 2 bytes = 16 bits + /// let right = Buffer::from(vec![0b10101010u8, 0b11011100u8, 0b11110000u8]); // 3 bytes = 24 bits + /// // AND of the first 12 bits + /// let result = BooleanBuffer::from_bitwise_binary_op( + /// &left, 0, &right, 0, 12, |a, b| a & b + /// ); + /// assert_eq!(result.inner().as_slice(), &[0b10001000u8, 0b00001000u8]); + /// ``` + /// + /// # Example: Create new [`BooleanBuffer`] from bitwise `OR` of two byte slices + /// ``` + /// # use arrow_buffer::BooleanBuffer; + /// let left = [0b11001100u8, 0b10111010u8]; + /// let right = [0b10101010u8, 0b11011100u8]; + /// // OR of bits 4..16 from left and bits 0..12 from right + /// let result = BooleanBuffer::from_bitwise_binary_op( + /// &left, 4, &right, 0, 12, |a, b| a | b + /// ); + /// assert_eq!(result.inner().as_slice(), &[0b10101110u8, 0b00001111u8]); + /// ``` + pub fn from_bitwise_binary_op( + left: impl AsRef<[u8]>, + left_offset_in_bits: usize, + right: impl AsRef<[u8]>, + right_offset_in_bits: usize, + len_in_bits: usize, + mut op: F, + ) -> Self + where + F: FnMut(u64, u64) -> u64, + { + let left = left.as_ref(); + let right = right.as_ref(); + // try fast path for aligned input + // If the underlying buffers are aligned to u64 we can apply the operation directly on the u64 slices + // to improve performance. + if left_offset_in_bits == 0 && right_offset_in_bits == 0 { + unsafe { + let (left_prefix, left_u64s, left_suffix) = left.align_to::(); + let (right_prefix, right_u64s, right_suffix) = right.align_to::(); + // if there is no prefix or suffix, both buffers are aligned and we can do the operation directly + // on u64s + // TODO also handle non empty suffixes by processing them separately + if left_prefix.is_empty() + && right_prefix.is_empty() + && left_suffix.is_empty() + && right_suffix.is_empty() + { + let result_u64s = left_u64s + .iter() + .zip(right_u64s.iter()) + .map(|(l, r)| op(*l, *r)) + .collect::>(); + return BooleanBuffer { + buffer: Buffer::from(result_u64s), + bit_offset: 0, + bit_len: len_in_bits, + }; + } + } + } + let left_chunks = BitChunks::new(left, left_offset_in_bits, len_in_bits); + let right_chunks = BitChunks::new(right, right_offset_in_bits, len_in_bits); + + let chunks = left_chunks + .iter() + .zip(right_chunks.iter()) + .map(|(left, right)| op(left, right)); + // Soundness: `BitChunks` is a `BitChunks` iterator which + // correctly reports its upper bound + let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(chunks) }; + + let remainder_bytes = bit_util::ceil(left_chunks.remainder_len(), 8); + let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits()); + // we are counting its starting from the least significant bit, to to_le_bytes should be correct + let rem = &rem.to_le_bytes()[0..remainder_bytes]; + buffer.extend_from_slice(rem); + + BooleanBuffer { + buffer: Buffer::from(buffer), + bit_offset: 0, + bit_len: len_in_bits, + } + } + /// Returns the number of set bits in this buffer pub fn count_set_bits(&self) -> usize { self.buffer @@ -655,4 +756,42 @@ mod tests { assert_eq!(result, expected); } } + + #[test] + fn test_from_bitwise_binary_op() { + // pick random boolean inputs + let input_bools_left = (0..1024) + .map(|_| rand::random::()) + .collect::>(); + let input_bools_right = (0..1024) + .map(|_| rand::random::()) + .collect::>(); + let input_buffer_left = BooleanBuffer::from(&input_bools_left[..]); + let input_buffer_right = BooleanBuffer::from(&input_bools_right[..]); + + for left_offset in 0..200 { + for right_offset in [0, 4, 5, 17, 33, 24, 45, 64, 65, 100, 200] { + for len_offset in [0, 1, 44, 100, 256, 300, 512] { + let len = 1024 - len_offset - left_offset.max(right_offset); // ensure we don't go out of bounds + // compute with AND + let result = BooleanBuffer::from_bitwise_binary_op( + input_buffer_left.values(), + left_offset, + input_buffer_right.values(), + right_offset, + len, + |a, b| a & b, + ); + // compute directly from bools + let expected = input_bools_left[left_offset..] + .iter() + .zip(&input_bools_right[right_offset..]) + .take(len) + .map(|(a, b)| *a & *b) + .collect::(); + assert_eq!(result, expected); + } + } + } + } } diff --git a/arrow-buffer/src/buffer/ops.rs b/arrow-buffer/src/buffer/ops.rs index 05593504b1cf..5219caff07af 100644 --- a/arrow-buffer/src/buffer/ops.rs +++ b/arrow-buffer/src/buffer/ops.rs @@ -61,35 +61,28 @@ where /// Apply a bitwise operation `op` to two inputs and return the result as a Buffer. /// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. +/// +/// This is a comment for testing purposes pub fn bitwise_bin_op_helper( left: &Buffer, left_offset_in_bits: usize, right: &Buffer, right_offset_in_bits: usize, len_in_bits: usize, - mut op: F, + op: F, ) -> Buffer where F: FnMut(u64, u64) -> u64, { - let left_chunks = left.bit_chunks(left_offset_in_bits, len_in_bits); - let right_chunks = right.bit_chunks(right_offset_in_bits, len_in_bits); - - let chunks = left_chunks - .iter() - .zip(right_chunks.iter()) - .map(|(left, right)| op(left, right)); - // Soundness: `BitChunks` is a `BitChunks` iterator which - // correctly reports its upper bound - let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(chunks) }; - - let remainder_bytes = ceil(left_chunks.remainder_len(), 8); - let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits()); - // we are counting its starting from the least significant bit, to to_le_bytes should be correct - let rem = &rem.to_le_bytes()[0..remainder_bytes]; - buffer.extend_from_slice(rem); - - buffer.into() + BooleanBuffer::from_bitwise_binary_op( + left, + left_offset_in_bits, + right, + right_offset_in_bits, + len_in_bits, + op, + ) + .into_inner() } /// Apply a bitwise operation `op` to one input and return the result as a Buffer.