-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Speed up unary not kernel by 50%, add BooleanBuffer::from_bitwise_unary
#8996
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
3355044
Speed up unary not kernel / BooleanBuffer::from_bitwise_unary
alamb 0b7d796
Add boolean benchmark for byte aligned slices
alamb ef81fcf
Improve comments and reduce nesting
alamb 469f2ad
Use push_unchecked for consistency
alamb 1bd1321
Merge remote-tracking branch 'apache/main' into alamb/unary_op
alamb 313f2dc
Rename variables to avoid the term `left`
alamb e216bb8
Use explicit bitmask to check alignment
alamb 65d763b
Update arrow-buffer/src/buffer/boolean.rs
alamb File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,10 +26,10 @@ use std::ops::{BitAnd, BitOr, BitXor, Not}; | |
|
|
||
| /// A slice-able [`Buffer`] containing bit-packed booleans | ||
| /// | ||
| /// `BooleanBuffer`s can be creating using [`BooleanBufferBuilder`] | ||
| /// `BooleanBuffer`s can be modified using [`BooleanBufferBuilder`] | ||
| /// | ||
| /// # See Also | ||
| /// | ||
| /// # See Also | ||
| /// * [`NullBuffer`] for representing null values in Arrow arrays | ||
| /// | ||
| /// [`NullBuffer`]: crate::NullBuffer | ||
|
|
@@ -96,12 +96,128 @@ impl BooleanBuffer { | |
| Self::new(buffer.into(), 0, len) | ||
| } | ||
|
|
||
| /// Create a new [`BooleanBuffer`] by copying the relevant bits from an | ||
| /// input buffer. | ||
| /// | ||
| /// # Notes: | ||
| /// * The new `BooleanBuffer` has zero offset, even if `offset_in_bits` is non-zero | ||
| /// | ||
| /// # Example: Create a new [`BooleanBuffer`] copying a bit slice from in input slice | ||
| /// ``` | ||
| /// # use arrow_buffer::BooleanBuffer; | ||
| /// let input = [0b11001100u8, 0b10111010u8]; | ||
| /// // // Copy bits 4..16 from input | ||
| /// let result = BooleanBuffer::from_bits(&input, 4, 12); | ||
| /// assert_eq!(result.values(), &[0b10101100u8, 0b00001011u8]); | ||
| pub fn from_bits(src: impl AsRef<[u8]>, offset_in_bits: usize, len_in_bits: usize) -> Self { | ||
| Self::from_bitwise_unary_op(src, offset_in_bits, len_in_bits, |a| a) | ||
| } | ||
|
|
||
| /// Create a new [`BooleanBuffer`] by applying the bitwise operation to `op` | ||
| /// to an input buffer. | ||
| /// | ||
| /// This function is faster than applying the operation bit by bit as | ||
| /// it processes input buffers in chunks of 64 bits (8 bytes) at a time | ||
| /// | ||
| /// # Notes: | ||
| /// * `op` takes a single `u64` inputs and produces one `u64` output. | ||
| /// * `op` must only apply bitwise operations | ||
| /// on the relevant bits; the input `u64` may contain irrelevant bits | ||
| /// and may be processed differently on different endian architectures. | ||
| /// * The output always has zero offset | ||
| /// | ||
| /// # See Also | ||
| /// - [`apply_bitwise_unary_op`](bit_util::apply_bitwise_unary_op) for in-place unary bitwise operations | ||
| /// | ||
| /// # Example: Create new [`BooleanBuffer`] from bitwise `NOT` of an input [`Buffer`] | ||
| /// ``` | ||
| /// # use arrow_buffer::BooleanBuffer; | ||
| /// let input = [0b11001100u8, 0b10111010u8]; // 2 bytes = 16 bits | ||
| /// // NOT of the first 12 bits | ||
| /// let result = BooleanBuffer::from_bitwise_unary_op( | ||
| /// &input, 0, 12, |a| !a | ||
| /// ); | ||
| /// assert_eq!(result.values(), &[0b00110011u8, 0b11110101u8]); | ||
| /// ``` | ||
| pub fn from_bitwise_unary_op<F>( | ||
| src: impl AsRef<[u8]>, | ||
| offset_in_bits: usize, | ||
| len_in_bits: usize, | ||
| mut op: F, | ||
| ) -> Self | ||
| where | ||
| F: FnMut(u64) -> u64, | ||
| { | ||
| // try fast path for aligned input | ||
| if offset_in_bits & 0x7 == 0 { | ||
| // align to byte boundary | ||
| let aligned = &src.as_ref()[offset_in_bits / 8..]; | ||
| if let Some(result) = | ||
| Self::try_from_aligned_bitwise_unary_op(aligned, len_in_bits, &mut op) | ||
| { | ||
| return result; | ||
| } | ||
| } | ||
|
|
||
| let chunks = BitChunks::new(src.as_ref(), offset_in_bits, len_in_bits); | ||
| let mut result = MutableBuffer::with_capacity(chunks.num_u64s() * 8); | ||
| for chunk in chunks.iter() { | ||
| // SAFETY: reserved enough capacity above, (exactly num_u64s() | ||
| // items) and we assume `BitChunks` correctly reports upper bound | ||
| unsafe { | ||
| result.push_unchecked(op(chunk)); | ||
| } | ||
| } | ||
| if chunks.remainder_len() > 0 { | ||
| debug_assert!(result.capacity() >= result.len() + 8); // should not reallocate | ||
| // SAFETY: reserved enough capacity above, (exactly num_u64s() | ||
| // items) and we assume `BitChunks` correctly reports upper bound | ||
| unsafe { | ||
| result.push_unchecked(op(chunks.remainder_bits())); | ||
| } | ||
| // Just pushed one u64, which may have trailing zeros | ||
| result.truncate(chunks.num_bytes()); | ||
| } | ||
|
|
||
| let buffer = Buffer::from(result); | ||
| BooleanBuffer { | ||
| buffer, | ||
| offset: 0, | ||
| len: len_in_bits, | ||
| } | ||
| } | ||
|
|
||
| /// Fast path for [`Self::from_bitwise_unary_op`] when input is aligned to | ||
| /// 8-byte (64-bit) boundaries | ||
| /// | ||
| /// Returns None if the fast path cannot be taken | ||
| fn try_from_aligned_bitwise_unary_op<F>( | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BTW I wrote a version of this code to handle works for byte aligned, but it actually seems to have made performance worse, so I am going to update the comments and leave it this way What I tried /// Like [`Self::from_bitwise_unary_op`] but optimized for the case where the
/// input is aligned to byte boundaries
fn try_from_aligned_bitwise_unary_op<F>(
left: &[u8],
len_in_bits: usize,
op: &mut F,
) -> Option<Self>
where
F: FnMut(u64) -> u64,
{
// safety: all valid bytes are valid u64s
let (left_prefix, left_u64s, left_suffix) = unsafe { left.align_to::<u64>() };
// if there is no prefix or suffix, the buffer is aligned and we can do
// the operation directly on u64s
if left_prefix.is_empty() && left_suffix.is_empty() {
let result_u64s: Vec<u64> = left_u64s.iter().map(|l| op(*l)).collect();
let buffer = Buffer::from(result_u64s);
return Some(BooleanBuffer::new(buffer, 0, len_in_bits));
}
let mut result = MutableBuffer::with_capacity(
left_prefix.len() + left_u64s.len() * 8 + left_suffix.len(),
);
let prefix_u64 = op(Self::byte_slice_to_u64(left_prefix));
result.extend_from_slice(&prefix_u64.to_be_bytes()[0..left_prefix.len()]);
assert!(result.capacity() >= result.len() + left_u64s.len() * 8);
for &left in left_u64s.iter() {
// SAFETY: we asserted there is enough capacity above
unsafe {
result.push_unchecked(op(left));
}
}
let suffix_u64 = op(Self::byte_slice_to_u64(left_suffix));
result.extend_from_slice(&suffix_u64.to_be_bytes()[0..left_suffix.len()]);
Some(BooleanBuffer::new(result.into(), 0, len_in_bits))
}diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs
index 97674c18843..285888b3a7c 100644
--- a/arrow-buffer/src/buffer/boolean.rs
+++ b/arrow-buffer/src/buffer/boolean.rs
@@ -18,10 +18,11 @@
use crate::bit_chunk_iterator::BitChunks;
use crate::bit_iterator::{BitIndexIterator, BitIndexU32Iterator, BitIterator, BitSliceIterator};
use crate::{
- BooleanBufferBuilder, Buffer, MutableBuffer, bit_util, buffer_bin_and, buffer_bin_or,
- buffer_bin_xor, buffer_unary_not,
+ BooleanBufferBuilder, Buffer, MutableBuffer, ToByteSlice, bit_util, buffer_bin_and,
+ buffer_bin_or, buffer_bin_xor, buffer_unary_not,
};
+use crate::bit_util::get_remainder_bits;
use std::ops::{BitAnd, BitOr, BitXor, Not};
/// A slice-able [`Buffer`] containing bit-packed booleans
@@ -200,14 +201,37 @@ impl BooleanBuffer {
// the operation directly on u64s
if left_prefix.is_empty() && left_suffix.is_empty() {
let result_u64s: Vec<u64> = left_u64s.iter().map(|l| op(*l)).collect();
- Some(BooleanBuffer::new(
- Buffer::from(result_u64s),
- 0,
- len_in_bits,
- ))
- } else {
- None
+ let buffer = Buffer::from(result_u64s);
+ return Some(BooleanBuffer::new(buffer, 0, len_in_bits));
}
+
+ let mut result = MutableBuffer::with_capacity(
+ left_prefix.len() + left_u64s.len() * 8 + left_suffix.len(),
+ );
+ let prefix_u64 = op(Self::byte_slice_to_u64(left_prefix));
+
+ result.extend_from_slice(&prefix_u64.to_be_bytes()[0..left_prefix.len()]);
+
+ assert!(result.capacity() >= result.len() + left_u64s.len() * 8);
+ for &left in left_u64s.iter() {
+ // SAFETY: we asserted there is enough capacity above
+ unsafe {
+ result.push_unchecked(op(left));
+ }
+ }
+
+ let suffix_u64 = op(Self::byte_slice_to_u64(left_suffix));
+ result.extend_from_slice(&suffix_u64.to_be_bytes()[0..left_suffix.len()]);
+
+ Some(BooleanBuffer::new(result.into(), 0, len_in_bits))
+ }
+
+ /// convert the bytes into a u64 suitable for opeartion
+ fn byte_slice_to_u64(src: &[u8]) -> u64 {
+ let num_bytes = src.len();
+ let mut bytes = [0u8; 8];
+ bytes[0..num_bytes].copy_from_slice(src);
+ u64::from_be_bytes(bytes)
}
/// Returns the number of set bits in this buffer
diff --git a/arrow-buffer/src/util/bit_util.rs b/arrow-buffer/src/util/bit_util.rsThis PR When I tried fancier code for byte alignment: |
||
| src: &[u8], | ||
| len_in_bits: usize, | ||
| op: &mut F, | ||
| ) -> Option<Self> | ||
| where | ||
| F: FnMut(u64) -> u64, | ||
| { | ||
| // Safety: all valid bytes are valid u64s | ||
| let (prefix, aligned_u6us, suffix) = unsafe { src.align_to::<u64>() }; | ||
| if !(prefix.is_empty() && suffix.is_empty()) { | ||
| // Couldn't make this case any faster than the default path, see | ||
| // https://github.com/apache/arrow-rs/pull/8996/changes#r2620022082 | ||
| return None; | ||
| } | ||
| // the buffer is word (64 bit) aligned, so use optimized Vec code. | ||
| let result_u64s: Vec<u64> = aligned_u6us.iter().map(|l| op(*l)).collect(); | ||
| let buffer = Buffer::from(result_u64s); | ||
| Some(BooleanBuffer::new(buffer, 0, len_in_bits)) | ||
| } | ||
|
|
||
| /// Returns the number of set bits in this buffer | ||
| pub fn count_set_bits(&self) -> usize { | ||
| self.buffer.count_set_bits_offset(self.offset, self.len) | ||
| } | ||
|
|
||
| /// Returns a `BitChunks` instance which can be used to iterate over | ||
| /// Returns a [`BitChunks`] instance which can be used to iterate over | ||
| /// this buffer's bits in `u64` chunks | ||
| #[inline] | ||
| pub fn bit_chunks(&self) -> BitChunks<'_> { | ||
|
|
@@ -437,4 +553,42 @@ mod tests { | |
| assert_eq!(buf.values().len(), 1); | ||
| assert!(buf.value(0)); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_from_bitwise_unary_op() { | ||
| // Use 1024 boolean values so that at least some of the tests cover multiple u64 chunks and | ||
| // perfect alignment | ||
| let input_bools = (0..1024) | ||
| .map(|_| rand::random::<bool>()) | ||
| .collect::<Vec<bool>>(); | ||
| let input_buffer = BooleanBuffer::from(&input_bools[..]); | ||
|
|
||
| // Note ensure we test offsets over 100 to cover multiple u64 chunks | ||
| for offset in 0..1024 { | ||
| let result = BooleanBuffer::from_bitwise_unary_op( | ||
| input_buffer.values(), | ||
| offset, | ||
| input_buffer.len() - offset, | ||
| |a| !a, | ||
| ); | ||
| let expected = input_bools[offset..] | ||
| .iter() | ||
| .map(|b| !*b) | ||
| .collect::<BooleanBuffer>(); | ||
| assert_eq!(result, expected); | ||
| } | ||
|
|
||
| // Also test when the input doesn't cover the entire buffer | ||
| for offset in 0..512 { | ||
| let len = 512 - offset; // fixed length less than total | ||
| let result = | ||
| BooleanBuffer::from_bitwise_unary_op(input_buffer.values(), offset, len, |a| !a); | ||
| let expected = input_bools[offset..] | ||
| .iter() | ||
| .take(len) | ||
| .map(|b| !*b) | ||
| .collect::<BooleanBuffer>(); | ||
| assert_eq!(result, expected); | ||
| } | ||
| } | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.