Perf: improve sort via partition_validity to use fast path for bit map scan (up to 30% faster)
#7962
Merged
Commits (9)
- d8ffb41 Perf: use fast path for bit map scan for partition_validity (zhuqi-lucas)
- dddfd98 Merge remote-tracking branch 'upstream/main' into fast_path_for_bit_m… (zhuqi-lucas)
- d916497 fmt (zhuqi-lucas)
- c7a0ae9 polish comments (zhuqi-lucas)
- 47dbdab address comments (zhuqi-lucas)
- 4ea7810 Merge remote-tracking branch 'upstream/main' into fast_path_for_bit_m… (zhuqi-lucas)
- 556e673 fix testing (zhuqi-lucas)
- 3d6bcea address more testing cases (zhuqi-lucas)
- 57d319c Merge remote-tracking branch 'apache/main' into fast_path_for_bit_map… (alamb)
@@ -231,6 +231,63 @@ impl Iterator for BitIndexIterator<'_> {
    }
}

/// An iterator of u32 whose index in a provided bitmask is true
/// Respects arbitrary offsets and slice lead/trail padding exactly like BitIndexIterator
#[derive(Debug)]
pub struct BitIndexU32Iterator<'a> {
    curr: u64,
    chunk_offset: i64,
    iter: UnalignedBitChunkIterator<'a>,
}

impl<'a> BitIndexU32Iterator<'a> {
    /// Create a new [BitIndexU32Iterator] from the provided buffer,
    /// offset and len in bits.
    pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
        // Build the aligned chunks (including prefix/suffix masked)
        let chunks = UnalignedBitChunk::new(buffer, offset, len);
        let mut iter = chunks.iter();

        // First 64-bit word (masked for lead padding), or 0 if empty
        let curr = iter.next().unwrap_or(0);
        // Negative lead padding ensures the first bit in curr maps to index 0
        let chunk_offset = -(chunks.lead_padding() as i64);

        Self {
            curr,
            chunk_offset,
            iter,
        }
    }
}

impl<'a> Iterator for BitIndexU32Iterator<'a> {
    type Item = u32;

    #[inline(always)]
    fn next(&mut self) -> Option<u32> {
        loop {
            if self.curr != 0 {
                // Position of least-significant set bit
                let tz = self.curr.trailing_zeros();
                // Clear that bit
                self.curr &= self.curr - 1;
                // Return global index = chunk_offset + tz
                return Some((self.chunk_offset + tz as i64) as u32);
            }
            // Advance to next 64-bit chunk
            match self.iter.next() {
                Some(next_chunk) => {
                    // Move offset forward by 64 bits
                    self.chunk_offset += 64;
                    self.curr = next_chunk;
                }
                None => return None,
            }
        }
    }
}

/// Calls the provided closure for each index in the provided null mask that is set,
/// using an adaptive strategy based on the null count
///
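For context, a minimal usage sketch (not part of the diff; it assumes `BitIndexU32Iterator` is exported from `arrow_buffer::bit_iterator` alongside `BitIndexIterator`): the iterator yields the `u32` positions of set bits directly, so callers that want `u32` indices avoid a per-element `usize` to `u32` cast.

```rust
// Hypothetical example, not from the PR: assumes the new iterator is
// re-exported from arrow_buffer::bit_iterator like BitIndexIterator is.
use arrow_buffer::bit_iterator::BitIndexU32Iterator;

fn main() {
    // Bits 1 and 4 are set in the first byte, bits 8 and 9 in the second.
    let mask: &[u8] = &[0b0001_0010, 0b0000_0011];

    // Yields u32 positions directly, no usize -> u32 cast at the call site.
    let set_positions: Vec<u32> = BitIndexU32Iterator::new(mask, 0, 16).collect();
    assert_eq!(set_positions, vec![1, 4, 8, 9]);
}
```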
@@ -323,4 +380,110 @@ mod tests {
        let mask = &[223, 23];
        BitIterator::new(mask, 17, 0);
    }

zhuqi-lucas (author), on the tests below: These tests make sure the u32 index results are the same as the original usize results.

    #[test]
    fn test_bit_index_u32_iterator_basic() {
        let mask = &[0b00010010, 0b00100011];

        let result: Vec<u32> = BitIndexU32Iterator::new(mask, 0, 16).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, 0, 16)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);

        let result: Vec<u32> = BitIndexU32Iterator::new(mask, 4, 8).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, 4, 8)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);

        let result: Vec<u32> = BitIndexU32Iterator::new(mask, 10, 4).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, 10, 4)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);

        let result: Vec<u32> = BitIndexU32Iterator::new(mask, 0, 0).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, 0, 0)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_bit_index_u32_iterator_all_set() {
        let mask = &[0xFF, 0xFF];
        let result: Vec<u32> = BitIndexU32Iterator::new(mask, 0, 16).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, 0, 16)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_bit_index_u32_iterator_none_set() {
        let mask = &[0x00, 0x00];
        let result: Vec<u32> = BitIndexU32Iterator::new(mask, 0, 16).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, 0, 16)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_bit_index_u32_cross_chunk() {
        let mut buf = vec![0u8; 16];
        for bit in 60..68 {
            let byte = (bit / 8) as usize;
            let bit_in_byte = bit % 8;
            buf[byte] |= 1 << bit_in_byte;
        }
        let offset = 58;
        let len = 10;

        let result: Vec<u32> = BitIndexU32Iterator::new(&buf, offset, len).collect();
        let expected: Vec<u32> = BitIndexIterator::new(&buf, offset, len)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_bit_index_u32_unaligned_offset() {
        let mask = &[0b0110_1100, 0b1010_0000];
        let offset = 2;
        let len = 12;

        let result: Vec<u32> = BitIndexU32Iterator::new(mask, offset, len).collect();
        let expected: Vec<u32> = BitIndexIterator::new(mask, offset, len)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_bit_index_u32_long_all_set() {
        let len = 200;
        let num_bytes = len / 8 + if len % 8 != 0 { 1 } else { 0 };
        let bytes = vec![0xFFu8; num_bytes];

        let result: Vec<u32> = BitIndexU32Iterator::new(&bytes, 0, len).collect();
        let expected: Vec<u32> = BitIndexIterator::new(&bytes, 0, len)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_bit_index_u32_none_set() {
        let len = 50;
        let num_bytes = len / 8 + if len % 8 != 0 { 1 } else { 0 };
        let bytes = vec![0u8; num_bytes];

        let result: Vec<u32> = BitIndexU32Iterator::new(&bytes, 0, len).collect();
        let expected: Vec<u32> = BitIndexIterator::new(&bytes, 0, len)
            .map(|i| i as u32)
            .collect();
        assert_eq!(result, expected);
    }
}
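To connect the new iterator back to the PR title, here is a rough sketch of a `partition_validity`-style split of row indices into valid and null groups, which is what the sort fast path needs. The function name `split_valid_null` and its shape are illustrative assumptions, not the actual code in arrow-ord; it only shows how the `u32` iterator can drive such a split.

```rust
// Illustrative only: assumes BitIndexU32Iterator is exported from
// arrow_buffer::bit_iterator; split_valid_null is a hypothetical helper.
use arrow_buffer::bit_iterator::BitIndexU32Iterator;

/// Returns (valid_indices, null_indices) for a validity bitmap covering `len` rows.
fn split_valid_null(validity: &[u8], offset: usize, len: usize) -> (Vec<u32>, Vec<u32>) {
    // Set bits mark valid rows; collect them directly as u32.
    let valid: Vec<u32> = BitIndexU32Iterator::new(validity, offset, len).collect();

    // Null rows are the complement of the valid set.
    let mut null = Vec::with_capacity(len - valid.len());
    let mut valid_iter = valid.iter().copied().peekable();
    for row in 0..len as u32 {
        if valid_iter.peek() == Some(&row) {
            valid_iter.next();
        } else {
            null.push(row);
        }
    }
    (valid, null)
}

fn main() {
    let validity = &[0b0000_0101u8]; // rows 0 and 2 valid, rows 1 and 3 null
    let (valid, null) = split_valid_null(validity, 0, 4);
    assert_eq!(valid, vec![0, 2]);
    assert_eq!(null, vec![1, 3]);
}
```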
jhorstmann: Is this implementation somehow more performant than using the existing `BitIndexIterator` and casting its items to `u32`? The only difference I see is in the masking of the lowest bit, `^= 1 << bit_pos` vs `&= self.curr - 1`, but I think LLVM would know that those are equivalent. If it makes a difference, then we should adjust `BitIndexIterator` the same way.
zhuqi-lucas (author): Thank you @jhorstmann, good question. It is actually more performant than using the existing `BitIndexIterator` because it yields `u32` directly. `BitIndexIterator` yields `usize`, so when we use it we have to cast each item from `usize` to `u32`, and in my testing that extra cast caused the slowdown. As for `^= 1 << bit_pos` vs `&= self.curr - 1`, the performance is almost the same and shows no measurable difference, so either works. I may change this to a macro so it reads more clearly.
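As a side note on the two bit-clearing idioms discussed above, here is a self-contained check (not from the PR) that `x &= x - 1` and `x ^= 1 << x.trailing_zeros()` clear the same bit for any non-zero word, which is why the two variants benchmark the same:

```rust
// Standalone demonstration that the two idioms both clear only the
// least-significant set bit of a non-zero u64.
fn main() {
    let samples: [u64; 5] = [0b1, 0b1001_0110, 0xDEAD_BEEF, 1 << 63, u64::MAX];
    for &x in &samples {
        let cleared_sub = x & (x - 1);                   // clear lowest set bit
        let cleared_xor = x ^ (1 << x.trailing_zeros()); // clear the same bit
        assert_eq!(cleared_sub, cleared_xor);
    }
    println!("both idioms clear the lowest set bit identically");
}
```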
alamb: I will do a test to compare the performance too.

Update: made #7979 and I queued up benchmark runs.
zhuqi-lucas (author): Thank you @alamb!
alamb: The conclusion from #7979 is that the u32-specific iterator is worth a 3-5% improvement: #7979 (comment). Given that, I think this PR makes sense to me.