7 changes: 6 additions & 1 deletion arrow-buffer/src/buffer/boolean.rs
@@ -16,7 +16,7 @@
// under the License.

use crate::bit_chunk_iterator::BitChunks;
use crate::bit_iterator::{BitIndexIterator, BitIterator, BitSliceIterator};
use crate::bit_iterator::{BitIndexIterator, BitIndexU32Iterator, BitIterator, BitSliceIterator};
use crate::{
bit_util, buffer_bin_and, buffer_bin_or, buffer_bin_xor, buffer_unary_not,
BooleanBufferBuilder, Buffer, MutableBuffer,
@@ -208,6 +208,11 @@ impl BooleanBuffer {
BitIndexIterator::new(self.values(), self.offset, self.len)
}

/// Returns a [`BitIndexU32Iterator`] over set bit positions as `u32`, avoiding any usize -> u32 conversion
pub fn set_indices_u32(&self) -> BitIndexU32Iterator<'_> {
BitIndexU32Iterator::new(self.values(), self.offset, self.len)
}

/// Returns a [`BitSliceIterator`] yielding contiguous ranges of set bits
pub fn set_slices(&self) -> BitSliceIterator<'_> {
BitSliceIterator::new(self.values(), self.offset, self.len)
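To see the new method in context, here is a minimal usage sketch against the public arrow-buffer API. It assumes `BooleanBufferBuilder` and the existing `set_indices` method behave as in current arrow-rs releases; the bit pattern is purely illustrative.

```rust
use arrow_buffer::BooleanBufferBuilder;

fn main() {
    // Build a BooleanBuffer with set bits at positions 0, 3, 4 and 7.
    let mut builder = BooleanBufferBuilder::new(8);
    for v in [true, false, false, true, true, false, false, true] {
        builder.append(v);
    }
    let buffer = builder.finish();

    // The new iterator yields set-bit positions directly as u32,
    // so no usize -> u32 cast is needed at the call site.
    let indices: Vec<u32> = buffer.set_indices_u32().collect();
    assert_eq!(indices, vec![0, 3, 4, 7]);

    // Equivalent to the existing usize-based iterator plus an explicit cast.
    let via_usize: Vec<u32> = buffer.set_indices().map(|i| i as u32).collect();
    assert_eq!(indices, via_usize);
}
```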
163 changes: 163 additions & 0 deletions arrow-buffer/src/util/bit_iterator.rs
@@ -231,6 +231,63 @@ impl Iterator for BitIndexIterator<'_> {
}
}

/// An iterator yielding the indices of set bits in a provided bitmask as `u32`.
/// Respects arbitrary offsets and slice lead/trail padding, exactly like [`BitIndexIterator`].
#[derive(Debug)]
pub struct BitIndexU32Iterator<'a> {
curr: u64,
chunk_offset: i64,
iter: UnalignedBitChunkIterator<'a>,
}

impl<'a> BitIndexU32Iterator<'a> {
/// Create a new [BitIndexU32Iterator] from the provided buffer,
/// offset and len in bits.
pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
// Build the aligned chunks (including prefix/suffix masked)
let chunks = UnalignedBitChunk::new(buffer, offset, len);
let mut iter = chunks.iter();

// First 64-bit word (masked for lead padding), or 0 if empty
let curr = iter.next().unwrap_or(0);
// Negative lead padding ensures the first bit in curr maps to index 0
let chunk_offset = -(chunks.lead_padding() as i64);

Self {
curr,
chunk_offset,
iter,
}
}
}

impl<'a> Iterator for BitIndexU32Iterator<'a> {
Contributor (jhorstmann):

Is this implementation somehow more performant than using the existing `BitIndexIterator` and casting its items to `u32`? The only difference I see is in the masking of the lowest bit, `^= 1 << bit_pos` vs `&= self.curr - 1`, but I think LLVM would know that those are equivalent. If it makes a difference, then we should adjust `BitIndexIterator` the same way.

Contributor Author:

Thank you @jhorstmann, good question. It is actually more performant than using the existing BitIndexIterator, because this iterator produces u32 directly. BitIndexIterator yields usize, so each index has to be cast from usize to u32, and in my testing that cast was what caused the slowdown.

As for `^= 1 << bit_pos` vs `&= self.curr - 1`, the performance is almost the same; it makes no measurable difference, so either works (see the sketch after this thread).

I may change this to a macro so it reads more clearly.

Contributor (@alamb), Jul 23, 2025:

I will do a test to compare the performance too

Update: made #7979 and I queued up benchmark runs

Contributor Author:

Thank you @alamb!

> I will do a test to compare the performance too
>
> Update: made #7979 and I queued up benchmark runs

Contributor:

Conclusion from #7979 is that the u32-specific iterator is worth a 3-5% improvement: #7979 (comment)

Given that, I think this PR makes sense to me.
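For reference, a small standalone sketch of the two bit-clearing idioms discussed in this thread; both remove the least-significant set bit of a word, and the example value is only an illustration, not taken from the benchmarks.

```rust
fn main() {
    // Bits set at positions 2, 4, 5 and 7.
    let word: u64 = 0b1011_0100;

    // Idiom used by BitIndexIterator: locate the lowest set bit and toggle it.
    let mut a = word;
    let bit_pos = a.trailing_zeros();
    a ^= 1u64 << bit_pos;

    // Idiom used by BitIndexU32Iterator: clear the lowest set bit arithmetically.
    let mut b = word;
    b &= b - 1;

    // Both forms clear exactly the least-significant set bit.
    assert_eq!(a, b);
    assert_eq!(a, 0b1011_0000);
}
```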

type Item = u32;

#[inline(always)]
fn next(&mut self) -> Option<u32> {
loop {
if self.curr != 0 {
// Position of least-significant set bit
let tz = self.curr.trailing_zeros();
// Clear that bit
self.curr &= self.curr - 1;
// Return global index = chunk_offset + tz
return Some((self.chunk_offset + tz as i64) as u32);
}
// Advance to next 64-bit chunk
match self.iter.next() {
Some(next_chunk) => {
// Move offset forward by 64 bits
self.chunk_offset += 64;
self.curr = next_chunk;
}
None => return None,
}
}
}
}

/// Calls the provided closure for each index in the provided null mask that is set,
/// using an adaptive strategy based on the null count
///
@@ -323,4 +380,110 @@ mod tests {
let mask = &[223, 23];
BitIterator::new(mask, 17, 0);
}

#[test]
fn test_bit_index_u32_iterator_basic() {
Contributor Author:

These tests make sure the u32 iterator yields the same indices as the original usize-based BitIndexIterator.

let mask = &[0b00010010, 0b00100011];

let result: Vec<u32> = BitIndexU32Iterator::new(mask, 0, 16).collect();
let expected: Vec<u32> = BitIndexIterator::new(mask, 0, 16)
.map(|i| i as u32)
.collect();
assert_eq!(result, expected);

let result: Vec<u32> = BitIndexU32Iterator::new(mask, 4, 8).collect();
let expected: Vec<u32> = BitIndexIterator::new(mask, 4, 8)
.map(|i| i as u32)
.collect();
assert_eq!(result, expected);

let result: Vec<u32> = BitIndexU32Iterator::new(mask, 10, 4).collect();
let expected: Vec<u32> = BitIndexIterator::new(mask, 10, 4)
.map(|i| i as u32)
.collect();
assert_eq!(result, expected);

let result: Vec<u32> = BitIndexU32Iterator::new(mask, 0, 0).collect();
let expected: Vec<u32> = BitIndexIterator::new(mask, 0, 0)
.map(|i| i as u32)
.collect();
assert_eq!(result, expected);
}

#[test]
fn test_bit_index_u32_iterator_all_set() {
let mask = &[0xFF, 0xFF];
let result: Vec<u32> = BitIndexU32Iterator::new(mask, 0, 16).collect();
let expected: Vec<u32> = BitIndexIterator::new(mask, 0, 16)
.map(|i| i as u32)
.collect();
assert_eq!(result, expected);
}

#[test]
fn test_bit_index_u32_iterator_none_set() {
let mask = &[0x00, 0x00];
let result: Vec<u32> = BitIndexU32Iterator::new(mask, 0, 16).collect();
let expected: Vec<u32> = BitIndexIterator::new(mask, 0, 16)
.map(|i| i as u32)
.collect();
assert_eq!(result, expected);
}

#[test]
fn test_bit_index_u32_cross_chunk() {
let mut buf = vec![0u8; 16];
for bit in 60..68 {
let byte = (bit / 8) as usize;
let bit_in_byte = bit % 8;
buf[byte] |= 1 << bit_in_byte;
}
let offset = 58;
let len = 10;

let result: Vec<u32> = BitIndexU32Iterator::new(&buf, offset, len).collect();
let expected: Vec<u32> = BitIndexIterator::new(&buf, offset, len)
.map(|i| i as u32)
.collect();
assert_eq!(result, expected);
}

#[test]
fn test_bit_index_u32_unaligned_offset() {
let mask = &[0b0110_1100, 0b1010_0000];
let offset = 2;
let len = 12;

let result: Vec<u32> = BitIndexU32Iterator::new(mask, offset, len).collect();
let expected: Vec<u32> = BitIndexIterator::new(mask, offset, len)
.map(|i| i as u32)
.collect();
assert_eq!(result, expected);
}

#[test]
fn test_bit_index_u32_long_all_set() {
let len = 200;
let num_bytes = len / 8 + if len % 8 != 0 { 1 } else { 0 };
let bytes = vec![0xFFu8; num_bytes];

let result: Vec<u32> = BitIndexU32Iterator::new(&bytes, 0, len).collect();
let expected: Vec<u32> = BitIndexIterator::new(&bytes, 0, len)
.map(|i| i as u32)
.collect();
assert_eq!(result, expected);
}

#[test]
fn test_bit_index_u32_none_set() {
let len = 50;
let num_bytes = len / 8 + if len % 8 != 0 { 1 } else { 0 };
let bytes = vec![0u8; num_bytes];

let result: Vec<u32> = BitIndexU32Iterator::new(&bytes, 0, len).collect();
let expected: Vec<u32> = BitIndexIterator::new(&bytes, 0, len)
.map(|i| i as u32)
.collect();
assert_eq!(result, expected);
}
}
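As a complement to the tests above, a short sketch that uses `BitIndexU32Iterator` directly with a non-zero offset, assuming it is exported from `arrow_buffer::bit_iterator` alongside `BitIndexIterator`. It also illustrates how the negative `chunk_offset` set up in the constructor makes the reported indices relative to the offset.

```rust
use arrow_buffer::bit_iterator::{BitIndexIterator, BitIndexU32Iterator};

fn main() {
    // Absolute bits 1 and 4 are set in the first byte, bits 9 and 13 in the second.
    let mask: &[u8] = &[0b0001_0010, 0b0010_0010];

    // View bits [4, 14): absolute positions 4, 9 and 13 fall inside the window,
    // and are reported relative to the offset as 0, 5 and 9.
    let indices: Vec<u32> = BitIndexU32Iterator::new(mask, 4, 10).collect();
    assert_eq!(indices, vec![0, 5, 9]);

    // Matches the existing usize-based iterator after casting.
    let expected: Vec<u32> = BitIndexIterator::new(mask, 4, 10)
        .map(|i| i as u32)
        .collect();
    assert_eq!(indices, expected);
}
```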