diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 548401ed4201..ecd7de38c031 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -165,6 +165,7 @@ impl BooleanBuffer { /// * `op` must only apply bitwise operations /// on the relevant bits; the input `u64` may contain irrelevant bits /// and may be processed differently on different endian architectures. + /// * `op` may be called with input bits outside the requested range /// * The output always has zero offset /// /// # See Also diff --git a/arrow-buffer/src/buffer/ops.rs b/arrow-buffer/src/buffer/ops.rs index 05593504b1cf..cb0925bb2cd1 100644 --- a/arrow-buffer/src/buffer/ops.rs +++ b/arrow-buffer/src/buffer/ops.rs @@ -20,7 +20,12 @@ use crate::BooleanBuffer; use crate::util::bit_util::ceil; /// Apply a bitwise operation `op` to four inputs and return the result as a Buffer. -/// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. +/// +/// The inputs are treated as bitmaps, meaning that offsets and length are +/// specified in number of bits. +/// +/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits +/// outside the offsets and len are set to zero out before calling `op`. pub fn bitwise_quaternary_op_helper( buffers: [&Buffer; 4], offsets: [usize; 4], @@ -60,7 +65,12 @@ where } /// Apply a bitwise operation `op` to two inputs and return the result as a Buffer. -/// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. +/// +/// The inputs are treated as bitmaps, meaning that offsets and length are +/// specified in number of bits. +/// +/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits +/// outside the offsets and len are set to zero out before calling `op`. pub fn bitwise_bin_op_helper( left: &Buffer, left_offset_in_bits: usize, @@ -93,21 +103,42 @@ where } /// Apply a bitwise operation `op` to one input and return the result as a Buffer. -/// The input is treated as a bitmap, meaning that offset and length are specified in number of bits. -#[deprecated( - since = "57.2.0", - note = "use BooleanBuffer::from_bitwise_unary_op instead" -)] +/// +/// The input is treated as a bitmap, meaning that offset and length are +/// specified in number of bits. +/// +/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits +/// outside the offsets and len are set to zero out before calling `op`. pub fn bitwise_unary_op_helper( left: &Buffer, offset_in_bits: usize, len_in_bits: usize, - op: F, + mut op: F, ) -> Buffer where F: FnMut(u64) -> u64, { - BooleanBuffer::from_bitwise_unary_op(left, offset_in_bits, len_in_bits, op).into_inner() + // reserve capacity and set length so we can get a typed view of u64 chunks + let mut result = + MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64 * 8, false); + + let left_chunks = left.bit_chunks(offset_in_bits, len_in_bits); + + let result_chunks = result.typed_data_mut::().iter_mut(); + + result_chunks + .zip(left_chunks.iter()) + .for_each(|(res, left)| { + *res = op(left); + }); + + let remainder_bytes = ceil(left_chunks.remainder_len(), 8); + let rem = op(left_chunks.remainder_bits()); + // we are counting its starting from the least significant bit, to to_le_bytes should be correct + let rem = &rem.to_le_bytes()[0..remainder_bytes]; + result.extend_from_slice(rem); + + result.into() } /// Apply a bitwise and to two inputs and return the result as a Buffer. diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index 211cabf7afc0..e51016f9bad3 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -19,7 +19,7 @@ use arrow_array::{Array, ArrayRef, BooleanArray, make_array}; use arrow_buffer::buffer::bitwise_bin_op_helper; -use arrow_buffer::{BooleanBuffer, NullBuffer}; +use arrow_buffer::{BooleanBuffer, NullBuffer, bitwise_unary_op_helper}; use arrow_schema::{ArrowError, DataType}; /// Returns a new array with the same values and the validity bit to false where @@ -91,13 +91,11 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { let mut null_count = 0; - let buffer = - BooleanBuffer::from_bitwise_unary_op(right.inner(), right.offset(), len, |b| { - let t = !b; - null_count += t.count_zeros() as usize; - t - }) - .into_inner(); + let buffer = bitwise_unary_op_helper(right.inner(), right.offset(), len, |b| { + let t = !b; + null_count += t.count_zeros() as usize; + t + }); (buffer, null_count) } }; @@ -122,7 +120,8 @@ mod tests { use arrow_array::{Int32Array, NullArray, StringArray, StructArray}; use arrow_data::ArrayData; use arrow_schema::{Field, Fields}; - use rand::{Rng, rng}; + use rand::prelude::StdRng; + use rand::{Rng, SeedableRng}; #[test] fn test_nullif_int_array() { @@ -494,23 +493,60 @@ mod tests { let r_data = r.to_data(); r_data.validate().unwrap(); - assert_eq!(r.as_ref(), &expected); + assert_eq!( + r.as_ref(), + &expected, + "expected nulls: {:#?}\n\n\ + result nulls: {:#?}\n\n\\ + expected values: {:#?}\n\n\ + result values: {:#?}", + expected.nulls(), + r.nulls(), + expected.values(), + r.as_primitive::().values() + ); + validate_nulls(expected.nulls()); + validate_nulls(r.nulls()); + } + + /// Ensures that the null count matches the actual number of nulls. + fn validate_nulls(nulls: Option<&NullBuffer>) { + let Some(nulls) = nulls else { + return; + }; + let mut actual_null_count = 0; + for i in 0..nulls.len() { + if nulls.is_null(i) { + actual_null_count += 1; + } + } + assert_eq!(actual_null_count, nulls.null_count()); } #[test] fn nullif_fuzz() { - let mut rng = rng(); + let mut rng = StdRng::seed_from_u64(7337); let arrays = [ - Int32Array::from(vec![0; 128]), - (0..128) - .map(|_| rng.random_bool(0.5).then_some(0)) + Int32Array::from(vec![0; 1024]), // no nulls + (0..1024) // 50% nulls + .map(|_| rng.random_bool(0.5).then_some(1)) .collect(), ]; for a in arrays { - let a_slices = [(0, 128), (64, 64), (0, 64), (32, 32), (0, 0), (32, 0)]; - + let a_slices = [ + (0, 128), + (0, 129), + (64, 64), + (0, 64), + (32, 32), + (0, 0), + (32, 0), + (5, 800), + (33, 53), + (77, 101), + ]; for (a_offset, a_length) in a_slices { let a = a.slice(a_offset, a_length); @@ -518,14 +554,54 @@ mod tests { let b_start_offset = rng.random_range(0..i); let b_end_offset = rng.random_range(0..i); + // b with 50% nulls let b: BooleanArray = (0..a_length + b_start_offset + b_end_offset) .map(|_| rng.random_bool(0.5).then(|| rng.random_bool(0.5))) .collect(); - let b = b.slice(b_start_offset, a_length); - - test_nullif(&a, &b); + let b_sliced = b.slice(b_start_offset, a_length); + test_nullif(&a, &b_sliced); + + // b with no nulls (and no null buffer) + let b = remove_null_buffer(&b); + let b_sliced = b.slice(b_start_offset, a_length); + test_nullif(&a, &b_sliced); + + // b with no nulls (but with a null buffer) + let b = remove_null_values(&b); + let b_sliced = b.slice(b_start_offset, a_length); + test_nullif(&a, &b_sliced); } } } } + + /// Returns a new BooleanArray with no null buffer + fn remove_null_buffer(array: &BooleanArray) -> BooleanArray { + make_array( + array + .into_data() + .into_builder() + .nulls(None) + .build() + .unwrap(), + ) + .as_boolean() + .clone() + } + + /// Returns a new BooleanArray with a null buffer where all values are valid + fn remove_null_values(array: &BooleanArray) -> BooleanArray { + let len = array.len(); + let new_nulls = NullBuffer::from_iter(std::iter::repeat_n(true, len)); + make_array( + array + .into_data() + .into_builder() + .nulls(Some(new_nulls)) + .build() + .unwrap(), + ) + .as_boolean() + .clone() + } }