Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions arrow-buffer/src/buffer/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ impl BooleanBuffer {
/// * `op` must only apply bitwise operations
/// on the relevant bits; the input `u64` may contain irrelevant bits
/// and may be processed differently on different endian architectures.
/// * `op` may be called with input bits outside the requested range
/// * The output always has zero offset
///
/// # See Also
Expand Down
49 changes: 40 additions & 9 deletions arrow-buffer/src/buffer/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@ use crate::BooleanBuffer;
use crate::util::bit_util::ceil;

/// Apply a bitwise operation `op` to four inputs and return the result as a Buffer.
/// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits.
///
/// The inputs are treated as bitmaps, meaning that offsets and length are
/// specified in number of bits.
///
/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits
/// outside the offsets and len are set to zero out before calling `op`.
pub fn bitwise_quaternary_op_helper<F>(
buffers: [&Buffer; 4],
offsets: [usize; 4],
Expand Down Expand Up @@ -60,7 +65,12 @@ where
}

/// Apply a bitwise operation `op` to two inputs and return the result as a Buffer.
/// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits.
///
/// The inputs are treated as bitmaps, meaning that offsets and length are
/// specified in number of bits.
///
/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits
/// outside the offsets and len are set to zero out before calling `op`.
pub fn bitwise_bin_op_helper<F>(
left: &Buffer,
left_offset_in_bits: usize,
Expand Down Expand Up @@ -93,21 +103,42 @@ where
}

/// Apply a bitwise operation `op` to one input and return the result as a Buffer.
/// The input is treated as a bitmap, meaning that offset and length are specified in number of bits.
#[deprecated(
since = "57.2.0",
note = "use BooleanBuffer::from_bitwise_unary_op instead"
)]
///
/// The input is treated as a bitmap, meaning that offset and length are
/// specified in number of bits.
///
/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits
/// outside the offsets and len are set to zero out before calling `op`.
pub fn bitwise_unary_op_helper<F>(
left: &Buffer,
offset_in_bits: usize,
len_in_bits: usize,
op: F,
mut op: F,
) -> Buffer
where
F: FnMut(u64) -> u64,
{
BooleanBuffer::from_bitwise_unary_op(left, offset_in_bits, len_in_bits, op).into_inner()
// reserve capacity and set length so we can get a typed view of u64 chunks
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This restores the behavior to what it was prior to #8996

let mut result =
MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64 * 8, false);

let left_chunks = left.bit_chunks(offset_in_bits, len_in_bits);

let result_chunks = result.typed_data_mut::<u64>().iter_mut();

result_chunks
.zip(left_chunks.iter())
.for_each(|(res, left)| {
*res = op(left);
});

let remainder_bytes = ceil(left_chunks.remainder_len(), 8);
let rem = op(left_chunks.remainder_bits());
// we are counting its starting from the least significant bit, to to_le_bytes should be correct
let rem = &rem.to_le_bytes()[0..remainder_bytes];
result.extend_from_slice(rem);

result.into()
}

/// Apply a bitwise and to two inputs and return the result as a Buffer.
Expand Down
114 changes: 95 additions & 19 deletions arrow-select/src/nullif.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

use arrow_array::{Array, ArrayRef, BooleanArray, make_array};
use arrow_buffer::buffer::bitwise_bin_op_helper;
use arrow_buffer::{BooleanBuffer, NullBuffer};
use arrow_buffer::{BooleanBuffer, NullBuffer, bitwise_unary_op_helper};
use arrow_schema::{ArrowError, DataType};

/// Returns a new array with the same values and the validity bit to false where
Expand Down Expand Up @@ -91,13 +91,11 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result<ArrayRef, ArrowE
}
None => {
let mut null_count = 0;
let buffer =
BooleanBuffer::from_bitwise_unary_op(right.inner(), right.offset(), len, |b| {
let t = !b;
null_count += t.count_zeros() as usize;
t
})
.into_inner();
let buffer = bitwise_unary_op_helper(right.inner(), right.offset(), len, |b| {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This restores the behavior to what it was prior to #8996

let t = !b;
null_count += t.count_zeros() as usize;
t
});
(buffer, null_count)
}
};
Expand All @@ -122,7 +120,8 @@ mod tests {
use arrow_array::{Int32Array, NullArray, StringArray, StructArray};
use arrow_data::ArrayData;
use arrow_schema::{Field, Fields};
use rand::{Rng, rng};
use rand::prelude::StdRng;
use rand::{Rng, SeedableRng};

#[test]
fn test_nullif_int_array() {
Expand Down Expand Up @@ -494,38 +493,115 @@ mod tests {
let r_data = r.to_data();
r_data.validate().unwrap();

assert_eq!(r.as_ref(), &expected);
assert_eq!(
r.as_ref(),
&expected,
"expected nulls: {:#?}\n\n\
result nulls: {:#?}\n\n\\
expected values: {:#?}\n\n\
result values: {:#?}",
expected.nulls(),
r.nulls(),
expected.values(),
r.as_primitive::<Int32Type>().values()
);
validate_nulls(expected.nulls());
validate_nulls(r.nulls());
}

/// Ensures that the null count matches the actual number of nulls.
fn validate_nulls(nulls: Option<&NullBuffer>) {
let Some(nulls) = nulls else {
return;
};
let mut actual_null_count = 0;
for i in 0..nulls.len() {
if nulls.is_null(i) {
actual_null_count += 1;
}
}
assert_eq!(actual_null_count, nulls.null_count());
}

#[test]
fn nullif_fuzz() {
let mut rng = rng();
let mut rng = StdRng::seed_from_u64(7337);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use fixed random seed so the test is reproducable


let arrays = [
Int32Array::from(vec![0; 128]),
(0..128)
.map(|_| rng.random_bool(0.5).then_some(0))
Int32Array::from(vec![0; 1024]), // no nulls
(0..1024) // 50% nulls
.map(|_| rng.random_bool(0.5).then_some(1))
.collect(),
];

for a in arrays {
let a_slices = [(0, 128), (64, 64), (0, 64), (32, 32), (0, 0), (32, 0)];

let a_slices = [
(0, 128),
(0, 129),
(64, 64),
(0, 64),
(32, 32),
(0, 0),
(32, 0),
(5, 800),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use larger slices that are more than 2 u64s

(33, 53),
(77, 101),
];
for (a_offset, a_length) in a_slices {
let a = a.slice(a_offset, a_length);

for i in 1..65 {
let b_start_offset = rng.random_range(0..i);
let b_end_offset = rng.random_range(0..i);

// b with 50% nulls
let b: BooleanArray = (0..a_length + b_start_offset + b_end_offset)
.map(|_| rng.random_bool(0.5).then(|| rng.random_bool(0.5)))
.collect();
let b = b.slice(b_start_offset, a_length);

test_nullif(&a, &b);
let b_sliced = b.slice(b_start_offset, a_length);
test_nullif(&a, &b_sliced);

// b with no nulls (and no null buffer)
let b = remove_null_buffer(&b);
let b_sliced = b.slice(b_start_offset, a_length);
test_nullif(&a, &b_sliced);

// b with no nulls (but with a null buffer)
let b = remove_null_values(&b);
let b_sliced = b.slice(b_start_offset, a_length);
test_nullif(&a, &b_sliced);
}
}
}
}

/// Returns a new BooleanArray with no null buffer
fn remove_null_buffer(array: &BooleanArray) -> BooleanArray {
make_array(
array
.into_data()
.into_builder()
.nulls(None)
.build()
.unwrap(),
)
.as_boolean()
.clone()
}

/// Returns a new BooleanArray with a null buffer where all values are valid
fn remove_null_values(array: &BooleanArray) -> BooleanArray {
let len = array.len();
let new_nulls = NullBuffer::from_iter(std::iter::repeat_n(true, len));
make_array(
array
.into_data()
.into_builder()
.nulls(Some(new_nulls))
.build()
.unwrap(),
)
.as_boolean()
.clone()
}
}
Loading