From 1c3a1befd456ea0751e8627072e31c44bcd54abe Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 30 Jul 2024 05:35:10 +0800 Subject: [PATCH 1/3] lazily compute for null count when it is expansive to calculate it. --- arrow-buffer/src/buffer/null.rs | 84 ++++++++++++++++++++++++++++----- 1 file changed, 71 insertions(+), 13 deletions(-) diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs index c79aef398059..b6dcd2af255a 100644 --- a/arrow-buffer/src/buffer/null.rs +++ b/arrow-buffer/src/buffer/null.rs @@ -15,10 +15,32 @@ // specific language governing permissions and limitations // under the License. +use std::sync::atomic::{AtomicI64, Ordering}; + use crate::bit_iterator::{BitIndexIterator, BitIterator, BitSliceIterator}; use crate::buffer::BooleanBuffer; use crate::{Buffer, MutableBuffer}; +const UNINITIALIZED_NULL_COUNT: i64 = -1; + +#[derive(Debug)] +pub enum NullCount { + Eager(usize), + Lazy(AtomicI64), +} + +impl Clone for NullCount { + fn clone(&self) -> Self { + match self { + Self::Eager(v) => Self::Eager(*v), + Self::Lazy(v) => { + let v = v.load(Ordering::Relaxed); + Self::Lazy(AtomicI64::new(v)) + } + } + } +} + /// A [`BooleanBuffer`] used to encode validity for arrow arrays /// /// As per the [Arrow specification], array validity is encoded in a packed bitmask with a @@ -26,16 +48,25 @@ use crate::{Buffer, MutableBuffer}; /// that it is null. /// /// [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps -#[derive(Debug, Clone, Eq, PartialEq)] +#[derive(Debug, Clone)] pub struct NullBuffer { buffer: BooleanBuffer, - null_count: usize, + null_count: NullCount, +} + +impl PartialEq for NullBuffer { + fn eq(&self, other: &Self) -> bool { + self.buffer == other.buffer + } } +impl Eq for NullBuffer {} + impl NullBuffer { /// Create a new [`NullBuffer`] computing the null count pub fn new(buffer: BooleanBuffer) -> Self { - let null_count = buffer.len() - buffer.count_set_bits(); + // Expensive to calc the null count, we should lazily compute it when + let null_count = NullCount::Lazy(AtomicI64::new(UNINITIALIZED_NULL_COUNT)); Self { buffer, null_count } } @@ -43,7 +74,7 @@ impl NullBuffer { pub fn new_null(len: usize) -> Self { Self { buffer: BooleanBuffer::new_unset(len), - null_count: len, + null_count: NullCount::Eager(len), } } @@ -53,7 +84,7 @@ impl NullBuffer { pub fn new_valid(len: usize) -> Self { Self { buffer: BooleanBuffer::new_set(len), - null_count: 0, + null_count: NullCount::Eager(0), } } @@ -63,7 +94,10 @@ impl NullBuffer { /// /// `buffer` must contain `null_count` `0` bits pub unsafe fn new_unchecked(buffer: BooleanBuffer, null_count: usize) -> Self { - Self { buffer, null_count } + Self { + buffer, + null_count: NullCount::Eager(null_count), + } } /// Computes the union of the nulls in two optional [`NullBuffer`] @@ -81,9 +115,12 @@ impl NullBuffer { /// Returns true if all nulls in `other` also exist in self pub fn contains(&self, other: &NullBuffer) -> bool { - if other.null_count == 0 { - return true; + if let NullCount::Eager(v) = &other.null_count { + if *v == 0 { + return true; + } } + let lhs = self.inner().bit_chunks().iter_padded(); let rhs = other.inner().bit_chunks().iter_padded(); lhs.zip(rhs).all(|(l, r)| (l & !r) == 0) @@ -106,9 +143,17 @@ impl NullBuffer { crate::bit_util::set_bit(buffer.as_mut(), i * count + j) } } + + let null_count = if let NullCount::Eager(v) = &self.null_count { + NullCount::Eager(v * count) + } else { + // TODO: not sure about if it is better to load the atomic and attempt to reuse the compute result + NullCount::Lazy(AtomicI64::new(UNINITIALIZED_NULL_COUNT)) + }; + Self { buffer: BooleanBuffer::new(buffer.into(), 0, capacity), - null_count: self.null_count * count, + null_count, } } @@ -131,9 +176,20 @@ impl NullBuffer { } /// Returns the null count for this [`NullBuffer`] - #[inline] pub fn null_count(&self) -> usize { - self.null_count + match &self.null_count { + NullCount::Eager(v) => *v, + NullCount::Lazy(v) => { + let cached_null_count = v.load(Ordering::Acquire); + if cached_null_count != UNINITIALIZED_NULL_COUNT { + return cached_null_count as usize; + } + + let computed_null_count = self.buffer.len() - self.buffer.count_set_bits(); + v.store(computed_null_count as i64, Ordering::Release); + computed_null_count + } + } } /// Returns `true` if the value at `idx` is not null @@ -189,8 +245,10 @@ impl NullBuffer { &self, f: F, ) -> Result<(), E> { - if self.null_count == self.len() { - return Ok(()); + if let NullCount::Eager(v) = &self.null_count { + if *v == self.len() { + return Ok(()); + } } self.valid_indices().try_for_each(f) } From 9c225fcf76e4cde9e8f933480fa30b894ebe0611 Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 30 Jul 2024 05:47:41 +0800 Subject: [PATCH 2/3] fix test. --- arrow-array/src/record_batch.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index c56b1fd308cf..9d9ab0655a7e 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -684,7 +684,7 @@ mod tests { let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap(); - assert_eq!(record_batch.get_array_memory_size(), 364); + assert_eq!(record_batch.get_array_memory_size(), 380); } fn check_batch(record_batch: RecordBatch, num_rows: usize) { From 63a5af940b5eb7af91b833397e2b3c72ca816979 Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 30 Jul 2024 06:23:07 +0800 Subject: [PATCH 3/3] fix comment. --- arrow-buffer/src/buffer/null.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs index b6dcd2af255a..7929f99a046d 100644 --- a/arrow-buffer/src/buffer/null.rs +++ b/arrow-buffer/src/buffer/null.rs @@ -65,7 +65,7 @@ impl Eq for NullBuffer {} impl NullBuffer { /// Create a new [`NullBuffer`] computing the null count pub fn new(buffer: BooleanBuffer) -> Self { - // Expensive to calc the null count, we should lazily compute it when + // Expensive to calc the null count, we should lazily compute it when it is really needed let null_count = NullCount::Lazy(AtomicI64::new(UNINITIALIZED_NULL_COUNT)); Self { buffer, null_count } }