Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions rust/arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,7 @@ harness = false
[[bench]]
name = "equal"
harness = false

[[bench]]
name = "array_slice"
harness = false
52 changes: 52 additions & 0 deletions rust/arrow/benches/array_slice.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#[macro_use]
extern crate criterion;
use criterion::Criterion;

extern crate arrow;

use arrow::array::*;
use std::sync::Arc;

/// Slices `array` starting at its first element and spanning `length` elements.
///
/// This is the operation measured by the benchmarks in this file.
fn create_array_slice(array: &ArrayRef, length: usize) -> ArrayRef {
    // Every benchmark slices from the beginning of the array.
    let start = 0;
    array.slice(start, length)
}

/// Builds a `Float64Array` with `size` slots where even positions hold `1.0`
/// and odd positions are null, returned behind an `Arc`.
fn create_array_with_nulls(size: usize) -> ArrayRef {
    let slots = (0..size).map(|idx| match idx % 2 {
        0 => Some(1.0),
        _ => None,
    });
    Arc::new(slots.collect::<Float64Array>())
}

/// Registers the slice benchmarks: a 4096-slot array with nulls is sliced
/// from its start with three different slice lengths.
fn array_slice_benchmark(c: &mut Criterion) {
    let array = create_array_with_nulls(4096);

    // (benchmark id, number of elements in the slice)
    let cases: [(&str, usize); 3] = [
        ("array_slice 128", 128),
        ("array_slice 512", 512),
        ("array_slice 2048", 2048),
    ];

    for &(bench_id, slice_len) in cases.iter() {
        c.bench_function(bench_id, |b| {
            b.iter(|| create_array_slice(&array, slice_len))
        });
    }
}

criterion_group!(benches, array_slice_benchmark);
criterion_main!(benches);
19 changes: 12 additions & 7 deletions rust/arrow/src/array/array_struct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,11 @@ use std::{any::Any, sync::Arc};

use super::{make_array, Array, ArrayData, ArrayDataRef, ArrayRef};
use crate::datatypes::DataType;
use crate::error::{ArrowError, Result};
use crate::{
buffer::{buffer_bin_or, Buffer},
datatypes::Field,
};
use crate::{
error::{ArrowError, Result},
util::bit_util,
};

/// A nested array type where each child (called *field*) is represented by a separate
/// array.
Expand Down Expand Up @@ -133,10 +130,18 @@ impl TryFrom<Vec<(&str, ArrayRef)>> for StructArray {
));

if let Some(child_null_buffer) = child_datum.null_buffer() {
let child_datum_offset = child_datum.offset();

null = Some(if let Some(null_buffer) = &null {
buffer_bin_or(null_buffer, 0, child_null_buffer, 0, child_datum_len)
buffer_bin_or(
null_buffer,
0,
child_null_buffer,
child_datum_offset,
child_datum_len,
)
} else {
child_null_buffer.clone()
child_null_buffer.bit_slice(child_datum_offset, child_datum_len)
});
} else if null.is_some() {
// when one of the fields has no nulls, then there is no null in the array
Expand All @@ -149,7 +154,7 @@ impl TryFrom<Vec<(&str, ArrayRef)>> for StructArray {
.len(len)
.child_data(child_data);
if let Some(null_buffer) = null {
let null_count = len - bit_util::count_set_bits(null_buffer.data());
let null_count = len - null_buffer.count_set_bits();
builder = builder.null_count(null_count).null_bit_buffer(null_buffer);
}

Expand Down
7 changes: 4 additions & 3 deletions rust/arrow/src/array/array_union.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ use crate::buffer::Buffer;
use crate::datatypes::*;
use crate::error::{ArrowError, Result};

use crate::util::bit_util;
use core::fmt;
use std::any::Any;
use std::mem;
Expand Down Expand Up @@ -145,7 +144,7 @@ impl UnionArray {
bitmap: Option<Buffer>,
) -> Result<Self> {
let bitmap_data = bitmap.map(|b| {
let null_count = type_ids.len() - bit_util::count_set_bits(b.data());
let null_count = type_ids.len() - b.count_set_bits();
(b, null_count)
});

Expand Down Expand Up @@ -231,8 +230,10 @@ impl UnionArray {
pub fn value_offset(&self, index: usize) -> i32 {
assert!(index - self.offset() < self.len());
if self.is_dense() {
// In format v4 unions had their own validity bitmap and offsets are compressed by omitting null values
// Starting with v5 unions don't have a validity bitmap and it's possible to directly index into the offsets buffer
let valid_slots = match self.data.null_buffer() {
Some(b) => bit_util::count_set_bits_offset(b.data(), 0, index),
Some(b) => b.count_set_bits_offset(0, index),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unrelated to this refactoring, but this code in UnionArray did not match my understanding of the array format documentation. At least I don't see any mention that the offsets array is compressed by omitting null values. Or maybe the documentation needs a better example that includes a null bitmap.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I got it, the union format actually changed and deprecated the use of a null bitmap: 6df8620

None => index,
};
self.data().buffers()[1].data()[valid_slots * size_of::<i32>()] as i32
Expand Down
12 changes: 6 additions & 6 deletions rust/arrow/src/array/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,7 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
pub fn finish(&mut self) -> PrimitiveArray<T> {
let len = self.len();
let null_bit_buffer = self.bitmap_builder.finish();
let null_count = len - bit_util::count_set_bits(null_bit_buffer.data());
let null_count = len - null_bit_buffer.count_set_bits();
let mut builder = ArrayData::builder(T::DATA_TYPE)
.len(len)
.add_buffer(self.values_builder.finish());
Expand All @@ -619,7 +619,7 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
pub fn finish_dict(&mut self, values: ArrayRef) -> DictionaryArray<T> {
let len = self.len();
let null_bit_buffer = self.bitmap_builder.finish();
let null_count = len - bit_util::count_set_bits(null_bit_buffer.data());
let null_count = len - null_bit_buffer.count_set_bits();
let data_type = DataType::Dictionary(
Box::new(T::DATA_TYPE),
Box::new(values.data_type().clone()),
Expand Down Expand Up @@ -831,7 +831,7 @@ where

let offset_buffer = self.offsets_builder.finish();
let null_bit_buffer = self.bitmap_builder.finish();
let nulls = bit_util::count_set_bits(null_bit_buffer.data());
let nulls = null_bit_buffer.count_set_bits();
self.offsets_builder.append(0).unwrap();
let data = ArrayData::builder(DataType::List(Box::new(Field::new(
"item",
Expand Down Expand Up @@ -1043,7 +1043,7 @@ where

let offset_buffer = self.offsets_builder.finish();
let null_bit_buffer = self.bitmap_builder.finish();
let nulls = bit_util::count_set_bits(null_bit_buffer.data());
let nulls = null_bit_buffer.count_set_bits();
self.offsets_builder.append(0).unwrap();
let data = ArrayData::builder(DataType::LargeList(Box::new(Field::new(
"item",
Expand Down Expand Up @@ -1234,7 +1234,7 @@ where
}

let null_bit_buffer = self.bitmap_builder.finish();
let nulls = bit_util::count_set_bits(null_bit_buffer.data());
let nulls = null_bit_buffer.count_set_bits();
let data = ArrayData::builder(DataType::FixedSizeList(
Box::new(Field::new("item", values_data.data_type().clone(), true)),
self.list_len,
Expand Down Expand Up @@ -2134,7 +2134,7 @@ impl StructBuilder {
}

let null_bit_buffer = self.bitmap_builder.finish();
let null_count = self.len - bit_util::count_set_bits(null_bit_buffer.data());
let null_count = self.len - null_bit_buffer.count_set_bits();
let mut builder = ArrayData::builder(DataType::Struct(self.fields.clone()))
.len(self.len)
.child_data(child_data);
Expand Down
3 changes: 1 addition & 2 deletions rust/arrow/src/array/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,14 @@ use std::sync::Arc;

use crate::buffer::Buffer;
use crate::datatypes::DataType;
use crate::util::bit_util;
use crate::{bitmap::Bitmap, datatypes::ArrowNativeType};

use super::equal::equal;

#[inline]
fn count_nulls(null_bit_buffer: Option<&Buffer>, offset: usize, len: usize) -> usize {
if let Some(ref buf) = null_bit_buffer {
len.checked_sub(bit_util::count_set_bits_offset(buf.data(), offset, len))
len.checked_sub(buf.count_set_bits_offset(offset, len))
.unwrap()
} else {
0
Expand Down
114 changes: 109 additions & 5 deletions rust/arrow/src/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -266,10 +266,30 @@ impl Buffer {
bitwise_unary_op_helper(&self, offset, len, |a| a)
}

/// Returns a `BitChunks` instance which can be used to iterate over this buffer's bits
/// in larger chunks, starting at an arbitrary bit offset.
///
/// Note that both `offset` and `len` are measured in bits.
pub fn bit_chunks(&self, offset: usize, len: usize) -> BitChunks {
    BitChunks::new(self, offset, len)
}

/// Returns the number of 1-bits in this buffer.
///
/// All `self.len() * 8` bits of the buffer are inspected.
pub fn count_set_bits(&self) -> usize {
    // self.offset is already taken into consideration by the bit_chunks implementation,
    // so counting starts at bit 0.
    self.count_set_bits_offset(0, self.len() * 8)
}

/// Returns the number of 1-bits in this buffer, starting from `offset` with `len` bits
/// inspected.
///
/// Note that both `offset` and `len` are measured in bits.
pub fn count_set_bits_offset(&self, offset: usize, len: usize) -> usize {
    let chunks = self.bit_chunks(offset, len);

    // Set bits in the chunks yielded by the iterator.
    let in_chunks: usize = chunks
        .iter()
        .map(|chunk| chunk.count_ones() as usize)
        .sum();
    // Set bits in the trailing bits reported separately as the remainder.
    let in_remainder = chunks.remainder_bits().count_ones() as usize;

    in_chunks + in_remainder
}

/// Returns an empty buffer.
pub fn empty() -> Self {
unsafe { Self::from_raw_parts(BUFFER_INIT.as_ptr() as _, 0, 0) }
Expand Down Expand Up @@ -806,7 +826,6 @@ unsafe impl Send for MutableBuffer {}

#[cfg(test)]
mod tests {
use crate::util::bit_util;
use std::ptr::null_mut;
use std::thread;

Expand Down Expand Up @@ -908,24 +927,24 @@ mod tests {
fn test_with_bitset() {
let mut_buf = MutableBuffer::new(64).with_bitset(64, false);
let buf = mut_buf.freeze();
assert_eq!(0, bit_util::count_set_bits(buf.data()));
assert_eq!(0, buf.count_set_bits());

let mut_buf = MutableBuffer::new(64).with_bitset(64, true);
let buf = mut_buf.freeze();
assert_eq!(512, bit_util::count_set_bits(buf.data()));
assert_eq!(512, buf.count_set_bits());
}

#[test]
fn test_set_null_bits() {
let mut mut_buf = MutableBuffer::new(64).with_bitset(64, true);
mut_buf.set_null_bits(0, 64);
let buf = mut_buf.freeze();
assert_eq!(0, bit_util::count_set_bits(buf.data()));
assert_eq!(0, buf.count_set_bits());

let mut mut_buf = MutableBuffer::new(64).with_bitset(64, true);
mut_buf.set_null_bits(32, 32);
let buf = mut_buf.freeze();
assert_eq!(256, bit_util::count_set_bits(buf.data()));
assert_eq!(256, buf.count_set_bits());
}

#[test]
Expand Down Expand Up @@ -1094,4 +1113,89 @@ mod tests {
check_as_typed_data!(&[1f32, 3f32, 6f32], f32);
check_as_typed_data!(&[1f64, 3f64, 6f64], f64);
}

#[test]
fn test_count_bits() {
    // (expected number of set bits, buffer contents)
    let cases: [(usize, &[u8]); 5] = [
        (0, &[0b00000000]),
        (8, &[0b11111111]),
        (3, &[0b00001101]),
        (6, &[0b01001001, 0b01010010]),
        (16, &[0b11111111, 0b11111111]),
    ];

    for &(expected, bytes) in cases.iter() {
        assert_eq!(expected, Buffer::from(bytes).count_set_bits());
    }
}

#[test]
fn test_count_bits_slice() {
    // (expected number of set bits, buffer contents, byte offset passed to `slice`)
    let cases: [(usize, &[u8], usize); 5] = [
        (0, &[0b11111111, 0b00000000], 1),
        (8, &[0b11111111, 0b11111111], 1),
        (3, &[0b11111111, 0b11111111, 0b00001101], 2),
        (6, &[0b11111111, 0b01001001, 0b01010010], 1),
        (16, &[0b11111111, 0b11111111, 0b11111111, 0b11111111], 2),
    ];

    for &(expected, bytes, byte_offset) in cases.iter() {
        let sliced = Buffer::from(bytes).slice(byte_offset);
        assert_eq!(expected, sliced.count_set_bits());
    }
}

#[test]
fn test_count_bits_offset_slice() {
    // (expected number of set bits, buffer contents, bit offset, bit length)
    let cases: [(usize, &[u8], usize, usize); 14] = [
        (8, &[0b11111111], 0, 8),
        (3, &[0b11111111], 0, 3),
        (5, &[0b11111111], 3, 5),
        (1, &[0b11111111], 3, 1),
        (0, &[0b11111111], 8, 0),
        (2, &[0b01010101], 0, 3),
        (16, &[0b11111111, 0b11111111], 0, 16),
        (10, &[0b11111111, 0b11111111], 0, 10),
        (10, &[0b11111111, 0b11111111], 3, 10),
        (8, &[0b11111111, 0b11111111], 8, 8),
        (5, &[0b11111111, 0b11111111], 11, 5),
        (0, &[0b11111111, 0b11111111], 16, 0),
        (2, &[0b01101101, 0b10101010], 7, 5),
        (4, &[0b01101101, 0b10101010], 7, 9),
    ];

    for &(expected, bytes, bit_offset, bit_len) in cases.iter() {
        let buffer = Buffer::from(bytes);
        assert_eq!(expected, buffer.count_set_bits_offset(bit_offset, bit_len));
    }
}
}
7 changes: 4 additions & 3 deletions rust/arrow/src/compute/kernels/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -319,9 +319,10 @@ impl FilterContext {
));
}
let filter_mask: Vec<u64> = (0..64).map(|x| 1u64 << x).collect();
let filter_bytes = filter_array.data_ref().buffers()[0].data();
let filtered_count =
bit_util::count_set_bits_offset(filter_bytes, 0, filter_array.len());
let filter_buffer = &filter_array.data_ref().buffers()[0];
let filtered_count = filter_buffer.count_set_bits_offset(0, filter_array.len());

let filter_bytes = filter_buffer.data();

// transmute filter_bytes to &[u64]
let mut u64_buffer = MutableBuffer::new(filter_bytes.len());
Expand Down
Loading