From ab79394c05bf2eeef0dd5eea93512cbdda9edfdd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 2 Jan 2026 09:13:08 -0500 Subject: [PATCH 1/3] Add nullif_kernel benchmark --- arrow/benches/nullif_kernel.rs | 66 ++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 arrow/benches/nullif_kernel.rs diff --git a/arrow/benches/nullif_kernel.rs b/arrow/benches/nullif_kernel.rs new file mode 100644 index 000000000000..61ae7d4eea56 --- /dev/null +++ b/arrow/benches/nullif_kernel.rs @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#[macro_use] +extern crate criterion; +use criterion::Criterion; + +use arrow::util::bench_util::{create_boolean_array, create_primitive_array}; + +use arrow::array::*; +use arrow_array::types::Int64Type; +use arrow_select::nullif::nullif; +use std::hint; + +fn bench_nullif(left: &dyn Array, right: &BooleanArray) { + hint::black_box(nullif(left, right).unwrap()); +} + +fn add_benchmark(c: &mut Criterion) { + let size = 8192usize; + + // create input before benchmark to ensure allocations are consistent + let int64_no_nulls = create_primitive_array::<Int64Type>(size, 0.0); + let int64_nulls = create_primitive_array::<Int64Type>(size, 0.1); + + let mask_10 = create_boolean_array(size, 0.0, 0.1); + let mask_10_sliced = create_boolean_array(size + 7, 0.0, 0.1).slice(7, size); + let mask_1 = create_boolean_array(size, 0.0, 0.01); + + c.bench_function("nullif no-nulls mask(10%)", |b| { + b.iter(|| bench_nullif(&int64_no_nulls, &mask_10)) + }); + c.bench_function("nullif no-nulls mask(10%, sliced)", |b| { + b.iter(|| bench_nullif(&int64_no_nulls, &mask_10_sliced)) + }); + c.bench_function("nullif no-nulls mask(1%)", |b| { + b.iter(|| bench_nullif(&int64_no_nulls, &mask_1)) + }); + + c.bench_function("nullif nulls mask(10%)", |b| { + b.iter(|| bench_nullif(&int64_nulls, &mask_10)) + }); + c.bench_function("nullif nulls mask(10%, sliced)", |b| { + b.iter(|| bench_nullif(&int64_nulls, &mask_10_sliced)) + }); + c.bench_function("nullif nulls mask(1%)", |b| { + b.iter(|| bench_nullif(&int64_nulls, &mask_1)) + }); +} + +criterion_group!(benches, add_benchmark); +criterion_main!(benches); From d171d8b6595fed4b6d08d75ae2892b05d52f3e98 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 2 Jan 2026 09:20:44 -0500 Subject: [PATCH 2/3] Add entry to Cargo.toml --- arrow/Cargo.toml | 6 +++++- parquet/src/arrow/arrow_reader/selection.rs | 7 ++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 4200cd7a6c78..0c5a925ae330 100644 --- 
a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -172,7 +172,6 @@ name = "coalesce_kernels" harness = false required-features = ["test_utils"] - [[bench]] name = "take_kernels" harness = false @@ -311,6 +310,11 @@ name = "lexsort" harness = false required-features = ["test_utils"] +[[bench]] +name = "nullif_kernel" +harness = false +required-features = ["test_utils"] + [[test]] name = "csv" required-features = ["csv", "chrono-tz"] diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index 2ddf812f9c39..f437787a39bf 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -44,7 +44,7 @@ pub enum RowSelectionPolicy { impl Default for RowSelectionPolicy { fn default() -> Self { - Self::Auto { threshold: 32 } + Self::Auto { threshold: 16 } } } @@ -146,6 +146,7 @@ impl RowSelection { /// # Panic /// /// Panics if any of the [`BooleanArray`] contain nulls + #[inline(never)] pub fn from_filters(filters: &[BooleanArray]) -> Self { let mut next_offset = 0; let total_rows = filters.iter().map(|x| x.len()).sum(); @@ -161,6 +162,7 @@ impl RowSelection { } /// Creates a [`RowSelection`] from an iterator of consecutive ranges to keep + #[inline(never)] pub fn from_consecutive_ranges>>( ranges: I, total_rows: usize, @@ -201,6 +203,7 @@ impl RowSelection { /// Note: this method does not make any effort to combine consecutive ranges, nor coalesce /// ranges that are close together. This is instead delegated to the IO subsystem to optimise, /// e.g. 
[`ObjectStore::get_ranges`](object_store::ObjectStore::get_ranges) + #[inline(never)] pub fn scan_ranges(&self, page_locations: &[PageLocation]) -> Vec> { let mut ranges: Vec> = vec![]; let mut row_offset = 0; @@ -342,6 +345,7 @@ impl RowSelection { /// Panics if `other` does not have a length equal to the number of rows selected /// by this RowSelection /// + #[inline(never)] pub fn and_then(&self, other: &Self) -> Self { let mut selectors = vec![]; let mut first = self.selectors.iter().cloned().peekable(); @@ -923,6 +927,7 @@ impl RowSelectionCursor { } } +#[inline(never)] fn boolean_mask_from_selectors(selectors: &[RowSelector]) -> BooleanBuffer { let total_rows: usize = selectors.iter().map(|s| s.row_count).sum(); let mut builder = BooleanBufferBuilder::new(total_rows); From 7266d101469f5faff72c6974db06a5b83c1d8316 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 Jan 2026 11:33:31 -0500 Subject: [PATCH 3/3] revert unnecessary changes --- parquet/src/arrow/arrow_reader/selection.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index f437787a39bf..2ddf812f9c39 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -44,7 +44,7 @@ pub enum RowSelectionPolicy { impl Default for RowSelectionPolicy { fn default() -> Self { - Self::Auto { threshold: 16 } + Self::Auto { threshold: 32 } } } @@ -146,7 +146,6 @@ impl RowSelection { /// # Panic /// /// Panics if any of the [`BooleanArray`] contain nulls - #[inline(never)] pub fn from_filters(filters: &[BooleanArray]) -> Self { let mut next_offset = 0; let total_rows = filters.iter().map(|x| x.len()).sum(); @@ -162,7 +161,6 @@ impl RowSelection { } /// Creates a [`RowSelection`] from an iterator of consecutive ranges to keep - #[inline(never)] pub fn from_consecutive_ranges>>( ranges: I, total_rows: usize, @@ -203,7 +201,6 @@ impl RowSelection { /// 
Note: this method does not make any effort to combine consecutive ranges, nor coalesce /// ranges that are close together. This is instead delegated to the IO subsystem to optimise, /// e.g. [`ObjectStore::get_ranges`](object_store::ObjectStore::get_ranges) - #[inline(never)] pub fn scan_ranges(&self, page_locations: &[PageLocation]) -> Vec> { let mut ranges: Vec> = vec![]; let mut row_offset = 0; @@ -345,7 +342,6 @@ impl RowSelection { /// Panics if `other` does not have a length equal to the number of rows selected /// by this RowSelection /// - #[inline(never)] pub fn and_then(&self, other: &Self) -> Self { let mut selectors = vec![]; let mut first = self.selectors.iter().cloned().peekable(); @@ -927,7 +923,6 @@ impl RowSelectionCursor { } } -#[inline(never)] fn boolean_mask_from_selectors(selectors: &[RowSelector]) -> BooleanBuffer { let total_rows: usize = selectors.iter().map(|s| s.row_count).sum(); let mut builder = BooleanBufferBuilder::new(total_rows);