-
Notifications
You must be signed in to change notification settings - Fork 2.2k
bench: Add IN list benchmarks for non-constant list expressions #20444
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,6 +23,7 @@ use arrow::datatypes::{Field, Schema}; | |
| use arrow::record_batch::RecordBatch; | ||
| use criterion::{Criterion, criterion_group, criterion_main}; | ||
| use datafusion_common::ScalarValue; | ||
| use datafusion_physical_expr::PhysicalExpr; | ||
| use datafusion_physical_expr::expressions::{col, in_list, lit}; | ||
| use rand::distr::Alphanumeric; | ||
| use rand::prelude::*; | ||
|
|
@@ -50,7 +51,9 @@ fn random_string(rng: &mut StdRng, len: usize) -> String { | |
| } | ||
|
|
||
| const IN_LIST_LENGTHS: [usize; 4] = [3, 8, 28, 100]; | ||
| const DYNAMIC_LIST_LENGTHS: [usize; 3] = [3, 8, 28]; | ||
| const NULL_PERCENTS: [f64; 2] = [0., 0.2]; | ||
| const MATCH_PERCENTS: [f64; 3] = [0.0, 0.5, 1.0]; | ||
| const STRING_LENGTHS: [usize; 3] = [3, 12, 100]; | ||
| const ARRAY_LENGTH: usize = 8192; | ||
|
|
||
|
|
@@ -219,6 +222,144 @@ fn bench_realistic_mixed_strings<A>( | |
| } | ||
| } | ||
|
|
||
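| /// Benchmarks the dynamic evaluation path (no static filter) of IN list evaluation. | ||
| /// Every list element is a column reference rather than a literal, which prevents static filter creation. | ||
| /// Roughly equivalent to `SELECT a IN (b0, b1, ...) FROM t`, where `values` becomes column `a` | ||
| /// and the i-th entry of `list_cols` becomes column `b{i}`. | ||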
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be nice to show an example of how the arguments to this function map to the equivalent SQL being benchmarked.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for your advice; I have added equivalent SQL examples to the docstring. |
||
| fn do_bench_dynamic( | ||
| c: &mut Criterion, | ||
| name: &str, | ||
| values: ArrayRef, | ||
| list_cols: &[ArrayRef], | ||
| ) { | ||
| let mut fields = vec![Field::new("a", values.data_type().clone(), true)]; | ||
| let mut columns: Vec<ArrayRef> = vec![values]; | ||
|
|
||
| // Register each list column in the schema as nullable field b{i}; every IN-list element is a column reference, never a literal (forces dynamic path) | ||
| let schema_fields: Vec<Field> = list_cols | ||
| .iter() | ||
| .enumerate() | ||
| .map(|(i, col_arr)| { | ||
| let name = format!("b{i}"); | ||
| fields.push(Field::new(&name, col_arr.data_type().clone(), true)); | ||
| columns.push(Arc::clone(col_arr)); | ||
| Field::new(&name, col_arr.data_type().clone(), true) | ||
| }) | ||
| .collect(); | ||
|
|
||
| let schema = Schema::new(fields); | ||
| let list_exprs: Vec<Arc<dyn PhysicalExpr>> = schema_fields | ||
| .iter() | ||
| .map(|f| col(f.name(), &schema).unwrap()) | ||
| .collect(); | ||
|
|
||
| let expr = in_list(col("a", &schema).unwrap(), list_exprs, &false, &schema).unwrap(); | ||
| let batch = RecordBatch::try_new(Arc::new(schema), columns).unwrap(); | ||
|
|
||
| c.bench_function(name, |b| { | ||
| b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap())) | ||
| }); | ||
| } | ||
|
|
||
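| /// Benchmarks the dynamic IN list path for Int32 arrays with column references. | ||
| /// For every combination of `DYNAMIC_LIST_LENGTHS`, `MATCH_PERCENTS` and `NULL_PERCENTS`, builds one | ||
| /// needle column and `list_size` list columns of `ARRAY_LENGTH` rows each. | ||
| /// Roughly equivalent to `SELECT a IN (b0, b1, ...) FROM t` with every column of type Int32. | ||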
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be nice to see examples in this docstring of what the SQL being benchmarked is, e.g.:
// select 1 in x from t;
// where t:
// create table t ...
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added equivalent SQL examples to both bench_with_columns_int32 and bench_with_columns_utf8. |
||
| fn bench_dynamic_int32(c: &mut Criterion) { | ||
| let mut rng = StdRng::seed_from_u64(42); | ||
|
|
||
| for list_size in DYNAMIC_LIST_LENGTHS { | ||
| for match_percent in MATCH_PERCENTS { | ||
| for null_percent in NULL_PERCENTS { | ||
| // Generate the "needle" column: each row is NULL with probability null_percent, otherwise a random value in 0..1000 | ||
| let values: Int32Array = (0..ARRAY_LENGTH) | ||
| .map(|_| { | ||
| rng.random_bool(1.0 - null_percent) | ||
| .then(|| rng.random_range(0..1000)) | ||
| }) | ||
| .collect(); | ||
|
|
||
| // Generate list columns with controlled match rate; null_percent and match_percent are applied independently to every cell of every list column | ||
| let list_cols: Vec<ArrayRef> = (0..list_size) | ||
| .map(|_| { | ||
| let col: Int32Array = (0..ARRAY_LENGTH) | ||
| .map(|row| { | ||
| if rng.random_bool(1.0 - null_percent) { | ||
| if rng.random_bool(match_percent) { | ||
| // Copy from values to create a match; a NULL needle has no value to copy, so a random value in 0..1000 is used instead | ||
| if values.is_null(row) { | ||
| Some(rng.random_range(0..1000)) | ||
| } else { | ||
| Some(values.value(row)) | ||
| } | ||
| } else { | ||
| // Value from 1000..2000, disjoint from the needle range 0..1000, so it never equals the needle | ||
| Some(rng.random_range(1000..2000)) | ||
| } | ||
| } else { | ||
| None | ||
| } | ||
| }) | ||
| .collect(); | ||
| Arc::new(col) as ArrayRef | ||
| }) | ||
| .collect(); | ||
|
|
||
| do_bench_dynamic( | ||
| c, | ||
| &format!( | ||
| "in_list_dynamic/Int32/list={}/match={}%/nulls={}%", | ||
| list_size, | ||
| (match_percent * 100.0) as u32, | ||
| (null_percent * 100.0) as u32 | ||
| ), | ||
| Arc::new(values), | ||
| &list_cols, | ||
| ); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /// Benchmarks the dynamic IN list path for Utf8 arrays with column references, for every combination of DYNAMIC_LIST_LENGTHS and MATCH_PERCENTS; NULL_PERCENTS is not iterated here. | ||
| fn bench_dynamic_utf8(c: &mut Criterion) { | ||
| let mut rng = StdRng::seed_from_u64(99); | ||
|
|
||
| for list_size in DYNAMIC_LIST_LENGTHS { | ||
| for match_percent in MATCH_PERCENTS { | ||
| // Generate the "needle" column: each row is non-NULL with probability 0.8 and then holds the result of random_string(&mut rng, 12) | ||
| let value_strings: Vec<Option<String>> = (0..ARRAY_LENGTH) | ||
| .map(|_| rng.random_bool(0.8).then(|| random_string(&mut rng, 12))) | ||
| .collect(); | ||
| let values: StringArray = | ||
| value_strings.iter().map(|s| s.as_deref()).collect(); | ||
|
|
||
| // Generate list columns with controlled match rate; match_percent is applied independently to every cell of every list column | ||
| let list_cols: Vec<ArrayRef> = (0..list_size) | ||
| .map(|_| { | ||
| let col: StringArray = (0..ARRAY_LENGTH) | ||
| .map(|row| { | ||
| if rng.random_bool(match_percent) { | ||
| // Copy from values to create a match; when the needle is NULL the list cell becomes NULL as well | ||
| value_strings[row].as_deref() | ||
| } else { | ||
| Some("no_match_value_xyz") | ||
| } | ||
| }) | ||
| .collect(); | ||
| Arc::new(col) as ArrayRef | ||
| }) | ||
| .collect(); | ||
|
|
||
| do_bench_dynamic( | ||
| c, | ||
| &format!( | ||
| "in_list_dynamic/Utf8/list={}/match={}%", | ||
| list_size, | ||
| (match_percent * 100.0) as u32, | ||
| ), | ||
| Arc::new(values), | ||
| &list_cols, | ||
| ); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /// Entry point: registers in_list benchmarks for string and numeric array types. | ||
| fn criterion_benchmark(c: &mut Criterion) { | ||
| let mut rng = StdRng::seed_from_u64(120320); | ||
|
|
@@ -266,6 +407,10 @@ fn criterion_benchmark(c: &mut Criterion) { | |
| |rng| rng.random(), | ||
| |v| ScalarValue::TimestampNanosecond(Some(v), None), | ||
| ); | ||
|
|
||
| // Dynamic path benchmarks (non-constant list expressions) | ||
| bench_dynamic_int32(c); | ||
| bench_dynamic_utf8(c); | ||
| } | ||
|
|
||
| criterion_group! { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does `('a', 1, 123.24)` also force this "dynamic" path? If so, I would use the term "heterogeneous" for that. If not, and it's only columns that trigger this code path, I would use the term `LIST_WITH_COLUMNS_LENGTHS`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, only column references trigger this code path. Heterogeneous literals like 1 IN ('a', 1, 123.24) are type-coerced and still go through the static (HashSet) path. Renamed to LIST_WITH_COLUMNS_LENGTHS, and also renamed all related functions/benchmark names to remove the "dynamic" terminology.