From 41e6c4e0016651d256e3dd5ec57e87f2ba25d695 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 16 Sep 2025 09:33:11 -0700 Subject: [PATCH 01/18] start a broad test --- Cargo.lock | 1 + rust/lance/Cargo.toml | 1 + rust/lance/src/dataset.rs | 2 + rust/lance/src/dataset/broad_test.rs | 251 +++++++++++++++++++++++++++ 4 files changed, 255 insertions(+) create mode 100644 rust/lance/src/dataset/broad_test.rs diff --git a/Cargo.lock b/Cargo.lock index f89d5c74c1a..95923c70bd7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4460,6 +4460,7 @@ dependencies = [ "mock_instant", "moka", "object_store", + "paste", "permutation", "pin-project", "pprof", diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml index c422a5bcf45..dd60157a5fc 100644 --- a/rust/lance/Cargo.toml +++ b/rust/lance/Cargo.toml @@ -106,6 +106,7 @@ test-log.workspace = true tracing-chrome = "0.7.1" rstest = { workspace = true } tracking-allocator = { version = "0.4", features = ["tracing-compat"] } +paste = "1.0" # For S3 / DynamoDB tests aws-config = { workspace = true } aws-sdk-s3 = { workspace = true } diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 722ba7c97e1..33ba4ec86db 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -64,6 +64,8 @@ use tracing::{info, instrument}; pub(crate) mod blob; mod branch_location; +#[cfg(test)] +mod broad_test; pub mod builder; pub mod cleanup; pub mod delta; diff --git a/rust/lance/src/dataset/broad_test.rs b/rust/lance/src/dataset/broad_test.rs new file mode 100644 index 00000000000..09ee4263fd2 --- /dev/null +++ b/rust/lance/src/dataset/broad_test.rs @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! A generic test suite. +//! +//! This suite aims to get wide coverage of cases in terms of data types, index +//! types, and dataset states. +//! +//! Data type test should include edge cases for each data type. For example: +//! * Null values +//! * For floats: NaN, Infinity, +/-0 +//! * For integers: min / max +//! * For lists: nullability and empty values at all levels +//! +//! Data state includes: +//! * Deletion +//! * Fragmentation of data files +//! * Fragmentation of indices. 
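//!
//! For example, the boolean case below is expected to run each assertion against
//! every combination of fragmentation (single vs. multi fragment), deletion state
//! (no deletions, odd rows deleted, even rows deleted), and index type
//! (no index, Bitmap, BTree).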
+ +use std::{panic::AssertUnwindSafe, sync::Arc}; + +use arrow_array::{ArrayRef, BooleanArray, Int32Array, RecordBatch}; +use arrow_schema::{DataType, Field, Schema}; +use futures::FutureExt; +use lance_datagen::{array, gen_batch}; +use lance_index::{DatasetIndexExt, IndexType}; + +use crate::{ + dataset::{InsertBuilder, WriteParams}, + Dataset, +}; + +/// Represents the various test states we want to cover +#[derive(Clone, Copy, Debug)] +struct TableState { + fragmentation: Fragmentation, + deletion: DeletionState, +} + +/// All combinations of test states to cover +const TABLE_STATES: &[TableState] = &[ + TableState { + fragmentation: Fragmentation::SingleFragment, + deletion: DeletionState::NoDeletions, + }, + TableState { + fragmentation: Fragmentation::SingleFragment, + deletion: DeletionState::DeleteOdd, + }, + TableState { + fragmentation: Fragmentation::SingleFragment, + deletion: DeletionState::DeleteEven, + }, + TableState { + fragmentation: Fragmentation::MultiFragment, + deletion: DeletionState::NoDeletions, + }, + TableState { + fragmentation: Fragmentation::MultiFragment, + deletion: DeletionState::DeleteOdd, + }, + TableState { + fragmentation: Fragmentation::MultiFragment, + deletion: DeletionState::DeleteEven, + }, +]; + +/// Helper function to run tests across all table states and index types with panic catching +async fn test_states( + states: &[TableState], + index_types: impl IntoIterator> + Clone, + test_fn: F, +) where + F: Fn(Dataset, RecordBatch) -> Fut, + Fut: std::future::Future, +{ + for &state in states { + for index_type in index_types.clone() { + let (original, ds) = create_test_dataset(state, index_type).await; + + let context = format!( + "fragmentation: {:?}, deletion: {:?}, index: {:?}", + state.fragmentation, state.deletion, index_type + ); + + AssertUnwindSafe(test_fn(ds, original.clone())) + .catch_unwind() + .await + .unwrap_or_else(|_| panic!("Test failed for {}", context)); + } + } +} + +/// Create a test dataset with the given state and index configuration +async fn create_test_dataset( + state: TableState, + index_type: Option, +) -> (RecordBatch, Dataset) { + let data_type = DataType::Boolean; // For now, start with Boolean + let original = test_batch(&data_type); + + let ds = create_dataset_with_config( + original.clone(), + state.fragmentation, + state.deletion, + index_type, + ) + .await; + + (original, ds) +} + +#[derive(Clone, Copy, Debug)] +enum Fragmentation { + SingleFragment, + MultiFragment, +} + +#[derive(Clone, Copy, Debug)] +enum DeletionState { + NoDeletions, + DeleteOdd, + DeleteEven, +} + +/// Helper function to create a dataset with specific configuration +async fn create_dataset_with_config( + data: RecordBatch, + fragmentation: Fragmentation, + deletion_state: DeletionState, + index_type: Option, +) -> Dataset { + // Configure fragmentation + let mut ds = match fragmentation { + Fragmentation::SingleFragment => InsertBuilder::new("memory://") + .execute(vec![data]) + .await + .unwrap(), + Fragmentation::MultiFragment => { + let params = WriteParams { + max_rows_per_file: 10, + ..Default::default() + }; + InsertBuilder::new("memory://") + .with_params(¶ms) + .execute(vec![data]) + .await + .unwrap() + } + }; + + // Apply deletions + apply_deletions(&mut ds, deletion_state).await; + + // Create index if specified + if let Some(idx_type) = index_type { + use lance_index::scalar::ScalarIndexParams; + let params = ScalarIndexParams::default(); + ds.create_index(&["value"], idx_type, None, ¶ms, false) + .await + .unwrap(); + } + + ds 
+} + +/// Create a record batch that has 60 rows and two columns: id (incremental int32) +/// and value (which should match DataType). +/// +/// The values in `value` should include edge cases for the specific data type: +/// * Nulls for all types +/// * For floats: NaN, Infinity, +/-0 +/// * For integers: min / max +/// * For lists: nullability and empty values at all levels +fn test_batch(data_type: &DataType) -> RecordBatch { + let num_rows = 60; + let id: ArrayRef = Arc::new(Int32Array::from_iter(0..num_rows as i32)); + + // For now, create a simple value column based on data type + // TODO: Add edge cases for each type + let value: ArrayRef = match data_type { + DataType::Boolean => Arc::new(BooleanArray::from_iter((0..num_rows).map(|i| { + if i % 10 == 0 { + None + } else { + Some(i % 2 == 0) + } + }))), + DataType::Int32 => Arc::new(Int32Array::from_iter((0..num_rows).map(|i| { + if i % 10 == 0 { + None + } else { + Some(i as i32) + } + }))), + _ => todo!("Implement test data generation for {:?}", data_type), + }; + + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("value", data_type.clone(), true), + ]); + + RecordBatch::try_new(Arc::new(schema), vec![id, value]).unwrap() +} + +async fn apply_deletions(dataset: &mut Dataset, state: DeletionState) { + match state { + DeletionState::NoDeletions => {} + DeletionState::DeleteOdd => { + dataset.delete("id % 2 = 1").await.unwrap(); + } + DeletionState::DeleteEven => { + dataset.delete("id % 2 = 0").await.unwrap(); + } + } +} + +#[tokio::test] +async fn test_query_bool() { + // TODO: pull out data generator (so it can be re-used an inspected). + let original = gen_batch() + .col("id", array::sequence_i32(0, 60)) + .col("value", array::cycle_bool(vec![true, false]).with_nulls(2)) + .into_batch_rows(RowCount::new(60)) + .unwrap(); + test_states( + // Rename: for_states_and_indices() + TABLE_STATES, + [None, Some(IndexType::Bitmap), Some(IndexType::BTree)], + |ds: Dataset, original: RecordBatch| async move { + test_scan(&original, &ds).await; + test_take(&original, &ds).await; + test_filter(&original, &ds, "value").await; + test_filter(&original, &ds, "!value").await; + }, + ) + .await +} + +async fn test_scan(_original: &RecordBatch, _ds: &Dataset) { + todo!("validate that if you scan ds, then sort by id, you get original back.") +} + +async fn test_take(_original: &RecordBatch, _ds: &Dataset) { + todo!("generate a few sets of ids and validate we can call take against the RB and the DS and get the same result."); +} + +async fn test_filter(_original: &RecordBatch, _ds: &Dataset, _predicate: &str) { + todo!("Scan ds with the predicate"); +} From 89893baa378591cdb86b176c16e4450b248b84a2 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 16 Sep 2025 15:19:32 -0700 Subject: [PATCH 02/18] scaffold new approach --- rust/lance/src/dataset/broad_test.rs | 251 -------------------------- rust/lance/tests/integration_tests.rs | 7 + rust/lance/tests/query/mod.rs | 24 +++ rust/lance/tests/query/primitives.rs | 61 +++++++ rust/lance/tests/query/vectors.rs | 48 +++++ rust/lance/tests/utils/mod.rs | 105 +++++++++++ 6 files changed, 245 insertions(+), 251 deletions(-) delete mode 100644 rust/lance/src/dataset/broad_test.rs create mode 100644 rust/lance/tests/integration_tests.rs create mode 100644 rust/lance/tests/query/mod.rs create mode 100644 rust/lance/tests/query/primitives.rs create mode 100644 rust/lance/tests/query/vectors.rs create mode 100644 rust/lance/tests/utils/mod.rs diff --git 
a/rust/lance/src/dataset/broad_test.rs b/rust/lance/src/dataset/broad_test.rs deleted file mode 100644 index 09ee4263fd2..00000000000 --- a/rust/lance/src/dataset/broad_test.rs +++ /dev/null @@ -1,251 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -//! A generic test suite. -//! -//! This suite aims to get wide coverage of cases in terms of data types, index -//! types, and dataset states. -//! -//! Data type test should include edge cases for each data type. For example: -//! * Null values -//! * For floats: NaN, Infinity, +/-0 -//! * For integers: min / max -//! * For lists: nullability and empty values at all levels -//! -//! Data state includes: -//! * Deletion -//! * Fragmentation of data files -//! * Fragmentation of indices. - -use std::{panic::AssertUnwindSafe, sync::Arc}; - -use arrow_array::{ArrayRef, BooleanArray, Int32Array, RecordBatch}; -use arrow_schema::{DataType, Field, Schema}; -use futures::FutureExt; -use lance_datagen::{array, gen_batch}; -use lance_index::{DatasetIndexExt, IndexType}; - -use crate::{ - dataset::{InsertBuilder, WriteParams}, - Dataset, -}; - -/// Represents the various test states we want to cover -#[derive(Clone, Copy, Debug)] -struct TableState { - fragmentation: Fragmentation, - deletion: DeletionState, -} - -/// All combinations of test states to cover -const TABLE_STATES: &[TableState] = &[ - TableState { - fragmentation: Fragmentation::SingleFragment, - deletion: DeletionState::NoDeletions, - }, - TableState { - fragmentation: Fragmentation::SingleFragment, - deletion: DeletionState::DeleteOdd, - }, - TableState { - fragmentation: Fragmentation::SingleFragment, - deletion: DeletionState::DeleteEven, - }, - TableState { - fragmentation: Fragmentation::MultiFragment, - deletion: DeletionState::NoDeletions, - }, - TableState { - fragmentation: Fragmentation::MultiFragment, - deletion: DeletionState::DeleteOdd, - }, - TableState { - fragmentation: Fragmentation::MultiFragment, - deletion: DeletionState::DeleteEven, - }, -]; - -/// Helper function to run tests across all table states and index types with panic catching -async fn test_states( - states: &[TableState], - index_types: impl IntoIterator> + Clone, - test_fn: F, -) where - F: Fn(Dataset, RecordBatch) -> Fut, - Fut: std::future::Future, -{ - for &state in states { - for index_type in index_types.clone() { - let (original, ds) = create_test_dataset(state, index_type).await; - - let context = format!( - "fragmentation: {:?}, deletion: {:?}, index: {:?}", - state.fragmentation, state.deletion, index_type - ); - - AssertUnwindSafe(test_fn(ds, original.clone())) - .catch_unwind() - .await - .unwrap_or_else(|_| panic!("Test failed for {}", context)); - } - } -} - -/// Create a test dataset with the given state and index configuration -async fn create_test_dataset( - state: TableState, - index_type: Option, -) -> (RecordBatch, Dataset) { - let data_type = DataType::Boolean; // For now, start with Boolean - let original = test_batch(&data_type); - - let ds = create_dataset_with_config( - original.clone(), - state.fragmentation, - state.deletion, - index_type, - ) - .await; - - (original, ds) -} - -#[derive(Clone, Copy, Debug)] -enum Fragmentation { - SingleFragment, - MultiFragment, -} - -#[derive(Clone, Copy, Debug)] -enum DeletionState { - NoDeletions, - DeleteOdd, - DeleteEven, -} - -/// Helper function to create a dataset with specific configuration -async fn create_dataset_with_config( - data: RecordBatch, - fragmentation: Fragmentation, 
- deletion_state: DeletionState, - index_type: Option, -) -> Dataset { - // Configure fragmentation - let mut ds = match fragmentation { - Fragmentation::SingleFragment => InsertBuilder::new("memory://") - .execute(vec![data]) - .await - .unwrap(), - Fragmentation::MultiFragment => { - let params = WriteParams { - max_rows_per_file: 10, - ..Default::default() - }; - InsertBuilder::new("memory://") - .with_params(¶ms) - .execute(vec![data]) - .await - .unwrap() - } - }; - - // Apply deletions - apply_deletions(&mut ds, deletion_state).await; - - // Create index if specified - if let Some(idx_type) = index_type { - use lance_index::scalar::ScalarIndexParams; - let params = ScalarIndexParams::default(); - ds.create_index(&["value"], idx_type, None, ¶ms, false) - .await - .unwrap(); - } - - ds -} - -/// Create a record batch that has 60 rows and two columns: id (incremental int32) -/// and value (which should match DataType). -/// -/// The values in `value` should include edge cases for the specific data type: -/// * Nulls for all types -/// * For floats: NaN, Infinity, +/-0 -/// * For integers: min / max -/// * For lists: nullability and empty values at all levels -fn test_batch(data_type: &DataType) -> RecordBatch { - let num_rows = 60; - let id: ArrayRef = Arc::new(Int32Array::from_iter(0..num_rows as i32)); - - // For now, create a simple value column based on data type - // TODO: Add edge cases for each type - let value: ArrayRef = match data_type { - DataType::Boolean => Arc::new(BooleanArray::from_iter((0..num_rows).map(|i| { - if i % 10 == 0 { - None - } else { - Some(i % 2 == 0) - } - }))), - DataType::Int32 => Arc::new(Int32Array::from_iter((0..num_rows).map(|i| { - if i % 10 == 0 { - None - } else { - Some(i as i32) - } - }))), - _ => todo!("Implement test data generation for {:?}", data_type), - }; - - let schema = Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("value", data_type.clone(), true), - ]); - - RecordBatch::try_new(Arc::new(schema), vec![id, value]).unwrap() -} - -async fn apply_deletions(dataset: &mut Dataset, state: DeletionState) { - match state { - DeletionState::NoDeletions => {} - DeletionState::DeleteOdd => { - dataset.delete("id % 2 = 1").await.unwrap(); - } - DeletionState::DeleteEven => { - dataset.delete("id % 2 = 0").await.unwrap(); - } - } -} - -#[tokio::test] -async fn test_query_bool() { - // TODO: pull out data generator (so it can be re-used an inspected). 
- let original = gen_batch() - .col("id", array::sequence_i32(0, 60)) - .col("value", array::cycle_bool(vec![true, false]).with_nulls(2)) - .into_batch_rows(RowCount::new(60)) - .unwrap(); - test_states( - // Rename: for_states_and_indices() - TABLE_STATES, - [None, Some(IndexType::Bitmap), Some(IndexType::BTree)], - |ds: Dataset, original: RecordBatch| async move { - test_scan(&original, &ds).await; - test_take(&original, &ds).await; - test_filter(&original, &ds, "value").await; - test_filter(&original, &ds, "!value").await; - }, - ) - .await -} - -async fn test_scan(_original: &RecordBatch, _ds: &Dataset) { - todo!("validate that if you scan ds, then sort by id, you get original back.") -} - -async fn test_take(_original: &RecordBatch, _ds: &Dataset) { - todo!("generate a few sets of ids and validate we can call take against the RB and the DS and get the same result."); -} - -async fn test_filter(_original: &RecordBatch, _ds: &Dataset, _predicate: &str) { - todo!("Scan ds with the predicate"); -} diff --git a/rust/lance/tests/integration_tests.rs b/rust/lance/tests/integration_tests.rs new file mode 100644 index 00000000000..90ad5a7f855 --- /dev/null +++ b/rust/lance/tests/integration_tests.rs @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +// NOTE: we only create one integration test binary, to keep compilation overhead down. + +mod query; +mod utils; diff --git a/rust/lance/tests/query/mod.rs b/rust/lance/tests/query/mod.rs new file mode 100644 index 00000000000..16772eb6bde --- /dev/null +++ b/rust/lance/tests/query/mod.rs @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use arrow_array::RecordBatch; +use lance::Dataset; + +mod primitives; +mod vectors; + +async fn test_scan(_original: &RecordBatch, _ds: &Dataset) { + todo!("validate that if you scan ds, then sort by id, you get original back.") +} + +async fn test_take(_original: &RecordBatch, _ds: &Dataset) { + todo!("generate a few sets of ids and validate we can call take against the RB and the DS and get the same result."); +} + +async fn test_filter(_original: &RecordBatch, _ds: &Dataset, _predicate: &str) { + todo!("Scan ds with the predicate"); +} + +async fn test_ann(_original: &RecordBatch, _ds: &Dataset, _predicate: Option<&str>) { + todo!("Scan ds with the ANN predicate"); +} diff --git a/rust/lance/tests/query/primitives.rs b/rust/lance/tests/query/primitives.rs new file mode 100644 index 00000000000..f500b31d5e3 --- /dev/null +++ b/rust/lance/tests/query/primitives.rs @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use arrow::datatypes::Int32Type; +use arrow_array::RecordBatch; +use lance::Dataset; + +use lance_datagen::{array, gen_batch, ArrayGeneratorExt, RowCount}; +use lance_index::IndexType; + +use super::{test_filter, test_scan, test_take}; +use crate::utils::DatasetTestCases; + +#[tokio::test] +async fn test_query_bool() { + let batch = gen_batch() + .col("id", array::step::()) + .col( + "value", + array::cycle_bool(vec![true, false]).with_random_nulls(0.1), + ) + .into_batch_rows(RowCount::from(60)) + .unwrap(); + DatasetTestCases::from_data(batch) + .with_index_types( + "value", + [None, Some(IndexType::Bitmap), Some(IndexType::BTree)], + ) + .run(|ds: Dataset, original: RecordBatch| async move { + test_scan(&original, &ds).await; + test_take(&original, &ds).await; + test_filter(&original, &ds, "value").await; + 
test_filter(&original, &ds, "!value").await; + }) + .await +} + +#[tokio::test] +async fn test_query_integers() { + todo!() +} + +#[tokio::test] +async fn test_query_floats() { + todo!() +} + +#[tokio::test] +async fn test_query_decimals() { + todo!() +} + +#[tokio::test] +async fn test_query_strings() { + todo!() +} + +#[tokio::test] +async fn test_query_timestamps() { + todo!() +} diff --git a/rust/lance/tests/query/vectors.rs b/rust/lance/tests/query/vectors.rs new file mode 100644 index 00000000000..b5a7b577b19 --- /dev/null +++ b/rust/lance/tests/query/vectors.rs @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use super::{test_ann, test_scan, test_take}; +use crate::utils::DatasetTestCases; +use arrow::datatypes::{Date32Type, Float32Type, Int32Type}; +use arrow_array::RecordBatch; +use lance::Dataset; +use lance_datagen::{array, gen_batch, ArrayGeneratorExt, Dimension, RowCount}; +use lance_index::IndexType; + +#[tokio::test] +async fn test_query_vector() { + todo!() +} + +#[tokio::test] +async fn test_query_prefilter_date() { + let batch = gen_batch() + .col("id", array::step::()) + .col("value", array::step::().with_random_nulls(0.1)) + .col("vec", array::rand_vec::(Dimension::from(16))) + .into_batch_rows(RowCount::from(60)) + .unwrap(); + DatasetTestCases::from_data(batch) + .with_index_types("value", [None, Some(IndexType::BTree)]) + .with_index_types( + "vec", + [ + None, + Some(IndexType::IvfPq), + Some(IndexType::IvfFlat), + Some(IndexType::IvfHnswFlat), + ], + ) + .run(|ds: Dataset, original: RecordBatch| async move { + test_scan(&original, &ds).await; + test_take(&original, &ds).await; + test_ann(&original, &ds, Some("value is not null")).await; + test_ann( + &original, + &ds, + Some("value >= DATE '2020-06-01' AND value <= DATE '2021-06-01'"), + ) + .await; + }) + .await +} diff --git a/rust/lance/tests/utils/mod.rs b/rust/lance/tests/utils/mod.rs new file mode 100644 index 00000000000..7ad06483a0a --- /dev/null +++ b/rust/lance/tests/utils/mod.rs @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::panic::AssertUnwindSafe; + +use arrow_array::RecordBatch; +use futures::FutureExt; +use lance::Dataset; +use lance_index::IndexType; + +#[derive(Clone, Copy, Debug)] +pub enum Fragmentation { + /// All data in a single file. + SingleFragment, + /// Data is spread across multiple fragments, one file per fragment. + MultiFragment, +} + +#[derive(Clone, Copy, Debug)] +pub enum DeletionState { + /// No deletions are applied. + NoDeletions, + /// Delete odd rows. + DeleteOdd, + /// Delete even rows. 
+ DeleteEven, +} + +pub struct DatasetTestCases { + original: RecordBatch, + index_options: Vec<(String, Vec>)>, +} + +impl DatasetTestCases { + pub fn from_data(original: RecordBatch) -> Self { + Self { + original, + index_options: Vec::new(), + } + } + + pub fn with_index_types( + mut self, + column: impl Into, + index_types: impl IntoIterator>, + ) -> Self { + self.index_options + .push((column.into(), index_types.into_iter().collect())); + self + } + + fn generate_index_combinations(&self) -> Vec> { + let mut combinations = Vec::new(); + for (column, index_types) in &self.index_options { + for index_type in index_types { + if let Some(index_type) = index_type { + combinations.push(vec![(column.as_str(), index_type.clone())]); + } + } + } + combinations + } + + pub async fn run(self, test_fn: F) -> Fut::Output + where + F: Fn(Dataset, RecordBatch) -> Fut, + Fut: std::future::Future, + { + for fragmentation in [Fragmentation::SingleFragment, Fragmentation::MultiFragment] { + for deletion in [ + DeletionState::NoDeletions, + DeletionState::DeleteOdd, + DeletionState::DeleteEven, + ] { + let index_combinations = self.generate_index_combinations(); + for indices in index_combinations { + let ds = + build_dataset(self.original.clone(), fragmentation, deletion, &indices); + let context = format!( + "fragmentation: {:?}, deletion: {:?}, index: {:?}", + fragmentation, deletion, indices + ); + // Catch unwind so we can add test context to the panic. + AssertUnwindSafe(test_fn(ds, self.original.clone())) + .catch_unwind() + .await + .unwrap_or_else(|_| panic!("Test failed for {}", context)); + } + } + } + } +} + +/// Create an in-memory dataset with the given state and data. +/// +/// The data in dataset will exactly match the `original` batch. (Extra rows are +/// created for the deleted rows created by `DeletionState`.) +fn build_dataset( + original: RecordBatch, + fragmentation: Fragmentation, + deletion: DeletionState, + indices: &[(&str, IndexType)], +) -> Dataset { + todo!() +} From 4b7257d5de62c54f517c1fc6d1ca464f6be95650 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 17 Sep 2025 08:36:54 -0700 Subject: [PATCH 03/18] feat: implement test utils for comprehensive dataset testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement fill_deleted_rows function to simulate deleted data by interleaving filler rows with id=-1. Add dynamic index parameter generation supporting scalar (BTree, Bitmap), vector (IvfFlat, IvfPq), and FTS (Inverted) indices. 
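As a standalone sketch (not part of this patch), the filler-row interleaving described
above can be illustrated with arrow's `interleave_record_batch` kernel; the two-row `id`
batch below is an assumption made only for the example:

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Int32Array};
use arrow::compute::interleave_record_batch;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;

fn main() {
    let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));

    // Rows that must survive ("original" data): ids 1 and 2.
    let original_cols: Vec<ArrayRef> = vec![Arc::new(Int32Array::from(vec![1, 2]))];
    let original = RecordBatch::try_new(schema.clone(), original_cols).unwrap();

    // Filler rows carry the sentinel id = -1 and are deleted after the write.
    let filler_cols: Vec<ArrayRef> = vec![Arc::new(Int32Array::from(vec![-1, -1]))];
    let filler = RecordBatch::try_new(schema, filler_cols).unwrap();

    // DeleteOdd pattern: filler, original[0], filler, original[1], ...
    // Each tuple is (batch index, row index); batch 0 = original, batch 1 = filler.
    let indices = [(1, 0), (0, 0), (1, 1), (0, 1)];
    let interleaved = interleave_record_batch(&[&original, &filler], &indices).unwrap();

    // Interleaved ids are [-1, 1, -1, 2]; deleting `id = -1` recovers the original rows.
    assert_eq!(interleaved.num_rows(), 4);
}
```

Deleting `id = -1` from the interleaved batch is exactly how `build_dataset` later turns
the filler rows into deleted rows.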
🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- rust/lance/tests/utils/mod.rs | 153 ++++++++++++++++++++++++++++++++-- 1 file changed, 147 insertions(+), 6 deletions(-) diff --git a/rust/lance/tests/utils/mod.rs b/rust/lance/tests/utils/mod.rs index 7ad06483a0a..4bbe10769e3 100644 --- a/rust/lance/tests/utils/mod.rs +++ b/rust/lance/tests/utils/mod.rs @@ -2,11 +2,18 @@ // SPDX-FileCopyrightText: Copyright The Lance Authors use std::panic::AssertUnwindSafe; +use std::sync::Arc; -use arrow_array::RecordBatch; +use arrow_array::{ArrayRef, Int32Array, RecordBatch}; use futures::FutureExt; -use lance::Dataset; -use lance_index::IndexType; +use lance::index::vector::VectorIndexParams; +use lance::{ + dataset::{InsertBuilder, WriteParams}, + Dataset, +}; +use lance_index::scalar::ScalarIndexParams; +use lance_index::{DatasetIndexExt, IndexParams, IndexType}; +use lance_linalg::distance::MetricType; #[derive(Clone, Copy, Debug)] pub enum Fragmentation { @@ -75,7 +82,8 @@ impl DatasetTestCases { let index_combinations = self.generate_index_combinations(); for indices in index_combinations { let ds = - build_dataset(self.original.clone(), fragmentation, deletion, &indices); + build_dataset(self.original.clone(), fragmentation, deletion, &indices) + .await; let context = format!( "fragmentation: {:?}, deletion: {:?}, index: {:?}", fragmentation, deletion, indices @@ -95,11 +103,144 @@ impl DatasetTestCases { /// /// The data in dataset will exactly match the `original` batch. (Extra rows are /// created for the deleted rows created by `DeletionState`.) -fn build_dataset( +async fn build_dataset( original: RecordBatch, fragmentation: Fragmentation, deletion: DeletionState, indices: &[(&str, IndexType)], ) -> Dataset { - todo!() + let data_to_write = fill_deleted_rows(&original, deletion); + + let max_rows_per_file = if let Fragmentation::MultiFragment = fragmentation { + 3 + } else { + original.num_rows() + 1 + }; + + let mut ds = InsertBuilder::new("memory://") + .with_params(&WriteParams { + max_rows_per_file, + ..Default::default() + }) + .execute(vec![data_to_write]) + .await + .unwrap(); + + ds.delete("id = -1").await.unwrap(); + + assert_eq!(ds.count_rows(None).await.unwrap(), original.num_rows()); + + for (column, index_type) in indices { + // TODO: when possible, make indices cover a portion of rows and not be + // aligned between indices. + let index_params: Box = match index_type { + IndexType::BTree + | IndexType::Bitmap + | IndexType::LabelList + | IndexType::NGram + | IndexType::ZoneMap + | IndexType::Inverted + | IndexType::BloomFilter => Box::new(ScalarIndexParams::for_builtin( + (*index_type).try_into().unwrap(), + )), + IndexType::IvfFlat => { + // Use a small number of partitions for testing + Box::new(VectorIndexParams::ivf_flat(4, MetricType::L2)) + } + IndexType::IvfPq => { + // Simple PQ params for testing + Box::new(VectorIndexParams::ivf_pq(4, 8, 2, MetricType::L2, 10)) + } + _ => { + // For other index types, use default scalar params + Box::new(ScalarIndexParams::default()) + } + }; + + ds.create_index_builder(&[column], *index_type, index_params.as_ref()) + //.fragments() <--- Uncomment this line when implementing fragments + .await + .unwrap(); + } + + ds +} + +/// Insert filler rows into a record batch such that applying deletions to the +/// output will yield the input. 
For example, given the `deletions: DeletionState::DeleteOdd` +/// and the table: +/// +/// ``` +/// id | value +/// 1 | "a" +/// 2 | "b" +/// ``` +/// +/// Produce: +/// +/// ``` +/// id | value +/// -1 | "a" (filler row) +/// 1 | "a" +/// -1 | "a" +/// 2 | "b" +/// ``` +/// +/// The filler row will have the same values as the original row, but with a special +/// identifier (e.g., -1) to indicate that it is a filler row. +fn fill_deleted_rows(batch: &RecordBatch, deletions: DeletionState) -> RecordBatch { + // Early return for no deletions + if let DeletionState::NoDeletions = deletions { + return batch.clone(); + } + + // Create a filler batch by taking the first row and replacing id with -1 + let schema = batch.schema(); + let mut filler_columns: Vec = Vec::new(); + + for (i, field) in schema.fields().iter().enumerate() { + if field.name() == "id" { + // Create an array with a single -1 value + filler_columns.push(Arc::new(Int32Array::from(vec![-1]))); + } else { + // Take the first value from the original column + let original_column = batch.column(i); + let sliced = original_column.slice(0, 1); + filler_columns.push(sliced); + } + } + + let filler_batch = RecordBatch::try_new(schema.clone(), filler_columns).unwrap(); + + // Create an array of filler batches, one for each row that will be deleted + let num_rows = batch.num_rows(); + let filler_batches = vec![filler_batch.clone(); num_rows]; + + // Concatenate all filler batches into one + let all_fillers = arrow_select::concat::concat_batches(&schema, &filler_batches).unwrap(); + + // Create indices for interleaving based on the deletion pattern + // Format: (batch_index, row_index) where batch_index 0 = original, 1 = fillers + let mut indices: Vec<(usize, usize)> = Vec::new(); + + match deletions { + DeletionState::DeleteOdd => { + // Pattern: filler, original[0], filler, original[1], ... + for i in 0..num_rows { + indices.push((1, i)); // filler batch, row i + indices.push((0, i)); // original batch, row i + } + } + DeletionState::DeleteEven => { + // Pattern: original[0], filler, original[1], filler, ... 
+ for i in 0..num_rows { + indices.push((0, i)); // original batch, row i + indices.push((1, i)); // filler batch, row i + } + } + DeletionState::NoDeletions => unreachable!(), + } + + // Use interleave to reorder according to our indices + arrow::compute::interleave_record_batch(&[batch, &all_fillers], &indices).unwrap() } From ee37fc422d9a0d23ecc28d67441002d638d10ed1 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 17 Sep 2025 08:59:20 -0700 Subject: [PATCH 04/18] feat: implement test_scan, test_take, and test_filter for query tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement comprehensive test functions for dataset queries: - test_scan: Verifies scanning with ordering by id column - test_take: Tests taking specific rows by indices, validates against Arrow's take_record_batch - test_filter: Tests filtering with SQL predicates using DataFusion for comparison 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- rust/lance/tests/query/mod.rs | 92 ++++++++++++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 7 deletions(-) diff --git a/rust/lance/tests/query/mod.rs b/rust/lance/tests/query/mod.rs index 16772eb6bde..71b9fcb6d8c 100644 --- a/rust/lance/tests/query/mod.rs +++ b/rust/lance/tests/query/mod.rs @@ -1,22 +1,100 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use arrow_array::RecordBatch; +use std::sync::Arc; + +use arrow_array::{RecordBatch, UInt32Array}; +use arrow_select::concat::concat_batches; +use datafusion::datasource::MemTable; +use datafusion::prelude::SessionContext; +use lance::dataset::scanner::ColumnOrdering; use lance::Dataset; mod primitives; mod vectors; -async fn test_scan(_original: &RecordBatch, _ds: &Dataset) { - todo!("validate that if you scan ds, then sort by id, you get original back.") +async fn test_scan(original: &RecordBatch, ds: &Dataset) { + let mut scanner = ds.scan(); + scanner + .order_by(Some(vec![ColumnOrdering::asc_nulls_first( + "id".to_string(), + )])) + .unwrap(); + let scanned = scanner.try_into_batch().await.unwrap(); + + assert_eq!(original, &scanned); } -async fn test_take(_original: &RecordBatch, _ds: &Dataset) { - todo!("generate a few sets of ids and validate we can call take against the RB and the DS and get the same result."); +async fn test_take(original: &RecordBatch, ds: &Dataset) { + let num_rows = original.num_rows(); + let cases: Vec> = vec![ + vec![0, 1, 2], // First few rows + vec![5, 3, 1], // Out of order + vec![0], // Single row + vec![], // Empty + (0..num_rows.min(10)).collect(), // Sequential + vec![num_rows - 1, 0], // Last and first + ]; + + for indices in cases { + // Skip cases with invalid indices + if indices.iter().any(|&i| i >= num_rows) { + continue; + } + + // Convert to u64 for Lance take + let indices_u64: Vec = indices.iter().map(|&i| i as u64).collect(); + + if indices_u64.is_empty() { + // Skip empty case as Lance may not handle it the same way + continue; + } + + let taken_ds = ds.take(&indices_u64, ds.schema().clone()).await.unwrap(); + + // Take from RecordBatch using arrow::compute + let indices_u32: Vec = indices.iter().map(|&i| i as u32).collect(); + let indices_array = UInt32Array::from(indices_u32); + let taken_rb = arrow::compute::take_record_batch(original, &indices_array).unwrap(); + + assert_eq!( + taken_rb, taken_ds, + "Take results don't match for indices: {:?}", + indices + ); + } } -async fn test_filter(_original: &RecordBatch, _ds: &Dataset, 
_predicate: &str) { - todo!("Scan ds with the predicate"); +async fn test_filter(original: &RecordBatch, ds: &Dataset, predicate: &str) { + // Scan with filter and order + let mut scanner = ds.scan(); + scanner + .filter(predicate) + .unwrap() + .order_by(Some(vec![ColumnOrdering::asc_nulls_first( + "id".to_string(), + )])) + .unwrap(); + let scanned = scanner.try_into_batch().await.unwrap(); + + // Use DataFusion to apply same filter using SQL + // Convert Lance filter syntax to SQL syntax + let sql_predicate = if predicate.starts_with('!') { + format!("NOT {}", &predicate[1..]) + } else { + predicate.to_string() + }; + + let ctx = SessionContext::new(); + let table = MemTable::try_new(original.schema(), vec![vec![original.clone()]]).unwrap(); + ctx.register_table("t", Arc::new(table)).unwrap(); + + let sql = format!("SELECT * FROM t WHERE {} ORDER BY id", sql_predicate); + let df = ctx.sql(&sql).await.unwrap(); + let expected_batches = df.collect().await.unwrap(); + let expected = concat_batches(&original.schema(), &expected_batches).unwrap(); + + assert_eq!(&expected, &scanned); } async fn test_ann(_original: &RecordBatch, _ds: &Dataset, _predicate: Option<&str>) { From c0de557f25d483659c5a508c7555ed44e83974a4 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 17 Sep 2025 11:25:15 -0700 Subject: [PATCH 05/18] get primitive tests working --- rust/lance/tests/query/mod.rs | 10 +--- rust/lance/tests/query/primitives.rs | 77 +++++++++++++++++++--------- 2 files changed, 55 insertions(+), 32 deletions(-) diff --git a/rust/lance/tests/query/mod.rs b/rust/lance/tests/query/mod.rs index 71b9fcb6d8c..dbb9697bcac 100644 --- a/rust/lance/tests/query/mod.rs +++ b/rust/lance/tests/query/mod.rs @@ -77,19 +77,11 @@ async fn test_filter(original: &RecordBatch, ds: &Dataset, predicate: &str) { .unwrap(); let scanned = scanner.try_into_batch().await.unwrap(); - // Use DataFusion to apply same filter using SQL - // Convert Lance filter syntax to SQL syntax - let sql_predicate = if predicate.starts_with('!') { - format!("NOT {}", &predicate[1..]) - } else { - predicate.to_string() - }; - let ctx = SessionContext::new(); let table = MemTable::try_new(original.schema(), vec![vec![original.clone()]]).unwrap(); ctx.register_table("t", Arc::new(table)).unwrap(); - let sql = format!("SELECT * FROM t WHERE {} ORDER BY id", sql_predicate); + let sql = format!("SELECT * FROM t WHERE {} ORDER BY id", predicate); let df = ctx.sql(&sql).await.unwrap(); let expected_batches = df.collect().await.unwrap(); let expected = concat_batches(&original.schema(), &expected_batches).unwrap(); diff --git a/rust/lance/tests/query/primitives.rs b/rust/lance/tests/query/primitives.rs index f500b31d5e3..cb978074d37 100644 --- a/rust/lance/tests/query/primitives.rs +++ b/rust/lance/tests/query/primitives.rs @@ -1,8 +1,9 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use arrow::datatypes::Int32Type; +use arrow::datatypes::*; use arrow_array::RecordBatch; +use arrow_schema::DataType; use lance::Dataset; use lance_datagen::{array, gen_batch, ArrayGeneratorExt, RowCount}; @@ -24,38 +25,68 @@ async fn test_query_bool() { DatasetTestCases::from_data(batch) .with_index_types( "value", - [None, Some(IndexType::Bitmap), Some(IndexType::BTree)], + // TODO: fix bug with bitmap and btree https://github.com/lancedb/lance/issues/4756 + // TODO: fix bug with zone map https://github.com/lancedb/lance/issues/4758 + // TODO: Add boolean to bloom filter supported types 
https://github.com/lancedb/lance/issues/4757 + // [None, Some(IndexType::Bitmap), Some(IndexType::BTree), Some(IndexType::BloomFilter), Some(IndexType::ZoneMap)], + [None], ) .run(|ds: Dataset, original: RecordBatch| async move { test_scan(&original, &ds).await; test_take(&original, &ds).await; test_filter(&original, &ds, "value").await; - test_filter(&original, &ds, "!value").await; + test_filter(&original, &ds, "NOT value").await; }) .await } #[tokio::test] -async fn test_query_integers() { - todo!() -} +#[rstest::rstest] +#[case::int8(DataType::Int8)] +#[case::int16(DataType::Int16)] +#[case::int32(DataType::Int32)] +#[case::int64(DataType::Int64)] +#[case::uint8(DataType::UInt8)] +#[case::uint16(DataType::UInt16)] +#[case::uint32(DataType::UInt32)] +#[case::uint64(DataType::UInt64)] +async fn test_query_integer(#[case] data_type: DataType) { + let value_generator = match data_type { + DataType::Int8 => array::rand_primitive::(data_type), + DataType::Int16 => array::rand_primitive::(data_type), + DataType::Int32 => array::rand_primitive::(data_type), + DataType::Int64 => array::rand_primitive::(data_type), + DataType::UInt8 => array::rand_primitive::(data_type), + DataType::UInt16 => array::rand_primitive::(data_type), + DataType::UInt32 => array::rand_primitive::(data_type), + DataType::UInt64 => array::rand_primitive::(data_type), + _ => unreachable!(), + }; -#[tokio::test] -async fn test_query_floats() { - todo!() -} - -#[tokio::test] -async fn test_query_decimals() { - todo!() -} - -#[tokio::test] -async fn test_query_strings() { - todo!() + let batch = gen_batch() + .col("id", array::step::()) + .col("value", value_generator.with_random_nulls(0.1)) + .into_batch_rows(RowCount::from(60)) + .unwrap(); + DatasetTestCases::from_data(batch) + .with_index_types( + "value", + // TODO: add zone map and bloom filter once we fix https://github.com/lancedb/lance/issues/4758 + [None, Some(IndexType::Bitmap), Some(IndexType::BTree)], + ) + .run(|ds: Dataset, original: RecordBatch| async move { + test_scan(&original, &ds).await; + test_take(&original, &ds).await; + test_filter(&original, &ds, "value > 20").await; + test_filter(&original, &ds, "NOT (value > 20)").await; + test_filter(&original, &ds, "value is null").await; + test_filter(&original, &ds, "value is not null").await; + }) + .await } -#[tokio::test] -async fn test_query_timestamps() { - todo!() -} +// TODO: floats (including NaN, +/-Inf, +/-0) +// TODO: decimals +// TODO: binary +// TODO: strings (including largestrings and view) +// TODO: timestamps From de58c174563a8d09c766d772956a9c74ce6f928a Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 17 Sep 2025 12:46:43 -0700 Subject: [PATCH 06/18] get a working ANN test --- rust/lance/src/index/create.rs | 11 +++- rust/lance/tests/query/mod.rs | 88 +++++++++++++++++++++++++++++-- rust/lance/tests/query/vectors.rs | 29 +++++++--- rust/lance/tests/utils/mod.rs | 39 +++++++++++--- 4 files changed, 147 insertions(+), 20 deletions(-) diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index 2724b3a3cb4..fdac7395dff 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -229,7 +229,16 @@ impl<'a> CreateIndexBuilder<'a> { ) .await? } - (IndexType::Vector, LANCE_VECTOR_INDEX) => { + ( + IndexType::Vector + | IndexType::IvfPq + | IndexType::IvfSq + | IndexType::IvfFlat + | IndexType::IvfHnswFlat + | IndexType::IvfHnswPq + | IndexType::IvfHnswSq, + LANCE_VECTOR_INDEX, + ) => { // Vector index params. 
let vec_params = self .params diff --git a/rust/lance/tests/query/mod.rs b/rust/lance/tests/query/mod.rs index dbb9697bcac..08878def0a1 100644 --- a/rust/lance/tests/query/mod.rs +++ b/rust/lance/tests/query/mod.rs @@ -3,12 +3,20 @@ use std::sync::Arc; -use arrow_array::{RecordBatch, UInt32Array}; +use arrow_array::{cast::AsArray, RecordBatch, UInt32Array}; use arrow_select::concat::concat_batches; use datafusion::datasource::MemTable; use datafusion::prelude::SessionContext; use lance::dataset::scanner::ColumnOrdering; use lance::Dataset; +use lance_datafusion::udf::register_functions; + +/// Creates a fresh SessionContext with Lance UDFs registered +fn create_datafusion_context() -> SessionContext { + let ctx = SessionContext::new(); + register_functions(&ctx); + ctx +} mod primitives; mod vectors; @@ -77,7 +85,7 @@ async fn test_filter(original: &RecordBatch, ds: &Dataset, predicate: &str) { .unwrap(); let scanned = scanner.try_into_batch().await.unwrap(); - let ctx = SessionContext::new(); + let ctx = create_datafusion_context(); let table = MemTable::try_new(original.schema(), vec![vec![original.clone()]]).unwrap(); ctx.register_table("t", Arc::new(table)).unwrap(); @@ -89,6 +97,78 @@ async fn test_filter(original: &RecordBatch, ds: &Dataset, predicate: &str) { assert_eq!(&expected, &scanned); } -async fn test_ann(_original: &RecordBatch, _ds: &Dataset, _predicate: Option<&str>) { - todo!("Scan ds with the ANN predicate"); +/// Test that an exhaustive ANN query gives the same results as brute force +/// KNN against the original batch. +/// +/// By exhaustive ANN, I mean we search all the partitions so we get perfect recall. +async fn test_ann(original: &RecordBatch, ds: &Dataset, column: &str, predicate: Option<&str>) { + // Extract first vector from the column as query vector + let vector_column = original.column_by_name(column).unwrap(); + let fixed_size_list = vector_column.as_fixed_size_list(); + + // Extract the first vector's values as a new array + let vector_values = fixed_size_list + .values() + .slice(0, fixed_size_list.value_length() as usize); + let query_vector = vector_values; + + let mut scanner = ds.scan(); + scanner + .nearest(column, query_vector.as_ref(), 10) + .unwrap() + .prefilter(true) + .refine(2); + if let Some(pred) = predicate { + scanner.filter(pred).unwrap(); + } + let result = scanner.try_into_batch().await.unwrap(); + + // Use DataFusion to apply same vector search using SQL + let ctx = create_datafusion_context(); + let table = MemTable::try_new(original.schema(), vec![vec![original.clone()]]).unwrap(); + ctx.register_table("t", Arc::new(table)).unwrap(); + + // Convert query vector to SQL array literal + let float_array = query_vector.as_primitive::(); + let vector_values_str = float_array + .values() + .iter() + .map(|v| v.to_string()) + .collect::>() + .join(", "); + + let sql = format!( + "SELECT * FROM t {} ORDER BY array_distance(t.{}, [{}]) LIMIT 10", + if let Some(pred) = predicate { + format!("WHERE {}", pred) + } else { + String::new() + }, + column, + vector_values_str + ); + + let df = ctx.sql(&sql).await.unwrap(); + let expected_batches = df.collect().await.unwrap(); + let expected = concat_batches(&original.schema(), &expected_batches).unwrap(); + + // Compare only the main data (excluding _distance column which Lance adds) + // We validate that both return the same number of rows and same row ordering + assert_eq!( + expected.num_rows(), + result.num_rows(), + "Different number of results" + ); + + // Compare the first few columns 
(excluding _distance) + for (col_idx, field) in original.schema().fields().iter().enumerate() { + let expected_col = expected.column(col_idx); + let result_col = result.column(col_idx); + assert_eq!( + expected_col, + result_col, + "Column '{}' differs between DataFusion and Lance results", + field.name() + ); + } } diff --git a/rust/lance/tests/query/vectors.rs b/rust/lance/tests/query/vectors.rs index b5a7b577b19..019d4cfc073 100644 --- a/rust/lance/tests/query/vectors.rs +++ b/rust/lance/tests/query/vectors.rs @@ -9,18 +9,28 @@ use lance::Dataset; use lance_datagen::{array, gen_batch, ArrayGeneratorExt, Dimension, RowCount}; use lance_index::IndexType; -#[tokio::test] -async fn test_query_vector() { - todo!() +fn date_as_i32(date: &str) -> i32 { + // Return as i32 days since unix epoch. + use chrono::{NaiveDate, TimeZone, Utc}; + + let parsed_date = + NaiveDate::parse_from_str(date, "%Y-%m-%d").expect("Date should be in YYYY-MM-DD format"); + + let unix_epoch = Utc.timestamp_opt(0, 0).unwrap().date_naive(); + + (parsed_date - unix_epoch).num_days() as i32 } #[tokio::test] async fn test_query_prefilter_date() { let batch = gen_batch() .col("id", array::step::()) - .col("value", array::step::().with_random_nulls(0.1)) + .col( + "value", + array::step_custom::(date_as_i32("2020-01-01"), 1).with_random_nulls(0.1), + ) .col("vec", array::rand_vec::(Dimension::from(16))) - .into_batch_rows(RowCount::from(60)) + .into_batch_rows(RowCount::from(256)) .unwrap(); DatasetTestCases::from_data(batch) .with_index_types("value", [None, Some(IndexType::BTree)]) @@ -29,18 +39,23 @@ async fn test_query_prefilter_date() { [ None, Some(IndexType::IvfPq), + Some(IndexType::IvfSq), Some(IndexType::IvfFlat), Some(IndexType::IvfHnswFlat), + Some(IndexType::IvfHnswPq), + Some(IndexType::IvfHnswSq), ], ) .run(|ds: Dataset, original: RecordBatch| async move { test_scan(&original, &ds).await; test_take(&original, &ds).await; - test_ann(&original, &ds, Some("value is not null")).await; + test_ann(&original, &ds, "vec", None).await; + test_ann(&original, &ds, "vec", Some("value is not null")).await; test_ann( &original, &ds, - Some("value >= DATE '2020-06-01' AND value <= DATE '2021-06-01'"), + "vec", + Some("value >= DATE '2020-01-03' AND value <= DATE '2020-01-25'"), ) .await; }) diff --git a/rust/lance/tests/utils/mod.rs b/rust/lance/tests/utils/mod.rs index 4bbe10769e3..2b2d00c9803 100644 --- a/rust/lance/tests/utils/mod.rs +++ b/rust/lance/tests/utils/mod.rs @@ -12,8 +12,12 @@ use lance::{ Dataset, }; use lance_index::scalar::ScalarIndexParams; +use lance_index::vector::hnsw::builder::HnswBuildParams; +use lance_index::vector::ivf::IvfBuildParams; +use lance_index::vector::pq::PQBuildParams; +use lance_index::vector::sq::builder::SQBuildParams; use lance_index::{DatasetIndexExt, IndexParams, IndexType}; -use lance_linalg::distance::MetricType; +use lance_linalg::distance::{DistanceType, MetricType}; #[derive(Clone, Copy, Debug)] pub enum Fragmentation { @@ -59,10 +63,8 @@ impl DatasetTestCases { fn generate_index_combinations(&self) -> Vec> { let mut combinations = Vec::new(); for (column, index_types) in &self.index_options { - for index_type in index_types { - if let Some(index_type) = index_type { - combinations.push(vec![(column.as_str(), index_type.clone())]); - } + for index_type in index_types.iter().flatten() { + combinations.push(vec![(column.as_str(), *index_type)]); } } combinations @@ -145,12 +147,33 @@ async fn build_dataset( )), IndexType::IvfFlat => { // Use a small number of partitions for 
testing - Box::new(VectorIndexParams::ivf_flat(4, MetricType::L2)) + Box::new(VectorIndexParams::ivf_flat(2, MetricType::L2)) } IndexType::IvfPq => { // Simple PQ params for testing - Box::new(VectorIndexParams::ivf_pq(4, 8, 2, MetricType::L2, 10)) + Box::new(VectorIndexParams::ivf_pq(2, 8, 2, MetricType::L2, 10)) } + IndexType::IvfSq => Box::new(VectorIndexParams::with_ivf_sq_params( + DistanceType::L2, + IvfBuildParams::new(2), + SQBuildParams::default(), + )), + IndexType::IvfHnswFlat => Box::new(VectorIndexParams::with_ivf_flat_params( + DistanceType::L2, + IvfBuildParams::new(2), + )), + IndexType::IvfHnswPq => Box::new(VectorIndexParams::with_ivf_hnsw_pq_params( + DistanceType::L2, + IvfBuildParams::new(2), + HnswBuildParams::default(), + PQBuildParams::new(2, 8), + )), + IndexType::IvfHnswSq => Box::new(VectorIndexParams::with_ivf_hnsw_sq_params( + DistanceType::L2, + IvfBuildParams::new(2), + HnswBuildParams::default(), + SQBuildParams::default(), + )), _ => { // For other index types, use default scalar params Box::new(ScalarIndexParams::default()) @@ -214,7 +237,7 @@ fn fill_deleted_rows(batch: &RecordBatch, deletions: DeletionState) -> RecordBat // Create an array of filler batches, one for each row that will be deleted let num_rows = batch.num_rows(); - let filler_batches = vec![filler_batch.clone(); num_rows]; + let filler_batches = vec![filler_batch; num_rows]; // Concatenate all filler batches into one let all_fillers = arrow_select::concat::concat_batches(&schema, &filler_batches).unwrap(); From 5ee02c632f8f5256496c9291e2dee0626979eacb Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 17 Sep 2025 13:19:09 -0700 Subject: [PATCH 07/18] add index fragment coverage --- rust/lance/src/dataset.rs | 2 -- rust/lance/tests/query/mod.rs | 5 +++++ rust/lance/tests/utils/mod.rs | 27 ++++++++++++++++++++++++--- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 33ba4ec86db..722ba7c97e1 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -64,8 +64,6 @@ use tracing::{info, instrument}; pub(crate) mod blob; mod branch_location; -#[cfg(test)] -mod broad_test; pub mod builder; pub mod cleanup; pub mod delta; diff --git a/rust/lance/tests/query/mod.rs b/rust/lance/tests/query/mod.rs index 08878def0a1..88fd4bf1529 100644 --- a/rust/lance/tests/query/mod.rs +++ b/rust/lance/tests/query/mod.rs @@ -21,6 +21,7 @@ fn create_datafusion_context() -> SessionContext { mod primitives; mod vectors; +/// Scanning an ordering by id should give same result as original. async fn test_scan(original: &RecordBatch, ds: &Dataset) { let mut scanner = ds.scan(); scanner @@ -33,6 +34,7 @@ async fn test_scan(original: &RecordBatch, ds: &Dataset) { assert_eq!(original, &scanned); } +/// Taking specific rows should give the same result as taking from the original. async fn test_take(original: &RecordBatch, ds: &Dataset) { let num_rows = original.num_rows(); let cases: Vec> = vec![ @@ -73,6 +75,8 @@ async fn test_take(original: &RecordBatch, ds: &Dataset) { } } +/// Querying with filter should give same result as filtering original +/// record batch in DataFusion. async fn test_filter(original: &RecordBatch, ds: &Dataset, predicate: &str) { // Scan with filter and order let mut scanner = ds.scan(); @@ -137,6 +141,7 @@ async fn test_ann(original: &RecordBatch, ds: &Dataset, column: &str, predicate: .collect::>() .join(", "); + // DataFusion's built-in `array_distance` function uses L2 distance. 
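+    // The vector indices created in `build_dataset` also use L2, so the brute-force
+    // ranking computed here is directly comparable to what the index returns.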
let sql = format!( "SELECT * FROM t {} ORDER BY array_distance(t.{}, [{}]) LIMIT 10", if let Some(pred) = predicate { diff --git a/rust/lance/tests/utils/mod.rs b/rust/lance/tests/utils/mod.rs index 2b2d00c9803..fa447e0b9b8 100644 --- a/rust/lance/tests/utils/mod.rs +++ b/rust/lance/tests/utils/mod.rs @@ -116,7 +116,7 @@ async fn build_dataset( let max_rows_per_file = if let Fragmentation::MultiFragment = fragmentation { 3 } else { - original.num_rows() + 1 + 1_000_000 }; let mut ds = InsertBuilder::new("memory://") @@ -132,7 +132,13 @@ async fn build_dataset( assert_eq!(ds.count_rows(None).await.unwrap(), original.num_rows()); - for (column, index_type) in indices { + let fragment_ids = ds + .manifest() + .fragments + .iter() + .map(|f| f.id as u32) + .collect::>(); + for (fragment_offset, (column, index_type)) in indices.iter().enumerate() { // TODO: when possible, make indices cover a portion of rows and not be // aligned between indices. let index_params: Box = match index_type { @@ -180,8 +186,23 @@ async fn build_dataset( } }; + // Index all but one fragment. Each new index we increase the offset. + // Start at the offset and then count (wrapping around if needed) + // fragments.length - 1. + // So for 3 fragments: + // Index 1: [0, 1] + // Index 2: [1, 2] + // Index 3: [2, 0] + let fragments_to_index = fragment_ids + .iter() + .cycle() + .skip(fragment_offset) + .take((fragment_ids.len() - 1).max(1)) + .cloned() + .collect::>(); + ds.create_index_builder(&[column], *index_type, index_params.as_ref()) - //.fragments() <--- Uncomment this line when implementing fragments + .fragments(fragments_to_index) .await .unwrap(); } From eb31e553099cfa81d9e606a36ed9c2525008fba7 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 4 Nov 2025 10:09:51 -0800 Subject: [PATCH 08/18] add workflow back --- .github/workflows/rust.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 3b17c8c9c5e..12909e09da8 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -145,6 +145,30 @@ jobs: run: | ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` cargo test --profile ci --locked --features ${ALL_FEATURES} + query-integration-tests: + runs-on: ubuntu-2404-4x-arm64 + timeout-minutes: 75 + steps: + - uses: actions/checkout@v4 + - name: Setup rust toolchain + run: | + rustup toolchain install stable + rustup default stable + - uses: rui314/setup-mold@v1 + - uses: Swatinem/rust-cache@v2 + with: + cache-targets: false + cache-workspace-crates: true + - name: Install dependencies + run: | + sudo apt -y -qq update + sudo apt install -y protobuf-compiler libssl-dev pkg-config + - name: Build query integration tests + run: | + cargo build --locked --profile release-with-debug --features fp16kernels --tests --test integration_tests + - name: Run query integration tests + run: | + cargo test --locked --profile release-with-debug --features fp16kernels --test integration_tests build-no-lock: runs-on: warp-ubuntu-latest-x64-8x timeout-minutes: 30 From fee88b89a87a9ecb744849092869398d037de536 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 4 Nov 2025 14:53:16 -0800 Subject: [PATCH 09/18] add ci --- .github/workflows/rust.yml | 22 +++++++++++----------- rust/lance/Cargo.toml | 2 ++ rust/lance/tests/query/primitives.rs | 1 + rust/lance/tests/query/vectors.rs | 1 + 4 files changed, 15 insertions(+), 11 
deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 12909e09da8..5b64c29085a 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -63,7 +63,7 @@ jobs: sudo apt install -y protobuf-compiler libssl-dev - name: Get features run: | - ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` + ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v -e protoc -e slow_tests | sort | uniq | paste -s -d "," -` echo "ALL_FEATURES=${ALL_FEATURES}" >> $GITHUB_ENV - name: Clippy run: cargo clippy --profile ci --locked --features ${{ env.ALL_FEATURES }} --all-targets -- -D warnings @@ -109,7 +109,7 @@ jobs: - name: Run tests if: ${{ matrix.toolchain == 'stable' }} run: | - ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` + ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v -e protoc -e slow_tests | sort | uniq | paste -s -d "," -` cargo llvm-cov --profile ci --locked --workspace --codecov --output-path coverage.codecov --features ${ALL_FEATURES} - name: Upload coverage to Codecov if: ${{ matrix.toolchain == 'stable' }} @@ -137,16 +137,16 @@ jobs: sudo apt install -y protobuf-compiler libssl-dev pkg-config - name: Build tests run: | - ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` + ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v -e protoc -e slow_tests | sort | uniq | paste -s -d "," -` cargo test --profile ci --locked --features ${ALL_FEATURES} --no-run - name: Start DynamodDB and S3 run: docker compose -f docker-compose.yml up -d --wait - name: Run tests run: | - ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` + ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v -e protoc -e slow_tests | sort | uniq | paste -s -d "," -` cargo test --profile ci --locked --features ${ALL_FEATURES} query-integration-tests: - runs-on: ubuntu-2404-4x-arm64 + runs-on: ubuntu-2404-8x-arm64 timeout-minutes: 75 steps: - uses: actions/checkout@v4 @@ -163,12 +163,12 @@ jobs: run: | sudo apt -y -qq update sudo apt install -y protobuf-compiler libssl-dev pkg-config - - name: Build query integration tests - run: | - cargo build --locked --profile release-with-debug --features fp16kernels --tests --test integration_tests + # - name: Build query integration tests + # run: | + # cargo build --locked -p lance --profile release-with-debug --features fp16kernels --tests --test integration_tests - name: Run query integration tests run: | - cargo test --locked --profile release-with-debug --features fp16kernels --test integration_tests + cargo test --locked -p lance --profile release-with-debug --features fp16kernels,slow_tests --test integration_tests build-no-lock: runs-on: warp-ubuntu-latest-x64-8x timeout-minutes: 30 @@ -188,7 +188,7 @@ jobs: sudo apt install -y protobuf-compiler libssl-dev - name: Build all run: | - ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | 
From 0d2621be4bcd2d346a4c0cf9b431fcc73d03a9a0 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Tue, 4 Nov 2025 14:54:08 -0800
Subject: [PATCH 10/18] simplify

---
 rust/lance/tests/integration_tests.rs | 1 +
 rust/lance/tests/query/primitives.rs  | 1 -
 rust/lance/tests/query/vectors.rs     | 1 -
 3 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/rust/lance/tests/integration_tests.rs b/rust/lance/tests/integration_tests.rs
index 90ad5a7f855..4bbdc894b1e 100644
--- a/rust/lance/tests/integration_tests.rs
+++ b/rust/lance/tests/integration_tests.rs
@@ -3,5 +3,6 @@
 
 // NOTE: we only create one integration test binary, to keep compilation overhead down.
 
+#[cfg(feature = "slow_tests")]
 mod query;
 mod utils;
diff --git a/rust/lance/tests/query/primitives.rs b/rust/lance/tests/query/primitives.rs
index 6813d4dc3e0..cb978074d37 100644
--- a/rust/lance/tests/query/primitives.rs
+++ b/rust/lance/tests/query/primitives.rs
@@ -41,7 +41,6 @@ async fn test_query_bool() {
 }
 
 #[tokio::test]
-#[cfg(feature = "slow_tests")]
 #[rstest::rstest]
 #[case::int8(DataType::Int8)]
 #[case::int16(DataType::Int16)]
diff --git a/rust/lance/tests/query/vectors.rs b/rust/lance/tests/query/vectors.rs
index cf1835d5eb9..019d4cfc073 100644
--- a/rust/lance/tests/query/vectors.rs
+++ b/rust/lance/tests/query/vectors.rs
@@ -22,7 +22,6 @@ fn date_as_i32(date: &str) -> i32 {
 }
 
 #[tokio::test]
-#[cfg(feature = "slow_tests")]
 async fn test_query_prefilter_date() {
     let batch = gen_batch()
         .col("id", array::step::())
From 7187672ae07511701422448a553d023844e259a9 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Tue, 4 Nov 2025 16:21:26 -0800
Subject: [PATCH 11/18] Faster build in CI

---
 .github/workflows/rust.yml | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 5b64c29085a..806d9c1fdba 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -148,6 +148,9 @@ jobs:
   query-integration-tests:
     runs-on: ubuntu-2404-8x-arm64
     timeout-minutes: 75
+    env:
+      # We use opt-level 1 which makes some tests 5x faster to run.
+      RUSTFLAGS: "-C debuginfo=1 -C opt-level 1"
     steps:
       - uses: actions/checkout@v4
       - name: Setup rust toolchain
@@ -163,12 +166,12 @@ jobs:
         run: |
          sudo apt -y -qq update
          sudo apt install -y protobuf-compiler libssl-dev pkg-config
-      # - name: Build query integration tests
-      #   run: |
-      #     cargo build --locked -p lance --profile release-with-debug --features fp16kernels --tests --test integration_tests
+      - name: Build query integration tests
+        run: |
+          cargo build --locked -p lance --no-default-features --features fp16kernels,slow_tests --tests --test integration_tests
       - name: Run query integration tests
         run: |
-          cargo test --locked -p lance --profile release-with-debug --features fp16kernels,slow_tests --test integration_tests
+          cargo test --locked -p lance --no-default-features --features fp16kernels,slow_tests --test integration_tests
   build-no-lock:
     runs-on: warp-ubuntu-latest-x64-8x
     timeout-minutes: 30

From 4557372a50664e0665b83165eb03f00ae98c688d Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Tue, 4 Nov 2025 16:35:29 -0800
Subject: [PATCH 12/18] comment out flakey cases

---
 rust/lance/tests/query/vectors.rs |  7 ++++---
 rust/lance/tests/utils/mod.rs     | 28 +++------------------------
 2 files changed, 7 insertions(+), 28 deletions(-)

diff --git a/rust/lance/tests/query/vectors.rs b/rust/lance/tests/query/vectors.rs
index 019d4cfc073..9d8c640a7e9 100644
--- a/rust/lance/tests/query/vectors.rs
+++ b/rust/lance/tests/query/vectors.rs
@@ -41,9 +41,10 @@ async fn test_query_prefilter_date() {
             Some(IndexType::IvfPq),
             Some(IndexType::IvfSq),
             Some(IndexType::IvfFlat),
-            Some(IndexType::IvfHnswFlat),
-            Some(IndexType::IvfHnswPq),
-            Some(IndexType::IvfHnswSq),
+            // TODO: HNSW results are very flakey.
+            // Some(IndexType::IvfHnswFlat),
+            // Some(IndexType::IvfHnswPq),
+            // Some(IndexType::IvfHnswSq),
         ],
     )
     .run(|ds: Dataset, original: RecordBatch| async move {
diff --git a/rust/lance/tests/utils/mod.rs b/rust/lance/tests/utils/mod.rs
index fa447e0b9b8..ff6b903501a 100644
--- a/rust/lance/tests/utils/mod.rs
+++ b/rust/lance/tests/utils/mod.rs
@@ -132,13 +132,7 @@ async fn build_dataset(
 
     assert_eq!(ds.count_rows(None).await.unwrap(), original.num_rows());
 
-    let fragment_ids = ds
-        .manifest()
-        .fragments
-        .iter()
-        .map(|f| f.id as u32)
-        .collect::<Vec<u32>>();
-    for (fragment_offset, (column, index_type)) in indices.iter().enumerate() {
+    for (column, index_type) in indices.iter() {
         // TODO: when possible, make indices cover a portion of rows and not be
         // aligned between indices.
         let index_params: Box<dyn IndexParams> = match index_type {
@@ -171,13 +165,13 @@ async fn build_dataset(
             IndexType::IvfHnswPq => Box::new(VectorIndexParams::with_ivf_hnsw_pq_params(
                 DistanceType::L2,
                 IvfBuildParams::new(2),
-                HnswBuildParams::default(),
+                HnswBuildParams::default().ef_construction(200),
                 PQBuildParams::new(2, 8),
             )),
             IndexType::IvfHnswSq => Box::new(VectorIndexParams::with_ivf_hnsw_sq_params(
                 DistanceType::L2,
                 IvfBuildParams::new(2),
-                HnswBuildParams::default(),
+                HnswBuildParams::default().ef_construction(200),
                 SQBuildParams::default(),
             )),
             _ => {
@@ -186,23 +180,7 @@ async fn build_dataset(
             }
         };
 
-        // Index all but one fragment. Each new index we increase the offset.
-        // Start at the offset and then count (wrapping around if needed)
-        // fragments.length - 1.
-        // So for 3 fragments:
-        // Index 1: [0, 1]
-        // Index 2: [1, 2]
-        // Index 3: [2, 0]
-        let fragments_to_index = fragment_ids
-            .iter()
-            .cycle()
-            .skip(fragment_offset)
-            .take((fragment_ids.len() - 1).max(1))
-            .cloned()
-            .collect::<Vec<u32>>();
-
         ds.create_index_builder(&[column], *index_type, index_params.as_ref())
-            .fragments(fragments_to_index)
             .await
             .unwrap();
     }
From 96a26f3d1a395d92aaf1d076273f2b749278434c Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Wed, 5 Nov 2025 08:09:33 -0800
Subject: [PATCH 13/18] fix ci

---
 .github/workflows/rust.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 806d9c1fdba..82ef21680f9 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -63,7 +63,7 @@ jobs:
           sudo apt install -y protobuf-compiler libssl-dev
       - name: Get features
         run: |
-          ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v -e protoc -e slow_tests | sort | uniq | paste -s -d "," -`
+          ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | sort | uniq | paste -s -d "," -`
           echo "ALL_FEATURES=${ALL_FEATURES}" >> $GITHUB_ENV
       - name: Clippy
         run: cargo clippy --profile ci --locked --features ${{ env.ALL_FEATURES }} --all-targets -- -D warnings
@@ -150,7 +150,7 @@ jobs:
     timeout-minutes: 75
     env:
       # We use opt-level 1 which makes some tests 5x faster to run.
-      RUSTFLAGS: "-C debuginfo=1 -C opt-level 1"
+      RUSTFLAGS: "-C debuginfo=1 -C opt-level=1"
     steps:
       - uses: actions/checkout@v4
       - name: Setup rust toolchain

From 973e1e268b91cf87e003ea94899304f715bd8942 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Wed, 5 Nov 2025 09:45:37 -0800
Subject: [PATCH 14/18] Address PR review feedback on integration test framework
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix generate_index_combinations to create cartesian product of all index
  options across columns, enabling tests with multiple indices
- Add better error messages with expect() at key test setup points
- Add test cases for duplicate indices in test_take
- Document why _distance column isn't validated in ANN tests
- Document that index parameters are for deterministic small test datasets

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
---
 rust/lance/tests/integration_tests.rs |  1 +
 rust/lance/tests/query/mod.rs         | 23 +++++++-----
 rust/lance/tests/utils/mod.rs         | 52 ++++++++++++++++++++++-----
 3 files changed, 60 insertions(+), 16 deletions(-)

diff --git a/rust/lance/tests/integration_tests.rs b/rust/lance/tests/integration_tests.rs
index 4bbdc894b1e..81c2535dd9c 100644
--- a/rust/lance/tests/integration_tests.rs
+++ b/rust/lance/tests/integration_tests.rs
@@ -5,4 +5,5 @@
 
 #[cfg(feature = "slow_tests")]
 mod query;
+#[cfg(feature = "slow_tests")]
 mod utils;
diff --git a/rust/lance/tests/query/mod.rs b/rust/lance/tests/query/mod.rs
index 88fd4bf1529..6e1c7ff3f57 100644
--- a/rust/lance/tests/query/mod.rs
+++ b/rust/lance/tests/query/mod.rs
@@ -38,12 +38,15 @@
 async fn test_take(original: &RecordBatch, ds: &Dataset) {
     let num_rows = original.num_rows();
     let cases: Vec<Vec<usize>> = vec![
-        vec![0, 1, 2],                   // First few rows
-        vec![5, 3, 1],                   // Out of order
-        vec![0],                         // Single row
-        vec![],                          // Empty
-        (0..num_rows.min(10)).collect(), // Sequential
-        vec![num_rows - 1, 0],           // Last and first
+        vec![0, 1, 2],                    // First few rows
+        vec![5, 3, 1],                    // Out of order
+        vec![0],                          // Single row
+        vec![],                           // Empty
+        (0..num_rows.min(10)).collect(),  // Sequential
+        vec![num_rows - 1, 0],            // Last and first
+        vec![1, 1, 2],                    // Duplicate indices
+        vec![0, 0, 0],                    // All same index
+        vec![num_rows - 1, num_rows - 1], // Duplicate of last row
     ];
 
     for indices in cases {
@@ -157,8 +160,12 @@ async fn test_ann(original: &RecordBatch, ds: &Dataset, column: &str, predicate:
     let expected_batches = df.collect().await.unwrap();
     let expected = concat_batches(&original.schema(), &expected_batches).unwrap();
 
-    // Compare only the main data (excluding _distance column which Lance adds)
-    // We validate that both return the same number of rows and same row ordering
+    // Compare only the main data (excluding _distance column which Lance adds).
+    // We validate that both return the same number of rows and same row ordering.
+    // Note: We don't validate the _distance column values because:
+    // 1. ANN indices provide approximate distances, not exact values
+    // 2. Some distance functions return ordering values (e.g., squared euclidean
+    //    without the final sqrt step) rather than true distances
     assert_eq!(
         expected.num_rows(),
         result.num_rows(),
diff --git a/rust/lance/tests/utils/mod.rs b/rust/lance/tests/utils/mod.rs
index ff6b903501a..9ef9f39b10d 100644
--- a/rust/lance/tests/utils/mod.rs
+++ b/rust/lance/tests/utils/mod.rs
@@ -61,13 +61,39 @@ impl DatasetTestCases {
     }
 
     fn generate_index_combinations(&self) -> Vec<Vec<(&str, IndexType)>> {
-        let mut combinations = Vec::new();
-        for (column, index_types) in &self.index_options {
-            for index_type in index_types.iter().flatten() {
-                combinations.push(vec![(column.as_str(), *index_type)]);
+        if self.index_options.is_empty() {
+            return vec![vec![]];
+        }
+
+        fn generate_recursive<'a>(
+            options: &'a [(String, Vec<Option<IndexType>>)],
+            current_idx: usize,
+            current_combination: Vec<(&'a str, IndexType)>,
+            results: &mut Vec<Vec<(&'a str, IndexType)>>,
+        ) {
+            if current_idx == options.len() {
+                // Only add non-empty combinations (filter out all-None case)
+                if !current_combination.is_empty() {
+                    results.push(current_combination);
+                }
+                return;
+            }
+
+            let (column, index_types) = &options[current_idx];
+
+            // Try each index type for this column (including None)
+            for index_type_opt in index_types {
+                let mut next_combination = current_combination.clone();
+                if let Some(index_type) = index_type_opt {
+                    next_combination.push((column.as_str(), *index_type));
+                }
+                generate_recursive(options, current_idx + 1, next_combination, results);
             }
         }
-        combinations
+
+        let mut results = Vec::new();
+        generate_recursive(&self.index_options, 0, Vec::new(), &mut results);
+        results
     }
 
     pub async fn run(self, test_fn: F) -> Fut::Output
@@ -126,15 +152,20 @@ async fn build_dataset(
         })
         .execute(vec![data_to_write])
         .await
-        .unwrap();
+        .expect("Failed to create test dataset");
 
-    ds.delete("id = -1").await.unwrap();
+    ds.delete("id = -1")
+        .await
+        .expect("Failed to delete filler rows (id = -1)");
 
     assert_eq!(ds.count_rows(None).await.unwrap(), original.num_rows());
 
     for (column, index_type) in indices.iter() {
         // TODO: when possible, make indices cover a portion of rows and not be
         // aligned between indices.
+
+        // Index parameters are chosen to make search results deterministic for small
+        // test datasets, not for production use.
         let index_params: Box<dyn IndexParams> = match index_type {
@@ -182,7 +213,12 @@ async fn build_dataset(
 
         ds.create_index_builder(&[column], *index_type, index_params.as_ref())
             .await
-            .unwrap();
+            .unwrap_or_else(|e| {
+                panic!(
+                    "Failed to create index on column '{}' with type {:?}: {}",
+                    column, index_type, e
+                )
+            });
     }
 
     ds
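The cartesian-product change in the hunk above is easier to see with a concrete input. The sketch below is an editorial illustration only, not code from the patch: the stub enum, helper name, and column names are invented, and it expands combinations iteratively rather than with the recursive generate_index_combinations, but it yields the same result of one choice per column with the all-None combination dropped.

    // Sketch: every column contributes either "no index" (None) or one of its
    // index types; combinations where nothing is indexed are filtered out.
    #[derive(Clone, Copy, Debug)]
    enum Idx {
        BTree,
        IvfPq,
    }

    fn combos(options: &[(String, Vec<Option<Idx>>)]) -> Vec<Vec<(String, Idx)>> {
        let mut results: Vec<Vec<(String, Idx)>> = vec![vec![]];
        for (column, choices) in options {
            let mut next = Vec::new();
            for combo in &results {
                for choice in choices {
                    let mut c = combo.clone();
                    if let Some(idx) = choice {
                        c.push((column.clone(), *idx));
                    }
                    next.push(c);
                }
            }
            results = next;
        }
        results.retain(|c| !c.is_empty());
        results
    }

    fn main() {
        let options = vec![
            ("id".to_string(), vec![None, Some(Idx::BTree)]),
            ("vec".to_string(), vec![None, Some(Idx::IvfPq)]),
        ];
        // Expected: id only, vec only, and id+vec indexed -- 3 combinations.
        assert_eq!(combos(&options).len(), 3);
    }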
From 9c01a7b30dff0706a47a81f1596a7b38eb871124 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Wed, 5 Nov 2025 09:49:37 -0800
Subject: [PATCH 15/18] Apply suggestion from @wjones127

---
 rust/lance/tests/query/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rust/lance/tests/query/mod.rs b/rust/lance/tests/query/mod.rs
index 6e1c7ff3f57..2f916ab5fa7 100644
--- a/rust/lance/tests/query/mod.rs
+++ b/rust/lance/tests/query/mod.rs
@@ -21,7 +21,7 @@ fn create_datafusion_context() -> SessionContext {
 mod primitives;
 mod vectors;
 
-/// Scanning an ordering by id should give same result as original.
+/// Scanning and ordering by id should give same result as original.
 async fn test_scan(original: &RecordBatch, ds: &Dataset) {
     let mut scanner = ds.scan();
     scanner
From ee05a96b8ef0b3acfde007f899de3dc671a759d4 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Wed, 5 Nov 2025 09:49:43 -0800
Subject: [PATCH 16/18] Apply suggestion from @wjones127

---
 rust/lance/tests/query/mod.rs | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/rust/lance/tests/query/mod.rs b/rust/lance/tests/query/mod.rs
index 2f916ab5fa7..8833c4a0bdc 100644
--- a/rust/lance/tests/query/mod.rs
+++ b/rust/lance/tests/query/mod.rs
@@ -50,11 +50,6 @@ async fn test_take(original: &RecordBatch, ds: &Dataset) {
     ];
 
     for indices in cases {
-        // Skip cases with invalid indices
-        if indices.iter().any(|&i| i >= num_rows) {
-            continue;
-        }
-
         // Convert to u64 for Lance take
         let indices_u64: Vec<u64> = indices.iter().map(|&i| i as u64).collect();
 

From dcd60211b9927a595677a6b58be354a928975187 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Wed, 5 Nov 2025 09:49:48 -0800
Subject: [PATCH 17/18] Apply suggestion from @wjones127

---
 rust/lance/tests/query/mod.rs | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/rust/lance/tests/query/mod.rs b/rust/lance/tests/query/mod.rs
index 8833c4a0bdc..5816d786f89 100644
--- a/rust/lance/tests/query/mod.rs
+++ b/rust/lance/tests/query/mod.rs
@@ -53,11 +53,6 @@ async fn test_take(original: &RecordBatch, ds: &Dataset) {
         // Convert to u64 for Lance take
         let indices_u64: Vec<u64> = indices.iter().map(|&i| i as u64).collect();
 
-        if indices_u64.is_empty() {
-            // Skip empty case as Lance may not handle it the same way
-            continue;
-        }
-
         let taken_ds = ds.take(&indices_u64, ds.schema().clone()).await.unwrap();
 
         // Take from RecordBatch using arrow::compute

From 030567a86caf94e9f33874e7255f9f80cc0dfb20 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Mon, 8 Dec 2025 17:05:26 -0800
Subject: [PATCH 18/18] fix runner

---
 .github/workflows/rust.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 82ef21680f9..a56d2935207 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -146,7 +146,7 @@ jobs:
           ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v -e protoc -e slow_tests | sort | uniq | paste -s -d "," -`
           cargo test --profile ci --locked --features ${ALL_FEATURES}
   query-integration-tests:
-    runs-on: ubuntu-2404-8x-arm64
+    runs-on: warp-ubuntu-latest-x64-4x
    timeout-minutes: 75
     env:
       # We use opt-level 1 which makes some tests 5x faster to run.
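With the full series applied, the gated suite can be reproduced outside CI with the same invocation the final workflow uses: cargo test --locked -p lance --no-default-features --features fp16kernels,slow_tests --test integration_tests. The RUSTFLAGS debuginfo/opt-level setting in the job is a build-speed optimization and should not be required for a one-off local run.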