From 41e6c4e0016651d256e3dd5ec57e87f2ba25d695 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 16 Sep 2025 09:33:11 -0700 Subject: [PATCH 01/18] start a broad test --- Cargo.lock | 1 + rust/lance/Cargo.toml | 1 + rust/lance/src/dataset.rs | 2 + rust/lance/src/dataset/broad_test.rs | 251 +++++++++++++++++++++++++++ 4 files changed, 255 insertions(+) create mode 100644 rust/lance/src/dataset/broad_test.rs diff --git a/Cargo.lock b/Cargo.lock index f89d5c74c1a..95923c70bd7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4460,6 +4460,7 @@ dependencies = [ "mock_instant", "moka", "object_store", + "paste", "permutation", "pin-project", "pprof", diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml index c422a5bcf45..dd60157a5fc 100644 --- a/rust/lance/Cargo.toml +++ b/rust/lance/Cargo.toml @@ -106,6 +106,7 @@ test-log.workspace = true tracing-chrome = "0.7.1" rstest = { workspace = true } tracking-allocator = { version = "0.4", features = ["tracing-compat"] } +paste = "1.0" # For S3 / DynamoDB tests aws-config = { workspace = true } aws-sdk-s3 = { workspace = true } diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 722ba7c97e1..33ba4ec86db 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -64,6 +64,8 @@ use tracing::{info, instrument}; pub(crate) mod blob; mod branch_location; +#[cfg(test)] +mod broad_test; pub mod builder; pub mod cleanup; pub mod delta; diff --git a/rust/lance/src/dataset/broad_test.rs b/rust/lance/src/dataset/broad_test.rs new file mode 100644 index 00000000000..09ee4263fd2 --- /dev/null +++ b/rust/lance/src/dataset/broad_test.rs @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! A generic test suite. +//! +//! This suite aims to get wide coverage of cases in terms of data types, index +//! types, and dataset states. +//! +//! Data type test should include edge cases for each data type. For example: +//! * Null values +//! * For floats: NaN, Infinity, +/-0 +//! * For integers: min / max +//! * For lists: nullability and empty values at all levels +//! +//! Data state includes: +//! * Deletion +//! * Fragmentation of data files +//! * Fragmentation of indices. 
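//!
//! For example, the boolean case below is expected to run each assertion against
//! every combination of fragmentation (single vs. multi fragment), deletion state
//! (no deletions, odd rows deleted, even rows deleted), and index type
//! (no index, Bitmap, BTree).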
+ +use std::{panic::AssertUnwindSafe, sync::Arc}; + +use arrow_array::{ArrayRef, BooleanArray, Int32Array, RecordBatch}; +use arrow_schema::{DataType, Field, Schema}; +use futures::FutureExt; +use lance_datagen::{array, gen_batch}; +use lance_index::{DatasetIndexExt, IndexType}; + +use crate::{ + dataset::{InsertBuilder, WriteParams}, + Dataset, +}; + +/// Represents the various test states we want to cover +#[derive(Clone, Copy, Debug)] +struct TableState { + fragmentation: Fragmentation, + deletion: DeletionState, +} + +/// All combinations of test states to cover +const TABLE_STATES: &[TableState] = &[ + TableState { + fragmentation: Fragmentation::SingleFragment, + deletion: DeletionState::NoDeletions, + }, + TableState { + fragmentation: Fragmentation::SingleFragment, + deletion: DeletionState::DeleteOdd, + }, + TableState { + fragmentation: Fragmentation::SingleFragment, + deletion: DeletionState::DeleteEven, + }, + TableState { + fragmentation: Fragmentation::MultiFragment, + deletion: DeletionState::NoDeletions, + }, + TableState { + fragmentation: Fragmentation::MultiFragment, + deletion: DeletionState::DeleteOdd, + }, + TableState { + fragmentation: Fragmentation::MultiFragment, + deletion: DeletionState::DeleteEven, + }, +]; + +/// Helper function to run tests across all table states and index types with panic catching +async fn test_states( + states: &[TableState], + index_types: impl IntoIterator> + Clone, + test_fn: F, +) where + F: Fn(Dataset, RecordBatch) -> Fut, + Fut: std::future::Future, +{ + for &state in states { + for index_type in index_types.clone() { + let (original, ds) = create_test_dataset(state, index_type).await; + + let context = format!( + "fragmentation: {:?}, deletion: {:?}, index: {:?}", + state.fragmentation, state.deletion, index_type + ); + + AssertUnwindSafe(test_fn(ds, original.clone())) + .catch_unwind() + .await + .unwrap_or_else(|_| panic!("Test failed for {}", context)); + } + } +} + +/// Create a test dataset with the given state and index configuration +async fn create_test_dataset( + state: TableState, + index_type: Option, +) -> (RecordBatch, Dataset) { + let data_type = DataType::Boolean; // For now, start with Boolean + let original = test_batch(&data_type); + + let ds = create_dataset_with_config( + original.clone(), + state.fragmentation, + state.deletion, + index_type, + ) + .await; + + (original, ds) +} + +#[derive(Clone, Copy, Debug)] +enum Fragmentation { + SingleFragment, + MultiFragment, +} + +#[derive(Clone, Copy, Debug)] +enum DeletionState { + NoDeletions, + DeleteOdd, + DeleteEven, +} + +/// Helper function to create a dataset with specific configuration +async fn create_dataset_with_config( + data: RecordBatch, + fragmentation: Fragmentation, + deletion_state: DeletionState, + index_type: Option, +) -> Dataset { + // Configure fragmentation + let mut ds = match fragmentation { + Fragmentation::SingleFragment => InsertBuilder::new("memory://") + .execute(vec![data]) + .await + .unwrap(), + Fragmentation::MultiFragment => { + let params = WriteParams { + max_rows_per_file: 10, + ..Default::default() + }; + InsertBuilder::new("memory://") + .with_params(¶ms) + .execute(vec![data]) + .await + .unwrap() + } + }; + + // Apply deletions + apply_deletions(&mut ds, deletion_state).await; + + // Create index if specified + if let Some(idx_type) = index_type { + use lance_index::scalar::ScalarIndexParams; + let params = ScalarIndexParams::default(); + ds.create_index(&["value"], idx_type, None, ¶ms, false) + .await + .unwrap(); + } + + ds 
+} + +/// Create a record batch that has 60 rows and two columns: id (incremental int32) +/// and value (which should match DataType). +/// +/// The values in `value` should include edge cases for the specific data type: +/// * Nulls for all types +/// * For floats: NaN, Infinity, +/-0 +/// * For integers: min / max +/// * For lists: nullability and empty values at all levels +fn test_batch(data_type: &DataType) -> RecordBatch { + let num_rows = 60; + let id: ArrayRef = Arc::new(Int32Array::from_iter(0..num_rows as i32)); + + // For now, create a simple value column based on data type + // TODO: Add edge cases for each type + let value: ArrayRef = match data_type { + DataType::Boolean => Arc::new(BooleanArray::from_iter((0..num_rows).map(|i| { + if i % 10 == 0 { + None + } else { + Some(i % 2 == 0) + } + }))), + DataType::Int32 => Arc::new(Int32Array::from_iter((0..num_rows).map(|i| { + if i % 10 == 0 { + None + } else { + Some(i as i32) + } + }))), + _ => todo!("Implement test data generation for {:?}", data_type), + }; + + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("value", data_type.clone(), true), + ]); + + RecordBatch::try_new(Arc::new(schema), vec![id, value]).unwrap() +} + +async fn apply_deletions(dataset: &mut Dataset, state: DeletionState) { + match state { + DeletionState::NoDeletions => {} + DeletionState::DeleteOdd => { + dataset.delete("id % 2 = 1").await.unwrap(); + } + DeletionState::DeleteEven => { + dataset.delete("id % 2 = 0").await.unwrap(); + } + } +} + +#[tokio::test] +async fn test_query_bool() { + // TODO: pull out data generator (so it can be re-used an inspected). + let original = gen_batch() + .col("id", array::sequence_i32(0, 60)) + .col("value", array::cycle_bool(vec![true, false]).with_nulls(2)) + .into_batch_rows(RowCount::new(60)) + .unwrap(); + test_states( + // Rename: for_states_and_indices() + TABLE_STATES, + [None, Some(IndexType::Bitmap), Some(IndexType::BTree)], + |ds: Dataset, original: RecordBatch| async move { + test_scan(&original, &ds).await; + test_take(&original, &ds).await; + test_filter(&original, &ds, "value").await; + test_filter(&original, &ds, "!value").await; + }, + ) + .await +} + +async fn test_scan(_original: &RecordBatch, _ds: &Dataset) { + todo!("validate that if you scan ds, then sort by id, you get original back.") +} + +async fn test_take(_original: &RecordBatch, _ds: &Dataset) { + todo!("generate a few sets of ids and validate we can call take against the RB and the DS and get the same result."); +} + +async fn test_filter(_original: &RecordBatch, _ds: &Dataset, _predicate: &str) { + todo!("Scan ds with the predicate"); +} From 89893baa378591cdb86b176c16e4450b248b84a2 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 16 Sep 2025 15:19:32 -0700 Subject: [PATCH 02/18] scaffold new approach --- rust/lance/src/dataset/broad_test.rs | 251 -------------------------- rust/lance/tests/integration_tests.rs | 7 + rust/lance/tests/query/mod.rs | 24 +++ rust/lance/tests/query/primitives.rs | 61 +++++++ rust/lance/tests/query/vectors.rs | 48 +++++ rust/lance/tests/utils/mod.rs | 105 +++++++++++ 6 files changed, 245 insertions(+), 251 deletions(-) delete mode 100644 rust/lance/src/dataset/broad_test.rs create mode 100644 rust/lance/tests/integration_tests.rs create mode 100644 rust/lance/tests/query/mod.rs create mode 100644 rust/lance/tests/query/primitives.rs create mode 100644 rust/lance/tests/query/vectors.rs create mode 100644 rust/lance/tests/utils/mod.rs diff --git 
a/rust/lance/src/dataset/broad_test.rs b/rust/lance/src/dataset/broad_test.rs deleted file mode 100644 index 09ee4263fd2..00000000000 --- a/rust/lance/src/dataset/broad_test.rs +++ /dev/null @@ -1,251 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -//! A generic test suite. -//! -//! This suite aims to get wide coverage of cases in terms of data types, index -//! types, and dataset states. -//! -//! Data type test should include edge cases for each data type. For example: -//! * Null values -//! * For floats: NaN, Infinity, +/-0 -//! * For integers: min / max -//! * For lists: nullability and empty values at all levels -//! -//! Data state includes: -//! * Deletion -//! * Fragmentation of data files -//! * Fragmentation of indices. - -use std::{panic::AssertUnwindSafe, sync::Arc}; - -use arrow_array::{ArrayRef, BooleanArray, Int32Array, RecordBatch}; -use arrow_schema::{DataType, Field, Schema}; -use futures::FutureExt; -use lance_datagen::{array, gen_batch}; -use lance_index::{DatasetIndexExt, IndexType}; - -use crate::{ - dataset::{InsertBuilder, WriteParams}, - Dataset, -}; - -/// Represents the various test states we want to cover -#[derive(Clone, Copy, Debug)] -struct TableState { - fragmentation: Fragmentation, - deletion: DeletionState, -} - -/// All combinations of test states to cover -const TABLE_STATES: &[TableState] = &[ - TableState { - fragmentation: Fragmentation::SingleFragment, - deletion: DeletionState::NoDeletions, - }, - TableState { - fragmentation: Fragmentation::SingleFragment, - deletion: DeletionState::DeleteOdd, - }, - TableState { - fragmentation: Fragmentation::SingleFragment, - deletion: DeletionState::DeleteEven, - }, - TableState { - fragmentation: Fragmentation::MultiFragment, - deletion: DeletionState::NoDeletions, - }, - TableState { - fragmentation: Fragmentation::MultiFragment, - deletion: DeletionState::DeleteOdd, - }, - TableState { - fragmentation: Fragmentation::MultiFragment, - deletion: DeletionState::DeleteEven, - }, -]; - -/// Helper function to run tests across all table states and index types with panic catching -async fn test_states( - states: &[TableState], - index_types: impl IntoIterator> + Clone, - test_fn: F, -) where - F: Fn(Dataset, RecordBatch) -> Fut, - Fut: std::future::Future, -{ - for &state in states { - for index_type in index_types.clone() { - let (original, ds) = create_test_dataset(state, index_type).await; - - let context = format!( - "fragmentation: {:?}, deletion: {:?}, index: {:?}", - state.fragmentation, state.deletion, index_type - ); - - AssertUnwindSafe(test_fn(ds, original.clone())) - .catch_unwind() - .await - .unwrap_or_else(|_| panic!("Test failed for {}", context)); - } - } -} - -/// Create a test dataset with the given state and index configuration -async fn create_test_dataset( - state: TableState, - index_type: Option, -) -> (RecordBatch, Dataset) { - let data_type = DataType::Boolean; // For now, start with Boolean - let original = test_batch(&data_type); - - let ds = create_dataset_with_config( - original.clone(), - state.fragmentation, - state.deletion, - index_type, - ) - .await; - - (original, ds) -} - -#[derive(Clone, Copy, Debug)] -enum Fragmentation { - SingleFragment, - MultiFragment, -} - -#[derive(Clone, Copy, Debug)] -enum DeletionState { - NoDeletions, - DeleteOdd, - DeleteEven, -} - -/// Helper function to create a dataset with specific configuration -async fn create_dataset_with_config( - data: RecordBatch, - fragmentation: Fragmentation, 
- deletion_state: DeletionState, - index_type: Option, -) -> Dataset { - // Configure fragmentation - let mut ds = match fragmentation { - Fragmentation::SingleFragment => InsertBuilder::new("memory://") - .execute(vec![data]) - .await - .unwrap(), - Fragmentation::MultiFragment => { - let params = WriteParams { - max_rows_per_file: 10, - ..Default::default() - }; - InsertBuilder::new("memory://") - .with_params(¶ms) - .execute(vec![data]) - .await - .unwrap() - } - }; - - // Apply deletions - apply_deletions(&mut ds, deletion_state).await; - - // Create index if specified - if let Some(idx_type) = index_type { - use lance_index::scalar::ScalarIndexParams; - let params = ScalarIndexParams::default(); - ds.create_index(&["value"], idx_type, None, ¶ms, false) - .await - .unwrap(); - } - - ds -} - -/// Create a record batch that has 60 rows and two columns: id (incremental int32) -/// and value (which should match DataType). -/// -/// The values in `value` should include edge cases for the specific data type: -/// * Nulls for all types -/// * For floats: NaN, Infinity, +/-0 -/// * For integers: min / max -/// * For lists: nullability and empty values at all levels -fn test_batch(data_type: &DataType) -> RecordBatch { - let num_rows = 60; - let id: ArrayRef = Arc::new(Int32Array::from_iter(0..num_rows as i32)); - - // For now, create a simple value column based on data type - // TODO: Add edge cases for each type - let value: ArrayRef = match data_type { - DataType::Boolean => Arc::new(BooleanArray::from_iter((0..num_rows).map(|i| { - if i % 10 == 0 { - None - } else { - Some(i % 2 == 0) - } - }))), - DataType::Int32 => Arc::new(Int32Array::from_iter((0..num_rows).map(|i| { - if i % 10 == 0 { - None - } else { - Some(i as i32) - } - }))), - _ => todo!("Implement test data generation for {:?}", data_type), - }; - - let schema = Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("value", data_type.clone(), true), - ]); - - RecordBatch::try_new(Arc::new(schema), vec![id, value]).unwrap() -} - -async fn apply_deletions(dataset: &mut Dataset, state: DeletionState) { - match state { - DeletionState::NoDeletions => {} - DeletionState::DeleteOdd => { - dataset.delete("id % 2 = 1").await.unwrap(); - } - DeletionState::DeleteEven => { - dataset.delete("id % 2 = 0").await.unwrap(); - } - } -} - -#[tokio::test] -async fn test_query_bool() { - // TODO: pull out data generator (so it can be re-used an inspected). 
- let original = gen_batch() - .col("id", array::sequence_i32(0, 60)) - .col("value", array::cycle_bool(vec![true, false]).with_nulls(2)) - .into_batch_rows(RowCount::new(60)) - .unwrap(); - test_states( - // Rename: for_states_and_indices() - TABLE_STATES, - [None, Some(IndexType::Bitmap), Some(IndexType::BTree)], - |ds: Dataset, original: RecordBatch| async move { - test_scan(&original, &ds).await; - test_take(&original, &ds).await; - test_filter(&original, &ds, "value").await; - test_filter(&original, &ds, "!value").await; - }, - ) - .await -} - -async fn test_scan(_original: &RecordBatch, _ds: &Dataset) { - todo!("validate that if you scan ds, then sort by id, you get original back.") -} - -async fn test_take(_original: &RecordBatch, _ds: &Dataset) { - todo!("generate a few sets of ids and validate we can call take against the RB and the DS and get the same result."); -} - -async fn test_filter(_original: &RecordBatch, _ds: &Dataset, _predicate: &str) { - todo!("Scan ds with the predicate"); -} diff --git a/rust/lance/tests/integration_tests.rs b/rust/lance/tests/integration_tests.rs new file mode 100644 index 00000000000..90ad5a7f855 --- /dev/null +++ b/rust/lance/tests/integration_tests.rs @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +// NOTE: we only create one integration test binary, to keep compilation overhead down. + +mod query; +mod utils; diff --git a/rust/lance/tests/query/mod.rs b/rust/lance/tests/query/mod.rs new file mode 100644 index 00000000000..16772eb6bde --- /dev/null +++ b/rust/lance/tests/query/mod.rs @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use arrow_array::RecordBatch; +use lance::Dataset; + +mod primitives; +mod vectors; + +async fn test_scan(_original: &RecordBatch, _ds: &Dataset) { + todo!("validate that if you scan ds, then sort by id, you get original back.") +} + +async fn test_take(_original: &RecordBatch, _ds: &Dataset) { + todo!("generate a few sets of ids and validate we can call take against the RB and the DS and get the same result."); +} + +async fn test_filter(_original: &RecordBatch, _ds: &Dataset, _predicate: &str) { + todo!("Scan ds with the predicate"); +} + +async fn test_ann(_original: &RecordBatch, _ds: &Dataset, _predicate: Option<&str>) { + todo!("Scan ds with the ANN predicate"); +} diff --git a/rust/lance/tests/query/primitives.rs b/rust/lance/tests/query/primitives.rs new file mode 100644 index 00000000000..f500b31d5e3 --- /dev/null +++ b/rust/lance/tests/query/primitives.rs @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use arrow::datatypes::Int32Type; +use arrow_array::RecordBatch; +use lance::Dataset; + +use lance_datagen::{array, gen_batch, ArrayGeneratorExt, RowCount}; +use lance_index::IndexType; + +use super::{test_filter, test_scan, test_take}; +use crate::utils::DatasetTestCases; + +#[tokio::test] +async fn test_query_bool() { + let batch = gen_batch() + .col("id", array::step::()) + .col( + "value", + array::cycle_bool(vec![true, false]).with_random_nulls(0.1), + ) + .into_batch_rows(RowCount::from(60)) + .unwrap(); + DatasetTestCases::from_data(batch) + .with_index_types( + "value", + [None, Some(IndexType::Bitmap), Some(IndexType::BTree)], + ) + .run(|ds: Dataset, original: RecordBatch| async move { + test_scan(&original, &ds).await; + test_take(&original, &ds).await; + test_filter(&original, &ds, "value").await; + 
test_filter(&original, &ds, "!value").await; + }) + .await +} + +#[tokio::test] +async fn test_query_integers() { + todo!() +} + +#[tokio::test] +async fn test_query_floats() { + todo!() +} + +#[tokio::test] +async fn test_query_decimals() { + todo!() +} + +#[tokio::test] +async fn test_query_strings() { + todo!() +} + +#[tokio::test] +async fn test_query_timestamps() { + todo!() +} diff --git a/rust/lance/tests/query/vectors.rs b/rust/lance/tests/query/vectors.rs new file mode 100644 index 00000000000..b5a7b577b19 --- /dev/null +++ b/rust/lance/tests/query/vectors.rs @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use super::{test_ann, test_scan, test_take}; +use crate::utils::DatasetTestCases; +use arrow::datatypes::{Date32Type, Float32Type, Int32Type}; +use arrow_array::RecordBatch; +use lance::Dataset; +use lance_datagen::{array, gen_batch, ArrayGeneratorExt, Dimension, RowCount}; +use lance_index::IndexType; + +#[tokio::test] +async fn test_query_vector() { + todo!() +} + +#[tokio::test] +async fn test_query_prefilter_date() { + let batch = gen_batch() + .col("id", array::step::()) + .col("value", array::step::().with_random_nulls(0.1)) + .col("vec", array::rand_vec::(Dimension::from(16))) + .into_batch_rows(RowCount::from(60)) + .unwrap(); + DatasetTestCases::from_data(batch) + .with_index_types("value", [None, Some(IndexType::BTree)]) + .with_index_types( + "vec", + [ + None, + Some(IndexType::IvfPq), + Some(IndexType::IvfFlat), + Some(IndexType::IvfHnswFlat), + ], + ) + .run(|ds: Dataset, original: RecordBatch| async move { + test_scan(&original, &ds).await; + test_take(&original, &ds).await; + test_ann(&original, &ds, Some("value is not null")).await; + test_ann( + &original, + &ds, + Some("value >= DATE '2020-06-01' AND value <= DATE '2021-06-01'"), + ) + .await; + }) + .await +} diff --git a/rust/lance/tests/utils/mod.rs b/rust/lance/tests/utils/mod.rs new file mode 100644 index 00000000000..7ad06483a0a --- /dev/null +++ b/rust/lance/tests/utils/mod.rs @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::panic::AssertUnwindSafe; + +use arrow_array::RecordBatch; +use futures::FutureExt; +use lance::Dataset; +use lance_index::IndexType; + +#[derive(Clone, Copy, Debug)] +pub enum Fragmentation { + /// All data in a single file. + SingleFragment, + /// Data is spread across multiple fragments, one file per fragment. + MultiFragment, +} + +#[derive(Clone, Copy, Debug)] +pub enum DeletionState { + /// No deletions are applied. + NoDeletions, + /// Delete odd rows. + DeleteOdd, + /// Delete even rows. 
+ DeleteEven, +} + +pub struct DatasetTestCases { + original: RecordBatch, + index_options: Vec<(String, Vec>)>, +} + +impl DatasetTestCases { + pub fn from_data(original: RecordBatch) -> Self { + Self { + original, + index_options: Vec::new(), + } + } + + pub fn with_index_types( + mut self, + column: impl Into, + index_types: impl IntoIterator>, + ) -> Self { + self.index_options + .push((column.into(), index_types.into_iter().collect())); + self + } + + fn generate_index_combinations(&self) -> Vec> { + let mut combinations = Vec::new(); + for (column, index_types) in &self.index_options { + for index_type in index_types { + if let Some(index_type) = index_type { + combinations.push(vec![(column.as_str(), index_type.clone())]); + } + } + } + combinations + } + + pub async fn run(self, test_fn: F) -> Fut::Output + where + F: Fn(Dataset, RecordBatch) -> Fut, + Fut: std::future::Future, + { + for fragmentation in [Fragmentation::SingleFragment, Fragmentation::MultiFragment] { + for deletion in [ + DeletionState::NoDeletions, + DeletionState::DeleteOdd, + DeletionState::DeleteEven, + ] { + let index_combinations = self.generate_index_combinations(); + for indices in index_combinations { + let ds = + build_dataset(self.original.clone(), fragmentation, deletion, &indices); + let context = format!( + "fragmentation: {:?}, deletion: {:?}, index: {:?}", + fragmentation, deletion, indices + ); + // Catch unwind so we can add test context to the panic. + AssertUnwindSafe(test_fn(ds, self.original.clone())) + .catch_unwind() + .await + .unwrap_or_else(|_| panic!("Test failed for {}", context)); + } + } + } + } +} + +/// Create an in-memory dataset with the given state and data. +/// +/// The data in dataset will exactly match the `original` batch. (Extra rows are +/// created for the deleted rows created by `DeletionState`.) +fn build_dataset( + original: RecordBatch, + fragmentation: Fragmentation, + deletion: DeletionState, + indices: &[(&str, IndexType)], +) -> Dataset { + todo!() +} From 4b7257d5de62c54f517c1fc6d1ca464f6be95650 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 17 Sep 2025 08:36:54 -0700 Subject: [PATCH 03/18] feat: implement test utils for comprehensive dataset testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement fill_deleted_rows function to simulate deleted data by interleaving filler rows with id=-1. Add dynamic index parameter generation supporting scalar (BTree, Bitmap), vector (IvfFlat, IvfPq), and FTS (Inverted) indices. 
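As a standalone sketch (not part of this patch), the filler-row interleaving described
above can be illustrated with arrow's `interleave_record_batch` kernel; the two-row `id`
batch below is an assumption made only for the example:

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Int32Array};
use arrow::compute::interleave_record_batch;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;

fn main() {
    let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));

    // Rows that must survive ("original" data): ids 1 and 2.
    let original_cols: Vec<ArrayRef> = vec![Arc::new(Int32Array::from(vec![1, 2]))];
    let original = RecordBatch::try_new(schema.clone(), original_cols).unwrap();

    // Filler rows carry the sentinel id = -1 and are deleted after the write.
    let filler_cols: Vec<ArrayRef> = vec![Arc::new(Int32Array::from(vec![-1, -1]))];
    let filler = RecordBatch::try_new(schema, filler_cols).unwrap();

    // DeleteOdd pattern: filler, original[0], filler, original[1], ...
    // Each tuple is (batch index, row index); batch 0 = original, batch 1 = filler.
    let indices = [(1, 0), (0, 0), (1, 1), (0, 1)];
    let interleaved = interleave_record_batch(&[&original, &filler], &indices).unwrap();

    // Interleaved ids are [-1, 1, -1, 2]; deleting `id = -1` recovers the original rows.
    assert_eq!(interleaved.num_rows(), 4);
}
```

Deleting `id = -1` from the interleaved batch is exactly how `build_dataset` later turns
the filler rows into deleted rows.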
🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- rust/lance/tests/utils/mod.rs | 153 ++++++++++++++++++++++++++++++++-- 1 file changed, 147 insertions(+), 6 deletions(-) diff --git a/rust/lance/tests/utils/mod.rs b/rust/lance/tests/utils/mod.rs index 7ad06483a0a..4bbe10769e3 100644 --- a/rust/lance/tests/utils/mod.rs +++ b/rust/lance/tests/utils/mod.rs @@ -2,11 +2,18 @@ // SPDX-FileCopyrightText: Copyright The Lance Authors use std::panic::AssertUnwindSafe; +use std::sync::Arc; -use arrow_array::RecordBatch; +use arrow_array::{ArrayRef, Int32Array, RecordBatch}; use futures::FutureExt; -use lance::Dataset; -use lance_index::IndexType; +use lance::index::vector::VectorIndexParams; +use lance::{ + dataset::{InsertBuilder, WriteParams}, + Dataset, +}; +use lance_index::scalar::ScalarIndexParams; +use lance_index::{DatasetIndexExt, IndexParams, IndexType}; +use lance_linalg::distance::MetricType; #[derive(Clone, Copy, Debug)] pub enum Fragmentation { @@ -75,7 +82,8 @@ impl DatasetTestCases { let index_combinations = self.generate_index_combinations(); for indices in index_combinations { let ds = - build_dataset(self.original.clone(), fragmentation, deletion, &indices); + build_dataset(self.original.clone(), fragmentation, deletion, &indices) + .await; let context = format!( "fragmentation: {:?}, deletion: {:?}, index: {:?}", fragmentation, deletion, indices @@ -95,11 +103,144 @@ impl DatasetTestCases { /// /// The data in dataset will exactly match the `original` batch. (Extra rows are /// created for the deleted rows created by `DeletionState`.) -fn build_dataset( +async fn build_dataset( original: RecordBatch, fragmentation: Fragmentation, deletion: DeletionState, indices: &[(&str, IndexType)], ) -> Dataset { - todo!() + let data_to_write = fill_deleted_rows(&original, deletion); + + let max_rows_per_file = if let Fragmentation::MultiFragment = fragmentation { + 3 + } else { + original.num_rows() + 1 + }; + + let mut ds = InsertBuilder::new("memory://") + .with_params(&WriteParams { + max_rows_per_file, + ..Default::default() + }) + .execute(vec![data_to_write]) + .await + .unwrap(); + + ds.delete("id = -1").await.unwrap(); + + assert_eq!(ds.count_rows(None).await.unwrap(), original.num_rows()); + + for (column, index_type) in indices { + // TODO: when possible, make indices cover a portion of rows and not be + // aligned between indices. + let index_params: Box = match index_type { + IndexType::BTree + | IndexType::Bitmap + | IndexType::LabelList + | IndexType::NGram + | IndexType::ZoneMap + | IndexType::Inverted + | IndexType::BloomFilter => Box::new(ScalarIndexParams::for_builtin( + (*index_type).try_into().unwrap(), + )), + IndexType::IvfFlat => { + // Use a small number of partitions for testing + Box::new(VectorIndexParams::ivf_flat(4, MetricType::L2)) + } + IndexType::IvfPq => { + // Simple PQ params for testing + Box::new(VectorIndexParams::ivf_pq(4, 8, 2, MetricType::L2, 10)) + } + _ => { + // For other index types, use default scalar params + Box::new(ScalarIndexParams::default()) + } + }; + + ds.create_index_builder(&[column], *index_type, index_params.as_ref()) + //.fragments() <--- Uncomment this line when implementing fragments + .await + .unwrap(); + } + + ds +} + +/// Insert filler rows into a record batch such that applying deletions to the +/// output will yield the input. 
For example, given the `deletions: DeletionState::DeleteOdd` +/// and the table: +/// +/// ``` +/// id | value +/// 1 | "a" +/// 2 | "b" +/// ``` +/// +/// Produce: +/// +/// ``` +/// id | value +/// -1 | "a" (filler row) +/// 1 | "a" +/// -1 | "a" +/// 2 | "b" +/// ``` +/// +/// The filler row will have the same values as the original row, but with a special +/// identifier (e.g., -1) to indicate that it is a filler row. +fn fill_deleted_rows(batch: &RecordBatch, deletions: DeletionState) -> RecordBatch { + // Early return for no deletions + if let DeletionState::NoDeletions = deletions { + return batch.clone(); + } + + // Create a filler batch by taking the first row and replacing id with -1 + let schema = batch.schema(); + let mut filler_columns: Vec = Vec::new(); + + for (i, field) in schema.fields().iter().enumerate() { + if field.name() == "id" { + // Create an array with a single -1 value + filler_columns.push(Arc::new(Int32Array::from(vec![-1]))); + } else { + // Take the first value from the original column + let original_column = batch.column(i); + let sliced = original_column.slice(0, 1); + filler_columns.push(sliced); + } + } + + let filler_batch = RecordBatch::try_new(schema.clone(), filler_columns).unwrap(); + + // Create an array of filler batches, one for each row that will be deleted + let num_rows = batch.num_rows(); + let filler_batches = vec![filler_batch.clone(); num_rows]; + + // Concatenate all filler batches into one + let all_fillers = arrow_select::concat::concat_batches(&schema, &filler_batches).unwrap(); + + // Create indices for interleaving based on the deletion pattern + // Format: (batch_index, row_index) where batch_index 0 = original, 1 = fillers + let mut indices: Vec<(usize, usize)> = Vec::new(); + + match deletions { + DeletionState::DeleteOdd => { + // Pattern: filler, original[0], filler, original[1], ... + for i in 0..num_rows { + indices.push((1, i)); // filler batch, row i + indices.push((0, i)); // original batch, row i + } + } + DeletionState::DeleteEven => { + // Pattern: original[0], filler, original[1], filler, ... 
+ for i in 0..num_rows { + indices.push((0, i)); // original batch, row i + indices.push((1, i)); // filler batch, row i + } + } + DeletionState::NoDeletions => unreachable!(), + } + + // Use interleave to reorder according to our indices + arrow::compute::interleave_record_batch(&[batch, &all_fillers], &indices).unwrap() } From ee37fc422d9a0d23ecc28d67441002d638d10ed1 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 17 Sep 2025 08:59:20 -0700 Subject: [PATCH 04/18] feat: implement test_scan, test_take, and test_filter for query tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement comprehensive test functions for dataset queries: - test_scan: Verifies scanning with ordering by id column - test_take: Tests taking specific rows by indices, validates against Arrow's take_record_batch - test_filter: Tests filtering with SQL predicates using DataFusion for comparison 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- rust/lance/tests/query/mod.rs | 92 ++++++++++++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 7 deletions(-) diff --git a/rust/lance/tests/query/mod.rs b/rust/lance/tests/query/mod.rs index 16772eb6bde..71b9fcb6d8c 100644 --- a/rust/lance/tests/query/mod.rs +++ b/rust/lance/tests/query/mod.rs @@ -1,22 +1,100 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use arrow_array::RecordBatch; +use std::sync::Arc; + +use arrow_array::{RecordBatch, UInt32Array}; +use arrow_select::concat::concat_batches; +use datafusion::datasource::MemTable; +use datafusion::prelude::SessionContext; +use lance::dataset::scanner::ColumnOrdering; use lance::Dataset; mod primitives; mod vectors; -async fn test_scan(_original: &RecordBatch, _ds: &Dataset) { - todo!("validate that if you scan ds, then sort by id, you get original back.") +async fn test_scan(original: &RecordBatch, ds: &Dataset) { + let mut scanner = ds.scan(); + scanner + .order_by(Some(vec![ColumnOrdering::asc_nulls_first( + "id".to_string(), + )])) + .unwrap(); + let scanned = scanner.try_into_batch().await.unwrap(); + + assert_eq!(original, &scanned); } -async fn test_take(_original: &RecordBatch, _ds: &Dataset) { - todo!("generate a few sets of ids and validate we can call take against the RB and the DS and get the same result."); +async fn test_take(original: &RecordBatch, ds: &Dataset) { + let num_rows = original.num_rows(); + let cases: Vec> = vec![ + vec![0, 1, 2], // First few rows + vec![5, 3, 1], // Out of order + vec![0], // Single row + vec![], // Empty + (0..num_rows.min(10)).collect(), // Sequential + vec![num_rows - 1, 0], // Last and first + ]; + + for indices in cases { + // Skip cases with invalid indices + if indices.iter().any(|&i| i >= num_rows) { + continue; + } + + // Convert to u64 for Lance take + let indices_u64: Vec = indices.iter().map(|&i| i as u64).collect(); + + if indices_u64.is_empty() { + // Skip empty case as Lance may not handle it the same way + continue; + } + + let taken_ds = ds.take(&indices_u64, ds.schema().clone()).await.unwrap(); + + // Take from RecordBatch using arrow::compute + let indices_u32: Vec = indices.iter().map(|&i| i as u32).collect(); + let indices_array = UInt32Array::from(indices_u32); + let taken_rb = arrow::compute::take_record_batch(original, &indices_array).unwrap(); + + assert_eq!( + taken_rb, taken_ds, + "Take results don't match for indices: {:?}", + indices + ); + } } -async fn test_filter(_original: &RecordBatch, _ds: &Dataset, 
_predicate: &str) { - todo!("Scan ds with the predicate"); +async fn test_filter(original: &RecordBatch, ds: &Dataset, predicate: &str) { + // Scan with filter and order + let mut scanner = ds.scan(); + scanner + .filter(predicate) + .unwrap() + .order_by(Some(vec![ColumnOrdering::asc_nulls_first( + "id".to_string(), + )])) + .unwrap(); + let scanned = scanner.try_into_batch().await.unwrap(); + + // Use DataFusion to apply same filter using SQL + // Convert Lance filter syntax to SQL syntax + let sql_predicate = if predicate.starts_with('!') { + format!("NOT {}", &predicate[1..]) + } else { + predicate.to_string() + }; + + let ctx = SessionContext::new(); + let table = MemTable::try_new(original.schema(), vec![vec![original.clone()]]).unwrap(); + ctx.register_table("t", Arc::new(table)).unwrap(); + + let sql = format!("SELECT * FROM t WHERE {} ORDER BY id", sql_predicate); + let df = ctx.sql(&sql).await.unwrap(); + let expected_batches = df.collect().await.unwrap(); + let expected = concat_batches(&original.schema(), &expected_batches).unwrap(); + + assert_eq!(&expected, &scanned); } async fn test_ann(_original: &RecordBatch, _ds: &Dataset, _predicate: Option<&str>) { From c0de557f25d483659c5a508c7555ed44e83974a4 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 17 Sep 2025 11:25:15 -0700 Subject: [PATCH 05/18] get primitive tests working --- rust/lance/tests/query/mod.rs | 10 +--- rust/lance/tests/query/primitives.rs | 77 +++++++++++++++++++--------- 2 files changed, 55 insertions(+), 32 deletions(-) diff --git a/rust/lance/tests/query/mod.rs b/rust/lance/tests/query/mod.rs index 71b9fcb6d8c..dbb9697bcac 100644 --- a/rust/lance/tests/query/mod.rs +++ b/rust/lance/tests/query/mod.rs @@ -77,19 +77,11 @@ async fn test_filter(original: &RecordBatch, ds: &Dataset, predicate: &str) { .unwrap(); let scanned = scanner.try_into_batch().await.unwrap(); - // Use DataFusion to apply same filter using SQL - // Convert Lance filter syntax to SQL syntax - let sql_predicate = if predicate.starts_with('!') { - format!("NOT {}", &predicate[1..]) - } else { - predicate.to_string() - }; - let ctx = SessionContext::new(); let table = MemTable::try_new(original.schema(), vec![vec![original.clone()]]).unwrap(); ctx.register_table("t", Arc::new(table)).unwrap(); - let sql = format!("SELECT * FROM t WHERE {} ORDER BY id", sql_predicate); + let sql = format!("SELECT * FROM t WHERE {} ORDER BY id", predicate); let df = ctx.sql(&sql).await.unwrap(); let expected_batches = df.collect().await.unwrap(); let expected = concat_batches(&original.schema(), &expected_batches).unwrap(); diff --git a/rust/lance/tests/query/primitives.rs b/rust/lance/tests/query/primitives.rs index f500b31d5e3..cb978074d37 100644 --- a/rust/lance/tests/query/primitives.rs +++ b/rust/lance/tests/query/primitives.rs @@ -1,8 +1,9 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use arrow::datatypes::Int32Type; +use arrow::datatypes::*; use arrow_array::RecordBatch; +use arrow_schema::DataType; use lance::Dataset; use lance_datagen::{array, gen_batch, ArrayGeneratorExt, RowCount}; @@ -24,38 +25,68 @@ async fn test_query_bool() { DatasetTestCases::from_data(batch) .with_index_types( "value", - [None, Some(IndexType::Bitmap), Some(IndexType::BTree)], + // TODO: fix bug with bitmap and btree https://github.com/lancedb/lance/issues/4756 + // TODO: fix bug with zone map https://github.com/lancedb/lance/issues/4758 + // TODO: Add boolean to bloom filter supported types 
https://github.com/lancedb/lance/issues/4757 + // [None, Some(IndexType::Bitmap), Some(IndexType::BTree), Some(IndexType::BloomFilter), Some(IndexType::ZoneMap)], + [None], ) .run(|ds: Dataset, original: RecordBatch| async move { test_scan(&original, &ds).await; test_take(&original, &ds).await; test_filter(&original, &ds, "value").await; - test_filter(&original, &ds, "!value").await; + test_filter(&original, &ds, "NOT value").await; }) .await } #[tokio::test] -async fn test_query_integers() { - todo!() -} +#[rstest::rstest] +#[case::int8(DataType::Int8)] +#[case::int16(DataType::Int16)] +#[case::int32(DataType::Int32)] +#[case::int64(DataType::Int64)] +#[case::uint8(DataType::UInt8)] +#[case::uint16(DataType::UInt16)] +#[case::uint32(DataType::UInt32)] +#[case::uint64(DataType::UInt64)] +async fn test_query_integer(#[case] data_type: DataType) { + let value_generator = match data_type { + DataType::Int8 => array::rand_primitive::(data_type), + DataType::Int16 => array::rand_primitive::(data_type), + DataType::Int32 => array::rand_primitive::(data_type), + DataType::Int64 => array::rand_primitive::(data_type), + DataType::UInt8 => array::rand_primitive::(data_type), + DataType::UInt16 => array::rand_primitive::(data_type), + DataType::UInt32 => array::rand_primitive::(data_type), + DataType::UInt64 => array::rand_primitive::(data_type), + _ => unreachable!(), + }; -#[tokio::test] -async fn test_query_floats() { - todo!() -} - -#[tokio::test] -async fn test_query_decimals() { - todo!() -} - -#[tokio::test] -async fn test_query_strings() { - todo!() + let batch = gen_batch() + .col("id", array::step::()) + .col("value", value_generator.with_random_nulls(0.1)) + .into_batch_rows(RowCount::from(60)) + .unwrap(); + DatasetTestCases::from_data(batch) + .with_index_types( + "value", + // TODO: add zone map and bloom filter once we fix https://github.com/lancedb/lance/issues/4758 + [None, Some(IndexType::Bitmap), Some(IndexType::BTree)], + ) + .run(|ds: Dataset, original: RecordBatch| async move { + test_scan(&original, &ds).await; + test_take(&original, &ds).await; + test_filter(&original, &ds, "value > 20").await; + test_filter(&original, &ds, "NOT (value > 20)").await; + test_filter(&original, &ds, "value is null").await; + test_filter(&original, &ds, "value is not null").await; + }) + .await } -#[tokio::test] -async fn test_query_timestamps() { - todo!() -} +// TODO: floats (including NaN, +/-Inf, +/-0) +// TODO: decimals +// TODO: binary +// TODO: strings (including largestrings and view) +// TODO: timestamps From de58c174563a8d09c766d772956a9c74ce6f928a Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 17 Sep 2025 12:46:43 -0700 Subject: [PATCH 06/18] get a working ANN test --- rust/lance/src/index/create.rs | 11 +++- rust/lance/tests/query/mod.rs | 88 +++++++++++++++++++++++++++++-- rust/lance/tests/query/vectors.rs | 29 +++++++--- rust/lance/tests/utils/mod.rs | 39 +++++++++++--- 4 files changed, 147 insertions(+), 20 deletions(-) diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index 2724b3a3cb4..fdac7395dff 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -229,7 +229,16 @@ impl<'a> CreateIndexBuilder<'a> { ) .await? } - (IndexType::Vector, LANCE_VECTOR_INDEX) => { + ( + IndexType::Vector + | IndexType::IvfPq + | IndexType::IvfSq + | IndexType::IvfFlat + | IndexType::IvfHnswFlat + | IndexType::IvfHnswPq + | IndexType::IvfHnswSq, + LANCE_VECTOR_INDEX, + ) => { // Vector index params. 
let vec_params = self .params diff --git a/rust/lance/tests/query/mod.rs b/rust/lance/tests/query/mod.rs index dbb9697bcac..08878def0a1 100644 --- a/rust/lance/tests/query/mod.rs +++ b/rust/lance/tests/query/mod.rs @@ -3,12 +3,20 @@ use std::sync::Arc; -use arrow_array::{RecordBatch, UInt32Array}; +use arrow_array::{cast::AsArray, RecordBatch, UInt32Array}; use arrow_select::concat::concat_batches; use datafusion::datasource::MemTable; use datafusion::prelude::SessionContext; use lance::dataset::scanner::ColumnOrdering; use lance::Dataset; +use lance_datafusion::udf::register_functions; + +/// Creates a fresh SessionContext with Lance UDFs registered +fn create_datafusion_context() -> SessionContext { + let ctx = SessionContext::new(); + register_functions(&ctx); + ctx +} mod primitives; mod vectors; @@ -77,7 +85,7 @@ async fn test_filter(original: &RecordBatch, ds: &Dataset, predicate: &str) { .unwrap(); let scanned = scanner.try_into_batch().await.unwrap(); - let ctx = SessionContext::new(); + let ctx = create_datafusion_context(); let table = MemTable::try_new(original.schema(), vec![vec![original.clone()]]).unwrap(); ctx.register_table("t", Arc::new(table)).unwrap(); @@ -89,6 +97,78 @@ async fn test_filter(original: &RecordBatch, ds: &Dataset, predicate: &str) { assert_eq!(&expected, &scanned); } -async fn test_ann(_original: &RecordBatch, _ds: &Dataset, _predicate: Option<&str>) { - todo!("Scan ds with the ANN predicate"); +/// Test that an exhaustive ANN query gives the same results as brute force +/// KNN against the original batch. +/// +/// By exhaustive ANN, I mean we search all the partitions so we get perfect recall. +async fn test_ann(original: &RecordBatch, ds: &Dataset, column: &str, predicate: Option<&str>) { + // Extract first vector from the column as query vector + let vector_column = original.column_by_name(column).unwrap(); + let fixed_size_list = vector_column.as_fixed_size_list(); + + // Extract the first vector's values as a new array + let vector_values = fixed_size_list + .values() + .slice(0, fixed_size_list.value_length() as usize); + let query_vector = vector_values; + + let mut scanner = ds.scan(); + scanner + .nearest(column, query_vector.as_ref(), 10) + .unwrap() + .prefilter(true) + .refine(2); + if let Some(pred) = predicate { + scanner.filter(pred).unwrap(); + } + let result = scanner.try_into_batch().await.unwrap(); + + // Use DataFusion to apply same vector search using SQL + let ctx = create_datafusion_context(); + let table = MemTable::try_new(original.schema(), vec![vec![original.clone()]]).unwrap(); + ctx.register_table("t", Arc::new(table)).unwrap(); + + // Convert query vector to SQL array literal + let float_array = query_vector.as_primitive::(); + let vector_values_str = float_array + .values() + .iter() + .map(|v| v.to_string()) + .collect::>() + .join(", "); + + let sql = format!( + "SELECT * FROM t {} ORDER BY array_distance(t.{}, [{}]) LIMIT 10", + if let Some(pred) = predicate { + format!("WHERE {}", pred) + } else { + String::new() + }, + column, + vector_values_str + ); + + let df = ctx.sql(&sql).await.unwrap(); + let expected_batches = df.collect().await.unwrap(); + let expected = concat_batches(&original.schema(), &expected_batches).unwrap(); + + // Compare only the main data (excluding _distance column which Lance adds) + // We validate that both return the same number of rows and same row ordering + assert_eq!( + expected.num_rows(), + result.num_rows(), + "Different number of results" + ); + + // Compare the first few columns 
(excluding _distance) + for (col_idx, field) in original.schema().fields().iter().enumerate() { + let expected_col = expected.column(col_idx); + let result_col = result.column(col_idx); + assert_eq!( + expected_col, + result_col, + "Column '{}' differs between DataFusion and Lance results", + field.name() + ); + } } diff --git a/rust/lance/tests/query/vectors.rs b/rust/lance/tests/query/vectors.rs index b5a7b577b19..019d4cfc073 100644 --- a/rust/lance/tests/query/vectors.rs +++ b/rust/lance/tests/query/vectors.rs @@ -9,18 +9,28 @@ use lance::Dataset; use lance_datagen::{array, gen_batch, ArrayGeneratorExt, Dimension, RowCount}; use lance_index::IndexType; -#[tokio::test] -async fn test_query_vector() { - todo!() +fn date_as_i32(date: &str) -> i32 { + // Return as i32 days since unix epoch. + use chrono::{NaiveDate, TimeZone, Utc}; + + let parsed_date = + NaiveDate::parse_from_str(date, "%Y-%m-%d").expect("Date should be in YYYY-MM-DD format"); + + let unix_epoch = Utc.timestamp_opt(0, 0).unwrap().date_naive(); + + (parsed_date - unix_epoch).num_days() as i32 } #[tokio::test] async fn test_query_prefilter_date() { let batch = gen_batch() .col("id", array::step::()) - .col("value", array::step::().with_random_nulls(0.1)) + .col( + "value", + array::step_custom::(date_as_i32("2020-01-01"), 1).with_random_nulls(0.1), + ) .col("vec", array::rand_vec::(Dimension::from(16))) - .into_batch_rows(RowCount::from(60)) + .into_batch_rows(RowCount::from(256)) .unwrap(); DatasetTestCases::from_data(batch) .with_index_types("value", [None, Some(IndexType::BTree)]) @@ -29,18 +39,23 @@ async fn test_query_prefilter_date() { [ None, Some(IndexType::IvfPq), + Some(IndexType::IvfSq), Some(IndexType::IvfFlat), Some(IndexType::IvfHnswFlat), + Some(IndexType::IvfHnswPq), + Some(IndexType::IvfHnswSq), ], ) .run(|ds: Dataset, original: RecordBatch| async move { test_scan(&original, &ds).await; test_take(&original, &ds).await; - test_ann(&original, &ds, Some("value is not null")).await; + test_ann(&original, &ds, "vec", None).await; + test_ann(&original, &ds, "vec", Some("value is not null")).await; test_ann( &original, &ds, - Some("value >= DATE '2020-06-01' AND value <= DATE '2021-06-01'"), + "vec", + Some("value >= DATE '2020-01-03' AND value <= DATE '2020-01-25'"), ) .await; }) diff --git a/rust/lance/tests/utils/mod.rs b/rust/lance/tests/utils/mod.rs index 4bbe10769e3..2b2d00c9803 100644 --- a/rust/lance/tests/utils/mod.rs +++ b/rust/lance/tests/utils/mod.rs @@ -12,8 +12,12 @@ use lance::{ Dataset, }; use lance_index::scalar::ScalarIndexParams; +use lance_index::vector::hnsw::builder::HnswBuildParams; +use lance_index::vector::ivf::IvfBuildParams; +use lance_index::vector::pq::PQBuildParams; +use lance_index::vector::sq::builder::SQBuildParams; use lance_index::{DatasetIndexExt, IndexParams, IndexType}; -use lance_linalg::distance::MetricType; +use lance_linalg::distance::{DistanceType, MetricType}; #[derive(Clone, Copy, Debug)] pub enum Fragmentation { @@ -59,10 +63,8 @@ impl DatasetTestCases { fn generate_index_combinations(&self) -> Vec> { let mut combinations = Vec::new(); for (column, index_types) in &self.index_options { - for index_type in index_types { - if let Some(index_type) = index_type { - combinations.push(vec![(column.as_str(), index_type.clone())]); - } + for index_type in index_types.iter().flatten() { + combinations.push(vec![(column.as_str(), *index_type)]); } } combinations @@ -145,12 +147,33 @@ async fn build_dataset( )), IndexType::IvfFlat => { // Use a small number of partitions for 
testing - Box::new(VectorIndexParams::ivf_flat(4, MetricType::L2)) + Box::new(VectorIndexParams::ivf_flat(2, MetricType::L2)) } IndexType::IvfPq => { // Simple PQ params for testing - Box::new(VectorIndexParams::ivf_pq(4, 8, 2, MetricType::L2, 10)) + Box::new(VectorIndexParams::ivf_pq(2, 8, 2, MetricType::L2, 10)) } + IndexType::IvfSq => Box::new(VectorIndexParams::with_ivf_sq_params( + DistanceType::L2, + IvfBuildParams::new(2), + SQBuildParams::default(), + )), + IndexType::IvfHnswFlat => Box::new(VectorIndexParams::with_ivf_flat_params( + DistanceType::L2, + IvfBuildParams::new(2), + )), + IndexType::IvfHnswPq => Box::new(VectorIndexParams::with_ivf_hnsw_pq_params( + DistanceType::L2, + IvfBuildParams::new(2), + HnswBuildParams::default(), + PQBuildParams::new(2, 8), + )), + IndexType::IvfHnswSq => Box::new(VectorIndexParams::with_ivf_hnsw_sq_params( + DistanceType::L2, + IvfBuildParams::new(2), + HnswBuildParams::default(), + SQBuildParams::default(), + )), _ => { // For other index types, use default scalar params Box::new(ScalarIndexParams::default()) @@ -214,7 +237,7 @@ fn fill_deleted_rows(batch: &RecordBatch, deletions: DeletionState) -> RecordBat // Create an array of filler batches, one for each row that will be deleted let num_rows = batch.num_rows(); - let filler_batches = vec![filler_batch.clone(); num_rows]; + let filler_batches = vec![filler_batch; num_rows]; // Concatenate all filler batches into one let all_fillers = arrow_select::concat::concat_batches(&schema, &filler_batches).unwrap(); From 5ee02c632f8f5256496c9291e2dee0626979eacb Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 17 Sep 2025 13:19:09 -0700 Subject: [PATCH 07/18] add index fragment coverage --- rust/lance/src/dataset.rs | 2 -- rust/lance/tests/query/mod.rs | 5 +++++ rust/lance/tests/utils/mod.rs | 27 ++++++++++++++++++++++++--- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 33ba4ec86db..722ba7c97e1 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -64,8 +64,6 @@ use tracing::{info, instrument}; pub(crate) mod blob; mod branch_location; -#[cfg(test)] -mod broad_test; pub mod builder; pub mod cleanup; pub mod delta; diff --git a/rust/lance/tests/query/mod.rs b/rust/lance/tests/query/mod.rs index 08878def0a1..88fd4bf1529 100644 --- a/rust/lance/tests/query/mod.rs +++ b/rust/lance/tests/query/mod.rs @@ -21,6 +21,7 @@ fn create_datafusion_context() -> SessionContext { mod primitives; mod vectors; +/// Scanning an ordering by id should give same result as original. async fn test_scan(original: &RecordBatch, ds: &Dataset) { let mut scanner = ds.scan(); scanner @@ -33,6 +34,7 @@ async fn test_scan(original: &RecordBatch, ds: &Dataset) { assert_eq!(original, &scanned); } +/// Taking specific rows should give the same result as taking from the original. async fn test_take(original: &RecordBatch, ds: &Dataset) { let num_rows = original.num_rows(); let cases: Vec> = vec![ @@ -73,6 +75,8 @@ async fn test_take(original: &RecordBatch, ds: &Dataset) { } } +/// Querying with filter should give same result as filtering original +/// record batch in DataFusion. async fn test_filter(original: &RecordBatch, ds: &Dataset, predicate: &str) { // Scan with filter and order let mut scanner = ds.scan(); @@ -137,6 +141,7 @@ async fn test_ann(original: &RecordBatch, ds: &Dataset, column: &str, predicate: .collect::>() .join(", "); + // DataFusion's built-in `array_distance` function uses L2 distance. 
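+    // The vector indices created in `build_dataset` also use L2, so the brute-force
+    // ranking computed here is directly comparable to what the index returns.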
let sql = format!( "SELECT * FROM t {} ORDER BY array_distance(t.{}, [{}]) LIMIT 10", if let Some(pred) = predicate { diff --git a/rust/lance/tests/utils/mod.rs b/rust/lance/tests/utils/mod.rs index 2b2d00c9803..fa447e0b9b8 100644 --- a/rust/lance/tests/utils/mod.rs +++ b/rust/lance/tests/utils/mod.rs @@ -116,7 +116,7 @@ async fn build_dataset( let max_rows_per_file = if let Fragmentation::MultiFragment = fragmentation { 3 } else { - original.num_rows() + 1 + 1_000_000 }; let mut ds = InsertBuilder::new("memory://") @@ -132,7 +132,13 @@ async fn build_dataset( assert_eq!(ds.count_rows(None).await.unwrap(), original.num_rows()); - for (column, index_type) in indices { + let fragment_ids = ds + .manifest() + .fragments + .iter() + .map(|f| f.id as u32) + .collect::>(); + for (fragment_offset, (column, index_type)) in indices.iter().enumerate() { // TODO: when possible, make indices cover a portion of rows and not be // aligned between indices. let index_params: Box = match index_type { @@ -180,8 +186,23 @@ async fn build_dataset( } }; + // Index all but one fragment. Each new index we increase the offset. + // Start at the offset and then count (wrapping around if needed) + // fragments.length - 1. + // So for 3 fragments: + // Index 1: [0, 1] + // Index 2: [1, 2] + // Index 3: [2, 0] + let fragments_to_index = fragment_ids + .iter() + .cycle() + .skip(fragment_offset) + .take((fragment_ids.len() - 1).max(1)) + .cloned() + .collect::>(); + ds.create_index_builder(&[column], *index_type, index_params.as_ref()) - //.fragments() <--- Uncomment this line when implementing fragments + .fragments(fragments_to_index) .await .unwrap(); } From eb31e553099cfa81d9e606a36ed9c2525008fba7 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 4 Nov 2025 10:09:51 -0800 Subject: [PATCH 08/18] add workflow back --- .github/workflows/rust.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 3b17c8c9c5e..12909e09da8 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -145,6 +145,30 @@ jobs: run: | ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` cargo test --profile ci --locked --features ${ALL_FEATURES} + query-integration-tests: + runs-on: ubuntu-2404-4x-arm64 + timeout-minutes: 75 + steps: + - uses: actions/checkout@v4 + - name: Setup rust toolchain + run: | + rustup toolchain install stable + rustup default stable + - uses: rui314/setup-mold@v1 + - uses: Swatinem/rust-cache@v2 + with: + cache-targets: false + cache-workspace-crates: true + - name: Install dependencies + run: | + sudo apt -y -qq update + sudo apt install -y protobuf-compiler libssl-dev pkg-config + - name: Build query integration tests + run: | + cargo build --locked --profile release-with-debug --features fp16kernels --tests --test integration_tests + - name: Run query integration tests + run: | + cargo test --locked --profile release-with-debug --features fp16kernels --test integration_tests build-no-lock: runs-on: warp-ubuntu-latest-x64-8x timeout-minutes: 30 From fee88b89a87a9ecb744849092869398d037de536 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 4 Nov 2025 14:53:16 -0800 Subject: [PATCH 09/18] add ci --- .github/workflows/rust.yml | 22 +++++++++++----------- rust/lance/Cargo.toml | 2 ++ rust/lance/tests/query/primitives.rs | 1 + rust/lance/tests/query/vectors.rs | 1 + 4 files changed, 15 insertions(+), 11 
deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 12909e09da8..5b64c29085a 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -63,7 +63,7 @@ jobs: sudo apt install -y protobuf-compiler libssl-dev - name: Get features run: | - ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` + ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v -e protoc -e slow_tests | sort | uniq | paste -s -d "," -` echo "ALL_FEATURES=${ALL_FEATURES}" >> $GITHUB_ENV - name: Clippy run: cargo clippy --profile ci --locked --features ${{ env.ALL_FEATURES }} --all-targets -- -D warnings @@ -109,7 +109,7 @@ jobs: - name: Run tests if: ${{ matrix.toolchain == 'stable' }} run: | - ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` + ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v -e protoc -e slow_tests | sort | uniq | paste -s -d "," -` cargo llvm-cov --profile ci --locked --workspace --codecov --output-path coverage.codecov --features ${ALL_FEATURES} - name: Upload coverage to Codecov if: ${{ matrix.toolchain == 'stable' }} @@ -137,16 +137,16 @@ jobs: sudo apt install -y protobuf-compiler libssl-dev pkg-config - name: Build tests run: | - ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` + ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v -e protoc -e slow_tests | sort | uniq | paste -s -d "," -` cargo test --profile ci --locked --features ${ALL_FEATURES} --no-run - name: Start DynamodDB and S3 run: docker compose -f docker-compose.yml up -d --wait - name: Run tests run: | - ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` + ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v -e protoc -e slow_tests | sort | uniq | paste -s -d "," -` cargo test --profile ci --locked --features ${ALL_FEATURES} query-integration-tests: - runs-on: ubuntu-2404-4x-arm64 + runs-on: ubuntu-2404-8x-arm64 timeout-minutes: 75 steps: - uses: actions/checkout@v4 @@ -163,12 +163,12 @@ jobs: run: | sudo apt -y -qq update sudo apt install -y protobuf-compiler libssl-dev pkg-config - - name: Build query integration tests - run: | - cargo build --locked --profile release-with-debug --features fp16kernels --tests --test integration_tests + # - name: Build query integration tests + # run: | + # cargo build --locked -p lance --profile release-with-debug --features fp16kernels --tests --test integration_tests - name: Run query integration tests run: | - cargo test --locked --profile release-with-debug --features fp16kernels --test integration_tests + cargo test --locked -p lance --profile release-with-debug --features fp16kernels,slow_tests --test integration_tests build-no-lock: runs-on: warp-ubuntu-latest-x64-8x timeout-minutes: 30 @@ -188,7 +188,7 @@ jobs: sudo apt install -y protobuf-compiler libssl-dev - name: Build all run: | - ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | 
From 0d2621be4bcd2d346a4c0cf9b431fcc73d03a9a0 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Tue, 4 Nov 2025 14:54:08 -0800
Subject: [PATCH 10/18] simplify

---
 rust/lance/tests/integration_tests.rs | 1 +
 rust/lance/tests/query/primitives.rs  | 1 -
 rust/lance/tests/query/vectors.rs     | 1 -
 3 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/rust/lance/tests/integration_tests.rs b/rust/lance/tests/integration_tests.rs
index 90ad5a7f855..4bbdc894b1e 100644
--- a/rust/lance/tests/integration_tests.rs
+++ b/rust/lance/tests/integration_tests.rs
@@ -3,5 +3,6 @@
 
 // NOTE: we only create one integration test binary, to keep compilation overhead down.
 
+#[cfg(feature = "slow_tests")]
 mod query;
 mod utils;
diff --git a/rust/lance/tests/query/primitives.rs b/rust/lance/tests/query/primitives.rs
index 6813d4dc3e0..cb978074d37 100644
--- a/rust/lance/tests/query/primitives.rs
+++ b/rust/lance/tests/query/primitives.rs
@@ -41,7 +41,6 @@ async fn test_query_bool() {
 }
 
 #[tokio::test]
-#[cfg(feature = "slow_tests")]
 #[rstest::rstest]
 #[case::int8(DataType::Int8)]
 #[case::int16(DataType::Int16)]
diff --git a/rust/lance/tests/query/vectors.rs b/rust/lance/tests/query/vectors.rs
index cf1835d5eb9..019d4cfc073 100644
--- a/rust/lance/tests/query/vectors.rs
+++ b/rust/lance/tests/query/vectors.rs
@@ -22,7 +22,6 @@ fn date_as_i32(date: &str) -> i32 {
 }
 
 #[tokio::test]
-#[cfg(feature = "slow_tests")]
 async fn test_query_prefilter_date() {
     let batch = gen_batch()
         .col("id", array::step::())
From 7187672ae07511701422448a553d023844e259a9 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Tue, 4 Nov 2025 16:21:26 -0800
Subject: [PATCH 11/18] Faster build in CI

---
 .github/workflows/rust.yml | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 5b64c29085a..806d9c1fdba 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -148,6 +148,9 @@ jobs:
   query-integration-tests:
     runs-on: ubuntu-2404-8x-arm64
     timeout-minutes: 75
+    env:
+      # We use opt-level 1 which makes some tests 5x faster to run.
+      RUSTFLAGS: "-C debuginfo=1 -C opt-level 1"
     steps:
       - uses: actions/checkout@v4
       - name: Setup rust toolchain
@@ -163,12 +166,12 @@ jobs:
         run: |
          sudo apt -y -qq update
          sudo apt install -y protobuf-compiler libssl-dev pkg-config
-      # - name: Build query integration tests
-      #   run: |
-      #     cargo build --locked -p lance --profile release-with-debug --features fp16kernels --tests --test integration_tests
+      - name: Build query integration tests
+        run: |
+          cargo build --locked -p lance --no-default-features --features fp16kernels,slow_tests --tests --test integration_tests
       - name: Run query integration tests
         run: |
-          cargo test --locked -p lance --profile release-with-debug --features fp16kernels,slow_tests --test integration_tests
+          cargo test --locked -p lance --no-default-features --features fp16kernels,slow_tests --test integration_tests
   build-no-lock:
     runs-on: warp-ubuntu-latest-x64-8x
     timeout-minutes: 30

From 4557372a50664e0665b83165eb03f00ae98c688d Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Tue, 4 Nov 2025 16:35:29 -0800
Subject: [PATCH 12/18] comment out flakey cases

---
 rust/lance/tests/query/vectors.rs |  7 ++++---
 rust/lance/tests/utils/mod.rs     | 28 +++------------------------
 2 files changed, 7 insertions(+), 28 deletions(-)

diff --git a/rust/lance/tests/query/vectors.rs b/rust/lance/tests/query/vectors.rs
index 019d4cfc073..9d8c640a7e9 100644
--- a/rust/lance/tests/query/vectors.rs
+++ b/rust/lance/tests/query/vectors.rs
@@ -41,9 +41,10 @@ async fn test_query_prefilter_date() {
             Some(IndexType::IvfPq),
             Some(IndexType::IvfSq),
             Some(IndexType::IvfFlat),
-            Some(IndexType::IvfHnswFlat),
-            Some(IndexType::IvfHnswPq),
-            Some(IndexType::IvfHnswSq),
+            // TODO: HNSW results are very flakey.
+            // Some(IndexType::IvfHnswFlat),
+            // Some(IndexType::IvfHnswPq),
+            // Some(IndexType::IvfHnswSq),
         ],
     )
     .run(|ds: Dataset, original: RecordBatch| async move {
diff --git a/rust/lance/tests/utils/mod.rs b/rust/lance/tests/utils/mod.rs
index fa447e0b9b8..ff6b903501a 100644
--- a/rust/lance/tests/utils/mod.rs
+++ b/rust/lance/tests/utils/mod.rs
@@ -132,13 +132,7 @@ async fn build_dataset(
 
     assert_eq!(ds.count_rows(None).await.unwrap(), original.num_rows());
 
-    let fragment_ids = ds
-        .manifest()
-        .fragments
-        .iter()
-        .map(|f| f.id as u32)
-        .collect::<Vec<u32>>();
-    for (fragment_offset, (column, index_type)) in indices.iter().enumerate() {
+    for (column, index_type) in indices.iter() {
         // TODO: when possible, make indices cover a portion of rows and not be
         // aligned between indices.
         let index_params: Box<dyn IndexParams> = match index_type {
@@ -171,13 +165,13 @@ async fn build_dataset(
             IndexType::IvfHnswPq => Box::new(VectorIndexParams::with_ivf_hnsw_pq_params(
                 DistanceType::L2,
                 IvfBuildParams::new(2),
-                HnswBuildParams::default(),
+                HnswBuildParams::default().ef_construction(200),
                 PQBuildParams::new(2, 8),
             )),
             IndexType::IvfHnswSq => Box::new(VectorIndexParams::with_ivf_hnsw_sq_params(
                 DistanceType::L2,
                 IvfBuildParams::new(2),
-                HnswBuildParams::default(),
+                HnswBuildParams::default().ef_construction(200),
                 SQBuildParams::default(),
             )),
             _ => {
@@ -186,23 +180,7 @@ async fn build_dataset(
             }
         };
 
-        // Index all but one fragment. Each new index we increase the offset.
-        // Start at the offset and then count (wrapping around if needed)
-        // fragments.length - 1.
-        // So for 3 fragments:
-        // Index 1: [0, 1]
-        // Index 2: [1, 2]
-        // Index 3: [2, 0]
-        let fragments_to_index = fragment_ids
-            .iter()
-            .cycle()
-            .skip(fragment_offset)
-            .take((fragment_ids.len() - 1).max(1))
-            .cloned()
-            .collect::<Vec<u32>>();
-
         ds.create_index_builder(&[column], *index_type, index_params.as_ref())
-            .fragments(fragments_to_index)
             .await
             .unwrap();
     }
From 96a26f3d1a395d92aaf1d076273f2b749278434c Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Wed, 5 Nov 2025 08:09:33 -0800
Subject: [PATCH 13/18] fix ci

---
 .github/workflows/rust.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 806d9c1fdba..82ef21680f9 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -63,7 +63,7 @@ jobs:
           sudo apt install -y protobuf-compiler libssl-dev
       - name: Get features
         run: |
-          ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v -e protoc -e slow_tests | sort | uniq | paste -s -d "," -`
+          ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | sort | uniq | paste -s -d "," -`
           echo "ALL_FEATURES=${ALL_FEATURES}" >> $GITHUB_ENV
       - name: Clippy
         run: cargo clippy --profile ci --locked --features ${{ env.ALL_FEATURES }} --all-targets -- -D warnings
@@ -150,7 +150,7 @@ jobs:
     timeout-minutes: 75
     env:
       # We use opt-level 1 which makes some tests 5x faster to run.
-      RUSTFLAGS: "-C debuginfo=1 -C opt-level 1"
+      RUSTFLAGS: "-C debuginfo=1 -C opt-level=1"
     steps:
       - uses: actions/checkout@v4
       - name: Setup rust toolchain

From 973e1e268b91cf87e003ea94899304f715bd8942 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Wed, 5 Nov 2025 09:45:37 -0800
Subject: [PATCH 14/18] Address PR review feedback on integration test framework
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix generate_index_combinations to create cartesian product of all index
  options across columns, enabling tests with multiple indices
- Add better error messages with expect() at key test setup points
- Add test cases for duplicate indices in test_take
- Document why _distance column isn't validated in ANN tests
- Document that index parameters are for deterministic small test datasets

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
---
 rust/lance/tests/integration_tests.rs |  1 +
 rust/lance/tests/query/mod.rs         | 23 +++++++-----
 rust/lance/tests/utils/mod.rs         | 52 ++++++++++++++++++++++-----
 3 files changed, 60 insertions(+), 16 deletions(-)

diff --git a/rust/lance/tests/integration_tests.rs b/rust/lance/tests/integration_tests.rs
index 4bbdc894b1e..81c2535dd9c 100644
--- a/rust/lance/tests/integration_tests.rs
+++ b/rust/lance/tests/integration_tests.rs
@@ -5,4 +5,5 @@
 
 #[cfg(feature = "slow_tests")]
 mod query;
+#[cfg(feature = "slow_tests")]
 mod utils;
diff --git a/rust/lance/tests/query/mod.rs b/rust/lance/tests/query/mod.rs
index 88fd4bf1529..6e1c7ff3f57 100644
--- a/rust/lance/tests/query/mod.rs
+++ b/rust/lance/tests/query/mod.rs
@@ -38,12 +38,15 @@
 async fn test_take(original: &RecordBatch, ds: &Dataset) {
     let num_rows = original.num_rows();
     let cases: Vec<Vec<usize>> = vec![
-        vec![0, 1, 2],                   // First few rows
-        vec![5, 3, 1],                   // Out of order
-        vec![0],                         // Single row
-        vec![],                          // Empty
-        (0..num_rows.min(10)).collect(), // Sequential
-        vec![num_rows - 1, 0],           // Last and first
+        vec![0, 1, 2],                    // First few rows
+        vec![5, 3, 1],                    // Out of order
+        vec![0],                          // Single row
+        vec![],                           // Empty
+        (0..num_rows.min(10)).collect(),  // Sequential
+        vec![num_rows - 1, 0],            // Last and first
+        vec![1, 1, 2],                    // Duplicate indices
+        vec![0, 0, 0],                    // All same index
+        vec![num_rows - 1, num_rows - 1], // Duplicate of last row
     ];
 
     for indices in cases {
@@ -157,8 +160,12 @@ async fn test_ann(original: &RecordBatch, ds: &Dataset, column: &str, predicate:
     let expected_batches = df.collect().await.unwrap();
     let expected = concat_batches(&original.schema(), &expected_batches).unwrap();
 
-    // Compare only the main data (excluding _distance column which Lance adds)
-    // We validate that both return the same number of rows and same row ordering
+    // Compare only the main data (excluding _distance column which Lance adds).
+    // We validate that both return the same number of rows and same row ordering.
+    // Note: We don't validate the _distance column values because:
+    // 1. ANN indices provide approximate distances, not exact values
+    // 2. Some distance functions return ordering values (e.g., squared euclidean
+    //    without the final sqrt step) rather than true distances
     assert_eq!(
         expected.num_rows(),
         result.num_rows(),
diff --git a/rust/lance/tests/utils/mod.rs b/rust/lance/tests/utils/mod.rs
index ff6b903501a..9ef9f39b10d 100644
--- a/rust/lance/tests/utils/mod.rs
+++ b/rust/lance/tests/utils/mod.rs
@@ -61,13 +61,39 @@ impl DatasetTestCases {
     }
 
     fn generate_index_combinations(&self) -> Vec<Vec<(&str, IndexType)>> {
-        let mut combinations = Vec::new();
-        for (column, index_types) in &self.index_options {
-            for index_type in index_types.iter().flatten() {
-                combinations.push(vec![(column.as_str(), *index_type)]);
+        if self.index_options.is_empty() {
+            return vec![vec![]];
+        }
+
+        fn generate_recursive<'a>(
+            options: &'a [(String, Vec<Option<IndexType>>)],
+            current_idx: usize,
+            current_combination: Vec<(&'a str, IndexType)>,
+            results: &mut Vec<Vec<(&'a str, IndexType)>>,
+        ) {
+            if current_idx == options.len() {
+                // Only add non-empty combinations (filter out all-None case)
+                if !current_combination.is_empty() {
+                    results.push(current_combination);
+                }
+                return;
+            }
+
+            let (column, index_types) = &options[current_idx];
+
+            // Try each index type for this column (including None)
+            for index_type_opt in index_types {
+                let mut next_combination = current_combination.clone();
+                if let Some(index_type) = index_type_opt {
+                    next_combination.push((column.as_str(), *index_type));
+                }
+                generate_recursive(options, current_idx + 1, next_combination, results);
             }
         }
-        combinations
+
+        let mut results = Vec::new();
+        generate_recursive(&self.index_options, 0, Vec::new(), &mut results);
+        results
     }
 
     pub async fn run(self, test_fn: F) -> Fut::Output
@@ -126,15 +152,20 @@ async fn build_dataset(
         })
         .execute(vec![data_to_write])
         .await
-        .unwrap();
+        .expect("Failed to create test dataset");
 
-    ds.delete("id = -1").await.unwrap();
+    ds.delete("id = -1")
+        .await
+        .expect("Failed to delete filler rows (id = -1)");
 
     assert_eq!(ds.count_rows(None).await.unwrap(), original.num_rows());
 
     for (column, index_type) in indices.iter() {
         // TODO: when possible, make indices cover a portion of rows and not be
         // aligned between indices.
+
+        // Index parameters are chosen to make search results deterministic for small
+        // test datasets, not for production use.
         let index_params: Box<dyn IndexParams> = match index_type {
@@ -182,7 +213,12 @@ async fn build_dataset(
 
         ds.create_index_builder(&[column], *index_type, index_params.as_ref())
             .await
-            .unwrap();
+            .unwrap_or_else(|e| {
+                panic!(
+                    "Failed to create index on column '{}' with type {:?}: {}",
+                    column, index_type, e
+                )
+            });
     }
 
     ds
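The cartesian-product change in the hunk above is easier to see with a concrete input. The sketch below is an editorial illustration only, not code from the patch: the stub enum, helper name, and column names are invented, and it expands combinations iteratively rather than with the recursive generate_index_combinations, but it yields the same result of one choice per column with the all-None combination dropped.

    // Sketch: every column contributes either "no index" (None) or one of its
    // index types; combinations where nothing is indexed are filtered out.
    #[derive(Clone, Copy, Debug)]
    enum Idx {
        BTree,
        IvfPq,
    }

    fn combos(options: &[(String, Vec<Option<Idx>>)]) -> Vec<Vec<(String, Idx)>> {
        let mut results: Vec<Vec<(String, Idx)>> = vec![vec![]];
        for (column, choices) in options {
            let mut next = Vec::new();
            for combo in &results {
                for choice in choices {
                    let mut c = combo.clone();
                    if let Some(idx) = choice {
                        c.push((column.clone(), *idx));
                    }
                    next.push(c);
                }
            }
            results = next;
        }
        results.retain(|c| !c.is_empty());
        results
    }

    fn main() {
        let options = vec![
            ("id".to_string(), vec![None, Some(Idx::BTree)]),
            ("vec".to_string(), vec![None, Some(Idx::IvfPq)]),
        ];
        // Expected: id only, vec only, and id+vec indexed -- 3 combinations.
        assert_eq!(combos(&options).len(), 3);
    }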
From 9c01a7b30dff0706a47a81f1596a7b38eb871124 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Wed, 5 Nov 2025 09:49:37 -0800
Subject: [PATCH 15/18] Apply suggestion from @wjones127

---
 rust/lance/tests/query/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rust/lance/tests/query/mod.rs b/rust/lance/tests/query/mod.rs
index 6e1c7ff3f57..2f916ab5fa7 100644
--- a/rust/lance/tests/query/mod.rs
+++ b/rust/lance/tests/query/mod.rs
@@ -21,7 +21,7 @@ fn create_datafusion_context() -> SessionContext {
 mod primitives;
 mod vectors;
 
-/// Scanning an ordering by id should give same result as original.
+/// Scanning and ordering by id should give same result as original.
 async fn test_scan(original: &RecordBatch, ds: &Dataset) {
     let mut scanner = ds.scan();
     scanner
From ee05a96b8ef0b3acfde007f899de3dc671a759d4 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Wed, 5 Nov 2025 09:49:43 -0800
Subject: [PATCH 16/18] Apply suggestion from @wjones127

---
 rust/lance/tests/query/mod.rs | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/rust/lance/tests/query/mod.rs b/rust/lance/tests/query/mod.rs
index 2f916ab5fa7..8833c4a0bdc 100644
--- a/rust/lance/tests/query/mod.rs
+++ b/rust/lance/tests/query/mod.rs
@@ -50,11 +50,6 @@ async fn test_take(original: &RecordBatch, ds: &Dataset) {
     ];
 
     for indices in cases {
-        // Skip cases with invalid indices
-        if indices.iter().any(|&i| i >= num_rows) {
-            continue;
-        }
-
         // Convert to u64 for Lance take
         let indices_u64: Vec<u64> = indices.iter().map(|&i| i as u64).collect();
 

From dcd60211b9927a595677a6b58be354a928975187 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Wed, 5 Nov 2025 09:49:48 -0800
Subject: [PATCH 17/18] Apply suggestion from @wjones127

---
 rust/lance/tests/query/mod.rs | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/rust/lance/tests/query/mod.rs b/rust/lance/tests/query/mod.rs
index 8833c4a0bdc..5816d786f89 100644
--- a/rust/lance/tests/query/mod.rs
+++ b/rust/lance/tests/query/mod.rs
@@ -53,11 +53,6 @@ async fn test_take(original: &RecordBatch, ds: &Dataset) {
         // Convert to u64 for Lance take
         let indices_u64: Vec<u64> = indices.iter().map(|&i| i as u64).collect();
 
-        if indices_u64.is_empty() {
-            // Skip empty case as Lance may not handle it the same way
-            continue;
-        }
-
         let taken_ds = ds.take(&indices_u64, ds.schema().clone()).await.unwrap();
 
         // Take from RecordBatch using arrow::compute

From 030567a86caf94e9f33874e7255f9f80cc0dfb20 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Mon, 8 Dec 2025 17:05:26 -0800
Subject: [PATCH 18/18] fix runner

---
 .github/workflows/rust.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 82ef21680f9..a56d2935207 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -146,7 +146,7 @@ jobs:
           ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v -e protoc -e slow_tests | sort | uniq | paste -s -d "," -`
           cargo test --profile ci --locked --features ${ALL_FEATURES}
   query-integration-tests:
-    runs-on: ubuntu-2404-8x-arm64
+    runs-on: warp-ubuntu-latest-x64-4x
    timeout-minutes: 75
     env:
       # We use opt-level 1 which makes some tests 5x faster to run.
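With the full series applied, the gated suite can be reproduced outside CI with the same invocation the final workflow uses: cargo test --locked -p lance --no-default-features --features fp16kernels,slow_tests --test integration_tests. The RUSTFLAGS debuginfo/opt-level setting in the job is a build-speed optimization and should not be required for a one-off local run.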