Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 33 additions & 6 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ jobs:
sudo apt install -y protobuf-compiler libssl-dev
- name: Get features
run: |
ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -`
ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | sort | uniq | paste -s -d "," -`
echo "ALL_FEATURES=${ALL_FEATURES}" >> $GITHUB_ENV
- name: Clippy
run: cargo clippy --profile ci --locked --features ${{ env.ALL_FEATURES }} --all-targets -- -D warnings
Expand Down Expand Up @@ -104,7 +104,7 @@ jobs:
uses: taiki-e/install-action@cargo-llvm-cov
- name: Run tests
run: |
ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -`
ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v -e protoc -e slow_tests | sort | uniq | paste -s -d "," -`
cargo +nightly llvm-cov --profile ci --locked --workspace --codecov --output-path coverage.codecov --features ${ALL_FEATURES}
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
Expand All @@ -131,14 +131,41 @@ jobs:
sudo apt install -y protobuf-compiler libssl-dev pkg-config
- name: Build tests
run: |
ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -`
ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v -e protoc -e slow_tests | sort | uniq | paste -s -d "," -`
cargo test --profile ci --locked --features ${ALL_FEATURES} --no-run
- name: Start DynamoDB and S3
run: docker compose -f docker-compose.yml up -d --wait
- name: Run tests
run: |
ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -`
ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v -e protoc -e slow_tests | sort | uniq | paste -s -d "," -`
cargo test --profile ci --locked --features ${ALL_FEATURES}
query-integration-tests:
runs-on: warp-ubuntu-latest-x64-4x
timeout-minutes: 75
env:
# We use opt-level 1 which makes some tests 5x faster to run.
RUSTFLAGS: "-C debuginfo=1 -C opt-level=1"
steps:
- uses: actions/checkout@v4
- name: Setup rust toolchain
run: |
rustup toolchain install stable
rustup default stable
- uses: rui314/setup-mold@v1
- uses: Swatinem/rust-cache@v2
with:
cache-targets: false
cache-workspace-crates: true
- name: Install dependencies
run: |
sudo apt -y -qq update
sudo apt install -y protobuf-compiler libssl-dev pkg-config
- name: Build query integration tests
run: |
cargo build --locked -p lance --no-default-features --features fp16kernels,slow_tests --tests --test integration_tests
- name: Run query integration tests
run: |
cargo test --locked -p lance --no-default-features --features fp16kernels,slow_tests --test integration_tests
build-no-lock:
runs-on: warp-ubuntu-latest-x64-8x
timeout-minutes: 30
Expand All @@ -158,7 +185,7 @@ jobs:
sudo apt install -y protobuf-compiler libssl-dev
- name: Build all
run: |
ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -`
ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v -e protoc -e slow_tests | sort | uniq | paste -s -d "," -`
cargo build --profile ci --benches --features ${ALL_FEATURES} --tests
mac-build:
runs-on: warp-macos-14-arm64-6x
Expand Down Expand Up @@ -242,5 +269,5 @@ jobs:
rustup default ${{ matrix.msrv }}
- name: cargo +${{ matrix.msrv }} check
run: |
ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -`
ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v -e protoc -e slow_tests | sort | uniq | paste -s -d "," -`
cargo check --profile ci --workspace --tests --benches --features ${ALL_FEATURES}
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions rust/lance/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ test-log.workspace = true
tracing-chrome = "0.7.1"
rstest = { workspace = true }
tracking-allocator = { version = "0.4", features = ["tracing-compat"] }
paste = "1.0"
# For S3 / DynamoDB tests
aws-config = { workspace = true }
aws-sdk-s3 = { workspace = true }
Expand Down Expand Up @@ -133,6 +134,8 @@ gcp = ["lance-io/gcp"]
azure = ["lance-io/azure"]
oss = ["lance-io/oss"]
huggingface = ["lance-io/huggingface"]
# Enable slow integration tests (disabled by default in CI)
slow_tests = []

[[bin]]
name = "lq"
Expand Down
11 changes: 10 additions & 1 deletion rust/lance/src/index/create.rs
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,16 @@ impl<'a> CreateIndexBuilder<'a> {
)
.await?
}
(IndexType::Vector, LANCE_VECTOR_INDEX) => {
(
IndexType::Vector
| IndexType::IvfPq
| IndexType::IvfSq
| IndexType::IvfFlat
| IndexType::IvfHnswFlat
| IndexType::IvfHnswPq
| IndexType::IvfHnswSq,
LANCE_VECTOR_INDEX,
) => {
// Vector index params.
let vec_params = self
.params
Expand Down
9 changes: 9 additions & 0 deletions rust/lance/tests/integration_tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

// NOTE: we only create one integration test binary, to keep compilation overhead down.

// Both modules are gated behind the `slow_tests` feature so that the default
// `cargo test` run (and the fast CI jobs, which strip `slow_tests` from
// ALL_FEATURES) compiles this binary to an empty test harness.
#[cfg(feature = "slow_tests")]
mod query;
#[cfg(feature = "slow_tests")]
mod utils;
176 changes: 176 additions & 0 deletions rust/lance/tests/query/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

use std::sync::Arc;

use arrow_array::{cast::AsArray, RecordBatch, UInt32Array};
use arrow_select::concat::concat_batches;
use datafusion::datasource::MemTable;
use datafusion::prelude::SessionContext;
use lance::dataset::scanner::ColumnOrdering;
use lance::Dataset;
use lance_datafusion::udf::register_functions;

/// Builds a new DataFusion `SessionContext` with all Lance UDFs registered,
/// ready for running comparison queries against in-memory batches.
fn create_datafusion_context() -> SessionContext {
    let session = SessionContext::new();
    register_functions(&session);
    session
}

mod primitives;
mod vectors;

/// Scanning and ordering by id should give same result as original.
async fn test_scan(original: &RecordBatch, ds: &Dataset) {
    // Order by `id` so the scan output is deterministic and directly
    // comparable to the original batch.
    let ordering = vec![ColumnOrdering::asc_nulls_first("id".to_string())];
    let mut scanner = ds.scan();
    scanner.order_by(Some(ordering)).unwrap();

    let actual = scanner.try_into_batch().await.unwrap();
    assert_eq!(original, &actual);
}

/// Taking specific rows should give the same result as taking from the original.
///
/// Exercises boundary patterns: first rows, out-of-order, single row, empty
/// selection, a sequential run, last+first, and duplicate indices.
async fn test_take(original: &RecordBatch, ds: &Dataset) {
    let num_rows = original.num_rows();
    // The index patterns below reference rows up to index 5 and compute
    // `num_rows - 1`. Guard up front so a too-small batch fails with a clear
    // message instead of a usize underflow or an opaque out-of-bounds take.
    assert!(
        num_rows >= 6,
        "test_take requires at least 6 rows, got {}",
        num_rows
    );
    let cases: Vec<Vec<usize>> = vec![
        vec![0, 1, 2],                    // First few rows
        vec![5, 3, 1],                    // Out of order
        vec![0],                          // Single row
        vec![],                           // Empty
        (0..num_rows.min(10)).collect(),  // Sequential
        vec![num_rows - 1, 0],            // Last and first
        vec![1, 1, 2],                    // Duplicate indices
        vec![0, 0, 0],                    // All same index
        vec![num_rows - 1, num_rows - 1], // Duplicate of last row
    ];

    for indices in cases {
        // Lance's take API expects u64 row offsets.
        let indices_u64: Vec<u64> = indices.iter().map(|&i| i as u64).collect();
        let taken_ds = ds.take(&indices_u64, ds.schema().clone()).await.unwrap();

        // Compute the expected result with arrow's take kernel on the
        // in-memory batch (the kernel uses u32 indices).
        let indices_u32: Vec<u32> = indices.iter().map(|&i| i as u32).collect();
        let indices_array = UInt32Array::from(indices_u32);
        let taken_rb = arrow::compute::take_record_batch(original, &indices_array).unwrap();

        assert_eq!(
            taken_rb, taken_ds,
            "Take results don't match for indices: {:?}",
            indices
        );
    }
}

/// Querying with filter should give same result as filtering original
/// record batch in DataFusion.
async fn test_filter(original: &RecordBatch, ds: &Dataset, predicate: &str) {
    // Lance side: filtered scan, ordered by `id` so the output is
    // deterministic and directly comparable.
    let ordering = Some(vec![ColumnOrdering::asc_nulls_first("id".to_string())]);
    let mut scanner = ds.scan();
    scanner.filter(predicate).unwrap().order_by(ordering).unwrap();
    let actual = scanner.try_into_batch().await.unwrap();

    // DataFusion side: run the same predicate as SQL over the in-memory batch.
    let ctx = create_datafusion_context();
    let mem_table = MemTable::try_new(original.schema(), vec![vec![original.clone()]]).unwrap();
    ctx.register_table("t", Arc::new(mem_table)).unwrap();

    let query = format!("SELECT * FROM t WHERE {} ORDER BY id", predicate);
    let batches = ctx.sql(&query).await.unwrap().collect().await.unwrap();
    let expected = concat_batches(&original.schema(), &batches).unwrap();

    assert_eq!(&expected, &actual);
}

/// Test that an exhaustive ANN query gives the same results as brute force
/// KNN against the original batch.
///
/// By exhaustive ANN, I mean we search all the partitions so we get perfect recall.
async fn test_ann(original: &RecordBatch, ds: &Dataset, column: &str, predicate: Option<&str>) {
    // Extract first vector from the column as query vector.
    // The column is assumed to be a FixedSizeList of Float32 (see the
    // as_primitive::<Float32Type> cast below).
    let vector_column = original.column_by_name(column).unwrap();
    let fixed_size_list = vector_column.as_fixed_size_list();

    // Extract the first vector's values as a new array: a slice of the flat
    // child values of length `value_length` starting at offset 0 is exactly
    // row 0 of the FixedSizeList.
    let vector_values = fixed_size_list
        .values()
        .slice(0, fixed_size_list.value_length() as usize);
    let query_vector = vector_values;

    // Lance side: top-10 nearest neighbors, with the filter applied before
    // the vector search (prefilter) when a predicate is given.
    // NOTE(review): refine(2) re-ranks a 2x oversampled candidate set —
    // presumably enough for exact agreement here; confirm if flaky.
    let mut scanner = ds.scan();
    scanner
        .nearest(column, query_vector.as_ref(), 10)
        .unwrap()
        .prefilter(true)
        .refine(2);
    if let Some(pred) = predicate {
        scanner.filter(pred).unwrap();
    }
    let result = scanner.try_into_batch().await.unwrap();

    // Use DataFusion to apply same vector search using SQL (brute-force KNN
    // over the original in-memory batch).
    let ctx = create_datafusion_context();
    let table = MemTable::try_new(original.schema(), vec![vec![original.clone()]]).unwrap();
    ctx.register_table("t", Arc::new(table)).unwrap();

    // Convert query vector to SQL array literal, e.g. "1.0, 2.5, 3.0".
    let float_array = query_vector.as_primitive::<arrow::datatypes::Float32Type>();
    let vector_values_str = float_array
        .values()
        .iter()
        .map(|v| v.to_string())
        .collect::<Vec<_>>()
        .join(", ");

    // DataFusion's built-in `array_distance` function uses L2 distance.
    // The WHERE clause is injected only when a predicate was supplied.
    let sql = format!(
        "SELECT * FROM t {} ORDER BY array_distance(t.{}, [{}]) LIMIT 10",
        if let Some(pred) = predicate {
            format!("WHERE {}", pred)
        } else {
            String::new()
        },
        column,
        vector_values_str
    );

    let df = ctx.sql(&sql).await.unwrap();
    let expected_batches = df.collect().await.unwrap();
    let expected = concat_batches(&original.schema(), &expected_batches).unwrap();

    // Compare only the main data (excluding _distance column which Lance adds).
    // We validate that both return the same number of rows and same row ordering.
    // Note: We don't validate the _distance column values because:
    // 1. ANN indices provide approximate distances, not exact values
    // 2. Some distance functions return ordering values (e.g., squared euclidean
    //    without the final sqrt step) rather than true distances
    assert_eq!(
        expected.num_rows(),
        result.num_rows(),
        "Different number of results"
    );

    // Compare every column of the original schema; the `_distance` column
    // Lance appends is not in that schema, so it is naturally skipped.
    for (col_idx, field) in original.schema().fields().iter().enumerate() {
        let expected_col = expected.column(col_idx);
        let result_col = result.column(col_idx);
        assert_eq!(
            expected_col,
            result_col,
            "Column '{}' differs between DataFusion and Lance results",
            field.name()
        );
    }
}
Loading