Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions python/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,15 @@ use lance::dataset::{
scanner::Scanner as LanceScanner, transaction::Operation as LanceOperation,
Dataset as LanceDataset, ReadParams, Version, WriteMode, WriteParams,
};
use lance::index::IndexParams;
use lance::index::{
scalar::ScalarIndexParams,
vector::{diskann::DiskANNParams, VectorIndexParams},
DatasetIndexExt,
};
use lance_arrow::as_fixed_size_list_array;
use lance_core::{datatypes::Schema, format::Fragment, io::object_store::ObjectStoreParams};
use lance_index::{
vector::{ivf::IvfBuildParams, pq::PQBuildParams},
IndexType,
DatasetIndexExt, IndexParams, IndexType,
};
use lance_linalg::distance::MetricType;
use pyo3::exceptions::PyStopIteration;
Expand Down Expand Up @@ -808,7 +806,9 @@ impl Dataset {
}

fn count_unindexed_rows(&self, index_name: String) -> PyResult<Option<usize>> {
let idx = RT.block_on(None, self.ds.load_index_by_name(index_name.as_str()))?;
let idx = RT
.block_on(None, self.ds.load_index_by_name(index_name.as_str()))?
.map_err(|err| PyIOError::new_err(err.to_string()))?;
if let Some(index) = idx {
RT.block_on(
None,
Expand All @@ -825,7 +825,9 @@ impl Dataset {
}

fn count_indexed_rows(&self, index_name: String) -> PyResult<Option<usize>> {
let idx = RT.block_on(None, self.ds.load_index_by_name(index_name.as_str()))?;
let idx = RT
.block_on(None, self.ds.load_index_by_name(index_name.as_str()))?
.map_err(|err| PyIOError::new_err(err.to_string()))?;
if let Some(index) = idx {
RT.block_on(
None,
Expand Down
25 changes: 11 additions & 14 deletions rust/lance-core/src/format/index.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,16 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
// Copyright 2024 Lance Developers.
//
// http://www.apache.org/licenses/LICENSE-2.0
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Metadata for index

Expand Down
3 changes: 3 additions & 0 deletions rust/lance-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,6 @@ lazy_static::lazy_static! {
}

pub(crate) const DELETION_DIRS: &str = "_deletions";

/// Trait for a Lance Dataset
pub trait Dataset {}
Comment on lines +37 to +39
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry a left over

13 changes: 12 additions & 1 deletion rust/lance-index/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2023 Lance Developers.
// Copyright 2024 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand All @@ -23,7 +23,9 @@ use lance_core::Result;
use roaring::RoaringBitmap;

pub mod scalar;
pub mod traits;
pub mod vector;
pub use crate::traits::*;

pub const INDEX_FILE_NAME: &str = "index.idx";

Expand All @@ -33,16 +35,21 @@ pub mod pb {
}

/// Generic methods common across all types of secondary indices
///
#[async_trait]
pub trait Index: Send + Sync {
/// Cast to [Any].
fn as_any(&self) -> &dyn Any;

/// Cast to [Index]
fn as_index(self: Arc<Self>) -> Arc<dyn Index>;

/// Retrieve index statistics as a JSON string
fn statistics(&self) -> Result<String>;

/// Get the type of the index
fn index_type(&self) -> IndexType;

/// Read through the index and determine which fragment ids are covered by the index
///
/// This is a kind of slow operation. It's better to use the fragment_bitmap. This
Expand All @@ -67,3 +74,7 @@ impl std::fmt::Display for IndexType {
}
}
}

pub trait IndexParams: Send + Sync {
fn as_any(&self) -> &dyn Any;
}
84 changes: 84 additions & 0 deletions rust/lance-index/src/traits.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// Copyright 2024 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use async_trait::async_trait;

use lance_core::{format::Index, Result};

use crate::{IndexParams, IndexType};

// Extends Lance Dataset with secondary index.
///
#[async_trait]
pub trait DatasetIndexExt {
/// Create indices on columns.
///
/// Upon finish, a new dataset version is generated.
///
/// Parameters:
///
/// - `columns`: the columns to build the indices on.
/// - `index_type`: specify [`IndexType`].
/// - `name`: optional index name. Must be unique in the dataset.
/// if not provided, it will auto-generate one.
/// - `params`: index parameters.
/// - `replace`: replace the existing index if it exists.
async fn create_index(
&mut self,
columns: &[&str],
index_type: IndexType,
name: Option<String>,
params: &dyn IndexParams,
replace: bool,
) -> Result<()>;

/// Read all indices of this Dataset version.
async fn load_indices(&self) -> Result<Vec<Index>>;

/// Loads all the indies of a given UUID.
///
/// Note that it is possible to have multiple indices with the same UUID,
/// as they are the deltas of the same index.
async fn load_index(&self, uuid: &str) -> Result<Option<Index>> {
self.load_indices()
.await
.map(|indices| indices.into_iter().find(|idx| idx.uuid.to_string() == uuid))
}

/// Loads a specific index with the given index name
async fn load_index_by_name(&self, name: &str) -> Result<Option<Index>> {
self.load_indices()
.await
.map(|indices| indices.into_iter().find(|idx| idx.name == name))
}

/// Loads a specific index with the given index name.
async fn load_scalar_index_for_column(&self, col: &str) -> Result<Option<Index>>;

/// Optimize indices.
async fn optimize_indices(&mut self) -> Result<()>;

/// Find index with a given index_name and return its serialized statistics.
async fn index_statistics(&self, index_name: &str) -> Result<Option<String>>;

/// Count the rows that are not indexed by the given index.
///
/// TODO: move to [DatasetInternalExt]
async fn count_unindexed_rows(&self, index_name: &str) -> Result<Option<usize>>;

/// Count the rows that are indexed by the given index.
///
/// TODO: move to [DatasetInternalExt]
async fn count_indexed_rows(&self, index_name: &str) -> Result<Option<usize>>;
Comment on lines +53 to +83
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All of these methods return Result<Option<...>> which is a little confusing. We should document what None means in each case because it is slightly different depending on the function. Or, we could probably change most of these to just Result<...> and raise an error in the None case.

load_index_by_name -> No index exists with that name, maybe just error here?
load_scalar_index_for_column -> No index exists for that column, should definitely error here looking at usage
index_statistics -> No index exists with that name, we should error here
count_unindexed_rows -> An index exists, but we couldn't determine the row count because of old manifest version
count_indexed_rows -> An index exists, but we couldn't determine the row count because of old manifest version

For example, looking at the python version of count_unindexed_rows, it is wrong:

    fn count_unindexed_rows(&self, index_name: String) -> PyResult<Option<usize>> {
        let idx = RT.block_on(None, self.ds.load_index_by_name(index_name.as_str()))?;
        if let Some(index) = idx {
            RT.block_on(
                None,
                self.ds
                    .count_unindexed_rows(index.uuid.to_string().as_str()),
            )?
            .map_err(|err| PyIOError::new_err(err.to_string()))
        } else {
            THIS IS NOT THE CORRECT ERROR MESSAGE
            Err(PyIOError::new_err(format!(
                "Index {} not found",
                index_name
            )))
        }
    }

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was copied from the previous code. I meant to fix them in the next follow up.

Also try to eliminate these single used APIs and just put them into the json blob in index_stats.

}
6 changes: 3 additions & 3 deletions rust/lance/benches/ivf_pq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ use std::sync::Arc;
use arrow_array::{FixedSizeListArray, RecordBatch, RecordBatchIterator};
use arrow_schema::{DataType, Field, FieldRef, Schema};
use criterion::{criterion_group, criterion_main, Criterion};

use lance::{
arrow::*,
dataset::{WriteMode, WriteParams},
index::{vector::VectorIndexParams, DatasetIndexExt},
index::vector::VectorIndexParams,
Dataset,
};

use lance_index::IndexType;
use lance_index::{DatasetIndexExt, IndexType};
use lance_linalg::distance::MetricType;
use lance_testing::datagen::generate_random_array;
#[cfg(target_os = "linux")]
Expand Down
13 changes: 6 additions & 7 deletions rust/lance/benches/vector_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,17 @@ use arrow_array::{
use arrow_schema::{DataType, Field, FieldRef, Schema as ArrowSchema};
use criterion::{criterion_group, criterion_main, Criterion};
use futures::TryStreamExt;
use lance::dataset::builder::DatasetBuilder;
use lance_index::IndexType;
#[cfg(target_os = "linux")]
use pprof::criterion::{Output, PProfProfiler};
use rand::{self, Rng};

use lance::dataset::{WriteMode, WriteParams};
use lance::dataset::{builder::DatasetBuilder, Dataset, WriteMode, WriteParams};
use lance::index::vector::VectorIndexParams;
use lance::index::DatasetIndexExt;
use lance::{arrow::as_fixed_size_list_array, dataset::Dataset};
use lance_arrow::FixedSizeListArrayExt;
use lance_index::vector::{ivf::IvfBuildParams, pq::PQBuildParams};
use lance_arrow::{as_fixed_size_list_array, FixedSizeListArrayExt};
use lance_index::{
vector::{ivf::IvfBuildParams, pq::PQBuildParams},
DatasetIndexExt, IndexType,
};
use lance_linalg::distance::MetricType;

fn bench_ivf_pq_index(c: &mut Criterion) {
Expand Down
3 changes: 2 additions & 1 deletion rust/lance/src/bin/lq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ use futures::TryStreamExt;
use snafu::{location, Location};

use lance::dataset::Dataset;
use lance::index::{vector::VectorIndexParams, DatasetIndexExt};
use lance::index::vector::VectorIndexParams;
use lance::{Error, Result};
use lance_index::DatasetIndexExt;
use lance_linalg::distance::MetricType;

#[derive(Parser)]
Expand Down
Loading