Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion java/src/main/java/org/lance/Dataset.java
Original file line number Diff line number Diff line change
Expand Up @@ -1207,7 +1207,11 @@ public List<String> listIndexes() {
/**
* Get all indexes with full metadata.
*
* @return list of Index objects with complete metadata including index type and fragment coverage
* <p>Each returned {@link Index} is a physical index segment from the manifest. Use {@link
* #describeIndices()} for the logical-index view.
*
* @return list of Index objects with complete segment metadata, including index type and fragment
* coverage
*/
public List<Index> getIndexes() {
try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) {
Expand All @@ -1218,6 +1222,20 @@ public List<Index> getIndexes() {

private native List<Index> nativeGetIndexes();

/**
 * Look up the physical index segments that belong to one logical index.
 *
 * <p>Every segment is loaded through {@link #getIndexes()} and only those whose name equals
 * {@code indexName} are kept.
 *
 * @param indexName logical index name; a null or empty value is rejected by the precondition
 *     check
 * @return the physical index segments whose name matches {@code indexName}
 */
public List<Index> getIndexSegments(String indexName) {
  boolean hasName = indexName != null && !indexName.isEmpty();
  Preconditions.checkArgument(hasName, "indexName cannot be null or empty");
  List<Index> everySegment = getIndexes();
  return everySegment.stream()
      .filter(segment -> indexName.equals(segment.name()))
      .collect(Collectors.toList());
}

/**
* Get statistics for a specific index in JSON form.
*
Expand Down
9 changes: 9 additions & 0 deletions java/src/main/java/org/lance/index/IndexDescription.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,15 @@ public List<Index> getMetadata() {
return metadata;
}

/**
 * Returns the physical index segments that make up this logical index.
 *
 * <p>Hands back the same list as {@link #getMetadata()}; the method exists only to give that
 * list a less ambiguous name.
 */
public List<Index> getSegments() {
  return this.metadata;
}

/**
* JSON representation of index-specific details.
*
Expand Down
10 changes: 10 additions & 0 deletions java/src/test/java/org/lance/DatasetTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1914,14 +1914,24 @@ public void testDescribeIndicesByName(@TempDir Path tempDir) throws Exception {
assertTrue(desc.getRowsIndexed() > 0, "rowsIndexed should be positive");
assertNotNull(desc.getMetadata(), "Metadata list should not be null");
assertFalse(desc.getMetadata().isEmpty(), "Metadata list should not be empty");
assertEquals(
desc.getMetadata(), desc.getSegments(), "segments alias should match metadata");
assertNotNull(desc.getDetailsJson(), "Details JSON should not be null");

List<Index> physicalSegments = dataset.getIndexSegments("index1");
assertEquals(1, physicalSegments.size(), "Expected exactly one physical segment");
assertEquals("index1", physicalSegments.get(0).name());

descriptions = dataset.describeIndices();
assertEquals(2, descriptions.size(), "Expected exactly one matching index");
for (IndexDescription indexDesc : descriptions) {
assertTrue(indexDesc.getRowsIndexed() > 0, "rowsIndexed should be positive");
assertNotNull(indexDesc.getMetadata(), "Metadata list should not be null");
assertFalse(indexDesc.getMetadata().isEmpty(), "Metadata list should not be empty");
assertEquals(
indexDesc.getMetadata(),
indexDesc.getSegments(),
"segments alias should match metadata");
assertNotNull(indexDesc.getDetailsJson(), "Details JSON should not be null");
}
}
Expand Down
27 changes: 21 additions & 6 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@

from .commit import CommitLock
from .io import StorageOptionsProvider
from .lance.indices import IndexDescription
from .lance.indices import IndexDescription, IndexSegmentDescription
from .progress import FragmentWriteProgress
from .types import ReaderLike

Expand Down Expand Up @@ -641,12 +641,14 @@ def checkout_latest(self):

def list_indices(self) -> List[Index]:
"""
Returns index information for all indices in the dataset.
Returns physical index segment information for all indices in the dataset.

This method is deprecated as it requires loading the statistics for each index
which can be a very expensive operation. Instead use describe_indices() to
list index information and index_statistics() to get the statistics for
individual indexes of interest.
which can be a very expensive operation. It also exposes physical index
segments directly. Instead use describe_indices() for logical index
descriptions, describe_index_segments() for explicit segment inspection,
and index_statistics() to get the statistics for individual indexes of
interest.
"""
warnings.warn(
"The 'list_indices' method is deprecated. It may be removed in a future "
Expand All @@ -657,9 +659,22 @@ def list_indices(self) -> List[Index]:
return self._ds.load_indices()

def describe_indices(self) -> List[IndexDescription]:
    """
    Returns logical index information aggregated across all segments.

    The call is forwarded unchanged to the underlying dataset handle
    (``self._ds``).
    """
    descriptions = self._ds.describe_indices()
    return descriptions

def describe_index_segments(
self, index_name: Optional[str] = None
) -> List[IndexSegmentDescription]:

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we removed it in Rust, should we also remove it in Python and Java? Or is there a reason to keep it in these bindings?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops, I overlooked them.

"""
Returns physical index segment information.

Parameters
----------
index_name: Optional[str]
If provided, only return segments belonging to the named logical index.
"""
return self._ds.describe_index_segments(index_name)

def index_statistics(self, index_name: str) -> Dict[str, Any]:
warnings.warn(
"LanceDataset.index_statistics() is deprecated, "
Expand Down
2 changes: 2 additions & 0 deletions python/python/lance/indices/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .pq import PqModel

IndexSegment = _lance.indices.IndexSegment
IndexSegmentDescription = _lance.indices.IndexSegmentDescription
IndexSegmentPlan = _lance.indices.IndexSegmentPlan

__all__ = [
Expand All @@ -18,6 +19,7 @@
"IvfModel",
"IndexFileVersion",
"IndexSegment",
"IndexSegmentDescription",
"IndexSegmentPlan",
]

Expand Down
4 changes: 4 additions & 0 deletions python/python/lance/lance/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ from .fragment import (
)
from .indices import IndexDescription as IndexDescription
from .indices import IndexSegment as IndexSegment
from .indices import IndexSegmentDescription as IndexSegmentDescription
from .indices import IndexSegmentPlan as IndexSegmentPlan
from .lance import PySearchFilter
from .optimize import (
Expand Down Expand Up @@ -234,6 +235,9 @@ class _Dataset:
def serialized_manifest(self) -> bytes: ...
def load_indices(self) -> List[Index]: ...
def describe_indices(self) -> List[IndexDescription]: ...
def describe_index_segments(
self, index_name: Optional[str] = None
) -> List[IndexSegmentDescription]: ...
def scanner(
self,
columns: Optional[List[str]] = None,
Expand Down
22 changes: 22 additions & 0 deletions python/python/tests/test_vector_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1644,6 +1644,28 @@ def test_optimize_indices(indexed_dataset):
assert stats["num_indices"] == 2


def test_logical_and_physical_index_views(indexed_dataset):
    """Logical index descriptions and physical segment listings must agree."""
    # Append a second batch and optimize with num_indices_to_merge=0; the
    # assertions below expect the index to end up with two segments.
    appended = lance.write_dataset(create_table(), indexed_dataset.uri, mode="append")
    appended.optimize.optimize_indices(num_indices_to_merge=0)

    # Logical view: a single index that reports both of its segments.
    descriptions = appended.describe_indices()
    assert len(descriptions) == 1
    vector_index = descriptions[0]
    assert vector_index.name == "vector_idx"
    assert len(vector_index.segments) == 2

    # Physical view filtered by name: each segment covers some fragments.
    named_segments = appended.describe_index_segments("vector_idx")
    assert len(named_segments) == 2
    for segment in named_segments:
        assert segment.fragment_ids

    # Physical view without a filter returns the same two segments here.
    assert len(appended.describe_index_segments()) == 2

    # The statistics expose the segment-named keys next to the legacy ones.
    index_stats = appended.stats.index_stats("vector_idx")
    assert index_stats["num_segments"] == index_stats["num_indices"] == 2
    assert index_stats["segments"] == index_stats["indices"]


@pytest.mark.skip(reason="retrain is deprecated")
def test_retrain_indices(indexed_dataset):
data = create_table()
Expand Down
21 changes: 20 additions & 1 deletion python/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,10 @@ use lance_table::io::commit::external_manifest::ExternalManifestCommitHandler;
use crate::error::PythonErrorExt;
use crate::file::object_store_from_uri_or_path;
use crate::fragment::FileFragment;
use crate::indices::{PyIndexConfig, PyIndexDescription, PyIndexSegment, PyIndexSegmentPlan};
use crate::indices::{
PyIndexConfig, PyIndexDescription, PyIndexSegment, PyIndexSegmentDescription,
PyIndexSegmentPlan,
};
use crate::namespace::extract_namespace_arc;
use crate::rt;
use crate::scanner::ScanStatistics;
Expand Down Expand Up @@ -2798,6 +2801,22 @@ impl Dataset {
.collect())
}

/// Describe the physical index segments of this dataset.
///
/// `index_name` is forwarded to the dataset: when given, only segments of
/// that logical index are returned; otherwise all segments are returned.
#[pyo3(signature=(index_name=None))]
fn describe_index_segments(
    &self,
    py: Python<'_>,
    index_name: Option<&str>,
) -> PyResult<Vec<PyIndexSegmentDescription>> {
    let dataset = self.ds.as_ref().clone();
    // Build the future first, then drive it to completion on the shared runtime.
    let lookup = dataset.describe_index_segments(index_name);
    let segments = rt().block_on(Some(py), lookup)?.infer_error()?;
    let described: Vec<_> = segments
        .iter()
        .map(PyIndexSegmentDescription::from_metadata)
        .collect();
    Ok(described)
}

/// Create a delta builder to explore changes between dataset versions.
#[pyo3(signature=())]
fn delta(&self) -> PyResult<DatasetDeltaBuilder> {
Expand Down
35 changes: 19 additions & 16 deletions python/src/indices.rs
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,24 @@ pub struct PyIndexSegmentDescription {
}

impl PyIndexSegmentDescription {
/// Build the Python-facing description of one physical index segment.
///
/// The UUID is stringified; the dataset version, index version, creation
/// time and total size are copied from `segment`. A segment without a
/// fragment bitmap yields an empty `fragment_ids` set.
pub fn from_metadata(segment: &lance_table::format::IndexMetadata) -> Self {
    let fragment_ids = match segment.fragment_bitmap.as_ref() {
        Some(bitmap) => bitmap.iter().collect::<HashSet<_>>(),
        None => HashSet::default(),
    };

    Self {
        uuid: segment.uuid.to_string(),
        dataset_version_at_last_update: segment.dataset_version,
        fragment_ids,
        index_version: segment.index_version,
        created_at: segment.created_at,
        size_bytes: segment.total_size_bytes(),
    }
}

pub fn __repr__(&self) -> String {
format!(
"IndexSegmentDescription(uuid={}, dataset_version_at_last_update={}, fragment_ids={:?}, index_version={}, created_at={:?}, size_bytes={:?})",
Expand Down Expand Up @@ -643,22 +661,7 @@ impl PyIndexDescription {
let segments = index
.metadata()
.iter()
.map(|segment| {
let fragment_ids = segment
.fragment_bitmap
.as_ref()
.map(|bitmap| bitmap.iter().collect::<HashSet<_>>())
.unwrap_or_default();
let size_bytes = segment.total_size_bytes();
PyIndexSegmentDescription {
uuid: segment.uuid.to_string(),
dataset_version_at_last_update: segment.dataset_version,
fragment_ids,
index_version: segment.index_version,
created_at: segment.created_at,
size_bytes,
}
})
.map(PyIndexSegmentDescription::from_metadata)
.collect();

let details = index.details().unwrap_or_else(|_| "{}".to_string());
Expand Down
24 changes: 24 additions & 0 deletions rust/lance-index/src/traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,13 @@ pub trait IndexDescription: Send + Sync {
/// IndexMetadata for each segment of the index.
fn metadata(&self) -> &[IndexMetadata];

/// Returns the physical index segments that make up this logical index.
///
/// This is an alias for [`Self::metadata`] with a less ambiguous name.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

praise: +1 I like explicitly calling it segments.

fn segments(&self) -> &[IndexMetadata] {
self.metadata()
}

/// Returns the index type URL
///
/// This is extracted from the type url of the index details
Expand Down Expand Up @@ -210,6 +217,8 @@ pub trait DatasetIndexExt {
///
/// The indices are lazy loaded and cached in memory within the `Dataset` instance.
/// The cache is invalidated when the dataset version (Manifest) is changed.
///
/// Each returned entry represents a physical index segment from the manifest.
async fn load_indices(&self) -> Result<Arc<Vec<IndexMetadata>>>;

/// Loads all the indices of a given UUID.
Expand Down Expand Up @@ -243,6 +252,21 @@ pub trait DatasetIndexExt {
})
}

/// Describe physical index segments.
///
/// When `name` is provided, only segments belonging to the named logical
/// index are returned. Otherwise, all index segments in the current dataset
/// version are returned.
async fn describe_index_segments(&self, name: Option<&str>) -> Result<Vec<IndexMetadata>> {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question(blocking): Users can already call ds.describe_indices(IndexCriteria::default().for_name(name)).await?.first().map(|idx| idx.segments) to get this. I worry adding a new method clutters our API. Do you think it's worth adding this method? Is this called commonly enough where it's worth making a simplified API?

I say this in part because I think segments are a low-level concept that most users won't care about.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's a good point. Let's remove it from the public API.

match name {
Some(name) => self.load_indices_by_name(name).await,
None => self
.load_indices()
.await
.map(|indices| indices.as_ref().clone()),
}
}

/// Loads a specific index with the given index name.
/// This function only works for indices that are unique.
/// If there are multiple indices sharing the same name, please use [`Self::load_indices_by_name`]
Expand Down
15 changes: 14 additions & 1 deletion rust/lance/src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1134,7 +1134,9 @@ async fn index_statistics_scalar(
"index_type": index_type,
"name": index_name,
"num_indices": num_indices,
"indices": indices_stats,
"num_segments": num_indices,
"indices": indices_stats.clone(),
"segments": indices_stats,
"num_indexed_fragments": num_indexed_fragments,
"num_indexed_rows": num_indexed_rows,
"num_unindexed_fragments": num_unindexed_fragments,
Expand Down Expand Up @@ -2382,8 +2384,13 @@ mod tests {
fn get_bitmap(meta: &IndexMetadata) -> Vec<u32> {
meta.fragment_bitmap.as_ref().unwrap().iter().collect()
}
/// Checks that each segment-named stats key holds the same value as the
/// legacy `indices`-named key it aliases.
fn assert_segment_aliases(stats: &serde_json::Value) {
    for (alias, legacy) in [("num_segments", "num_indices"), ("segments", "indices")] {
        assert_eq!(stats[alias], stats[legacy]);
    }
}

let stats = get_stats(&dataset, "vec_idx").await;
assert_segment_aliases(&stats);
assert_eq!(stats["num_unindexed_rows"], 0);
assert_eq!(stats["num_indexed_rows"], 512);
assert_eq!(stats["num_indexed_fragments"], 1);
Expand All @@ -2396,6 +2403,7 @@ mod tests {
RecordBatchIterator::new(vec![record_batch].into_iter().map(Ok), schema.clone());
dataset.append(reader, None).await.unwrap();
let stats = get_stats(&dataset, "vec_idx").await;
assert_segment_aliases(&stats);
assert_eq!(stats["num_unindexed_rows"], 512);
assert_eq!(stats["num_indexed_rows"], 512);
assert_eq!(stats["num_indexed_fragments"], 1);
Expand All @@ -2410,6 +2418,7 @@ mod tests {
.await
.unwrap();
let stats = get_stats(&dataset, "vec_idx").await;
assert_segment_aliases(&stats);
assert_eq!(stats["num_unindexed_rows"], 512);
assert_eq!(stats["num_indexed_rows"], 512);
assert_eq!(stats["num_indexed_fragments"], 1);
Expand All @@ -2427,6 +2436,7 @@ mod tests {
.await
.unwrap();
let stats = get_stats(&dataset, "vec_idx").await;
assert_segment_aliases(&stats);
assert_eq!(stats["num_unindexed_rows"], 512);
assert_eq!(stats["num_indexed_rows"], 512);
assert_eq!(stats["num_indexed_fragments"], 1);
Expand All @@ -2437,6 +2447,7 @@ mod tests {
assert_eq!(get_bitmap(&meta[0]), vec![0]);

let stats = get_stats(&dataset, "other_vec_idx").await;
assert_segment_aliases(&stats);
assert_eq!(stats["num_unindexed_rows"], 0);
assert_eq!(stats["num_indexed_rows"], 1024);
assert_eq!(stats["num_indexed_fragments"], 2);
Expand All @@ -2453,6 +2464,7 @@ mod tests {
.unwrap();

let stats = get_stats(&dataset, "vec_idx").await;
assert_segment_aliases(&stats);
assert_eq!(stats["num_unindexed_rows"], 0);
assert_eq!(stats["num_indexed_rows"], 1024);
assert_eq!(stats["num_indexed_fragments"], 2);
Expand All @@ -2467,6 +2479,7 @@ mod tests {
.await
.unwrap();
let stats = get_stats(&dataset, "other_vec_idx").await;
assert_segment_aliases(&stats);
assert_eq!(stats["num_unindexed_rows"], 0);
assert_eq!(stats["num_indexed_rows"], 1024);
assert_eq!(stats["num_indexed_fragments"], 2);
Expand Down
Loading