diff --git a/Cargo.toml b/Cargo.toml index 86fc9e89f5d..373b2fd4b3b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -228,7 +228,6 @@ redundant_pub_crate = "deny" string_add_assign = "deny" string_add = "deny" string_lit_as_bytes = "deny" -string_to_string = "deny" use_self = "deny" dbg_macro = "deny" trait_duplication_in_bounds = "deny" diff --git a/python/python/tests/test_ingestion.py b/python/python/tests/test_ingestion.py index 366fbeafacf..9f5ab8b53c2 100644 --- a/python/python/tests/test_ingestion.py +++ b/python/python/tests/test_ingestion.py @@ -14,7 +14,7 @@ def can_write(data, dataset, schema=None): lance.write_dataset(pa.table(data, schema=schema), dataset.uri, mode="append") def cannot_write(data, dataset, schema=None): - with pytest.raises(Exception, match="contained null values"): + with pytest.raises(Exception, match=r"contain(ed|s) null values"): can_write(data, dataset, schema) nullable_dataset = lance.write_dataset( diff --git a/rust/compression/bitpacking/src/lib.rs b/rust/compression/bitpacking/src/lib.rs index 5d12db09cad..0101c4a1df0 100644 --- a/rust/compression/bitpacking/src/lib.rs +++ b/rust/compression/bitpacking/src/lib.rs @@ -15,7 +15,6 @@ use arrayref::{array_mut_ref, array_ref}; use core::mem::size_of; -use paste::paste; pub const FL_ORDER: [usize; 8] = [0, 4, 2, 6, 1, 5, 3, 7]; diff --git a/rust/examples/src/hnsw.rs b/rust/examples/src/hnsw.rs index c990566c16a..0c0f705d42e 100644 --- a/rust/examples/src/hnsw.rs +++ b/rust/examples/src/hnsw.rs @@ -79,15 +79,14 @@ async fn main() { let max_level = 7; // 1. Generate a synthetic test data of specified dimensions - let dataset = if uri.is_none() { - println!("No uri is provided, generating test dataset..."); - let output = "test_vectors.lance"; - create_test_vector_dataset(output, 1000, 64).await; - Dataset::open(output).await.expect("Failed to open dataset") - } else { - Dataset::open(uri.as_ref().unwrap()) - .await - .expect("Failed to open dataset") + let dataset = match uri.as_deref() { + None => { + println!("No uri is provided, generating test dataset..."); + let output = "test_vectors.lance"; + create_test_vector_dataset(output, 1000, 64).await; + Dataset::open(output).await.expect("Failed to open dataset") + } + Some(uri) => Dataset::open(uri).await.expect("Failed to open dataset"), }; println!("Dataset schema: {:#?}", dataset.schema()); diff --git a/rust/lance-core/src/datatypes/field.rs b/rust/lance-core/src/datatypes/field.rs index ca3b75349e7..a091632c524 100644 --- a/rust/lance-core/src/datatypes/field.rs +++ b/rust/lance-core/src/datatypes/field.rs @@ -151,7 +151,7 @@ pub struct Field { pub encoding: Option, pub nullable: bool, - pub children: Vec, + pub children: Vec, /// Dictionary value array if this field is dictionary. pub dictionary: Option, diff --git a/rust/lance-core/src/datatypes/schema.rs b/rust/lance-core/src/datatypes/schema.rs index 66dc3a91a0a..3e8340bd19b 100644 --- a/rust/lance-core/src/datatypes/schema.rs +++ b/rust/lance-core/src/datatypes/schema.rs @@ -1533,7 +1533,7 @@ pub fn parse_field_path(path: &str) -> Result> { /// For example: ["parent", "child.with.dot"] formats to "parent.`child.with.dot`" /// For example: ["meta-data", "user-id"] formats to "`meta-data`.`user-id`" /// Backticks in field names are escaped by doubling them. -/// For example: ["field`with`backticks"] formats to "`field``with``backticks`" +/// For example: \["field`with`backticks"\] formats to "`field``with``backticks`" pub fn format_field_path(fields: &[&str]) -> String { fields .iter() diff --git a/rust/lance-core/src/utils/deletion.rs b/rust/lance-core/src/utils/deletion.rs index 03c41e5f09f..be7934b918c 100644 --- a/rust/lance-core/src/utils/deletion.rs +++ b/rust/lance-core/src/utils/deletion.rs @@ -12,8 +12,9 @@ const BITMAP_THRESDHOLD: usize = 5_000; // TODO: Benchmark to find a better value. /// Represents a set of deleted row offsets in a single fragment. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub enum DeletionVector { + #[default] NoDeletions, Set(HashSet), Bitmap(RoaringBitmap), @@ -182,12 +183,6 @@ impl OffsetMapper { } } -impl Default for DeletionVector { - fn default() -> Self { - Self::NoDeletions - } -} - impl From<&DeletionVector> for RoaringBitmap { fn from(value: &DeletionVector) -> Self { match value { diff --git a/rust/lance-datafusion/src/substrait.rs b/rust/lance-datafusion/src/substrait.rs index e98300a417e..54dc9be8808 100644 --- a/rust/lance-datafusion/src/substrait.rs +++ b/rust/lance-datafusion/src/substrait.rs @@ -82,11 +82,17 @@ fn remove_extension_types( for (substrait_field, arrow_field) in fields.types.iter().zip(arrow_schema.fields.iter()) { let num_fields = count_fields(substrait_field); + let kind = substrait_field.kind.as_ref().unwrap(); + let is_user_defined = match kind { + Kind::UserDefined(_) => true, + // Keep compatibility with older Substrait plans. + #[allow(deprecated)] + Kind::UserDefinedTypeReference(_) => true, + _ => false, + }; + if !substrait_schema.names[field_index].starts_with("__unlikely_name_placeholder") - && !matches!( - substrait_field.kind.as_ref().unwrap(), - Kind::UserDefined(_) | Kind::UserDefinedTypeReference(_) - ) + && !is_user_defined { kept_substrait_fields.push(substrait_field.clone()); kept_arrow_fields.push(arrow_field.clone()); @@ -118,10 +124,10 @@ fn remove_extension_types( fn remap_expr_references(expr: &mut Expression, mapping: &HashMap) -> Result<()> { match expr.rex_type.as_mut().unwrap() { // Simple, no field references possible - RexType::Literal(_) - | RexType::Nested(_) - | RexType::Enum(_) - | RexType::DynamicParameter(_) => Ok(()), + RexType::Literal(_) | RexType::Nested(_) | RexType::DynamicParameter(_) => Ok(()), + // Enum literals are deprecated in Substrait and should only appear in older plans. + #[allow(deprecated)] + RexType::Enum(_) => Ok(()), // Complex operators not supported in filters RexType::WindowFunction(_) | RexType::Subquery(_) => Err(Error::invalid_input( "Window functions or subqueries not allowed in filter expression", diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs index f4fd2ae32f1..5fe7cb0d14b 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ b/rust/lance-index/src/scalar/btree.rs @@ -1755,19 +1755,21 @@ pub async fn train_btree_index( Field::new(BTREE_IDS_COLUMN, DataType::UInt64, false), ])); - let mut sub_index_file; - if partition_id.is_none() { - sub_index_file = index_store - .new_index_file(BTREE_PAGES_NAME, flat_schema.clone()) - .await?; - } else { - sub_index_file = index_store - .new_index_file( - part_page_data_file_path(partition_id.unwrap()).as_str(), - flat_schema.clone(), - ) - .await?; - } + let mut sub_index_file = match partition_id { + None => { + index_store + .new_index_file(BTREE_PAGES_NAME, flat_schema.clone()) + .await? + } + Some(partition_id) => { + index_store + .new_index_file( + part_page_data_file_path(partition_id).as_str(), + flat_schema.clone(), + ) + .await? + } + }; let mut encoded_batches = Vec::new(); let mut batch_idx = 0; @@ -1802,19 +1804,21 @@ pub async fn train_btree_index( RANGE_PARTITIONED_META_KEY.to_string(), range_id.is_some().to_string(), ); - let mut btree_index_file; - if partition_id.is_none() { - btree_index_file = index_store - .new_index_file(BTREE_LOOKUP_NAME, Arc::new(file_schema)) - .await?; - } else { - btree_index_file = index_store - .new_index_file( - part_lookup_file_path(partition_id.unwrap()).as_str(), - Arc::new(file_schema), - ) - .await?; - } + let mut btree_index_file = match partition_id { + None => { + index_store + .new_index_file(BTREE_LOOKUP_NAME, Arc::new(file_schema)) + .await? + } + Some(partition_id) => { + index_store + .new_index_file( + part_lookup_file_path(partition_id).as_str(), + Arc::new(file_schema), + ) + .await? + } + }; btree_index_file.write_record_batch(record_batch).await?; btree_index_file.finish().await?; Ok(()) diff --git a/rust/lance-index/src/scalar/expression.rs b/rust/lance-index/src/scalar/expression.rs index edf9b055721..a1ea6fe84ff 100644 --- a/rust/lance-index/src/scalar/expression.rs +++ b/rust/lance-index/src/scalar/expression.rs @@ -965,9 +965,9 @@ impl PartialEq for ScalarIndexSearch { /// modify the results of scalar lookups #[derive(Debug, Clone)] pub enum ScalarIndexExpr { - Not(Box), - And(Box, Box), - Or(Box, Box), + Not(Box), + And(Box, Box), + Or(Box, Box), Query(ScalarIndexSearch), } diff --git a/rust/lance-index/src/scalar/inverted/query.rs b/rust/lance-index/src/scalar/inverted/query.rs index 5ad00aade30..a54b8d24b3c 100644 --- a/rust/lance-index/src/scalar/inverted/query.rs +++ b/rust/lance-index/src/scalar/inverted/query.rs @@ -71,18 +71,13 @@ impl Default for FtsSearchParams { } } -#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Default)] pub enum Operator { And, + #[default] Or, } -impl Default for Operator { - fn default() -> Self { - Self::Or - } -} - impl TryFrom<&str> for Operator { type Error = Error; fn try_from(value: &str) -> Result { diff --git a/rust/lance-io/src/lib.rs b/rust/lance-io/src/lib.rs index 5d4f8cd4d1d..f383278fc0a 100644 --- a/rust/lance-io/src/lib.rs +++ b/rust/lance-io/src/lib.rs @@ -27,13 +27,14 @@ pub mod utils; pub use scheduler::{bytes_read_counter, iops_counter}; /// Defines a selection of rows to read from a file/batch -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Default)] pub enum ReadBatchParams { /// Select a contiguous range of rows Range(Range), /// Select multiple contiguous ranges of rows Ranges(Arc<[Range]>), /// Select all rows (this is the default) + #[default] RangeFull, /// Select all rows up to a given index RangeTo(RangeTo), @@ -77,13 +78,6 @@ impl std::fmt::Display for ReadBatchParams { } } -impl Default for ReadBatchParams { - fn default() -> Self { - // Default of ReadBatchParams is reading the full batch. - Self::RangeFull - } -} - impl From<&[u32]> for ReadBatchParams { fn from(value: &[u32]) -> Self { Self::Indices(UInt32Array::from_iter_values(value.iter().copied())) diff --git a/rust/lance/src/dataset/write/merge_insert/inserted_rows.rs b/rust/lance/src/dataset/write/merge_insert/inserted_rows.rs index db81fd6f831..8b9073fefc3 100644 --- a/rust/lance/src/dataset/write/merge_insert/inserted_rows.rs +++ b/rust/lance/src/dataset/write/merge_insert/inserted_rows.rs @@ -27,7 +27,7 @@ pub enum KeyValue { Int64(i64), UInt64(u64), Binary(Vec), - Composite(Vec), + Composite(Vec), } impl KeyValue { diff --git a/rust/lance/src/index/scalar.rs b/rust/lance/src/index/scalar.rs index a308f6c9321..11a2c22b67a 100644 --- a/rust/lance/src/index/scalar.rs +++ b/rust/lance/src/index/scalar.rs @@ -272,18 +272,19 @@ pub(super) async fn build_scalar_index( let training_request = plugin.new_training_request(params.params.as_deref().unwrap_or("{}"), &field)?; - let training_data = if preprocessed_data.is_none() { - load_training_data( - dataset, - column, - training_request.criteria(), - None, - train, - fragment_ids.clone(), - ) - .await? - } else { - preprocessed_data.unwrap() + let training_data = match preprocessed_data { + Some(preprocessed_data) => preprocessed_data, + None => { + load_training_data( + dataset, + column, + training_request.criteria(), + None, + train, + fragment_ids.clone(), + ) + .await? + } }; plugin diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index 3f7d5f10a2a..6226f13d357 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -1231,8 +1231,7 @@ pub async fn build_ivf_model( ) -> Result { let num_partitions = params.num_partitions.unwrap(); let centroids = params.centroids.clone(); - if centroids.is_some() && !params.retrain { - let centroids = centroids.unwrap(); + if let (Some(centroids), false) = (centroids.as_deref(), params.retrain) { info!("Pre-computed IVF centroids is provided, skip IVF training"); if centroids.values().len() != num_partitions * dim { return Err(Error::Index { @@ -1244,7 +1243,7 @@ pub async fn build_ivf_model( location: location!(), }); } - return Ok(IvfModel::new(centroids.as_ref().clone(), None)); + return Ok(IvfModel::new(centroids.clone(), None)); } let sample_size_hint = num_partitions * params.sample_rate; diff --git a/rust/lance/src/io/exec/projection.rs b/rust/lance/src/io/exec/projection.rs index 8d770cb8ce4..8a74577dd49 100644 --- a/rust/lance/src/io/exec/projection.rs +++ b/rust/lance/src/io/exec/projection.rs @@ -152,7 +152,7 @@ pub enum Selection<'a> { /// Selects this fields and all subfields FullField(&'a str), /// For a struct, selections of subfields - StructProjection(&'a str, Vec>), + StructProjection(&'a str, Vec), } impl Selection<'_> {