Skip to content
Merged
79 changes: 29 additions & 50 deletions rust/lance-core/src/datatypes/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use arrow_schema::{DataType, Field as ArrowField};
use deepsize::DeepSizeOf;
use lance_arrow::{
json::{is_arrow_json_field, is_json_field},
DataTypeExt, ARROW_EXT_META_KEY, ARROW_EXT_NAME_KEY, BLOB_META_KEY, BLOB_V2_EXT_NAME,
DataTypeExt, ARROW_EXT_NAME_KEY, BLOB_META_KEY, BLOB_V2_EXT_NAME,
};
use snafu::location;

Expand Down Expand Up @@ -516,6 +516,23 @@ impl Field {
.unwrap_or(false)
}

/// If the field is a blob, return a new field with the same name and id
Comment thread
Xuanwo marked this conversation as resolved.
Outdated
/// but with the data type set to a struct of the blob description fields.
///
/// If the field is not a blob, return the field itself.
pub fn unloaded_mut(&mut self) -> &mut Self {
if self.is_blob_v2() {
self.logical_type = BLOB_V2_DESC_LANCE_FIELD.logical_type.clone();
self.children = BLOB_V2_DESC_LANCE_FIELD.children.clone();
self.metadata = BLOB_V2_DESC_LANCE_FIELD.metadata.clone();
} else if self.is_blob() {
self.logical_type = BLOB_DESC_LANCE_FIELD.logical_type.clone();
self.children = BLOB_DESC_LANCE_FIELD.children.clone();
self.metadata = BLOB_DESC_LANCE_FIELD.metadata.clone();
}
self
}

/// If the field is a blob, return a new field with the same name and id
/// but with the data type set to a struct of the blob description fields.
///
Expand Down Expand Up @@ -744,6 +761,13 @@ impl Field {
(&self_type, &other_type),
(DataType::Struct(_), DataType::Struct(_)) | (DataType::List(_), DataType::List(_))
) {
// Blob v2 uses a struct logical type for descriptors, which differs from the logical
// input struct (data/uri). When intersecting schemas for projection we want to keep
// the projected blob layout instead of intersecting by child names.
if ignore_types && self.is_blob() && other.is_blob() {
Comment thread
Xuanwo marked this conversation as resolved.
Outdated
return Ok(self.clone());
}

let children = self
.children
.iter()
Expand Down Expand Up @@ -1007,15 +1031,15 @@ impl TryFrom<&ArrowField> for Field {

if is_blob_v2 {
metadata
.entry(BLOB_META_KEY.to_string())
.or_insert_with(|| "true".to_string());
.entry(ARROW_EXT_NAME_KEY.to_string())
.or_insert_with(|| BLOB_V2_EXT_NAME.to_string());
}

// Check for JSON extension types (both Arrow and Lance)
let logical_type = if is_arrow_json_field(field) || is_json_field(field) {
LogicalType::from("json")
} else if is_blob_v2 {
LogicalType::from(super::BLOB_LOGICAL_TYPE)
LogicalType::from("struct")
} else {
LogicalType::try_from(field.data_type())?
};
Expand Down Expand Up @@ -1056,11 +1080,6 @@ impl From<&Field> for ArrowField {
let mut metadata = field.metadata.clone();

if field.logical_type.is_blob() {
metadata.insert(
ARROW_EXT_NAME_KEY.to_string(),
lance_arrow::BLOB_V2_EXT_NAME.to_string(),
);
metadata.entry(ARROW_EXT_META_KEY.to_string()).or_default();
metadata
.entry(BLOB_META_KEY.to_string())
.or_insert_with(|| "true".to_string());
Expand All @@ -1084,7 +1103,7 @@ mod tests {

use arrow_array::{DictionaryArray, StringArray, UInt32Array};
use arrow_schema::{Fields, TimeUnit};
use lance_arrow::{ARROW_EXT_META_KEY, ARROW_EXT_NAME_KEY, BLOB_META_KEY, BLOB_V2_EXT_NAME};
use lance_arrow::BLOB_META_KEY;
use std::collections::HashMap;
#[test]
fn arrow_field_to_field() {
Expand Down Expand Up @@ -1569,44 +1588,4 @@ mod tests {
assert_eq!(unloaded.children.len(), 5);
assert_eq!(unloaded.logical_type, BLOB_V2_DESC_LANCE_FIELD.logical_type);
}

#[test]
fn blob_v2_detection_by_extension() {
let metadata = HashMap::from([
(ARROW_EXT_NAME_KEY.to_string(), BLOB_V2_EXT_NAME.to_string()),
(BLOB_META_KEY.to_string(), "true".to_string()),
]);
let field: Field = ArrowField::new("blob", DataType::LargeBinary, true)
.with_metadata(metadata)
.try_into()
.unwrap();
assert!(field.is_blob_v2());
}

#[test]
fn blob_extension_roundtrip() {
let metadata = HashMap::from([
(ARROW_EXT_NAME_KEY.to_string(), BLOB_V2_EXT_NAME.to_string()),
(ARROW_EXT_META_KEY.to_string(), "".to_string()),
]);
let arrow_field =
ArrowField::new("blob", DataType::LargeBinary, true).with_metadata(metadata);
let field = Field::try_from(&arrow_field).unwrap();
assert_eq!(
field.logical_type,
LogicalType::from(crate::datatypes::BLOB_LOGICAL_TYPE)
);
assert!(field.is_blob());
assert_eq!(field.data_type(), DataType::LargeBinary);

let roundtrip: ArrowField = ArrowField::from(&field);
assert_eq!(
roundtrip.metadata().get(ARROW_EXT_NAME_KEY),
Some(&BLOB_V2_EXT_NAME.to_string())
);
assert_eq!(
roundtrip.metadata().get(BLOB_META_KEY),
Some(&"true".to_string())
);
}
}
4 changes: 3 additions & 1 deletion rust/lance-core/src/datatypes/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -928,7 +928,9 @@ pub enum BlobHandling {

impl BlobHandling {
fn should_unload(&self, field: &Field) -> bool {
if !field.data_type().is_binary_like() {
// Blob v2 columns are Structs, so we need to treat any blob-marked field as unloadable
// even if the physical data type is not binary-like.
if !(field.data_type().is_binary_like() || field.is_blob()) {
return false;
}
match self {
Expand Down
5 changes: 3 additions & 2 deletions rust/lance-file/src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -251,10 +251,11 @@ impl ReaderProjection {
field_id_to_column_index,
&mut column_indices,
)?;
Ok(Self {
let projection = Self {
schema: Arc::new(schema.clone()),
column_indices,
})
};
Ok(projection)
}

/// Creates a projection that reads the entire file
Expand Down
9 changes: 9 additions & 0 deletions rust/lance-file/src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,15 @@ impl FileWriter {
async fn write_global_buffers(&mut self) -> Result<Vec<(u64, u64)>> {
let schema = self.schema.as_mut().ok_or(Error::invalid_input("No schema provided on writer open and no data provided. Schema is unknown and file cannot be created", location!()))?;
schema.metadata = std::mem::take(&mut self.schema_metadata);
// Use descriptor layout for blob v2 in the footer to avoid exposing logical child fields.
//
// TODO(xuanwo): this doesn't work on nested struct, need better solution like fields_per_order_mut?
schema.fields.iter_mut().for_each(|f| {
if f.is_blob_v2() {
let _ = f.unloaded_mut();
}
Comment thread
Xuanwo marked this conversation as resolved.
});

let file_descriptor = Self::make_file_descriptor(schema, self.rows_written)?;
let file_descriptor_bytes = file_descriptor.encode_to_vec();
let file_descriptor_len = file_descriptor_bytes.len() as u64;
Expand Down
Loading