Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 66 additions & 9 deletions protos/format.proto
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ message Manifest {
//
// Known flags:
// * 1: deletion files are present
// * 2: list columns have child fields
uint64 reader_feature_flags = 9;

// Feature flags for writers.
Expand Down Expand Up @@ -198,7 +199,9 @@ message DataFile {
// with one file of N columns, the field ids will be 1, 2, ..., N. If a second
// fragment is created with M columns, the field ids will be N+1, N+2, ..., N+M.
//
// These ids must be sorted and contiguous.
// These ids must be sorted.
//
//
repeated int32 fields = 2;
} // DataFile

Expand Down Expand Up @@ -257,6 +260,15 @@ message Metadata {
// This includes struct columns, which will have a run of length 0 since
// they don't store any actual data.
//
// For FixedSizeList columns, there are no buffers at the list-level, only at
// the child level. However, how this is handled depends on the presence of
// the feature flag 2 (fields in fixed size list). If that flag is missing, then
// the page table will use the list-level field id for value pages in the page
// table, and the field id of the value field may be missing. If the flag is
// present, then the entries for the list-level field id will be zero length
// (just like struct columns), and the value pages will be stored under the
// child field id.
//
// For example, for the column 5 and batch 4, we have:
// ```text
// position = page_table[5][4][0];
Expand Down Expand Up @@ -322,15 +334,24 @@ message Dictionary {
}

// Field metadata for a column.
//
//
message Field {
// TODO: why do we even have these if we have the type field?
// Why is default value PARENT?
enum Type {
// A struct column. Will have one or more children pointing to it.
PARENT = 0;
// A list column. May have one child pointing to it.
REPEATED = 1;
// A leaf column. Will have no children pointing to it.
LEAF = 2;
}
Type type = 1;

// Fully qualified name.
//
// TODO: What does this mean?? Give an example.
string name = 2;
/// Field Id.
///
Expand All @@ -339,21 +360,38 @@ message Field {
/// Parent Field ID. If not set, this is a top-level column.
int32 parent_id = 4;

// Logical types, support parameterized Arrow Type.
// Logical types.
//
// Some of these are encoded (like dictionary), to indicate what Arrow data type
// the column should be output as.
//
// This field may be missing on older tables, which will instead use the
// legacy_logical_type field. The feature flag (2) indicates that this field
// should be used instead of the legacy_logical_type field.
//
// The valid logical types depend on the type of field.
//
// PARENT types will always have logical type "struct".
//
// REPEATED types may have logical types:
// * "list"
// * "large_list"
// * "list.struct"
// * "large_list.struct"
// The final two are used if the list values are structs, and therefore the
// field is both implicitly REPEATED and PARENT.
// REPEATED types will always have logical type "list", "large_list", or
// "fixed_size_list", corresponding to the Arrow data types. Some older
// versions may use the logical type "list.struct" or "large_list.struct" to
// indicate that the list values are structs, but these fields can be safely
// rewritten simply as "list" or "large_list".
//
// The "fixed_size_list" logical type has two variants: a legacy variant with
// two parameters in the format "fixed_size_list:{value_type}:{list_size}",
// and a new variant with one parameter in the format "fixed_size_list:{list_size}".
// To enable compatibility with older versions, writers should use the legacy
// variant when it can be used. In some cases, like when the list values are
// an extension type or not a primitive type, the legacy variant cannot be
// used, and the new variant must be used instead. When the new variant is
// used, the feature flag (2) must be set.
//
// LEAF types may have logical types:
// * "null"
// * "bool"
// * "bool"
// * "int8" / "uint8"
// * "int16" / "uint16"
// * "int32" / "uint32"
Expand All @@ -366,7 +404,26 @@ message Field {
// * "decimal:128:{precision}:{scale}" / "decimal:256:{precision}:{scale}"
// * "time:{unit}" / "timestamp:{unit}" / "duration:{unit}", where unit is "s", "ms", "us", "ns"
// * "dict:{value_type}:{index_type}:false"
// string logical_type = 11;

// Legacy logical type field. Use the logical_type field instead.
//
// This field is the same as the logical_type field, except for the handling
// of REPEATED fields. Whereas repeated types have child fields in the new scheme,
// in this field the child types of repeated fields are encoded in the logical
// type of the repeated field itself.
//
// The legacy REPEATED types encode the value type in the logical type. This
// can be:
// * "list:{value_type}"
// * "large_list:{value_type}"
// * "fixed_size_list:{value_type}:{list_size}"
// * "list.struct"
// * "large_list.struct"
// The final two are used if the list values are structs, and therefore the
// field is both implicitly REPEATED and PARENT.
string logical_type = 5;

// If this field is nullable.
bool nullable = 6;

Expand Down
21 changes: 21 additions & 0 deletions python/python/tests/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,27 @@ def test_bf16_fixed_size_list_cast():
assert casted == fsl


# Round-trip test: a table holding bfloat16 data (plain, fixed-size-list and
# fixed-shape-tensor columns) is written with lance.write_dataset and must
# read back with an identical schema and identical contents.
def test_bf16_roundtrip(tmp_path: Path):
import numpy as np
from ml_dtypes import bfloat16

# Nine random numbers cast to bfloat16 and wrapped in a BFloat16Array.
values = BFloat16Array.from_numpy(np.random.random(9).astype(bfloat16))
# Group the values into fixed-size lists of length 3.
vectors = pa.FixedSizeListArray.from_arrays(values, 3)
# Wrap the same lists as a fixed-shape tensor extension array of shape [3].
tensors = pa.ExtensionArray.from_storage(
pa.fixed_shape_tensor(values.type, [3]), vectors
)
data = pa.table(
{
# Only the first three values are used here, presumably so this column
# has the same length as the list-based columns.
"values": values.slice(0, 3),
"vector": vectors,
"tensors": tensors,
}
)
# Write to the temporary directory, then compare schema and data with the input.
ds = lance.write_dataset(data, tmp_path)
assert ds.schema == data.schema
assert ds.to_table() == data


def test_roundtrip_take_ext_types(tmp_path: Path):
tensor_type = pa.fixed_shape_tensor(pa.float32(), [2, 3])
inner = pa.array([float(x) for x in range(0, 18)], pa.float32())
Expand Down
1 change: 1 addition & 0 deletions rust/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ tokio = { version = "1.23", features = [
tracing = "0.1"
url = "2.3"
uuid = { version = "1.2", features = ["v4", "serde"] }
pretty_assertions = "1.4.0"

[profile.bench]
opt-level = 3
Expand Down
4 changes: 4 additions & 0 deletions rust/lance-arrow/src/bfloat16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@ impl BFloat16Array {
let binary_value = self.inner.value_unchecked(i);
bf16::from_bits(u16::from_le_bytes([binary_value[0], binary_value[1]]))
}

pub fn into_inner(self) -> FixedSizeBinaryArray {
self.inner
}
}

impl<'a> ArrayAccessor for &'a BFloat16Array {
Expand Down
1 change: 1 addition & 0 deletions rust/lance-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ rand.workspace = true
tempfile.workspace = true
lance-testing.workspace = true
parquet.workspace = true
pretty_assertions.workspace = true
proptest = "1.3.1"

[build-dependencies]
Expand Down
53 changes: 21 additions & 32 deletions rust/lance-core/src/datatypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ use std::fmt::{self, Debug, Formatter};
use std::sync::Arc;

use arrow_array::ArrayRef;
use arrow_schema::{DataType, Field as ArrowField, TimeUnit};
use arrow_schema::{DataType, TimeUnit};
use snafu::{location, Location};

mod field;
Expand Down Expand Up @@ -52,6 +52,23 @@ impl LogicalType {
fn is_struct(&self) -> bool {
    // True only when the logical type string is exactly "struct".
    matches!(self.0.as_str(), "struct")
}

/// Returns `true` when the logical type string begins with the
/// `"fixed_size_list:"` prefix, i.e. it describes a FixedSizeList.
fn is_fsl(&self) -> bool {
    self.0.strip_prefix("fixed_size_list:").is_some()
}

/// Returns the list size when the logical type is a FixedSizeList.
///
/// Two encodings are accepted: `"fixed_size_list:{size}"` and the legacy
/// `"fixed_size_list:{child_type}:{size}"`. In both, the size is the text
/// after the last `':'`.
///
/// Returns `None` when the type does not start with `"fixed_size_list:"`
/// or when the trailing segment does not parse as an `i32`.
fn fsl_size(&self) -> Option<i32> {
    let params = self.0.strip_prefix("fixed_size_list:")?;
    // The last ':'-separated segment is the size in either encoding.
    let size_text = params.rsplit(':').next()?;
    size_text.parse::<i32>().ok()
}
}

impl From<&str> for LogicalType {
Expand Down Expand Up @@ -128,19 +145,9 @@ impl TryFrom<&DataType> for LogicalType {
false
)
}
DataType::List(elem) => match elem.data_type() {
DataType::Struct(_) => "list.struct".to_string(),
_ => "list".to_string(),
},
DataType::LargeList(elem) => match elem.data_type() {
DataType::Struct(_) => "large_list.struct".to_string(),
_ => "large_list".to_string(),
},
DataType::FixedSizeList(dt, len) => format!(
"fixed_size_list:{}:{}",
Self::try_from(dt.data_type())?.0,
*len
),
DataType::List(_) => "list".to_string(),
DataType::LargeList(_) => "large_list".to_string(),
DataType::FixedSizeList(_dt, len) => format!("fixed_size_list:{}", *len),
DataType::FixedSizeBinary(len) => format!("fixed_size_binary:{}", *len),
_ => {
return Err(Error::Schema {
Expand Down Expand Up @@ -193,24 +200,6 @@ impl TryFrom<&LogicalType> for DataType {
} else {
let splits = lt.0.split(':').collect::<Vec<_>>();
match splits[0] {
"fixed_size_list" => {
if splits.len() != 3 {
Err(Error::Schema {
message: format!("Unsupported logical type: {}", lt),
location: location!(),
})
} else {
let elem_type = (&LogicalType(splits[1].to_string())).try_into()?;
let size: i32 = splits[2].parse::<i32>().map_err(|e: _| Error::Schema {
message: e.to_string(),
location: location!(),
})?;
Ok(FixedSizeList(
Arc::new(ArrowField::new("item", elem_type, true)),
size,
))
}
}
"fixed_size_binary" => {
if splits.len() != 2 {
Err(Error::Schema {
Expand Down
Loading