Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 69 additions & 21 deletions parquet-variant-compute/src/arrow_to_variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,10 @@
// specific language governing permissions and limitations
// under the License.

use std::collections::HashMap;

use crate::type_conversion::{decimal_to_variant_decimal, CastOptions};
use arrow::array::{
Array, AsArray, GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray,
Array, AsArray, GenericBinaryArray, GenericListArray, GenericListViewArray, GenericStringArray,
OffsetSizeTrait, PrimitiveArray,
};
use arrow::compute::kernels::cast;
use arrow::datatypes::{
Expand All @@ -36,6 +35,8 @@ use parquet_variant::{
ObjectFieldBuilder, Variant, VariantBuilderExt, VariantDecimal16, VariantDecimal4,
VariantDecimal8,
};
use std::collections::HashMap;
use std::ops::Range;

// ============================================================================
// Row-oriented builders for efficient Arrow-to-Variant conversion
Expand Down Expand Up @@ -77,8 +78,10 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> {
Utf8(StringArrowToVariantBuilder<'a, i32>),
LargeUtf8(StringArrowToVariantBuilder<'a, i64>),
Utf8View(StringViewArrowToVariantBuilder<'a>),
List(ListArrowToVariantBuilder<'a, i32>),
LargeList(ListArrowToVariantBuilder<'a, i64>),
List(ListArrowToVariantBuilder<'a, GenericListArray<i32>>),
LargeList(ListArrowToVariantBuilder<'a, GenericListArray<i64>>),
ListView(ListArrowToVariantBuilder<'a, GenericListViewArray<i32>>),
LargeListView(ListArrowToVariantBuilder<'a, GenericListViewArray<i64>>),
Struct(StructArrowToVariantBuilder<'a>),
Map(MapArrowToVariantBuilder<'a>),
Union(UnionArrowToVariantBuilder<'a>),
Expand Down Expand Up @@ -133,6 +136,8 @@ impl<'a> ArrowToVariantRowBuilder<'a> {
Utf8View(b) => b.append_row(builder, index),
List(b) => b.append_row(builder, index),
LargeList(b) => b.append_row(builder, index),
ListView(b) => b.append_row(builder, index),
LargeListView(b) => b.append_row(builder, index),
Struct(b) => b.append_row(builder, index),
Map(b) => b.append_row(builder, index),
Union(b) => b.append_row(builder, index),
Expand Down Expand Up @@ -238,8 +243,18 @@ pub(crate) fn make_arrow_to_variant_row_builder<'a>(
DataType::Utf8 => Utf8(StringArrowToVariantBuilder::new(array)),
DataType::LargeUtf8 => LargeUtf8(StringArrowToVariantBuilder::new(array)),
DataType::Utf8View => Utf8View(StringViewArrowToVariantBuilder::new(array)),
DataType::List(_) => List(ListArrowToVariantBuilder::new(array, options)?),
DataType::LargeList(_) => LargeList(ListArrowToVariantBuilder::new(array, options)?),
DataType::List(_) => List(ListArrowToVariantBuilder::new(array.as_list(), options)?),
DataType::LargeList(_) => {
LargeList(ListArrowToVariantBuilder::new(array.as_list(), options)?)
}
DataType::ListView(_) => ListView(ListArrowToVariantBuilder::new(
array.as_list_view(),
options,
)?),
DataType::LargeListView(_) => LargeListView(ListArrowToVariantBuilder::new(
array.as_list_view(),
options,
)?),
DataType::Struct(_) => Struct(StructArrowToVariantBuilder::new(
array.as_struct(),
options,
Expand Down Expand Up @@ -425,7 +440,7 @@ define_row_builder!(
options: &'a CastOptions,
has_time_zone: bool,
},
|array| -> arrow::array::PrimitiveArray<T> { array.as_primitive() },
|array| -> PrimitiveArray<T> { array.as_primitive() },
|value| -> Option<_> {
// Convert using Arrow's temporal conversion functions
as_datetime::<T>(value).map(|naive_datetime| {
Expand Down Expand Up @@ -508,21 +523,20 @@ impl NullArrowToVariantBuilder {
}
}

/// Generic list builder for List and LargeList types
pub(crate) struct ListArrowToVariantBuilder<'a, O: OffsetSizeTrait> {
list_array: &'a arrow::array::GenericListArray<O>,
/// Generic list builder for List, LargeList, ListView, and LargeListView types
pub(crate) struct ListArrowToVariantBuilder<'a, L: ListLikeArray> {
list_array: &'a L,
values_builder: Box<ArrowToVariantRowBuilder<'a>>,
}

impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> {
pub(crate) fn new(array: &'a dyn Array, options: &'a CastOptions) -> Result<Self, ArrowError> {
let list_array = array.as_list();
let values = list_array.values();
impl<'a, L: ListLikeArray> ListArrowToVariantBuilder<'a, L> {
pub(crate) fn new(array: &'a L, options: &'a CastOptions) -> Result<Self, ArrowError> {
let values = array.values();
let values_builder =
make_arrow_to_variant_row_builder(values.data_type(), values.as_ref(), options)?;
make_arrow_to_variant_row_builder(values.data_type(), values, options)?;

Ok(Self {
list_array,
list_array: array,
values_builder: Box::new(values_builder),
})
}
Expand All @@ -537,12 +551,10 @@ impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> {
return Ok(());
}

let offsets = self.list_array.offsets();
let start = offsets[index].as_usize();
let end = offsets[index + 1].as_usize();
let range = self.list_array.element_range(index);

let mut list_builder = builder.try_new_list()?;
for value_index in start..end {
for value_index in range {
self.values_builder
.append_row(&mut list_builder, value_index)?;
}
Expand All @@ -551,6 +563,42 @@ impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> {
}
}

/// Trait for list-like arrays that can provide element ranges
pub(crate) trait ListLikeArray: Array {
/// Get the values array
fn values(&self) -> &dyn Array;

/// Get the start and end indices for a list element
fn element_range(&self, index: usize) -> Range<usize>;
}

impl<O: OffsetSizeTrait> ListLikeArray for GenericListArray<O> {
fn values(&self) -> &dyn Array {
self.values()
}

fn element_range(&self, index: usize) -> Range<usize> {
let offsets = self.offsets();
let start = offsets[index].as_usize();
let end = offsets[index + 1].as_usize();
start..end
}
}

impl<O: OffsetSizeTrait> ListLikeArray for GenericListViewArray<O> {
fn values(&self) -> &dyn Array {
self.values()
}

fn element_range(&self, index: usize) -> Range<usize> {
let offsets = self.value_offsets();
let sizes = self.value_sizes();
let offset = offsets[index].as_usize();
let size = sizes[index].as_usize();
offset..(offset + size)
}
}

/// Struct builder for StructArray
pub(crate) struct StructArrowToVariantBuilder<'a> {
struct_array: &'a arrow::array::StructArray,
Expand Down
161 changes: 155 additions & 6 deletions parquet-variant-compute/src/cast_to_variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,11 @@ mod tests {
FixedSizeBinaryBuilder, Float16Array, Float32Array, Float64Array, GenericByteBuilder,
GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array,
IntervalDayTimeArray, IntervalMonthDayNanoArray, IntervalYearMonthArray, LargeListArray,
LargeStringArray, ListArray, MapArray, NullArray, StringArray, StringRunBuilder,
StringViewArray, StructArray, Time32MillisecondArray, Time32SecondArray,
Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray,
TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array,
UInt32Array, UInt64Array, UInt8Array, UnionArray,
LargeListViewBuilder, LargeStringArray, ListArray, ListViewBuilder, MapArray, NullArray,
StringArray, StringRunBuilder, StringViewArray, StructArray, Time32MillisecondArray,
Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, UnionArray,
};
use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
use arrow::datatypes::{
Expand All @@ -112,7 +112,8 @@ mod tests {
use chrono::{DateTime, NaiveDate, NaiveTime};
use half::f16;
use parquet_variant::{
Variant, VariantBuilder, VariantDecimal16, VariantDecimal4, VariantDecimal8,
Variant, VariantBuilder, VariantBuilderExt, VariantDecimal16, VariantDecimal4,
VariantDecimal8,
};
use std::{sync::Arc, vec};

Expand Down Expand Up @@ -1258,6 +1259,154 @@ mod tests {
);
}

#[test]
fn test_cast_to_variant_list_view() {
// Create a ListViewArray with some data
let mut builder = ListViewBuilder::new(Int32Array::builder(0));
builder.append_value(&Int32Array::from(vec![Some(0), None, Some(2)]));
builder.append_value(&Int32Array::from(vec![Some(3), Some(4)]));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also add please add a test case that has a null as one of the list elements (in addition to entire slot that is null)?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically, variant array elements are non-nullable. So any (SQL) NULL must be converted to Variant::Null before adding it to the array -- this is true regardless of shredding.

Note: It almost certainly makes sense to convert NULL values from an arrow array with nullable elements into Variant::Null values when converting to variant. So the resulting code might be the same either way. But it might still be helpful to understand the distinction (code comments, unstated assumptions, etc)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that converting an arrow ListArray with null elements should result in an Variant::List(...) where one of the elements is Variant::Null

I am not sure if you asking a question or just making a statement

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mostly a statement -- to make sure we're doing the right thing for the right reason rather than by accident.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I asked some variant spec folks a while back, and in their (parquet storage) world, arrays with SQL NULL elements are simply not a thing, and so variant code should not even need to think about it. For example:

it doesn't really make sense to talk about an array with (SQL) nullable elements. The array must already be a Variant array, and the Variant binary spec has no way to represent SQL NULL as an array element, only Variant null.

Meanwhile, IMO it's perfectly fine for arrow to take a position that NULL values in an arrow list array will produce Variant::NULL values if converted to variant.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added cases where array element is NULL

builder.append_null();
builder.append_value(&Int32Array::from(vec![None, None]));
let list_view_array = builder.finish();

// Expected values
let (metadata, value) = {
let mut builder = VariantBuilder::new();
let mut list = builder.new_list();
list.append_value(0i32);
list.append_null();
list.append_value(2i32);
list.finish();
builder.finish()
};
let variant0 = Variant::new(&metadata, &value);

let (metadata, value) = {
let mut builder = VariantBuilder::new();
let mut list = builder.new_list();
list.append_value(3i32);
list.append_value(4i32);
list.finish();
builder.finish()
};
let variant1 = Variant::new(&metadata, &value);

let (metadata, value) = {
let mut builder = VariantBuilder::new();
let mut list = builder.new_list();
list.append_null();
list.append_null();
list.finish();
builder.finish()
};
let variant3 = Variant::new(&metadata, &value);

run_test(
Arc::new(list_view_array),
vec![Some(variant0), Some(variant1), None, Some(variant3)],
);
}

#[test]
fn test_cast_to_variant_sliced_list_view() {
// Create a ListViewArray with some data
let mut builder = ListViewBuilder::new(Int32Array::builder(0));
builder.append_value(&Int32Array::from(vec![Some(0), Some(1), Some(2)]));
builder.append_value(&Int32Array::from(vec![Some(3), None]));
builder.append_null();
let list_view_array = builder.finish();

// Expected value for slice(1, 2) - should get the second and third elements
let (metadata, value) = {
let mut builder = VariantBuilder::new();
let mut list = builder.new_list();
list.append_value(3i32);
list.append_null();
list.finish();
builder.finish()
};
let variant = Variant::new(&metadata, &value);

run_test(
Arc::new(list_view_array.slice(1, 2)),
vec![Some(variant), None],
);
}

#[test]
fn test_cast_to_variant_large_list_view() {
// Create a LargeListViewArray with some data
let mut builder = LargeListViewBuilder::new(Int64Array::builder(0));
builder.append_value(&Int64Array::from(vec![Some(0), None, Some(2)]));
builder.append_value(&Int64Array::from(vec![Some(3), Some(4)]));
builder.append_null();
builder.append_value(&Int64Array::from(vec![None, None]));
let large_list_view_array = builder.finish();

// Expected values
let (metadata, value) = {
let mut builder = VariantBuilder::new();
let mut list = builder.new_list();
list.append_value(0i64);
list.append_null();
list.append_value(2i64);
list.finish();
builder.finish()
};
let variant0 = Variant::new(&metadata, &value);

let (metadata, value) = {
let mut builder = VariantBuilder::new();
let mut list = builder.new_list();
list.append_value(3i64);
list.append_value(4i64);
list.finish();
builder.finish()
};
let variant1 = Variant::new(&metadata, &value);

let (metadata, value) = {
let mut builder = VariantBuilder::new();
let mut list = builder.new_list();
list.append_null();
list.append_null();
list.finish();
builder.finish()
};
let variant3 = Variant::new(&metadata, &value);

run_test(
Arc::new(large_list_view_array),
vec![Some(variant0), Some(variant1), None, Some(variant3)],
);
}

#[test]
fn test_cast_to_variant_sliced_large_list_view() {
// Create a LargeListViewArray with some data
let mut builder = LargeListViewBuilder::new(Int64Array::builder(0));
builder.append_value(&Int64Array::from(vec![Some(0), Some(1), Some(2)]));
builder.append_value(&Int64Array::from(vec![Some(3), None]));
builder.append_null();
let large_list_view_array = builder.finish();

// Expected value for slice(1, 2) - should get the second and third elements
let (metadata, value) = {
let mut builder = VariantBuilder::new();
let mut list = builder.new_list();
list.append_value(3i64);
list.append_null();
list.finish();
builder.finish()
};
let variant = Variant::new(&metadata, &value);

run_test(
Arc::new(large_list_view_array.slice(1, 2)),
vec![Some(variant), None],
);
}

#[test]
fn test_cast_to_variant_struct() {
// Test a simple struct with two fields: id (int64) and age (int32)
Expand Down
Loading