diff --git a/parquet-variant-compute/src/unshred_variant.rs b/parquet-variant-compute/src/unshred_variant.rs index 9bba8f7000ac..f0c1a639bada 100644 --- a/parquet-variant-compute/src/unshred_variant.rs +++ b/parquet-variant-compute/src/unshred_variant.rs @@ -17,10 +17,11 @@ //! Module for unshredding VariantArray by folding typed_value columns back into the value column. +use crate::arrow_to_variant::ListLikeArray; use crate::{BorrowedShreddingState, VariantArray, VariantValueArrayBuilder}; use arrow::array::{ - Array, AsArray as _, BinaryViewArray, BooleanArray, FixedSizeBinaryArray, PrimitiveArray, - StringArray, StructArray, + Array, AsArray as _, BinaryViewArray, BooleanArray, FixedSizeBinaryArray, FixedSizeListArray, + GenericListArray, GenericListViewArray, PrimitiveArray, StringArray, StructArray, }; use arrow::buffer::NullBuffer; use arrow::datatypes::{ @@ -99,6 +100,11 @@ enum UnshredVariantRowBuilder<'a> { PrimitiveString(UnshredPrimitiveRowBuilder<'a, StringArray>), PrimitiveBinaryView(UnshredPrimitiveRowBuilder<'a, BinaryViewArray>), PrimitiveUuid(UnshredPrimitiveRowBuilder<'a, FixedSizeBinaryArray>), + List(ListUnshredVariantBuilder<'a, GenericListArray>), + LargeList(ListUnshredVariantBuilder<'a, GenericListArray>), + ListView(ListUnshredVariantBuilder<'a, GenericListViewArray>), + LargeListView(ListUnshredVariantBuilder<'a, GenericListViewArray>), + FixedSizeList(ListUnshredVariantBuilder<'a, FixedSizeListArray>), Struct(StructUnshredVariantBuilder<'a>), ValueOnly(ValueOnlyUnshredVariantBuilder<'a>), Null(NullUnshredVariantBuilder<'a>), @@ -132,6 +138,11 @@ impl<'a> UnshredVariantRowBuilder<'a> { Self::PrimitiveString(b) => b.append_row(builder, metadata, index), Self::PrimitiveBinaryView(b) => b.append_row(builder, metadata, index), Self::PrimitiveUuid(b) => b.append_row(builder, metadata, index), + Self::List(b) => b.append_row(builder, metadata, index), + Self::LargeList(b) => b.append_row(builder, metadata, index), + Self::ListView(b) => b.append_row(builder, metadata, index), + Self::LargeListView(b) => b.append_row(builder, metadata, index), + Self::FixedSizeList(b) => b.append_row(builder, metadata, index), Self::Struct(b) => b.append_row(builder, metadata, index), Self::ValueOnly(b) => b.append_row(builder, metadata, index), Self::Null(b) => b.append_row(builder, metadata, index), @@ -208,6 +219,25 @@ impl<'a> UnshredVariantRowBuilder<'a> { value, typed_value.as_struct(), )?), + DataType::List(_) => Self::List(ListUnshredVariantBuilder::try_new( + value, + typed_value.as_list(), + )?), + DataType::LargeList(_) => Self::LargeList(ListUnshredVariantBuilder::try_new( + value, + typed_value.as_list(), + )?), + DataType::ListView(_) => Self::ListView(ListUnshredVariantBuilder::try_new( + value, + typed_value.as_list_view(), + )?), + DataType::LargeListView(_) => Self::LargeListView(ListUnshredVariantBuilder::try_new( + value, + typed_value.as_list_view(), + )?), + DataType::FixedSizeList(_, _) => Self::FixedSizeList( + ListUnshredVariantBuilder::try_new(value, typed_value.as_fixed_size_list())?, + ), _ => { return Err(ArrowError::NotYetImplemented(format!( "Unshredding not yet supported for type: {}", @@ -517,5 +547,61 @@ impl<'a> StructUnshredVariantBuilder<'a> { } } +/// Builder for unshredding list/array types with recursive element processing +struct ListUnshredVariantBuilder<'a, L: ListLikeArray> { + value: Option<&'a BinaryViewArray>, + typed_value: &'a L, + element_unshredder: Box>, +} + +impl<'a, L: ListLikeArray> ListUnshredVariantBuilder<'a, L> { + fn try_new(value: Option<&'a BinaryViewArray>, typed_value: &'a L) -> Result { + // Create a recursive unshredder for the list elements + // The element type comes from the values array of the list + let element_values = typed_value.values(); + + // For shredded lists, each element would be a ShreddedVariantFieldArray (struct) + // Extract value/typed_value from the element struct + let Some(element_values) = element_values.as_struct_opt() else { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid shredded variant array element: expected Struct, got {}", + element_values.data_type() + ))); + }; + + // Create recursive unshredder for elements + // + // NOTE: A None/None array element is technically invalid, but the shredding spec + // requires us to emit `Variant::Null` when a required value is missing. + let element_unshredder = UnshredVariantRowBuilder::try_new_opt(element_values.try_into()?)? + .unwrap_or_else(|| UnshredVariantRowBuilder::null(None)); + + Ok(Self { + value, + typed_value, + element_unshredder: Box::new(element_unshredder), + }) + } + + fn append_row( + &mut self, + builder: &mut impl VariantBuilderExt, + metadata: &VariantMetadata, + index: usize, + ) -> Result<()> { + handle_unshredded_case!(self, builder, metadata, index, false); + + // If we get here, typed_value is valid and value is NULL -- process the list elements + let mut list_builder = builder.try_new_list()?; + for element_index in self.typed_value.element_range(index) { + self.element_unshredder + .append_row(&mut list_builder, metadata, element_index)?; + } + + list_builder.finish(); + Ok(()) + } +} + // TODO: This code is covered by tests in `parquet/tests/variant_integration.rs`. Does that suffice? // Or do we also need targeted stand-alone unit tests for full coverage? diff --git a/parquet-variant/src/variant/list.rs b/parquet-variant/src/variant/list.rs index 438faddffe15..d14d3a7796cf 100644 --- a/parquet-variant/src/variant/list.rs +++ b/parquet-variant/src/variant/list.rs @@ -117,7 +117,7 @@ impl VariantListHeader { /// /// [valid]: VariantMetadata#Validation /// [Variant spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-array-basic_type3 -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone)] pub struct VariantList<'m, 'v> { pub metadata: VariantMetadata<'m>, pub value: &'v [u8], @@ -302,6 +302,20 @@ impl<'m, 'v> VariantList<'m, 'v> { } } +// Custom implementation of PartialEq for variant arrays +// +// Instead of comparing the raw bytes of 2 variant lists, this implementation recursively +// checks whether their elements are equal. +impl<'m, 'v> PartialEq for VariantList<'m, 'v> { + fn eq(&self, other: &Self) -> bool { + if self.num_elements != other.num_elements { + return false; + } + + self.iter().zip(other.iter()).all(|(a, b)| a == b) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/parquet-variant/src/variant/object.rs b/parquet-variant/src/variant/object.rs index aa7714c6d2e1..713be977b9eb 100644 --- a/parquet-variant/src/variant/object.rs +++ b/parquet-variant/src/variant/object.rs @@ -419,13 +419,9 @@ impl<'m, 'v> PartialEq for VariantObject<'m, 'v> { // IFF two objects are valid and logically equal, they will have the same // field names in the same order, because the spec requires the object // fields to be sorted lexicographically. - for ((name_a, value_a), (name_b, value_b)) in self.iter().zip(other.iter()) { - if name_a != name_b || value_a != value_b { - return false; - } - } - - true + self.iter() + .zip(other.iter()) + .all(|((name_a, value_a), (name_b, value_b))| name_a == name_b && value_a == value_b) } } diff --git a/parquet/tests/variant_integration.rs b/parquet/tests/variant_integration.rs index 2e44bddb4aed..98fa04555d77 100644 --- a/parquet/tests/variant_integration.rs +++ b/parquet/tests/variant_integration.rs @@ -34,24 +34,16 @@ use std::{fs, path::PathBuf}; type Result = std::result::Result; -/// Creates a test function for a given case number +/// Creates a test function for a given case number. +/// +/// If an error message is provided, generate an error test case that expects it. /// /// Note the index is zero-based, while the case number is one-based macro_rules! variant_test_case { - ($case_num:literal) => { - paste::paste! { - #[test] - fn []() { - all_cases()[$case_num - 1].run() - } - } - }; - - // Generates an error test case, where the expected result is an error message - ($case_num:literal, $expected_error:literal) => { + ($case_num:literal $(, $expected_error:literal )? ) => { paste::paste! { #[test] - #[should_panic(expected = $expected_error)] + $( #[should_panic(expected = $expected_error)] )? fn []() { all_cases()[$case_num - 1].run() } @@ -65,8 +57,8 @@ macro_rules! variant_test_case { // - cases 40, 42, 87, 127 and 128 are expected to fail always (they include invalid variants) // - the remaining cases are expected to (eventually) pass -variant_test_case!(1, "Unshredding not yet supported for type: List("); -variant_test_case!(2, "Unshredding not yet supported for type: List("); +variant_test_case!(1); +variant_test_case!(2); // case 3 is empty in cases.json 🤷 // ```json // { @@ -130,16 +122,14 @@ variant_test_case!(37); variant_test_case!(38); variant_test_case!(39); // Is an error case (should be failing as the expected error message indicates) -// TODO: Once we support lists: "both value and typed_value are non-null" -variant_test_case!(40, "Unshredding not yet supported for type: List("); -variant_test_case!(41, "Unshredding not yet supported for type: List("); +variant_test_case!(40, "both value and typed_value are non-null"); +variant_test_case!(41); // Is an error case (should be failing as the expected error message indicates) variant_test_case!(42, "both value and typed_value are non-null"); // Is an error case (should be failing as the expected error message indicates) variant_test_case!(43, "Field 'b' appears in both typed_value and value"); variant_test_case!(44); -// https://github.com/apache/arrow-rs/issues/8337 -variant_test_case!(45, "Unshredding not yet supported for type: List("); +variant_test_case!(45); variant_test_case!(46); variant_test_case!(47); variant_test_case!(48); @@ -180,12 +170,11 @@ variant_test_case!(82); variant_test_case!(83); // Invalid case, implementations can choose to read the shredded value or error out variant_test_case!(84); -// https://github.com/apache/arrow-rs/issues/8337 -variant_test_case!(85, "Unshredding not yet supported for type: List("); -variant_test_case!(86, "Unshredding not yet supported for type: List("); +variant_test_case!(85); +variant_test_case!(86); // Is an error case (should be failing as the expected error message indicates) variant_test_case!(87, "Expected object in value field"); -variant_test_case!(88, "Unshredding not yet supported for type: List("); +variant_test_case!(88); variant_test_case!(89); variant_test_case!(90); variant_test_case!(91); @@ -224,7 +213,7 @@ variant_test_case!(123); variant_test_case!(124); // Is an error case (should be failing as the expected error message indicates) variant_test_case!(125, "Field 'b' appears in both typed_value and value"); -variant_test_case!(126, "Unshredding not yet supported for type: List("); +variant_test_case!(126); // Is an error case (should be failing as the expected error message indicates) variant_test_case!(127, "Illegal shredded value type: UInt32"); // Is an error case (should be failing as the expected error message indicates) @@ -235,8 +224,8 @@ variant_test_case!(131); variant_test_case!(132); variant_test_case!(133); variant_test_case!(134); -variant_test_case!(135, "Unshredding not yet supported for type: List("); -variant_test_case!(136, "Unshredding not yet supported for type: List("); +variant_test_case!(135); +variant_test_case!(136); // Is an error case (should be failing as the expected error message indicates) variant_test_case!(137, "Illegal shredded value type: FixedSizeBinary(4)"); variant_test_case!(138);