diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index 496d550d95b1..b3876ef6ab3e 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -46,7 +46,7 @@ mod variant_array_builder; pub mod variant_get; mod variant_to_arrow; -pub use variant_array::{ShreddingState, VariantArray, VariantType}; +pub use variant_array::{BorrowedShreddingState, ShreddingState, VariantArray, VariantType}; pub use variant_array_builder::{VariantArrayBuilder, VariantValueArrayBuilder}; pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options}; diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index d366d702aa98..79ef37f0c49d 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -272,26 +272,11 @@ impl VariantArray { ))); }; - // Extract value and typed_value fields - let value = if let Some(value_col) = inner.column_by_name("value") { - if let Some(binary_view) = value_col.as_binary_view_opt() { - Some(binary_view.clone()) - } else { - return Err(ArrowError::NotYetImplemented(format!( - "VariantArray 'value' field must be BinaryView, got {}", - value_col.data_type() - ))); - } - } else { - None - }; - let typed_value = inner.column_by_name("typed_value").cloned(); - // Note these clones are cheap, they just bump the ref count Ok(Self { inner: inner.clone(), metadata: metadata.clone(), - shredding_state: ShreddingState::new(value, typed_value), + shredding_state: ShreddingState::try_from(inner)?, }) } @@ -521,7 +506,7 @@ impl ShreddedVariantFieldArray { // Note this clone is cheap, it just bumps the ref count Ok(Self { inner: inner_struct.clone(), - shredding_state: ShreddingState::from(inner_struct), + shredding_state: ShreddingState::try_from(inner_struct)?, }) } @@ -660,7 +645,7 @@ impl ShreddingState { /// Create a new `ShreddingState` from the given `value` and `typed_value` fields /// /// Note you can create a `ShreddingState` from a &[`StructArray`] using - /// `ShreddingState::from(&struct_array)`, for example: + /// `ShreddingState::try_from(&struct_array)`, for example: /// /// ```no_run /// # use arrow::array::StructArray; @@ -669,7 +654,7 @@ impl ShreddingState { /// # unimplemented!() /// # } /// let struct_array: StructArray = get_struct_array(); - /// let shredding_state = ShreddingState::from(&struct_array); + /// let shredding_state = ShreddingState::try_from(&struct_array).unwrap(); /// ``` pub fn new(value: Option, typed_value: Option) -> Self { Self { value, typed_value } @@ -685,6 +670,14 @@ impl ShreddingState { self.typed_value.as_ref() } + /// Returns a borrowed version of this shredding state + pub fn borrow(&self) -> BorrowedShreddingState<'_> { + BorrowedShreddingState { + value: self.value_field(), + typed_value: self.typed_value_field(), + } + } + /// Slice all the underlying arrays pub fn slice(&self, offset: usize, length: usize) -> Self { Self { @@ -694,14 +687,79 @@ impl ShreddingState { } } -impl From<&StructArray> for ShreddingState { - fn from(inner_struct: &StructArray) -> Self { - let value = inner_struct - .column_by_name("value") - .and_then(|col| col.as_binary_view_opt().cloned()); - let typed_value = inner_struct.column_by_name("typed_value").cloned(); +/// Similar to [`ShreddingState`] except it holds borrowed references of the target arrays. Useful +/// for avoiding clone operations when the caller does not need a self-standing shredding state. +#[derive(Clone, Debug)] +pub struct BorrowedShreddingState<'a> { + value: Option<&'a BinaryViewArray>, + typed_value: Option<&'a ArrayRef>, +} - ShreddingState::new(value, typed_value) +impl<'a> BorrowedShreddingState<'a> { + /// Create a new `BorrowedShreddingState` from the given `value` and `typed_value` fields + /// + /// Note you can create a `BorrowedShreddingState` from a &[`StructArray`] using + /// `BorrowedShreddingState::try_from(&struct_array)`, for example: + /// + /// ```no_run + /// # use arrow::array::StructArray; + /// # use parquet_variant_compute::BorrowedShreddingState; + /// # fn get_struct_array() -> StructArray { + /// # unimplemented!() + /// # } + /// let struct_array: StructArray = get_struct_array(); + /// let shredding_state = BorrowedShreddingState::try_from(&struct_array).unwrap(); + /// ``` + pub fn new(value: Option<&'a BinaryViewArray>, typed_value: Option<&'a ArrayRef>) -> Self { + Self { value, typed_value } + } + + /// Return a reference to the value field, if present + pub fn value_field(&self) -> Option<&'a BinaryViewArray> { + self.value + } + + /// Return a reference to the typed_value field, if present + pub fn typed_value_field(&self) -> Option<&'a ArrayRef> { + self.typed_value + } +} + +impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> { + type Error = ArrowError; + + fn try_from(inner_struct: &'a StructArray) -> Result { + // The `value` column need not exist, but if it does it must be a binary view. + let value = if let Some(value_col) = inner_struct.column_by_name("value") { + let Some(binary_view) = value_col.as_binary_view_opt() else { + return Err(ArrowError::NotYetImplemented(format!( + "VariantArray 'value' field must be BinaryView, got {}", + value_col.data_type() + ))); + }; + Some(binary_view) + } else { + None + }; + let typed_value = inner_struct.column_by_name("typed_value"); + Ok(BorrowedShreddingState::new(value, typed_value)) + } +} + +impl TryFrom<&StructArray> for ShreddingState { + type Error = ArrowError; + + fn try_from(inner_struct: &StructArray) -> Result { + Ok(BorrowedShreddingState::try_from(inner_struct)?.into()) + } +} + +impl From> for ShreddingState { + fn from(state: BorrowedShreddingState<'_>) -> Self { + ShreddingState { + value: state.value_field().cloned(), + typed_value: state.typed_value_field().cloned(), + } } } diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index a923732ca41b..977507af42a3 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -23,16 +23,16 @@ use arrow::{ use arrow_schema::{ArrowError, DataType, FieldRef}; use parquet_variant::{VariantPath, VariantPathElement}; -use crate::variant_array::ShreddingState; +use crate::variant_array::BorrowedShreddingState; use crate::variant_to_arrow::make_variant_to_arrow_row_builder; use crate::VariantArray; use arrow::array::AsArray; use std::sync::Arc; -pub(crate) enum ShreddedPathStep { +pub(crate) enum ShreddedPathStep<'a> { /// Path step succeeded, return the new shredding state - Success(ShreddingState), + Success(BorrowedShreddingState<'a>), /// The path element is not present in the `typed_value` column and there is no `value` column, /// so we know it does not exist. It, and all paths under it, are all-NULL. Missing, @@ -47,18 +47,16 @@ pub(crate) enum ShreddedPathStep { /// level, or if `typed_value` is not a struct, or if the requested field name does not exist. /// /// TODO: Support `VariantPathElement::Index`? It wouldn't be easy, and maybe not even possible. -pub(crate) fn follow_shredded_path_element( - shredding_state: &ShreddingState, +pub(crate) fn follow_shredded_path_element<'a>( + shredding_state: &BorrowedShreddingState<'a>, path_element: &VariantPathElement<'_>, cast_options: &CastOptions, -) -> Result { +) -> Result> { // If the requested path element is not present in `typed_value`, and `value` is missing, then // we know it does not exist; it, and all paths under it, are all-NULL. - let missing_path_step = || { - let Some(_value_field) = shredding_state.value_field() else { - return ShreddedPathStep::Missing; - }; - ShreddedPathStep::NotShredded + let missing_path_step = || match shredding_state.value_field() { + Some(_) => ShreddedPathStep::NotShredded, + None => ShreddedPathStep::Missing, }; let Some(typed_value) = shredding_state.typed_value_field() else { @@ -98,7 +96,8 @@ pub(crate) fn follow_shredded_path_element( )) })?; - Ok(ShreddedPathStep::Success(struct_array.into())) + let state = BorrowedShreddingState::try_from(struct_array)?; + Ok(ShreddedPathStep::Success(state)) } VariantPathElement::Index { .. } => { // TODO: Support array indexing. Among other things, it will require slicing not @@ -152,7 +151,7 @@ fn shredded_get_path( // Peel away the prefix of path elements that traverses the shredded parts of this variant // column. Shredding will traverse the rest of the path on a per-row basis. - let mut shredding_state = input.shredding_state().clone(); + let mut shredding_state = input.shredding_state().borrow(); let mut accumulated_nulls = input.inner().nulls().cloned(); let mut path_index = 0; for path_element in path {