From d0483f8a634a21f21d94b91b876df204dd1e07b3 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 29 Sep 2025 10:58:39 -0700 Subject: [PATCH 1/4] [Variant] Add a VariantBuilderExt implementation for VariantValueArrayBuilder --- parquet-variant-compute/src/shred_variant.rs | 9 +-- .../src/variant_array_builder.rs | 81 ++++++++++++++++--- 2 files changed, 74 insertions(+), 16 deletions(-) diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index 138209802ab4..31cd481c18a6 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -27,7 +27,7 @@ use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; use arrow::datatypes::{DataType, Fields}; use arrow::error::{ArrowError, Result}; -use parquet_variant::{ObjectBuilder, ReadOnlyMetadataBuilder, Variant}; +use parquet_variant::{Variant, VariantBuilderExt}; use indexmap::IndexMap; use std::sync::Arc; @@ -267,9 +267,8 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> { }; // Route the object's fields by name as either shredded or unshredded - let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().clone()); - let state = self.value_builder.parent_state(&mut metadata_builder); - let mut object_builder = ObjectBuilder::new(state, false); + let mut builder = self.value_builder.as_builder_ext(value.metadata().clone()); + let mut object_builder = builder.try_new_object()?; let mut seen = std::collections::HashSet::new(); let mut partially_shredded = false; for (field_name, value) in obj.iter() { @@ -329,7 +328,7 @@ mod tests { use crate::VariantArrayBuilder; use arrow::array::{Array, Float64Array, Int64Array}; use arrow::datatypes::{DataType, Field, Fields}; - use parquet_variant::{Variant, VariantBuilder, VariantBuilderExt as _}; + use parquet_variant::{ObjectBuilder, ReadOnlyMetadataBuilder, Variant, VariantBuilder}; use std::sync::Arc; fn create_test_variant_array(values: Vec>>) -> VariantArray { diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index 68c1fd6b5492..1de13da8e799 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -22,6 +22,7 @@ use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuild use arrow_schema::{ArrowError, DataType, Field, Fields}; use parquet_variant::{ BuilderSpecificState, ListBuilder, MetadataBuilder, ObjectBuilder, Variant, VariantBuilderExt, + VariantMetadata, }; use parquet_variant::{ ParentState, ReadOnlyMetadataBuilder, ValueBuilder, WritableMetadataBuilder, @@ -294,8 +295,8 @@ impl VariantValueArrayBuilder { /// builder.append_value(Variant::from(42)); /// ``` pub fn append_value(&mut self, value: Variant<'_, '_>) { - let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().clone()); - ValueBuilder::append_variant_bytes(self.parent_state(&mut metadata_builder), value); + self.as_builder_ext(value.metadata().clone()) + .append_value(value); } /// Creates a builder-specific parent state. @@ -333,6 +334,18 @@ impl VariantValueArrayBuilder { ParentState::new(&mut self.value_builder, metadata_builder, state) } + + /// Creates a thin [`VariantBuilderExt`] wrapper for this builder, which hides the `metadata` + /// parameter (similar to how the [`ObjectFieldBuilder`] wrapper hides field names). + pub fn as_builder_ext<'a>( + &'a mut self, + metadata: VariantMetadata<'a>, + ) -> VariantValueArrayBuilderExt<'a> { + VariantValueArrayBuilderExt { + metadata_builder: ReadOnlyMetadataBuilder::new(metadata), + value_builder: self, + } + } } /// Builder-specific state for array building that manages array-level offsets and nulls. See @@ -355,6 +368,52 @@ impl BuilderSpecificState for ValueArrayBuilderState<'_> { } } +/// A thin [`VariantBuilderExt`] wrapper that hides the short-lived (per-row) +/// [`ReadOnlyMetadataBuilder`] instances that [`VariantValueArrayBuilder`] requires. +pub struct VariantValueArrayBuilderExt<'a> { + metadata_builder: ReadOnlyMetadataBuilder<'a>, + value_builder: &'a mut VariantValueArrayBuilder, +} + +impl<'a> VariantValueArrayBuilderExt<'a> { + /// Creates a new instance from a metadata builder and a reference to a variant value builder. + pub fn new( + metadata_builder: ReadOnlyMetadataBuilder<'a>, + value_builder: &'a mut VariantValueArrayBuilder, + ) -> Self { + Self { + metadata_builder, + value_builder, + } + } +} + +impl<'a> VariantBuilderExt for VariantValueArrayBuilderExt<'a> { + type State<'b> + = ValueArrayBuilderState<'b> + where + Self: 'b; + + fn append_null(&mut self) { + self.value_builder.append_null() + } + + fn append_value<'m, 'v>(&mut self, value: impl Into>) { + let state = self.value_builder.parent_state(&mut self.metadata_builder); + ValueBuilder::append_variant_bytes(state, value.into()); + } + + fn try_new_list(&mut self) -> Result>, ArrowError> { + let state = self.value_builder.parent_state(&mut self.metadata_builder); + Ok(ListBuilder::new(state, false)) + } + + fn try_new_object(&mut self) -> Result>, ArrowError> { + let state = self.value_builder.parent_state(&mut self.metadata_builder); + Ok(ObjectBuilder::new(state, false)) + } +} + fn binary_view_array_from_buffers(buffer: Vec, offsets: Vec) -> BinaryViewArray { // All offsets are less than or equal to the buffer length, so we can safely cast all offsets // inside the loop below, as long as the buffer length fits in u32. @@ -482,32 +541,32 @@ mod test { // // NOTE: Because we will reuse the metadata column, we cannot reorder rows. We can only // filter or manipulate values within a row. - let mut builder = VariantValueArrayBuilder::new(3); + let mut value_builder = VariantValueArrayBuilder::new(3); // straight copy - builder.append_value(array.value(0)); + value_builder.append_value(array.value(0)); // filtering fields takes more work because we need to manually create an object builder let value = array.value(1); - let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().clone()); - let state = builder.parent_state(&mut metadata_builder); - ObjectBuilder::new(state, false) + let mut builder = value_builder.as_builder_ext(value.metadata().clone()); + builder + .new_object() .with_field("name", value.get_object_field("name").unwrap()) .with_field("age", value.get_object_field("age").unwrap()) .finish(); // same bytes, but now nested and duplicated inside a list let value = array.value(2); - let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().clone()); - let state = builder.parent_state(&mut metadata_builder); - ListBuilder::new(state, false) + let mut builder = value_builder.as_builder_ext(value.metadata().clone()); + builder + .new_list() .with_value(value.clone()) .with_value(value.clone()) .finish(); let array2 = VariantArray::from_parts( array.metadata_field().clone(), - Some(builder.build().unwrap()), + Some(value_builder.build().unwrap()), None, None, ); From 538c9f08bc462bef603ff130e8a2f56465694192 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 29 Sep 2025 11:12:03 -0700 Subject: [PATCH 2/4] doc --- parquet-variant-compute/src/variant_array_builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index 1de13da8e799..15767163241b 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -336,7 +336,7 @@ impl VariantValueArrayBuilder { } /// Creates a thin [`VariantBuilderExt`] wrapper for this builder, which hides the `metadata` - /// parameter (similar to how the [`ObjectFieldBuilder`] wrapper hides field names). + /// parameter (similar to the way [`parquet_variant::ObjectFieldBuilder`] hides field names). pub fn as_builder_ext<'a>( &'a mut self, metadata: VariantMetadata<'a>, From a8054778d09e76c55647f43fa1f26458d21cfabd Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 29 Sep 2025 12:21:17 -0700 Subject: [PATCH 3/4] review comment --- parquet-variant-compute/src/variant_array_builder.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index 15767163241b..80e6c0a2a0d8 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -295,7 +295,7 @@ impl VariantValueArrayBuilder { /// builder.append_value(Variant::from(42)); /// ``` pub fn append_value(&mut self, value: Variant<'_, '_>) { - self.as_builder_ext(value.metadata().clone()) + self.builder_ext(value.metadata().clone()) .append_value(value); } @@ -337,7 +337,7 @@ impl VariantValueArrayBuilder { /// Creates a thin [`VariantBuilderExt`] wrapper for this builder, which hides the `metadata` /// parameter (similar to the way [`parquet_variant::ObjectFieldBuilder`] hides field names). - pub fn as_builder_ext<'a>( + pub fn builder_ext<'a>( &'a mut self, metadata: VariantMetadata<'a>, ) -> VariantValueArrayBuilderExt<'a> { @@ -548,7 +548,7 @@ mod test { // filtering fields takes more work because we need to manually create an object builder let value = array.value(1); - let mut builder = value_builder.as_builder_ext(value.metadata().clone()); + let mut builder = value_builder.builder_ext(value.metadata().clone()); builder .new_object() .with_field("name", value.get_object_field("name").unwrap()) @@ -557,7 +557,7 @@ mod test { // same bytes, but now nested and duplicated inside a list let value = array.value(2); - let mut builder = value_builder.as_builder_ext(value.metadata().clone()); + let mut builder = value_builder.builder_ext(value.metadata().clone()); builder .new_list() .with_value(value.clone()) From b6a952a288279c0b0c598af4a8039711cc2da462 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 29 Sep 2025 12:29:25 -0700 Subject: [PATCH 4/4] missed one --- parquet-variant-compute/src/shred_variant.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index 31cd481c18a6..568eb1b6af19 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -267,7 +267,7 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> { }; // Route the object's fields by name as either shredded or unshredded - let mut builder = self.value_builder.as_builder_ext(value.metadata().clone()); + let mut builder = self.value_builder.builder_ext(value.metadata().clone()); let mut object_builder = builder.try_new_object()?; let mut seen = std::collections::HashSet::new(); let mut partially_shredded = false;