diff --git a/parquet-variant/benches/variant_builder.rs b/parquet-variant/benches/variant_builder.rs index 8481ca9c8f5f..8e24a63c3a54 100644 --- a/parquet-variant/benches/variant_builder.rs +++ b/parquet-variant/benches/variant_builder.rs @@ -441,7 +441,7 @@ fn bench_validation_validated_vs_unvalidated(c: &mut Criterion) { b.iter(|| { for variant in &unvalidated { - let validated = variant.clone().validate().unwrap(); + let validated = variant.clone().with_full_validation().unwrap(); hint::black_box(validated); } }) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 96cdb53c15e8..6bcf61c036ac 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -1,5 +1,3 @@ -use std::ops::Deref; - // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information @@ -16,6 +14,7 @@ use std::ops::Deref; // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. + pub use self::decimal::{VariantDecimal16, VariantDecimal4, VariantDecimal8}; pub use self::list::VariantList; pub use self::metadata::VariantMetadata; @@ -24,6 +23,7 @@ use crate::decoder::{ self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType, }; use crate::utils::{first_byte_from_slice, slice_from_slice}; +use std::ops::Deref; use arrow_schema::ArrowError; use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc}; @@ -184,7 +184,7 @@ impl Deref for ShortString<'_> { /// Every instance of variant is either _valid_ or _invalid_. depending on whether the /// underlying bytes are a valid encoding of a variant value (see below). /// -/// Instances produced by [`Self::try_new`], [`Self::try_new_with_metadata`], or [`Self::validate`] +/// Instances produced by [`Self::try_new`], [`Self::try_new_with_metadata`], or [`Self::with_full_validation`] /// are fully _validated_. They always contain _valid_ data, and infallible accesses such as /// iteration and indexing are panic-free. The validation cost is `O(m + v)` where `m` and /// `v` are the number of bytes in the metadata and value buffers, respectively. @@ -192,7 +192,7 @@ impl Deref for ShortString<'_> { /// Instances produced by [`Self::new`] and [`Self::new_with_metadata`] are _unvalidated_ and so /// they may contain either _valid_ or _invalid_ data. Infallible accesses to variant objects and /// arrays, such as iteration and indexing will panic if the underlying bytes are _invalid_, and -/// fallible alternatives are provided as panic-free alternatives. [`Self::validate`] can also be +/// fallible alternatives are provided as panic-free alternatives. [`Self::with_full_validation`] can also be /// used to _validate_ an _unvalidated_ instance, if desired. /// /// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller @@ -297,8 +297,10 @@ impl<'m, 'v> Variant<'m, 'v> { /// /// [unvalidated]: Self#Validation pub fn new(metadata: &'m [u8], value: &'v [u8]) -> Self { - let metadata = VariantMetadata::try_new_impl(metadata).expect("Invalid variant metadata"); - Self::try_new_with_metadata_impl(metadata, value).expect("Invalid variant data") + let metadata = VariantMetadata::try_new_with_shallow_validation(metadata) + .expect("Invalid variant metadata"); + Self::try_new_with_metadata_and_shallow_validation(metadata, value) + .expect("Invalid variant data") } /// Create a new variant with existing metadata. @@ -323,18 +325,19 @@ impl<'m, 'v> Variant<'m, 'v> { metadata: VariantMetadata<'m>, value: &'v [u8], ) -> Result { - Self::try_new_with_metadata_impl(metadata, value)?.validate() + Self::try_new_with_metadata_and_shallow_validation(metadata, value)?.with_full_validation() } /// Similar to [`Self::try_new_with_metadata`], but [unvalidated]. /// /// [unvalidated]: Self#Validation pub fn new_with_metadata(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self { - Self::try_new_with_metadata_impl(metadata, value).expect("Invalid variant") + Self::try_new_with_metadata_and_shallow_validation(metadata, value) + .expect("Invalid variant") } // The actual constructor, which only performs shallow (constant-time) validation. - fn try_new_with_metadata_impl( + fn try_new_with_metadata_and_shallow_validation( metadata: VariantMetadata<'m>, value: &'v [u8], ) -> Result { @@ -382,10 +385,12 @@ impl<'m, 'v> Variant<'m, 'v> { VariantBasicType::ShortString => { Variant::ShortString(decoder::decode_short_string(value_metadata, value_data)?) } - VariantBasicType::Object => { - Variant::Object(VariantObject::try_new_impl(metadata, value)?) - } - VariantBasicType::Array => Variant::List(VariantList::try_new_impl(metadata, value)?), + VariantBasicType::Object => Variant::Object( + VariantObject::try_new_with_shallow_validation(metadata, value)?, + ), + VariantBasicType::Array => Variant::List(VariantList::try_new_with_shallow_validation( + metadata, value, + )?), }; Ok(new_self) } @@ -393,10 +398,10 @@ impl<'m, 'v> Variant<'m, 'v> { /// True if this variant instance has already been [validated]. /// /// [validated]: Self#Validation - pub fn is_validated(&self) -> bool { + pub fn is_fully_validated(&self) -> bool { match self { - Variant::List(list) => list.is_validated(), - Variant::Object(obj) => obj.is_validated(), + Variant::List(list) => list.is_fully_validated(), + Variant::Object(obj) => obj.is_fully_validated(), _ => true, } } @@ -407,16 +412,16 @@ impl<'m, 'v> Variant<'m, 'v> { /// Variant leaf values are always valid by construction, but [objects] and [arrays] can be /// constructed in unvalidated (and potentially invalid) state. /// - /// If [`Self::is_validated`] is true, validation is a no-op. Otherwise, the cost is `O(m + v)` + /// If [`Self::is_fully_validated`] is true, validation is a no-op. Otherwise, the cost is `O(m + v)` /// where `m` and `v` are the sizes of metadata and value buffers, respectively. /// /// [objects]: VariantObject#Validation /// [arrays]: VariantList#Validation - pub fn validate(self) -> Result { + pub fn with_full_validation(self) -> Result { use Variant::*; match self { - List(list) => list.validate().map(List), - Object(obj) => obj.validate().map(Object), + List(list) => list.with_full_validation().map(List), + Object(obj) => obj.with_full_validation().map(Object), _ => Ok(self), } } diff --git a/parquet-variant/src/variant/list.rs b/parquet-variant/src/variant/list.rs index f9a50e7ef8f0..5257ec6a0254 100644 --- a/parquet-variant/src/variant/list.rs +++ b/parquet-variant/src/variant/list.rs @@ -82,14 +82,14 @@ impl VariantListHeader { /// Every instance of variant list is either _valid_ or _invalid_. depending on whether the /// underlying bytes are a valid encoding of a variant array (see below). /// -/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully _validated_. They always +/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully _validated_. They always /// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The /// validation cost is linear in the number of underlying bytes. /// /// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or /// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying /// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are -/// provided as panic-free alternatives. [`Self::validate`] can also be used to _validate_ an +/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an /// _unvalidated_ instance, if desired. /// /// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller @@ -136,18 +136,18 @@ impl<'m, 'v> VariantList<'m, 'v> { /// This constructor verifies that `value` points to a valid variant array value. In particular, /// that all offsets are in-bounds and point to valid (recursively validated) objects. pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result { - Self::try_new_impl(metadata, value)?.validate() + Self::try_new_with_shallow_validation(metadata, value)?.with_full_validation() } pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self { - Self::try_new_impl(metadata, value).expect("Invalid variant list value") + Self::try_new_with_shallow_validation(metadata, value).expect("Invalid variant list value") } /// Attempts to interpet `metadata` and `value` as a variant array, performing only basic /// (constant-cost) [validation]. /// /// [validation]: Self#Validation - pub(crate) fn try_new_impl( + pub(crate) fn try_new_with_shallow_validation( metadata: VariantMetadata<'m>, value: &'v [u8], ) -> Result { @@ -196,18 +196,18 @@ impl<'m, 'v> VariantList<'m, 'v> { /// True if this instance is fully [validated] for panic-free infallible accesses. /// /// [validated]: Self#Validation - pub fn is_validated(&self) -> bool { + pub fn is_fully_validated(&self) -> bool { self.validated } /// Performs a full [validation] of this variant array and returns the result. /// /// [validation]: Self#Validation - pub fn validate(mut self) -> Result { + pub fn with_full_validation(mut self) -> Result { if !self.validated { // Validate the metadata dictionary first, if not already validated, because we pass it // by value to all the children (who would otherwise re-validate it repeatedly). - self.metadata = self.metadata.validate()?; + self.metadata = self.metadata.with_full_validation()?; // Iterate over all string keys in this dictionary in order to prove that the offset // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. @@ -232,25 +232,25 @@ impl<'m, 'v> VariantList<'m, 'v> { /// [invalid]: Self#Validation pub fn get(&self, index: usize) -> Option> { (index < self.num_elements).then(|| { - self.try_get_impl(index) - .and_then(Variant::validate) + self.try_get_with_shallow_validation(index) .expect("Invalid variant array element") }) } /// Fallible version of `get`. Returns element by index, capturing validation errors pub fn try_get(&self, index: usize) -> Result, ArrowError> { - self.try_get_impl(index)?.validate() + self.try_get_with_shallow_validation(index)? + .with_full_validation() } - /// Fallible iteration over the elements of this list. - pub fn iter_try(&self) -> impl Iterator, ArrowError>> + '_ { - self.iter_try_impl().map(|result| result?.validate()) - } - - // Fallible iteration that only performs basic (constant-time) validation. - fn iter_try_impl(&self) -> impl Iterator, ArrowError>> + '_ { - (0..self.len()).map(move |i| self.try_get_impl(i)) + // Fallible version of `get`, performing only basic (constant-time) validation. + fn try_get_with_shallow_validation(&self, index: usize) -> Result, ArrowError> { + // Fetch the value bytes between the two offsets for this index, from the value array region + // of the byte buffer + let byte_range = self.get_offset(index)?..self.get_offset(index + 1)?; + let value_bytes = + slice_from_slice_at_offset(self.value, self.first_value_byte, byte_range)?; + Variant::try_new_with_metadata_and_shallow_validation(self.metadata, value_bytes) } /// Iterates over the values of this list. When working with [unvalidated] input, consider @@ -258,26 +258,29 @@ impl<'m, 'v> VariantList<'m, 'v> { /// /// [unvalidated]: Self#Validation pub fn iter(&self) -> impl Iterator> + '_ { - self.iter_try_impl() + self.iter_try_with_shallow_validation() .map(|result| result.expect("Invalid variant list entry")) } + /// Fallible iteration over the elements of this list. + pub fn iter_try(&self) -> impl Iterator, ArrowError>> + '_ { + self.iter_try_with_shallow_validation() + .map(|result| result?.with_full_validation()) + } + + // Fallible iteration that only performs basic (constant-time) validation. + fn iter_try_with_shallow_validation( + &self, + ) -> impl Iterator, ArrowError>> + '_ { + (0..self.len()).map(move |i| self.try_get_with_shallow_validation(i)) + } + // Attempts to retrieve the ith offset from the offset array region of the byte buffer. fn get_offset(&self, index: usize) -> Result { let byte_range = self.header.first_offset_byte()..self.first_value_byte; let offset_bytes = slice_from_slice(self.value, byte_range)?; self.header.offset_size.unpack_usize(offset_bytes, index) } - - // Fallible version of `get`, performing only basic (constant-time) validation. - fn try_get_impl(&self, index: usize) -> Result, ArrowError> { - // Fetch the value bytes between the two offsets for this index, from the value array region - // of the byte buffer - let byte_range = self.get_offset(index)?..self.get_offset(index + 1)?; - let value_bytes = - slice_from_slice_at_offset(self.value, self.first_value_byte, byte_range)?; - Variant::try_new_with_metadata(self.metadata, value_bytes) - } } #[cfg(test)] diff --git a/parquet-variant/src/variant/metadata.rs b/parquet-variant/src/variant/metadata.rs index 46d89557bfae..eede7150fd6f 100644 --- a/parquet-variant/src/variant/metadata.rs +++ b/parquet-variant/src/variant/metadata.rs @@ -93,14 +93,14 @@ impl VariantMetadataHeader { /// Every instance of variant metadata is either _valid_ or _invalid_. depending on whether the /// underlying bytes are a valid encoding of variant metadata (see below). /// -/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully _validated_. They always +/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully _validated_. They always /// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The /// validation cost is linear in the number of underlying bytes. /// /// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or /// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying /// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are -/// provided as panic-free alternatives. [`Self::validate`] can also be used to _validate_ an +/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an /// _unvalidated_ instance, if desired. /// /// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller @@ -143,7 +143,7 @@ impl<'m> VariantMetadata<'m> { /// /// [validation]: Self#Validation pub fn try_new(bytes: &'m [u8]) -> Result { - Self::try_new_impl(bytes)?.validate() + Self::try_new_with_shallow_validation(bytes)?.with_full_validation() } /// Interprets `bytes` as a variant metadata instance, without attempting to [validate] dictionary @@ -157,11 +157,11 @@ impl<'m> VariantMetadata<'m> { /// /// [validate]: Self#Validation pub fn new(bytes: &'m [u8]) -> Self { - Self::try_new_impl(bytes).expect("Invalid variant metadata") + Self::try_new_with_shallow_validation(bytes).expect("Invalid variant metadata") } // The actual constructor, which performs only basic (constant-const) validation. - pub(crate) fn try_new_impl(bytes: &'m [u8]) -> Result { + pub(crate) fn try_new_with_shallow_validation(bytes: &'m [u8]) -> Result { let header_byte = first_byte_from_slice(bytes)?; let header = VariantMetadataHeader::try_new(header_byte)?; @@ -209,14 +209,14 @@ impl<'m> VariantMetadata<'m> { /// True if this instance is fully [validated] for panic-free infallible accesses. /// /// [validated]: Self#Validation - pub fn is_validated(&self) -> bool { + pub fn is_fully_validated(&self) -> bool { self.validated } /// Performs a full [validation] of this metadata dictionary and returns the result. /// /// [validation]: Self#Validation - pub fn validate(mut self) -> Result { + pub fn with_full_validation(mut self) -> Result { if !self.validated { // Iterate over all string keys in this dictionary in order to prove that the offset // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. diff --git a/parquet-variant/src/variant/object.rs b/parquet-variant/src/variant/object.rs index 15c67c9796cc..3991f76e9543 100644 --- a/parquet-variant/src/variant/object.rs +++ b/parquet-variant/src/variant/object.rs @@ -78,14 +78,14 @@ impl VariantObjectHeader { /// Every instance of variant object is either _valid_ or _invalid_. depending on whether the /// underlying bytes are a valid encoding of a variant object subtype (see below). /// -/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully (and recursively) +/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully (and recursively) /// _validated_. They always contain _valid_ data, and infallible accesses such as iteration and /// indexing are panic-free. The validation cost is linear in the number of underlying bytes. /// /// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or /// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying /// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are -/// provided as panic-free alternatives. [`Self::validate`] can also be used to _validate_ an +/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an /// _unvalidated_ instance, if desired. /// /// _Unvalidated_ instances can be constructed in constant time. They can be useful if the caller @@ -128,7 +128,7 @@ pub struct VariantObject<'m, 'v> { impl<'m, 'v> VariantObject<'m, 'v> { pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self { - Self::try_new_impl(metadata, value).expect("Invalid variant object") + Self::try_new_with_shallow_validation(metadata, value).expect("Invalid variant object") } /// Attempts to interpet `metadata` and `value` as a variant object. @@ -139,14 +139,14 @@ impl<'m, 'v> VariantObject<'m, 'v> { /// particular, that all field ids exist in `metadata`, and all offsets are in-bounds and point /// to valid objects. pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result { - Self::try_new_impl(metadata, value)?.validate() + Self::try_new_with_shallow_validation(metadata, value)?.with_full_validation() } /// Attempts to interpet `metadata` and `value` as a variant object, performing only basic /// (constant-cost) [validation]. /// /// [validation]: Self#Validation - pub(crate) fn try_new_impl( + pub(crate) fn try_new_with_shallow_validation( metadata: VariantMetadata<'m>, value: &'v [u8], ) -> Result { @@ -197,22 +197,22 @@ impl<'m, 'v> VariantObject<'m, 'v> { /// True if this instance is fully [validated] for panic-free infallible accesses. /// /// [validated]: Self#Validation - pub fn is_validated(&self) -> bool { + pub fn is_fully_validated(&self) -> bool { self.validated } /// Performs a full [validation] of this variant object. /// /// [validation]: Self#Validation - pub fn validate(mut self) -> Result { + pub fn with_full_validation(mut self) -> Result { if !self.validated { // Validate the metadata dictionary first, if not already validated, because we pass it // by value to all the children (who would otherwise re-validate it repeatedly). - self.metadata = self.metadata.validate()?; + self.metadata = self.metadata.with_full_validation()?; // Iterate over all string keys in this dictionary in order to prove that the offset // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. - validate_fallible_iterator(self.iter_try_impl())?; + validate_fallible_iterator(self.iter_try())?; self.validated = true; } Ok(self) @@ -236,20 +236,24 @@ impl<'m, 'v> VariantObject<'m, 'v> { /// field IDs). The latter can only happen when working with an unvalidated object produced by /// [`Self::new`]. pub fn field(&self, i: usize) -> Option> { - (i < self.len()).then(|| self.try_field_impl(i).expect("Invalid object field value")) + (i < self.len()).then(|| { + self.try_field_with_shallow_validation(i) + .expect("Invalid object field value") + }) } /// Fallible version of `field`. Returns field value by index, capturing validation errors pub fn try_field(&self, i: usize) -> Result, ArrowError> { - self.try_field_impl(i)?.validate() + self.try_field_with_shallow_validation(i)? + .with_full_validation() } // Attempts to retrieve the ith field value from the value region of the byte buffer; it // performs only basic (constant-cost) validation. - fn try_field_impl(&self, i: usize) -> Result, ArrowError> { + fn try_field_with_shallow_validation(&self, i: usize) -> Result, ArrowError> { let value_bytes = slice_from_slice(self.value, self.first_value_byte..)?; let value_bytes = slice_from_slice(value_bytes, self.get_offset(i)?..)?; - Variant::try_new_with_metadata(self.metadata, value_bytes) + Variant::try_new_with_metadata_and_shallow_validation(self.metadata, value_bytes) } // Attempts to retrieve the ith offset from the field offset region of the byte buffer. @@ -281,7 +285,7 @@ impl<'m, 'v> VariantObject<'m, 'v> { /// Returns an iterator of (name, value) pairs over the fields of this object. pub fn iter(&self) -> impl Iterator)> + '_ { - self.iter_try_impl() + self.iter_try_with_shallow_validation() .map(|result| result.expect("Invalid variant object field value")) } @@ -289,18 +293,21 @@ impl<'m, 'v> VariantObject<'m, 'v> { pub fn iter_try( &self, ) -> impl Iterator), ArrowError>> + '_ { - self.iter_try_impl().map(|result| { + self.iter_try_with_shallow_validation().map(|result| { let (name, value) = result?; - Ok((name, value.validate()?)) + Ok((name, value.with_full_validation()?)) }) } // Fallible iteration over the fields of this object that performs only shallow (constant-cost) // validation of field values. - fn iter_try_impl( + fn iter_try_with_shallow_validation( &self, ) -> impl Iterator), ArrowError>> + '_ { - (0..self.num_elements).map(move |i| Ok((self.try_field_name(i)?, self.try_field(i)?))) + (0..self.num_elements).map(move |i| { + let field = self.try_field_with_shallow_validation(i)?; + Ok((self.try_field_name(i)?, field)) + }) } /// Returns the value of the field with the specified name, if any. diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index c95d81a3e904..e37172a7d568 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -417,7 +417,7 @@ fn test_validation_workflow(metadata: &[u8], value: &[u8]) { }; // Step 2: Try validation - let validation_result = std::panic::catch_unwind(|| variant.clone().validate()); + let validation_result = std::panic::catch_unwind(|| variant.clone().with_full_validation()); match validation_result { Ok(Ok(validated)) => { @@ -515,7 +515,7 @@ fn test_validation_workflow_simple(metadata: &[u8], value: &[u8]) { }; // Step 2: Try validation - let validation_result = std::panic::catch_unwind(|| variant.clone().validate()); + let validation_result = std::panic::catch_unwind(|| variant.clone().with_full_validation()); match validation_result { Ok(Ok(validated)) => {