Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion parquet-variant/benches/variant_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ fn bench_validation_validated_vs_unvalidated(c: &mut Criterion) {

b.iter(|| {
for variant in &unvalidated {
let validated = variant.clone().validate().unwrap();
let validated = variant.clone().with_full_validation().unwrap();
hint::black_box(validated);
}
})
Expand Down
45 changes: 25 additions & 20 deletions parquet-variant/src/variant.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use std::ops::Deref;

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
Expand All @@ -16,6 +14,7 @@ use std::ops::Deref;
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

pub use self::decimal::{VariantDecimal16, VariantDecimal4, VariantDecimal8};
pub use self::list::VariantList;
pub use self::metadata::VariantMetadata;
Expand All @@ -24,6 +23,7 @@ use crate::decoder::{
self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType,
};
use crate::utils::{first_byte_from_slice, slice_from_slice};
use std::ops::Deref;

use arrow_schema::ArrowError;
use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc};
Expand Down Expand Up @@ -184,15 +184,15 @@ impl Deref for ShortString<'_> {
/// Every instance of variant is either _valid_ or _invalid_. depending on whether the
/// underlying bytes are a valid encoding of a variant value (see below).
///
/// Instances produced by [`Self::try_new`], [`Self::try_new_with_metadata`], or [`Self::validate`]
/// Instances produced by [`Self::try_new`], [`Self::try_new_with_metadata`], or [`Self::with_full_validation`]
/// are fully _validated_. They always contain _valid_ data, and infallible accesses such as
/// iteration and indexing are panic-free. The validation cost is `O(m + v)` where `m` and
/// `v` are the number of bytes in the metadata and value buffers, respectively.
///
/// Instances produced by [`Self::new`] and [`Self::new_with_metadata`] are _unvalidated_ and so
/// they may contain either _valid_ or _invalid_ data. Infallible accesses to variant objects and
/// arrays, such as iteration and indexing will panic if the underlying bytes are _invalid_, and
/// fallible alternatives are provided as panic-free alternatives. [`Self::validate`] can also be
/// fallible alternatives are provided as panic-free alternatives. [`Self::with_full_validation`] can also be
/// used to _validate_ an _unvalidated_ instance, if desired.
///
/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
Expand Down Expand Up @@ -297,8 +297,10 @@ impl<'m, 'v> Variant<'m, 'v> {
///
/// [unvalidated]: Self#Validation
pub fn new(metadata: &'m [u8], value: &'v [u8]) -> Self {
let metadata = VariantMetadata::try_new_impl(metadata).expect("Invalid variant metadata");
Self::try_new_with_metadata_impl(metadata, value).expect("Invalid variant data")
let metadata = VariantMetadata::try_new_with_shallow_validation(metadata)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a very nice name change

.expect("Invalid variant metadata");
Self::try_new_with_metadata_and_shallow_validation(metadata, value)
.expect("Invalid variant data")
}

/// Create a new variant with existing metadata.
Expand All @@ -323,18 +325,19 @@ impl<'m, 'v> Variant<'m, 'v> {
metadata: VariantMetadata<'m>,
value: &'v [u8],
) -> Result<Self, ArrowError> {
Self::try_new_with_metadata_impl(metadata, value)?.validate()
Self::try_new_with_metadata_and_shallow_validation(metadata, value)?.with_full_validation()
}

/// Similar to [`Self::try_new_with_metadata`], but [unvalidated].
///
/// [unvalidated]: Self#Validation
pub fn new_with_metadata(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self {
Self::try_new_with_metadata_impl(metadata, value).expect("Invalid variant")
Self::try_new_with_metadata_and_shallow_validation(metadata, value)
.expect("Invalid variant")
}

// The actual constructor, which only performs shallow (constant-time) validation.
fn try_new_with_metadata_impl(
fn try_new_with_metadata_and_shallow_validation(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good thing this isn't part of the public API... quite a mouthful!

metadata: VariantMetadata<'m>,
value: &'v [u8],
) -> Result<Self, ArrowError> {
Expand Down Expand Up @@ -382,21 +385,23 @@ impl<'m, 'v> Variant<'m, 'v> {
VariantBasicType::ShortString => {
Variant::ShortString(decoder::decode_short_string(value_metadata, value_data)?)
}
VariantBasicType::Object => {
Variant::Object(VariantObject::try_new_impl(metadata, value)?)
}
VariantBasicType::Array => Variant::List(VariantList::try_new_impl(metadata, value)?),
VariantBasicType::Object => Variant::Object(
VariantObject::try_new_with_shallow_validation(metadata, value)?,
),
VariantBasicType::Array => Variant::List(VariantList::try_new_with_shallow_validation(
metadata, value,
Copy link
Contributor

@scovich scovich Jul 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

aside: I don't think fmt usually likes multiple args on a single line? (I sure sometimes wish it did, applying the same rules for splitting that it does for any other complex "expression", i.e. too much nesting or more than ~80 chars would cause line splits)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah I don't know -- cargo fmt doesn't reformat this line for me locally (or on CI) 🤷 🤔

Copy link
Contributor

@scovich scovich Jul 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think there must have been a fmt rules update, because I'm also seeing this now locally.
Which IMO is a really nice step forward for balancing readability vs, newline bloat 🎉

)?),
};
Ok(new_self)
}

/// True if this variant instance has already been [validated].
///
/// [validated]: Self#Validation
pub fn is_validated(&self) -> bool {
pub fn is_fully_validated(&self) -> bool {
match self {
Variant::List(list) => list.is_validated(),
Variant::Object(obj) => obj.is_validated(),
Variant::List(list) => list.is_fully_validated(),
Variant::Object(obj) => obj.is_fully_validated(),
_ => true,
}
}
Expand All @@ -407,16 +412,16 @@ impl<'m, 'v> Variant<'m, 'v> {
/// Variant leaf values are always valid by construction, but [objects] and [arrays] can be
/// constructed in unvalidated (and potentially invalid) state.
///
/// If [`Self::is_validated`] is true, validation is a no-op. Otherwise, the cost is `O(m + v)`
/// If [`Self::is_fully_validated`] is true, validation is a no-op. Otherwise, the cost is `O(m + v)`
/// where `m` and `v` are the sizes of metadata and value buffers, respectively.
///
/// [objects]: VariantObject#Validation
/// [arrays]: VariantList#Validation
pub fn validate(self) -> Result<Self, ArrowError> {
pub fn with_full_validation(self) -> Result<Self, ArrowError> {
use Variant::*;
match self {
List(list) => list.validate().map(List),
Object(obj) => obj.validate().map(Object),
List(list) => list.with_full_validation().map(List),
Object(obj) => obj.with_full_validation().map(Object),
_ => Ok(self),
}
}
Expand Down
63 changes: 33 additions & 30 deletions parquet-variant/src/variant/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,14 +82,14 @@ impl VariantListHeader {
/// Every instance of variant list is either _valid_ or _invalid_. depending on whether the
/// underlying bytes are a valid encoding of a variant array (see below).
///
/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully _validated_. They always
/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully _validated_. They always
/// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The
/// validation cost is linear in the number of underlying bytes.
///
/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or
/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying
/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are
/// provided as panic-free alternatives. [`Self::validate`] can also be used to _validate_ an
/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an
/// _unvalidated_ instance, if desired.
///
/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
Expand Down Expand Up @@ -136,18 +136,18 @@ impl<'m, 'v> VariantList<'m, 'v> {
/// This constructor verifies that `value` points to a valid variant array value. In particular,
/// that all offsets are in-bounds and point to valid (recursively validated) objects.
pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result<Self, ArrowError> {
Self::try_new_impl(metadata, value)?.validate()
Self::try_new_with_shallow_validation(metadata, value)?.with_full_validation()
}

pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self {
Self::try_new_impl(metadata, value).expect("Invalid variant list value")
Self::try_new_with_shallow_validation(metadata, value).expect("Invalid variant list value")
}

/// Attempts to interpet `metadata` and `value` as a variant array, performing only basic
/// (constant-cost) [validation].
///
/// [validation]: Self#Validation
pub(crate) fn try_new_impl(
pub(crate) fn try_new_with_shallow_validation(
metadata: VariantMetadata<'m>,
value: &'v [u8],
) -> Result<Self, ArrowError> {
Expand Down Expand Up @@ -196,18 +196,18 @@ impl<'m, 'v> VariantList<'m, 'v> {
/// True if this instance is fully [validated] for panic-free infallible accesses.
///
/// [validated]: Self#Validation
pub fn is_validated(&self) -> bool {
pub fn is_fully_validated(&self) -> bool {
self.validated
}

/// Performs a full [validation] of this variant array and returns the result.
///
/// [validation]: Self#Validation
pub fn validate(mut self) -> Result<Self, ArrowError> {
pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
if !self.validated {
// Validate the metadata dictionary first, if not already validated, because we pass it
// by value to all the children (who would otherwise re-validate it repeatedly).
self.metadata = self.metadata.validate()?;
self.metadata = self.metadata.with_full_validation()?;

// Iterate over all string keys in this dictionary in order to prove that the offset
// array is valid, all offsets are in bounds, and all string bytes are valid utf-8.
Expand All @@ -232,52 +232,55 @@ impl<'m, 'v> VariantList<'m, 'v> {
/// [invalid]: Self#Validation
pub fn get(&self, index: usize) -> Option<Variant<'m, 'v>> {
(index < self.num_elements).then(|| {
self.try_get_impl(index)
.and_then(Variant::validate)
self.try_get_with_shallow_validation(index)
.expect("Invalid variant array element")
})
}

/// Fallible version of `get`. Returns element by index, capturing validation errors
pub fn try_get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
self.try_get_impl(index)?.validate()
self.try_get_with_shallow_validation(index)?
.with_full_validation()
}

/// Fallible iteration over the elements of this list.
pub fn iter_try(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ {
self.iter_try_impl().map(|result| result?.validate())
}

// Fallible iteration that only performs basic (constant-time) validation.
fn iter_try_impl(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ {
(0..self.len()).map(move |i| self.try_get_impl(i))
// Fallible version of `get`, performing only basic (constant-time) validation.
fn try_get_with_shallow_validation(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
// Fetch the value bytes between the two offsets for this index, from the value array region
// of the byte buffer
let byte_range = self.get_offset(index)?..self.get_offset(index + 1)?;
let value_bytes =
slice_from_slice_at_offset(self.value, self.first_value_byte, byte_range)?;
Variant::try_new_with_metadata_and_shallow_validation(self.metadata, value_bytes)
}

/// Iterates over the values of this list. When working with [unvalidated] input, consider
/// [`Self::iter_try`] to avoid panics due to invalid data.
///
/// [unvalidated]: Self#Validation
pub fn iter(&self) -> impl Iterator<Item = Variant<'m, 'v>> + '_ {
self.iter_try_impl()
self.iter_try_with_shallow_validation()
.map(|result| result.expect("Invalid variant list entry"))
}

/// Fallible iteration over the elements of this list.
pub fn iter_try(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ {
self.iter_try_with_shallow_validation()
.map(|result| result?.with_full_validation())
}

// Fallible iteration that only performs basic (constant-time) validation.
fn iter_try_with_shallow_validation(
&self,
) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ {
(0..self.len()).map(move |i| self.try_get_with_shallow_validation(i))
}

// Attempts to retrieve the ith offset from the offset array region of the byte buffer.
fn get_offset(&self, index: usize) -> Result<usize, ArrowError> {
let byte_range = self.header.first_offset_byte()..self.first_value_byte;
let offset_bytes = slice_from_slice(self.value, byte_range)?;
self.header.offset_size.unpack_usize(offset_bytes, index)
}

// Fallible version of `get`, performing only basic (constant-time) validation.
fn try_get_impl(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

note: moving and changing code in the same commit makes review a lot more difficult, and refactor + bug fix in the same commit suffers a similar issue. I know this code inside out and it's still hard to be sure I'm catching everything -- imagine the burden on a reviewer who isn't as familiar with the code?

In the future, you might consider curating a stack of three commits for changes like this:

  • code movement only (big, but trivial to review because it doesn't change anything)
  • function renames only (big, but purely mechanical and thus easy to review)
  • the actual fix (tiny, and therefore easy to review)

For big changes, some of those commits might even be their own PR... but a commit stack is often good enough, if called out so reviewers know to look at the commits in isolation)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Plus, in my experience, it benefits the author as well. I can't count the number of times I've discovered unnecessarily ugliness, incomplete refactorings, test gaps, and even bugs (by inspection) when I did the work to tease apart a big complicated change into a stack of commits. A bit more work up front, but higher quality code with review stamps landing faster (sometimes a lot faster).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I 100% agree with @scovich here

Also, If you have a commit that just moves code around, please feel free to put it up and tag me aggressively. I'll try and get it merged quicky (as @scovich says, they are very easy to review)

// Fetch the value bytes between the two offsets for this index, from the value array region
// of the byte buffer
let byte_range = self.get_offset(index)?..self.get_offset(index + 1)?;
let value_bytes =
slice_from_slice_at_offset(self.value, self.first_value_byte, byte_range)?;
Variant::try_new_with_metadata(self.metadata, value_bytes)
}
}

#[cfg(test)]
Expand Down
14 changes: 7 additions & 7 deletions parquet-variant/src/variant/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,14 @@ impl VariantMetadataHeader {
/// Every instance of variant metadata is either _valid_ or _invalid_. depending on whether the
/// underlying bytes are a valid encoding of variant metadata (see below).
///
/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully _validated_. They always
/// Instances produced by [`Self::try_new`] or [`Self::with_full_validation`] are fully _validated_. They always
/// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The
/// validation cost is linear in the number of underlying bytes.
///
/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or
/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying
/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are
/// provided as panic-free alternatives. [`Self::validate`] can also be used to _validate_ an
/// provided as panic-free alternatives. [`Self::with_full_validation`] can also be used to _validate_ an
/// _unvalidated_ instance, if desired.
///
/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
Expand Down Expand Up @@ -143,7 +143,7 @@ impl<'m> VariantMetadata<'m> {
///
/// [validation]: Self#Validation
pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
Self::try_new_impl(bytes)?.validate()
Self::try_new_with_shallow_validation(bytes)?.with_full_validation()
}

/// Interprets `bytes` as a variant metadata instance, without attempting to [validate] dictionary
Expand All @@ -157,11 +157,11 @@ impl<'m> VariantMetadata<'m> {
///
/// [validate]: Self#Validation
pub fn new(bytes: &'m [u8]) -> Self {
Self::try_new_impl(bytes).expect("Invalid variant metadata")
Self::try_new_with_shallow_validation(bytes).expect("Invalid variant metadata")
}

// The actual constructor, which performs only basic (constant-const) validation.
pub(crate) fn try_new_impl(bytes: &'m [u8]) -> Result<Self, ArrowError> {
pub(crate) fn try_new_with_shallow_validation(bytes: &'m [u8]) -> Result<Self, ArrowError> {
let header_byte = first_byte_from_slice(bytes)?;
let header = VariantMetadataHeader::try_new(header_byte)?;

Expand Down Expand Up @@ -209,14 +209,14 @@ impl<'m> VariantMetadata<'m> {
/// True if this instance is fully [validated] for panic-free infallible accesses.
///
/// [validated]: Self#Validation
pub fn is_validated(&self) -> bool {
pub fn is_fully_validated(&self) -> bool {
self.validated
}

/// Performs a full [validation] of this metadata dictionary and returns the result.
///
/// [validation]: Self#Validation
pub fn validate(mut self) -> Result<Self, ArrowError> {
pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
if !self.validated {
// Iterate over all string keys in this dictionary in order to prove that the offset
// array is valid, all offsets are in bounds, and all string bytes are valid utf-8.
Expand Down
Loading
Loading