1 change: 0 additions & 1 deletion parquet-variant-json/Cargo.toml
@@ -46,4 +46,3 @@ name = "parquet_variant_json"
bench = false

[dev-dependencies]

4 changes: 4 additions & 0 deletions parquet-variant/Cargo.toml
@@ -55,3 +55,7 @@ rand = { version = "0.9", default-features = false, features = [
[[bench]]
name = "variant_builder"
harness = false

[[bench]]
name = "variant_validation"
harness = false
138 changes: 138 additions & 0 deletions parquet-variant/benches/variant_validation.rs
@@ -0,0 +1,138 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

extern crate parquet_variant;

use criterion::*;

use parquet_variant::{Variant, VariantBuilder};

fn generate_large_object() -> (Vec<u8>, Vec<u8>) {
    // 126 elements (keys: 000-125) - each element is an object of 126 elements (keys: 125-250) -
    // each element a list of numbers from 0-127
    let mut variant_builder = VariantBuilder::new();
    let mut outer_object = variant_builder.new_object();

    for i in 0..=125 {
        let key = format!("{i:03}");
        let mut inner_object = outer_object.new_object(&key);

        for j in 125..=250 {
            let inner_key = format!("{j}");
            let mut list_builder = inner_object.new_list(&inner_key);

            for k in 0..=127 {
                list_builder.append_value(Variant::Int8(k));
            }
            list_builder.finish();
        }
        inner_object.finish().unwrap();
    }
    outer_object.finish().unwrap();

    variant_builder.finish()
}

fn generate_complex_object() -> (Vec<u8>, Vec<u8>) {
    let mut variant_builder = VariantBuilder::new();
    let mut object_builder = variant_builder.new_object();
    let mut inner_list_builder = object_builder.new_list("booleans");

    for _ in 0..1024 {
        inner_list_builder.append_value(Variant::BooleanTrue);
    }

    inner_list_builder.finish();
    object_builder.insert("null", Variant::Null);

    let mut inner_list_builder = object_builder.new_list("numbers");
    for _ in 0..1024 {
        inner_list_builder.append_value(Variant::Int8(4));
        inner_list_builder.append_value(Variant::Double(-3e0));
        inner_list_builder.append_value(Variant::Double(1001e-3));
    }
    inner_list_builder.finish();

    let mut inner_object_builder = object_builder.new_object("nested");

    for i in 0..2048 {
        let key = format!("{}", 1024 - i);
        inner_object_builder.insert(&key, i);
    }
    inner_object_builder.finish().unwrap();

    object_builder.finish().unwrap();

    variant_builder.finish()
}

fn generate_large_nested_list() -> (Vec<u8>, Vec<u8>) {
    let mut variant_builder = VariantBuilder::new();
    let mut list_builder = variant_builder.new_list();

    for _ in 0..255 {
        let mut list_builder_inner = list_builder.new_list();
        for _ in 0..120 {
            list_builder_inner.append_value(Variant::Null);

            let mut list_builder_inner_inner = list_builder_inner.new_list();
            for _ in 0..20 {
                list_builder_inner_inner.append_value(Variant::Double(-3e0));
            }

            list_builder_inner_inner.finish();
        }
        list_builder_inner.finish();
    }
    list_builder.finish();
    variant_builder.finish()
}

// Generates a large object and performs full validation
fn bench_validate_large_object(c: &mut Criterion) {
    let (metadata, value) = generate_large_object();
    c.bench_function("bench_validate_large_object", |b| {
        b.iter(|| {
            std::hint::black_box(Variant::try_new(&metadata, &value).unwrap());
        })
    });
}

fn bench_validate_complex_object(c: &mut Criterion) {
    let (metadata, value) = generate_complex_object();
    c.bench_function("bench_validate_complex_object", |b| {
        b.iter(|| {
            std::hint::black_box(Variant::try_new(&metadata, &value).unwrap());
        })
    });
}

fn bench_validate_large_nested_list(c: &mut Criterion) {
    let (metadata, value) = generate_large_nested_list();
    c.bench_function("bench_validate_large_nested_list", |b| {
        b.iter(|| {
            std::hint::black_box(Variant::try_new(&metadata, &value).unwrap());
        })
    });
}

criterion_group!(
    benches,
    bench_validate_large_object,
    bench_validate_complex_object,
    bench_validate_large_nested_list
);

criterion_main!(benches);
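Note (not part of the PR): the benchmarks above all follow the same pattern: build a variant with VariantBuilder, then measure Variant::try_new, which performs full validation. A minimal, scaled-down sketch of that round trip, using only the APIs already shown above (names and sizes here are illustrative):

use parquet_variant::{Variant, VariantBuilder};

fn main() {
    // Build a tiny object: {"nums": [0, 1, 2]}
    let mut builder = VariantBuilder::new();
    let mut obj = builder.new_object();
    let mut list = obj.new_list("nums");
    for i in 0..3 {
        list.append_value(Variant::Int8(i));
    }
    list.finish();
    obj.finish().unwrap();
    let (metadata, value) = builder.finish();

    // Full validation of the serialized metadata and value buffers.
    let _variant = Variant::try_new(&metadata, &value).unwrap();
}

With harness = false in Cargo.toml, Criterion supplies the benchmark main; something like cargo bench -p parquet-variant --bench variant_validation should run these.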
19 changes: 19 additions & 0 deletions parquet-variant/src/decoder.rs
@@ -200,6 +200,25 @@ impl OffsetSizeBytes {
}
}

/// Converts a byte buffer to offset values based on the specific offset size
pub(crate) fn map_bytes_to_offsets(
    buffer: &[u8],
    offset_size: OffsetSizeBytes,
) -> impl Iterator<Item = usize> + use<'_> {
    buffer
        .chunks_exact(offset_size as usize)
        .map(move |chunk| match offset_size {
            OffsetSizeBytes::One => chunk[0] as usize,
            OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], chunk[1]]) as usize,
            OffsetSizeBytes::Three => {
                u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) as usize
            }
            OffsetSizeBytes::Four => {
                u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]) as usize
            }
        })
}

/// Extract the primitive type from a Variant value-metadata byte
pub(crate) fn get_primitive_type(metadata: u8) -> Result<VariantPrimitiveType, ArrowError> {
// last 6 bits contain the primitive-type, see spec
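Aside (not from the diff): map_bytes_to_offsets is pub(crate), so it cannot be exercised from outside the crate, but the decoding it performs is plain fixed-width little-endian integers. A standalone sketch with a hypothetical decode_offsets helper:

fn decode_offsets(buffer: &[u8], offset_size: usize) -> Vec<usize> {
    // Same idea as map_bytes_to_offsets: walk the buffer in offset_size-wide
    // chunks and decode each chunk as a little-endian integer.
    buffer
        .chunks_exact(offset_size)
        .map(|chunk| {
            let mut le = [0u8; 4];
            le[..offset_size].copy_from_slice(chunk);
            u32::from_le_bytes(le) as usize
        })
        .collect()
}

fn main() {
    // Four 2-byte offsets (0, 3, 6, 10) packed little-endian.
    let bytes = [0u8, 0, 3, 0, 6, 0, 10, 0];
    assert_eq!(decode_offsets(&bytes, 2), vec![0, 3, 6, 10]);
}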
8 changes: 0 additions & 8 deletions parquet-variant/src/utils.rs
@@ -122,11 +122,3 @@ where

    Some(Err(start))
}

/// Attempts to prove a fallible iterator is actually infallible in practice, by consuming every
/// element and returning the first error (if any).
pub(crate) fn validate_fallible_iterator<T, E>(
    mut it: impl Iterator<Item = Result<T, E>>,
) -> Result<(), E> {
    it.find(Result::is_err).transpose().map(|_| ())
}
35 changes: 30 additions & 5 deletions parquet-variant/src/variant/list.rs
@@ -14,10 +14,9 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::decoder::OffsetSizeBytes;
use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
use crate::utils::{
    first_byte_from_slice, overflow_error, slice_from_slice, slice_from_slice_at_offset,
    validate_fallible_iterator,
};
use crate::variant::{Variant, VariantMetadata};

@@ -209,9 +208,35 @@ impl<'m, 'v> VariantList<'m, 'v> {
            // by value to all the children (who would otherwise re-validate it repeatedly).
            self.metadata = self.metadata.with_full_validation()?;

            // Iterate over all string keys in this dictionary in order to prove that the offset
            // array is valid, all offsets are in bounds, and all string bytes are valid utf-8.
            validate_fallible_iterator(self.iter_try())?;
            let offset_buffer = slice_from_slice(
Contributor:
It seems like there is a lot of duplicated code that does about the same thing:
check that the offsets in a slice are all

  1. sorted (potentially)
  2. less than some max offset
  3. pointing into a valid sub-variant

I wonder if it is possible to factor it all out into a function like

fn validate_offsets(offset_buffer: &[u8], num_offsets: usize, offset_size: OffsetSize, max_valid_offset: usize) {
    ...
}

Or something 🤔

Contributor Author (@friendlymatthew, Jul 11, 2025):
I plan on following up with a PR that removes this check since we can validate the monotonicity of offsets when accessing variants

                self.value,
                self.header.first_offset_byte()..self.first_value_byte,
            )?;

            let offsets =
                map_bytes_to_offsets(offset_buffer, self.header.offset_size).collect::<Vec<_>>();

            // Validate offsets are in-bounds and monotonically increasing.
            // Since shallow verification checks whether the first and last offsets are in-bounds,
            // we can also verify all offsets are in-bounds by checking if offsets are monotonically increasing.
            let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
            if !are_offsets_monotonic {
                return Err(ArrowError::InvalidArgumentError(
                    "offsets are not monotonically increasing".to_string(),
                ));
            }
Comment on lines +222 to +227
Contributor:
We don't need this check -- the loop below does offsets[i-1]..offsets[i] for every i in 1..offsets.len(), and any non-monotonic offset would cause slice_from_slice to return an Err like:

Tried to extract byte(s) 42..25 from 100-byte buffer

Is it worth making an extra pass over the offsets (which requires materializing them) just to have a slightly nicer error message?
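(Editorial aside, not part of the review thread: a small illustration of the point above. Assuming slice_from_slice wraps a checked slice lookup, as its error message suggests, the reversed range produced by a non-monotonic offset pair is rejected rather than silently accepted:)

fn main() {
    let buf = [0u8; 100];
    // A non-monotonic offset pair yields a reversed range; checked slice
    // indexing returns None for it, so validation still surfaces an error.
    assert!(buf.get(42..25).is_none());
    // A monotonically increasing, in-bounds pair slices normally.
    assert_eq!(buf.get(25..42).map(|s| s.len()), Some(17));
}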

Contributor:
The simplified version would be something like:

let mut offsets = map_bytes_to_offsets(offset_buffer, self.header.offset_size);
if let Some(mut start) = offsets.next() {
    for end in offsets {
          ... validate start..end ...
        start = end;
    }
}


            let value_buffer = slice_from_slice(self.value, self.first_value_byte..)?;

            // Validate whether values are valid variant objects
            for i in 1..offsets.len() {
                let start_offset = offsets[i - 1];
                let end_offset = offsets[i];

                let value_bytes = slice_from_slice(value_buffer, start_offset..end_offset)?;
                Variant::try_new_with_metadata(self.metadata, value_bytes)?;
            }

            self.validated = true;
        }
        Ok(self)
87 changes: 79 additions & 8 deletions parquet-variant/src/variant/metadata.rs
@@ -15,11 +15,8 @@
// specific language governing permissions and limitations
// under the License.

use crate::decoder::OffsetSizeBytes;
use crate::utils::{
    first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice,
    validate_fallible_iterator,
};
use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
use crate::utils::{first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice};

use arrow_schema::ArrowError;

@@ -228,9 +225,47 @@ impl<'m> VariantMetadata<'m> {
    /// [validation]: Self#Validation
    pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
        if !self.validated {
            // Iterate over all string keys in this dictionary in order to prove that the offset
            // array is valid, all offsets are in bounds, and all string bytes are valid utf-8.
            validate_fallible_iterator(self.iter_try())?;
            let offset_bytes = slice_from_slice(
                self.bytes,
                self.header.first_offset_byte()..self.first_value_byte,
            )?;

            let offsets =
                map_bytes_to_offsets(offset_bytes, self.header.offset_size).collect::<Vec<_>>();

            // Validate offsets are in-bounds and monotonically increasing.
            // Since shallow validation ensures the first and last offsets are in bounds, we can also verify all offsets
            // are in-bounds by checking if offsets are monotonically increasing.
            let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
            if !are_offsets_monotonic {
                return Err(ArrowError::InvalidArgumentError(
                    "offsets not monotonically increasing".to_string(),
                ));
            }
Comment on lines +239 to +244
Contributor:
Again, I'm not sure it's worth paying an extra pass for a monotonicity check, when the slicing operations that follow will naturally fail in the presence of non-monotonic offsets?

let offsets = map_bytes_to_offsets(offset_bytes, self.header.offset_size);
if let Some(first_offset) = offsets.next() {
    // Create an iterator over the strings. We must consume the iterator to validate it.
    let strings = offsets.scan(first_offset, |start, end| {
        let s = slice_from_slice(value_buffer, start..end);
        *start = end;
        Some(s)
    });
    if self.header.is_sorted {
        // verify the strings are sorted and unique, in addition to being individually valid
        if let Some(mut a) = strings.next().transpose()? {
            for b in strings {
                let b = b?;
                if a >= b {
                    return Err(... dictionary values are not unique and ordered ...);
                }
                a = b;
            }
        }
    } else {
        // Just verify the strings are all individually valid
        validate_fallible_iterator(strings)?;
    }
}

This time, an iterator scan works well because we naturally unpack the errors later.

Note: The above does require making slice_from_slice fully generic:

pub(crate) fn slice_from_slice<T, I: SliceIndex<[T]> + Clone + Debug>(
    bytes: &[T],
    index: I,
) -> Result<&I::Output, ArrowError> {


            // Verify the string values in the dictionary are UTF-8 encoded strings.
            let value_buffer =
                string_from_slice(self.bytes, 0, self.first_value_byte..self.bytes.len())?;

            if self.header.is_sorted {
Member:
Hmm, is this a new check? Seems the existing VariantMetadata::with_full_validation doesn't have this check?

Contributor Author:
Yeah, the current validation was written before we supported sorted dictionaries

                // Validate the dictionary values are unique and lexicographically sorted
                let are_dictionary_values_unique_and_sorted = (1..offsets.len())
                    .map(|i| {
                        let field_range = offsets[i - 1]..offsets[i];
                        value_buffer.get(field_range)
                    })
                    .is_sorted_by(|a, b| match (a, b) {
                        (Some(a), Some(b)) => a < b,
Member (@viirya, Jul 10, 2025):
Hmm, if there are values like ..., None, Some(a), None, Some(b)..., is it possible that an unordered case like a >= b cannot be detected?

                        _ => false,
                    });

                if !are_dictionary_values_unique_and_sorted {
                    return Err(ArrowError::InvalidArgumentError(
                        "dictionary values are not unique and ordered".to_string(),
                    ));
                }
            }

            self.validated = true;
        }
        Ok(self)
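(Editorial aside on the is_sorted_by question above, not from the PR: with std's is_sorted_by, the closure must return true for every adjacent pair, so a pair containing a None, just like a pair with a >= b, makes the whole check report "unsorted" and the dictionary is rejected. A minimal sketch of the comparator used above:)

fn main() {
    let values = [Some("apple"), Some("banana"), None, Some("cherry")];
    let sorted = values.is_sorted_by(|a, b| match (a, b) {
        (Some(a), Some(b)) => a < b,
        _ => false,
    });
    // The (Some("banana"), None) pair makes the closure return false,
    // so the whole sequence is reported as unsorted.
    assert!(!sorted);
}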
@@ -399,6 +434,42 @@ mod tests {
);
}

    #[test]
    fn try_new_fails_non_monotonic2() {
        // This test case checks whether offsets are monotonic in the full validation logic.

        // 'cat', 'dog', 'lamb', 'eel'
        let bytes = &[
            0b0000_0001, // header, offset_size_minus_one=0 and version=1
            4,           // dictionary_size
            0x00,
            0x02,
            0x01, // Doesn't increase monotonically
            0x10,
            13,
            b'c',
            b'a',
            b't',
            b'd',
            b'o',
            b'g',
            b'l',
            b'a',
            b'm',
            b'b',
            b'e',
            b'e',
            b'l',
        ];

        let err = VariantMetadata::try_new(bytes).unwrap_err();

        assert!(
            matches!(err, ArrowError::InvalidArgumentError(_)),
            "unexpected error: {err:?}"
        );
    }

    #[test]
    fn try_new_truncated_offsets_inline() {
        // Missing final offset