diff --git a/parquet-variant-json/Cargo.toml b/parquet-variant-json/Cargo.toml
index 830a3c060011..86281e4ae98e 100644
--- a/parquet-variant-json/Cargo.toml
+++ b/parquet-variant-json/Cargo.toml
@@ -46,4 +46,3 @@ name = "parquet_variant_json"
 bench = false
 
 [dev-dependencies]
-
diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml
index 3edfbb76ed32..329399f9f655 100644
--- a/parquet-variant/Cargo.toml
+++ b/parquet-variant/Cargo.toml
@@ -55,3 +55,7 @@ rand = { version = "0.9", default-features = false, features = [
 [[bench]]
 name = "variant_builder"
 harness = false
+
+[[bench]]
+name = "variant_validation"
+harness = false
diff --git a/parquet-variant/benches/variant_validation.rs b/parquet-variant/benches/variant_validation.rs
new file mode 100644
index 000000000000..0ccc10117898
--- /dev/null
+++ b/parquet-variant/benches/variant_validation.rs
@@ -0,0 +1,138 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate parquet_variant;
+
+use criterion::*;
+
+use parquet_variant::{Variant, VariantBuilder};
+
+fn generate_large_object() -> (Vec<u8>, Vec<u8>) {
+    // 126 elements (keys: 000-125) - each element is an object of 126 elements (keys: 125-250) -
+    // each element a list of numbers from 0-127
+    let mut variant_builder = VariantBuilder::new();
+    let mut outer_object = variant_builder.new_object();
+
+    for i in 0..=125 {
+        let key = format!("{i:03}");
+        let mut inner_object = outer_object.new_object(&key);
+
+        for j in 125..=250 {
+            let inner_key = format!("{j}");
+            let mut list_builder = inner_object.new_list(&inner_key);
+
+            for k in 0..=127 {
+                list_builder.append_value(Variant::Int8(k));
+            }
+            list_builder.finish();
+        }
+        inner_object.finish().unwrap();
+    }
+    outer_object.finish().unwrap();
+
+    variant_builder.finish()
+}
+
+fn generate_complex_object() -> (Vec<u8>, Vec<u8>) {
+    let mut variant_builder = VariantBuilder::new();
+    let mut object_builder = variant_builder.new_object();
+    let mut inner_list_builder = object_builder.new_list("booleans");
+
+    for _ in 0..1024 {
+        inner_list_builder.append_value(Variant::BooleanTrue);
+    }
+
+    inner_list_builder.finish();
+    object_builder.insert("null", Variant::Null);
+    let mut inner_list_builder = object_builder.new_list("numbers");
+    for _ in 0..1024 {
+        inner_list_builder.append_value(Variant::Int8(4));
+        inner_list_builder.append_value(Variant::Double(-3e0));
+        inner_list_builder.append_value(Variant::Double(1001e-3));
+    }
+    inner_list_builder.finish();
+
+    let mut inner_object_builder = object_builder.new_object("nested");
+
+    for i in 0..2048 {
+        let key = format!("{}", 1024 - i);
+        inner_object_builder.insert(&key, i);
+    }
+    inner_object_builder.finish().unwrap();
+
+    object_builder.finish().unwrap();
+
+    variant_builder.finish()
+}
+
+fn generate_large_nested_list() -> (Vec<u8>, Vec<u8>) {
+    let mut variant_builder = VariantBuilder::new();
+    let mut list_builder = variant_builder.new_list();
+    for _ in 0..255 {
+        let mut list_builder_inner = list_builder.new_list();
+        for _ in 0..120 {
+            list_builder_inner.append_value(Variant::Null);
+
+            let mut list_builder_inner_inner = list_builder_inner.new_list();
+            for _ in 0..20 {
+                list_builder_inner_inner.append_value(Variant::Double(-3e0));
+            }
+
+            list_builder_inner_inner.finish();
+        }
+        list_builder_inner.finish();
+    }
+    list_builder.finish();
+    variant_builder.finish()
+}
+
+// Generates a large object and performs full validation
+fn bench_validate_large_object(c: &mut Criterion) {
+    let (metadata, value) = generate_large_object();
+    c.bench_function("bench_validate_large_object", |b| {
+        b.iter(|| {
+            std::hint::black_box(Variant::try_new(&metadata, &value).unwrap());
+        })
+    });
+}
+
+fn bench_validate_complex_object(c: &mut Criterion) {
+    let (metadata, value) = generate_complex_object();
+    c.bench_function("bench_validate_complex_object", |b| {
+        b.iter(|| {
+            std::hint::black_box(Variant::try_new(&metadata, &value).unwrap());
+        })
+    });
+}
+
+fn bench_validate_large_nested_list(c: &mut Criterion) {
+    let (metadata, value) = generate_large_nested_list();
+    c.bench_function("bench_validate_large_nested_list", |b| {
+        b.iter(|| {
+            std::hint::black_box(Variant::try_new(&metadata, &value).unwrap());
+        })
+    });
+}
+
+criterion_group!(
+    benches,
+    bench_validate_large_object,
+    bench_validate_complex_object,
+    bench_validate_large_nested_list
+);
+
+criterion_main!(benches);
diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs
index e419eca6ee3d..5a6aab43ff6d 100644
--- a/parquet-variant/src/decoder.rs
+++ b/parquet-variant/src/decoder.rs
@@ -200,6 +200,25 @@ impl OffsetSizeBytes {
     }
 }
 
+/// Converts a byte buffer to offset values based on the specific offset size
+pub(crate) fn map_bytes_to_offsets(
+    buffer: &[u8],
+    offset_size: OffsetSizeBytes,
+) -> impl Iterator<Item = usize> + use<'_> {
+    buffer
+        .chunks_exact(offset_size as usize)
+        .map(move |chunk| match offset_size {
+            OffsetSizeBytes::One => chunk[0] as usize,
+            OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], chunk[1]]) as usize,
+            OffsetSizeBytes::Three => {
+                u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) as usize
+            }
+            OffsetSizeBytes::Four => {
+                u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]) as usize
+            }
+        })
+}
+
 /// Extract the primitive type from a Variant value-metadata byte
 pub(crate) fn get_primitive_type(metadata: u8) -> Result<VariantPrimitiveType, ArrowError> {
     // last 6 bits contain the primitive-type, see spec
diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs
index 765ea04ae6ae..ef402064e956 100644
--- a/parquet-variant/src/utils.rs
+++ b/parquet-variant/src/utils.rs
@@ -122,11 +122,3 @@ where
 
     Some(Err(start))
 }
-
-/// Attempts to prove a fallible iterator is actually infallible in practice, by consuming every
-/// element and returning the first error (if any).
-pub(crate) fn validate_fallible_iterator<T, E>(
-    mut it: impl Iterator<Item = Result<T, E>>,
-) -> Result<(), E> {
-    it.find(Result::is_err).transpose().map(|_| ())
-}
diff --git a/parquet-variant/src/variant/list.rs b/parquet-variant/src/variant/list.rs
index 05ddf9b2b762..11122190b446 100644
--- a/parquet-variant/src/variant/list.rs
+++ b/parquet-variant/src/variant/list.rs
@@ -14,10 +14,9 @@
 // KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations
 // under the License.
-use crate::decoder::OffsetSizeBytes;
+use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
 use crate::utils::{
     first_byte_from_slice, overflow_error, slice_from_slice, slice_from_slice_at_offset,
-    validate_fallible_iterator,
 };
 use crate::variant::{Variant, VariantMetadata};
 
@@ -209,9 +208,35 @@ impl<'m, 'v> VariantList<'m, 'v> {
             // by value to all the children (who would otherwise re-validate it repeatedly).
             self.metadata = self.metadata.with_full_validation()?;
 
-            // Iterate over all string keys in this dictionary in order to prove that the offset
-            // array is valid, all offsets are in bounds, and all string bytes are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+            let offset_buffer = slice_from_slice(
+                self.value,
+                self.header.first_offset_byte()..self.first_value_byte,
+            )?;
+
+            let offsets =
+                map_bytes_to_offsets(offset_buffer, self.header.offset_size).collect::<Vec<_>>();
+
+            // Validate offsets are in-bounds and monotonically increasing.
+            // Since shallow validation already checks that the first and last offsets are in-bounds,
+            // monotonicity implies that every offset in between is in-bounds as well.
+            let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
+            if !are_offsets_monotonic {
+                return Err(ArrowError::InvalidArgumentError(
+                    "offsets are not monotonically increasing".to_string(),
+                ));
+            }
+
+            let value_buffer = slice_from_slice(self.value, self.first_value_byte..)?;
+
+            // Validate whether the elements are valid variant values
+            for i in 1..offsets.len() {
+                let start_offset = offsets[i - 1];
+                let end_offset = offsets[i];
+
+                let value_bytes = slice_from_slice(value_buffer, start_offset..end_offset)?;
+                Variant::try_new_with_metadata(self.metadata, value_bytes)?;
+            }
+
             self.validated = true;
         }
         Ok(self)
diff --git a/parquet-variant/src/variant/metadata.rs b/parquet-variant/src/variant/metadata.rs
index 0aad22ea7288..b50a76686996 100644
--- a/parquet-variant/src/variant/metadata.rs
+++ b/parquet-variant/src/variant/metadata.rs
@@ -15,11 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::decoder::OffsetSizeBytes;
-use crate::utils::{
-    first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice,
-    validate_fallible_iterator,
-};
+use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
+use crate::utils::{first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice};
 
 use arrow_schema::ArrowError;
 
@@ -228,9 +225,47 @@ impl<'m> VariantMetadata<'m> {
     /// [validation]: Self#Validation
     pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
         if !self.validated {
-            // Iterate over all string keys in this dictionary in order to prove that the offset
-            // array is valid, all offsets are in bounds, and all string bytes are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+            let offset_bytes = slice_from_slice(
+                self.bytes,
+                self.header.first_offset_byte()..self.first_value_byte,
+            )?;
+
+            let offsets =
+                map_bytes_to_offsets(offset_bytes, self.header.offset_size).collect::<Vec<_>>();
+
+            // Validate offsets are in-bounds and monotonically increasing.
+            // Since shallow validation ensures the first and last offsets are in-bounds,
+            // monotonicity implies that every offset in between is in-bounds as well.
+            let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
+            if !are_offsets_monotonic {
+                return Err(ArrowError::InvalidArgumentError(
+                    "offsets not monotonically increasing".to_string(),
+                ));
+            }
+
+            // Verify the string values in the dictionary are UTF-8 encoded strings.
+            let value_buffer =
+                string_from_slice(self.bytes, 0, self.first_value_byte..self.bytes.len())?;
+
+            if self.header.is_sorted {
+                // Validate the dictionary values are unique and lexicographically sorted
+                let are_dictionary_values_unique_and_sorted = (1..offsets.len())
+                    .map(|i| {
+                        let field_range = offsets[i - 1]..offsets[i];
+                        value_buffer.get(field_range)
+                    })
+                    .is_sorted_by(|a, b| match (a, b) {
+                        (Some(a), Some(b)) => a < b,
+                        _ => false,
+                    });
+
+                if !are_dictionary_values_unique_and_sorted {
+                    return Err(ArrowError::InvalidArgumentError(
+                        "dictionary values are not unique and ordered".to_string(),
+                    ));
+                }
+            }
+
             self.validated = true;
         }
         Ok(self)
@@ -399,6 +434,42 @@ mod tests {
         );
     }
 
+    #[test]
+    fn try_new_fails_non_monotonic2() {
+        // This test case checks whether offsets are monotonic in the full validation logic.
+
+        // 'cat', 'dog', 'lamb', 'eel'
+        let bytes = &[
+            0b0000_0001, // header: offset_size_minus_one=0 and version=1
+            4,           // dictionary_size
+            0x00,
+            0x02,
+            0x01, // does not increase monotonically
+            0x10,
+            13,
+            b'c',
+            b'a',
+            b't',
+            b'd',
+            b'o',
+            b'g',
+            b'l',
+            b'a',
+            b'm',
+            b'b',
+            b'e',
+            b'e',
+            b'l',
+        ];
+
+        let err = VariantMetadata::try_new(bytes).unwrap_err();
+
+        assert!(
+            matches!(err, ArrowError::InvalidArgumentError(_)),
+            "unexpected error: {err:?}"
+        );
+    }
+
     #[test]
     fn try_new_truncated_offsets_inline() {
         // Missing final offset
diff --git a/parquet-variant/src/variant/object.rs b/parquet-variant/src/variant/object.rs
index 5efca267af77..ea0c6fac0f13 100644
--- a/parquet-variant/src/variant/object.rs
+++ b/parquet-variant/src/variant/object.rs
@@ -14,10 +14,9 @@
 // KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations
 // under the License.
-use crate::decoder::OffsetSizeBytes;
+use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
 use crate::utils::{
     first_byte_from_slice, overflow_error, slice_from_slice, try_binary_search_range_by,
-    validate_fallible_iterator,
 };
 use crate::variant::{Variant, VariantMetadata};
 
@@ -210,9 +209,80 @@ impl<'m, 'v> VariantObject<'m, 'v> {
             // by value to all the children (who would otherwise re-validate it repeatedly).
             self.metadata = self.metadata.with_full_validation()?;
 
-            // Iterate over all string keys in this dictionary in order to prove that the offset
-            // array is valid, all offsets are in bounds, and all string bytes are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+            let field_id_buffer = slice_from_slice(
+                self.value,
+                self.header.field_ids_start_byte()..self.first_field_offset_byte,
+            )?;
+
+            let field_ids = map_bytes_to_offsets(field_id_buffer, self.header.field_id_size)
+                .collect::<Vec<_>>();
+
+            // Validate all field ids exist in the metadata dictionary and the corresponding field names are lexicographically sorted
+            if self.metadata.is_sorted() {
+                // Since the metadata dictionary has unique and sorted field names, this object's field names
+                // are lexicographically sorted iff its field ids are sorted
+                if !field_ids.is_sorted() {
+                    return Err(ArrowError::InvalidArgumentError(
+                        "field names not sorted".to_string(),
+                    ));
+                }
+
+                // Since field ids are sorted, if the last field id is smaller than the dictionary size,
+                // we also know all field ids are smaller than the dictionary size and in-bounds.
+                if let Some(&last_field_id) = field_ids.last() {
+                    if last_field_id >= self.metadata.dictionary_size() {
+                        return Err(ArrowError::InvalidArgumentError(
+                            "field id is not valid".to_string(),
+                        ));
+                    }
+                }
+            } else {
+                // The metadata dictionary can't guarantee uniqueness or sortedness, so we have to parse out the corresponding field names
+                // to check lexicographical order
+                let are_field_names_sorted = field_ids
+                    .iter()
+                    .map(|&i| self.metadata.get(i))
+                    .collect::<Result<Vec<_>, _>>()?
+                    .is_sorted();
+
+                if !are_field_names_sorted {
+                    return Err(ArrowError::InvalidArgumentError(
+                        "field names not sorted".to_string(),
+                    ));
+                }
+
+                // Since field ids are not guaranteed to be sorted, scan over all field ids
+                // and check that each field id is less than the dictionary size
+
+                let are_field_ids_in_bounds = field_ids
+                    .iter()
+                    .all(|&id| id < self.metadata.dictionary_size());
+
+                if !are_field_ids_in_bounds {
+                    return Err(ArrowError::InvalidArgumentError(
+                        "field id is not valid".to_string(),
+                    ));
+                }
+            }
+
+            // Validate whether the field values are valid variants
+            let field_offset_buffer = slice_from_slice(
+                self.value,
+                self.first_field_offset_byte..self.first_value_byte,
+            )?;
+            let num_offsets = field_offset_buffer.len() / self.header.field_offset_size as usize;
+
+            let value_buffer = slice_from_slice(self.value, self.first_value_byte..)?;
+
+            map_bytes_to_offsets(field_offset_buffer, self.header.field_offset_size)
+                .take(num_offsets.saturating_sub(1))
+                .try_for_each(|offset| {
+                    let value_bytes = slice_from_slice(value_buffer, offset..)?;
+                    Variant::try_new_with_metadata(self.metadata, value_bytes)?;
+
+                    Ok::<_, ArrowError>(())
+                })?;
+
             self.validated = true;
         }
         Ok(self)
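Two illustrative notes follow; neither is part of the patch itself.

First, a standalone sketch of the little-endian decoding that `map_bytes_to_offsets` performs. It inlines the same `chunks_exact` logic shown in the decoder.rs hunk above, since the helper itself is `pub(crate)` and cannot be called from outside the crate:

```rust
fn main() {
    // Decode 2-byte little-endian offsets, mirroring the OffsetSizeBytes::Two
    // arm of map_bytes_to_offsets: each chunk [lo, hi] becomes lo + 256 * hi.
    let buffer = [0x00u8, 0x00, 0x05, 0x00, 0x2A, 0x01];
    let offsets: Vec<usize> = buffer
        .chunks_exact(2)
        .map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]]) as usize)
        .collect();
    assert_eq!(offsets, vec![0, 5, 298]); // 0x012A == 298

    // A strictly increasing sequence like this one is exactly what the new
    // monotonicity checks in list.rs and metadata.rs accept.
    assert!(offsets.is_sorted_by(|a, b| a < b));
}
```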
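Second, a minimal sketch of how the rewritten validation path is exercised end to end. It assumes the public entry points already used in this diff (`VariantBuilder`, `Variant::try_new`) plus a publicly exported `VariantMetadata::try_new`, and it reuses the non-monotonic metadata bytes from the `try_new_fails_non_monotonic2` test above:

```rust
use parquet_variant::{Variant, VariantBuilder, VariantMetadata};

fn main() {
    // Build a small list and fully validate it, as the new benchmarks do.
    let mut builder = VariantBuilder::new();
    let mut list = builder.new_list();
    for k in 0i8..4 {
        list.append_value(Variant::Int8(k));
    }
    list.finish();
    let (metadata, value) = builder.finish();
    assert!(Variant::try_new(&metadata, &value).is_ok());

    // Metadata whose dictionary offsets [0, 2, 1, 16, 13] are not monotonically
    // increasing (the same bytes as the test above); full validation rejects it
    // up front instead of surfacing an error lazily on a later access.
    let bad = [
        0b0000_0001u8, 4, 0x00, 0x02, 0x01, 0x10, 13, b'c', b'a', b't', b'd',
        b'o', b'g', b'l', b'a', b'm', b'b', b'e', b'e', b'l',
    ];
    assert!(VariantMetadata::try_new(&bad).is_err());
}
```

The new suite can be run with `cargo bench -p parquet-variant --bench variant_validation`.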