[Variant] Speedup validation #7878
Changes from all commits
e8fd01e
9135501
3cb1e62
5e630e0
dda573b
6756a65
d7b2596
539722b
Cargo.toml:

```diff
@@ -46,4 +46,3 @@ name = "parquet_variant_json"
 bench = false
 
-
 [dev-dependencies]
```
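For context: `bench = false` on the `parquet_variant_json` library target keeps `cargo bench` from running the default libtest harness against the library itself. Criterion benchmarks like the one added below are instead declared as dedicated `[[bench]]` targets with `harness = false`; that registration is not visible in this hunk.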
New benchmark file (@@ -0,0 +1,138 @@):

```rust
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

extern crate parquet_variant;

use criterion::*;

use parquet_variant::{Variant, VariantBuilder};

fn generate_large_object() -> (Vec<u8>, Vec<u8>) {
    // 126 elements (keys: "000"-"125") - each element is an object of 126
    // elements (keys: "125"-"250") - each element a list of the numbers 0-127
    let mut variant_builder = VariantBuilder::new();
    let mut outer_object = variant_builder.new_object();

    for i in 0..=125 {
        let key = format!("{i:03}");
        let mut inner_object = outer_object.new_object(&key);

        for j in 125..=250 {
            let inner_key = format!("{j}");
            let mut list_builder = inner_object.new_list(&inner_key);

            for k in 0..=127 {
                list_builder.append_value(Variant::Int8(k));
            }
            list_builder.finish();
        }
        inner_object.finish().unwrap();
    }
    outer_object.finish().unwrap();

    variant_builder.finish()
}

fn generate_complex_object() -> (Vec<u8>, Vec<u8>) {
    let mut variant_builder = VariantBuilder::new();
    let mut object_builder = variant_builder.new_object();
    let mut inner_list_builder = object_builder.new_list("booleans");

    for _ in 0..1024 {
        inner_list_builder.append_value(Variant::BooleanTrue);
    }

    inner_list_builder.finish();
    object_builder.insert("null", Variant::Null);
    let mut inner_list_builder = object_builder.new_list("numbers");
    for _ in 0..1024 {
        inner_list_builder.append_value(Variant::Int8(4));
        inner_list_builder.append_value(Variant::Double(-3e0));
        inner_list_builder.append_value(Variant::Double(1001e-3));
    }
    inner_list_builder.finish();

    let mut inner_object_builder = object_builder.new_object("nested");

    for i in 0..2048 {
        let key = format!("{}", 1024 - i);
        inner_object_builder.insert(&key, i);
    }
    inner_object_builder.finish().unwrap();

    object_builder.finish().unwrap();

    variant_builder.finish()
}

fn generate_large_nested_list() -> (Vec<u8>, Vec<u8>) {
    let mut variant_builder = VariantBuilder::new();
    let mut list_builder = variant_builder.new_list();
    for _ in 0..255 {
        let mut list_builder_inner = list_builder.new_list();
        for _ in 0..120 {
            list_builder_inner.append_value(Variant::Null);

            let mut list_builder_inner_inner = list_builder_inner.new_list();
            for _ in 0..20 {
                list_builder_inner_inner.append_value(Variant::Double(-3e0));
            }

            list_builder_inner_inner.finish();
        }
        list_builder_inner.finish();
    }
    list_builder.finish();
    variant_builder.finish()
}

// Generates a large object and performs full validation
fn bench_validate_large_object(c: &mut Criterion) {
    let (metadata, value) = generate_large_object();
    c.bench_function("bench_validate_large_object", |b| {
        b.iter(|| {
            std::hint::black_box(Variant::try_new(&metadata, &value).unwrap());
        })
    });
}

fn bench_validate_complex_object(c: &mut Criterion) {
    let (metadata, value) = generate_complex_object();
    c.bench_function("bench_validate_complex_object", |b| {
        b.iter(|| {
            std::hint::black_box(Variant::try_new(&metadata, &value).unwrap());
        })
    });
}

fn bench_validate_large_nested_list(c: &mut Criterion) {
    let (metadata, value) = generate_large_nested_list();
    c.bench_function("bench_validate_large_nested_list", |b| {
        b.iter(|| {
            std::hint::black_box(Variant::try_new(&metadata, &value).unwrap());
        })
    });
}

criterion_group!(
    benches,
    bench_validate_large_object,
    bench_validate_complex_object,
    bench_validate_large_nested_list
);

criterion_main!(benches);
```
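The three generators deliberately stress different shapes: `generate_large_object` alone emits 126 × 126 × 128 ≈ 2.0 million `Int8` values behind two levels of object nesting, `generate_complex_object` mixes boolean and number lists with a 2048-key nested object, and `generate_large_nested_list` nests lists three levels deep. `criterion_group!`/`criterion_main!` generate the harness entry point, so the suite runs under a plain `cargo bench`.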
Variant list validation:

```diff
@@ -14,10 +14,9 @@
 // KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations
 // under the License.
-use crate::decoder::OffsetSizeBytes;
+use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
 use crate::utils::{
     first_byte_from_slice, overflow_error, slice_from_slice, slice_from_slice_at_offset,
-    validate_fallible_iterator,
 };
 use crate::variant::{Variant, VariantMetadata};
@@ -209,9 +208,35 @@ impl<'m, 'v> VariantList<'m, 'v> {
             // by value to all the children (who would otherwise re-validate it repeatedly).
             self.metadata = self.metadata.with_full_validation()?;
 
-            // Iterate over all string keys in this dictionary in order to prove that the offset
-            // array is valid, all offsets are in bounds, and all string bytes are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+            let offset_buffer = slice_from_slice(
+                self.value,
+                self.header.first_offset_byte()..self.first_value_byte,
+            )?;
+
+            let offsets =
+                map_bytes_to_offsets(offset_buffer, self.header.offset_size).collect::<Vec<_>>();
+
+            // Validate offsets are in-bounds and monotonically increasing.
+            // Since shallow verification checks whether the first and last offsets are in-bounds,
+            // we can also verify all offsets are in-bounds by checking if offsets are
+            // monotonically increasing.
+            let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
+            if !are_offsets_monotonic {
+                return Err(ArrowError::InvalidArgumentError(
+                    "offsets are not monotonically increasing".to_string(),
+                ));
+            }
+
+            let value_buffer = slice_from_slice(self.value, self.first_value_byte..)?;
+
+            // Validate whether values are valid variant objects
+            for i in 1..offsets.len() {
+                let start_offset = offsets[i - 1];
+                let end_offset = offsets[i];
+
+                let value_bytes = slice_from_slice(value_buffer, start_offset..end_offset)?;
+                Variant::try_new_with_metadata(self.metadata, value_bytes)?;
+            }
 
             self.validated = true;
         }
         Ok(self)
```
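Both validation paths hinge on the same observation: shallow validation has already proven the first and last offsets are in bounds, so if the sequence strictly increases, every offset in between must be in bounds too. Using strict `<` (rather than `<=`) additionally rejects duplicate offsets. A minimal standalone illustration of the `is_sorted_by` idiom (stable since Rust 1.82):

```rust
fn main() {
    // True only when each offset is strictly less than its successor.
    let monotonic = |offsets: &[usize]| offsets.is_sorted_by(|a, b| a < b);

    assert!(monotonic(&[0, 2, 5, 9])); // strictly increasing
    assert!(!monotonic(&[0, 2, 2, 9])); // duplicate offset rejected
    assert!(!monotonic(&[0, 5, 3, 9])); // decreasing pair rejected, even though
                                        // the first and last are still in bounds
}
```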
Variant metadata validation:

```diff
@@ -15,11 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::decoder::OffsetSizeBytes;
-use crate::utils::{
-    first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice,
-    validate_fallible_iterator,
-};
+use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
+use crate::utils::{first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice};
 
 use arrow_schema::ArrowError;
@@ -228,9 +225,47 @@ impl<'m> VariantMetadata<'m> {
     /// [validation]: Self#Validation
     pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
         if !self.validated {
-            // Iterate over all string keys in this dictionary in order to prove that the offset
-            // array is valid, all offsets are in bounds, and all string bytes are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+            let offset_bytes = slice_from_slice(
+                self.bytes,
+                self.header.first_offset_byte()..self.first_value_byte,
+            )?;
+
+            let offsets =
+                map_bytes_to_offsets(offset_bytes, self.header.offset_size).collect::<Vec<_>>();
+
+            // Validate offsets are in-bounds and monotonically increasing.
+            // Since shallow validation ensures the first and last offsets are in bounds, we can
+            // also verify all offsets are in-bounds by checking if offsets are monotonically
+            // increasing.
+            let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
+            if !are_offsets_monotonic {
+                return Err(ArrowError::InvalidArgumentError(
+                    "offsets not monotonically increasing".to_string(),
+                ));
+            }
+
+            // Verify the string values in the dictionary are UTF-8 encoded strings.
+            let value_buffer =
+                string_from_slice(self.bytes, 0, self.first_value_byte..self.bytes.len())?;
+
+            if self.header.is_sorted {
+                // Validate the dictionary values are unique and lexicographically sorted
+                let are_dictionary_values_unique_and_sorted = (1..offsets.len())
+                    .map(|i| {
+                        let field_range = offsets[i - 1]..offsets[i];
+                        value_buffer.get(field_range)
+                    })
+                    .is_sorted_by(|a, b| match (a, b) {
+                        (Some(a), Some(b)) => a < b,
+                        _ => false,
+                    });
+
+                if !are_dictionary_values_unique_and_sorted {
+                    return Err(ArrowError::InvalidArgumentError(
+                        "dictionary values are not unique and ordered".to_string(),
+                    ));
+                }
+            }
 
             self.validated = true;
         }
         Ok(self)
```
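For the sorted-metadata path, the same strict comparison doubles as a uniqueness check, and `str::get` guards the offsets a second time: it returns `None` when a range is out of bounds or does not land on UTF-8 character boundaries, which the catch-all match arm turns into a validation failure. A standalone rendering of that check, using hypothetical sample data:

```rust
fn main() {
    // Assumed sample data: a value buffer already validated as UTF-8, with
    // offsets delimiting the dictionary strings "apple", "banana", "cherry".
    let value_buffer = "applebananacherry";
    let offsets = [0usize, 5, 11, 17];

    let sorted_and_unique = (1..offsets.len())
        .map(|i| value_buffer.get(offsets[i - 1]..offsets[i]))
        .is_sorted_by(|a, b| match (a, b) {
            // Strict `<`: equal neighbors (duplicate keys) fail the check too.
            (Some(a), Some(b)) => a < b,
            // `None` means an out-of-bounds or non-boundary range.
            _ => false,
        });
    assert!(sorted_and_unique);
}
```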
```diff
@@ -399,6 +434,42 @@ mod tests {
         );
     }
 
+    #[test]
+    fn try_new_fails_non_monotonic2() {
+        // this test case checks whether offsets are monotonic in the full validation logic.
+
+        // 'cat', 'dog', 'lamb', 'eel'
+        let bytes = &[
+            0b0000_0001, // header: offset_size_minus_one=0 and version=1
+            4,           // dictionary_size
+            0x00,
+            0x02,
+            0x01, // does not increase monotonically
+            0x10,
+            13,
+            b'c',
+            b'a',
+            b't',
+            b'd',
+            b'o',
+            b'g',
+            b'l',
+            b'a',
+            b'm',
+            b'b',
+            b'e',
+            b'e',
+            b'l',
+        ];
+
+        let err = VariantMetadata::try_new(bytes).unwrap_err();
+
+        assert!(
+            matches!(err, ArrowError::InvalidArgumentError(_)),
+            "unexpected error: {err:?}"
+        );
+    }
+
     #[test]
     fn try_new_truncated_offsets_inline() {
         // Missing final offset
```
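The crafted metadata reads as: header byte `0b0000_0001` (version 1, one-byte offsets, sorted bit unset), `dictionary_size = 4`, then five offsets `[0x00, 0x02, 0x01, 0x10, 13]`, then the string bytes. The first offset (0) and the last (13) are both in bounds, so only the full-validation pass can catch that `0x02` followed by `0x01` breaks monotonicity.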
A reviewer commented:

> It seems like there is a lot of duplicated code that does about the same thing:
> check that a slice of offsets are all
> I wonder if it is possible to factor it all out into a function like
> Or something 🤔
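One possible shape for such a shared helper (a sketch only; the name and signature are hypothetical and not part of this PR):

```rust
use arrow_schema::ArrowError;

// Hypothetical helper: verify a decoded offset array is strictly increasing,
// returning the same error both call sites construct today. Callers would
// then slice `offsets[i - 1]..offsets[i]` ranges out of their own buffers.
fn validate_monotonic_offsets(offsets: &[usize]) -> Result<(), ArrowError> {
    if !offsets.is_sorted_by(|a, b| a < b) {
        return Err(ArrowError::InvalidArgumentError(
            "offsets are not monotonically increasing".to_string(),
        ));
    }
    Ok(())
}
```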
The author replied: I plan on following up with a PR that removes this check, since we can validate the monotonicity of offsets when accessing variants.