From b3f507762baa69ddbba5a774c4ceb1c205250e07 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Fri, 18 Jul 2025 13:42:30 +0200 Subject: [PATCH 1/4] Revisit VariantMetadata and Object equality --- parquet-variant/src/variant/metadata.rs | 149 +++++++++++++++++++++--- parquet-variant/src/variant/object.rs | 50 +++++++- 2 files changed, 179 insertions(+), 20 deletions(-) diff --git a/parquet-variant/src/variant/metadata.rs b/parquet-variant/src/variant/metadata.rs index 31868aaf055c..fbb0d93d1813 100644 --- a/parquet-variant/src/variant/metadata.rs +++ b/parquet-variant/src/variant/metadata.rs @@ -335,27 +335,20 @@ impl<'m> VariantMetadata<'m> { } } -// According to the spec, metadata dictionaries are not required to be in a specific order, -// to enable flexibility when constructing Variant values -// -// Instead of comparing the raw bytes of 2 variant metadata instances, this implementation -// checks whether the dictionary entries are equal -- regardless of their sorting order +// Metadata dictionaries can be in any order per the spec to allow flexible Variant construction +// This comparison checks for field name presence in both metadata instances rather than raw byte equality impl<'m> PartialEq for VariantMetadata<'m> { fn eq(&self, other: &Self) -> bool { - let is_equal = self.is_empty() == other.is_empty() - && self.is_fully_validated() == other.is_fully_validated() - && self.first_value_byte == other.first_value_byte - && self.validated == other.validated; - - let other_field_names: HashSet<&'m str> = HashSet::from_iter(other.iter()); + self.is_empty() == other.is_empty() + && if self.is_sorted() && other.is_sorted() { + self.len() == other.len() && self.iter().eq(other.iter()) + } else { + // can't guarantee neither field names are unique + let self_field_names: HashSet<&'m str> = HashSet::from_iter(self.iter()); + let other_field_names: HashSet<&'m str> = HashSet::from_iter(other.iter()); - for field_name in self.iter() { - if !other_field_names.contains(field_name) { - return false; + self_field_names == other_field_names } - } - - is_equal } } @@ -374,6 +367,8 @@ impl std::ops::Index for VariantMetadata<'_> { #[cfg(test)] mod tests { + use crate::VariantBuilder; + use super::*; /// `"cat"`, `"dog"` – valid metadata @@ -558,4 +553,124 @@ mod tests { "unexpected error: {err:?}" ); } + + #[test] + fn test_compare_sorted_dictionary_with_unsorted_dictionary() { + // create a sorted object + let mut b = VariantBuilder::new(); + let mut o = b.new_object(); + + o.insert("a", false); + o.insert("b", false); + + o.finish().unwrap(); + + let (m, _) = b.finish(); + + let m1 = VariantMetadata::new(&m); + assert!(m1.is_sorted()); + + // Create metadata with an unsorted dictionary (field names are "a", "a", "b") + // Since field names are not unique, it is considered not sorted. + let metadata_bytes = vec![ + 0b0000_0001, + 3, // dictionary size + 0, // "a" + 1, // "a" + 2, // "b" + 3, + b'a', + b'a', + b'b', + ]; + let m2 = VariantMetadata::try_new(&metadata_bytes).unwrap(); + assert!(!m2.is_sorted()); + + assert_eq!(m1, m2); + } + + #[test] + fn test_compare_sorted_dictionary_with_unsorted_dictionary2() { + // create a sorted object + let mut b = VariantBuilder::new(); + let mut o = b.new_object(); + + o.insert("a", false); + o.insert("b", false); + + o.finish().unwrap(); + + let (m, _) = b.finish(); + + let m1 = VariantMetadata::new(&m); + assert!(m1.is_sorted()); + + // Create metadata with an unsorted dictionary (field names are "b", "b", "a") + // Since field names are not unique nor lexicographically ordered, it is considered not sorted. + let metadata_bytes = vec![ + 0b0000_0001, + 3, // dictionary size + 0, // "b" + 1, // "b" + 2, // "a" + 3, + b'b', + b'b', + b'a', + ]; + let m2 = VariantMetadata::try_new(&metadata_bytes).unwrap(); + assert!(!m2.is_sorted()); + + assert_eq!(m1, m2); + } + + #[test] + fn test_compare_sorted_dictionary_with_sorted_dictionary() { + // create a sorted object + let mut b = VariantBuilder::new(); + let mut o = b.new_object(); + + o.insert("a", false); + o.insert("b", false); + + o.finish().unwrap(); + + let (m, _) = b.finish(); + + let m1 = VariantMetadata::new(&m); + let m2 = VariantMetadata::new(&m); + + assert_eq!(m1, m2); + } + + #[test] + fn test_compare_sorted_dictionary_with_sorted_dictionary2() { + // create a sorted object + let mut b = VariantBuilder::new(); + let mut o = b.new_object(); + + o.insert("a", false); + o.insert("b", false); + + o.finish().unwrap(); + + let (m, _) = b.finish(); + + let m1 = VariantMetadata::new(&m); + + // create a sorted object with different field names + let mut b = VariantBuilder::new(); + let mut o = b.new_object(); + + o.insert("c", false); + o.insert("d", false); + + o.finish().unwrap(); + + let (m, _) = b.finish(); + + let m2 = VariantMetadata::new(&m); + + assert_ne!(m1, m2); + } } diff --git a/parquet-variant/src/variant/object.rs b/parquet-variant/src/variant/object.rs index 9cca3b9639e1..0f9e3ea02ba1 100644 --- a/parquet-variant/src/variant/object.rs +++ b/parquet-variant/src/variant/object.rs @@ -221,6 +221,7 @@ impl<'m, 'v> VariantObject<'m, 'v> { let mut field_ids_iter = map_bytes_to_offsets(field_id_buffer, self.header.field_id_size); + // Validate all field ids exist in the metadata dictionary and the corresponding field names are lexicographically sorted if self.metadata.is_sorted() { // Since the metadata dictionary has unique and sorted field names, we can also guarantee this object's field names @@ -263,7 +264,7 @@ impl<'m, 'v> VariantObject<'m, 'v> { let next_field_name = self.metadata.get(field_id)?; if let Some(current_name) = current_field_name { - if next_field_name <= current_name { + if next_field_name < current_name { return Err(ArrowError::InvalidArgumentError( "field names not sorted".to_string(), )); @@ -416,8 +417,7 @@ impl<'m, 'v> PartialEq for VariantObject<'m, 'v> { && self.header == other.header && self.num_elements == other.num_elements && self.first_field_offset_byte == other.first_field_offset_byte - && self.first_value_byte == other.first_value_byte - && self.validated == other.validated; + && self.first_value_byte == other.first_value_byte; // value validation let other_fields: HashMap<&str, Variant> = HashMap::from_iter(other.iter()); @@ -962,4 +962,48 @@ mod tests { // objects are still logically equal assert_eq!(v1, v2); } + + #[test] + fn test_compare_object_with_unsorted_dictionary_vs_sorted_dictionary() { + // create a sorted object + let mut b = VariantBuilder::new(); + let mut o = b.new_object(); + + o.insert("a", false); + o.insert("b", false); + + o.finish().unwrap(); + + let (m, v) = b.finish(); + + let variant_metadata = VariantMetadata::new(&m); + + dbg!(variant_metadata.iter().collect::>()); + + let v1 = Variant::try_new(&m, &v).unwrap(); + + // Create metadata with an unsorted dictionary (field names are "a", "a", "b") + // Since field names are not unique, it is considered not sorted. + let metadata_bytes = vec![ + 0b0000_0001, + 3, // dictionary size + 0, // "a" + 1, // "b" + 2, // "a" + 3, + b'a', + b'b', + b'a', + ]; + let m = VariantMetadata::try_new(&metadata_bytes).unwrap(); + assert!(!m.is_sorted()); + + dbg!(m.iter().collect::>()); + let v2 = Variant::new_with_metadata(m, &v); + + dbg!(v1.as_object().unwrap().iter().collect::>()); + dbg!(v2.as_object().unwrap().iter().collect::>()); + + assert_eq!(v1, v2); + } } From 41f7d528c29ab98a1d134499793efbd7ebfc8f1d Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Sun, 20 Jul 2025 12:18:30 +0200 Subject: [PATCH 2/4] Logically compare objects, remove custom partial eq impl for metadata --- parquet-variant/src/variant/metadata.rs | 89 +------------------------ parquet-variant/src/variant/object.rs | 29 +++----- 2 files changed, 11 insertions(+), 107 deletions(-) diff --git a/parquet-variant/src/variant/metadata.rs b/parquet-variant/src/variant/metadata.rs index fbb0d93d1813..0e356e34c41e 100644 --- a/parquet-variant/src/variant/metadata.rs +++ b/parquet-variant/src/variant/metadata.rs @@ -15,8 +15,6 @@ // specific language governing permissions and limitations // under the License. -use std::collections::HashSet; - use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes}; use crate::utils::{first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice}; @@ -127,7 +125,7 @@ impl VariantMetadataHeader { /// /// [`Variant`]: crate::Variant /// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq)] pub struct VariantMetadata<'m> { pub(crate) bytes: &'m [u8], header: VariantMetadataHeader, @@ -335,23 +333,6 @@ impl<'m> VariantMetadata<'m> { } } -// Metadata dictionaries can be in any order per the spec to allow flexible Variant construction -// This comparison checks for field name presence in both metadata instances rather than raw byte equality -impl<'m> PartialEq for VariantMetadata<'m> { - fn eq(&self, other: &Self) -> bool { - self.is_empty() == other.is_empty() - && if self.is_sorted() && other.is_sorted() { - self.len() == other.len() && self.iter().eq(other.iter()) - } else { - // can't guarantee neither field names are unique - let self_field_names: HashSet<&'m str> = HashSet::from_iter(self.iter()); - let other_field_names: HashSet<&'m str> = HashSet::from_iter(other.iter()); - - self_field_names == other_field_names - } - } -} - /// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing /// [unvalidated] input could also panic if the underlying bytes are invalid. /// @@ -586,42 +567,7 @@ mod tests { let m2 = VariantMetadata::try_new(&metadata_bytes).unwrap(); assert!(!m2.is_sorted()); - assert_eq!(m1, m2); - } - - #[test] - fn test_compare_sorted_dictionary_with_unsorted_dictionary2() { - // create a sorted object - let mut b = VariantBuilder::new(); - let mut o = b.new_object(); - - o.insert("a", false); - o.insert("b", false); - - o.finish().unwrap(); - - let (m, _) = b.finish(); - - let m1 = VariantMetadata::new(&m); - assert!(m1.is_sorted()); - - // Create metadata with an unsorted dictionary (field names are "b", "b", "a") - // Since field names are not unique nor lexicographically ordered, it is considered not sorted. - let metadata_bytes = vec![ - 0b0000_0001, - 3, // dictionary size - 0, // "b" - 1, // "b" - 2, // "a" - 3, - b'b', - b'b', - b'a', - ]; - let m2 = VariantMetadata::try_new(&metadata_bytes).unwrap(); - assert!(!m2.is_sorted()); - - assert_eq!(m1, m2); + assert_ne!(m1, m2); } #[test] @@ -642,35 +588,4 @@ mod tests { assert_eq!(m1, m2); } - - #[test] - fn test_compare_sorted_dictionary_with_sorted_dictionary2() { - // create a sorted object - let mut b = VariantBuilder::new(); - let mut o = b.new_object(); - - o.insert("a", false); - o.insert("b", false); - - o.finish().unwrap(); - - let (m, _) = b.finish(); - - let m1 = VariantMetadata::new(&m); - - // create a sorted object with different field names - let mut b = VariantBuilder::new(); - let mut o = b.new_object(); - - o.insert("c", false); - o.insert("d", false); - - o.finish().unwrap(); - - let (m, _) = b.finish(); - - let m2 = VariantMetadata::new(&m); - - assert_ne!(m1, m2); - } } diff --git a/parquet-variant/src/variant/object.rs b/parquet-variant/src/variant/object.rs index 0f9e3ea02ba1..53b5f2b73a96 100644 --- a/parquet-variant/src/variant/object.rs +++ b/parquet-variant/src/variant/object.rs @@ -413,13 +413,8 @@ impl<'m, 'v> VariantObject<'m, 'v> { // checks whether the field values are equal -- regardless of their order impl<'m, 'v> PartialEq for VariantObject<'m, 'v> { fn eq(&self, other: &Self) -> bool { - let mut is_equal = self.metadata == other.metadata - && self.header == other.header - && self.num_elements == other.num_elements - && self.first_field_offset_byte == other.first_field_offset_byte - && self.first_value_byte == other.first_value_byte; + let mut is_equal = self.num_elements == other.num_elements; - // value validation let other_fields: HashMap<&str, Variant> = HashMap::from_iter(other.iter()); for (field_name, variant) in self.iter() { @@ -938,14 +933,14 @@ mod tests { o.finish().unwrap(); - let (m, v) = b.finish(); + let (meta1, value1) = b.finish(); - let v1 = Variant::try_new(&m, &v).unwrap(); + let v1 = Variant::try_new(&meta1, &value1).unwrap(); // v1 is sorted assert!(v1.metadata().unwrap().is_sorted()); // create a second object with different insertion order - let mut b = VariantBuilder::new(); + let mut b = VariantBuilder::new().with_field_names(["d", "c", "b", "a"].into_iter()); let mut o = b.new_object(); o.insert("b", 4.3); @@ -953,12 +948,15 @@ mod tests { o.finish().unwrap(); - let (m, v) = b.finish(); + let (meta2, value2) = b.finish(); - let v2 = Variant::try_new(&m, &v).unwrap(); + let v2 = Variant::try_new(&meta2, &value2).unwrap(); // v2 is not sorted assert!(!v2.metadata().unwrap().is_sorted()); + // object metadata are not the same + assert_ne!(v1.metadata(), v2.metadata()); + // objects are still logically equal assert_eq!(v1, v2); } @@ -976,10 +974,6 @@ mod tests { let (m, v) = b.finish(); - let variant_metadata = VariantMetadata::new(&m); - - dbg!(variant_metadata.iter().collect::>()); - let v1 = Variant::try_new(&m, &v).unwrap(); // Create metadata with an unsorted dictionary (field names are "a", "a", "b") @@ -998,12 +992,7 @@ mod tests { let m = VariantMetadata::try_new(&metadata_bytes).unwrap(); assert!(!m.is_sorted()); - dbg!(m.iter().collect::>()); let v2 = Variant::new_with_metadata(m, &v); - - dbg!(v1.as_object().unwrap().iter().collect::>()); - dbg!(v2.as_object().unwrap().iter().collect::>()); - assert_eq!(v1, v2); } } From dc32619af3fc3f5c370a75c1fa41c0acff273e62 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Mon, 21 Jul 2025 17:19:49 +0200 Subject: [PATCH 3/4] Eagerly return false during equality cmp --- parquet-variant/src/variant/object.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/parquet-variant/src/variant/object.rs b/parquet-variant/src/variant/object.rs index 53b5f2b73a96..a9fc65032818 100644 --- a/parquet-variant/src/variant/object.rs +++ b/parquet-variant/src/variant/object.rs @@ -413,20 +413,24 @@ impl<'m, 'v> VariantObject<'m, 'v> { // checks whether the field values are equal -- regardless of their order impl<'m, 'v> PartialEq for VariantObject<'m, 'v> { fn eq(&self, other: &Self) -> bool { - let mut is_equal = self.num_elements == other.num_elements; + if self.num_elements != other.num_elements { + return false; + } let other_fields: HashMap<&str, Variant> = HashMap::from_iter(other.iter()); for (field_name, variant) in self.iter() { match other_fields.get(field_name as &str) { Some(other_variant) => { - is_equal = is_equal && variant == *other_variant; + if variant != *other_variant { + return false; + } } None => return false, } } - is_equal + true } } From 26dd44b812ebed774e92df4b6f160209e58524d9 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 22 Jul 2025 17:54:26 -0400 Subject: [PATCH 4/4] Avoid HashMap on equality check --- parquet-variant/src/variant/object.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/parquet-variant/src/variant/object.rs b/parquet-variant/src/variant/object.rs index a9fc65032818..b809fe278cb4 100644 --- a/parquet-variant/src/variant/object.rs +++ b/parquet-variant/src/variant/object.rs @@ -20,7 +20,6 @@ use crate::utils::{ first_byte_from_slice, overflow_error, slice_from_slice, try_binary_search_range_by, }; use crate::variant::{Variant, VariantMetadata}; -use std::collections::HashMap; use arrow_schema::ArrowError; @@ -417,16 +416,12 @@ impl<'m, 'v> PartialEq for VariantObject<'m, 'v> { return false; } - let other_fields: HashMap<&str, Variant> = HashMap::from_iter(other.iter()); - - for (field_name, variant) in self.iter() { - match other_fields.get(field_name as &str) { - Some(other_variant) => { - if variant != *other_variant { - return false; - } - } - None => return false, + // IFF two objects are valid and logically equal, they will have the same + // field names in the same order, because the spec requires the object + // fields to be sorted lexicographically. + for ((name_a, value_a), (name_b, value_b)) in self.iter().zip(other.iter()) { + if name_a != name_b || value_a != value_b { + return false; } }