diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index ae82cfec9d3a..73fa15255ec0 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -16,11 +16,12 @@ // under the License. use crate::decoder::{VariantBasicType, VariantPrimitiveType}; use crate::{ - ShortString, Variant, VariantDecimal16, VariantDecimal4, VariantDecimal8, VariantMetadata, + ShortString, Variant, VariantDecimal16, VariantDecimal4, VariantDecimal8, VariantList, + VariantMetadata, VariantObject, }; use arrow_schema::ArrowError; use indexmap::{IndexMap, IndexSet}; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; const BASIC_TYPE_BITS: u8 = 2; const UNIX_EPOCH_DATE: chrono::NaiveDate = chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); @@ -216,6 +217,57 @@ impl ValueBuffer { self.append_slice(value.as_bytes()); } + fn append_object(&mut self, metadata_builder: &mut MetadataBuilder, obj: VariantObject) { + let mut object_builder = self.new_object(metadata_builder); + + for (field_name, value) in obj.iter() { + object_builder.insert(field_name, value); + } + + object_builder.finish().unwrap(); + } + + fn try_append_object( + &mut self, + metadata_builder: &mut MetadataBuilder, + obj: VariantObject, + ) -> Result<(), ArrowError> { + let mut object_builder = self.new_object(metadata_builder); + + for res in obj.iter_try() { + let (field_name, value) = res?; + object_builder.try_insert(field_name, value)?; + } + + object_builder.finish()?; + + Ok(()) + } + + fn append_list(&mut self, metadata_builder: &mut MetadataBuilder, list: VariantList) { + let mut list_builder = self.new_list(metadata_builder); + for value in list.iter() { + list_builder.append_value(value); + } + list_builder.finish(); + } + + fn try_append_list( + &mut self, + metadata_builder: &mut MetadataBuilder, + list: VariantList, + ) -> Result<(), ArrowError> { + let mut list_builder = self.new_list(metadata_builder); + for res in list.iter_try() { + let value = res?; + list_builder.try_append_value(value)?; + } + + list_builder.finish(); + + Ok(()) + } + fn offset(&self) -> usize { self.0.len() } @@ -252,9 +304,31 @@ impl ValueBuffer { variant: Variant<'m, 'd>, metadata_builder: &mut MetadataBuilder, ) { - self.try_append_variant(variant, metadata_builder).unwrap(); + match variant { + Variant::Null => self.append_null(), + Variant::BooleanTrue => self.append_bool(true), + Variant::BooleanFalse => self.append_bool(false), + Variant::Int8(v) => self.append_int8(v), + Variant::Int16(v) => self.append_int16(v), + Variant::Int32(v) => self.append_int32(v), + Variant::Int64(v) => self.append_int64(v), + Variant::Date(v) => self.append_date(v), + Variant::TimestampMicros(v) => self.append_timestamp_micros(v), + Variant::TimestampNtzMicros(v) => self.append_timestamp_ntz_micros(v), + Variant::Decimal4(decimal4) => self.append_decimal4(decimal4), + Variant::Decimal8(decimal8) => self.append_decimal8(decimal8), + Variant::Decimal16(decimal16) => self.append_decimal16(decimal16), + Variant::Float(v) => self.append_float(v), + Variant::Double(v) => self.append_double(v), + Variant::Binary(v) => self.append_binary(v), + Variant::String(s) => self.append_string(s), + Variant::ShortString(s) => self.append_short_string(s), + Variant::Object(obj) => self.append_object(metadata_builder, obj), + Variant::List(list) => self.append_list(metadata_builder, list), + } } + /// Appends a variant to the buffer fn try_append_variant<'m, 'd>( &mut self, variant: Variant<'m, 'd>, @@ -279,35 +353,8 @@ impl ValueBuffer { Variant::Binary(v) => self.append_binary(v), Variant::String(s) => self.append_string(s), Variant::ShortString(s) => self.append_short_string(s), - Variant::Object(obj) => { - let metadata_field_names = metadata_builder - .field_names - .iter() - .enumerate() - .map(|(i, f)| (f.clone(), i)) - .collect::>(); - - let mut object_builder = self.new_object(metadata_builder); - - // first add all object fields that exist in metadata builder - let mut object_fields = obj.iter().collect::>(); - - object_fields - .sort_by_key(|(field_name, _)| metadata_field_names.get(field_name as &str)); - - for (field_name, value) in object_fields { - object_builder.insert(field_name, value); - } - - object_builder.finish()?; - } - Variant::List(list) => { - let mut list_builder = self.new_list(metadata_builder); - for value in list.iter() { - list_builder.append_value(value); - } - list_builder.finish(); - } + Variant::Object(obj) => self.try_append_object(metadata_builder, obj)?, + Variant::List(list) => self.try_append_list(metadata_builder, list)?, } Ok(()) diff --git a/parquet-variant/src/variant/metadata.rs b/parquet-variant/src/variant/metadata.rs index add31465d28b..1ac9af31b48f 100644 --- a/parquet-variant/src/variant/metadata.rs +++ b/parquet-variant/src/variant/metadata.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +use std::collections::HashSet; + use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes}; use crate::utils::{first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice}; @@ -125,7 +127,7 @@ impl VariantMetadataHeader { /// /// [`Variant`]: crate::Variant /// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone)] pub struct VariantMetadata<'m> { pub(crate) bytes: &'m [u8], header: VariantMetadataHeader, @@ -332,6 +334,30 @@ impl<'m> VariantMetadata<'m> { } } +// According to the spec, metadata dictionaries are not required to be in a specific order, +// to enable flexibility when constructing Variant values +// +// Instead of comparing the raw bytes of 2 variant metadata instances, this implementation +// checks whether the dictionary entries are equal -- regardless of their sorting order +impl<'m> PartialEq for VariantMetadata<'m> { + fn eq(&self, other: &Self) -> bool { + let is_equal = self.is_empty() == other.is_empty() + && self.is_fully_validated() == other.is_fully_validated() + && self.first_value_byte == other.first_value_byte + && self.validated == other.validated; + + let other_field_names: HashSet<&'m str> = HashSet::from_iter(other.iter()); + + for field_name in self.iter() { + if !other_field_names.contains(field_name) { + return false; + } + } + + is_equal + } +} + /// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing /// [unvalidated] input could also panic if the underlying bytes are invalid. /// @@ -346,6 +372,7 @@ impl std::ops::Index for VariantMetadata<'_> { #[cfg(test)] mod tests { + use super::*; /// `"cat"`, `"dog"` – valid metadata diff --git a/parquet-variant/src/variant/object.rs b/parquet-variant/src/variant/object.rs index 37ebce818dca..56d27753f071 100644 --- a/parquet-variant/src/variant/object.rs +++ b/parquet-variant/src/variant/object.rs @@ -14,11 +14,13 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. + use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes}; use crate::utils::{ first_byte_from_slice, overflow_error, slice_from_slice, try_binary_search_range_by, }; use crate::variant::{Variant, VariantMetadata}; +use std::collections::HashMap; use arrow_schema::ArrowError; @@ -114,7 +116,7 @@ impl VariantObjectHeader { /// /// [valid]: VariantMetadata#Validation /// [Variant spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-object-basic_type2 -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone)] pub struct VariantObject<'m, 'v> { pub metadata: VariantMetadata<'m>, pub value: &'v [u8], @@ -387,6 +389,38 @@ impl<'m, 'v> VariantObject<'m, 'v> { } } +// Custom implementation of PartialEq for variant objects +// +// According to the spec, field values are not required to be in the same order as the field IDs, +// to enable flexibility when constructing Variant values +// +// Instead of comparing the raw bytes of 2 variant objects, this implementation recursively +// checks whether the field values are equal -- regardless of their order +impl<'m, 'v> PartialEq for VariantObject<'m, 'v> { + fn eq(&self, other: &Self) -> bool { + let mut is_equal = self.metadata == other.metadata + && self.header == other.header + && self.num_elements == other.num_elements + && self.first_field_offset_byte == other.first_field_offset_byte + && self.first_value_byte == other.first_value_byte + && self.validated == other.validated; + + // value validation + let other_fields: HashMap<&str, Variant> = HashMap::from_iter(other.iter()); + + for (field_name, variant) in self.iter() { + match other_fields.get(field_name as &str) { + Some(other_variant) => { + is_equal = is_equal && variant == *other_variant; + } + None => return false, + } + } + + is_equal + } +} + #[cfg(test)] mod tests { use crate::VariantBuilder; @@ -718,4 +752,187 @@ mod tests { test_variant_object_with_large_data(16777216 + 1, OffsetSizeBytes::Four); // 2^24 } + + #[test] + fn test_objects_with_same_fields_are_equal() { + let mut b = VariantBuilder::new(); + let mut o = b.new_object(); + + o.insert("b", ()); + o.insert("c", ()); + o.insert("a", ()); + + o.finish().unwrap(); + + let (m, v) = b.finish(); + + let v1 = Variant::try_new(&m, &v).unwrap(); + let v2 = Variant::try_new(&m, &v).unwrap(); + + assert_eq!(v1, v2); + } + + #[test] + fn test_same_objects_with_different_builder_are_equal() { + let mut b = VariantBuilder::new(); + let mut o = b.new_object(); + + o.insert("a", ()); + o.insert("b", false); + + o.finish().unwrap(); + let (m, v) = b.finish(); + + let v1 = Variant::try_new(&m, &v).unwrap(); + + let mut b = VariantBuilder::new(); + let mut o = b.new_object(); + + o.insert("a", ()); + o.insert("b", false); + + o.finish().unwrap(); + let (m, v) = b.finish(); + + let v2 = Variant::try_new(&m, &v).unwrap(); + + assert_eq!(v1, v2); + } + + #[test] + fn test_objects_with_different_values_are_not_equal() { + let mut b = VariantBuilder::new(); + let mut o = b.new_object(); + + o.insert("a", ()); + o.insert("b", 4.3); + + o.finish().unwrap(); + + let (m, v) = b.finish(); + + let v1 = Variant::try_new(&m, &v).unwrap(); + + // second object, same field name but different values + let mut b = VariantBuilder::new(); + let mut o = b.new_object(); + + o.insert("a", ()); + let mut inner_o = o.new_object("b"); + inner_o.insert("a", 3.3); + inner_o.finish().unwrap(); + o.finish().unwrap(); + + let (m, v) = b.finish(); + + let v2 = Variant::try_new(&m, &v).unwrap(); + + let m1 = v1.metadata().unwrap(); + let m2 = v2.metadata().unwrap(); + + // metadata would be equal since they contain the same keys + assert_eq!(m1, m2); + + // but the objects are not equal + assert_ne!(v1, v2); + } + + #[test] + fn test_objects_with_different_field_names_are_not_equal() { + let mut b = VariantBuilder::new(); + let mut o = b.new_object(); + + o.insert("a", ()); + o.insert("b", 4.3); + + o.finish().unwrap(); + + let (m, v) = b.finish(); + + let v1 = Variant::try_new(&m, &v).unwrap(); + + // second object, same field name but different values + let mut b = VariantBuilder::new(); + let mut o = b.new_object(); + + o.insert("aardvark", ()); + o.insert("barracuda", 3.3); + + o.finish().unwrap(); + + let (m, v) = b.finish(); + let v2 = Variant::try_new(&m, &v).unwrap(); + + assert_ne!(v1, v2); + } + + #[test] + fn test_objects_with_different_insertion_order_are_equal() { + let mut b = VariantBuilder::new(); + let mut o = b.new_object(); + + o.insert("b", false); + o.insert("a", ()); + + o.finish().unwrap(); + + let (m, v) = b.finish(); + + let v1 = Variant::try_new(&m, &v).unwrap(); + assert!(!v1.metadata().unwrap().is_sorted()); + + // create another object pre-filled with field names, b and a + // but insert the fields in the order of a, b + let mut b = VariantBuilder::new().with_field_names(["b", "a"].into_iter()); + let mut o = b.new_object(); + + o.insert("a", ()); + o.insert("b", false); + + o.finish().unwrap(); + + let (m, v) = b.finish(); + + let v2 = Variant::try_new(&m, &v).unwrap(); + + // v2 should also have a unsorted dictionary + assert!(!v2.metadata().unwrap().is_sorted()); + + assert_eq!(v1, v2); + } + + #[test] + fn test_objects_with_differing_metadata_are_equal() { + let mut b = VariantBuilder::new(); + let mut o = b.new_object(); + + o.insert("a", ()); + o.insert("b", 4.3); + + o.finish().unwrap(); + + let (m, v) = b.finish(); + + let v1 = Variant::try_new(&m, &v).unwrap(); + // v1 is sorted + assert!(v1.metadata().unwrap().is_sorted()); + + // create a second object with different insertion order + let mut b = VariantBuilder::new(); + let mut o = b.new_object(); + + o.insert("b", 4.3); + o.insert("a", ()); + + o.finish().unwrap(); + + let (m, v) = b.finish(); + + let v2 = Variant::try_new(&m, &v).unwrap(); + // v2 is not sorted + assert!(!v2.metadata().unwrap().is_sorted()); + + // objects are still logically equal + assert_eq!(v1, v2); + } }