-
Notifications
You must be signed in to change notification settings - Fork 1.1k
[Variant] Impl PartialEq for VariantObject
#7943
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,11 +16,12 @@ | |
| // under the License. | ||
| use crate::decoder::{VariantBasicType, VariantPrimitiveType}; | ||
| use crate::{ | ||
| ShortString, Variant, VariantDecimal16, VariantDecimal4, VariantDecimal8, VariantMetadata, | ||
| ShortString, Variant, VariantDecimal16, VariantDecimal4, VariantDecimal8, VariantList, | ||
| VariantMetadata, VariantObject, | ||
| }; | ||
| use arrow_schema::ArrowError; | ||
| use indexmap::{IndexMap, IndexSet}; | ||
| use std::collections::{HashMap, HashSet}; | ||
| use std::collections::HashSet; | ||
|
|
||
| const BASIC_TYPE_BITS: u8 = 2; | ||
| const UNIX_EPOCH_DATE: chrono::NaiveDate = chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); | ||
|
|
@@ -216,6 +217,57 @@ impl ValueBuffer { | |
| self.append_slice(value.as_bytes()); | ||
| } | ||
|
|
||
| fn append_object(&mut self, metadata_builder: &mut MetadataBuilder, obj: VariantObject) { | ||
| let mut object_builder = self.new_object(metadata_builder); | ||
|
|
||
| for (field_name, value) in obj.iter() { | ||
| object_builder.insert(field_name, value); | ||
| } | ||
|
|
||
| object_builder.finish().unwrap(); | ||
| } | ||
|
|
||
| fn try_append_object( | ||
| &mut self, | ||
| metadata_builder: &mut MetadataBuilder, | ||
| obj: VariantObject, | ||
| ) -> Result<(), ArrowError> { | ||
| let mut object_builder = self.new_object(metadata_builder); | ||
|
|
||
| for res in obj.iter_try() { | ||
| let (field_name, value) = res?; | ||
| object_builder.try_insert(field_name, value)?; | ||
| } | ||
|
|
||
| object_builder.finish()?; | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
| fn append_list(&mut self, metadata_builder: &mut MetadataBuilder, list: VariantList) { | ||
| let mut list_builder = self.new_list(metadata_builder); | ||
| for value in list.iter() { | ||
| list_builder.append_value(value); | ||
| } | ||
| list_builder.finish(); | ||
| } | ||
|
|
||
| fn try_append_list( | ||
| &mut self, | ||
| metadata_builder: &mut MetadataBuilder, | ||
| list: VariantList, | ||
| ) -> Result<(), ArrowError> { | ||
| let mut list_builder = self.new_list(metadata_builder); | ||
| for res in list.iter_try() { | ||
| let value = res?; | ||
| list_builder.try_append_value(value)?; | ||
| } | ||
|
|
||
| list_builder.finish(); | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
| fn offset(&self) -> usize { | ||
| self.0.len() | ||
| } | ||
|
|
@@ -252,9 +304,31 @@ impl ValueBuffer { | |
| variant: Variant<'m, 'd>, | ||
| metadata_builder: &mut MetadataBuilder, | ||
| ) { | ||
| self.try_append_variant(variant, metadata_builder).unwrap(); | ||
| match variant { | ||
| Variant::Null => self.append_null(), | ||
| Variant::BooleanTrue => self.append_bool(true), | ||
| Variant::BooleanFalse => self.append_bool(false), | ||
| Variant::Int8(v) => self.append_int8(v), | ||
| Variant::Int16(v) => self.append_int16(v), | ||
| Variant::Int32(v) => self.append_int32(v), | ||
| Variant::Int64(v) => self.append_int64(v), | ||
| Variant::Date(v) => self.append_date(v), | ||
| Variant::TimestampMicros(v) => self.append_timestamp_micros(v), | ||
| Variant::TimestampNtzMicros(v) => self.append_timestamp_ntz_micros(v), | ||
| Variant::Decimal4(decimal4) => self.append_decimal4(decimal4), | ||
| Variant::Decimal8(decimal8) => self.append_decimal8(decimal8), | ||
| Variant::Decimal16(decimal16) => self.append_decimal16(decimal16), | ||
| Variant::Float(v) => self.append_float(v), | ||
| Variant::Double(v) => self.append_double(v), | ||
| Variant::Binary(v) => self.append_binary(v), | ||
| Variant::String(s) => self.append_string(s), | ||
| Variant::ShortString(s) => self.append_short_string(s), | ||
| Variant::Object(obj) => self.append_object(metadata_builder, obj), | ||
| Variant::List(list) => self.append_list(metadata_builder, list), | ||
| } | ||
|
||
| } | ||
|
|
||
| /// Appends a variant to the buffer | ||
| fn try_append_variant<'m, 'd>( | ||
| &mut self, | ||
| variant: Variant<'m, 'd>, | ||
|
|
@@ -279,35 +353,8 @@ impl ValueBuffer { | |
| Variant::Binary(v) => self.append_binary(v), | ||
| Variant::String(s) => self.append_string(s), | ||
| Variant::ShortString(s) => self.append_short_string(s), | ||
| Variant::Object(obj) => { | ||
| let metadata_field_names = metadata_builder | ||
| .field_names | ||
| .iter() | ||
| .enumerate() | ||
| .map(|(i, f)| (f.clone(), i)) | ||
| .collect::<HashMap<_, _>>(); | ||
|
|
||
| let mut object_builder = self.new_object(metadata_builder); | ||
|
|
||
| // first add all object fields that exist in metadata builder | ||
| let mut object_fields = obj.iter().collect::<Vec<_>>(); | ||
|
|
||
| object_fields | ||
| .sort_by_key(|(field_name, _)| metadata_field_names.get(field_name as &str)); | ||
|
|
||
| for (field_name, value) in object_fields { | ||
| object_builder.insert(field_name, value); | ||
| } | ||
|
|
||
| object_builder.finish()?; | ||
| } | ||
| Variant::List(list) => { | ||
| let mut list_builder = self.new_list(metadata_builder); | ||
| for value in list.iter() { | ||
| list_builder.append_value(value); | ||
| } | ||
| list_builder.finish(); | ||
| } | ||
| Variant::Object(obj) => self.try_append_object(metadata_builder, obj)?, | ||
| Variant::List(list) => self.try_append_list(metadata_builder, list)?, | ||
| } | ||
|
|
||
| Ok(()) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,6 +15,8 @@ | |
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| use std::collections::HashSet; | ||
|
|
||
| use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes}; | ||
| use crate::utils::{first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice}; | ||
|
|
||
|
|
@@ -125,7 +127,7 @@ impl VariantMetadataHeader { | |
| /// | ||
| /// [`Variant`]: crate::Variant | ||
| /// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding | ||
| #[derive(Debug, Clone, PartialEq)] | ||
| #[derive(Debug, Clone)] | ||
| pub struct VariantMetadata<'m> { | ||
| pub(crate) bytes: &'m [u8], | ||
| header: VariantMetadataHeader, | ||
|
|
@@ -332,6 +334,30 @@ impl<'m> VariantMetadata<'m> { | |
| } | ||
| } | ||
|
|
||
| // According to the spec, metadata dictionaries are not required to be in a specific order, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need a special is_equal check for VariantMetadata? It seems like now since VariantObject doesn't include a check for the metadata being equal we could avoid a special equality 🤔 |
||
| // to enable flexibility when constructing Variant values | ||
| // | ||
| // Instead of comparing the raw bytes of 2 variant metadata instances, this implementation | ||
| // checks whether the dictionary entries are equal -- regardless of their sorting order | ||
| impl<'m> PartialEq for VariantMetadata<'m> { | ||
| fn eq(&self, other: &Self) -> bool { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given the cost of constructing hash maps etc, is it worth adding the following quick-check in case the dictionaries are both sorted? if self.is_ordered() && other.is_ordered() {
if self.len() != other.len() {
return false;
}
let self_value_bytes = ... all string value bytes ...;
let other_value_bytes = ... all string value bytes ...;
return self_value_bytes == other_value_bytes;
}Before trusting a single long string compare tho, we would need to convince ourselves that there's no way two dictionaries with different offsets can have identical value bytes. Otherwise, we'd have to loop over the two sets of strings manually.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Very nice. I will think about this and push up a PR.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| let is_equal = self.is_empty() == other.is_empty() | ||
| && self.is_fully_validated() == other.is_fully_validated() | ||
|
||
| && self.first_value_byte == other.first_value_byte | ||
| && self.validated == other.validated; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could break early here if is_equal is false and not check the fields |
||
|
|
||
| let other_field_names: HashSet<&'m str> = HashSet::from_iter(other.iter()); | ||
|
|
||
| for field_name in self.iter() { | ||
| if !other_field_names.contains(field_name) { | ||
| return false; | ||
|
Comment on lines
+351
to
+353
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems one-sided? Don't we need to prove the symmetric difference is empty?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. You are right. I think @friendlymatthew has a fix for it in |
||
| } | ||
| } | ||
|
|
||
| is_equal | ||
| } | ||
| } | ||
|
|
||
| /// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing | ||
| /// [unvalidated] input could also panic if the underlying bytes are invalid. | ||
| /// | ||
|
|
@@ -346,6 +372,7 @@ impl std::ops::Index<usize> for VariantMetadata<'_> { | |
|
|
||
| #[cfg(test)] | ||
| mod tests { | ||
|
|
||
| use super::*; | ||
|
|
||
| /// `"cat"`, `"dog"` – valid metadata | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You might be able to consolidate the apis if you checked the
is_validateddynamicallyFor example, have a single try_append_object and internally try
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Except
append_objectis infallible... so even if we "consolidate" by havingtry_append_objectfall back to its infallible partner, both similar-but-different for loops will still exist.