-
Notifications
You must be signed in to change notification settings - Fork 1.1k
[Variant] Reduce variant-related struct sizes #7888
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3caca3f
d24900d
7edc8fd
f5e21c0
ef7a2a1
93ec848
70e3d7c
72e5681
a74cb1b
9c47a62
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,7 +23,7 @@ use crate::variant::{Variant, VariantMetadata}; | |
| use arrow_schema::ArrowError; | ||
|
|
||
| // The value header occupies one byte; use a named constant for readability | ||
| const NUM_HEADER_BYTES: usize = 1; | ||
| const NUM_HEADER_BYTES: u32 = 1; | ||
|
|
||
| /// A parsed version of the variant array value header byte. | ||
| #[derive(Debug, Clone, PartialEq)] | ||
|
|
@@ -34,15 +34,15 @@ pub(crate) struct VariantListHeader { | |
|
|
||
| impl VariantListHeader { | ||
| // Hide the ugly casting | ||
| const fn num_elements_size(&self) -> usize { | ||
| const fn num_elements_size(&self) -> u32 { | ||
| self.num_elements_size as _ | ||
| } | ||
| const fn offset_size(&self) -> usize { | ||
| const fn offset_size(&self) -> u32 { | ||
| self.offset_size as _ | ||
| } | ||
|
|
||
| // Avoid materializing this offset, since it's cheaply and safely computable | ||
| const fn first_offset_byte(&self) -> usize { | ||
| const fn first_offset_byte(&self) -> u32 { | ||
| NUM_HEADER_BYTES + self.num_elements_size() | ||
| } | ||
|
|
||
|
|
@@ -122,11 +122,14 @@ pub struct VariantList<'m, 'v> { | |
| pub metadata: VariantMetadata<'m>, | ||
| pub value: &'v [u8], | ||
| header: VariantListHeader, | ||
| num_elements: usize, | ||
| first_value_byte: usize, | ||
| num_elements: u32, | ||
| first_value_byte: u32, | ||
| validated: bool, | ||
| } | ||
|
|
||
| // We don't want this to grow because it could increase the size of `Variant` and hurt performance. | ||
| const _: () = crate::utils::expect_size_of::<VariantList>(64); | ||
|
|
||
| impl<'m, 'v> VariantList<'m, 'v> { | ||
| /// Attempts to interpret `value` as a variant array value. | ||
| /// | ||
|
|
@@ -157,7 +160,7 @@ impl<'m, 'v> VariantList<'m, 'v> { | |
| let num_elements = | ||
| header | ||
| .num_elements_size | ||
| .unpack_usize_at_offset(value, NUM_HEADER_BYTES, 0)?; | ||
| .unpack_u32_at_offset(value, NUM_HEADER_BYTES as _, 0)?; | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is an annoying side effect of using a named constant... the literal |
||
|
|
||
| // (num_elements + 1) * offset_size + first_offset_byte | ||
| let first_value_byte = num_elements | ||
|
|
@@ -185,10 +188,10 @@ impl<'m, 'v> VariantList<'m, 'v> { | |
|
|
||
| // Use the last offset to upper-bound the value buffer | ||
| let last_offset = new_self | ||
| .get_offset(num_elements)? | ||
| .get_offset(num_elements as _)? | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rather than do a bunch of (I don't love blind
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree this is fine |
||
| .checked_add(first_value_byte) | ||
| .ok_or_else(|| overflow_error("variant array size"))?; | ||
| new_self.value = slice_from_slice(value, ..last_offset)?; | ||
| new_self.value = slice_from_slice(value, ..last_offset as _)?; | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately the |
||
| Ok(new_self) | ||
| } | ||
|
|
||
|
|
@@ -210,7 +213,7 @@ impl<'m, 'v> VariantList<'m, 'v> { | |
|
|
||
| let offset_buffer = slice_from_slice( | ||
| self.value, | ||
| self.header.first_offset_byte()..self.first_value_byte, | ||
| self.header.first_offset_byte() as _..self.first_value_byte as _, | ||
| )?; | ||
|
|
||
| let offsets = | ||
|
|
@@ -226,15 +229,15 @@ impl<'m, 'v> VariantList<'m, 'v> { | |
| )); | ||
| } | ||
|
|
||
| let value_buffer = slice_from_slice(self.value, self.first_value_byte..)?; | ||
| let value_buffer = slice_from_slice(self.value, self.first_value_byte as _..)?; | ||
|
|
||
| // Validate whether values are valid variant objects | ||
| for i in 1..offsets.len() { | ||
| let start_offset = offsets[i - 1]; | ||
| let end_offset = offsets[i]; | ||
|
|
||
| let value_bytes = slice_from_slice(value_buffer, start_offset..end_offset)?; | ||
| Variant::try_new_with_metadata(self.metadata, value_bytes)?; | ||
| Variant::try_new_with_metadata(self.metadata.clone(), value_bytes)?; | ||
| } | ||
|
|
||
| self.validated = true; | ||
|
|
@@ -244,7 +247,7 @@ impl<'m, 'v> VariantList<'m, 'v> { | |
|
|
||
| /// Return the length of this array | ||
| pub fn len(&self) -> usize { | ||
| self.num_elements | ||
| self.num_elements as _ | ||
| } | ||
|
|
||
| /// Is the array of zero length | ||
|
|
@@ -256,7 +259,7 @@ impl<'m, 'v> VariantList<'m, 'v> { | |
| /// | ||
| /// [invalid]: Self#Validation | ||
| pub fn get(&self, index: usize) -> Option<Variant<'m, 'v>> { | ||
| (index < self.num_elements).then(|| { | ||
| (index < self.len()).then(|| { | ||
| self.try_get_with_shallow_validation(index) | ||
| .expect("Invalid variant array element") | ||
| }) | ||
|
|
@@ -272,10 +275,10 @@ impl<'m, 'v> VariantList<'m, 'v> { | |
| fn try_get_with_shallow_validation(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> { | ||
| // Fetch the value bytes between the two offsets for this index, from the value array region | ||
| // of the byte buffer | ||
| let byte_range = self.get_offset(index)?..self.get_offset(index + 1)?; | ||
| let byte_range = self.get_offset(index)? as _..self.get_offset(index + 1)? as _; | ||
| let value_bytes = | ||
| slice_from_slice_at_offset(self.value, self.first_value_byte, byte_range)?; | ||
| Variant::try_new_with_metadata_and_shallow_validation(self.metadata, value_bytes) | ||
| slice_from_slice_at_offset(self.value, self.first_value_byte as _, byte_range)?; | ||
| Variant::try_new_with_metadata_and_shallow_validation(self.metadata.clone(), value_bytes) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| } | ||
|
|
||
| /// Iterates over the values of this list. When working with [unvalidated] input, consider | ||
|
|
@@ -297,14 +300,14 @@ impl<'m, 'v> VariantList<'m, 'v> { | |
| fn iter_try_with_shallow_validation( | ||
| &self, | ||
| ) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ { | ||
| (0..self.len()).map(move |i| self.try_get_with_shallow_validation(i)) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's nothing to |
||
| (0..self.len()).map(|i| self.try_get_with_shallow_validation(i)) | ||
| } | ||
|
|
||
| // Attempts to retrieve the ith offset from the offset array region of the byte buffer. | ||
| fn get_offset(&self, index: usize) -> Result<usize, ArrowError> { | ||
| let byte_range = self.header.first_offset_byte()..self.first_value_byte; | ||
| fn get_offset(&self, index: usize) -> Result<u32, ArrowError> { | ||
| let byte_range = self.header.first_offset_byte() as _..self.first_value_byte as _; | ||
| let offset_bytes = slice_from_slice(self.value, byte_range)?; | ||
| self.header.offset_size.unpack_usize(offset_bytes, index) | ||
| self.header.offset_size.unpack_u32(offset_bytes, index) | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -623,7 +626,7 @@ mod tests { | |
| expected_num_element_size, | ||
| variant_list.header.num_elements_size | ||
| ); | ||
| assert_eq!(list_size, variant_list.num_elements); | ||
| assert_eq!(list_size, variant_list.num_elements as usize); | ||
|
|
||
| // verify the data in the variant | ||
| assert_eq!(list_size, variant_list.len()); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
New addition -- I encapsulated the size check into a const helper function, whose compilation failure includes the object's actual size (otherwise, have to guess what the size was):
As can be seen, a check for
Variantitself exposes the fact that it's 80 bytes. I can't figure out why --VariantObjectandVariantListare the only big enum variants (the next-biggest is 32 bytes)??There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Assuming 8-byte alignment, and observing that we use explicit structs for enum variant payloads which will prevent layout optimizations, I would expect the (one-byte) discriminator for
Variantto push the size from 64 bytes (biggest enum variant payload) to 72 bytes (next alignment boundary). Where did the other 8 bytes come from??There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ouch... figured it out -- i128 has 16-byte alignment, which poisons
VariantDecimal16(whoseu8scale pushes the size from 16 to 17 to 32 bytes) andVariant(whose one-byte discriminator pushes the size from 64 to 65 to 80 bytes).