Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion parquet-variant-json/src/from_json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ mod test {
expected: Variant<'a, 'a>,
}

impl<'a> JsonToVariantTest<'a> {
impl JsonToVariantTest<'_> {
fn run(self) -> Result<(), ArrowError> {
let mut variant_builder = VariantBuilder::new();
json_to_variant(self.json, &mut variant_builder)?;
Expand Down
1 change: 0 additions & 1 deletion parquet-variant/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1932,7 +1932,6 @@ mod tests {
assert!(metadata.is_empty());

let variant = Variant::try_new_with_metadata(metadata, &value).unwrap();
assert!(metadata.is_empty());
assert_eq!(variant, Variant::Int8(42));
}

Expand Down
66 changes: 25 additions & 41 deletions parquet-variant/src/decoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ use crate::ShortString;
use arrow_schema::ArrowError;
use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, Utc};

use std::num::TryFromIntError;

/// The basic type of a [`Variant`] value, encoded in the first two bits of the
/// header byte.
///
Expand Down Expand Up @@ -147,11 +145,9 @@ impl OffsetSizeBytes {
/// * `bytes` – the byte buffer to index
/// * `index` – 0-based index into the buffer
///
/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
/// Three-byte values are zero-extended to 32 bits before the final
/// fallible cast to `usize`.
pub(crate) fn unpack_usize(&self, bytes: &[u8], index: usize) -> Result<usize, ArrowError> {
self.unpack_usize_at_offset(bytes, 0, index)
/// Each value is `self as u32` bytes wide (1, 2, 3 or 4), zero-extended to 32 bits as needed.
pub(crate) fn unpack_u32(&self, bytes: &[u8], index: usize) -> Result<u32, ArrowError> {
self.unpack_u32_at_offset(bytes, 0, index)
}

/// Return one unsigned little-endian value from `bytes`.
Expand All @@ -162,15 +158,13 @@ impl OffsetSizeBytes {
/// * `offset_index` – 0-based index **after** the skipped bytes
/// (`0` is the first value, `1` the next, …).
///
/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
/// Three-byte values are zero-extended to 32 bits before the final
/// fallible cast to `usize`.
pub(crate) fn unpack_usize_at_offset(
/// Each value is `self as u32` bytes wide (1, 2, 3 or 4), zero-extended to 32 bits as needed.
pub(crate) fn unpack_u32_at_offset(
&self,
bytes: &[u8],
byte_offset: usize, // how many bytes to skip
offset_index: usize, // which offset in an array of offsets
) -> Result<usize, ArrowError> {
) -> Result<u32, ArrowError> {
use OffsetSizeBytes::*;

// Index into the byte array:
Expand All @@ -179,7 +173,7 @@ impl OffsetSizeBytes {
.checked_mul(*self as usize)
.and_then(|n| n.checked_add(byte_offset))
.ok_or_else(|| overflow_error("unpacking offset array value"))?;
let result = match self {
let value = match self {
One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
Three => {
Expand All @@ -192,11 +186,7 @@ impl OffsetSizeBytes {
}
Four => u32::from_le_bytes(array_from_slice(bytes, offset)?),
};

// Convert the u32 we extracted to usize (should always succeed on 32- and 64-bit arch)
result
.try_into()
.map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))
Ok(value)
}
}

Expand Down Expand Up @@ -518,57 +508,51 @@ mod tests {
}

#[test]
fn unpack_usize_all_widths() {
fn unpack_u32_all_widths() {
// One-byte offsets
let buf_one = [0x01u8, 0xAB, 0xCD];
assert_eq!(
OffsetSizeBytes::One.unpack_usize(&buf_one, 0).unwrap(),
0x01
);
assert_eq!(
OffsetSizeBytes::One.unpack_usize(&buf_one, 2).unwrap(),
0xCD
);
assert_eq!(OffsetSizeBytes::One.unpack_u32(&buf_one, 0).unwrap(), 0x01);
assert_eq!(OffsetSizeBytes::One.unpack_u32(&buf_one, 2).unwrap(), 0xCD);

// Two-byte offsets (little-endian 0x1234, 0x5678)
let buf_two = [0x34, 0x12, 0x78, 0x56];
assert_eq!(
OffsetSizeBytes::Two.unpack_usize(&buf_two, 0).unwrap(),
OffsetSizeBytes::Two.unpack_u32(&buf_two, 0).unwrap(),
0x1234
);
assert_eq!(
OffsetSizeBytes::Two.unpack_usize(&buf_two, 1).unwrap(),
OffsetSizeBytes::Two.unpack_u32(&buf_two, 1).unwrap(),
0x5678
);

// Three-byte offsets (0x030201 and 0x0000FF)
let buf_three = [0x01, 0x02, 0x03, 0xFF, 0x00, 0x00];
assert_eq!(
OffsetSizeBytes::Three.unpack_usize(&buf_three, 0).unwrap(),
OffsetSizeBytes::Three.unpack_u32(&buf_three, 0).unwrap(),
0x030201
);
assert_eq!(
OffsetSizeBytes::Three.unpack_usize(&buf_three, 1).unwrap(),
OffsetSizeBytes::Three.unpack_u32(&buf_three, 1).unwrap(),
0x0000FF
);

// Four-byte offsets (0x12345678, 0x90ABCDEF)
let buf_four = [0x78, 0x56, 0x34, 0x12, 0xEF, 0xCD, 0xAB, 0x90];
assert_eq!(
OffsetSizeBytes::Four.unpack_usize(&buf_four, 0).unwrap(),
OffsetSizeBytes::Four.unpack_u32(&buf_four, 0).unwrap(),
0x1234_5678
);
assert_eq!(
OffsetSizeBytes::Four.unpack_usize(&buf_four, 1).unwrap(),
OffsetSizeBytes::Four.unpack_u32(&buf_four, 1).unwrap(),
0x90AB_CDEF
);
}

#[test]
fn unpack_usize_out_of_bounds() {
fn unpack_u32_out_of_bounds() {
let tiny = [0x00u8]; // deliberately too short
assert!(OffsetSizeBytes::Two.unpack_usize(&tiny, 0).is_err());
assert!(OffsetSizeBytes::Three.unpack_usize(&tiny, 0).is_err());
assert!(OffsetSizeBytes::Two.unpack_u32(&tiny, 0).is_err());
assert!(OffsetSizeBytes::Three.unpack_u32(&tiny, 0).is_err());
}

#[test]
Expand All @@ -584,20 +568,20 @@ mod tests {
let width = OffsetSizeBytes::Two;

// dictionary_size starts immediately after the header byte
let dict_size = width.unpack_usize_at_offset(&buf, 1, 0).unwrap();
let dict_size = width.unpack_u32_at_offset(&buf, 1, 0).unwrap();
assert_eq!(dict_size, 2);

// offset array immediately follows the dictionary size
let first = width.unpack_usize_at_offset(&buf, 1, 1).unwrap();
let first = width.unpack_u32_at_offset(&buf, 1, 1).unwrap();
assert_eq!(first, 0);

let second = width.unpack_usize_at_offset(&buf, 1, 2).unwrap();
let second = width.unpack_u32_at_offset(&buf, 1, 2).unwrap();
assert_eq!(second, 5);

let third = width.unpack_usize_at_offset(&buf, 1, 3).unwrap();
let third = width.unpack_u32_at_offset(&buf, 1, 3).unwrap();
assert_eq!(third, 9);

let err = width.unpack_usize_at_offset(&buf, 1, 4);
let err = width.unpack_u32_at_offset(&buf, 1, 4);
assert!(err.is_err())
}
}
9 changes: 9 additions & 0 deletions parquet-variant/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,3 +122,12 @@ where

Some(Err(start))
}

/// Compile-time guard on the size of `T`, meant for types that should only grow if absolutely
/// necessary.
///
/// Does nothing when `size_of::<T>()` equals `expected`. On a mismatch it indexes an empty
/// array with the actual size, which is always out of bounds. When the call is evaluated in a
/// `const` context (e.g. `const _: () = expect_size_of::<Foo>(64);`) this fails the build, and
/// the "index out of bounds" diagnostic reports the index, i.e. the real size of `T`.
// Not every build configuration references this helper, hence the `allow`.
#[allow(unused)]
pub(crate) const fn expect_size_of<T>(expected: usize) {
    let actual = std::mem::size_of::<T>();
    if actual == expected {
        return;
    }
    // Deliberately out of bounds: the resulting error message carries `actual` as the index.
    let _ = [""; 0][actual];
}
3 changes: 3 additions & 0 deletions parquet-variant/src/variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,9 @@ pub enum Variant<'m, 'v> {
List(VariantList<'m, 'v>),
}

// We don't want this to grow because it could hurt performance of a frequently-created type.
const _: () = crate::utils::expect_size_of::<Variant>(80);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New addition -- I encapsulated the size check into a const helper function whose compilation failure includes the object's actual size (otherwise, we would have to guess what the size was):

error[E0080]: evaluation of constant value failed
   --> parquet-variant/src/utils.rs:139:17
    |
139 |         let _ = ["";0][size];
    |                 ^^^^^^^^^^^^ index out of bounds: the length is 0 but the index is 80
    |
note: inside `utils::expect_size_of::<variant::Variant<'_, '_>>`
   --> parquet-variant/src/utils.rs:139:17
    |
139 |         let _ = ["";0][size];
    |                 ^^^^^^^^^^^^
note: inside `variant::_`
   --> parquet-variant/src/variant.rs:260:15
    |
260 | const _: () = crate::utils::expect_size_of::<Variant>(64);
    |               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

As can be seen, a check for Variant itself exposes the fact that it's 80 bytes. I can't figure out why -- VariantObject and VariantList are the only big enum variants (the next-biggest is 32 bytes)??

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assuming 8-byte alignment, and observing that we use explicit structs for enum variant payloads (which prevents layout optimizations), I would expect the (one-byte) discriminant for Variant to push the size from 64 bytes (biggest enum variant payload) to 72 bytes (next alignment boundary). Where did the other 8 bytes come from??

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ouch... figured it out -- i128 has 16-byte alignment, which poisons VariantDecimal16 (whose u8 scale pushes the size from 16 to 17 bytes, which then rounds up to 32) and Variant (whose one-byte discriminant pushes the size from 64 to 65 bytes, which then rounds up to 80).


impl<'m, 'v> Variant<'m, 'v> {
/// Attempts to interpret a metadata and value buffer pair as a new `Variant`.
///
Expand Down
47 changes: 25 additions & 22 deletions parquet-variant/src/variant/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use crate::variant::{Variant, VariantMetadata};
use arrow_schema::ArrowError;

// The value header occupies one byte; use a named constant for readability
const NUM_HEADER_BYTES: usize = 1;
const NUM_HEADER_BYTES: u32 = 1;

/// A parsed version of the variant array value header byte.
#[derive(Debug, Clone, PartialEq)]
Expand All @@ -34,15 +34,15 @@ pub(crate) struct VariantListHeader {

impl VariantListHeader {
// Hide the ugly casting
const fn num_elements_size(&self) -> usize {
const fn num_elements_size(&self) -> u32 {
self.num_elements_size as _
}
const fn offset_size(&self) -> usize {
const fn offset_size(&self) -> u32 {
self.offset_size as _
}

// Avoid materializing this offset, since it's cheaply and safely computable
const fn first_offset_byte(&self) -> usize {
const fn first_offset_byte(&self) -> u32 {
NUM_HEADER_BYTES + self.num_elements_size()
}

Expand Down Expand Up @@ -122,11 +122,14 @@ pub struct VariantList<'m, 'v> {
pub metadata: VariantMetadata<'m>,
pub value: &'v [u8],
header: VariantListHeader,
num_elements: usize,
first_value_byte: usize,
num_elements: u32,
first_value_byte: u32,
validated: bool,
}

// We don't want this to grow because it could increase the size of `Variant` and hurt performance.
const _: () = crate::utils::expect_size_of::<VariantList>(64);

impl<'m, 'v> VariantList<'m, 'v> {
/// Attempts to interpret `value` as a variant array value.
///
Expand Down Expand Up @@ -157,7 +160,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
let num_elements =
header
.num_elements_size
.unpack_usize_at_offset(value, NUM_HEADER_BYTES, 0)?;
.unpack_u32_at_offset(value, NUM_HEADER_BYTES as _, 0)?;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an annoying side effect of using a named constant... the literal 1 would "just work" for both u32 and usize.


// (num_elements + 1) * offset_size + first_offset_byte
let first_value_byte = num_elements
Expand Down Expand Up @@ -185,10 +188,10 @@ impl<'m, 'v> VariantList<'m, 'v> {

// Use the last offset to upper-bound the value buffer
let last_offset = new_self
.get_offset(num_elements)?
.get_offset(num_elements as _)?
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than do a bunch of try_into().map_err(...) calls, just admit that converting u32 to usize is infallible for all practical purposes -- I seriously doubt arrow-rs can run on 16-bit hardware where usize might be only 16 bits.

(I don't love blind as _ casting in general -- too easy to cast to something unexpected or ignore the implications of the cast -- but it seems ok in this specific set of cases)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree this is fine

.checked_add(first_value_byte)
.ok_or_else(|| overflow_error("variant array size"))?;
new_self.value = slice_from_slice(value, ..last_offset)?;
new_self.value = slice_from_slice(value, ..last_offset as _)?;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately the SliceIndex trait only works for usize, so we have to widen the u32 values whenever we create one.

Ok(new_self)
}

Expand All @@ -210,7 +213,7 @@ impl<'m, 'v> VariantList<'m, 'v> {

let offset_buffer = slice_from_slice(
self.value,
self.header.first_offset_byte()..self.first_value_byte,
self.header.first_offset_byte() as _..self.first_value_byte as _,
)?;

let offsets =
Expand All @@ -226,15 +229,15 @@ impl<'m, 'v> VariantList<'m, 'v> {
));
}

let value_buffer = slice_from_slice(self.value, self.first_value_byte..)?;
let value_buffer = slice_from_slice(self.value, self.first_value_byte as _..)?;

// Validate whether values are valid variant objects
for i in 1..offsets.len() {
let start_offset = offsets[i - 1];
let end_offset = offsets[i];

let value_bytes = slice_from_slice(value_buffer, start_offset..end_offset)?;
Variant::try_new_with_metadata(self.metadata, value_bytes)?;
Variant::try_new_with_metadata(self.metadata.clone(), value_bytes)?;
}

self.validated = true;
Expand All @@ -244,7 +247,7 @@ impl<'m, 'v> VariantList<'m, 'v> {

/// Return the length of this array
pub fn len(&self) -> usize {
self.num_elements
self.num_elements as _
}

/// Is the array of zero length
Expand All @@ -256,7 +259,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
///
/// [invalid]: Self#Validation
pub fn get(&self, index: usize) -> Option<Variant<'m, 'v>> {
(index < self.num_elements).then(|| {
(index < self.len()).then(|| {
self.try_get_with_shallow_validation(index)
.expect("Invalid variant array element")
})
Expand All @@ -272,10 +275,10 @@ impl<'m, 'v> VariantList<'m, 'v> {
fn try_get_with_shallow_validation(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
// Fetch the value bytes between the two offsets for this index, from the value array region
// of the byte buffer
let byte_range = self.get_offset(index)?..self.get_offset(index + 1)?;
let byte_range = self.get_offset(index)? as _..self.get_offset(index + 1)? as _;
let value_bytes =
slice_from_slice_at_offset(self.value, self.first_value_byte, byte_range)?;
Variant::try_new_with_metadata_and_shallow_validation(self.metadata, value_bytes)
slice_from_slice_at_offset(self.value, self.first_value_byte as _, byte_range)?;
Variant::try_new_with_metadata_and_shallow_validation(self.metadata.clone(), value_bytes)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

VariantMetadata is no longer Copy

}

/// Iterates over the values of this list. When working with [unvalidated] input, consider
Expand All @@ -297,14 +300,14 @@ impl<'m, 'v> VariantList<'m, 'v> {
fn iter_try_with_shallow_validation(
&self,
) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ {
(0..self.len()).map(move |i| self.try_get_with_shallow_validation(i))
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's nothing to move here...

(0..self.len()).map(|i| self.try_get_with_shallow_validation(i))
}

// Attempts to retrieve the ith offset from the offset array region of the byte buffer.
fn get_offset(&self, index: usize) -> Result<usize, ArrowError> {
let byte_range = self.header.first_offset_byte()..self.first_value_byte;
fn get_offset(&self, index: usize) -> Result<u32, ArrowError> {
let byte_range = self.header.first_offset_byte() as _..self.first_value_byte as _;
let offset_bytes = slice_from_slice(self.value, byte_range)?;
self.header.offset_size.unpack_usize(offset_bytes, index)
self.header.offset_size.unpack_u32(offset_bytes, index)
}
}

Expand Down Expand Up @@ -623,7 +626,7 @@ mod tests {
expected_num_element_size,
variant_list.header.num_elements_size
);
assert_eq!(list_size, variant_list.num_elements);
assert_eq!(list_size, variant_list.num_elements as usize);

// verify the data in the variant
assert_eq!(list_size, variant_list.len());
Expand Down
Loading
Loading