Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 176 additions & 6 deletions parquet-variant/src/decoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::utils::{array_from_slice, slice_from_slice, string_from_slice};

use arrow_schema::ArrowError;
use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, Utc};
use std::array::TryFromSliceError;

use crate::utils::{array_from_slice, slice_from_slice, string_from_slice};
use std::array::TryFromSliceError;
use std::num::TryFromIntError;

#[derive(Debug, Clone, Copy, PartialEq)]
pub enum VariantBasicType {
Expand Down Expand Up @@ -50,10 +52,10 @@ pub enum VariantPrimitiveType {
}

/// Extracts the basic type from a header byte
pub(crate) fn get_basic_type(header: u8) -> Result<VariantBasicType, ArrowError> {
pub(crate) fn get_basic_type(header: u8) -> VariantBasicType {
// See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding
let basic_type = header & 0x03; // Basic type is encoded in the first 2 bits
let basic_type = match basic_type {
match basic_type {
0 => VariantBasicType::Primitive,
1 => VariantBasicType::ShortString,
2 => VariantBasicType::Object,
Expand All @@ -63,8 +65,7 @@ pub(crate) fn get_basic_type(header: u8) -> Result<VariantBasicType, ArrowError>
// masked `basic_type` with 0x03 above.
unreachable!();
}
};
Ok(basic_type)
}
}

impl TryFrom<u8> for VariantPrimitiveType {
Expand Down Expand Up @@ -96,6 +97,76 @@ impl TryFrom<u8> for VariantPrimitiveType {
}
}
}

/// Used to unpack offset array entries such as metadata dictionary offsets or object/array value
/// offsets. Also used to unpack object field ids. These are always derived from a two-bit
/// `XXX_size_minus_one` field in the corresponding header byte.
#[derive(Clone, Debug, Copy, PartialEq)]
pub(crate) enum OffsetSizeBytes {
One = 1,
Two = 2,
Three = 3,
Four = 4,
}

impl OffsetSizeBytes {
/// Build from the `offset_size_minus_one` bits (see spec).
pub(crate) fn try_new(offset_size_minus_one: u8) -> Result<Self, ArrowError> {
use OffsetSizeBytes::*;
let result = match offset_size_minus_one {
0 => One,
1 => Two,
2 => Three,
3 => Four,
_ => {
return Err(ArrowError::InvalidArgumentError(
"offset_size_minus_one must be 0–3".to_string(),
))
}
};
Ok(result)
}

/// Return one unsigned little-endian value from `bytes`.
///
/// * `bytes` – the Variant-metadata buffer.
/// * `byte_offset` – number of bytes to skip **before** reading the first
/// value (usually `1` to move past the header byte).
/// * `offset_index` – 0-based index **after** the skip
/// (`0` is the first value, `1` the next, …).
///
/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
/// Three-byte values are zero-extended to 32 bits before the final
/// fallible cast to `usize`.
pub(crate) fn unpack_usize(
&self,
bytes: &[u8],
byte_offset: usize, // how many bytes to skip
offset_index: usize, // which offset in an array of offsets
) -> Result<usize, ArrowError> {
use OffsetSizeBytes::*;
let offset = byte_offset + (*self as usize) * offset_index;
let result = match self {
One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
Three => {
// Let's grab the three byte le-chunk first
let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?;
// Let's pad it and construct a padded u32 from it.
let mut buf = [0u8; 4];
buf[..3].copy_from_slice(&b3_chunks);
u32::from_le_bytes(buf)
.try_into()
.map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))?
}
Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
.try_into()
.map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))?,
};
Ok(result)
}
}

/// Extract the primitive type from a Variant value-metadata byte
pub(crate) fn get_primitive_type(metadata: u8) -> Result<VariantPrimitiveType, ArrowError> {
// last 6 bits contain the primitive-type, see spec
Expand Down Expand Up @@ -363,4 +434,103 @@ mod tests {
assert_eq!(result, "Hello");
Ok(())
}

#[test]
fn test_offset() {
assert_eq!(OffsetSizeBytes::try_new(0).unwrap(), OffsetSizeBytes::One);
assert_eq!(OffsetSizeBytes::try_new(1).unwrap(), OffsetSizeBytes::Two);
assert_eq!(OffsetSizeBytes::try_new(2).unwrap(), OffsetSizeBytes::Three);
assert_eq!(OffsetSizeBytes::try_new(3).unwrap(), OffsetSizeBytes::Four);

// everything outside 0-3 must error
assert!(OffsetSizeBytes::try_new(4).is_err());
assert!(OffsetSizeBytes::try_new(255).is_err());
}

#[test]
fn unpack_usize_all_widths() {
// One-byte offsets
let buf_one = [0x01u8, 0xAB, 0xCD];
assert_eq!(
OffsetSizeBytes::One.unpack_usize(&buf_one, 0, 0).unwrap(),
0x01
);
assert_eq!(
OffsetSizeBytes::One.unpack_usize(&buf_one, 0, 2).unwrap(),
0xCD
);

// Two-byte offsets (little-endian 0x1234, 0x5678)
let buf_two = [0x34, 0x12, 0x78, 0x56];
assert_eq!(
OffsetSizeBytes::Two.unpack_usize(&buf_two, 0, 0).unwrap(),
0x1234
);
assert_eq!(
OffsetSizeBytes::Two.unpack_usize(&buf_two, 0, 1).unwrap(),
0x5678
);

// Three-byte offsets (0x030201 and 0x0000FF)
let buf_three = [0x01, 0x02, 0x03, 0xFF, 0x00, 0x00];
assert_eq!(
OffsetSizeBytes::Three
.unpack_usize(&buf_three, 0, 0)
.unwrap(),
0x030201
);
assert_eq!(
OffsetSizeBytes::Three
.unpack_usize(&buf_three, 0, 1)
.unwrap(),
0x0000FF
);

// Four-byte offsets (0x12345678, 0x90ABCDEF)
let buf_four = [0x78, 0x56, 0x34, 0x12, 0xEF, 0xCD, 0xAB, 0x90];
assert_eq!(
OffsetSizeBytes::Four.unpack_usize(&buf_four, 0, 0).unwrap(),
0x1234_5678
);
assert_eq!(
OffsetSizeBytes::Four.unpack_usize(&buf_four, 0, 1).unwrap(),
0x90AB_CDEF
);
}

#[test]
fn unpack_usize_out_of_bounds() {
let tiny = [0x00u8]; // deliberately too short
assert!(OffsetSizeBytes::Two.unpack_usize(&tiny, 0, 0).is_err());
assert!(OffsetSizeBytes::Three.unpack_usize(&tiny, 0, 0).is_err());
}

#[test]
fn unpack_simple() {
let buf = [
0x41, // header
0x02, 0x00, // dictionary_size = 2
0x00, 0x00, // offset[0] = 0
0x05, 0x00, // offset[1] = 5
0x09, 0x00, // offset[2] = 9
];

let width = OffsetSizeBytes::Two;

// dictionary_size starts immediately after the header
let dict_size = width.unpack_usize(&buf, 1, 0).unwrap();
assert_eq!(dict_size, 2);

let first = width.unpack_usize(&buf, 1, 1).unwrap();
assert_eq!(first, 0);

let second = width.unpack_usize(&buf, 1, 2).unwrap();
assert_eq!(second, 5);

let third = width.unpack_usize(&buf, 1, 3).unwrap();
assert_eq!(third, 9);

let err = width.unpack_usize(&buf, 1, 4);
assert!(err.is_err())
}
}
Loading
Loading