Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions parquet/src/encodings/decoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -395,16 +395,15 @@ impl<T: DataType> DeltaBitPackDecoder<T> {
.get_zigzag_vlq_int()
.ok_or_else(|| eof_err!("Not enough data to decode 'min_delta'"))?;

let mut widths = vec![];
self.delta_bit_widths.clear();
for _ in 0..self.num_mini_blocks {
let w = self
.bit_reader
.get_aligned::<u8>(1)
.ok_or_else(|| eof_err!("Not enough data to decode 'width'"))?;
widths.push(w);
self.delta_bit_widths.push(w);
}

self.delta_bit_widths.set_data(widths);
self.mini_block_idx = 0;
self.delta_bit_width = self.delta_bit_widths.data()[0];
self.values_current_mini_block = self.values_per_mini_block;
Expand All @@ -417,7 +416,6 @@ impl<T: DataType> DeltaBitPackDecoder<T> {
where
T::T: FromBytes,
{
self.deltas_in_mini_block.clear();
if self.use_batch {
self.deltas_in_mini_block
.resize(self.values_current_mini_block, T::T::default());
Expand All @@ -427,6 +425,7 @@ impl<T: DataType> DeltaBitPackDecoder<T> {
);
assert!(loaded == self.values_current_mini_block);
} else {
self.deltas_in_mini_block.clear();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand the need for this change -- was calling `clear()` itself a major bottleneck? Or was the cost in having to reinitialize the entire `deltas_in_mini_block` to `default()` in the `self.use_batch` branch?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

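In this case, the `resize` is the expensive part, even though it optimizes down to mostly a `memset` (there were only 4 elements in the array in my tests). Calling `clear()` first resets the length to zero, so the following `resize` has to re-initialize every element; without the `clear()`, `resize` has nothing to write when the length is already correct. The difference is around 5% in throughput.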

for _ in 0..self.values_current_mini_block {
// TODO: load one batch at a time similar to int32
let delta = self
Expand Down
13 changes: 3 additions & 10 deletions parquet/src/util/bit_util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -603,11 +603,7 @@ impl BitReader {

// Advance byte_offset to next unread byte and read num_bytes
self.byte_offset += bytes_read;
let v = read_num_bytes!(
T,
num_bytes,
self.buffer.start_from(self.byte_offset).as_ref()
);
let v = read_num_bytes!(T, num_bytes, self.buffer.data()[self.byte_offset..]);
self.byte_offset += num_bytes;

// Reset buffered_values
Expand Down Expand Up @@ -657,11 +653,8 @@ impl BitReader {

fn reload_buffer_values(&mut self) {
let bytes_to_read = cmp::min(self.total_bytes - self.byte_offset, 8);
self.buffered_values = read_num_bytes!(
u64,
bytes_to_read,
self.buffer.start_from(self.byte_offset).as_ref()
);
self.buffered_values =
read_num_bytes!(u64, bytes_to_read, self.buffer.data()[self.byte_offset..]);
}
}

Expand Down