3 changes: 2 additions & 1 deletion cpp/CMakePresets.json
@@ -444,7 +444,8 @@
"CMAKE_CXX_COMPILER": "clang++",
"ARROW_IPC": "ON",
"ARROW_PARQUET": "ON",
"ARROW_FUZZING": "ON"
"ARROW_FUZZING": "ON",
"ARROW_WITH_SNAPPY": "ON"
}
},
{
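Presumably the point of turning on ARROW_WITH_SNAPPY in this preset is to let the fuzz targets exercise Snappy-compressed Parquet data; the diff itself does not say so, so treat that as an assumption. A quick way to confirm that a given build actually ships the codec is the `arrow::util::Codec::IsAvailable` query. The snippet below is my own illustration, not part of this change:

```cpp
#include <iostream>

#include "arrow/util/compression.h"

int main() {
  // Reports whether this Arrow build was compiled with Snappy support
  // (i.e. with ARROW_WITH_SNAPPY=ON); Snappy-compressed Parquet pages
  // cannot be decoded without it.
  std::cout << "Snappy available: " << std::boolalpha
            << arrow::util::Codec::IsAvailable(arrow::Compression::SNAPPY)
            << std::endl;
  return 0;
}
```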
14 changes: 9 additions & 5 deletions cpp/src/arrow/util/rle_encoding_internal.h
@@ -657,13 +657,14 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const
const auto header_bytes = bit_util::ParseLeadingLEB128(data_, kMaxSize, &run_len_type);

if (ARROW_PREDICT_FALSE(header_bytes == 0)) {
- // Malfomrmed LEB128 data
+ // Malformed LEB128 data
return {0, ControlFlow::Break};
}

const bool is_bit_packed = run_len_type & 1;
const uint32_t count = run_len_type >> 1;
if (is_bit_packed) {
+ // Bit-packed run
constexpr auto kMaxCount = bit_util::CeilDiv(internal::max_size_for_v<rle_size_t>, 8);
if (ARROW_PREDICT_FALSE(count == 0 || count > kMaxCount)) {
// Illegal number of encoded values
@@ -672,17 +672,21 @@

ARROW_DCHECK_LT(static_cast<uint64_t>(count) * 8,
internal::max_size_for_v<rle_size_t>);
+ // Count Already divided by 8 for byte size calculations
+ const auto bytes_read = header_bytes + static_cast<int64_t>(count) * value_bit_width_;
+ if (ARROW_PREDICT_FALSE(bytes_read > data_size_)) {
+   // Bit-packed run would overflow data buffer
+   return {0, ControlFlow::Break};
+ }
const auto values_count = static_cast<rle_size_t>(count * 8);
- // Count Already divided by 8
- const auto bytes_read =
-     header_bytes + static_cast<rle_size_t>(count) * value_bit_width_;

auto control = handler.OnBitPackedRun(
BitPackedRun(data_ + header_bytes, values_count, value_bit_width_));

return {bytes_read, control};
}

+ // RLE run
if (ARROW_PREDICT_FALSE(count == 0)) {
// Illegal number of encoded values
return {0, ControlFlow::Break};
@@ -1079,7 +1084,6 @@ auto RleBitPackedDecoder<T>::GetSpaced(Converter converter,
// There may be remaining null if they are not greedily filled by either decoder calls
check_and_handle_fully_null_remaining();

- ARROW_DCHECK(batch.is_done() || exhausted());
Member Author:
This check could trigger if the RLE-bit-packed data is invalid (for example a run of invalid size). @AntoinePrv

return batch.total_read();
}

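For context on what the new bounds check guards: a Parquet RLE/bit-packed run starts with a LEB128 header whose low bit selects the run type and whose remaining bits carry the count (groups of 8 values for bit-packed runs, a repeat count for RLE runs). A bit-packed group of 8 values occupies exactly `bit_width` bytes, so the run payload is `count * bit_width` bytes and must fit in the remaining buffer. The sketch below restates that layout with plain types and hypothetical names (`RunHeader`, `ParseRunHeader`); it is an illustration of the format, not Arrow's internal API:

```cpp
#include <cstdint>
#include <optional>

// Hypothetical types for illustration only; Arrow's parser uses its own.
struct RunHeader {
  bool is_bit_packed;     // low bit of the decoded LEB128 header
  uint32_t count;         // bit-packed: number of 8-value groups; RLE: repeat count
  int64_t payload_bytes;  // run payload following the LEB128 header
};

// `header` is the decoded LEB128 value, `header_bytes` its encoded size,
// `bit_width` the per-value width in bits, `data_size` the bytes left in the buffer.
std::optional<RunHeader> ParseRunHeader(uint32_t header, int64_t header_bytes,
                                        int bit_width, int64_t data_size) {
  RunHeader run;
  run.is_bit_packed = (header & 1) != 0;
  run.count = header >> 1;
  if (run.count == 0) {
    return std::nullopt;  // empty runs are invalid in both encodings
  }
  if (run.is_bit_packed) {
    // 8 values at bit_width bits each = bit_width bytes per group.
    run.payload_bytes = static_cast<int64_t>(run.count) * bit_width;
  } else {
    // RLE: one repeated value stored in ceil(bit_width / 8) bytes.
    run.payload_bytes = (bit_width + 7) / 8;
  }
  // Same kind of check the hunk above adds: reject runs that would read
  // past the end of the buffer instead of asserting later.
  if (header_bytes + run.payload_bytes > data_size) {
    return std::nullopt;
  }
  return run;
}
```

Rejecting such runs at parse time is also consistent with dropping the `ARROW_DCHECK(batch.is_done() || exhausted())` in `GetSpaced` above: on malformed input the parser now breaks early, so neither condition is guaranteed to hold.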
7 changes: 5 additions & 2 deletions cpp/src/parquet/decoder.cc
@@ -2082,9 +2082,12 @@ class DeltaByteArrayDecoderImpl : public TypedDecoderImpl<DType> {
int64_t valid_bits_offset,
typename EncodingTraits<DType>::Accumulator* out,
int* out_num_values) {
- std::vector<ByteArray> values(num_values);
+ std::vector<ByteArray> values(num_values - null_count);
Member:
Aha...

Member Author:
Well, this was not a problem in itself, it was just allocating too much memory :)

const int num_valid_values = GetInternal(values.data(), num_values - null_count);
- DCHECK_EQ(num_values - null_count, num_valid_values);
+ if (ARROW_PREDICT_FALSE(num_values - null_count != num_valid_values)) {
+   throw ParquetException("Expected to decode ", num_values - null_count,
+                          " values, but decoded ", num_valid_values, " values.");
+ }

auto visit_binary_helper = [&](auto* helper) {
auto values_ptr = reinterpret_cast<const ByteArray*>(values.data());
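The pattern in this last hunk, and in the reviewer exchange above, is that quantities derived from an untrusted file are validated with a runtime check that throws, rather than a `DCHECK` that compiles away in release builds, and that the scratch buffer is sized to the values actually decoded (`num_values - null_count`) instead of being over-allocated. A minimal standalone sketch of that pattern, not Arrow's code (`CheckDecodedCount`, `DecodeNonNull`, and `Decoder` are hypothetical):

```cpp
#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical helper: enforce a decoded-count invariant with an exception
// instead of a debug-only assertion, since the input file is untrusted.
void CheckDecodedCount(int expected, int actual) {
  if (expected != actual) {
    throw std::runtime_error("Expected to decode " + std::to_string(expected) +
                             " values, but decoded " + std::to_string(actual) +
                             " values.");
  }
}

// Usage sketch: size the buffer to the non-null values only, decode into it,
// then validate before using the contents.
template <typename T, typename Decoder>
std::vector<T> DecodeNonNull(Decoder& decoder, int num_values, int null_count) {
  std::vector<T> values(num_values - null_count);
  const int decoded = decoder.Decode(values.data(), num_values - null_count);
  CheckDecodedCount(num_values - null_count, decoded);
  return values;
}
```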