Skip to content

Commit d620685

Browse files
committed
GH-47740: [C++][Parquet] Fix UB/crashes when reading invalid Parquet data
1 parent bf342b2 commit d620685

File tree

4 files changed

+17
-9
lines changed

4 files changed

+17
-9
lines changed

cpp/CMakePresets.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -444,7 +444,8 @@
444444
"CMAKE_CXX_COMPILER": "clang++",
445445
"ARROW_IPC": "ON",
446446
"ARROW_PARQUET": "ON",
447-
"ARROW_FUZZING": "ON"
447+
"ARROW_FUZZING": "ON",
448+
"ARROW_WITH_SNAPPY": "ON"
448449
}
449450
},
450451
{

cpp/src/arrow/util/rle_encoding_internal.h

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -657,13 +657,14 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const
657657
const auto header_bytes = bit_util::ParseLeadingLEB128(data_, kMaxSize, &run_len_type);
658658

659659
if (ARROW_PREDICT_FALSE(header_bytes == 0)) {
660-
// Malfomrmed LEB128 data
660+
// Malformed LEB128 data
661661
return {0, ControlFlow::Break};
662662
}
663663

664664
const bool is_bit_packed = run_len_type & 1;
665665
const uint32_t count = run_len_type >> 1;
666666
if (is_bit_packed) {
667+
// Bit-packed run
667668
constexpr auto kMaxCount = bit_util::CeilDiv(internal::max_size_for_v<rle_size_t>, 8);
668669
if (ARROW_PREDICT_FALSE(count == 0 || count > kMaxCount)) {
669670
// Illegal number of encoded values
@@ -672,17 +673,21 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const
672673

673674
ARROW_DCHECK_LT(static_cast<uint64_t>(count) * 8,
674675
internal::max_size_for_v<rle_size_t>);
676+
// Count Already divided by 8 for byte size calculations
677+
const auto bytes_read = header_bytes + static_cast<int64_t>(count) * value_bit_width_;
678+
if (ARROW_PREDICT_FALSE(bytes_read > data_size_)) {
679+
// Bit-packed run would overflow data buffer
680+
return {0, ControlFlow::Break};
681+
}
675682
const auto values_count = static_cast<rle_size_t>(count * 8);
676-
// Count Already divided by 8
677-
const auto bytes_read =
678-
header_bytes + static_cast<rle_size_t>(count) * value_bit_width_;
679683

680684
auto control = handler.OnBitPackedRun(
681685
BitPackedRun(data_ + header_bytes, values_count, value_bit_width_));
682686

683687
return {bytes_read, control};
684688
}
685689

690+
// RLE run
686691
if (ARROW_PREDICT_FALSE(count == 0)) {
687692
// Illegal number of encoded values
688693
return {0, ControlFlow::Break};
@@ -1079,7 +1084,6 @@ auto RleBitPackedDecoder<T>::GetSpaced(Converter converter,
10791084
// There may be remaining null if they are not greedily filled by either decoder calls
10801085
check_and_handle_fully_null_remaining();
10811086

1082-
ARROW_DCHECK(batch.is_done() || exhausted());
10831087
return batch.total_read();
10841088
}
10851089

cpp/src/parquet/decoder.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2082,9 +2082,12 @@ class DeltaByteArrayDecoderImpl : public TypedDecoderImpl<DType> {
20822082
int64_t valid_bits_offset,
20832083
typename EncodingTraits<DType>::Accumulator* out,
20842084
int* out_num_values) {
2085-
std::vector<ByteArray> values(num_values);
2085+
std::vector<ByteArray> values(num_values - null_count);
20862086
const int num_valid_values = GetInternal(values.data(), num_values - null_count);
2087-
DCHECK_EQ(num_values - null_count, num_valid_values);
2087+
if (ARROW_PREDICT_FALSE(num_values - null_count != num_valid_values)) {
2088+
throw ParquetException("Expected to decode ", num_values - null_count,
2089+
" values, but decoded ", num_valid_values, " values.");
2090+
}
20882091

20892092
auto visit_binary_helper = [&](auto* helper) {
20902093
auto values_ptr = reinterpret_cast<const ByteArray*>(values.data());

0 commit comments

Comments
 (0)