Skip to content

Commit

Permalink
Fix uninitialized data when reading Parquet RLE-encoded data with nulls,
Browse files Browse the repository at this point in the history
also fix UBSAN issue when converting null Int96 entries to Arrow timestamp.
  • Loading branch information
pitrou committed Sep 16, 2019
1 parent e9c1847 commit 8fdb0bd
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 1 deletion.
7 changes: 7 additions & 0 deletions cpp/src/arrow/util/rle_encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,9 @@ class RleDecoder {
int GetBatchWithDict(const T* dictionary, T* values, int batch_size);

/// Like GetBatchWithDict but add spacing for null entries
///
/// Null entries will be zero-initialized in `values` to avoid leaking
/// private data.
template <typename T>
int GetBatchWithDictSpaced(const T* dictionary, T* values, int batch_size,
int null_count, const uint8_t* valid_bits,
Expand Down Expand Up @@ -433,6 +436,8 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* out, int b
DCHECK_GE(bit_width_, 0);
int values_read = 0;
int remaining_nulls = null_count;
T zero;
memset(&zero, 0, sizeof(T));

arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, batch_size);

Expand Down Expand Up @@ -484,6 +489,7 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* out, int b
*out = dictionary[indices[literals_read]];
literals_read++;
} else {
*out = zero;
skipped++;
}
++out;
Expand All @@ -494,6 +500,7 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* out, int b
remaining_nulls -= skipped;
}
} else {
*out = zero;
++out;
values_read++;
remaining_nulls--;
Expand Down
8 changes: 7 additions & 1 deletion cpp/src/parquet/arrow/reader_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -738,7 +738,13 @@ Status TransferInt96(RecordReader* reader, MemoryPool* pool,
RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * sizeof(int64_t), &data));
auto data_ptr = reinterpret_cast<int64_t*>(data->mutable_data());
for (int64_t i = 0; i < length; i++) {
*data_ptr++ = Int96GetNanoSeconds(values[i]);
if (values[i].value[2] == 0) {
// Happens for null entries: avoid triggering UBSAN as that Int96 timestamp
// isn't representable as a 64-bit Unix timestamp.
*data_ptr++ = 0;
} else {
*data_ptr++ = Int96GetNanoSeconds(values[i]);
}
}
*out = std::make_shared<TimestampArray>(type, length, data, reader->ReleaseIsValid(),
reader->null_count());
Expand Down

0 comments on commit 8fdb0bd

Please sign in to comment.