Skip to content

Commit

Permalink
GH-41562: [C++][Parquet] Decoding: Fix num_value handling in ByteStre…
Browse files Browse the repository at this point in the history
…amSplitDecoder (#41565)

### Rationale for this change

This problem is raised from  #40094 . Original bug fixed here: #34140 , but this is corrupt in #40094 .

### What changes are included in this PR?

Refine checking

### Are these changes tested?

* [x] Will add

### Are there any user-facing changes?

Bugfix

* GitHub Issue: #41562

Authored-by: mwish <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
mapleFU authored and raulcd committed May 8, 2024
1 parent dcfeceb commit 6cfebb9
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 7 deletions.
22 changes: 17 additions & 5 deletions cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3694,12 +3694,24 @@ class ByteStreamSplitDecoderBase : public DecoderImpl,
ByteStreamSplitDecoderBase(const ColumnDescriptor* descr, int byte_width)
: DecoderImpl(descr, Encoding::BYTE_STREAM_SPLIT), byte_width_(byte_width) {}

void SetData(int num_values, const uint8_t* data, int len) override {
if (static_cast<int64_t>(num_values) * byte_width_ != len) {
throw ParquetException("Data size (" + std::to_string(len) +
") does not match number of values in BYTE_STREAM_SPLIT (" +
std::to_string(num_values) + ")");
void SetData(int num_values, const uint8_t* data, int len) final {
// Check that the data size is consistent with the number of values
// The spec requires that the data size is a multiple of the number of values,
// see: https://github.com/apache/parquet-format/pull/192 .
// GH-41562: passed in `num_values` may include nulls, so we need to check and
// adjust the number of values.
if (static_cast<int64_t>(num_values) * byte_width_ < len) {
throw ParquetException(
"Data size (" + std::to_string(len) +
") is too small for the number of values in in BYTE_STREAM_SPLIT (" +
std::to_string(num_values) + ")");
}
if (len % byte_width_ != 0) {
throw ParquetException("ByteStreamSplit data size " + std::to_string(len) +
" not aligned with type " + TypeToString(DType::type_num) +
" and byte_width: " + std::to_string(byte_width_));
}
num_values = len / byte_width_;
DecoderImpl::SetData(num_values, data, len);
stride_ = num_values_;
}
Expand Down
5 changes: 5 additions & 0 deletions cpp/src/parquet/encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,11 @@ class Decoder {

// Sets the data for a new page. This will be called multiple times on the same
// decoder and should reset all internal state.
//
// `num_values` comes from the data page header, and may be greater than the number of
// physical values in the data buffer if there are some omitted (null) values.
// `len`, on the other hand, is the size in bytes of the data buffer and
// directly relates to the number of physical values.
virtual void SetData(int num_values, const uint8_t* data, int len) = 0;

// Returns the number of values left (for the last call to SetData()). This is
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/parquet/encoding_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1383,7 +1383,7 @@ class TestByteStreamSplitEncoding : public TestEncodingBase<Type> {
encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset);
encode_buffer_ = encoder->FlushValues();
ASSERT_EQ(encode_buffer_->size(), physical_byte_width() * (num_values_ - null_count));
decoder->SetData(num_values_ - null_count, encode_buffer_->data(),
decoder->SetData(num_values_, encode_buffer_->data(),
static_cast<int>(encode_buffer_->size()));
auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, null_count,
valid_bits, valid_bits_offset);
Expand Down Expand Up @@ -1717,7 +1717,7 @@ class TestDeltaBitPackEncoding : public TestEncodingBase<Type> {
for (size_t i = 0; i < kNumRoundTrips; ++i) {
encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset);
encode_buffer_ = encoder->FlushValues();
decoder->SetData(num_values_ - null_count, encode_buffer_->data(),
decoder->SetData(num_values_, encode_buffer_->data(),
static_cast<int>(encode_buffer_->size()));
auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, null_count,
valid_bits, valid_bits_offset);
Expand Down

0 comments on commit 6cfebb9

Please sign in to comment.