-
Notifications
You must be signed in to change notification settings - Fork 4k
GH-33024: [C++][Parquet] Add DELTA_LENGTH_BYTE_ARRAY encoder to Parquet writer #14293
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e79c52d
7d6b25a
04464b3
24a566b
f5f2415
7de26ec
e31b69b
c94ad88
5adb923
2469f71
112699a
1508a84
c2fb6f3
3eab439
aeafc86
bb95765
3650276
6213858
b8e1dc0
eebc0b0
44282c4
b3cf11e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2572,6 +2572,129 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecoder<DTyp | |
| // ---------------------------------------------------------------------- | ||
| // DELTA_LENGTH_BYTE_ARRAY | ||
|
|
||
| // ---------------------------------------------------------------------- | ||
| // DeltaLengthByteArrayEncoder | ||
|
|
||
| template <typename DType> | ||
rok marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| class DeltaLengthByteArrayEncoder : public EncoderImpl, | ||
| virtual public TypedEncoder<ByteArrayType> { | ||
| public: | ||
| explicit DeltaLengthByteArrayEncoder(const ColumnDescriptor* descr, MemoryPool* pool) | ||
| : EncoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY, | ||
| pool = ::arrow::default_memory_pool()), | ||
| sink_(pool), | ||
| length_encoder_(nullptr, pool), | ||
| encoded_size_{0} {} | ||
|
|
||
| std::shared_ptr<Buffer> FlushValues() override; | ||
|
|
||
| int64_t EstimatedDataEncodedSize() override { | ||
| return encoded_size_ + length_encoder_.EstimatedDataEncodedSize(); | ||
| } | ||
|
|
||
| using TypedEncoder<ByteArrayType>::Put; | ||
|
|
||
| void Put(const ::arrow::Array& values) override; | ||
|
|
||
| void Put(const T* buffer, int num_values) override; | ||
|
|
||
| void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, | ||
| int64_t valid_bits_offset) override; | ||
|
|
||
| protected: | ||
| template <typename ArrayType> | ||
| void PutBinaryArray(const ArrayType& array) { | ||
| PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline<typename ArrayType::TypeClass>( | ||
| *array.data(), | ||
| [&](::std::string_view view) { | ||
| if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) { | ||
| return Status::Invalid("Parquet cannot store strings with size 2GB or more"); | ||
| } | ||
| length_encoder_.Put({static_cast<int32_t>(view.length())}, 1); | ||
| PARQUET_THROW_NOT_OK(sink_.Append(view.data(), view.length())); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am sorry for leading to any confusion from my previous review comments. The input
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Personally I agree with this, but maybe as an future optimization. The patch is already great and will be a bit complex if memcpy is used...
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll give it another try and report back.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Let's wait for pitrou's idea... Seems modifing same line back and back again is really a torment
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll just test locally and won't push changes.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I agree. Just want to clarify my purpose but not required for this patch.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should probably be a followup PR, since it's a non-trivial optimization. |
||
| return Status::OK(); | ||
| }, | ||
| []() { return Status::OK(); })); | ||
| } | ||
|
|
||
| ::arrow::BufferBuilder sink_; | ||
| DeltaBitPackEncoder<Int32Type> length_encoder_; | ||
| uint32_t encoded_size_; | ||
| }; | ||
|
|
||
| template <typename DType> | ||
| void DeltaLengthByteArrayEncoder<DType>::Put(const ::arrow::Array& values) { | ||
| AssertBaseBinary(values); | ||
| if (::arrow::is_binary_like(values.type_id())) { | ||
| PutBinaryArray(checked_cast<const ::arrow::BinaryArray&>(values)); | ||
| } else { | ||
| PutBinaryArray(checked_cast<const ::arrow::LargeBinaryArray&>(values)); | ||
| } | ||
| } | ||
|
|
||
| template <typename DType> | ||
| void DeltaLengthByteArrayEncoder<DType>::Put(const T* src, int num_values) { | ||
| if (num_values == 0) { | ||
| return; | ||
rok marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| constexpr int kBatchSize = 256; | ||
| std::array<int32_t, kBatchSize> lengths; | ||
| for (int idx = 0; idx < num_values; idx += kBatchSize) { | ||
| const int batch_size = std::min(kBatchSize, num_values - idx); | ||
| for (int j = 0; j < batch_size; ++j) { | ||
| const int32_t len = src[idx + j].len; | ||
| if (AddWithOverflow(encoded_size_, len, &encoded_size_)) { | ||
| throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); | ||
| } | ||
| lengths[j] = len; | ||
| } | ||
| length_encoder_.Put(lengths.data(), batch_size); | ||
| } | ||
|
|
||
| PARQUET_THROW_NOT_OK(sink_.Reserve(encoded_size_)); | ||
| for (int idx = 0; idx < num_values; idx++) { | ||
| sink_.UnsafeAppend(src[idx].ptr, src[idx].len); | ||
rok marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
| } | ||
|
|
||
| template <typename DType> | ||
| void DeltaLengthByteArrayEncoder<DType>::PutSpaced(const T* src, int num_values, | ||
| const uint8_t* valid_bits, | ||
| int64_t valid_bits_offset) { | ||
| if (valid_bits != NULLPTR) { | ||
| PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T), | ||
| this->memory_pool())); | ||
| T* data = reinterpret_cast<T*>(buffer->mutable_data()); | ||
| int num_valid_values = ::arrow::util::internal::SpacedCompress<T>( | ||
| src, num_values, valid_bits, valid_bits_offset, data); | ||
| Put(data, num_valid_values); | ||
| } else { | ||
| Put(src, num_values); | ||
wjones127 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
| } | ||
|
|
||
| template <typename DType> | ||
| std::shared_ptr<Buffer> DeltaLengthByteArrayEncoder<DType>::FlushValues() { | ||
| std::shared_ptr<Buffer> encoded_lengths = length_encoder_.FlushValues(); | ||
|
||
|
|
||
| std::shared_ptr<Buffer> data; | ||
| PARQUET_THROW_NOT_OK(sink_.Finish(&data)); | ||
| sink_.Reset(); | ||
|
|
||
| PARQUET_THROW_NOT_OK(sink_.Resize(encoded_lengths->size() + data->size())); | ||
| PARQUET_THROW_NOT_OK(sink_.Append(encoded_lengths->data(), encoded_lengths->size())); | ||
rok marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| PARQUET_THROW_NOT_OK(sink_.Append(data->data(), data->size())); | ||
|
|
||
| std::shared_ptr<Buffer> buffer; | ||
| PARQUET_THROW_NOT_OK(sink_.Finish(&buffer, true)); | ||
rok marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| encoded_size_ = 0; | ||
| return buffer; | ||
| } | ||
|
|
||
| // ---------------------------------------------------------------------- | ||
| // DeltaLengthByteArrayDecoder | ||
|
|
||
| class DeltaLengthByteArrayDecoder : public DecoderImpl, | ||
| virtual public TypedDecoder<ByteArrayType> { | ||
| public: | ||
|
|
@@ -2636,13 +2759,17 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, | |
| int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, | ||
| int64_t valid_bits_offset, | ||
| typename EncodingTraits<ByteArrayType>::Accumulator* out) override { | ||
| ParquetException::NYI("DecodeArrow for DeltaLengthByteArrayDecoder"); | ||
| int result = 0; | ||
| PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, | ||
| valid_bits_offset, out, &result)); | ||
| return result; | ||
| } | ||
|
|
||
| int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, | ||
| int64_t valid_bits_offset, | ||
| typename EncodingTraits<ByteArrayType>::DictAccumulator* out) override { | ||
| ParquetException::NYI("DecodeArrow for DeltaLengthByteArrayDecoder"); | ||
| ParquetException::NYI( | ||
| "DecodeArrow of DictAccumulator for DeltaLengthByteArrayDecoder"); | ||
| } | ||
|
|
||
| private: | ||
|
|
@@ -2664,6 +2791,44 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, | |
| num_valid_values_ = num_length; | ||
| } | ||
|
|
||
| Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, | ||
| int64_t valid_bits_offset, | ||
| typename EncodingTraits<ByteArrayType>::Accumulator* out, | ||
| int* out_num_values) { | ||
| ArrowBinaryHelper helper(out); | ||
|
|
||
| std::vector<ByteArray> values(num_values - null_count); | ||
| const int num_valid_values = Decode(values.data(), num_values - null_count); | ||
rok marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| if (ARROW_PREDICT_FALSE(num_values - null_count != num_valid_values)) { | ||
| throw ParquetException("Expected to decode ", num_values - null_count, | ||
| " values, but decoded ", num_valid_values, " values."); | ||
| } | ||
|
|
||
| auto values_ptr = values.data(); | ||
| int value_idx = 0; | ||
|
|
||
| RETURN_NOT_OK(VisitNullBitmapInline( | ||
| valid_bits, valid_bits_offset, num_values, null_count, | ||
| [&]() { | ||
| const auto& val = values_ptr[value_idx]; | ||
| if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) { | ||
| RETURN_NOT_OK(helper.PushChunk()); | ||
| } | ||
| RETURN_NOT_OK(helper.Append(val.ptr, static_cast<int32_t>(val.len))); | ||
| ++value_idx; | ||
| return Status::OK(); | ||
| }, | ||
| [&]() { | ||
| RETURN_NOT_OK(helper.AppendNull()); | ||
| --null_count; | ||
| return Status::OK(); | ||
| })); | ||
|
|
||
| DCHECK_EQ(null_count, 0); | ||
| *out_num_values = num_valid_values; | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| std::shared_ptr<::arrow::bit_util::BitReader> decoder_; | ||
| DeltaBitPackDecoder<Int32Type> len_decoder_; | ||
| int num_valid_values_; | ||
|
|
@@ -3075,7 +3240,6 @@ std::unique_ptr<Encoder> MakeEncoder(Type::type type_num, Encoding::type encodin | |
| return std::make_unique<ByteStreamSplitEncoder<DoubleType>>(descr, pool); | ||
| default: | ||
| throw ParquetException("BYTE_STREAM_SPLIT only supports FLOAT and DOUBLE"); | ||
| break; | ||
| } | ||
| } else if (encoding == Encoding::DELTA_BINARY_PACKED) { | ||
| switch (type_num) { | ||
|
|
@@ -3086,7 +3250,13 @@ std::unique_ptr<Encoder> MakeEncoder(Type::type type_num, Encoding::type encodin | |
| default: | ||
| throw ParquetException( | ||
| "DELTA_BINARY_PACKED encoder only supports INT32 and INT64"); | ||
| break; | ||
| } | ||
| } else if (encoding == Encoding::DELTA_LENGTH_BYTE_ARRAY) { | ||
| switch (type_num) { | ||
| case Type::BYTE_ARRAY: | ||
| return std::make_unique<DeltaLengthByteArrayEncoder<ByteArrayType>>(descr, pool); | ||
| default: | ||
| throw ParquetException("DELTA_LENGTH_BYTE_ARRAY only supports BYTE_ARRAY"); | ||
rok marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
| } else { | ||
| ParquetException::NYI("Selected encoding is not supported"); | ||
|
|
@@ -3126,7 +3296,6 @@ std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encodin | |
| return std::make_unique<ByteStreamSplitDecoder<DoubleType>>(descr); | ||
| default: | ||
| throw ParquetException("BYTE_STREAM_SPLIT only supports FLOAT and DOUBLE"); | ||
| break; | ||
| } | ||
| } else if (encoding == Encoding::DELTA_BINARY_PACKED) { | ||
| switch (type_num) { | ||
|
|
@@ -3137,7 +3306,6 @@ std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encodin | |
| default: | ||
| throw ParquetException( | ||
| "DELTA_BINARY_PACKED decoder only supports INT32 and INT64"); | ||
| break; | ||
| } | ||
| } else if (encoding == Encoding::DELTA_BYTE_ARRAY) { | ||
| if (type_num == Type::BYTE_ARRAY) { | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.