Skip to content
36 changes: 30 additions & 6 deletions cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1196,24 +1196,40 @@ struct ArrowBinaryHelper<ByteArrayType> {
chunk_space_remaining_(::arrow::kBinaryMemoryLimit -
acc_->builder->value_data_length()) {}

// Prepare will reserve the number of entries remaining in the current chunk.
// If estimated_data_length is provided, it will also reserve the estimated data length,
// and the caller should better call `UnsafeAppend` instead of `Append` to avoid
// double-checking the data length.
Status Prepare(std::optional<int64_t> estimated_data_length = {}) {
RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_));
if (estimated_data_length.has_value()) {
RETURN_NOT_OK(acc_->builder->ReserveData(
std::min<int64_t>(*estimated_data_length, ::arrow::kBinaryMemoryLimit)));
std::min<int64_t>(*estimated_data_length, this->chunk_space_remaining_)));
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using ::arrow::kBinaryMemoryLimit is too large here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also I think this might fixes #38577

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this make a practical difference? We're taking std::min here...

Copy link
Member Author

@mapleFU mapleFU Nov 24, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it have.

I think this might be pratical difference in corner case, estimated_data_length means remaining data in current page, and this->chunk_space_remaining_ means the remain data in current chunk. They're different.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

( However this might never triggered, it's a bit hard to construct a boundary case like this. I just think this->chunk_space_remaining_ should be more safety

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can confirm that this change actually fixes the regression (#38577 (comment)).

When the decoder/builder is reused (eg reading a second row group or batch of a row group), chunk_space_remaining_ is smaller than kBinaryMemoryLimit. So if estimated_data_length is larger than chunk_space_remaining_, we are trying to reserve too many values.

}
return Status::OK();
}

Status PrepareNextInput(int64_t next_value_length) {
if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) {
// This element would exceed the capacity of a chunk
RETURN_NOT_OK(PushChunk());
RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_));
}
return Status::OK();
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What difference does it make to have a separate method?

Copy link
Member Author

@mapleFU mapleFU Nov 24, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://godbolt.org/z/5bf1K55a5

Found it might not optimized with hot path. Prepare is not in hot path, but PrepareNextInput is.


// If estimated_remaining_data_length is provided, it will also reserve the estimated
// data length, and the caller should better call `UnsafeAppend` instead of
// `Append` to avoid double-checking the data length.
Status PrepareNextInput(int64_t next_value_length,
std::optional<int64_t> estimated_remaining_data_length = {}) {
int64_t estimated_remaining_data_length) {
if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) {
// This element would exceed the capacity of a chunk
RETURN_NOT_OK(PushChunk());
RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_));
if (estimated_remaining_data_length.has_value()) {
if (estimated_remaining_data_length) {
RETURN_NOT_OK(acc_->builder->ReserveData(
std::min<int64_t>(*estimated_remaining_data_length, chunk_space_remaining_)));
std::min<int64_t>(estimated_remaining_data_length, chunk_space_remaining_)));
}
}
return Status::OK();
Expand Down Expand Up @@ -1271,8 +1287,10 @@ struct ArrowBinaryHelper<FLBAType> {
return acc_->Reserve(entries_remaining_);
}

Status PrepareNextInput(int64_t next_value_length) { return Status::OK(); }

Status PrepareNextInput(int64_t next_value_length,
std::optional<int64_t> estimated_remaining_data_length = {}) {
int64_t estimated_remaining_data_length) {
return Status::OK();
}

Expand Down Expand Up @@ -1915,6 +1933,9 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>,
int32_t indices[kBufferSize];

ArrowBinaryHelper<ByteArrayType> helper(out, num_values);
// The `len_` in the ByteArrayDictDecoder is the total length of the
// RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve
// space for binary data.
RETURN_NOT_OK(helper.Prepare());

auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
Expand Down Expand Up @@ -1983,7 +2004,10 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>,
int values_decoded = 0;

ArrowBinaryHelper<ByteArrayType> helper(out, num_values);
RETURN_NOT_OK(helper.Prepare(len_));
// The `len_` in the ByteArrayDictDecoder is the total length of the
// RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve
// space for binary data.
RETURN_NOT_OK(helper.Prepare());

auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());

Expand Down