From eba5044687372787c88ba800d580b2183eae3325 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 24 Oct 2023 22:01:27 +0800 Subject: [PATCH 1/2] Prepare GH-38432 --- cpp/src/parquet/encoding.cc | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 5221f2588c0..8ad1112738e 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1205,16 +1205,22 @@ struct ArrowBinaryHelper { return Status::OK(); } - Status PrepareNextInput(int64_t next_value_length, - std::optional estimated_remaining_data_length = {}) { + Status PrepareNextInput(int64_t next_value_length) { + if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) { + // This element would exceed the capacity of a chunk + return PushChunk(); + } + return Status::OK(); + } + + Status PrepareNextInputWithEstimatedLength(int64_t next_value_length, + int64_t estimated_remaining_data_length) { if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) { // This element would exceed the capacity of a chunk RETURN_NOT_OK(PushChunk()); RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); - if (estimated_remaining_data_length.has_value()) { - RETURN_NOT_OK(acc_->builder->ReserveData( - std::min(*estimated_remaining_data_length, chunk_space_remaining_))); - } + RETURN_NOT_OK(acc_->builder->ReserveData( + std::min(estimated_remaining_data_length, chunk_space_remaining_))); } return Status::OK(); } @@ -1271,8 +1277,10 @@ struct ArrowBinaryHelper { return acc_->Reserve(entries_remaining_); } - Status PrepareNextInput(int64_t next_value_length, - std::optional estimated_remaining_data_length = {}) { + Status PrepareNextInput(int64_t next_value_length) { return Status::OK(); } + + Status PrepareNextInputWithEstimatedLength(int64_t next_value_length, + int64_t estimated_remaining_data_length) { return Status::OK(); } @@ -1421,7 +1429,7 @@ class PlainByteArrayDecoder : public PlainDecoder, if (ARROW_PREDICT_FALSE(len_ < increment)) { ParquetException::EofException(); } - RETURN_NOT_OK(helper.PrepareNextInput(value_len, len_)); + RETURN_NOT_OK(helper.PrepareNextInputWithEstimatedLength(value_len, len_)); helper.UnsafeAppend(data_ + 4, value_len); data_ += increment; len_ -= increment; @@ -1915,7 +1923,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int32_t indices[kBufferSize]; ArrowBinaryHelper helper(out, num_values); - RETURN_NOT_OK(helper.Prepare()); + // RETURN_NOT_OK(helper.Prepare()); auto dict_values = reinterpret_cast(dictionary_->data()); int values_decoded = 0; @@ -1983,7 +1991,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int values_decoded = 0; ArrowBinaryHelper helper(out, num_values); - RETURN_NOT_OK(helper.Prepare(len_)); + // RETURN_NOT_OK(helper.Prepare(len_)); auto dict_values = reinterpret_cast(dictionary_->data()); From fec597e6afada9168c4765335c104c79cc2f3cae Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 27 Oct 2023 16:08:06 +0800 Subject: [PATCH 2/2] cleanup --- cpp/src/parquet/encoding.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 8ad1112738e..bf9f15dc4f1 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1923,7 +1923,6 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int32_t indices[kBufferSize]; ArrowBinaryHelper helper(out, num_values); - // RETURN_NOT_OK(helper.Prepare()); auto dict_values = reinterpret_cast(dictionary_->data()); int values_decoded = 0; @@ -1991,7 +1990,6 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int values_decoded = 0; ArrowBinaryHelper helper(out, num_values); - // RETURN_NOT_OK(helper.Prepare(len_)); auto dict_values = reinterpret_cast(dictionary_->data());