diff --git a/c_glib/arrow-glib/array-builder.cpp b/c_glib/arrow-glib/array-builder.cpp index 095c68d8768..afdae8c8b31 100644 --- a/c_glib/arrow-glib/array-builder.cpp +++ b/c_glib/arrow-glib/array-builder.cpp @@ -136,9 +136,7 @@ garrow_array_builder_append_nulls(GArrowArrayBuilder *builder, auto arrow_builder = static_cast(garrow_array_builder_get_raw(builder)); - uint8_t valid_bytes[n]; - memset(valid_bytes, 0, sizeof(uint8_t) * n); - auto status = arrow_builder->AppendNulls(valid_bytes, n); + auto status = arrow_builder->AppendNulls(n); return garrow_error_check(error, status, context); } diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 3b348de8ef2..dead8de4f6e 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -478,7 +478,11 @@ void TestPrimitiveBuilder::Check(const std::unique_ptr ASSERT_EQ(draws_[i] != 0, actual) << i; } } - ASSERT_TRUE(result->Equals(*expected)); + AssertArraysEqual(*result, *expected); + + // buffers are correctly sized + ASSERT_EQ(result->data()->buffers[0]->size(), BitUtil::BytesForBits(size)); + ASSERT_EQ(result->data()->buffers[1]->size(), BitUtil::BytesForBits(size)); // Builder is now reset ASSERT_EQ(0, builder->length()); @@ -518,15 +522,13 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) { TYPED_TEST(TestPrimitiveBuilder, TestAppendNulls) { const int64_t size = 10; - const uint8_t valid_bytes[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; - - ASSERT_OK(this->builder_->AppendNulls(valid_bytes, size)); + ASSERT_OK(this->builder_->AppendNulls(size)); std::shared_ptr result; FinishAndCheckPadding(this->builder_.get(), &result); for (int64_t i = 0; i < size; ++i) { - ASSERT_EQ(result->IsValid(i), static_cast(valid_bytes[i])); + ASSERT_FALSE(result->IsValid(i)); } } @@ -922,6 +924,27 @@ TYPED_TEST(TestPrimitiveBuilder, TestReserve) { ASSERT_EQ(BitUtil::NextPower2(kMinBuilderCapacity + 100), this->builder_->capacity()); } +TEST(TestBooleanBuilder, AppendNullsAdvanceBuilder) { + BooleanBuilder builder; + + std::vector values = {1, 0, 0, 1}; + std::vector is_valid = {1, 1, 0, 1}; + + std::shared_ptr arr; + ASSERT_OK(builder.AppendValues(values.data(), 2)); + ASSERT_OK(builder.AppendNulls(1)); + ASSERT_OK(builder.AppendValues(values.data() + 3, 1)); + ASSERT_OK(builder.Finish(&arr)); + + ASSERT_EQ(1, arr->null_count()); + + const auto& barr = static_cast(*arr); + ASSERT_TRUE(barr.Value(0)); + ASSERT_FALSE(barr.Value(1)); + ASSERT_TRUE(barr.IsNull(2)); + ASSERT_TRUE(barr.Value(3)); +} + TEST(TestBooleanBuilder, TestStdBoolVectorAppend) { BooleanBuilder builder; BooleanBuilder builder_nn; @@ -1391,13 +1414,12 @@ TEST_F(TestAdaptiveIntBuilder, TestAppendNull) { TEST_F(TestAdaptiveIntBuilder, TestAppendNulls) { constexpr int64_t size = 10; - const uint8_t valid_bytes[size] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; - ASSERT_OK(builder_->AppendNulls(valid_bytes, size)); + ASSERT_OK(builder_->AppendNulls(size)); Done(); for (unsigned index = 0; index < size; ++index) { - ASSERT_EQ(result_->IsValid(index), static_cast(valid_bytes[index])); + ASSERT_FALSE(result_->IsValid(index)); } } @@ -1526,13 +1548,12 @@ TEST_F(TestAdaptiveUIntBuilder, TestAppendNull) { TEST_F(TestAdaptiveUIntBuilder, TestAppendNulls) { constexpr int64_t size = 10; - const uint8_t valid_bytes[size] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; - ASSERT_OK(builder_->AppendNulls(valid_bytes, size)); + ASSERT_OK(builder_->AppendNulls(size)); Done(); for (unsigned index = 0; index < size; ++index) { - ASSERT_EQ(result_->IsValid(index), static_cast(valid_bytes[index])); + ASSERT_FALSE(result_->IsValid(index)); } } diff --git a/cpp/src/arrow/array/builder_adaptive.h b/cpp/src/arrow/array/builder_adaptive.h index 6523de41622..afbfca21aab 100644 --- a/cpp/src/arrow/array/builder_adaptive.h +++ b/cpp/src/arrow/array/builder_adaptive.h @@ -29,12 +29,13 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { public: explicit AdaptiveIntBuilderBase(MemoryPool* pool); - /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { + /// \brief Append multiple nulls + /// \param[in] length the number of nulls to append + Status AppendNulls(int64_t length) { ARROW_RETURN_NOT_OK(CommitPendingData()); ARROW_RETURN_NOT_OK(Reserve(length)); memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); - UnsafeAppendToBitmap(valid_bytes, length); + UnsafeSetNull(length); return Status::OK(); } diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index e8059007c34..75baedd92bd 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -106,4 +106,10 @@ void ArrayBuilder::UnsafeSetNotNull(int64_t length) { null_bitmap_builder_.UnsafeAppend(length, true); } +void ArrayBuilder::UnsafeSetNull(int64_t length) { + length_ += length; + null_count_ += length; + null_bitmap_builder_.UnsafeAppend(length, false); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index f4655fab0de..ebe1ecec0ed 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -162,6 +162,8 @@ class ARROW_EXPORT ArrayBuilder { // Set the next length bits to not null (i.e. valid). void UnsafeSetNotNull(int64_t length); + void UnsafeSetNull(int64_t length); + static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer); static Status CheckCapacity(int64_t new_capacity, int64_t old_capacity) { diff --git a/cpp/src/arrow/array/builder_primitive.cc b/cpp/src/arrow/array/builder_primitive.cc index a593f362dd2..13e8f2ee68d 100644 --- a/cpp/src/arrow/array/builder_primitive.cc +++ b/cpp/src/arrow/array/builder_primitive.cc @@ -45,104 +45,8 @@ Status NullBuilder::FinishInternal(std::shared_ptr* out) { return Status::OK(); } -// ---------------------------------------------------------------------- - -template -void PrimitiveBuilder::Reset() { - data_.reset(); - raw_data_ = nullptr; -} - -template -Status PrimitiveBuilder::Resize(int64_t capacity) { - RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - capacity = std::max(capacity, kMinBuilderCapacity); - - int64_t nbytes = TypeTraits::bytes_required(capacity); - if (capacity_ == 0) { - RETURN_NOT_OK(AllocateResizableBuffer(pool_, nbytes, &data_)); - } else { - RETURN_NOT_OK(data_->Resize(nbytes)); - } - - raw_data_ = reinterpret_cast(data_->mutable_data()); - return ArrayBuilder::Resize(capacity); -} - -template -Status PrimitiveBuilder::AppendValues(const value_type* values, int64_t length, - const uint8_t* valid_bytes) { - RETURN_NOT_OK(Reserve(length)); - - if (length > 0) { - std::memcpy(raw_data_ + length_, values, - static_cast(TypeTraits::bytes_required(length))); - } - - // length_ is update by these - ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); -} - -template -Status PrimitiveBuilder::AppendValues(const value_type* values, int64_t length, - const std::vector& is_valid) { - RETURN_NOT_OK(Reserve(length)); - DCHECK_EQ(length, static_cast(is_valid.size())); - - if (length > 0) { - std::memcpy(raw_data_ + length_, values, - static_cast(TypeTraits::bytes_required(length))); - } - - // length_ is update by these - ArrayBuilder::UnsafeAppendToBitmap(is_valid); - return Status::OK(); -} - -template -Status PrimitiveBuilder::AppendValues(const std::vector& values, - const std::vector& is_valid) { - return AppendValues(values.data(), static_cast(values.size()), is_valid); -} - -template -Status PrimitiveBuilder::AppendValues(const std::vector& values) { - return AppendValues(values.data(), static_cast(values.size())); -} - -template -Status PrimitiveBuilder::FinishInternal(std::shared_ptr* out) { - RETURN_NOT_OK(TrimBuffer(TypeTraits::bytes_required(length_), data_.get())); - std::shared_ptr null_bitmap; - RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - *out = ArrayData::Make(type_, length_, {null_bitmap, data_}, null_count_); - - data_ = nullptr; - capacity_ = length_ = null_count_ = 0; - - return Status::OK(); -} - -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; - BooleanBuilder::BooleanBuilder(MemoryPool* pool) - : ArrayBuilder(boolean(), pool), data_(nullptr), raw_data_(nullptr) {} + : ArrayBuilder(boolean(), pool), data_builder_(pool) {} BooleanBuilder::BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool) : BooleanBuilder(pool) { @@ -151,57 +55,23 @@ BooleanBuilder::BooleanBuilder(const std::shared_ptr& type, MemoryPool void BooleanBuilder::Reset() { ArrayBuilder::Reset(); - data_.reset(); - raw_data_ = nullptr; + data_builder_.Reset(); } Status BooleanBuilder::Resize(int64_t capacity) { RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); capacity = std::max(capacity, kMinBuilderCapacity); - - const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity); - if (capacity_ == 0) { - RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_bitmap_size, &data_)); - raw_data_ = reinterpret_cast(data_->mutable_data()); - - // We zero the memory for booleans to keep things simple; for some reason if - // we do not, even though we may write every bit (through in-place | or &), - // valgrind will still show a warning. If we do not zero the bytes here, we - // will have to be careful to zero them in AppendNull and AppendNulls. Also, - // zeroing the bits results in deterministic bits when each byte may have a - // mix of nulls and not nulls. - // - // We only zero up to new_bitmap_size because the padding was zeroed by - // AllocateResizableBuffer - memset(raw_data_, 0, static_cast(new_bitmap_size)); - } else { - const int64_t old_bitmap_capacity = data_->capacity(); - RETURN_NOT_OK(data_->Resize(new_bitmap_size)); - const int64_t new_bitmap_capacity = data_->capacity(); - raw_data_ = reinterpret_cast(data_->mutable_data()); - - // See comment above about why we zero memory for booleans - memset(raw_data_ + old_bitmap_capacity, 0, - static_cast(new_bitmap_capacity - old_bitmap_capacity)); - } - + RETURN_NOT_OK(data_builder_.Resize(capacity)); return ArrayBuilder::Resize(capacity); } Status BooleanBuilder::FinishInternal(std::shared_ptr* out) { - int64_t bit_offset = length_ % 8; - if (bit_offset > 0) { - // Adjust last byte - data_->mutable_data()[length_ / 8] &= BitUtil::kPrecedingBitmask[bit_offset]; - } - - std::shared_ptr null_bitmap; + std::shared_ptr data, null_bitmap; + RETURN_NOT_OK(data_builder_.Finish(&data)); RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), data_.get())); - *out = ArrayData::Make(boolean(), length_, {null_bitmap, data_}, null_count_); + *out = ArrayData::Make(boolean(), length_, {null_bitmap, data}, null_count_); - data_ = nullptr; capacity_ = length_ = null_count_ = 0; return Status::OK(); } @@ -211,10 +81,8 @@ Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, RETURN_NOT_OK(Reserve(length)); int64_t i = 0; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [values, &i]() -> bool { return values[i++] != 0; }); - - // this updates length_ + data_builder_.UnsafeAppend(length, + [values, &i]() -> bool { return values[i++] != 0; }); ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); return Status::OK(); } @@ -223,12 +91,9 @@ Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, const std::vector& is_valid) { RETURN_NOT_OK(Reserve(length)); DCHECK_EQ(length, static_cast(is_valid.size())); - int64_t i = 0; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [values, &i]() -> bool { return values[i++]; }); - - // this updates length_ + data_builder_.UnsafeAppend(length, + [values, &i]() -> bool { return values[i++]; }); ArrayBuilder::UnsafeAppendToBitmap(is_valid); return Status::OK(); } @@ -247,12 +112,9 @@ Status BooleanBuilder::AppendValues(const std::vector& values, const int64_t length = static_cast(values.size()); RETURN_NOT_OK(Reserve(length)); DCHECK_EQ(length, static_cast(is_valid.size())); - int64_t i = 0; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [&values, &i]() -> bool { return values[i++]; }); - - // this updates length_ + data_builder_.UnsafeAppend(length, + [&values, &i]() -> bool { return values[i++]; }); ArrayBuilder::UnsafeAppendToBitmap(is_valid); return Status::OK(); } @@ -260,12 +122,9 @@ Status BooleanBuilder::AppendValues(const std::vector& values, Status BooleanBuilder::AppendValues(const std::vector& values) { const int64_t length = static_cast(values.size()); RETURN_NOT_OK(Reserve(length)); - int64_t i = 0; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [&values, &i]() -> bool { return values[i++]; }); - - // this updates length_ + data_builder_.UnsafeAppend(length, + [&values, &i]() -> bool { return values[i++]; }); ArrayBuilder::UnsafeSetNotNull(length); return Status::OK(); } diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index bf3ec914bec..5a9b69483af 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -21,6 +21,7 @@ #include #include +#include "arrow/array.h" #include "arrow/array/builder_base.h" #include "arrow/type.h" @@ -42,37 +43,53 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { Status FinishInternal(std::shared_ptr* out) override; }; -template -class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { +/// Base class for all Builders that emit an Array of a scalar numerical type. +template +class NumericBuilder : public ArrayBuilder { public: - using value_type = typename Type::c_type; + using value_type = typename T::c_type; + using ArrayBuilder::ArrayBuilder; - explicit PrimitiveBuilder(const std::shared_ptr& type, MemoryPool* pool) - : ArrayBuilder(type, pool), data_(NULLPTR), raw_data_(NULLPTR) {} + template + explicit NumericBuilder( + typename std::enable_if::is_parameter_free, MemoryPool*>::type pool + ARROW_MEMORY_POOL_DEFAULT) + : ArrayBuilder(TypeTraits::type_singleton(), pool) {} - using ArrayBuilder::Advance; + /// Append a single scalar and increase the size if necessary. + Status Append(const value_type val) { + ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1)); + UnsafeAppend(val); + return Status::OK(); + } /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory /// The memory at the corresponding data slot is set to 0 to prevent /// uninitialized memory access - Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { + Status AppendNulls(int64_t length) { ARROW_RETURN_NOT_OK(Reserve(length)); - memset(raw_data_ + length_, 0, - static_cast(TypeTraits::bytes_required(length))); - UnsafeAppendToBitmap(valid_bytes, length); + data_builder_.UnsafeAppend(length, static_cast(0)); + UnsafeSetNull(length); return Status::OK(); } /// \brief Append a single null element Status AppendNull() { ARROW_RETURN_NOT_OK(Reserve(1)); - memset(raw_data_ + length_, 0, sizeof(value_type)); + data_builder_.UnsafeAppend(static_cast(0)); UnsafeAppendToBitmap(false); return Status::OK(); } - value_type GetValue(int64_t index) const { - return reinterpret_cast(data_->data())[index]; + value_type GetValue(int64_t index) const { return data_builder_.data()[index]; } + + void Reset() override { data_builder_.Reset(); } + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + capacity = std::max(capacity, kMinBuilderCapacity); + ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity)); + return ArrayBuilder::Resize(capacity); } /// \brief Append a sequence of elements in one shot @@ -82,7 +99,13 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { /// indicates a valid (non-null) value /// \return Status Status AppendValues(const value_type* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); + const uint8_t* valid_bytes = NULLPTR) { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values, length); + // length_ is update by these + ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } /// \brief Append a sequence of elements in one shot /// \param[in] values a contiguous C array of values @@ -91,7 +114,13 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { /// (0). Equal in length to values /// \return Status Status AppendValues(const value_type* values, int64_t length, - const std::vector& is_valid); + const std::vector& is_valid) { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values, length); + // length_ is update by these + ArrayBuilder::UnsafeAppendToBitmap(is_valid); + return Status::OK(); + } /// \brief Append a sequence of elements in one shot /// \param[in] values a std::vector of values @@ -99,25 +128,35 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { /// (0). Equal in length to values /// \return Status Status AppendValues(const std::vector& values, - const std::vector& is_valid); + const std::vector& is_valid) { + return AppendValues(values.data(), static_cast(values.size()), is_valid); + } /// \brief Append a sequence of elements in one shot /// \param[in] values a std::vector of values /// \return Status - Status AppendValues(const std::vector& values); + Status AppendValues(const std::vector& values) { + return AppendValues(values.data(), static_cast(values.size())); + } + + Status FinishInternal(std::shared_ptr* out) override { + std::shared_ptr data, null_bitmap; + ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); + ARROW_RETURN_NOT_OK(data_builder_.Finish(&data)); + *out = ArrayData::Make(type_, length_, {null_bitmap, data}, null_count_); + capacity_ = length_ = null_count_ = 0; + return Status::OK(); + } /// \brief Append a sequence of elements in one shot /// \param[in] values_begin InputIterator to the beginning of the values /// \param[in] values_end InputIterator pointing to the end of the values /// \return Status - template Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { int64_t length = static_cast(std::distance(values_begin, values_end)); ARROW_RETURN_NOT_OK(Reserve(length)); - - std::copy(values_begin, values_end, raw_data_ + length_); - + data_builder_.UnsafeAppend(values_begin, values_end); // this updates the length_ UnsafeSetNotNull(length); return Status::OK(); @@ -137,14 +176,11 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { "version instead"); int64_t length = static_cast(std::distance(values_begin, values_end)); ARROW_RETURN_NOT_OK(Reserve(length)); - - std::copy(values_begin, values_end, raw_data_ + length_); - - // this updates the length_ - for (int64_t i = 0; i != length; ++i) { - UnsafeAppendToBitmap(*valid_begin); - ++valid_begin; - } + data_builder_.UnsafeAppend(values_begin, values_end); + null_bitmap_builder_.UnsafeAppend( + length, [&valid_begin]() -> bool { return *valid_begin++; }); + length_ = null_bitmap_builder_.length(); + null_count_ = null_bitmap_builder_.false_count(); return Status::OK(); } @@ -154,71 +190,37 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { int64_t length = static_cast(std::distance(values_begin, values_end)); ARROW_RETURN_NOT_OK(Reserve(length)); - - std::copy(values_begin, values_end, raw_data_ + length_); - + data_builder_.UnsafeAppend(values_begin, values_end); // this updates the length_ if (valid_begin == NULLPTR) { UnsafeSetNotNull(length); } else { - for (int64_t i = 0; i != length; ++i) { - UnsafeAppendToBitmap(*valid_begin); - ++valid_begin; - } + null_bitmap_builder_.UnsafeAppend( + length, [&valid_begin]() -> bool { return *valid_begin++; }); + length_ = null_bitmap_builder_.length(); + null_count_ = null_bitmap_builder_.false_count(); } return Status::OK(); } - Status FinishInternal(std::shared_ptr* out) override; - void Reset() override; - - Status Resize(int64_t capacity) override; - - protected: - std::shared_ptr data_; - value_type* raw_data_; -}; - -/// Base class for all Builders that emit an Array of a scalar numerical type. -template -class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder { - public: - using typename PrimitiveBuilder::value_type; - using PrimitiveBuilder::PrimitiveBuilder; - - template - explicit NumericBuilder( - typename std::enable_if::is_parameter_free, MemoryPool*>::type pool - ARROW_MEMORY_POOL_DEFAULT) - : PrimitiveBuilder(TypeTraits::type_singleton(), pool) {} - - using ArrayBuilder::UnsafeAppendNull; - using ArrayBuilder::UnsafeAppendToBitmap; - using PrimitiveBuilder::AppendValues; - using PrimitiveBuilder::Resize; - using PrimitiveBuilder::Reserve; - - /// Append a single scalar and increase the size if necessary. - Status Append(const value_type val) { - ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1)); - UnsafeAppend(val); - return Status::OK(); - } - /// Append a single scalar under the assumption that the underlying Buffer is /// large enough. /// /// This method does not capacity-check; make sure to call Reserve /// beforehand. void UnsafeAppend(const value_type val) { - raw_data_[length_] = val; - UnsafeAppendToBitmap(true); + ArrayBuilder::UnsafeAppendToBitmap(true); + data_builder_.UnsafeAppend(val); + } + + void UnsafeAppendNull() { + ArrayBuilder::UnsafeAppendToBitmap(false); + data_builder_.UnsafeAppend(0); } protected: - using PrimitiveBuilder::length_; - using PrimitiveBuilder::raw_data_; + TypedBufferBuilder data_builder_; }; // Builders @@ -249,21 +251,17 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { explicit BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool); - using ArrayBuilder::Advance; - using ArrayBuilder::UnsafeAppendNull; - /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { + Status AppendNulls(int64_t length) { ARROW_RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(valid_bytes, length); - + data_builder_.UnsafeAppend(length, false); + UnsafeSetNull(length); return Status::OK(); } Status AppendNull() { ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(false); - + UnsafeAppendNull(); return Status::OK(); } @@ -278,14 +276,15 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { /// Scalar append, without checking for capacity void UnsafeAppend(const bool val) { - if (val) { - BitUtil::SetBit(raw_data_, length_); - } else { - BitUtil::ClearBit(raw_data_, length_); - } + data_builder_.UnsafeAppend(val); UnsafeAppendToBitmap(true); } + void UnsafeAppendNull() { + data_builder_.UnsafeAppend(false); + UnsafeAppendToBitmap(false); + } + void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); } /// \brief Append a sequence of elements in one shot @@ -340,10 +339,8 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { int64_t length = static_cast(std::distance(values_begin, values_end)); ARROW_RETURN_NOT_OK(Reserve(length)); - auto iter = values_begin; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [&iter]() -> bool { return *(iter++); }); - + data_builder_.UnsafeAppend( + length, [&values_begin]() -> bool { return *values_begin++; }); // this updates length_ UnsafeSetNotNull(length); return Status::OK(); @@ -364,15 +361,12 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { int64_t length = static_cast(std::distance(values_begin, values_end)); ARROW_RETURN_NOT_OK(Reserve(length)); - auto iter = values_begin; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [&iter]() -> bool { return *(iter++); }); - - // this updates length_ - for (int64_t i = 0; i != length; ++i) { - ArrayBuilder::UnsafeAppendToBitmap(*valid_begin); - ++valid_begin; - } + data_builder_.UnsafeAppend( + length, [&values_begin]() -> bool { return *values_begin++; }); + null_bitmap_builder_.UnsafeAppend( + length, [&valid_begin]() -> bool { return *valid_begin++; }); + length_ = null_bitmap_builder_.length(); + null_count_ = null_bitmap_builder_.false_count(); return Status::OK(); } @@ -382,21 +376,17 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { int64_t length = static_cast(std::distance(values_begin, values_end)); ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend( + length, [&values_begin]() -> bool { return *values_begin++; }); - auto iter = values_begin; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [&iter]() -> bool { return *(iter++); }); - - // this updates the length_ if (valid_begin == NULLPTR) { UnsafeSetNotNull(length); } else { - for (int64_t i = 0; i != length; ++i) { - ArrayBuilder::UnsafeAppendToBitmap(*valid_begin); - ++valid_begin; - } + null_bitmap_builder_.UnsafeAppend( + length, [&valid_begin]() -> bool { return *valid_begin++; }); } - + length_ = null_bitmap_builder_.length(); + null_count_ = null_bitmap_builder_.false_count(); return Status::OK(); } @@ -405,8 +395,7 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { Status Resize(int64_t capacity) override; protected: - std::shared_ptr data_; - uint8_t* raw_data_; + TypedBufferBuilder data_builder_; }; } // namespace arrow diff --git a/cpp/src/arrow/buffer-builder.h b/cpp/src/arrow/buffer-builder.h index b27fbd838f2..a843021e9c0 100644 --- a/cpp/src/arrow/buffer-builder.h +++ b/cpp/src/arrow/buffer-builder.h @@ -24,6 +24,7 @@ #include #include #include +#include #include "arrow/buffer.h" #include "arrow/status.h" @@ -57,7 +58,6 @@ class ARROW_EXPORT BufferBuilder { return Status::OK(); } int64_t old_capacity = capacity_; - if (buffer_ == NULLPTR) { ARROW_RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_capacity, &buffer_)); } else { @@ -131,6 +131,9 @@ class ARROW_EXPORT BufferBuilder { // Advance pointer and zero out memory Status Advance(const int64_t length) { return Append(length, 0); } + // Advance pointer, but don't allocate or zero memory + void UnsafeAdvance(const int64_t length) { size_ += length; } + // Unsafe methods don't check existing size void UnsafeAppend(const void* data, const int64_t length) { memcpy(data_ + size_, data, static_cast(length)); @@ -153,6 +156,7 @@ class ARROW_EXPORT BufferBuilder { /// \return Status Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit)); + if (size_ != 0) buffer_->ZeroPadding(); *out = buffer_; Reset(); return Status::OK(); @@ -211,6 +215,14 @@ class TypedBufferBuilder::value num_elements * sizeof(T)); } + template + void UnsafeAppend(Iter values_begin, Iter values_end) { + int64_t num_elements = static_cast(std::distance(values_begin, values_end)); + auto data = mutable_data() + length(); + bytes_builder_.UnsafeAdvance(num_elements * sizeof(T)); + std::copy(values_begin, values_end, data); + } + void UnsafeAppend(const int64_t num_copies, T value) { auto data = mutable_data() + length(); bytes_builder_.UnsafeAppend(num_copies * sizeof(T), 0); @@ -284,7 +296,7 @@ class TypedBufferBuilder { int64_t i = 0; internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] { bool value = bytes[i++]; - if (!value) ++false_count_; + false_count_ += !value; return value; }); bit_length_ += num_elements; @@ -292,12 +304,27 @@ class TypedBufferBuilder { void UnsafeAppend(const int64_t num_copies, bool value) { BitUtil::SetBitsTo(mutable_data(), bit_length_, num_copies, value); - if (!value) { - false_count_ += num_copies; - } + false_count_ += num_copies * !value; bit_length_ += num_copies; } + template + void UnsafeAppend(const int64_t num_elements, Generator&& gen) { + if (num_elements == 0) return; + + if (count_falses) { + internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] { + bool value = gen(); + false_count_ += !value; + return value; + }); + } else { + internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, + std::forward(gen)); + } + bit_length_ += num_elements; + } + Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { const int64_t old_byte_capacity = bytes_builder_.capacity(); const int64_t new_byte_capacity = BitUtil::BytesForBits(new_capacity); @@ -320,6 +347,9 @@ class TypedBufferBuilder { } Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { + // set bytes_builder_.size_ == byte size of data + bytes_builder_.UnsafeAdvance(BitUtil::BytesForBits(bit_length_) - + bytes_builder_.length()); bit_length_ = false_count_ = 0; return bytes_builder_.Finish(out, shrink_to_fit); } diff --git a/cpp/src/arrow/buffer-test.cc b/cpp/src/arrow/buffer-test.cc index 9ae96e7832d..e503b286013 100644 --- a/cpp/src/arrow/buffer-test.cc +++ b/cpp/src/arrow/buffer-test.cc @@ -350,6 +350,8 @@ TEST(TestBufferBuilder, BasicBoolBufferBuilderUsage) { for (int i = 0; i != nvalues; ++i) { ASSERT_EQ(BitUtil::GetBit(built->data(), i + 1), static_cast(values[i])); } + + ASSERT_EQ(built->size(), BitUtil::BytesForBits(nvalues + 1)); } TEST(TestBufferBuilder, BoolBufferBuilderAppendCopies) { @@ -367,6 +369,8 @@ TEST(TestBufferBuilder, BoolBufferBuilderAppendCopies) { for (int i = 0; i != 13 + 17; ++i) { EXPECT_EQ(BitUtil::GetBit(built->data(), i), i < 13) << "index = " << i; } + + ASSERT_EQ(built->size(), BitUtil::BytesForBits(13 + 17)); } template