From cc7f851aec6cd6bc88355b8a3d4aa8c82f296ee1 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 13 Apr 2016 07:46:52 +0000 Subject: [PATCH 01/25] add Validate method to array and implementation for ListArray --- cpp/src/arrow/array.cc | 5 +++ cpp/src/arrow/array.h | 4 ++ cpp/src/arrow/types/list-test.cc | 75 +++++++++++++++++++++++++------- cpp/src/arrow/types/list.cc | 57 +++++++++++++++++++++++- cpp/src/arrow/types/list.h | 8 +++- 5 files changed, 132 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index a1536861a20..c6b9b1599cd 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -20,6 +20,7 @@ #include #include "arrow/util/buffer.h" +#include "arrow/util/status.h" namespace arrow { @@ -47,6 +48,10 @@ bool Array::EqualsExact(const Array& other) const { return true; } +Status Array::Validate() const { + return Status::OK(); +} + bool NullArray::Equals(const std::shared_ptr& arr) const { if (this == arr.get()) { return true; } if (Type::NA != arr->type_enum()) { return false; } diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index c6735f87d8f..af62c46edf2 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -28,6 +28,7 @@ namespace arrow { class Buffer; +class Status; // Immutable data array with some logical type and some length. Any memory is // owned by the respective Buffer instance (or its parents). @@ -58,6 +59,9 @@ class Array { bool EqualsExact(const Array& arr) const; virtual bool Equals(const std::shared_ptr& arr) const = 0; + // Determines if the array is internally consistent. Defaults to always + // returning Status::OK. This can be an expensive check. + virtual Status Validate() const; protected: std::shared_ptr type_; diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index aa34f23cc02..050cc320c34 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -94,6 +94,7 @@ TEST_F(TestListBuilder, TestAppendNull) { Done(); + ASSERT_OK(result_->Validate()); ASSERT_TRUE(result_->IsNull(0)); ASSERT_TRUE(result_->IsNull(1)); @@ -105,6 +106,30 @@ TEST_F(TestListBuilder, TestAppendNull) { ASSERT_EQ(0, values->length()); } +void ValidateBasicListArray(const ListArray* result, const vector& values, + const vector& is_null) { + ASSERT_OK(result->Validate()); + ASSERT_EQ(1, result->null_count()); + ASSERT_EQ(0, result->values()->null_count()); + + ASSERT_EQ(3, result->length()); + vector ex_offsets = {0, 3, 3, 7}; + for (size_t i = 0; i < ex_offsets.size(); ++i) { + ASSERT_EQ(ex_offsets[i], result->offset(i)); + } + + for (int i = 0; i < result->length(); ++i) { + ASSERT_EQ(static_cast(is_null[i]), result->IsNull(i)); + } + + ASSERT_EQ(7, result->values()->length()); + Int32Array* varr = static_cast(result->values().get()); + + for (size_t i = 0; i < values.size(); ++i) { + ASSERT_EQ(values[i], varr->Value(i)); + } +} + TEST_F(TestListBuilder, TestBasics) { vector values = {0, 1, 2, 3, 4, 5, 6}; vector lengths = {3, 0, 4}; @@ -112,8 +137,8 @@ TEST_F(TestListBuilder, TestBasics) { Int32Builder* vb = static_cast(builder_->value_builder().get()); - EXPECT_OK(builder_->Reserve(lengths.size())); - EXPECT_OK(vb->Reserve(values.size())); + ASSERT_OK(builder_->Reserve(lengths.size())); + ASSERT_OK(vb->Reserve(values.size())); int pos = 0; for (size_t i = 0; i < lengths.size(); ++i) { @@ -124,31 +149,51 @@ TEST_F(TestListBuilder, TestBasics) { } Done(); + ValidateBasicListArray(result_.get(), values, is_null); +} + +TEST_F(TestListBuilder, BulkAppend) { + vector values = {0, 1, 2, 3, 4, 5, 6}; + vector lengths = {3, 0, 4}; + vector is_null = {0, 1, 0}; + vector is_valid = {1, 0, 1}; + vector offsets = {0, 3, 3}; - ASSERT_EQ(1, result_->null_count()); - ASSERT_EQ(0, result_->values()->null_count()); + Int32Builder* vb = static_cast(builder_->value_builder().get()); + ASSERT_OK(vb->Reserve(values.size())); - ASSERT_EQ(3, result_->length()); - vector ex_offsets = {0, 3, 3, 7}; - for (size_t i = 0; i < ex_offsets.size(); ++i) { - ASSERT_EQ(ex_offsets[i], result_->offset(i)); + builder_->Append(offsets.data(), offsets.size(), is_valid.data()); + for (int32_t value : values) { + vb->Append(value); } + Done(); + ValidateBasicListArray(result_.get(), values, is_null); +} - for (int i = 0; i < result_->length(); ++i) { - ASSERT_EQ(static_cast(is_null[i]), result_->IsNull(i)); - } +TEST_F(TestListBuilder, BulkAppendInvalid) { + vector values = {0, 1, 2, 3, 4, 5, 6}; + vector lengths = {3, 0, 4}; + vector is_null = {0, 1, 0}; + vector is_valid = {1, 0, 1}; + vector offsets = {0, 2, 4}; // should be 0, 3, 3 given the is_null array - ASSERT_EQ(7, result_->values()->length()); - Int32Array* varr = static_cast(result_->values().get()); + Int32Builder* vb = static_cast(builder_->value_builder().get()); + ASSERT_OK(vb->Reserve(values.size())); - for (size_t i = 0; i < values.size(); ++i) { - ASSERT_EQ(values[i], varr->Value(i)); + builder_->Append(offsets.data(), offsets.size(), is_valid.data()); + builder_->Append(offsets.data(), offsets.size(), is_valid.data()); + for (int32_t value : values) { + vb->Append(value); } + + Done(); + ASSERT_RAISES(Invalid, result_->Validate()); } TEST_F(TestListBuilder, TestZeroLength) { // All buffers are null Done(); + ASSERT_OK(result_->Validate()); } } // namespace arrow diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 23f12ddc4ec..931420b1ea5 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -14,9 +14,10 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. - #include "arrow/types/list.h" +#include + namespace arrow { bool ListArray::EqualsExact(const ListArray& other) const { @@ -41,4 +42,58 @@ bool ListArray::Equals(const std::shared_ptr& arr) const { return EqualsExact(*static_cast(arr.get())); } +Status ListArray::Validate() const { + if (length_ == 0) { return Status::OK(); } + if (length_ < 0) { return Status::Invalid("Length was negative"); } + if (!offset_buf_) { + return Status::Invalid("offset_buf_ is null with non-zero_length"); + } + if (offset_buf_->size() / sizeof(int32_t) < length_) { + std::stringstream ss; + ss << "offset buffer size: " << offset_buf_->size() + << " isn't large enough for length: " << length_; + return Status::Invalid(ss.str()); + } + const int32_t last_offset = offset(length_); + if (last_offset > 0) { + if (!values_) { + return Status::Invalid("last offset was non-zero and values was null"); + } + if (values_->length() != last_offset) { + std::stringstream ss; + ss << "Final offset invariant not equal to values length: " << last_offset + << "!=" << values_->length(); + return Status::Invalid(ss.str()); + } + + const Status child_valid = values_->Validate(); + if (!child_valid.ok()) { + std::stringstream ss; + ss << "Child array invalid: " << child_valid.ToString(); + return Status::Invalid(ss.str()); + } + } + + int32_t prev_offset = offset(0); + if (prev_offset != 0) { return Status::Invalid("The first offset wasn't zero"); } + for (int32_t i = 1; i <= length_; ++i) { + int32_t current_offset = offset(i); + if (IsNull(i - 1) && current_offset != prev_offset) { + std::stringstream ss; + ss << "Offset invariant failure at: " << i << " inconsistent offsets for null slot" + << current_offset << "!=" << prev_offset; + return Status::Invalid(ss.str()); + } + if (current_offset < prev_offset) { + std::stringstream ss; + ss << "Offset invariant failure: " << i + << " inconsistent offset for non-null slot: " << current_offset << "<" + << prev_offset; + return Status::Invalid(ss.str()); + } + prev_offset = current_offset; + } + return Status::OK(); +} + } // namespace arrow diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 6b815460ecb..dd825199729 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -28,6 +28,7 @@ #include "arrow/types/primitive.h" #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" +#include "arrow/util/logging.h" #include "arrow/util/status.h" namespace arrow { @@ -46,11 +47,16 @@ class ListArray : public Array { values_ = values; } - virtual ~ListArray() {} + Status Validate() const override; + + virtual ~ListArray() = default; // Return a shared pointer in case the requestor desires to share ownership // with this array. const std::shared_ptr& values() const { return values_; } + const std::shared_ptr offset_buffer() const { + return std::static_pointer_cast(offset_buf_); + } const std::shared_ptr& value_type() const { return values_->type(); } From 01c50bebd2d8560f19e3be95940a8d4bc2ac642b Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 13 Apr 2016 07:54:27 +0000 Subject: [PATCH 02/25] Add utility methods for managing null bitmap directly to ArrayBuilder --- cpp/src/arrow/builder.cc | 56 ++++++++++++++++++++++++++++++++++++++++ cpp/src/arrow/builder.h | 27 +++++++++++++++++-- 2 files changed, 81 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 1447078f760..93b93eabd8c 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -25,6 +25,25 @@ namespace arrow { +Status ArrayBuilder::AppendToBitmap(bool is_null) { + if (length_ == capacity_) { + // If the capacity was not already a multiple of 2, do so here + // TODO(emkornfield) doubling isn't great default allocation practice + // see https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md + // fo discussion + RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); + } + UnsafeAppendToBitmap(is_null); + return Status::OK(); +} + +Status ArrayBuilder::AppendToBitmap(const uint8_t* valid_bytes, int32_t length) { + Reserve(length); + + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); +} + Status ArrayBuilder::Init(int32_t capacity) { capacity_ = capacity; int32_t to_alloc = util::ceil_byte(capacity) / 8; @@ -36,6 +55,7 @@ Status ArrayBuilder::Init(int32_t capacity) { } Status ArrayBuilder::Resize(int32_t new_bits) { + if (!null_bitmap_) { return Init(new_bits); } int32_t new_bytes = util::ceil_byte(new_bits) / 8; int32_t old_bytes = null_bitmap_->size(); RETURN_NOT_OK(null_bitmap_->Resize(new_bytes)); @@ -56,10 +76,46 @@ Status ArrayBuilder::Advance(int32_t elements) { Status ArrayBuilder::Reserve(int32_t elements) { if (length_ + elements > capacity_) { + // TODO(emkornfield) power of 2 growth is potentially suboptimal int32_t new_capacity = util::next_power2(length_ + elements); return Resize(new_capacity); } return Status::OK(); } +Status ArrayBuilder::SetNotNull(int32_t length) { + RETURN_NOT_OK(Reserve(length)); + UnsafeSetNotNull(length); + return Status::OK(); +} + +void ArrayBuilder::UnsafeAppendToBitmap(bool is_null) { + if (is_null) { + ++null_count_; + } else { + util::set_bit(null_bitmap_data_, length_); + } + ++length_; +} + +void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int32_t length) { + if (valid_bytes == nullptr) { + UnsafeSetNotNull(length); + return; + } + for (int32_t i = 0; i < length; ++i) { + // TODO(emkornfield) Optimize for large values of length? + AppendToBitmap(valid_bytes[i] == 0); + } +} + +void ArrayBuilder::UnsafeSetNotNull(int32_t length) { + const int32_t new_length = length + length_; + // TODO(emkornfield) Optimize for large values of length? + for (int32_t i = length_; i < new_length; ++i) { + util::set_bit(null_bitmap_data_, i); + } + length_ = new_length; +} + } // namespace arrow diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 21a6341ef50..e2fb11ffc84 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -34,7 +34,10 @@ class PoolBuffer; static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 5; -// Base class for all data array builders +// Base class for all data array builders. +// This class provides a facilities for incrementally building the null bitmap +// (see Append methods) and as a side effect the current number of slots and +// the null count. class ArrayBuilder { public: explicit ArrayBuilder(MemoryPool* pool, const TypePtr& type) @@ -58,6 +61,14 @@ class ArrayBuilder { int32_t null_count() const { return null_count_; } int32_t capacity() const { return capacity_; } + // Append to null bitmap + Status AppendToBitmap(bool is_null); + // Vector append. Treat each zero byte as a null. If valid_bytes is null + // assume all of length bits are valid. + Status AppendToBitmap(const uint8_t* valid_bytes, int32_t length); + // Set the next length bits to not null (i.e. valid). + Status SetNotNull(int32_t length); + // Allocates requires memory at this level, but children need to be // initialized independently Status Init(int32_t capacity); @@ -75,7 +86,7 @@ class ArrayBuilder { const std::shared_ptr& null_bitmap() const { return null_bitmap_; } // Creates new array object to hold the contents of the builder and transfers - // ownership of the data + // ownership of the data. This resets all variables on the builder. virtual std::shared_ptr Finish() = 0; const std::shared_ptr& type() const { return type_; } @@ -97,6 +108,18 @@ class ArrayBuilder { // Child value array builders. These are owned by this class std::vector> children_; + // + // Unsafe operations (don't check capacity/don't resize) + // + + // Append to null bitmap. + void UnsafeAppendToBitmap(bool is_null); + // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null + // assume all of length bits are valid. + void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int32_t length); + // Set the next length bits to not null (i.e. valid). + void UnsafeSetNotNull(int32_t length); + private: DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); }; From 3895d347414512462cf3dc3d792516a76c3a90c5 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 13 Apr 2016 07:57:53 +0000 Subject: [PATCH 03/25] Refactor list builder to use ArrayBuilders bitmap methods and a separate buffer builder --- cpp/src/arrow/types/list.h | 97 ++++++++++++++++++++++-------------- cpp/src/arrow/types/string.h | 4 +- 2 files changed, 61 insertions(+), 40 deletions(-) diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index dd825199729..2c6c6666afa 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -84,27 +84,51 @@ class ListArray : public Array { // // To use this class, you must append values to the child array builder and use // the Append function to delimit each distinct list value (once the values -// have been appended to the child array) -class ListBuilder : public Int32Builder { +// have been appended to the child array) or use the bulk API to append +// a sequence of offests and null values. +// +// A note on types. Per arrow/type.h all types in the c++ implementation are +// logical so even though this class always builds an Array of lists, this can +// represent multiple different logical types. If no logical type is provided +// at construction time, the class defaults to List where t is take from the +// value_builder/values that the object is constructed with. +class ListBuilder : public ArrayBuilder { public: + // Use this constructor to incrementally build the value array along with offsets and + // null bitmap. + ListBuilder(MemoryPool* pool, std::shared_ptr value_builder, + const TypePtr& type = nullptr) + : ArrayBuilder( + pool, type ? type : std::static_pointer_cast( + std::make_shared(value_builder->type()))), + offset_builder_(pool), + value_builder_(value_builder) {} + + // Use this constructor to build the list with a pre-existing values array ListBuilder( - MemoryPool* pool, const TypePtr& type, std::shared_ptr value_builder) - : Int32Builder(pool, type), value_builder_(value_builder) {} + MemoryPool* pool, std::shared_ptr values, const TypePtr& type = nullptr) + : ArrayBuilder(pool, type ? type : std::static_pointer_cast( + std::make_shared(values->type()))), + offset_builder_(pool), + values_(values) {} Status Init(int32_t elements) { - // One more than requested. - // - // XXX: This is slightly imprecise, because we might trigger null mask - // resizes that are unnecessary when creating arrays with power-of-two size - return Int32Builder::Init(elements + 1); + RETURN_NOT_OK(ArrayBuilder::Init(elements)); + // one more then requested for offsets + return offset_builder_.Resize(elements + 1); } Status Resize(int32_t capacity) { - // Need space for the end offset - RETURN_NOT_OK(Int32Builder::Resize(capacity + 1)); + // +1 because we Need space for the end offset + RETURN_NOT_OK(offset_builder_.Resize((capacity + 1) * sizeof(int32_t))); + return ArrayBuilder::Resize(capacity); + } - // Slight hack, as the "real" capacity is one less - --capacity_; + Status Reserve(int32_t elements) { + if (length_ + elements > capacity_) { + int32_t new_capacity = util::next_power2(length_ + elements); + return Resize(new_capacity); + } return Status::OK(); } @@ -112,31 +136,30 @@ class ListBuilder : public Int32Builder { // // If passed, valid_bytes is of equal length to values, and any zero byte // will be considered as a null for that slot - Status Append(value_type* values, int32_t length, uint8_t* valid_bytes = nullptr) { - if (length_ + length > capacity_) { - int32_t new_capacity = util::next_power2(length_ + length); - RETURN_NOT_OK(Resize(new_capacity)); - } - memcpy(raw_data_ + length_, values, type_traits::bytes_required(length)); - - if (valid_bytes != nullptr) { AppendNulls(valid_bytes, length); } - - length_ += length; + Status Append( + const int32_t* offsets, int32_t length, const uint8_t* valid_bytes = nullptr) { + RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, length); + offset_builder_.UnsafeAppend(offsets, length); return Status::OK(); } + // The same as Finalize but allows for overridding the c++ type template std::shared_ptr Transfer() { - std::shared_ptr items = value_builder_->Finish(); + std::shared_ptr items = values_; + if (!items) { items = value_builder_->Finish(); } // Add final offset if the length is non-zero - if (length_) { raw_data_[length_] = items->length(); } + if (length_) { offset_builder_.UnsafeAppend(items->length()); } + const auto offsets_buffer = offset_builder_.Finish(); auto result = std::make_shared( - type_, length_, data_, items, null_count_, null_bitmap_); + type_, length_, offsets_buffer, items, null_count_, null_bitmap_); - data_ = null_bitmap_ = nullptr; + // TODO(emkornfield) make a reset method capacity_ = length_ = null_count_ = 0; + null_bitmap_ = nullptr; return result; } @@ -148,25 +171,23 @@ class ListBuilder : public Int32Builder { // This function should be called before beginning to append elements to the // value builder Status Append(bool is_null = false) { - if (length_ == capacity_) { - // If the capacity was not already a multiple of 2, do so here - RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); - } - if (is_null) { - ++null_count_; - } else { - util::set_bit(null_bitmap_data_, length_); - } - raw_data_[length_++] = value_builder_->length(); + RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(is_null); + RETURN_NOT_OK(offset_builder_.Append(value_builder_->length())); return Status::OK(); } Status AppendNull() { return Append(true); } - const std::shared_ptr& value_builder() const { return value_builder_; } + const std::shared_ptr& value_builder() const { + DCHECK(!values_) << "Using value builder is pointless when values_ is set"; + return value_builder_; + } protected: + BufferBuilder offset_builder_; std::shared_ptr value_builder_; + std::shared_ptr values_; }; } // namespace arrow diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index c5cbe1058c7..f5874db45db 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -89,11 +89,11 @@ class StringArray : public ListArray { const uint8_t* raw_bytes_; }; -// Array builder +// String builder class StringBuilder : public ListBuilder { public: explicit StringBuilder(MemoryPool* pool, const TypePtr& type) - : ListBuilder(pool, type, std::make_shared(pool, value_type_)) { + : ListBuilder(pool, std::make_shared(pool, value_type_), type) { byte_builder_ = static_cast(value_builder_.get()); } From 20f984b1e07f874de330f52b1a7cfa6c7a233fc7 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 13 Apr 2016 08:02:34 +0000 Subject: [PATCH 04/25] refactor primitive builders to use parent builders bitmap --- cpp/src/arrow/types/primitive.cc | 32 ++++--------------------------- cpp/src/arrow/types/primitive.h | 33 +++++++++++++++++--------------- 2 files changed, 22 insertions(+), 43 deletions(-) diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 9549c47b411..ef00cd37f36 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -119,30 +119,12 @@ Status PrimitiveBuilder::Append( memcpy(raw_data_ + length_, values, type_traits::bytes_required(length)); } - if (valid_bytes != nullptr) { - PrimitiveBuilder::AppendNulls(valid_bytes, length); - } else { - for (int i = 0; i < length; ++i) { - util::set_bit(null_bitmap_data_, length_ + i); - } - } + // length_ is update by these + ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); - length_ += length; return Status::OK(); } -template -void PrimitiveBuilder::AppendNulls(const uint8_t* valid_bytes, int32_t length) { - // If valid_bytes is all not null, then none of the values are null - for (int i = 0; i < length; ++i) { - if (valid_bytes[i] == 0) { - ++null_count_; - } else { - util::set_bit(null_bitmap_data_, length_ + i); - } - } -} - template std::shared_ptr PrimitiveBuilder::Finish() { std::shared_ptr result = std::make_shared::ArrayType>( @@ -166,14 +148,8 @@ Status PrimitiveBuilder::Append( } } - if (valid_bytes != nullptr) { - PrimitiveBuilder::AppendNulls(valid_bytes, length); - } else { - for (int i = 0; i < length; ++i) { - util::set_bit(null_bitmap_data_, length_ + i); - } - } - length_ += length; + // this updates length_ + ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); return Status::OK(); } diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index fcd3db4e96e..5621a311eea 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -95,15 +95,13 @@ class PrimitiveBuilder : public ArrayBuilder { using ArrayBuilder::Advance; // Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - void AppendNulls(const uint8_t* valid_bytes, int32_t length); + void AppendNulls(const uint8_t* valid_bytes, int32_t length) { + UnsafeAppendToBitmap(valid_bytes, length); + } Status AppendNull() { - if (length_ == capacity_) { - // If the capacity was not already a multiple of 2, do so here - RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); - } - ++null_count_; - ++length_; + RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(true); return Status::OK(); } @@ -122,15 +120,15 @@ class PrimitiveBuilder : public ArrayBuilder { std::shared_ptr Finish() override; - protected: - std::shared_ptr data_; - value_type* raw_data_; - Status Init(int32_t capacity); // Increase the capacity of the builder to accommodate at least the indicated // number of elements Status Resize(int32_t capacity); + + protected: + std::shared_ptr data_; + value_type* raw_data_; }; template @@ -140,9 +138,17 @@ class NumericBuilder : public PrimitiveBuilder { using PrimitiveBuilder::PrimitiveBuilder; using PrimitiveBuilder::Append; + using PrimitiveBuilder::Init; + using PrimitiveBuilder::Resize; - // Scalar append. Does not capacity-check; make sure to call Reserve beforehand + // Scalar append. void Append(value_type val) { + PrimitiveBuilder::Reserve(1); + UnsafeAppend(val); + } + + // Does not capacity-check; make sure to call Reserve beforehand + void UnsafeAppend(value_type val) { util::set_bit(null_bitmap_data_, length_); raw_data_[length_++] = val; } @@ -151,9 +157,6 @@ class NumericBuilder : public PrimitiveBuilder { using PrimitiveBuilder::length_; using PrimitiveBuilder::null_bitmap_data_; using PrimitiveBuilder::raw_data_; - - using PrimitiveBuilder::Init; - using PrimitiveBuilder::Resize; }; template <> From 137448589537e0fccad7e54397e9f5f88f2b2601 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 13 Apr 2016 07:48:29 +0000 Subject: [PATCH 05/25] augment python unittest to have null element in list --- python/pyarrow/tests/test_array.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index d608f8167df..bf5a22089cd 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -31,14 +31,15 @@ def test_getitem_NA(self): assert arr[1] is pyarrow.NA def test_list_format(self): - arr = pyarrow.from_pylist([[1], None, [2, 3]]) + arr = pyarrow.from_pylist([[1], None, [2, 3, None]]) result = fmt.array_format(arr) expected = """\ [ [1], NA, [2, - 3] + 3, + NA] ]""" assert result == expected From 45e41c0aad0a1ca4712b37e9720791f500f5674b Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 13 Apr 2016 07:49:13 +0000 Subject: [PATCH 06/25] Make BufferBuilder more useable for appending primitives --- cpp/src/arrow/util/buffer.h | 59 ++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h index 56532be8070..5ef0076953c 100644 --- a/cpp/src/arrow/util/buffer.h +++ b/cpp/src/arrow/util/buffer.h @@ -23,6 +23,7 @@ #include #include +#include "arrow/util/bit-util.h" #include "arrow/util/macros.h" #include "arrow/util/status.h" @@ -137,26 +138,64 @@ class BufferBuilder { public: explicit BufferBuilder(MemoryPool* pool) : pool_(pool), capacity_(0), size_(0) {} + Status Resize(int32_t elements) { + if (capacity_ == 0) { buffer_ = std::make_shared(pool_); } + capacity_ = elements; + RETURN_NOT_OK(buffer_->Resize(capacity_)); + data_ = buffer_->mutable_data(); + return Status::OK(); + } + Status Append(const uint8_t* data, int length) { - if (capacity_ < length + size_) { - if (capacity_ == 0) { buffer_ = std::make_shared(pool_); } - capacity_ = std::max(MIN_BUFFER_CAPACITY, capacity_); - while (capacity_ < length + size_) { - capacity_ *= 2; - } - RETURN_NOT_OK(buffer_->Resize(capacity_)); - data_ = buffer_->mutable_data(); - } + if (capacity_ < length + size_) { RETURN_NOT_OK(Resize(length + size_)); } + UnsafeAppend(data, length); + return Status::OK(); + } + + template + Status Append(T arithmetic_value) { + static_assert(std::is_arithmetic::value, + "Convenience buffer append only supports arithmetic types"); + return Append(reinterpret_cast(&arithmetic_value), sizeof(T)); + } + + template + Status Append(const T* arithmetic_values, int num_elements) { + static_assert(std::is_arithmetic::value, + "Convenience buffer append only supports arithmetic types"); + return Append( + reinterpret_cast(arithmetic_values), num_elements * sizeof(T)); + } + + // Unsafe methods don't check existing size + void UnsafeAppend(const uint8_t* data, int length) { memcpy(data_ + size_, data, length); size_ += length; - return Status::OK(); + } + + template + void UnsafeAppend(T arithmetic_value) { + static_assert(std::is_arithmetic::value, + "Convenience buffer append only supports arithmetic types"); + UnsafeAppend(reinterpret_cast(&arithmetic_value), sizeof(T)); + } + + template + void UnsafeAppend(const T* arithmetic_values, int num_elements) { + static_assert(std::is_arithmetic::value, + "Convenience buffer append only supports arithmetic types"); + UnsafeAppend( + reinterpret_cast(arithmetic_values), num_elements * sizeof(T)); } std::shared_ptr Finish() { auto result = buffer_; buffer_ = nullptr; + capacity_ = size_ = 0; return result; } + int capacity() { return capacity_; } + int length() { return size_; } private: std::shared_ptr buffer_; From 5f87aefe55489f8388505b072a791cd958b42d26 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 13 Apr 2016 07:51:09 +0000 Subject: [PATCH 07/25] Refactor ipc-adapter-test to make it paramaterizable. add unit test for lists. make unit test pass and and construction method for list arrays --- cpp/src/arrow/ipc/adapter.cc | 94 +++++++++++++++------ cpp/src/arrow/ipc/ipc-adapter-test.cc | 112 +++++++++++++++++--------- cpp/src/arrow/types/construct.cc | 41 ++++++++-- cpp/src/arrow/types/construct.h | 9 +++ 4 files changed, 191 insertions(+), 65 deletions(-) diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 2f72c3aa846..d61ff0c31fc 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -19,6 +19,7 @@ #include #include +#include #include #include "arrow/array.h" @@ -31,6 +32,7 @@ #include "arrow/type.h" #include "arrow/types/construct.h" #include "arrow/types/primitive.h" +#include "arrow/types/list.h" #include "arrow/util/buffer.h" #include "arrow/util/logging.h" #include "arrow/util/status.h" @@ -63,32 +65,56 @@ static bool IsPrimitive(const DataType* type) { } } +static bool IsListType(const DataType* type) { + DCHECK(type != nullptr); + switch (type->type) { + // TODO(emkornfield) grouping like this are used in a few places in the + // code consider using pattern like: + // http://stackoverflow.com/questions/26784685/c-macro-for-calling-function-based-on-enum-type + // + // TODO(emkornfield) Fix type systems so these are all considered lists and + // the types behave the same way? + // case Type::BINARY: + // case Type::CHAR: + case Type::LIST: + // see todo on common types + // case Type::STRING: + // case Type::VARCHAR: + return true; + default: + return false; + } +} + // ---------------------------------------------------------------------- // Row batch write path Status VisitArray(const Array* arr, std::vector* field_nodes, std::vector>* buffers) { - if (IsPrimitive(arr->type().get())) { - const PrimitiveArray* prim_arr = static_cast(arr); - - field_nodes->push_back( - flatbuf::FieldNode(prim_arr->length(), prim_arr->null_count())); + DCHECK(arr); + DCHECK(field_nodes); + // push back all common elements + field_nodes->push_back(flatbuf::FieldNode(arr->length(), arr->null_count())); + if (arr->null_count() > 0) { + buffers->push_back(arr->null_bitmap()); + } else { + // Push a dummy zero-length buffer, not to be copied + buffers->push_back(std::make_shared(nullptr, 0)); + } - if (prim_arr->null_count() > 0) { - buffers->push_back(prim_arr->null_bitmap()); - } else { - // Push a dummy zero-length buffer, not to be copied - buffers->push_back(std::make_shared(nullptr, 0)); - } + const auto arr_type = arr->type().get(); + if (IsPrimitive(arr_type)) { + const PrimitiveArray* prim_arr = static_cast(arr); buffers->push_back(prim_arr->data()); - } else if (arr->type_enum() == Type::LIST) { - // TODO(wesm) - return Status::NotImplemented("List type"); + } else if (IsListType(arr_type)) { + const ListArray* list_arr = static_cast(arr); + buffers->push_back(list_arr->offset_buffer()); + // TODO(emkornfield) limit recursion depth + RETURN_NOT_OK(VisitArray(list_arr->values().get(), field_nodes, buffers)); } else if (arr->type_enum() == Type::STRUCT) { // TODO(wesm) return Status::NotImplemented("Struct type"); } - return Status::OK(); } @@ -214,7 +240,7 @@ class RowBatchReader::Impl { // Traverse the flattened record batch metadata and reassemble the // corresponding array containers Status NextArray(const Field* field, std::shared_ptr* out) { - const std::shared_ptr& type = field->type; + const TypePtr& type = field->type; // pop off a field if (field_index_ >= num_flattened_fields_) { @@ -226,15 +252,17 @@ class RowBatchReader::Impl { // we can skip that buffer without reading from shared memory FieldMetadata field_meta = metadata_->field(field_index_++); + // extract null_bitmap which is common to all arrays + std::shared_ptr null_bitmap; + if (field_meta.null_count == 0) { + null_bitmap = nullptr; + ++buffer_index_; + } else { + RETURN_NOT_OK(GetBuffer(buffer_index_++, &null_bitmap)); + } + if (IsPrimitive(type.get())) { - std::shared_ptr null_bitmap; std::shared_ptr data; - if (field_meta.null_count == 0) { - null_bitmap = nullptr; - ++buffer_index_; - } else { - RETURN_NOT_OK(GetBuffer(buffer_index_++, &null_bitmap)); - } if (field_meta.length > 0) { RETURN_NOT_OK(GetBuffer(buffer_index_++, &data)); } else { @@ -243,6 +271,26 @@ class RowBatchReader::Impl { return MakePrimitiveArray( type, field_meta.length, data, field_meta.null_count, null_bitmap, out); } + if (IsListType(type.get())) { + std::shared_ptr offsets; + if (field_meta.length > 0) { + RETURN_NOT_OK(GetBuffer(buffer_index_++, &offsets)); + } else { + offsets.reset(new Buffer(nullptr, 0)); + } + const int num_children = type->num_children(); + if (num_children != 1) { + std::stringstream ss; + ss << "Field: " << field->ToString() + << " has wrong number of children:" << num_children; + return Status::Invalid(ss.str()); + } + std::shared_ptr values_array; + // TODO(emkornfield): limit recursion depth? + RETURN_NOT_OK(NextArray(type->child(0).get(), &values_array)); + return MakeListArray(type, field_meta.length, offsets, values_array, + field_meta.null_count, null_bitmap, out); + } return Status::NotImplemented("Non-primitive types not complete yet"); } diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index fbdda77e491..3d678da2105 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -15,11 +15,13 @@ // specific language governing permissions and limitations // under the License. +#include #include #include #include #include #include +#include #include #include #include @@ -31,6 +33,7 @@ #include "arrow/ipc/test-common.h" #include "arrow/test-util.h" +#include "arrow/types/list.h" #include "arrow/types/primitive.h" #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" @@ -40,15 +43,37 @@ namespace arrow { namespace ipc { -class TestWriteRowBatch : public ::testing::Test, public MemoryMapFixture { +// TODO(emkornfield) convert to google style kInt32, etc? +const auto INT32 = std::make_shared(); +const auto LIST_INT32 = std::make_shared(INT32); +const auto LIST_LIST_INT32 = std::make_shared(LIST_INT32); + +typedef Status MakeRowBatch(std::shared_ptr* out); + +class TestWriteRowBatch : public ::testing::TestWithParam, + public MemoryMapFixture { public: void SetUp() { pool_ = default_memory_pool(); } void TearDown() { MemoryMapFixture::TearDown(); } - void InitMemoryMap(int64_t size) { + Status InitMemoryMap(int64_t size) { std::string path = "test-write-row-batch"; MemoryMapFixture::CreateFile(path, size); - ASSERT_OK(MemoryMappedSource::Open(path, MemorySource::READ_WRITE, &mmap_)); + return MemoryMappedSource::Open(path, MemorySource::READ_WRITE, &mmap_); + } + + Status RoundTripHelper(const RowBatch& batch, int memory_map_size, + std::shared_ptr* batch_result) { + InitMemoryMap(memory_map_size); + int64_t header_location; + RETURN_NOT_OK(WriteRowBatch(mmap_.get(), &batch, 0, &header_location)); + + std::shared_ptr reader; + RETURN_NOT_OK(RowBatchReader::Open(mmap_.get(), header_location, &reader)); + + // TODO(emkornfield): why does this require a smart pointer for schema? + RETURN_NOT_OK(reader->GetRowBatch(batch.schema(), batch_result)); + return Status::OK(); } protected: @@ -56,9 +81,24 @@ class TestWriteRowBatch : public ::testing::Test, public MemoryMapFixture { std::shared_ptr mmap_; }; -const auto INT32 = std::make_shared(); +TEST_P(TestWriteRowBatch, RoundTrip) { + std::shared_ptr batch; + ASSERT_OK((*GetParam())(&batch)); + std::shared_ptr batch_result; + ASSERT_OK(RoundTripHelper(*batch, 1 << 16, &batch_result)); + + // do checks + ASSERT_TRUE(batch->schema()->Equals(batch_result->schema())); + ASSERT_EQ(batch->num_columns(), batch_result->num_columns()) + << batch->schema()->ToString() << " result: " << batch_result->schema()->ToString(); + EXPECT_EQ(batch->num_rows(), batch_result->num_rows()); + for (int i = 0; i < batch->num_columns(); ++i) { + EXPECT_TRUE(batch->column(i)->Equals(batch_result->column(i))) + << i << batch->column_name(i); + } +} -TEST_F(TestWriteRowBatch, IntegerRoundTrip) { +Status MakeIntRowBatch(std::shared_ptr* out) { const int length = 1000; // Make the schema @@ -67,42 +107,40 @@ TEST_F(TestWriteRowBatch, IntegerRoundTrip) { std::shared_ptr schema(new Schema({f0, f1})); // Example data + std::shared_ptr a0, a1; + MemoryPool* pool = default_memory_pool(); + RETURN_NOT_OK(MakeRandomInt32Array(length, false, pool, &a0)); + RETURN_NOT_OK(MakeRandomInt32Array(length, true, pool, &a1)); + out->reset(new RowBatch(schema, length, {a0, a1})); + return Status::OK(); +} - auto data = std::make_shared(pool_); - ASSERT_OK(data->Resize(length * sizeof(int32_t))); - test::rand_uniform_int(length, 0, 0, std::numeric_limits::max(), - reinterpret_cast(data->mutable_data())); - - auto null_bitmap = std::make_shared(pool_); - int null_bytes = util::bytes_for_bits(length); - ASSERT_OK(null_bitmap->Resize(null_bytes)); - test::random_bytes(null_bytes, 0, null_bitmap->mutable_data()); - - auto a0 = std::make_shared(length, data); - auto a1 = std::make_shared( - length, data, test::bitmap_popcount(null_bitmap->data(), length), null_bitmap); - - RowBatch batch(schema, length, {a0, a1}); - - // TODO(wesm): computing memory requirements for a row batch - // 64k is plenty of space - InitMemoryMap(1 << 16); - - int64_t header_location; - ASSERT_OK(WriteRowBatch(mmap_.get(), &batch, 0, &header_location)); - - std::shared_ptr result; - ASSERT_OK(RowBatchReader::Open(mmap_.get(), header_location, &result)); +Status MakeListRowBatch(std::shared_ptr* out) { + // Make the schema + auto f0 = std::make_shared("f0", LIST_INT32); + auto f1 = std::make_shared("f1", LIST_LIST_INT32); + auto f2 = std::make_shared("f2", INT32); + std::shared_ptr schema(new Schema({f0, f1, f2})); - std::shared_ptr batch_result; - ASSERT_OK(result->GetRowBatch(schema, &batch_result)); - EXPECT_EQ(batch.num_rows(), batch_result->num_rows()); + // Example data - for (int i = 0; i < batch.num_columns(); ++i) { - EXPECT_TRUE(batch.column(i)->Equals(batch_result->column(i))) << i - << batch.column_name(i); - } + MemoryPool* pool = default_memory_pool(); + const int length = 200; + std::shared_ptr leaf_values, list_array, list_list_array, flat_array; + RETURN_NOT_OK(MakeRandomInt32Array(1000, true, pool, &leaf_values)); + RETURN_NOT_OK(MakeRandomListArray(leaf_values, length, pool, &list_array)); + RETURN_NOT_OK(MakeRandomListArray(list_array, length, pool, &list_list_array)); + RETURN_NOT_OK(MakeRandomInt32Array(length, true, pool, &flat_array)); + out->reset(new RowBatch(schema, length, {list_array, list_list_array, flat_array})); + return Status::OK(); } +INSTANTIATE_TEST_CASE_P(RoundTripTests, TestWriteRowBatch, + ::testing::Values(&MakeIntRowBatch, &MakeListRowBatch)); + +// TODO(emkornfield) More tests +// Test primitive and lists with zero elements +// Tests lists and primitives with no nulls +// String type } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 0a30929b97c..f70ebb19697 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -60,11 +60,10 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, case Type::LIST: { std::shared_ptr value_builder; - const std::shared_ptr& value_type = static_cast(type.get())->value_type(); RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder)); - out->reset(new ListBuilder(pool, type, value_builder)); + out->reset(new ListBuilder(pool, value_builder)); return Status::OK(); } default: @@ -75,11 +74,11 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, #define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \ case Type::ENUM: \ out->reset(new ArrayType(type, length, data, null_count, null_bitmap)); \ - return Status::OK(); + break; -Status MakePrimitiveArray(const std::shared_ptr& type, int32_t length, +Status MakePrimitiveArray(const TypePtr& type, int32_t length, const std::shared_ptr& data, int32_t null_count, - const std::shared_ptr& null_bitmap, std::shared_ptr* out) { + const std::shared_ptr& null_bitmap, ArrayPtr* out) { switch (type->type) { MAKE_PRIMITIVE_ARRAY_CASE(BOOL, BooleanArray); MAKE_PRIMITIVE_ARRAY_CASE(UINT8, UInt8Array); @@ -90,11 +89,43 @@ Status MakePrimitiveArray(const std::shared_ptr& type, int32_t length, MAKE_PRIMITIVE_ARRAY_CASE(INT32, Int32Array); MAKE_PRIMITIVE_ARRAY_CASE(UINT64, UInt64Array); MAKE_PRIMITIVE_ARRAY_CASE(INT64, Int64Array); + MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array); + MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, Int64Array); MAKE_PRIMITIVE_ARRAY_CASE(FLOAT, FloatArray); MAKE_PRIMITIVE_ARRAY_CASE(DOUBLE, DoubleArray); + MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP_DOUBLE, DoubleArray); + default: + return Status::NotImplemented(type->ToString()); + } +#ifdef NDEBUG + return Status::OK(); +#else + return (*out)->Validate(); +#endif +} + +Status MakeListArray(const TypePtr& type, int32_t length, + const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count, + const std::shared_ptr& null_bitmap, ArrayPtr* out) { + switch (type->type) { + case Type::BINARY: + case Type::LIST: + out->reset(new ListArray(type, length, offsets, values, null_count, null_bitmap)); + break; + case Type::CHAR: + case Type::DECIMAL_TEXT: + case Type::STRING: + case Type::VARCHAR: + out->reset(new StringArray(type, length, offsets, values, null_count, null_bitmap)); + break; default: return Status::NotImplemented(type->ToString()); } +#ifdef NDEBUG + return Status::OK(); +#else + return (*out)->Validate(); +#endif } } // namespace arrow diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h index 27fb7bd2149..43c0018c67e 100644 --- a/cpp/src/arrow/types/construct.h +++ b/cpp/src/arrow/types/construct.h @@ -33,10 +33,19 @@ class Status; Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::shared_ptr* out); +// Create new arrays for logical types that are backed by primitive arrays. Status MakePrimitiveArray(const std::shared_ptr& type, int32_t length, const std::shared_ptr& data, int32_t null_count, const std::shared_ptr& null_bitmap, std::shared_ptr* out); +// Create new list arrays for logical types that are backed by ListArrays (e.g. list of +// primitives and strings) +// TODO(emkornfield) split up string vs list? +Status MakeListArray(const std::shared_ptr& type, int32_t length, + const std::shared_ptr& offests, const std::shared_ptr& values, + int32_t null_count, const std::shared_ptr& null_bitmap, + std::shared_ptr* out); + } // namespace arrow #endif // ARROW_BUILDER_H_ From 61b048120bda0b948490b27ef2c428f11ecd936f Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 13 Apr 2016 08:06:59 +0000 Subject: [PATCH 08/25] small fixes to naming/style for c++ and potential bugs --- cpp/src/arrow/array.h | 2 +- cpp/src/arrow/builder.h | 2 +- cpp/src/arrow/test-util.h | 7 +++++-- cpp/src/arrow/type.h | 2 +- cpp/src/arrow/types/primitive-test.cc | 6 +++--- cpp/src/arrow/types/primitive.cc | 1 + cpp/src/arrow/types/string.h | 1 - 7 files changed, 12 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index af62c46edf2..f98c4c28310 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -40,7 +40,7 @@ class Array { Array(const std::shared_ptr& type, int32_t length, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); - virtual ~Array() {} + virtual ~Array() = default; // Determine if a slot is null. For inner loops. Does *not* boundscheck bool IsNull(int i) const { diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index e2fb11ffc84..b23c52c261f 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -49,7 +49,7 @@ class ArrayBuilder { length_(0), capacity_(0) {} - virtual ~ArrayBuilder() {} + virtual ~ArrayBuilder() = default; // For nested types. Since the objects are owned by this class instance, we // skip shared pointers and just return a raw pointer diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 538d9b233d9..ea1c61a263f 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -103,10 +103,12 @@ std::shared_ptr to_buffer(const std::vector& values) { reinterpret_cast(values.data()), values.size() * sizeof(T)); } -void random_null_bitmap(int64_t n, double pct_null, uint8_t* null_bitmap) { +// Sets approximately pct_null of the first n bytes in null_bytes to zero +// and the rest to non-zero (true) values. +void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) { Random rng(random_seed()); for (int i = 0; i < n; ++i) { - null_bitmap[i] = rng.NextDoubleFraction() > pct_null; + null_bytes[i] = rng.NextDoubleFraction() > pct_null; } } @@ -121,6 +123,7 @@ static inline void random_bytes(int n, uint32_t seed, uint8_t* out) { template void rand_uniform_int(int n, uint32_t seed, T min_value, T max_value, T* out) { + DCHECK(out); std::mt19937 gen(seed); std::uniform_int_distribution d(min_value, max_value); for (int i = 0; i < n; ++i) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 051ab46b199..fe07f7cf493 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -116,7 +116,7 @@ struct DataType { bool Equals(const DataType* other) { // Call with a pointer so more friendly to subclasses - return this == other || (this->type == other->type); + return this == other || ((other != nullptr) && (this->type == other->type)); } bool Equals(const std::shared_ptr& other) { return Equals(other.get()); } diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 6bd9e73eb46..2b4c0879a28 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -102,7 +102,7 @@ class TestPrimitiveBuilder : public TestBuilder { Attrs::draw(N, &draws_); valid_bytes_.resize(N); - test::random_null_bitmap(N, pct_null, valid_bytes_.data()); + test::random_null_bytes(N, pct_null, valid_bytes_.data()); } void Check(const std::shared_ptr& builder, bool nullable) { @@ -193,8 +193,8 @@ void TestPrimitiveBuilder::RandomData(int N, double pct_null) { draws_.resize(N); valid_bytes_.resize(N); - test::random_null_bitmap(N, 0.5, draws_.data()); - test::random_null_bitmap(N, pct_null, valid_bytes_.data()); + test::random_null_bytes(N, 0.5, draws_.data()); + test::random_null_bytes(N, pct_null, valid_bytes_.data()); } template <> diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index ef00cd37f36..e4f2a883418 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -63,6 +63,7 @@ bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { bool PrimitiveArray::Equals(const std::shared_ptr& arr) const { if (this == arr.get()) { return true; } + if (!arr) { return false; } if (this->type_enum() != arr->type_enum()) { return false; } return EqualsExact(*static_cast(arr.get())); } diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index f5874db45db..d2d3c5b6b5a 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -110,7 +110,6 @@ class StringBuilder : public ListBuilder { } protected: - std::shared_ptr list_builder_; UInt8Builder* byte_builder_; static TypePtr value_type_; From a2e1e52170169bed3ca8c2125f570005fc52ec76 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 13 Apr 2016 08:08:40 +0000 Subject: [PATCH 09/25] native popcount --- cpp/src/arrow/test-util.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index ea1c61a263f..92c6d9c04b6 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -132,11 +132,25 @@ void rand_uniform_int(int n, uint32_t seed, T min_value, T max_value, T* out) { } static inline int bitmap_popcount(const uint8_t* data, int length) { + // book keeping + constexpr int pop_len = sizeof(uint64_t); + const uint64_t* i64_data = reinterpret_cast(data); + const int fast_counts = length / pop_len; + const uint64_t* end = i64_data + fast_counts; + int count = 0; - for (int i = 0; i < length; ++i) { - // TODO(wesm): accelerate this + // popcount as much as possible with the widest possible count + for (auto iter = i64_data; iter < end; ++iter) { + count += __builtin_popcountll(*iter); + } + + // Account for left over bytes (in theory we could fall back to smaller + // versions of popcount but the code complexity is likely not worth it) + const int loop_tail_index = fast_counts * pop_len; + for (int i = loop_tail_index; i < length; ++i) { if (util::get_bit(data, i)) { ++count; } } + return count; } From 39c57edeebbf9679ac0a83f759d5cae2d4a57df5 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 13 Apr 2016 08:09:21 +0000 Subject: [PATCH 10/25] add potentially useful methods for generative arrays to ipc test-common --- cpp/src/arrow/ipc/test-common.h | 59 +++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 65c837dc8b1..00688ad1f05 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -23,6 +23,13 @@ #include #include +#include "arrow/array.h" +#include "arrow/test-util.h" +#include "arrow/types/primitive.h" +#include "arrow/types/list.h" +#include "arrow/util/buffer.h" +#include "arrow/util/memory-pool.h" + namespace arrow { namespace ipc { @@ -45,6 +52,58 @@ class MemoryMapFixture { std::vector tmp_files_; }; +Status MakeRandomInt32Array( + int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr* array) { + std::shared_ptr data; + test::MakeRandomInt32PoolBuffer(length, pool, &data); + const auto INT32 = std::make_shared(); + Int32Builder builder(pool, INT32); + if (include_nulls) { + std::shared_ptr valid_bytes; + test::MakeRandomBytePoolBuffer(length, pool, &valid_bytes); + RETURN_NOT_OK(builder.Append( + reinterpret_cast(data->data()), length, valid_bytes->data())); + *array = builder.Finish(); + return Status::OK(); + } + RETURN_NOT_OK(builder.Append(reinterpret_cast(data->data()), length)); + *array = builder.Finish(); + return Status::OK(); +} + +Status MakeRandomListArray(const std::shared_ptr& child_array, int num_lists, + MemoryPool* pool, std::shared_ptr* array) { + // Create the null list values + std::vector valid_lists(num_lists); + const double null_percent = 0.1; + test::random_null_bytes(num_lists, null_percent, valid_lists.data()); + + // Create list offsets + const int max_list_size = 10; + + std::vector list_sizes(num_lists, 0); + std::vector offsets( + num_lists + 1, 0); // +1 so we can shift for nulls. See partial sum below. + const int seed = child_array->length(); + test::rand_uniform_int(num_lists, seed, 0, max_list_size, list_sizes.data()); + // make sure sizes are consistent with null + std::transform(list_sizes.begin(), list_sizes.end(), valid_lists.begin(), + list_sizes.begin(), + [](int32_t size, int32_t valid) { return valid == 0 ? 0 : size; }); + std::partial_sum(list_sizes.begin(), list_sizes.end(), ++offsets.begin()); + + // Force invariants + const int child_length = child_array->length(); + offsets[0] = 0; + std::replace_if(offsets.begin(), offsets.end(), + [child_length](int32_t offset) { return offset > child_length; }, child_length); + + ListBuilder builder(pool, child_array); + RETURN_NOT_OK(builder.Append(offsets.data(), num_lists, valid_lists.data())); + *array = builder.Finish(); + return (*array)->Validate(); +} + } // namespace ipc } // namespace arrow From aa0602cd52d2b351e007a0e3ff7b6fb52f2c74fb Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 13 Apr 2016 08:10:34 +0000 Subject: [PATCH 11/25] add potentially useful pool factories to test utils --- cpp/src/arrow/test-util.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 92c6d9c04b6..9de67541e1d 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -32,6 +32,7 @@ #include "arrow/table.h" #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" +#include "arrow/util/logging.h" #include "arrow/util/memory-pool.h" #include "arrow/util/random.h" #include "arrow/util/status.h" @@ -170,6 +171,26 @@ std::shared_ptr bytes_to_null_buffer(const std::vector& bytes) return out; } +Status MakeRandomInt32PoolBuffer(int32_t length, MemoryPool* pool, + std::shared_ptr* pool_buffer, uint32_t seed = 0) { + DCHECK(pool); + auto data = std::make_shared(pool); + RETURN_NOT_OK(data->Resize(length * sizeof(int32_t))); + test::rand_uniform_int(length, seed, 0, std::numeric_limits::max(), + reinterpret_cast(data->mutable_data())); + *pool_buffer = data; + return Status::OK(); +} + +Status MakeRandomBytePoolBuffer(int32_t length, MemoryPool* pool, + std::shared_ptr* pool_buffer, uint32_t seed = 0) { + auto bytes = std::make_shared(pool); + RETURN_NOT_OK(bytes->Resize(length)); + test::random_bytes(length, seed, bytes->mutable_data()); + *pool_buffer = bytes; + return Status::OK(); +} + } // namespace test } // namespace arrow From 8e464b59aae36f99d67b9d1832ccbb1849421e5c Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 13 Apr 2016 08:13:48 +0000 Subject: [PATCH 12/25] Fixes per tidy and lint --- cpp/src/arrow/ipc/ipc-adapter-test.cc | 6 +----- cpp/src/arrow/ipc/test-common.h | 1 + cpp/src/arrow/test-util.h | 1 + 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index 3d678da2105..bf66132ee5b 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -15,14 +15,10 @@ // specific language governing permissions and limitations // under the License. -#include #include #include #include -#include #include -#include -#include #include #include @@ -83,7 +79,7 @@ class TestWriteRowBatch : public ::testing::TestWithParam, TEST_P(TestWriteRowBatch, RoundTrip) { std::shared_ptr batch; - ASSERT_OK((*GetParam())(&batch)); + ASSERT_OK((*GetParam())(&batch)); // NOLINT clang-tidy complains about gtest std::shared_ptr batch_result; ASSERT_OK(RoundTripHelper(*batch, 1 << 16, &batch_result)); diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 00688ad1f05..7c23489b7b4 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -18,6 +18,7 @@ #ifndef ARROW_IPC_TEST_COMMON_H #define ARROW_IPC_TEST_COMMON_H +#include #include #include #include diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 9de67541e1d..9d09a6973ad 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -19,6 +19,7 @@ #define ARROW_TEST_UTIL_H_ #include +#include #include #include #include From 53d37bc026e9b8ac503f41111efe20c36470df6e Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 13 Apr 2016 09:10:47 +0000 Subject: [PATCH 13/25] filter out ipc-adapter-test from tidy --- cpp/CMakeLists.txt | 4 ++-- cpp/src/arrow/ipc/ipc-adapter-test.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f803c0fb3e4..c98c7a71471 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -562,10 +562,10 @@ endif() if (${CLANG_TIDY_FOUND}) # runs clang-tidy and attempts to fix any warning automatically add_custom_target(clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json 1 - `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc | sed -e '/_generated/g'`) + `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc |grep -v ipc-adapter-test.cc | sed -e '/_generated/g'`) # runs clang-tidy and exits with a non-zero exit code if any errors are found. add_custom_target(check-clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json - 0 `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc | sed -e '/_generated/g'`) + 0 `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc |grep -v ipc-adapter-test.cc | sed -e '/_generated/g'`) endif() diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index bf66132ee5b..8f637d6d031 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -79,7 +79,7 @@ class TestWriteRowBatch : public ::testing::TestWithParam, TEST_P(TestWriteRowBatch, RoundTrip) { std::shared_ptr batch; - ASSERT_OK((*GetParam())(&batch)); // NOLINT clang-tidy complains about gtest + ASSERT_OK((*GetParam())(&batch)); // NOLINT clang-tidy gtest issue std::shared_ptr batch_result; ASSERT_OK(RoundTripHelper(*batch, 1 << 16, &batch_result)); From 8ab5315cd10de01c0b947d210ab3abcd01d2cd9e Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 17 Apr 2016 19:14:36 +0000 Subject: [PATCH 14/25] make clang tidy ignore a little bit less hacky --- cpp/CMakeLists.txt | 4 ++-- cpp/README.md | 9 ++++++++- cpp/src/.clang-tidy-ignore | 1 + 3 files changed, 11 insertions(+), 3 deletions(-) create mode 100644 cpp/src/.clang-tidy-ignore diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c98c7a71471..b38f91e5d68 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -562,10 +562,10 @@ endif() if (${CLANG_TIDY_FOUND}) # runs clang-tidy and attempts to fix any warning automatically add_custom_target(clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json 1 - `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc |grep -v ipc-adapter-test.cc | sed -e '/_generated/g'`) + `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc | sed -e '/_generated/g'`) # runs clang-tidy and exits with a non-zero exit code if any errors are found. add_custom_target(check-clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json - 0 `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc |grep -v ipc-adapter-test.cc | sed -e '/_generated/g'`) + 0 `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc |grep -v -F -f ${CMAKE_CURRENT_SOURCE_DIR}/src/.clang-tidy-ignore | sed -e '/_generated/g'`) endif() diff --git a/cpp/README.md b/cpp/README.md index 3f5da21b7d4..c8cd86fedc6 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -76,4 +76,11 @@ build failures by running the following checks before submitting your pull reque Note that the clang-tidy target may take a while to run. You might consider running clang-tidy separately on the files you have added/changed before -invoking the make target to reduce iteration time. +invoking the make target to reduce iteration time. Also, it might generate warnings +that aren't valid. To avoid these you can use add a line comment `// NOLINT`. If +NOLINT doesn't suppress the warnings, you add the file in question to +the .clang-tidy-ignore file. This will allow `make check-clang-tidy` to pass in +travis-CI (but still surface the potential warnings in `make clang-tidy`). Ideally, +both of these options would be used rarely. Current known uses-cases whent hey are required: + +* Parameterized tests in google test. diff --git a/cpp/src/.clang-tidy-ignore b/cpp/src/.clang-tidy-ignore new file mode 100644 index 00000000000..a128c388896 --- /dev/null +++ b/cpp/src/.clang-tidy-ignore @@ -0,0 +1 @@ +ipc-adapter-test.cc From e71810bc9248c2e65f5ae325c7b70ea37f7d2dff Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 17 Apr 2016 20:03:42 +0000 Subject: [PATCH 15/25] make Resize and Init virtual on builder --- cpp/src/arrow/builder.h | 21 +++++++++++++-------- cpp/src/arrow/types/list.cc | 2 +- cpp/src/arrow/types/list.h | 16 ++++------------ cpp/src/arrow/types/primitive.cc | 11 +---------- cpp/src/arrow/types/primitive.h | 10 +++------- 5 files changed, 22 insertions(+), 38 deletions(-) diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index b23c52c261f..70e6b460842 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -69,14 +69,19 @@ class ArrayBuilder { // Set the next length bits to not null (i.e. valid). Status SetNotNull(int32_t length); - // Allocates requires memory at this level, but children need to be - // initialized independently - Status Init(int32_t capacity); - - // Resizes the null_bitmap array - Status Resize(int32_t new_bits); - - Status Reserve(int32_t extra_bits); + // Allocates initial capacity requirements for the builder. In most + // cases subclasses should override and call there parent classes + // method as well. + virtual Status Init(int32_t capacity); + + // Resizes the null_bitmap array. In most + // cases subclasses should override and call there parent classes + // method as well. + virtual Status Resize(int32_t new_bits); + + // Ensures there is enough space for adding the number of elements by checking + // capacity and calling Resize if necessary. + Status Reserve(int32_t elements); // For cases where raw data was memcpy'd into the internal buffers, allows us // to advance the length of the builder. It is your responsibility to use diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 931420b1ea5..621612f96a5 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -50,7 +50,7 @@ Status ListArray::Validate() const { } if (offset_buf_->size() / sizeof(int32_t) < length_) { std::stringstream ss; - ss << "offset buffer size: " << offset_buf_->size() + ss << "offset buffer size (bytes): " << offset_buf_->size() << " isn't large enough for length: " << length_; return Status::Invalid(ss.str()); } diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 2c6c6666afa..6fcf78c9fa4 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -112,26 +112,18 @@ class ListBuilder : public ArrayBuilder { offset_builder_(pool), values_(values) {} - Status Init(int32_t elements) { + Status Init(int32_t elements) override { RETURN_NOT_OK(ArrayBuilder::Init(elements)); // one more then requested for offsets - return offset_builder_.Resize(elements + 1); + return offset_builder_.Resize((elements + 1) * sizeof(int32_t)); } - Status Resize(int32_t capacity) { - // +1 because we Need space for the end offset + Status Resize(int32_t capacity) override { + // one more then requested for offsets RETURN_NOT_OK(offset_builder_.Resize((capacity + 1) * sizeof(int32_t))); return ArrayBuilder::Resize(capacity); } - Status Reserve(int32_t elements) { - if (length_ + elements > capacity_) { - int32_t new_capacity = util::next_power2(length_ + elements); - return Resize(new_capacity); - } - return Status::OK(); - } - // Vector append // // If passed, valid_bytes is of equal length to values, and any zero byte diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index e4f2a883418..c1880ab92c4 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -102,19 +102,10 @@ Status PrimitiveBuilder::Resize(int32_t capacity) { return Status::OK(); } -template -Status PrimitiveBuilder::Reserve(int32_t elements) { - if (length_ + elements > capacity_) { - int32_t new_capacity = util::next_power2(length_ + elements); - return Resize(new_capacity); - } - return Status::OK(); -} - template Status PrimitiveBuilder::Append( const value_type* values, int32_t length, const uint8_t* valid_bytes) { - RETURN_NOT_OK(PrimitiveBuilder::Reserve(length)); + RETURN_NOT_OK(Reserve(length)); if (length > 0) { memcpy(raw_data_ + length_, values, type_traits::bytes_required(length)); diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index 5621a311eea..f9faafab3fd 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -114,17 +114,13 @@ class PrimitiveBuilder : public ArrayBuilder { Status Append( const value_type* values, int32_t length, const uint8_t* valid_bytes = nullptr); - // Ensure that builder can accommodate an additional number of - // elements. Resizes if the current capacity is not sufficient - Status Reserve(int32_t elements); - std::shared_ptr Finish() override; - Status Init(int32_t capacity); + Status Init(int32_t capacity) override; // Increase the capacity of the builder to accommodate at least the indicated // number of elements - Status Resize(int32_t capacity); + Status Resize(int32_t capacity) override; protected: std::shared_ptr data_; @@ -143,7 +139,7 @@ class NumericBuilder : public PrimitiveBuilder { // Scalar append. void Append(value_type val) { - PrimitiveBuilder::Reserve(1); + ArrayBuilder::Reserve(1); UnsafeAppend(val); } From 2e6c47704a8de4c7f8a9887230ea7fccf4e41e9d Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 17 Apr 2016 20:19:45 +0000 Subject: [PATCH 16/25] add missing RETURN_NOT_OK --- cpp/src/arrow/builder.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 93b93eabd8c..36469ba29cc 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -38,7 +38,7 @@ Status ArrayBuilder::AppendToBitmap(bool is_null) { } Status ArrayBuilder::AppendToBitmap(const uint8_t* valid_bytes, int32_t length) { - Reserve(length); + RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(valid_bytes, length); return Status::OK(); From 3b219a1aaefe19c823de2d7a451f7211c8aaa023 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 18 Apr 2016 02:20:04 +0000 Subject: [PATCH 17/25] Make append is_null parameter is_valid for api consistency --- cpp/src/arrow/builder.cc | 14 +++++++------- cpp/src/arrow/builder.h | 4 ++-- cpp/src/arrow/types/list-test.cc | 13 ++++++------- cpp/src/arrow/types/list.h | 6 +++--- cpp/src/arrow/types/primitive.h | 2 +- 5 files changed, 19 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 36469ba29cc..eded671d611 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -25,7 +25,7 @@ namespace arrow { -Status ArrayBuilder::AppendToBitmap(bool is_null) { +Status ArrayBuilder::AppendToBitmap(bool is_valid) { if (length_ == capacity_) { // If the capacity was not already a multiple of 2, do so here // TODO(emkornfield) doubling isn't great default allocation practice @@ -33,7 +33,7 @@ Status ArrayBuilder::AppendToBitmap(bool is_null) { // fo discussion RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); } - UnsafeAppendToBitmap(is_null); + UnsafeAppendToBitmap(is_valid); return Status::OK(); } @@ -89,11 +89,11 @@ Status ArrayBuilder::SetNotNull(int32_t length) { return Status::OK(); } -void ArrayBuilder::UnsafeAppendToBitmap(bool is_null) { - if (is_null) { - ++null_count_; - } else { +void ArrayBuilder::UnsafeAppendToBitmap(bool is_valid) { + if (is_valid) { util::set_bit(null_bitmap_data_, length_); + } else { + ++null_count_; } ++length_; } @@ -105,7 +105,7 @@ void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int32_t leng } for (int32_t i = 0; i < length; ++i) { // TODO(emkornfield) Optimize for large values of length? - AppendToBitmap(valid_bytes[i] == 0); + AppendToBitmap(valid_bytes[i] > 0); } } diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 70e6b460842..7d3f4398d73 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -62,7 +62,7 @@ class ArrayBuilder { int32_t capacity() const { return capacity_; } // Append to null bitmap - Status AppendToBitmap(bool is_null); + Status AppendToBitmap(bool is_valid); // Vector append. Treat each zero byte as a null. If valid_bytes is null // assume all of length bits are valid. Status AppendToBitmap(const uint8_t* valid_bytes, int32_t length); @@ -118,7 +118,7 @@ class ArrayBuilder { // // Append to null bitmap. - void UnsafeAppendToBitmap(bool is_null); + void UnsafeAppendToBitmap(bool is_valid); // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null // assume all of length bits are valid. void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int32_t length); diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index 050cc320c34..17ff1efd4c9 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -107,7 +107,7 @@ TEST_F(TestListBuilder, TestAppendNull) { } void ValidateBasicListArray(const ListArray* result, const vector& values, - const vector& is_null) { + const vector& is_valid) { ASSERT_OK(result->Validate()); ASSERT_EQ(1, result->null_count()); ASSERT_EQ(0, result->values()->null_count()); @@ -119,7 +119,7 @@ void ValidateBasicListArray(const ListArray* result, const vector& valu } for (int i = 0; i < result->length(); ++i) { - ASSERT_EQ(static_cast(is_null[i]), result->IsNull(i)); + ASSERT_EQ(!static_cast(is_valid[i]), result->IsNull(i)); } ASSERT_EQ(7, result->values()->length()); @@ -133,7 +133,7 @@ void ValidateBasicListArray(const ListArray* result, const vector& valu TEST_F(TestListBuilder, TestBasics) { vector values = {0, 1, 2, 3, 4, 5, 6}; vector lengths = {3, 0, 4}; - vector is_null = {0, 1, 0}; + vector is_valid = {1, 0, 1}; Int32Builder* vb = static_cast(builder_->value_builder().get()); @@ -142,20 +142,19 @@ TEST_F(TestListBuilder, TestBasics) { int pos = 0; for (size_t i = 0; i < lengths.size(); ++i) { - ASSERT_OK(builder_->Append(is_null[i] > 0)); + ASSERT_OK(builder_->Append(is_valid[i] > 0)); for (int j = 0; j < lengths[i]; ++j) { vb->Append(values[pos++]); } } Done(); - ValidateBasicListArray(result_.get(), values, is_null); + ValidateBasicListArray(result_.get(), values, is_valid); } TEST_F(TestListBuilder, BulkAppend) { vector values = {0, 1, 2, 3, 4, 5, 6}; vector lengths = {3, 0, 4}; - vector is_null = {0, 1, 0}; vector is_valid = {1, 0, 1}; vector offsets = {0, 3, 3}; @@ -167,7 +166,7 @@ TEST_F(TestListBuilder, BulkAppend) { vb->Append(value); } Done(); - ValidateBasicListArray(result_.get(), values, is_null); + ValidateBasicListArray(result_.get(), values, is_valid); } TEST_F(TestListBuilder, BulkAppendInvalid) { diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 6fcf78c9fa4..76b310773bc 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -162,14 +162,14 @@ class ListBuilder : public ArrayBuilder { // // This function should be called before beginning to append elements to the // value builder - Status Append(bool is_null = false) { + Status Append(bool is_valid = true) { RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(is_null); + UnsafeAppendToBitmap(is_valid); RETURN_NOT_OK(offset_builder_.Append(value_builder_->length())); return Status::OK(); } - Status AppendNull() { return Append(true); } + Status AppendNull() { return Append(false); } const std::shared_ptr& value_builder() const { DCHECK(!values_) << "Using value builder is pointless when values_ is set"; diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index f9faafab3fd..6f6b2fed5a3 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -101,7 +101,7 @@ class PrimitiveBuilder : public ArrayBuilder { Status AppendNull() { RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(true); + UnsafeAppendToBitmap(false); return Status::OK(); } From 10e6651c8dc1a4f3d1043542564751bf14bf3c0c Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 18 Apr 2016 04:36:46 +0000 Subject: [PATCH 18/25] add in maximum recursion depth, surfaced possible recursion issue with flatbuffers --- cpp/src/arrow/ipc/adapter.cc | 51 +++++++++------ cpp/src/arrow/ipc/adapter.h | 11 +++- cpp/src/arrow/ipc/ipc-adapter-test.cc | 88 +++++++++++++++++++++++--- cpp/src/arrow/ipc/memory.cc | 2 +- cpp/src/arrow/ipc/metadata-internal.cc | 2 +- cpp/src/arrow/ipc/metadata-internal.h | 2 +- cpp/src/arrow/ipc/metadata.cc | 2 +- cpp/src/arrow/ipc/test-common.h | 8 ++- cpp/src/arrow/parquet/schema.cc | 2 +- cpp/src/arrow/schema.cc | 2 +- cpp/src/arrow/test-util.h | 2 +- cpp/src/arrow/types/construct.cc | 2 +- cpp/src/arrow/types/list-test.cc | 2 +- cpp/src/arrow/types/primitive-test.cc | 3 +- cpp/src/arrow/util/logging.h | 6 +- cpp/src/arrow/util/memory-pool.cc | 2 +- 16 files changed, 141 insertions(+), 46 deletions(-) diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index d61ff0c31fc..dd755411ff8 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -23,16 +23,16 @@ #include #include "arrow/array.h" -#include "arrow/ipc/memory.h" #include "arrow/ipc/Message_generated.h" -#include "arrow/ipc/metadata.h" +#include "arrow/ipc/memory.h" #include "arrow/ipc/metadata-internal.h" +#include "arrow/ipc/metadata.h" #include "arrow/schema.h" #include "arrow/table.h" #include "arrow/type.h" #include "arrow/types/construct.h" -#include "arrow/types/primitive.h" #include "arrow/types/list.h" +#include "arrow/types/primitive.h" #include "arrow/util/buffer.h" #include "arrow/util/logging.h" #include "arrow/util/status.h" @@ -90,7 +90,8 @@ static bool IsListType(const DataType* type) { // Row batch write path Status VisitArray(const Array* arr, std::vector* field_nodes, - std::vector>* buffers) { + std::vector>* buffers, int max_recursion_depth) { + if (max_recursion_depth <= 0) { return Status::Invalid("Max recursion depth reached"); } DCHECK(arr); DCHECK(field_nodes); // push back all common elements @@ -109,8 +110,8 @@ Status VisitArray(const Array* arr, std::vector* field_nodes } else if (IsListType(arr_type)) { const ListArray* list_arr = static_cast(arr); buffers->push_back(list_arr->offset_buffer()); - // TODO(emkornfield) limit recursion depth - RETURN_NOT_OK(VisitArray(list_arr->values().get(), field_nodes, buffers)); + RETURN_NOT_OK(VisitArray( + list_arr->values().get(), field_nodes, buffers, max_recursion_depth - 1)); } else if (arr->type_enum() == Type::STRUCT) { // TODO(wesm) return Status::NotImplemented("Struct type"); @@ -120,13 +121,14 @@ Status VisitArray(const Array* arr, std::vector* field_nodes class RowBatchWriter { public: - explicit RowBatchWriter(const RowBatch* batch) : batch_(batch) {} + RowBatchWriter(const RowBatch* batch, int max_recursion_depth) + : batch_(batch), max_recursion_depth_(max_recursion_depth) {} Status AssemblePayload() { // Perform depth-first traversal of the row-batch for (int i = 0; i < batch_->num_columns(); ++i) { const Array* arr = batch_->column(i).get(); - RETURN_NOT_OK(VisitArray(arr, &field_nodes_, &buffers_)); + RETURN_NOT_OK(VisitArray(arr, &field_nodes_, &buffers_, max_recursion_depth_)); } return Status::OK(); } @@ -197,11 +199,13 @@ class RowBatchWriter { std::vector field_nodes_; std::vector buffer_meta_; std::vector> buffers_; + int max_recursion_depth_; }; -Status WriteRowBatch( - MemorySource* dst, const RowBatch* batch, int64_t position, int64_t* header_offset) { - RowBatchWriter serializer(batch); +Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, int64_t position, + int64_t* header_offset, int max_recursion_depth) { + DCHECK(max_recursion_depth > 0); + RowBatchWriter serializer(batch, max_recursion_depth); RETURN_NOT_OK(serializer.AssemblePayload()); return serializer.Write(dst, position, header_offset); } @@ -212,8 +216,9 @@ static constexpr int64_t INIT_METADATA_SIZE = 4096; class RowBatchReader::Impl { public: - Impl(MemorySource* source, const std::shared_ptr& metadata) - : source_(source), metadata_(metadata) { + Impl(MemorySource* source, const std::shared_ptr& metadata, + int max_recursion_depth) + : source_(source), metadata_(metadata), max_recursion_depth_(max_recursion_depth) { num_buffers_ = metadata->num_buffers(); num_flattened_fields_ = metadata->num_fields(); } @@ -229,7 +234,7 @@ class RowBatchReader::Impl { buffer_index_ = 0; for (int i = 0; i < schema->num_fields(); ++i) { const Field* field = schema->field(i).get(); - RETURN_NOT_OK(NextArray(field, &arrays[i])); + RETURN_NOT_OK(NextArray(field, max_recursion_depth_, &arrays[i])); } *out = std::make_shared(schema, metadata_->length(), arrays); @@ -239,8 +244,12 @@ class RowBatchReader::Impl { private: // Traverse the flattened record batch metadata and reassemble the // corresponding array containers - Status NextArray(const Field* field, std::shared_ptr* out) { + Status NextArray( + const Field* field, int max_recursion_depth, std::shared_ptr* out) { const TypePtr& type = field->type; + if (max_recursion_depth <= 0) { + return Status::Invalid("Max recursion depth reached"); + } // pop off a field if (field_index_ >= num_flattened_fields_) { @@ -286,8 +295,8 @@ class RowBatchReader::Impl { return Status::Invalid(ss.str()); } std::shared_ptr values_array; - // TODO(emkornfield): limit recursion depth? - RETURN_NOT_OK(NextArray(type->child(0).get(), &values_array)); + RETURN_NOT_OK( + NextArray(type->child(0).get(), max_recursion_depth - 1, &values_array)); return MakeListArray(type, field_meta.length, offsets, values_array, field_meta.null_count, null_bitmap, out); } @@ -304,12 +313,18 @@ class RowBatchReader::Impl { int field_index_; int buffer_index_; + int max_recursion_depth_; int num_buffers_; int num_flattened_fields_; }; Status RowBatchReader::Open( MemorySource* source, int64_t position, std::shared_ptr* out) { + return Open(source, position, kMaxIpcRecursionDepth, out); +} + +Status RowBatchReader::Open(MemorySource* source, int64_t position, + int max_recursion_depth, std::shared_ptr* out) { std::shared_ptr metadata; RETURN_NOT_OK(source->ReadAt(position, INIT_METADATA_SIZE, &metadata)); @@ -334,7 +349,7 @@ Status RowBatchReader::Open( std::shared_ptr batch_meta = message->GetRecordBatch(); std::shared_ptr result(new RowBatchReader()); - result->impl_.reset(new Impl(source, batch_meta)); + result->impl_.reset(new Impl(source, batch_meta, max_recursion_depth)); *out = result; return Status::OK(); diff --git a/cpp/src/arrow/ipc/adapter.h b/cpp/src/arrow/ipc/adapter.h index d453fa05f49..4c9a8a9d8ee 100644 --- a/cpp/src/arrow/ipc/adapter.h +++ b/cpp/src/arrow/ipc/adapter.h @@ -38,7 +38,9 @@ class RecordBatchMessage; // ---------------------------------------------------------------------- // Write path - +// We have trouble decoding flatbuffers if the size i > 70, so 64 is a nice round number +// TODO(emkornfield) investigate this more +constexpr int kMaxIpcRecursionDepth = 64; // Write the RowBatch (collection of equal-length Arrow arrays) to the memory // source at the indicated position // @@ -52,8 +54,8 @@ class RecordBatchMessage; // // Finally, the memory offset to the start of the metadata / data header is // returned in an out-variable -Status WriteRowBatch( - MemorySource* dst, const RowBatch* batch, int64_t position, int64_t* header_offset); +Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, int64_t position, + int64_t* header_offset, int max_recursion_depth = kMaxIpcRecursionDepth); // int64_t GetRowBatchMetadata(const RowBatch* batch); @@ -70,6 +72,9 @@ class RowBatchReader { static Status Open( MemorySource* source, int64_t position, std::shared_ptr* out); + static Status Open(MemorySource* source, int64_t position, int max_recursion_depth, + std::shared_ptr* out); + // Reassemble the row batch. A Schema is required to be able to construct the // right array containers Status GetRowBatch( diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index 8f637d6d031..5422cfdd168 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -52,29 +52,23 @@ class TestWriteRowBatch : public ::testing::TestWithParam, void SetUp() { pool_ = default_memory_pool(); } void TearDown() { MemoryMapFixture::TearDown(); } - Status InitMemoryMap(int64_t size) { - std::string path = "test-write-row-batch"; - MemoryMapFixture::CreateFile(path, size); - return MemoryMappedSource::Open(path, MemorySource::READ_WRITE, &mmap_); - } - Status RoundTripHelper(const RowBatch& batch, int memory_map_size, std::shared_ptr* batch_result) { - InitMemoryMap(memory_map_size); + std::string path = "test-write-row-batch"; + MemoryMapFixture::InitMemoryMap(memory_map_size, path, &mmap_); int64_t header_location; RETURN_NOT_OK(WriteRowBatch(mmap_.get(), &batch, 0, &header_location)); std::shared_ptr reader; RETURN_NOT_OK(RowBatchReader::Open(mmap_.get(), header_location, &reader)); - // TODO(emkornfield): why does this require a smart pointer for schema? RETURN_NOT_OK(reader->GetRowBatch(batch.schema(), batch_result)); return Status::OK(); } protected: - MemoryPool* pool_; std::shared_ptr mmap_; + MemoryPool* pool_; }; TEST_P(TestWriteRowBatch, RoundTrip) { @@ -131,8 +125,82 @@ Status MakeListRowBatch(std::shared_ptr* out) { return Status::OK(); } +Status MakeDeeplyNestedList(std::shared_ptr* out) { + const int batch_length = 5; + TypePtr type = INT32; + + MemoryPool* pool = default_memory_pool(); + ArrayPtr array; + RETURN_NOT_OK(MakeRandomInt32Array(1000, true, pool, &array)); + for (int i = 0; i < 63; ++i) { + type = std::static_pointer_cast(std::make_shared(type)); + RETURN_NOT_OK(MakeRandomListArray(array, batch_length, pool, &array)); + } + + auto f0 = std::make_shared("f0", type); + std::shared_ptr schema(new Schema({f0})); + std::vector arrays = {array}; + out->reset(new RowBatch(schema, batch_length, arrays)); + return Status::OK(); +} + INSTANTIATE_TEST_CASE_P(RoundTripTests, TestWriteRowBatch, - ::testing::Values(&MakeIntRowBatch, &MakeListRowBatch)); + ::testing::Values(&MakeIntRowBatch, &MakeListRowBatch, &MakeDeeplyNestedList)); + +class RecursionLimits : public ::testing::Test, public MemoryMapFixture { + public: + void SetUp() { pool_ = default_memory_pool(); } + void TearDown() { MemoryMapFixture::TearDown(); } + + Status WriteToMmap(int recursion_level, bool override_level, + int64_t* header_out = nullptr, std::shared_ptr* schema_out = nullptr) { + const int batch_length = 5; + TypePtr type = INT32; + ArrayPtr array; + RETURN_NOT_OK(MakeRandomInt32Array(1000, true, pool_, &array)); + for (int i = 0; i < recursion_level; ++i) { + type = std::static_pointer_cast(std::make_shared(type)); + RETURN_NOT_OK(MakeRandomListArray(array, batch_length, pool_, &array)); + } + + auto f0 = std::make_shared("f0", type); + std::shared_ptr schema(new Schema({f0})); + if (schema_out != nullptr) { *schema_out = schema; } + std::vector arrays = {array}; + auto batch = std::make_shared(schema, batch_length, arrays); + + std::string path = "test-write-past-max-recursion"; + const int memory_map_size = 1 << 16; + MemoryMapFixture::InitMemoryMap(memory_map_size, path, &mmap_); + int64_t header_location; + int64_t* header_out_param = header_out == nullptr ? &header_location : header_out; + if (override_level) { + return WriteRowBatch( + mmap_.get(), batch.get(), 0, header_out_param, recursion_level + 1); + } else { + return WriteRowBatch(mmap_.get(), batch.get(), 0, header_out_param); + } + } + + protected: + std::shared_ptr mmap_; + MemoryPool* pool_; +}; + +TEST_F(RecursionLimits, WriteLimit) { + ASSERT_RAISES(Invalid, WriteToMmap((1 << 8) + 1, false)); +} + +TEST_F(RecursionLimits, ReadLimit) { + int64_t header_location; + std::shared_ptr schema; + ASSERT_OK(WriteToMmap(64, true, &header_location, &schema)); + + std::shared_ptr reader; + ASSERT_OK(RowBatchReader::Open(mmap_.get(), header_location, &reader)); + std::shared_ptr batch_result; + ASSERT_RAISES(Invalid, reader->GetRowBatch(schema, &batch_result)); +} // TODO(emkornfield) More tests // Test primitive and lists with zero elements diff --git a/cpp/src/arrow/ipc/memory.cc b/cpp/src/arrow/ipc/memory.cc index 2b077e97929..9150538fcd0 100644 --- a/cpp/src/arrow/ipc/memory.cc +++ b/cpp/src/arrow/ipc/memory.cc @@ -17,7 +17,6 @@ #include "arrow/ipc/memory.h" -#include // For memory-mapping #include #include #include @@ -25,6 +24,7 @@ #include #include #include +#include // For memory-mapping #include "arrow/util/buffer.h" #include "arrow/util/status.h" diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index ad5951d17e2..e6e109f6078 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -17,9 +17,9 @@ #include "arrow/ipc/metadata-internal.h" -#include #include #include +#include #include #include #include diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h index 779c5a30a04..bec92b80210 100644 --- a/cpp/src/arrow/ipc/metadata-internal.h +++ b/cpp/src/arrow/ipc/metadata-internal.h @@ -18,8 +18,8 @@ #ifndef ARROW_IPC_METADATA_INTERNAL_H #define ARROW_IPC_METADATA_INTERNAL_H -#include #include +#include #include #include diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc index bcf104f0b8b..ed233b74ae8 100644 --- a/cpp/src/arrow/ipc/metadata.cc +++ b/cpp/src/arrow/ipc/metadata.cc @@ -17,8 +17,8 @@ #include "arrow/ipc/metadata.h" -#include #include +#include #include #include diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 7c23489b7b4..a8f21c7909f 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -26,8 +26,8 @@ #include "arrow/array.h" #include "arrow/test-util.h" -#include "arrow/types/primitive.h" #include "arrow/types/list.h" +#include "arrow/types/primitive.h" #include "arrow/util/buffer.h" #include "arrow/util/memory-pool.h" @@ -49,6 +49,12 @@ class MemoryMapFixture { fclose(file); } + Status InitMemoryMap( + int64_t size, const std::string& path, std::shared_ptr* mmap) { + CreateFile(path, size); + return MemoryMappedSource::Open(path, MemorySource::READ_WRITE, mmap); + } + private: std::vector tmp_files_; }; diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc index 066388b4d0e..560e2837406 100644 --- a/cpp/src/arrow/parquet/schema.cc +++ b/cpp/src/arrow/parquet/schema.cc @@ -21,8 +21,8 @@ #include "parquet/api/schema.h" -#include "arrow/util/status.h" #include "arrow/types/decimal.h" +#include "arrow/util/status.h" using parquet::schema::Node; using parquet::schema::NodePtr; diff --git a/cpp/src/arrow/schema.cc b/cpp/src/arrow/schema.cc index a38acaa94ba..ff3ea1990e5 100644 --- a/cpp/src/arrow/schema.cc +++ b/cpp/src/arrow/schema.cc @@ -18,8 +18,8 @@ #include "arrow/schema.h" #include -#include #include +#include #include #include "arrow/type.h" diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 9d09a6973ad..2f81161d1d6 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -27,10 +27,10 @@ #include "gtest/gtest.h" -#include "arrow/type.h" #include "arrow/column.h" #include "arrow/schema.h" #include "arrow/table.h" +#include "arrow/type.h" #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index f70ebb19697..78036d4bf57 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -20,8 +20,8 @@ #include #include "arrow/type.h" -#include "arrow/types/primitive.h" #include "arrow/types/list.h" +#include "arrow/types/primitive.h" #include "arrow/types/string.h" #include "arrow/util/buffer.h" #include "arrow/util/status.h" diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index 17ff1efd4c9..6a8ad9aa59e 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -#include #include +#include #include #include #include diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 2b4c0879a28..65923b0c908 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -238,7 +238,8 @@ void TestPrimitiveBuilder::Check( } typedef ::testing::Types Primitives; + PInt32, PInt64, PFloat, PDouble> + Primitives; TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives); diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index 527ce423e77..96e36266e33 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -18,8 +18,8 @@ #ifndef ARROW_UTIL_LOGGING_H #define ARROW_UTIL_LOGGING_H -#include #include +#include namespace arrow { @@ -118,9 +118,9 @@ class CerrLog { class FatalLog : public CerrLog { public: FatalLog(int /* severity */) // NOLINT - : CerrLog(ARROW_FATAL) {} + : CerrLog(ARROW_FATAL){} - [[noreturn]] ~FatalLog() { + [[noreturn]] ~FatalLog() { if (has_logged_) { std::cerr << std::endl; } std::exit(1); } diff --git a/cpp/src/arrow/util/memory-pool.cc b/cpp/src/arrow/util/memory-pool.cc index fb417e74daf..961554fe06b 100644 --- a/cpp/src/arrow/util/memory-pool.cc +++ b/cpp/src/arrow/util/memory-pool.cc @@ -18,8 +18,8 @@ #include "arrow/util/memory-pool.h" #include -#include #include +#include #include "arrow/util/status.h" From be04b3eced435e3fe1b34a31a5914528942eb64a Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 18 Apr 2016 06:09:08 +0000 Subject: [PATCH 19/25] add unit tests for zero length row batches and non-null batches. fix bugs --- cpp/src/arrow/ipc/adapter.cc | 17 ++++---- cpp/src/arrow/ipc/ipc-adapter-test.cc | 63 +++++++++++++++++++++++---- cpp/src/arrow/ipc/test-common.h | 31 ++++++------- cpp/src/arrow/types/list.cc | 10 +++-- cpp/src/arrow/types/list.h | 3 +- cpp/src/arrow/types/primitive.cc | 3 ++ 6 files changed, 89 insertions(+), 38 deletions(-) diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index dd755411ff8..dd9b66cd9d3 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -139,8 +139,12 @@ class RowBatchWriter { int64_t offset = 0; for (size_t i = 0; i < buffers_.size(); ++i) { const Buffer* buffer = buffers_[i].get(); - int64_t size = buffer->size(); + int64_t size = 0; + // The buffer might be null if we are handling zero row lengths. + if (buffer) { + size = buffer->size(); + } // TODO(wesm): We currently have no notion of shared memory page id's, // but we've included it in the metadata IDL for when we have it in the // future. Use page=0 for now @@ -150,7 +154,7 @@ class RowBatchWriter { // may (in the future) associate integer page id's with physical memory // pages (according to whatever is the desired shared memory mechanism) buffer_meta_.push_back(flatbuf::Buffer(0, position + offset, size)); - + if (size > 0) { RETURN_NOT_OK(dst->Write(position + offset, buffer->data(), size)); offset += size; @@ -264,7 +268,6 @@ class RowBatchReader::Impl { // extract null_bitmap which is common to all arrays std::shared_ptr null_bitmap; if (field_meta.null_count == 0) { - null_bitmap = nullptr; ++buffer_index_; } else { RETURN_NOT_OK(GetBuffer(buffer_index_++, &null_bitmap)); @@ -275,18 +278,16 @@ class RowBatchReader::Impl { if (field_meta.length > 0) { RETURN_NOT_OK(GetBuffer(buffer_index_++, &data)); } else { + buffer_index_++; data.reset(new Buffer(nullptr, 0)); } return MakePrimitiveArray( type, field_meta.length, data, field_meta.null_count, null_bitmap, out); } + if (IsListType(type.get())) { std::shared_ptr offsets; - if (field_meta.length > 0) { - RETURN_NOT_OK(GetBuffer(buffer_index_++, &offsets)); - } else { - offsets.reset(new Buffer(nullptr, 0)); - } + RETURN_NOT_OK(GetBuffer(buffer_index_++, &offsets)); const int num_children = type->num_children(); if (num_children != 1) { std::stringstream ss; diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index 5422cfdd168..e73607f3c9d 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -84,7 +84,7 @@ TEST_P(TestWriteRowBatch, RoundTrip) { EXPECT_EQ(batch->num_rows(), batch_result->num_rows()); for (int i = 0; i < batch->num_columns(); ++i) { EXPECT_TRUE(batch->column(i)->Equals(batch_result->column(i))) - << i << batch->column_name(i); + << "Idx: " << i << " Name: " << batch->column_name(i); } } @@ -117,10 +117,52 @@ Status MakeListRowBatch(std::shared_ptr* out) { MemoryPool* pool = default_memory_pool(); const int length = 200; std::shared_ptr leaf_values, list_array, list_list_array, flat_array; + const bool include_nulls = true; + RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values)); + RETURN_NOT_OK(MakeRandomListArray(leaf_values, length, include_nulls, pool, &list_array)); + RETURN_NOT_OK(MakeRandomListArray(list_array, length, include_nulls, pool, &list_list_array)); + RETURN_NOT_OK(MakeRandomInt32Array(length, include_nulls, pool, &flat_array)); + out->reset(new RowBatch(schema, length, {list_array, list_list_array, flat_array})); + return Status::OK(); +} + +Status MakeZeroLengthRowBatch(std::shared_ptr* out) { + // Make the schema + auto f0 = std::make_shared("f0", LIST_INT32); + auto f1 = std::make_shared("f1", LIST_LIST_INT32); + auto f2 = std::make_shared("f2", INT32); + std::shared_ptr schema(new Schema({f0, f1, f2})); + + // Example data + MemoryPool* pool = default_memory_pool(); + const int length = 200; + const bool include_nulls = true; + std::shared_ptr leaf_values, list_array, list_list_array, flat_array; + RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &leaf_values)); + RETURN_NOT_OK(MakeRandomListArray(leaf_values, 0, include_nulls, pool, &list_array)); + RETURN_NOT_OK(MakeRandomListArray(list_array, 0, include_nulls, pool, &list_list_array)); + RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &flat_array)); + out->reset(new RowBatch(schema, length, {list_array, list_list_array, flat_array})); + return Status::OK(); +} + +Status MakeNonNullRowBatch(std::shared_ptr* out) { + // Make the schema + auto f0 = std::make_shared("f0", LIST_INT32); + auto f1 = std::make_shared("f1", LIST_LIST_INT32); + auto f2 = std::make_shared("f2", INT32); + std::shared_ptr schema(new Schema({f0, f1, f2})); + + // Example data + MemoryPool* pool = default_memory_pool(); + const int length = 200; + std::shared_ptr leaf_values, list_array, list_list_array, flat_array; + RETURN_NOT_OK(MakeRandomInt32Array(1000, true, pool, &leaf_values)); - RETURN_NOT_OK(MakeRandomListArray(leaf_values, length, pool, &list_array)); - RETURN_NOT_OK(MakeRandomListArray(list_array, length, pool, &list_list_array)); - RETURN_NOT_OK(MakeRandomInt32Array(length, true, pool, &flat_array)); + bool include_nulls = false; + RETURN_NOT_OK(MakeRandomListArray(leaf_values, 50, include_nulls, pool, &list_array)); + RETURN_NOT_OK(MakeRandomListArray(list_array, 50, include_nulls, pool, &list_list_array)); + RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &flat_array)); out->reset(new RowBatch(schema, length, {list_array, list_list_array, flat_array})); return Status::OK(); } @@ -131,10 +173,11 @@ Status MakeDeeplyNestedList(std::shared_ptr* out) { MemoryPool* pool = default_memory_pool(); ArrayPtr array; - RETURN_NOT_OK(MakeRandomInt32Array(1000, true, pool, &array)); + const bool include_nulls = true; + RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &array)); for (int i = 0; i < 63; ++i) { type = std::static_pointer_cast(std::make_shared(type)); - RETURN_NOT_OK(MakeRandomListArray(array, batch_length, pool, &array)); + RETURN_NOT_OK(MakeRandomListArray(array, batch_length, include_nulls, pool, &array)); } auto f0 = std::make_shared("f0", type); @@ -145,7 +188,8 @@ Status MakeDeeplyNestedList(std::shared_ptr* out) { } INSTANTIATE_TEST_CASE_P(RoundTripTests, TestWriteRowBatch, - ::testing::Values(&MakeIntRowBatch, &MakeListRowBatch, &MakeDeeplyNestedList)); + ::testing::Values(&MakeIntRowBatch, &MakeListRowBatch, &MakeNonNullRowBatch, &MakeZeroLengthRowBatch, + &MakeDeeplyNestedList)); class RecursionLimits : public ::testing::Test, public MemoryMapFixture { public: @@ -157,10 +201,11 @@ class RecursionLimits : public ::testing::Test, public MemoryMapFixture { const int batch_length = 5; TypePtr type = INT32; ArrayPtr array; - RETURN_NOT_OK(MakeRandomInt32Array(1000, true, pool_, &array)); + const bool include_nulls = true; + RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool_, &array)); for (int i = 0; i < recursion_level; ++i) { type = std::static_pointer_cast(std::make_shared(type)); - RETURN_NOT_OK(MakeRandomListArray(array, batch_length, pool_, &array)); + RETURN_NOT_OK(MakeRandomListArray(array, batch_length, include_nulls, pool_, &array)); } auto f0 = std::make_shared("f0", type); diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index a8f21c7909f..e7dbb84d790 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -79,10 +79,10 @@ Status MakeRandomInt32Array( } Status MakeRandomListArray(const std::shared_ptr& child_array, int num_lists, - MemoryPool* pool, std::shared_ptr* array) { + bool include_nulls, MemoryPool* pool, std::shared_ptr* array) { // Create the null list values std::vector valid_lists(num_lists); - const double null_percent = 0.1; + const double null_percent = include_nulls ? 0.1 : 0; test::random_null_bytes(num_lists, null_percent, valid_lists.data()); // Create list offsets @@ -92,19 +92,20 @@ Status MakeRandomListArray(const std::shared_ptr& child_array, int num_li std::vector offsets( num_lists + 1, 0); // +1 so we can shift for nulls. See partial sum below. const int seed = child_array->length(); - test::rand_uniform_int(num_lists, seed, 0, max_list_size, list_sizes.data()); - // make sure sizes are consistent with null - std::transform(list_sizes.begin(), list_sizes.end(), valid_lists.begin(), - list_sizes.begin(), - [](int32_t size, int32_t valid) { return valid == 0 ? 0 : size; }); - std::partial_sum(list_sizes.begin(), list_sizes.end(), ++offsets.begin()); - - // Force invariants - const int child_length = child_array->length(); - offsets[0] = 0; - std::replace_if(offsets.begin(), offsets.end(), - [child_length](int32_t offset) { return offset > child_length; }, child_length); - + if (num_lists > 0) { + test::rand_uniform_int(num_lists, seed, 0, max_list_size, list_sizes.data()); + // make sure sizes are consistent with null + std::transform(list_sizes.begin(), list_sizes.end(), valid_lists.begin(), + list_sizes.begin(), + [](int32_t size, int32_t valid) { return valid == 0 ? 0 : size; }); + std::partial_sum(list_sizes.begin(), list_sizes.end(), ++offsets.begin()); + + // Force invariants + const int child_length = child_array->length(); + offsets[0] = 0; + std::replace_if(offsets.begin(), offsets.end(), + [child_length](int32_t offset) { return offset > child_length; }, child_length); + } ListBuilder builder(pool, child_array); RETURN_NOT_OK(builder.Append(offsets.data(), num_lists, valid_lists.data())); *array = builder.Finish(); diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 621612f96a5..8beef1f5014 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -24,14 +24,17 @@ bool ListArray::EqualsExact(const ListArray& other) const { if (this == &other) { return true; } if (null_count_ != other.null_count_) { return false; } - bool equal_offsets = offset_buf_->Equals(*other.offset_buf_, length_ + 1); + bool equal_offsets = offset_buf_->Equals(*other.offset_buf_, (length_ + 1) * sizeof(int32_t)); + if (!equal_offsets) { + return false; + } bool equal_null_bitmap = true; if (null_count_ > 0) { equal_null_bitmap = null_bitmap_->Equals(*other.null_bitmap_, util::bytes_for_bits(length_)); } - if (!(equal_offsets && equal_null_bitmap)) { return false; } + if (!equal_null_bitmap) { return false; } return values()->Equals(other.values()); } @@ -43,10 +46,9 @@ bool ListArray::Equals(const std::shared_ptr& arr) const { } Status ListArray::Validate() const { - if (length_ == 0) { return Status::OK(); } if (length_ < 0) { return Status::Invalid("Length was negative"); } if (!offset_buf_) { - return Status::Invalid("offset_buf_ is null with non-zero_length"); + return Status::Invalid("offset_buf_ was null"); } if (offset_buf_->size() / sizeof(int32_t) < length_) { std::stringstream ss; diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 76b310773bc..e2302d917b8 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -142,8 +142,7 @@ class ListBuilder : public ArrayBuilder { std::shared_ptr items = values_; if (!items) { items = value_builder_->Finish(); } - // Add final offset if the length is non-zero - if (length_) { offset_builder_.UnsafeAppend(items->length()); } + offset_builder_.Append(items->length()); const auto offsets_buffer = offset_builder_.Finish(); auto result = std::make_shared( diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index c1880ab92c4..e1adf1c448d 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -57,6 +57,9 @@ bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { } return true; } else { + if (length_ == 0 && other.length_ == 0) { + return true; + } return data_->Equals(*other.data_, length_); } } From 8982723f2aada7faa64e19055f25c4d68bc4d51e Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 18 Apr 2016 06:31:55 +0000 Subject: [PATCH 20/25] remaining style cleanup --- cpp/src/arrow/builder.cc | 2 +- cpp/src/arrow/ipc/adapter.cc | 12 +++++------- cpp/src/arrow/ipc/ipc-adapter-test.cc | 27 ++++++++++++++------------- cpp/src/arrow/type.h | 2 +- cpp/src/arrow/types/list.cc | 11 ++++------- cpp/src/arrow/types/primitive.cc | 4 +--- 6 files changed, 26 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index eded671d611..87c1219025d 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -105,7 +105,7 @@ void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int32_t leng } for (int32_t i = 0; i < length; ++i) { // TODO(emkornfield) Optimize for large values of length? - AppendToBitmap(valid_bytes[i] > 0); + UnsafeAppendToBitmap(valid_bytes[i] > 0); } } diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index dd9b66cd9d3..8430c1e7d31 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -103,12 +103,12 @@ Status VisitArray(const Array* arr, std::vector* field_nodes buffers->push_back(std::make_shared(nullptr, 0)); } - const auto arr_type = arr->type().get(); + const DataType* arr_type = arr->type().get(); if (IsPrimitive(arr_type)) { - const PrimitiveArray* prim_arr = static_cast(arr); + const auto prim_arr = static_cast(arr); buffers->push_back(prim_arr->data()); } else if (IsListType(arr_type)) { - const ListArray* list_arr = static_cast(arr); + const auto list_arr = static_cast(arr); buffers->push_back(list_arr->offset_buffer()); RETURN_NOT_OK(VisitArray( list_arr->values().get(), field_nodes, buffers, max_recursion_depth - 1)); @@ -142,9 +142,7 @@ class RowBatchWriter { int64_t size = 0; // The buffer might be null if we are handling zero row lengths. - if (buffer) { - size = buffer->size(); - } + if (buffer) { size = buffer->size(); } // TODO(wesm): We currently have no notion of shared memory page id's, // but we've included it in the metadata IDL for when we have it in the // future. Use page=0 for now @@ -154,7 +152,7 @@ class RowBatchWriter { // may (in the future) associate integer page id's with physical memory // pages (according to whatever is the desired shared memory mechanism) buffer_meta_.push_back(flatbuf::Buffer(0, position + offset, size)); - + if (size > 0) { RETURN_NOT_OK(dst->Write(position + offset, buffer->data(), size)); offset += size; diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index e73607f3c9d..c243cfba820 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -119,8 +119,10 @@ Status MakeListRowBatch(std::shared_ptr* out) { std::shared_ptr leaf_values, list_array, list_list_array, flat_array; const bool include_nulls = true; RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values)); - RETURN_NOT_OK(MakeRandomListArray(leaf_values, length, include_nulls, pool, &list_array)); - RETURN_NOT_OK(MakeRandomListArray(list_array, length, include_nulls, pool, &list_list_array)); + RETURN_NOT_OK( + MakeRandomListArray(leaf_values, length, include_nulls, pool, &list_array)); + RETURN_NOT_OK( + MakeRandomListArray(list_array, length, include_nulls, pool, &list_list_array)); RETURN_NOT_OK(MakeRandomInt32Array(length, include_nulls, pool, &flat_array)); out->reset(new RowBatch(schema, length, {list_array, list_list_array, flat_array})); return Status::OK(); @@ -140,7 +142,8 @@ Status MakeZeroLengthRowBatch(std::shared_ptr* out) { std::shared_ptr leaf_values, list_array, list_list_array, flat_array; RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &leaf_values)); RETURN_NOT_OK(MakeRandomListArray(leaf_values, 0, include_nulls, pool, &list_array)); - RETURN_NOT_OK(MakeRandomListArray(list_array, 0, include_nulls, pool, &list_list_array)); + RETURN_NOT_OK( + MakeRandomListArray(list_array, 0, include_nulls, pool, &list_list_array)); RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &flat_array)); out->reset(new RowBatch(schema, length, {list_array, list_list_array, flat_array})); return Status::OK(); @@ -157,11 +160,12 @@ Status MakeNonNullRowBatch(std::shared_ptr* out) { MemoryPool* pool = default_memory_pool(); const int length = 200; std::shared_ptr leaf_values, list_array, list_list_array, flat_array; - + RETURN_NOT_OK(MakeRandomInt32Array(1000, true, pool, &leaf_values)); - bool include_nulls = false; + bool include_nulls = false; RETURN_NOT_OK(MakeRandomListArray(leaf_values, 50, include_nulls, pool, &list_array)); - RETURN_NOT_OK(MakeRandomListArray(list_array, 50, include_nulls, pool, &list_list_array)); + RETURN_NOT_OK( + MakeRandomListArray(list_array, 50, include_nulls, pool, &list_list_array)); RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &flat_array)); out->reset(new RowBatch(schema, length, {list_array, list_list_array, flat_array})); return Status::OK(); @@ -188,8 +192,8 @@ Status MakeDeeplyNestedList(std::shared_ptr* out) { } INSTANTIATE_TEST_CASE_P(RoundTripTests, TestWriteRowBatch, - ::testing::Values(&MakeIntRowBatch, &MakeListRowBatch, &MakeNonNullRowBatch, &MakeZeroLengthRowBatch, - &MakeDeeplyNestedList)); + ::testing::Values(&MakeIntRowBatch, &MakeListRowBatch, &MakeNonNullRowBatch, + &MakeZeroLengthRowBatch, &MakeDeeplyNestedList)); class RecursionLimits : public ::testing::Test, public MemoryMapFixture { public: @@ -205,7 +209,8 @@ class RecursionLimits : public ::testing::Test, public MemoryMapFixture { RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool_, &array)); for (int i = 0; i < recursion_level; ++i) { type = std::static_pointer_cast(std::make_shared(type)); - RETURN_NOT_OK(MakeRandomListArray(array, batch_length, include_nulls, pool_, &array)); + RETURN_NOT_OK( + MakeRandomListArray(array, batch_length, include_nulls, pool_, &array)); } auto f0 = std::make_shared("f0", type); @@ -247,9 +252,5 @@ TEST_F(RecursionLimits, ReadLimit) { ASSERT_RAISES(Invalid, reader->GetRowBatch(schema, &batch_result)); } -// TODO(emkornfield) More tests -// Test primitive and lists with zero elements -// Tests lists and primitives with no nulls -// String type } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index fe07f7cf493..77404cd7025 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -116,7 +116,7 @@ struct DataType { bool Equals(const DataType* other) { // Call with a pointer so more friendly to subclasses - return this == other || ((other != nullptr) && (this->type == other->type)); + return other && ((this == other) || (this->type == other->type)); } bool Equals(const std::shared_ptr& other) { return Equals(other.get()); } diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 8beef1f5014..fc3331139c6 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -24,10 +24,9 @@ bool ListArray::EqualsExact(const ListArray& other) const { if (this == &other) { return true; } if (null_count_ != other.null_count_) { return false; } - bool equal_offsets = offset_buf_->Equals(*other.offset_buf_, (length_ + 1) * sizeof(int32_t)); - if (!equal_offsets) { - return false; - } + bool equal_offsets = + offset_buf_->Equals(*other.offset_buf_, (length_ + 1) * sizeof(int32_t)); + if (!equal_offsets) { return false; } bool equal_null_bitmap = true; if (null_count_ > 0) { equal_null_bitmap = @@ -47,9 +46,7 @@ bool ListArray::Equals(const std::shared_ptr& arr) const { Status ListArray::Validate() const { if (length_ < 0) { return Status::Invalid("Length was negative"); } - if (!offset_buf_) { - return Status::Invalid("offset_buf_ was null"); - } + if (!offset_buf_) { return Status::Invalid("offset_buf_ was null"); } if (offset_buf_->size() / sizeof(int32_t) < length_) { std::stringstream ss; ss << "offset buffer size (bytes): " << offset_buf_->size() diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index e1adf1c448d..9102c530e25 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -57,9 +57,7 @@ bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { } return true; } else { - if (length_ == 0 && other.length_ == 0) { - return true; - } + if (length_ == 0 && other.length_ == 0) { return true; } return data_->Equals(*other.data_, length_); } } From 5e1581505f7b58d3cb57437860356c9fb5631431 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 18 Apr 2016 06:37:39 +0000 Subject: [PATCH 21/25] fix make lint --- cpp/src/arrow/ipc/adapter.cc | 2 +- cpp/src/arrow/ipc/memory.cc | 3 ++- cpp/src/arrow/ipc/metadata-internal.cc | 3 ++- cpp/src/arrow/ipc/metadata-internal.h | 3 ++- cpp/src/arrow/ipc/metadata.cc | 3 ++- cpp/src/arrow/util/logging.h | 2 +- 6 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 8430c1e7d31..bf6fa94dea7 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -206,7 +206,7 @@ class RowBatchWriter { Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, int64_t position, int64_t* header_offset, int max_recursion_depth) { - DCHECK(max_recursion_depth > 0); + DCHECK_GT(max_recursion_depth, 0); RowBatchWriter serializer(batch, max_recursion_depth); RETURN_NOT_OK(serializer.AssemblePayload()); return serializer.Write(dst, position, header_offset); diff --git a/cpp/src/arrow/ipc/memory.cc b/cpp/src/arrow/ipc/memory.cc index 9150538fcd0..84cbc182cd2 100644 --- a/cpp/src/arrow/ipc/memory.cc +++ b/cpp/src/arrow/ipc/memory.cc @@ -17,6 +17,8 @@ #include "arrow/ipc/memory.h" +#include // For memory-mapping + #include #include #include @@ -24,7 +26,6 @@ #include #include #include -#include // For memory-mapping #include "arrow/util/buffer.h" #include "arrow/util/status.h" diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index e6e109f6078..1b1d50f96ea 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -19,11 +19,12 @@ #include #include -#include #include #include #include +#include "flatbuffers/flatbuffers.h" + #include "arrow/ipc/Message_generated.h" #include "arrow/schema.h" #include "arrow/type.h" diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h index bec92b80210..871b5bc4bf6 100644 --- a/cpp/src/arrow/ipc/metadata-internal.h +++ b/cpp/src/arrow/ipc/metadata-internal.h @@ -19,10 +19,11 @@ #define ARROW_IPC_METADATA_INTERNAL_H #include -#include #include #include +#include "flatbuffers/flatbuffers.h" + #include "arrow/ipc/Message_generated.h" namespace arrow { diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc index ed233b74ae8..4fc8ec50eb7 100644 --- a/cpp/src/arrow/ipc/metadata.cc +++ b/cpp/src/arrow/ipc/metadata.cc @@ -18,10 +18,11 @@ #include "arrow/ipc/metadata.h" #include -#include #include #include +#include "flatbuffers/flatbuffers.h" + // Generated C++ flatbuffer IDL #include "arrow/ipc/Message_generated.h" #include "arrow/ipc/metadata-internal.h" diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index 96e36266e33..6b68aeefb3b 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -118,7 +118,7 @@ class CerrLog { class FatalLog : public CerrLog { public: FatalLog(int /* severity */) // NOLINT - : CerrLog(ARROW_FATAL){} + : CerrLog(ARROW_FATAL) {} [[noreturn]] ~FatalLog() { if (has_logged_) { std::cerr << std::endl; } From 6e5772884f79d6f89629ea638d58228e52960020 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 18 Apr 2016 06:38:16 +0000 Subject: [PATCH 22/25] make format fixes --- cpp/src/arrow/util/logging.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index 6b68aeefb3b..49aaa47b80f 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -118,7 +118,7 @@ class CerrLog { class FatalLog : public CerrLog { public: FatalLog(int /* severity */) // NOLINT - : CerrLog(ARROW_FATAL) {} + : CerrLog(ARROW_FATAL){} // NOLINT [[noreturn]] ~FatalLog() { if (has_logged_) { std::cerr << std::endl; } From 77892058b51c1d276d3dc96c1c8349fd3f21cfa0 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 18 Apr 2016 06:49:37 +0000 Subject: [PATCH 23/25] make clang-format-3.7 happy --- cpp/src/arrow/types/primitive-test.cc | 3 +-- cpp/src/arrow/util/logging.h | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 65923b0c908..2b4c0879a28 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -238,8 +238,7 @@ void TestPrimitiveBuilder::Check( } typedef ::testing::Types - Primitives; + PInt32, PInt64, PFloat, PDouble> Primitives; TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives); diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index 49aaa47b80f..2dffa9851d5 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -117,10 +117,10 @@ class CerrLog { // return so we create a new class to give it a hint. class FatalLog : public CerrLog { public: - FatalLog(int /* severity */) // NOLINT - : CerrLog(ARROW_FATAL){} // NOLINT + FatalLog(int /* severity */) // NOLINT + : CerrLog(ARROW_FATAL) {} // NOLINT - [[noreturn]] ~FatalLog() { + [[noreturn]] ~FatalLog() { if (has_logged_) { std::cerr << std::endl; } std::exit(1); } From 0af558b72b0dd65e763bf3f060b598fbe703d289 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 18 Apr 2016 06:56:36 +0000 Subject: [PATCH 24/25] remove a now unnecessary NOLINT, but mostly to trigger another travis-ci job that failed due to apt get issue --- cpp/src/arrow/util/logging.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index 2dffa9851d5..263f0c73d21 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -118,7 +118,7 @@ class CerrLog { class FatalLog : public CerrLog { public: FatalLog(int /* severity */) // NOLINT - : CerrLog(ARROW_FATAL) {} // NOLINT + : CerrLog(ARROW_FATAL) {} [[noreturn]] ~FatalLog() { if (has_logged_) { std::cerr << std::endl; } From 0c5162d1c5024cd21217dd0a01438f214e6c297c Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 18 Apr 2016 07:03:21 +0000 Subject: [PATCH 25/25] another format fix --- cpp/src/arrow/util/logging.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index 263f0c73d21..fccc5e3085d 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -117,7 +117,7 @@ class CerrLog { // return so we create a new class to give it a hint. class FatalLog : public CerrLog { public: - FatalLog(int /* severity */) // NOLINT + FatalLog(int /* severity */) // NOLINT : CerrLog(ARROW_FATAL) {} [[noreturn]] ~FatalLog() {