diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 44270d1fc2f..df6051e3d0e 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -2919,6 +2919,8 @@ TEST_F(TestListArray, TestAppendNull) { auto values = result_->values(); ASSERT_EQ(0, values->length()); + // Values buffer should be non-null + ASSERT_NE(nullptr, values->data()->buffers[1]); } void ValidateBasicListArray(const ListArray* result, const vector& values, diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index fb6ccafafec..6222e37faf4 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -664,12 +664,30 @@ namespace internal { struct ValidateVisitor { Status Visit(const NullArray&) { return Status::OK(); } - Status Visit(const PrimitiveArray&) { return Status::OK(); } + Status Visit(const PrimitiveArray& array) { + if (array.data()->buffers.size() != 2) { + return Status::Invalid("number of buffers was != 2"); + } + if (array.values() == nullptr) { + return Status::Invalid("values was null"); + } + return Status::OK(); + } - Status Visit(const Decimal128Array&) { return Status::OK(); } + Status Visit(const Decimal128Array& array) { + if (array.data()->buffers.size() != 2) { + return Status::Invalid("number of buffers was != 2"); + } + if (array.values() == nullptr) { + return Status::Invalid("values was null"); + } + return Status::OK(); + } - Status Visit(const BinaryArray&) { - // TODO(wesm): what to do here? + Status Visit(const BinaryArray& array) { + if (array.data()->buffers.size() != 3) { + return Status::Invalid("number of buffers was != 3"); + } return Status::OK(); } @@ -688,24 +706,24 @@ struct ValidateVisitor { << " isn't large enough for length: " << array.length(); return Status::Invalid(ss.str()); } + + if (!array.values()) { + return Status::Invalid("values was null"); + } + const int32_t last_offset = array.value_offset(array.length()); - if (last_offset > 0) { - if (!array.values()) { - return Status::Invalid("last offset was non-zero and values was null"); - } - if (array.values()->length() != last_offset) { - std::stringstream ss; - ss << "Final offset invariant not equal to values length: " << last_offset - << "!=" << array.values()->length(); - return Status::Invalid(ss.str()); - } + if (array.values()->length() != last_offset) { + std::stringstream ss; + ss << "Final offset invariant not equal to values length: " << last_offset + << "!=" << array.values()->length(); + return Status::Invalid(ss.str()); + } - const Status child_valid = ValidateArray(*array.values()); - if (!child_valid.ok()) { - std::stringstream ss; - ss << "Child array invalid: " << child_valid.ToString(); - return Status::Invalid(ss.str()); - } + const Status child_valid = ValidateArray(*array.values()); + if (!child_valid.ok()) { + std::stringstream ss; + ss << "Child array invalid: " << child_valid.ToString(); + return Status::Invalid(ss.str()); } int32_t prev_offset = array.value_offset(0); diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 1b779452635..152f1fc2603 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -54,6 +54,7 @@ Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer) { // zero the padding buffer->ZeroPadding(); } else { + // Null buffers are allowed in place of 0-byte buffers DCHECK_EQ(bytes_filled, 0); } return Status::OK(); @@ -1336,6 +1337,10 @@ Status ListBuilder::FinishInternal(std::shared_ptr* out) { if (values_) { items = values_->data(); } else { + if (value_builder_->length() == 0) { + // Try to make sure we get a non-null values buffer (ARROW-2744) + RETURN_NOT_OK(value_builder_->Resize(0)); + } RETURN_NOT_OK(value_builder_->FinishInternal(&items)); } @@ -1632,6 +1637,10 @@ Status StructBuilder::FinishInternal(std::shared_ptr* out) { (*out)->child_data.resize(field_builders_.size()); for (size_t i = 0; i < field_builders_.size(); ++i) { + if (length_ == 0) { + // Try to make sure the child buffers are initialized + RETURN_NOT_OK(field_builders_[i]->Resize(0)); + } RETURN_NOT_OK(field_builders_[i]->FinishInternal(&(*out)->child_data[i])); } diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 2f0237ed263..4e9fab17c44 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -100,6 +100,7 @@ Status MakeRandomInt32Array(int64_t length, bool include_nulls, MemoryPool* pool std::shared_ptr data; RETURN_NOT_OK(test::MakeRandomInt32PoolBuffer(length, pool, &data)); Int32Builder builder(int32(), pool); + RETURN_NOT_OK(builder.Init(length)); if (include_nulls) { std::shared_ptr valid_bytes; RETURN_NOT_OK(test::MakeRandomBytePoolBuffer(length, pool, &valid_bytes)); @@ -127,6 +128,7 @@ Status MakeRandomListArray(const std::shared_ptr& child_array, int num_li std::vector offsets( num_lists + 1, 0); // +1 so we can shift for nulls. See partial sum below. const uint32_t seed = static_cast(child_array->length()); + if (num_lists > 0) { test::rand_uniform_int(num_lists, seed, 0, max_list_size, list_sizes.data()); // make sure sizes are consistent with null diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index fec65b93cb7..23c9e3029c5 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -158,7 +158,7 @@ def test_pandas_parquet_2_0_rountrip(tmpdir, chunk_size): @parquet -def test_chunked_table_write(tmpdir): +def test_chunked_table_write(): # ARROW-232 df = alltypes_sample(size=10) @@ -176,7 +176,7 @@ def test_chunked_table_write(tmpdir): @parquet -def test_empty_table_roundtrip(tmpdir): +def test_empty_table_roundtrip(): df = alltypes_sample(size=10) # The nanosecond->us conversion is a nuisance, so we just avoid it here del df['datetime'] @@ -192,6 +192,14 @@ def test_empty_table_roundtrip(tmpdir): _check_roundtrip(table, version='2.0') +@parquet +def test_empty_lists_table_roundtrip(): + # ARROW-2744: Shouldn't crash when writing an array of empty lists + arr = pa.array([[], []], type=pa.list_(pa.int32())) + table = pa.Table.from_arrays([arr], ["A"]) + _check_roundtrip(table) + + @parquet def test_pandas_parquet_datetime_tz(): import pyarrow.parquet as pq