diff --git a/cpp/src/arrow/json/CMakeLists.txt b/cpp/src/arrow/json/CMakeLists.txt index 5f26a2e1ebd..f09b15ce51c 100644 --- a/cpp/src/arrow/json/CMakeLists.txt +++ b/cpp/src/arrow/json/CMakeLists.txt @@ -17,6 +17,7 @@ add_arrow_test(test SOURCES + chunked_builder_test.cc chunker_test.cc converter_test.cc parser_test.cc diff --git a/cpp/src/arrow/json/chunked_builder.cc b/cpp/src/arrow/json/chunked_builder.cc index 01385d2b8e1..040009c764f 100644 --- a/cpp/src/arrow/json/chunked_builder.cc +++ b/cpp/src/arrow/json/chunked_builder.cc @@ -27,11 +27,13 @@ #include "arrow/buffer.h" #include "arrow/json/converter.h" #include "arrow/table.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" #include "arrow/util/task_group.h" namespace arrow { +using internal::checked_cast; using internal::TaskGroup; namespace json { @@ -199,15 +201,6 @@ class ChunkedListArrayBuilder : public ChunkedArrayBuilder { const std::shared_ptr& unconverted) override { std::unique_lock lock(mutex_); - auto list_array = static_cast(unconverted.get()); - - if (null_bitmap_chunks_.size() <= static_cast(block_index)) { - null_bitmap_chunks_.resize(static_cast(block_index) + 1, nullptr); - offset_chunks_.resize(null_bitmap_chunks_.size(), nullptr); - } - null_bitmap_chunks_[block_index] = unconverted->null_bitmap(); - offset_chunks_[block_index] = list_array->value_offsets(); - if (unconverted->type_id() == Type::NA) { auto st = InsertNull(block_index, unconverted->length()); if (!st.ok()) { @@ -217,8 +210,17 @@ class ChunkedListArrayBuilder : public ChunkedArrayBuilder { } DCHECK_EQ(unconverted->type_id(), Type::LIST); - value_builder_->Insert(block_index, list_array->list_type()->value_field(), - list_array->values()); + const auto& list_array = checked_cast(*unconverted); + + if (null_bitmap_chunks_.size() <= static_cast(block_index)) { + null_bitmap_chunks_.resize(static_cast(block_index) + 1, nullptr); + offset_chunks_.resize(null_bitmap_chunks_.size(), nullptr); + } + 
null_bitmap_chunks_[block_index] = unconverted->null_bitmap(); + offset_chunks_[block_index] = list_array.value_offsets(); + + value_builder_->Insert(block_index, list_array.list_type()->value_field(), + list_array.values()); } Status Finish(std::shared_ptr* out) override { @@ -305,17 +307,17 @@ class ChunkedStructArrayBuilder : public ChunkedArrayBuilder { return; } - auto struct_array = std::static_pointer_cast(unconverted); + const auto& struct_array = checked_cast(*unconverted); if (promotion_graph_ == nullptr) { // If unexpected fields are ignored or result in an error then all parsers will emit // columns exclusively in the ordering specified in ParseOptions::explicit_schema, // so child_builders_ is immutable and no associative lookup is necessary. for (int i = 0; i < unconverted->num_fields(); ++i) { child_builders_[i]->Insert(block_index, unconverted->type()->field(i), - struct_array->field(i)); + struct_array.field(i)); } } else { - auto st = InsertChildren(block_index, struct_array.get()); + auto st = InsertChildren(block_index, struct_array); if (!st.ok()) { return task_group_->Append([st] { return st; }); } @@ -383,10 +385,10 @@ class ChunkedStructArrayBuilder : public ChunkedArrayBuilder { // Insert children associatively by name; the unconverted block may have unexpected or // differently ordered fields // call from Insert() only, with mutex_ locked - Status InsertChildren(int64_t block_index, const StructArray* unconverted) { - const auto& fields = unconverted->type()->fields(); + Status InsertChildren(int64_t block_index, const StructArray& unconverted) { + const auto& fields = unconverted.type()->fields(); - for (int i = 0; i < unconverted->num_fields(); ++i) { + for (int i = 0; i < unconverted.num_fields(); ++i) { auto it = name_to_index_.find(fields[i]->name()); if (it == name_to_index_.end()) { @@ -405,9 +407,9 @@ class ChunkedStructArrayBuilder : public ChunkedArrayBuilder { child_builders_.emplace_back(std::move(child_builder)); } - auto 
unconverted_field = unconverted->type()->field(i); + auto unconverted_field = unconverted.type()->field(i); child_builders_[it->second]->Insert(block_index, unconverted_field, - unconverted->field(i)); + unconverted.field(i)); child_absent_[block_index].resize(child_builders_.size(), true); child_absent_[block_index][it->second] = false; @@ -444,12 +446,12 @@ Status MakeChunkedArrayBuilder(const std::shared_ptr& task_group, return Status::OK(); } if (type->id() == Type::LIST) { - auto list_type = static_cast(type.get()); + const auto& list_type = checked_cast(*type); std::shared_ptr value_builder; RETURN_NOT_OK(MakeChunkedArrayBuilder(task_group, pool, promotion_graph, - list_type->value_type(), &value_builder)); + list_type.value_type(), &value_builder)); *out = std::make_shared( - task_group, pool, std::move(value_builder), list_type->value_field()); + task_group, pool, std::move(value_builder), list_type.value_field()); return Status::OK(); } std::shared_ptr converter; diff --git a/cpp/src/arrow/json/chunked_builder_test.cc b/cpp/src/arrow/json/chunked_builder_test.cc index 5c57b4963bc..a3810316f76 100644 --- a/cpp/src/arrow/json/chunked_builder_test.cc +++ b/cpp/src/arrow/json/chunked_builder_test.cc @@ -40,7 +40,7 @@ using internal::checked_cast; using internal::GetCpuThreadPool; using internal::TaskGroup; -void AssertBuilding(const std::unique_ptr& builder, +void AssertBuilding(const std::shared_ptr& builder, const std::vector& chunks, std::shared_ptr* out) { ArrayVector unconverted; @@ -67,9 +67,8 @@ std::shared_ptr ExtractField(const std::string& name, for (auto& chunk : chunks) { chunk = checked_cast(*chunk).GetFieldByName(name); } - auto struct_type = static_cast(columns.type().get()); - return std::make_shared(chunks, - struct_type->GetFieldByName(name)->type()); + const auto& struct_type = checked_cast(*columns.type()); + return std::make_shared(chunks, struct_type.GetFieldByName(name)->type()); } void AssertFieldEqual(const std::vector& path, @@ -83,27 
+82,9 @@ void AssertFieldEqual(const std::vector& path, AssertChunkedEqual(expected, *actual); } -template -std::string RowsOfOneColumn(string_view name, std::initializer_list values, - decltype(std::to_string(*values.begin()))* = nullptr) { - std::stringstream ss; - for (auto value : values) { - ss << R"({")" << name << R"(":)" << std::to_string(value) << "}\n"; - } - return ss.str(); -} - -std::string RowsOfOneColumn(string_view name, std::initializer_list values) { - std::stringstream ss; - for (auto value : values) { - ss << R"({")" << name << R"(":)" << value << "}\n"; - } - return ss.str(); -} - TEST(ChunkedArrayBuilder, Empty) { auto tg = TaskGroup::MakeSerial(); - std::unique_ptr builder; + std::shared_ptr builder; ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), nullptr, struct_({field("a", int32())}), &builder)); @@ -116,7 +97,7 @@ TEST(ChunkedArrayBuilder, Empty) { TEST(ChunkedArrayBuilder, Basics) { auto tg = TaskGroup::MakeSerial(); - std::unique_ptr builder; + std::shared_ptr builder; ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), nullptr, struct_({field("a", int32())}), &builder)); @@ -130,7 +111,7 @@ TEST(ChunkedArrayBuilder, Basics) { TEST(ChunkedArrayBuilder, Insert) { auto tg = TaskGroup::MakeSerial(); - std::unique_ptr builder; + std::shared_ptr builder; ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), nullptr, struct_({field("a", int32())}), &builder)); @@ -151,7 +132,7 @@ TEST(ChunkedArrayBuilder, Insert) { TEST(ChunkedArrayBuilder, MultipleChunks) { auto tg = TaskGroup::MakeSerial(); - std::unique_ptr builder; + std::shared_ptr builder; ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), nullptr, struct_({field("a", int32())}), &builder)); @@ -170,7 +151,7 @@ TEST(ChunkedArrayBuilder, MultipleChunks) { TEST(ChunkedArrayBuilder, MultipleChunksParallel) { auto tg = TaskGroup::MakeThreaded(GetCpuThreadPool()); - std::unique_ptr builder; + std::shared_ptr builder; 
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), nullptr, struct_({field("a", int32())}), &builder)); @@ -194,7 +175,7 @@ TEST(ChunkedArrayBuilder, MultipleChunksParallel) { TEST(InferringChunkedArrayBuilder, Empty) { auto tg = TaskGroup::MakeSerial(); - std::unique_ptr builder; + std::shared_ptr builder; ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(), struct_({}), &builder)); @@ -207,7 +188,7 @@ TEST(InferringChunkedArrayBuilder, Empty) { TEST(InferringChunkedArrayBuilder, SingleChunkNull) { auto tg = TaskGroup::MakeSerial(); - std::unique_ptr builder; + std::shared_ptr builder; ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(), struct_({}), &builder)); @@ -224,7 +205,7 @@ TEST(InferringChunkedArrayBuilder, SingleChunkNull) { TEST(InferringChunkedArrayBuilder, MultipleChunkNull) { auto tg = TaskGroup::MakeSerial(); - std::unique_ptr builder; + std::shared_ptr builder; ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(), struct_({}), &builder)); @@ -244,7 +225,7 @@ TEST(InferringChunkedArrayBuilder, MultipleChunkNull) { TEST(InferringChunkedArrayBuilder, SingleChunkInteger) { auto tg = TaskGroup::MakeSerial(); - std::unique_ptr builder; + std::shared_ptr builder; ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(), struct_({}), &builder)); @@ -264,7 +245,7 @@ TEST(InferringChunkedArrayBuilder, SingleChunkInteger) { TEST(InferringChunkedArrayBuilder, MultipleChunkInteger) { auto tg = TaskGroup::MakeSerial(); - std::unique_ptr builder; + std::shared_ptr builder; ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(), struct_({}), &builder)); @@ -285,7 +266,7 @@ TEST(InferringChunkedArrayBuilder, MultipleChunkInteger) { TEST(InferringChunkedArrayBuilder, SingleChunkDouble) { auto tg = TaskGroup::MakeSerial(); - std::unique_ptr builder; + std::shared_ptr builder; ASSERT_OK(MakeChunkedArrayBuilder(tg, 
default_memory_pool(), GetPromotionGraph(), struct_({}), &builder)); @@ -305,7 +286,7 @@ TEST(InferringChunkedArrayBuilder, SingleChunkDouble) { TEST(InferringChunkedArrayBuilder, MultipleChunkDouble) { auto tg = TaskGroup::MakeSerial(); - std::unique_ptr builder; + std::shared_ptr builder; ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(), struct_({}), &builder)); @@ -327,7 +308,7 @@ TEST(InferringChunkedArrayBuilder, MultipleChunkDouble) { TEST(InferringChunkedArrayBuilder, SingleChunkTimestamp) { auto tg = TaskGroup::MakeSerial(); - std::unique_ptr builder; + std::shared_ptr builder; ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(), struct_({}), &builder)); @@ -348,7 +329,7 @@ TEST(InferringChunkedArrayBuilder, SingleChunkTimestamp) { TEST(InferringChunkedArrayBuilder, MultipleChunkTimestamp) { auto tg = TaskGroup::MakeSerial(); - std::unique_ptr builder; + std::shared_ptr builder; ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(), struct_({}), &builder)); @@ -371,7 +352,7 @@ TEST(InferringChunkedArrayBuilder, MultipleChunkTimestamp) { TEST(InferringChunkedArrayBuilder, SingleChunkString) { auto tg = TaskGroup::MakeSerial(); - std::unique_ptr builder; + std::shared_ptr builder; ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(), struct_({}), &builder)); @@ -392,7 +373,7 @@ TEST(InferringChunkedArrayBuilder, SingleChunkString) { TEST(InferringChunkedArrayBuilder, MultipleChunkString) { auto tg = TaskGroup::MakeSerial(); - std::unique_ptr builder; + std::shared_ptr builder; ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(), struct_({}), &builder)); @@ -415,7 +396,7 @@ TEST(InferringChunkedArrayBuilder, MultipleChunkString) { TEST(InferringChunkedArrayBuilder, MultipleChunkIntegerParallel) { auto tg = TaskGroup::MakeThreaded(GetCpuThreadPool()); - std::unique_ptr builder; + std::shared_ptr builder; 
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(), struct_({}), &builder)); @@ -433,5 +414,41 @@ TEST(InferringChunkedArrayBuilder, MultipleChunkIntegerParallel) { AssertFieldEqual({"a"}, actual, *expected); } +TEST(InferringChunkedArrayBuilder, SingleChunkList) { + auto tg = TaskGroup::MakeSerial(); + std::shared_ptr builder; + ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(), + struct_({}), &builder)); + + std::shared_ptr actual; + AssertBuilding(builder, + { + std::string("{}\n") + "{\"a\": []}\n" + "{\"a\": [1, 2]}\n", + }, + &actual); + + auto expected = ChunkedArrayFromJSON(list(int64()), {"[null, [], [1, 2]]"}); + AssertFieldEqual({"a"}, actual, *expected); +} + +TEST(InferringChunkedArrayBuilder, MultipleChunkList) { + auto tg = TaskGroup::MakeSerial(); + std::shared_ptr builder; + ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(), + struct_({}), &builder)); + + std::shared_ptr actual; + AssertBuilding(builder, + { + "{}\n", + "{\"a\": []}\n", + "{\"a\": [1, 2]}\n", + }, + &actual); + + auto expected = ChunkedArrayFromJSON(list(int64()), {"[null]", "[[]]", "[[1, 2]]"}); + AssertFieldEqual({"a"}, actual, *expected); +} + } // namespace json } // namespace arrow diff --git a/cpp/src/arrow/json/converter.cc b/cpp/src/arrow/json/converter.cc index b1b10ca8750..fe9500d40ca 100644 --- a/cpp/src/arrow/json/converter.cc +++ b/cpp/src/arrow/json/converter.cc @@ -48,17 +48,17 @@ namespace { const DictionaryArray& GetDictionaryArray(const std::shared_ptr& in) { DCHECK_EQ(in->type_id(), Type::DICTIONARY); - auto dict_type = static_cast(in->type().get()); + auto dict_type = checked_cast(in->type().get()); DCHECK_EQ(dict_type->index_type()->id(), Type::INT32); DCHECK_EQ(dict_type->value_type()->id(), Type::STRING); - return static_cast(*in); + return checked_cast(*in); } template Status VisitDictionaryEntries(const DictionaryArray& dict_array, ValidVisitor&& visit_valid, 
NullVisitor&& visit_null) { - const StringArray& dict = static_cast(*dict_array.dictionary()); - const Int32Array& indices = static_cast(*dict_array.indices()); + const StringArray& dict = checked_cast(*dict_array.dictionary()); + const Int32Array& indices = checked_cast(*dict_array.indices()); for (int64_t i = 0; i < indices.length(); ++i) { if (indices.IsValid(i)) { RETURN_NOT_OK(visit_valid(dict.GetView(indices.GetView(i)))); @@ -281,8 +281,8 @@ const PromotionGraph* GetPromotionGraph() { return timestamp(TimeUnit::SECOND); case Kind::kArray: { - auto type = static_cast(unexpected_field->type().get()); - auto value_field = type->value_field(); + const auto& type = checked_cast(*unexpected_field->type()); + auto value_field = type.value_field(); return list(value_field->WithType(Infer(value_field))); } case Kind::kObject: { diff --git a/cpp/src/arrow/json/parser.cc b/cpp/src/arrow/json/parser.cc index a53742aa801..05f155645a6 100644 --- a/cpp/src/arrow/json/parser.cc +++ b/cpp/src/arrow/json/parser.cc @@ -33,6 +33,7 @@ #include "arrow/buffer_builder.h" #include "arrow/type.h" #include "arrow/util/bitset_stack.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" #include "arrow/util/make_unique.h" #include "arrow/util/string_view.h" @@ -431,7 +432,7 @@ class RawBuilderSet { case Kind::kArray: { RETURN_NOT_OK(MakeBuilder(leading_nulls, builder)); - const auto& list_type = static_cast(t); + const auto& list_type = checked_cast(t); BuilderPtr value_builder; RETURN_NOT_OK(MakeBuilder(*list_type.value_type(), 0, &value_builder)); @@ -442,7 +443,7 @@ class RawBuilderSet { } case Kind::kObject: { RETURN_NOT_OK(MakeBuilder(leading_nulls, builder)); - const auto& struct_type = static_cast(t); + const auto& struct_type = checked_cast(t); for (const auto& f : struct_type.fields()) { BuilderPtr field_builder; diff --git a/cpp/src/arrow/json/parser_test.cc b/cpp/src/arrow/json/parser_test.cc index 35340187f07..d9861b385c6 100644 --- 
a/cpp/src/arrow/json/parser_test.cc +++ b/cpp/src/arrow/json/parser_test.cc @@ -28,9 +28,13 @@ #include "arrow/json/test_common.h" #include "arrow/status.h" #include "arrow/testing/gtest_util.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/string_view.h" namespace arrow { + +using internal::checked_cast; + namespace json { using util::string_view; @@ -46,7 +50,7 @@ void AssertUnconvertedArraysEqual(const Array& expected, const Array& actual) { case Type::DICTIONARY: { ASSERT_EQ(expected.type_id(), Type::STRING); std::shared_ptr actual_decoded; - ASSERT_OK(DecodeStringDictionary(static_cast(actual), + ASSERT_OK(DecodeStringDictionary(checked_cast(actual), &actual_decoded)); return AssertArraysEqual(expected, *actual_decoded); } @@ -59,14 +63,15 @@ void AssertUnconvertedArraysEqual(const Array& expected, const Array& actual) { const auto& expected_offsets = expected.data()->buffers[1]; const auto& actual_offsets = actual.data()->buffers[1]; AssertBufferEqual(*expected_offsets, *actual_offsets); - auto expected_values = static_cast(expected).values(); - auto actual_values = static_cast(actual).values(); + auto expected_values = checked_cast(expected).values(); + auto actual_values = checked_cast(actual).values(); return AssertUnconvertedArraysEqual(*expected_values, *actual_values); } case Type::STRUCT: ASSERT_EQ(expected.type_id(), Type::STRUCT); - return AssertUnconvertedStructArraysEqual(static_cast(expected), - static_cast(actual)); + return AssertUnconvertedStructArraysEqual( + checked_cast(expected), + checked_cast(actual)); default: FAIL(); } diff --git a/cpp/src/arrow/json/reader.cc b/cpp/src/arrow/json/reader.cc index ed57a134e93..51c77fa4df9 100644 --- a/cpp/src/arrow/json/reader.cc +++ b/cpp/src/arrow/json/reader.cc @@ -30,6 +30,7 @@ #include "arrow/record_batch.h" #include "arrow/table.h" #include "arrow/util/async_generator.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/iterator.h" #include "arrow/util/logging.h" #include 
"arrow/util/string_view.h" @@ -40,6 +41,7 @@ namespace arrow { using util::string_view; +using internal::checked_cast; using internal::GetCpuThreadPool; using internal::TaskGroup; using internal::ThreadPool; @@ -211,13 +213,13 @@ Result> ParseOne(ParseOptions options, builder->Insert(0, field("", type), parsed); std::shared_ptr converted_chunked; RETURN_NOT_OK(builder->Finish(&converted_chunked)); - auto converted = static_cast(converted_chunked->chunk(0).get()); + const auto& converted = checked_cast(*converted_chunked->chunk(0)); - std::vector> columns(converted->num_fields()); - for (int i = 0; i < converted->num_fields(); ++i) { - columns[i] = converted->field(i); + std::vector> columns(converted.num_fields()); + for (int i = 0; i < converted.num_fields(); ++i) { + columns[i] = converted.field(i); } - return RecordBatch::Make(schema(converted->type()->fields()), converted->length(), + return RecordBatch::Make(schema(converted.type()->fields()), converted.length(), std::move(columns)); } diff --git a/cpp/src/arrow/json/reader_test.cc b/cpp/src/arrow/json/reader_test.cc index c19c0bc7da2..976343b5211 100644 --- a/cpp/src/arrow/json/reader_test.cc +++ b/cpp/src/arrow/json/reader_test.cc @@ -203,16 +203,6 @@ TEST_P(ReaderTest, MultipleChunks) { AssertTablesEqual(*expected_table, *table_); } -template -std::string RowsOfOneColumn(string_view name, std::initializer_list values, - decltype(std::to_string(*values.begin()))* = nullptr) { - std::stringstream ss; - for (auto value : values) { - ss << R"({")" << name << R"(":)" << std::to_string(value) << "}\n"; - } - return ss.str(); -} - TEST(ReaderTest, MultipleChunksParallel) { int64_t count = 1 << 10; diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index 125a6e27ed8..618b16ae424 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -19,7 +19,9 @@ #include #include +#include #include +#include #include #include "arrow/json/rapidjson_defs.h" @@ -34,10 
+36,14 @@ #include "arrow/json/parser.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/string_view.h" #include "arrow/visitor_inline.h" namespace arrow { + +using internal::checked_cast; + namespace json { namespace rj = arrow::rapidjson; @@ -161,8 +167,8 @@ inline static Status MakeStream(string_view src_str, // dictionary. This can be decoded for ease of comparison inline static Status DecodeStringDictionary(const DictionaryArray& dict_array, std::shared_ptr* decoded) { - const StringArray& dict = static_cast(*dict_array.dictionary()); - const Int32Array& indices = static_cast(*dict_array.indices()); + const StringArray& dict = checked_cast(*dict_array.dictionary()); + const Int32Array& indices = checked_cast(*dict_array.indices()); StringBuilder builder; RETURN_NOT_OK(builder.Resize(indices.length())); for (int64_t i = 0; i < indices.length(); ++i) { @@ -197,6 +203,25 @@ static inline std::string PrettyPrint(string_view one_line) { return sb.GetString(); } +template +std::string RowsOfOneColumn(util::string_view name, std::initializer_list values, + decltype(std::to_string(*values.begin()))* = nullptr) { + std::stringstream ss; + for (auto value : values) { + ss << R"({")" << name << R"(":)" << std::to_string(value) << "}\n"; + } + return ss.str(); +} + +inline std::string RowsOfOneColumn(util::string_view name, + std::initializer_list values) { + std::stringstream ss; + for (auto value : values) { + ss << R"({")" << name << R"(":)" << value << "}\n"; + } + return ss.str(); +} + inline static std::string scalars_only_src() { return R"( { "hello": 3.5, "world": false, "yo": "thing" } diff --git a/python/pyarrow/tests/test_json.py b/python/pyarrow/tests/test_json.py index 3fef4f24479..6ce584e5105 100644 --- a/python/pyarrow/tests/test_json.py +++ b/python/pyarrow/tests/test_json.py @@ -208,6 +208,25 @@ def test_empty_rows(self): assert table.num_columns == 0 assert table.num_rows == 2 + def 
test_reconcile_accross_blocks(self): + # ARROW-12065: reconciling inferred types across blocks + first_row = b'{ }\n' + read_options = ReadOptions(block_size=len(first_row)) + for next_rows, expected_pylist in [ + (b'{"a": 0}', [None, 0]), + (b'{"a": []}', [None, []]), + (b'{"a": []}\n{"a": [[1]]}', [None, [], [[1]]]), + (b'{"a": {}}', [None, {}]), + (b'{"a": {}}\n{"a": {"b": {"c": 1}}}', + [None, {"b": None}, {"b": {"c": 1}}]), + ]: + table = self.read_bytes(first_row + next_rows, + read_options=read_options) + expected = {"a": expected_pylist} + assert table.to_pydict() == expected + # Check that the issue was exercised + assert table.column("a").num_chunks > 1 + def test_explicit_schema_with_unexpected_behaviour(self): # infer by default rows = (b'{"foo": "bar", "num": 0}\n'