diff --git a/cpp/src/arrow/json/chunker.cc b/cpp/src/arrow/json/chunker.cc index bba3491da8d..1ce3dcc171e 100644 --- a/cpp/src/arrow/json/chunker.cc +++ b/cpp/src/arrow/json/chunker.cc @@ -125,13 +125,12 @@ namespace { class ParsingBoundaryFinder : public BoundaryFinder { public: Status FindFirst(string_view partial, string_view block, int64_t* out_pos) override { - // NOTE: We could bubble up JSON parse errors here, but the actual parsing - // step will detect them later anyway. auto length = ConsumeWholeObject(MultiStringStream({partial, block})); if (length == string_view::npos) { *out_pos = -1; + } else if (ARROW_PREDICT_FALSE(length < partial.size())) { + return Status::Invalid("JSON chunk error: invalid data at end of document"); } else { - DCHECK_GE(length, partial.size()); DCHECK_LE(length, partial.size() + block.size()); *out_pos = static_cast(length - partial.size()); } diff --git a/cpp/src/arrow/json/chunker_test.cc b/cpp/src/arrow/json/chunker_test.cc index ed1328fa601..1c26d52b140 100644 --- a/cpp/src/arrow/json/chunker_test.cc +++ b/cpp/src/arrow/json/chunker_test.cc @@ -22,6 +22,7 @@ #include #include +#include #include #include "arrow/buffer.h" @@ -261,6 +262,20 @@ TEST(ChunkerTest, StraddlingSingleLine) { AssertStraddledChunking(*chunker, join(lines(), "")); } +TEST(ChunkerTest, Errors) { + std::string parts[] = {R"({"a":0})", "}", R"({"a":1})"}; + auto chunker = MakeChunker(true); + std::shared_ptr whole, rest, completion; + ASSERT_OK(chunker->Process(Buffer::FromString(parts[0] + parts[1]), &whole, &rest)); + ASSERT_EQ(std::string_view(*whole), parts[0]); + ASSERT_EQ(std::string_view(*rest), parts[1]); + auto status = + chunker->ProcessWithPartial(rest, Buffer::FromString(parts[2]), &completion, &rest); + ASSERT_RAISES(Invalid, status); + EXPECT_THAT(status.message(), + ::testing::StartsWith("JSON chunk error: invalid data at end of document")); +} + TEST_P(BaseChunkerTest, StraddlingEmpty) { auto all = join(lines(), "\n"); diff --git a/cpp/src/arrow/json/parser.cc b/cpp/src/arrow/json/parser.cc index 31a707520c6..e2941a29ab9 100644 --- a/cpp/src/arrow/json/parser.cc +++ b/cpp/src/arrow/json/parser.cc @@ -762,7 +762,7 @@ class HandlerBase : public BlockParser, protected: template - Status DoParse(Handler& handler, Stream&& json) { + Status DoParse(Handler& handler, Stream&& json, size_t json_size) { constexpr auto parse_flags = rj::kParseIterativeFlag | rj::kParseNanAndInfFlag | rj::kParseStopWhenDoneFlag | rj::kParseNumbersAsStringsFlag; @@ -776,6 +776,9 @@ class HandlerBase : public BlockParser, // parse the next object continue; case rj::kParseErrorDocumentEmpty: + if (json.Tell() < json_size) { + return ParseError(rj::GetParseError_En(ok.Code())); + } // parsed all objects, finish return Status::OK(); case rj::kParseErrorTermination: @@ -794,7 +797,7 @@ class HandlerBase : public BlockParser, RETURN_NOT_OK(ReserveScalarStorage(json->size())); rj::MemoryStream ms(reinterpret_cast(json->data()), json->size()); using InputStream = rj::EncodedInputStream, rj::MemoryStream>; - return DoParse(handler, InputStream(ms)); + return DoParse(handler, InputStream(ms), static_cast(json->size())); } /// \defgroup handlerbase-append-methods append non-nested values diff --git a/cpp/src/arrow/json/parser_test.cc b/cpp/src/arrow/json/parser_test.cc index 53c456b4feb..681df4e6fa0 100644 --- a/cpp/src/arrow/json/parser_test.cc +++ b/cpp/src/arrow/json/parser_test.cc @@ -300,6 +300,14 @@ TEST(BlockParser, InferNewFieldsInMiddle) { } } +TEST(BlockParser, FailOnInvalidEOF) { + std::shared_ptr parsed; + auto status = ParseFromString(ParseOptions::Defaults(), "}", &parsed); + ASSERT_RAISES(Invalid, status); + EXPECT_THAT(status.message(), + ::testing::StartsWith("JSON parse error: The document is empty")); +} + TEST(BlockParser, AdHoc) { auto options = ParseOptions::Defaults(); options.unexpected_field_behavior = UnexpectedFieldBehavior::InferType; diff --git a/cpp/src/arrow/json/reader_test.cc b/cpp/src/arrow/json/reader_test.cc index 452409209c4..83f5956a64c 100644 --- a/cpp/src/arrow/json/reader_test.cc +++ b/cpp/src/arrow/json/reader_test.cc @@ -305,5 +305,20 @@ TEST(ReaderTest, ListArrayWithFewValues) { AssertTablesEqual(*actual_table, *expected_table); } +TEST(ReaderTest, FailOnInvalidEOF) { + auto read_options = ReadOptions::Defaults(); + auto parse_options = ParseOptions::Defaults(); + read_options.use_threads = false; + std::shared_ptr input; + ASSERT_OK(MakeStream("}", &input)); + + for (auto newlines_in_values : {false, true}) { + parse_options.newlines_in_values = newlines_in_values; + ASSERT_OK_AND_ASSIGN(auto reader, TableReader::Make(default_memory_pool(), input, + read_options, parse_options)); + ASSERT_RAISES(Invalid, reader->Read()); + } +} + } // namespace json } // namespace arrow