diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc
index e691a21f416..d4e379a93b4 100644
--- a/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -126,13 +126,6 @@ class ArrowInputFile : public liborc::InputStream {
   std::shared_ptr<io::RandomAccessFile> file_;
 };
 
-struct StripeInformation {
-  uint64_t offset;
-  uint64_t length;
-  uint64_t num_rows;
-  uint64_t first_row_of_stripe;
-};
-
 // The number of rows to read in a ColumnVectorBatch
 constexpr int64_t kReadRowsBatch = 1000;
 
@@ -206,8 +199,10 @@ class ORCFileReader::Impl {
     uint64_t first_row_of_stripe = 0;
     for (int i = 0; i < nstripes; ++i) {
       stripe = reader_->getStripe(i);
-      stripes_[i] = StripeInformation({stripe->getOffset(), stripe->getLength(),
-                                       stripe->getNumberOfRows(), first_row_of_stripe});
+      stripes_[i] = StripeInformation({static_cast<int64_t>(stripe->getOffset()),
+                                       static_cast<int64_t>(stripe->getLength()),
+                                       static_cast<int64_t>(stripe->getNumberOfRows()),
+                                       static_cast<int64_t>(first_row_of_stripe)});
       first_row_of_stripe += stripe->getNumberOfRows();
     }
     return Status::OK();
@@ -217,6 +212,8 @@ class ORCFileReader::Impl {
 
   int64_t NumberOfRows() { return static_cast<int64_t>(reader_->getNumberOfRows()); }
 
+  StripeInformation GetStripeInformation(int64_t stripe) { return stripes_[stripe]; }
+
   FileVersion GetFileVersion() {
     liborc::FileVersion orc_file_version = reader_->getFormatVersion();
     return FileVersion(orc_file_version.getMajor(), orc_file_version.getMinor());
@@ -383,7 +380,8 @@ class ORCFileReader::Impl {
     ARROW_RETURN_IF(stripe < 0 || stripe >= NumberOfStripes(),
                     Status::Invalid("Out of bounds stripe: ", stripe));
 
-    opts->range(stripes_[stripe].offset, stripes_[stripe].length);
+    opts->range(static_cast<uint64_t>(stripes_[stripe].offset),
+                static_cast<uint64_t>(stripes_[stripe].length));
     return Status::OK();
   }
 
@@ -393,9 +391,9 @@ class ORCFileReader::Impl {
                     Status::Invalid("Out of bounds row number: ", row_number));
 
     for (auto it = stripes_.begin(); it != stripes_.end(); it++) {
-      if (static_cast<uint64_t>(row_number) >= it->first_row_of_stripe &&
-          static_cast<uint64_t>(row_number) < it->first_row_of_stripe + it->num_rows) {
-        opts->range(it->offset, it->length);
+      if (row_number >= it->first_row_id &&
+          row_number < it->first_row_id + it->num_rows) {
+        opts->range(static_cast<uint64_t>(it->offset), static_cast<uint64_t>(it->length));
         *out = *it;
         return Status::OK();
       }
@@ -427,7 +425,8 @@ class ORCFileReader::Impl {
     liborc::RowReaderOptions opts(row_opts);
     std::vector<std::shared_ptr<RecordBatch>> batches(stripes_.size());
     for (size_t stripe = 0; stripe < stripes_.size(); stripe++) {
-      opts.range(stripes_[stripe].offset, stripes_[stripe].length);
+      opts.range(static_cast<uint64_t>(stripes_[stripe].offset),
+                 static_cast<uint64_t>(stripes_[stripe].length));
       ARROW_ASSIGN_OR_RAISE(batches[stripe],
                             ReadBatch(opts, schema, stripes_[stripe].num_rows));
     }
@@ -488,7 +487,7 @@ class ORCFileReader::Impl {
     ORC_BEGIN_CATCH_NOT_OK
     row_reader = reader_->createRowReader(opts);
     row_reader->seekToRow(current_row_);
-    current_row_ = stripe_info.first_row_of_stripe + stripe_info.num_rows;
+    current_row_ = stripe_info.first_row_id + stripe_info.num_rows;
     ORC_END_CATCH_NOT_OK
 
     return std::make_shared<OrcStripeReader>(std::move(row_reader), schema, batch_size,
@@ -600,6 +599,10 @@ int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); }
 
 int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); }
 
+StripeInformation ORCFileReader::GetStripeInformation(int64_t stripe) {
+  return impl_->GetStripeInformation(stripe);
+}
+
 FileVersion ORCFileReader::GetFileVersion() { return impl_->GetFileVersion(); }
 
 std::string ORCFileReader::GetSoftwareVersion() { return impl_->GetSoftwareVersion(); }
diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h
index af7e2ff77cf..013be78600a 100644
--- a/cpp/src/arrow/adapters/orc/adapter.h
+++ b/cpp/src/arrow/adapters/orc/adapter.h
@@ -35,6 +35,18 @@ namespace arrow {
 namespace adapters {
 namespace orc {
 
+/// \brief Information about an ORC stripe
+struct StripeInformation {
+  /// \brief Offset of the stripe from the start of the file, in bytes
+  int64_t offset;
+  /// \brief Length of the stripe, in bytes
+  int64_t length;
+  /// \brief Number of rows in the stripe
+  int64_t num_rows;
+  /// \brief Zero-based index, within the file, of the first row of the stripe
+  int64_t first_row_id;
+};
+
 /// \class ORCFileReader
 /// \brief Read an Arrow Table or RecordBatch from an ORC file.
 class ARROW_EXPORT ORCFileReader {
@@ -168,6 +180,13 @@ class ARROW_EXPORT ORCFileReader {
   /// \brief The number of rows in the file
   int64_t NumberOfRows();
 
+  /// \brief Get the StripeInformation of a single stripe
+  ///
+  /// \param[in] stripe the stripe index. It is not bounds-checked, so it must
+  /// be in the range [0, NumberOfStripes()).
+  /// \return the offset, length, row count and first row index of the stripe
+  StripeInformation GetStripeInformation(int64_t stripe);
+
   /// \brief Get the format version of the file.
   /// Currently known values are 0.11 and 0.12.
   ///
diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc
index 320c71992fc..c119e5cbeb8 100644
--- a/cpp/src/arrow/adapters/orc/adapter_test.cc
+++ b/cpp/src/arrow/adapters/orc/adapter_test.cc
@@ -392,6 +392,10 @@ TEST(TestAdapterRead, ReadIntAndStringFileMultipleStripes) {
   ASSERT_TRUE(metadata->Equals(*expected_metadata));
   ASSERT_EQ(stripe_row_count * stripe_count, reader->NumberOfRows());
   ASSERT_EQ(stripe_count, reader->NumberOfStripes());
+  ASSERT_EQ(static_cast<int64_t>(stripe_row_count),
+            reader->GetStripeInformation(0).num_rows);
+  ASSERT_EQ(static_cast<int64_t>(reader->NumberOfRows() - stripe_row_count),
+            reader->GetStripeInformation(stripe_count - 1).first_row_id);
   accumulated = 0;
   EXPECT_OK_AND_ASSIGN(auto stripe_reader, reader->NextStripeReader(reader_batch_size));
   while (stripe_reader) {