Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 18 additions & 15 deletions cpp/src/arrow/adapters/orc/adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -126,13 +126,6 @@ class ArrowInputFile : public liborc::InputStream {
std::shared_ptr<io::RandomAccessFile> file_;
};

struct StripeInformation {
uint64_t offset;
uint64_t length;
uint64_t num_rows;
uint64_t first_row_of_stripe;
};

// The number of rows to read in a ColumnVectorBatch
constexpr int64_t kReadRowsBatch = 1000;

Expand Down Expand Up @@ -206,8 +199,10 @@ class ORCFileReader::Impl {
uint64_t first_row_of_stripe = 0;
for (int i = 0; i < nstripes; ++i) {
stripe = reader_->getStripe(i);
stripes_[i] = StripeInformation({stripe->getOffset(), stripe->getLength(),
stripe->getNumberOfRows(), first_row_of_stripe});
stripes_[i] = StripeInformation({static_cast<int64_t>(stripe->getOffset()),
static_cast<int64_t>(stripe->getLength()),
static_cast<int64_t>(stripe->getNumberOfRows()),
static_cast<int64_t>(first_row_of_stripe)});
first_row_of_stripe += stripe->getNumberOfRows();
}
return Status::OK();
Expand All @@ -217,6 +212,8 @@ class ORCFileReader::Impl {

// Total row count reported by the underlying ORC reader, converted to the
// signed 64-bit type used by this class's interface.
int64_t NumberOfRows() {
  const auto row_count = reader_->getNumberOfRows();
  return static_cast<int64_t>(row_count);
}

// Returns a copy of the cached stripes_ entry at index `stripe`.
// NOTE(review): the index is used as given; unlike the stripe-selection code
// that rejects values outside [0, NumberOfStripes()), no range check is done
// here, so callers must pass a valid stripe index.
StripeInformation GetStripeInformation(int64_t stripe) {
  const StripeInformation& entry = stripes_[stripe];
  return entry;
}

FileVersion GetFileVersion() {
liborc::FileVersion orc_file_version = reader_->getFormatVersion();
return FileVersion(orc_file_version.getMajor(), orc_file_version.getMinor());
Expand Down Expand Up @@ -383,7 +380,8 @@ class ORCFileReader::Impl {
ARROW_RETURN_IF(stripe < 0 || stripe >= NumberOfStripes(),
Status::Invalid("Out of bounds stripe: ", stripe));

opts->range(stripes_[stripe].offset, stripes_[stripe].length);
opts->range(static_cast<uint64_t>(stripes_[stripe].offset),
static_cast<uint64_t>(stripes_[stripe].length));
return Status::OK();
}

Expand All @@ -393,9 +391,9 @@ class ORCFileReader::Impl {
Status::Invalid("Out of bounds row number: ", row_number));

for (auto it = stripes_.begin(); it != stripes_.end(); it++) {
if (static_cast<uint64_t>(row_number) >= it->first_row_of_stripe &&
static_cast<uint64_t>(row_number) < it->first_row_of_stripe + it->num_rows) {
opts->range(it->offset, it->length);
if (row_number >= it->first_row_id &&
row_number < it->first_row_id + it->num_rows) {
opts->range(static_cast<uint64_t>(it->offset), static_cast<uint64_t>(it->length));
*out = *it;
return Status::OK();
}
Expand Down Expand Up @@ -427,7 +425,8 @@ class ORCFileReader::Impl {
liborc::RowReaderOptions opts(row_opts);
std::vector<std::shared_ptr<RecordBatch>> batches(stripes_.size());
for (size_t stripe = 0; stripe < stripes_.size(); stripe++) {
opts.range(stripes_[stripe].offset, stripes_[stripe].length);
opts.range(static_cast<uint64_t>(stripes_[stripe].offset),
static_cast<uint64_t>(stripes_[stripe].length));
ARROW_ASSIGN_OR_RAISE(batches[stripe],
ReadBatch(opts, schema, stripes_[stripe].num_rows));
}
Expand Down Expand Up @@ -488,7 +487,7 @@ class ORCFileReader::Impl {
ORC_BEGIN_CATCH_NOT_OK
row_reader = reader_->createRowReader(opts);
row_reader->seekToRow(current_row_);
current_row_ = stripe_info.first_row_of_stripe + stripe_info.num_rows;
current_row_ = stripe_info.first_row_id + stripe_info.num_rows;
ORC_END_CATCH_NOT_OK

return std::make_shared<OrcStripeReader>(std::move(row_reader), schema, batch_size,
Expand Down Expand Up @@ -600,6 +599,10 @@ int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); }

// Public accessor: forwards to the implementation object.
int64_t ORCFileReader::NumberOfRows() {
  const int64_t row_count = impl_->NumberOfRows();
  return row_count;
}

// Public accessor: forwards the stripe index to the implementation object and
// returns the StripeInformation it reports, by value.
StripeInformation ORCFileReader::GetStripeInformation(int64_t stripe) {
  StripeInformation info = impl_->GetStripeInformation(stripe);
  return info;
}

// Public accessor: forwards to the implementation object.
FileVersion ORCFileReader::GetFileVersion() {
  FileVersion version = impl_->GetFileVersion();
  return version;
}

// Public accessor: forwards to the implementation object.
std::string ORCFileReader::GetSoftwareVersion() {
  std::string software_version = impl_->GetSoftwareVersion();
  return software_version;
}
Expand Down
15 changes: 15 additions & 0 deletions cpp/src/arrow/adapters/orc/adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,18 @@ namespace arrow {
namespace adapters {
namespace orc {

/// \brief Information about an ORC stripe
///
/// Plain value type returned by ORCFileReader::GetStripeInformation. All
/// fields are signed 64-bit integers; the reader fills them by converting the
/// unsigned values reported by the ORC library.
struct StripeInformation {
/// \brief Offset of the stripe from the start of the file, in bytes
int64_t offset;
/// \brief Length of the stripe, in bytes
int64_t length;
/// \brief Number of rows in the stripe
int64_t num_rows;
/// \brief Index of the first row of the stripe within the file, i.e. the
/// total number of rows contained in all preceding stripes
int64_t first_row_id;
};

/// \class ORCFileReader
/// \brief Read an Arrow Table or RecordBatch from an ORC file.
class ARROW_EXPORT ORCFileReader {
Expand Down Expand Up @@ -168,6 +180,9 @@ class ARROW_EXPORT ORCFileReader {
/// \brief The number of rows in the file
int64_t NumberOfRows();

/// \brief Get the StripeInformation of a single stripe
///
/// \param[in] stripe the index of the stripe to describe
/// \return the offset, length, row count and first row index of that stripe
///
/// \note The implementation indexes its stripe list directly with `stripe`
/// and performs no range check, so the index must be valid for this file.
StripeInformation GetStripeInformation(int64_t stripe);

/// \brief Get the format version of the file.
/// Currently known values are 0.11 and 0.12.
///
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/arrow/adapters/orc/adapter_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,10 @@ TEST(TestAdapterRead, ReadIntAndStringFileMultipleStripes) {
ASSERT_TRUE(metadata->Equals(*expected_metadata));
ASSERT_EQ(stripe_row_count * stripe_count, reader->NumberOfRows());
ASSERT_EQ(stripe_count, reader->NumberOfStripes());
ASSERT_EQ(static_cast<int64_t>(stripe_row_count),
reader->GetStripeInformation(0).num_rows);
ASSERT_EQ(static_cast<int64_t>(reader->NumberOfRows() - stripe_row_count),
reader->GetStripeInformation(stripe_count - 1).first_row_id);
accumulated = 0;
EXPECT_OK_AND_ASSIGN(auto stripe_reader, reader->NextStripeReader(reader_batch_size));
while (stripe_reader) {
Expand Down