Skip to content
Merged
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
e2b927e
Refactor variable names to make it clear that Skip works on values and
fatemehp Aug 29, 2022
ca4eeec
Refactor variable names to make it clear that Skip works on values and
fatemehp Aug 29, 2022
830c013
Merge branch 'skip' of github.com:fatemehp/arrow into skip
fatemehp Aug 29, 2022
37b29d6
Merge branch 'skip' of github.com:fatemehp/arrow into skip
fatemehp Aug 29, 2022
9e44944
Merge branch 'skip' of github.com:fatemehp/arrow into skip
fatemehp Aug 29, 2022
f533bdc
Merge branch 'skip' of github.com:fatemehp/arrow into skip
fatemehp Aug 29, 2022
9be24ca
Merge branch 'skip' of github.com:fatemehp/arrow into skip
fatemehp Aug 29, 2022
e77908a
Clarify comments
fatemehp Aug 29, 2022
2222cda
Merge branch 'skip' of github.com:fatemehp/arrow into skip
fatemehp Aug 29, 2022
2b50af4
Refactor variable names inside the Skip implementation
fatemehp Aug 29, 2022
442fddc
Merge branch 'skip' of github.com:fatemehp/arrow into skip
fatemehp Aug 29, 2022
03ee3c0
Fix lint issue
fatemehp Aug 29, 2022
b500271
Clarify the comment for the Skip method.
fatemehp Aug 30, 2022
e0ff7fa
Fix another lint issue.
fatemehp Aug 30, 2022
ad749a7
Add a Skip test for repeated field making it clear that it is skipping
fatemehp Aug 30, 2022
174a8b0
Address comments
fatemehp Sep 2, 2022
ff4ad3d
Resolve conflict
fatemehp Oct 17, 2022
0b742bd
Merge branch 'apache:master' into master
fatemehp Oct 17, 2022
88e6407
Merge branch 'master' into fatemehp-master
pitrou Oct 18, 2022
99692d2
Lint / fix downcast / improve test a bit
pitrou Oct 18, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 64 additions & 5 deletions cpp/src/parquet/column_reader_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,8 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRepeated) {
ASSERT_NO_FATAL_FAILURE(ExecuteDict(num_pages, levels_per_page, &descr));
}

TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
// Tests skipping around page boundaries.
TEST_F(TestPrimitiveReader, TestSkipAroundPageBoundries) {
int levels_per_page = 100;
int num_pages = 5;
max_def_level_ = 0;
Expand Down Expand Up @@ -289,10 +290,10 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
values_.begin() + static_cast<int>(2.5 * static_cast<double>(levels_per_page)));
ASSERT_TRUE(vector_equal(sub_values, vresult));

// 2) skip_size == page_size (skip across two pages)
// 2) skip_size == page_size (skip across two pages from page 2.5 to 3.5)
levels_skipped = reader->Skip(levels_per_page);
ASSERT_EQ(levels_per_page, levels_skipped);
// Read half a page
// Read half a page (page 3.5 to 4)
reader->ReadBatch(levels_per_page / 2, dresult.data(), rresult.data(), vresult.data(),
&values_read);
sub_values.clear();
Expand All @@ -303,10 +304,10 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
ASSERT_TRUE(vector_equal(sub_values, vresult));

// 3) skip_size < page_size (skip limited to a single page)
// Skip half a page
// Skip half a page (page 4 to 4.5)
levels_skipped = reader->Skip(levels_per_page / 2);
ASSERT_EQ(0.5 * levels_per_page, levels_skipped);
// Read half a page
// Read half a page (page 4.5 to 5)
reader->ReadBatch(levels_per_page / 2, dresult.data(), rresult.data(), vresult.data(),
&values_read);
sub_values.clear();
Expand All @@ -316,13 +317,71 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
values_.end());
ASSERT_TRUE(vector_equal(sub_values, vresult));

// 4) skip_size = 0
levels_skipped = reader->Skip(0);
ASSERT_EQ(0, levels_skipped);

// 5) Skip past the end page. There are 5 pages and we have either skipped
// or read all of them, so there is nothing left to skip.
levels_skipped = reader->Skip(10);
ASSERT_EQ(0, levels_skipped);

values_.clear();
def_levels_.clear();
rep_levels_.clear();
pages_.clear();
reader_.reset();
}

// Skip with repeated field. This test makes it clear that we are skipping
// values and not records.
TEST_F(TestPrimitiveReader, TestSkipRepeatedField) {
// Example schema: message M { repeated int32 b = 1 }
max_def_level_ = 1;
max_rep_level_ = 1;
NodePtr type = schema::Int32("b", Repetition::REPEATED);
const ColumnDescriptor descr(type, max_def_level_, max_rep_level_);
// Example rows: {}, {[10, 10]}, {[20, 20, 20]}
std::vector<int32_t> values = {10, 10, 20, 20, 20};
std::vector<int16_t> def_levels = {0, 1, 1, 1, 1, 1};
std::vector<int16_t> rep_levels = {0, 0, 1, 0, 1, 1};
num_values_ = static_cast<int>(def_levels.size());
std::shared_ptr<DataPageV1> page = MakeDataPage<Int32Type>(
&descr, values, num_values_, Encoding::PLAIN, /*indices=*/{},
/*indices_size=*/0, def_levels, max_def_level_, rep_levels, max_rep_level_);

pages_.push_back(std::move(page));

InitReader(&descr);
Int32Reader* reader = static_cast<Int32Reader*>(reader_.get());

// Vecotrs to hold read values, definition levels, and repetition levels.
std::vector<int32_t> read_vals(4, -1);
std::vector<int16_t> read_defs(4, -1);
std::vector<int16_t> read_reps(4, -1);

// Skip two levels.
int64_t levels_skipped = reader->Skip(2);
ASSERT_EQ(2, levels_skipped);

int64_t num_read_values = 0;
// Read the next set of values
reader->ReadBatch(10, read_defs.data(), read_reps.data(), read_vals.data(),
&num_read_values);
ASSERT_EQ(num_read_values, 4);
// Note that we end up in the record with {[10, 10]}
ASSERT_TRUE(vector_equal({10, 20, 20, 20}, read_vals));
ASSERT_TRUE(vector_equal({1, 1, 1, 1}, read_defs));
ASSERT_TRUE(vector_equal({1, 0, 1, 1}, read_reps));

// No values remain in data page
levels_skipped = reader->Skip(2);
ASSERT_EQ(0, levels_skipped);
reader->ReadBatch(10, read_defs.data(), read_reps.data(), read_vals.data(),
&num_read_values);
ASSERT_EQ(num_read_values, 0);
}

// Page claims to have two values but only 1 is present.
TEST_F(TestPrimitiveReader, TestReadValuesMissing) {
max_def_level_ = 1;
Expand Down