Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions cpp/src/arrow/util/bit_stream_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,9 +203,10 @@ class BitReader {
};

inline bool BitWriter::PutValue(uint64_t v, int num_bits) {
// TODO: revisit this limit if necessary (can be raised to 64 by fixing some edge cases)
DCHECK_LE(num_bits, 32);
DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits;
DCHECK_LE(num_bits, 64);
if (num_bits < 64) {
DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits;
}

if (ARROW_PREDICT_FALSE(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8))
return false;
Expand All @@ -220,7 +221,8 @@ inline bool BitWriter::PutValue(uint64_t v, int num_bits) {
buffered_values_ = 0;
byte_offset_ += 8;
bit_offset_ -= 64;
buffered_values_ = v >> (num_bits - bit_offset_);
buffered_values_ =
(num_bits - bit_offset_ == 64) ? 0 : (v >> (num_bits - bit_offset_));
}
DCHECK_LT(bit_offset_, 64);
return true;
Expand Down
34 changes: 34 additions & 0 deletions cpp/src/arrow/util/rle_encoding_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,40 @@ TEST(BitArray, TestMixed) {
}
}

// Write up to 'num_vals' values with width 'bit_width' and reads them back.
static void TestPutValue(int bit_width, uint64_t num_vals) {
// The max value representable in `bit_width` bits.
const uint64_t max = std::numeric_limits<uint64_t>::max() >> (64 - bit_width);
num_vals = std::min(num_vals, max);
int len = static_cast<int>(bit_util::BytesForBits(bit_width * num_vals));
EXPECT_GT(len, 0);

std::vector<uint8_t> buffer(len);
bit_util::BitWriter writer(buffer.data(), len);
for (uint64_t i = max - num_vals; i < max; i++) {
bool result = writer.PutValue(i, bit_width);
EXPECT_TRUE(result);
}
writer.Flush();
EXPECT_EQ(writer.bytes_written(), len);

bit_util::BitReader reader(buffer.data(), len);
for (uint64_t i = max - num_vals; i < max; i++) {
int64_t val = 0;
bool result = reader.GetValue(bit_width, &val);
EXPECT_TRUE(result);
EXPECT_EQ(val, i);
}
EXPECT_EQ(reader.bytes_left(), 0);
}

TEST(BitUtil, RoundTripIntValues) {
for (int width = 1; width < 64; width++) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally this should be:

Suggested change
for (int width = 1; width < 64; width++) {
for (int width = 1; width <= 64; width++) {

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reverted this because it causes an overflow to 0 and testing for bit_width == 0 doesn't make sense I think.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you mean? Testing that we are able to write 64-bit values makes sense.

Perhaps you want something like:

  // The max value representable in `bit_width` bits.
  const uint64_t max = std::numeric_limits<uint64_t>::max() >> (64 - bit_width);

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed to your proposal. It runs ok locally.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you mean? Testing that we are able to write 64-bit values makes sense.

What I meant was that 2 << 63 was overflowing, setting max and hence num_vals to 0. That then tripped the EXPECT_GT(len, 0); test. This is not an issue now I expect.

TestPutValue(width, 1);
TestPutValue(width, 1024);
}
}

// Validates encoding of values by encoding and decoding them. If
// expected_encoding != NULL, also validates that the encoded buffer is
// exactly 'expected_encoding'.
Expand Down
15 changes: 11 additions & 4 deletions cpp/src/parquet/column_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,8 @@ typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,

TYPED_TEST_SUITE(TestPrimitiveWriter, TestTypes);

using TestNullValuesWriter = TestPrimitiveWriter<Int32Type>;
using TestValuesWriterInt32Type = TestPrimitiveWriter<Int32Type>;
using TestValuesWriterInt64Type = TestPrimitiveWriter<Int64Type>;

TYPED_TEST(TestPrimitiveWriter, RequiredPlain) {
this->TestRequiredWithEncoding(Encoding::PLAIN);
Expand All @@ -418,23 +419,29 @@ TYPED_TEST(TestPrimitiveWriter, RequiredRLE) {
TYPED_TEST(TestPrimitiveWriter, RequiredBitPacked) {
this->TestRequiredWithEncoding(Encoding::BIT_PACKED);
}
*/

TEST_F(TestValuesWriterInt32Type, RequiredDeltaBinaryPacked) {
this->TestRequiredWithEncoding(Encoding::DELTA_BINARY_PACKED);
}

TYPED_TEST(TestPrimitiveWriter, RequiredDeltaBinaryPacked) {
TEST_F(TestValuesWriterInt64Type, RequiredDeltaBinaryPacked) {
this->TestRequiredWithEncoding(Encoding::DELTA_BINARY_PACKED);
}
Comment on lines +424 to 430
Copy link
Member Author

@rok rok Dec 2, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add a roundtrip test in the arrow_reader_writer_test.cc file to make sure this is covered?

@wgtmac I don't see where (and if) arrow_reader_writer_test.cc tests encodings. I've enabled this test as this is probably what you're suggesting.

Copy link
Member Author

@rok rok Dec 2, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I misunderstood which line you were commenting due to GitHub interface. Still good to have these enabled.


/*
TYPED_TEST(TestPrimitiveWriter, RequiredDeltaLengthByteArray) {
this->TestRequiredWithEncoding(Encoding::DELTA_LENGTH_BYTE_ARRAY);
}

TYPED_TEST(TestPrimitiveWriter, RequiredDeltaByteArray) {
this->TestRequiredWithEncoding(Encoding::DELTA_BYTE_ARRAY);
}
*/

TYPED_TEST(TestPrimitiveWriter, RequiredRLEDictionary) {
this->TestRequiredWithEncoding(Encoding::RLE_DICTIONARY);
}
*/

TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStats) {
this->TestRequiredWithSettings(Encoding::PLAIN, Compression::UNCOMPRESSED, false, true,
Expand Down Expand Up @@ -647,7 +654,7 @@ TEST(TestWriter, NullValuesBuffer) {

// PARQUET-719
// Test case for NULL values
TEST_F(TestNullValuesWriter, OptionalNullValueChunk) {
TEST_F(TestValuesWriterInt32Type, OptionalNullValueChunk) {
this->SetUpSchema(Repetition::OPTIONAL);

this->GenerateData(LARGE_SIZE);
Expand Down
Loading