diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 65018dee326..9a8fc7d9e30 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -210,7 +210,7 @@ void ArrayBuilder::UnsafeSetNotNull(int64_t length) { memset(null_bitmap_data_ + ((length_ + pad_to_byte) / 8), 0xFF, static_cast(fast_length)); - // Trailing bytes + // Trailing bits for (int64_t i = length_ + pad_to_byte + (fast_length * 8); i < new_length; ++i) { BitUtil::SetBit(null_bitmap_data_, i); } @@ -775,16 +775,9 @@ Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, const uint8_t* valid_bytes) { RETURN_NOT_OK(Reserve(length)); - internal::FirstTimeBitmapWriter bit_writer(raw_data_, length_, length); - for (int64_t i = 0; i < length; ++i) { - if (values[i] != 0) { - bit_writer.Set(); - } else { - bit_writer.Clear(); - } - bit_writer.Next(); - } - bit_writer.Finish(); + int64_t i = 0; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [values, &i]() -> bool { return values[i++] != 0; }); // this updates length_ ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); @@ -801,16 +794,9 @@ Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, RETURN_NOT_OK(Reserve(length)); DCHECK_EQ(length, static_cast(is_valid.size())); - internal::FirstTimeBitmapWriter bit_writer(raw_data_, length_, length); - for (int64_t i = 0; i < length; ++i) { - if (values[i]) { - bit_writer.Set(); - } else { - bit_writer.Clear(); - } - bit_writer.Next(); - } - bit_writer.Finish(); + int64_t i = 0; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [values, &i]() -> bool { return values[i++]; }); // this updates length_ ArrayBuilder::UnsafeAppendToBitmap(is_valid); @@ -846,16 +832,9 @@ Status BooleanBuilder::AppendValues(const std::vector& values, RETURN_NOT_OK(Reserve(length)); DCHECK_EQ(length, static_cast(is_valid.size())); - internal::FirstTimeBitmapWriter bit_writer(raw_data_, length_, length); - for (int64_t i = 0; i < length; ++i) { - if (values[i]) { - bit_writer.Set(); - } else { - bit_writer.Clear(); - } - bit_writer.Next(); - } - bit_writer.Finish(); + int64_t i = 0; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [values, &i]() -> bool { return values[i++]; }); // this updates length_ ArrayBuilder::UnsafeAppendToBitmap(is_valid); @@ -871,16 +850,9 @@ Status BooleanBuilder::AppendValues(const std::vector& values) { const int64_t length = static_cast(values.size()); RETURN_NOT_OK(Reserve(length)); - internal::FirstTimeBitmapWriter bit_writer(raw_data_, length_, length); - for (int64_t i = 0; i < length; ++i) { - if (values[i]) { - bit_writer.Set(); - } else { - bit_writer.Clear(); - } - bit_writer.Next(); - } - bit_writer.Finish(); + int64_t i = 0; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [values, &i]() -> bool { return values[i++]; }); // this updates length_ ArrayBuilder::UnsafeSetNotNull(length); diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 7987a1c95df..39925d78358 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -200,18 +200,9 @@ struct CastFunctor(input, 1); - internal::FirstTimeBitmapWriter writer(output->buffers[1]->mutable_data(), - output->offset, input.length); - - for (int64_t i = 0; i < input.length; ++i) { - if (*in_data++ != 0) { - writer.Set(); - } else { - writer.Clear(); - } - writer.Next(); - } - writer.Finish(); + const auto generate = [&in_data]() -> bool { return *in_data++ != 0; }; + internal::GenerateBitsUnrolled(output->buffers[1]->mutable_data(), output->offset, + input.length, generate); } }; diff --git a/cpp/src/arrow/util/bit-util-benchmark.cc b/cpp/src/arrow/util/bit-util-benchmark.cc index 43f461cc8c1..47f4bfe82fa 100644 --- a/cpp/src/arrow/util/bit-util-benchmark.cc +++ b/cpp/src/arrow/util/bit-util-benchmark.cc @@ -126,26 +126,48 @@ static void BenchmarkBitmapWriter(benchmark::State& state, int64_t nbytes) { const int64_t num_bits = nbytes * 8; uint8_t* bitmap = buffer->mutable_data(); + const bool pattern[] = {false, false, false, true, true, true}; while (state.KeepRunning()) { - { - BitmapWriterType writer(bitmap, 0, num_bits); - for (int64_t i = 0; i < num_bits; i++) { + int64_t pattern_index = 0; + BitmapWriterType writer(bitmap, 0, num_bits); + for (int64_t i = 0; i < num_bits; i++) { + if (pattern[pattern_index++]) { writer.Set(); - writer.Next(); - } - writer.Finish(); - benchmark::ClobberMemory(); - } - { - BitmapWriterType writer(bitmap, 0, num_bits); - for (int64_t i = 0; i < num_bits; i++) { + } else { writer.Clear(); - writer.Next(); } - writer.Finish(); - benchmark::ClobberMemory(); + if (pattern_index == sizeof(pattern) / sizeof(bool)) { + pattern_index = 0; + } + writer.Next(); } + writer.Finish(); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(int64_t(state.iterations()) * nbytes); +} + +template +static void BenchmarkGenerateBits(benchmark::State& state, int64_t nbytes) { + std::shared_ptr buffer = CreateRandomBuffer(nbytes); + + const int64_t num_bits = nbytes * 8; + uint8_t* bitmap = buffer->mutable_data(); + // pattern should be the same as in BenchmarkBitmapWriter + const bool pattern[] = {false, false, false, true, true, true}; + + while (state.KeepRunning()) { + int64_t pattern_index = 0; + const auto generate = [&]() -> bool { + bool b = pattern[pattern_index++]; + if (pattern_index == sizeof(pattern) / sizeof(bool)) { + pattern_index = 0; + } + return b; + }; + GenerateBitsFunctorType()(bitmap, 0, num_bits, generate); + benchmark::ClobberMemory(); } state.SetBytesProcessed(2 * int64_t(state.iterations()) * nbytes); } @@ -170,6 +192,28 @@ static void BM_FirstTimeBitmapWriter(benchmark::State& state) { BenchmarkBitmapWriter(state, state.range(0)); } +struct GenerateBitsFunctor { + template + void operator()(uint8_t* bitmap, int64_t start_offset, int64_t length, Generator&& g) { + return internal::GenerateBits(bitmap, start_offset, length, g); + } +}; + +struct GenerateBitsUnrolledFunctor { + template + void operator()(uint8_t* bitmap, int64_t start_offset, int64_t length, Generator&& g) { + return internal::GenerateBitsUnrolled(bitmap, start_offset, length, g); + } +}; + +static void BM_GenerateBits(benchmark::State& state) { + BenchmarkGenerateBits(state, state.range(0)); +} + +static void BM_GenerateBitsUnrolled(benchmark::State& state) { + BenchmarkGenerateBits(state, state.range(0)); +} + static void BM_CopyBitmap(benchmark::State& state) { // NOLINT non-const reference const int kBufferSize = state.range(0); std::shared_ptr buffer = CreateRandomBuffer(kBufferSize); @@ -201,13 +245,31 @@ BENCHMARK(BM_BitmapReader)->Args({100000})->MinTime(1.0)->Unit(benchmark::kMicro BENCHMARK(BM_NaiveBitmapWriter) ->Args({100000}) + ->Repetitions(2) ->MinTime(1.0) ->Unit(benchmark::kMicrosecond); -BENCHMARK(BM_BitmapWriter)->Args({100000})->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BitmapWriter) + ->Args({100000}) + ->Repetitions(2) + ->MinTime(1.0) + ->Unit(benchmark::kMicrosecond); BENCHMARK(BM_FirstTimeBitmapWriter) ->Args({100000}) + ->Repetitions(2) + ->MinTime(1.0) + ->Unit(benchmark::kMicrosecond); + +BENCHMARK(BM_GenerateBits) + ->Args({100000}) + ->Repetitions(2) + ->MinTime(1.0) + ->Unit(benchmark::kMicrosecond); + +BENCHMARK(BM_GenerateBitsUnrolled) + ->Args({100000}) + ->Repetitions(2) ->MinTime(1.0) ->Unit(benchmark::kMicrosecond); diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc index 5e0b6963cb2..ae8923d6684 100644 --- a/cpp/src/arrow/util/bit-util-test.cc +++ b/cpp/src/arrow/util/bit-util-test.cc @@ -282,6 +282,80 @@ TEST(FirstTimeBitmapWriter, NormalOperation) { } } +// Tests for GenerateBits and GenerateBitsUnrolled + +struct GenerateBitsFunctor { + template + void operator()(uint8_t* bitmap, int64_t start_offset, int64_t length, Generator&& g) { + return internal::GenerateBits(bitmap, start_offset, length, g); + } +}; + +struct GenerateBitsUnrolledFunctor { + template + void operator()(uint8_t* bitmap, int64_t start_offset, int64_t length, Generator&& g) { + return internal::GenerateBitsUnrolled(bitmap, start_offset, length, g); + } +}; + +template +class TestGenerateBits : public ::testing::Test {}; + +typedef ::testing::Types + GenerateBitsTypes; +TYPED_TEST_CASE(TestGenerateBits, GenerateBitsTypes); + +TYPED_TEST(TestGenerateBits, NormalOperation) { + const int kSourceSize = 256; + uint8_t source[kSourceSize]; + test::random_bytes(kSourceSize, 0, source); + + const int64_t start_offsets[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 21, 31, 32}; + const int64_t lengths[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 16, + 17, 21, 31, 32, 100, 201, 202, 203, 204, 205, 206, 207}; + const uint8_t fill_bytes[] = {0x00, 0xff}; + + for (const int64_t start_offset : start_offsets) { + for (const int64_t length : lengths) { + for (const uint8_t fill_byte : fill_bytes) { + uint8_t bitmap[kSourceSize]; + memset(bitmap, fill_byte, kSourceSize); + // First call GenerateBits + { + int64_t ncalled = 0; + internal::BitmapReader reader(source, 0, length); + TypeParam()(bitmap, start_offset, length, [&]() -> bool { + bool b = reader.IsSet(); + reader.Next(); + ++ncalled; + return b; + }); + ASSERT_EQ(ncalled, length); + } + // Then check generated contents + { + internal::BitmapReader source_reader(source, 0, length); + internal::BitmapReader result_reader(bitmap, start_offset, length); + for (int64_t i = 0; i < length; ++i) { + ASSERT_EQ(source_reader.IsSet(), result_reader.IsSet()) + << "mismatch at bit #" << i; + source_reader.Next(); + result_reader.Next(); + } + } + // Check bits preceding and following generated contents weren't clobbered + { + internal::BitmapReader reader_before(bitmap, 0, start_offset); + for (int64_t i = 0; i < start_offset; ++i) { + ASSERT_EQ(reader_before.IsSet(), fill_byte == 0xff) + << "mismatch at preceding bit #" << start_offset - i; + } + } + } + } + } +} + TEST(BitmapAnd, Aligned) { std::shared_ptr left, right, out; int64_t length; diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index c012f7c4126..6c06d9c0305 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -99,6 +99,9 @@ static constexpr uint8_t kFlippedBitmask[] = {254, 253, 251, 247, 239, 223, 191, // Bitmask selecting the (k - 1) preceding bits in a byte static constexpr uint8_t kPrecedingBitmask[] = {0, 1, 3, 7, 15, 31, 63, 127}; +// the bitwise complement version of kPrecedingBitmask +static constexpr uint8_t kTrailingBitmask[] = {255, 254, 252, 248, 240, 224, 192, 128}; + static inline int64_t CeilByte(int64_t size) { return (size + 7) & ~7; } static inline int64_t BytesForBits(int64_t size) { return CeilByte(size) / 8; } @@ -465,7 +468,7 @@ class BitmapWriter { BitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length) : bitmap_(bitmap), position_(0), length_(length) { byte_offset_ = start_offset / 8; - bit_mask_ = static_cast(1 << (start_offset % 8)); + bit_mask_ = BitUtil::kBitmask[start_offset % 8]; if (length > 0) { current_byte_ = bitmap[byte_offset_]; } else { @@ -519,7 +522,7 @@ class FirstTimeBitmapWriter { : bitmap_(bitmap), position_(0), length_(length) { current_byte_ = 0; byte_offset_ = start_offset / 8; - bit_mask_ = static_cast(1 << (start_offset % 8)); + bit_mask_ = BitUtil::kBitmask[start_offset % 8]; if (length > 0) { current_byte_ = bitmap[byte_offset_] & BitUtil::kPrecedingBitmask[start_offset % 8]; } else { @@ -561,7 +564,83 @@ class FirstTimeBitmapWriter { int64_t byte_offset_; }; -// TODO: add a std::generate-like function for writing bitmaps? +// A std::generate() like function to write sequential bits into a bitmap area. +// Bits preceding the bitmap area are preserved, bits following the bitmap +// area may be clobbered. + +template +void GenerateBits(uint8_t* bitmap, int64_t start_offset, int64_t length, Generator&& g) { + if (length == 0) { + return; + } + uint8_t* cur = bitmap + start_offset / 8; + uint8_t bit_mask = BitUtil::kBitmask[start_offset % 8]; + uint8_t current_byte = *cur & BitUtil::kPrecedingBitmask[start_offset % 8]; + + for (int64_t index = 0; index < length; ++index) { + const bool bit = g(); + current_byte = bit ? (current_byte | bit_mask) : current_byte; + bit_mask = static_cast(bit_mask << 1); + if (bit_mask == 0) { + bit_mask = 1; + *cur++ = current_byte; + current_byte = 0; + } + } + if (bit_mask != 1) { + *cur++ = current_byte; + } +} + +// Like GenerateBits(), but unrolls its main loop for higher performance. + +template +void GenerateBitsUnrolled(uint8_t* bitmap, int64_t start_offset, int64_t length, + Generator&& g) { + if (length == 0) { + return; + } + uint8_t current_byte; + uint8_t* cur = bitmap + start_offset / 8; + const uint64_t start_bit_offset = start_offset % 8; + uint8_t bit_mask = BitUtil::kBitmask[start_bit_offset]; + int64_t remaining = length; + + if (bit_mask != 0x01) { + current_byte = *cur & BitUtil::kPrecedingBitmask[start_bit_offset]; + while (bit_mask != 0 && remaining > 0) { + current_byte = g() ? (current_byte | bit_mask) : current_byte; + bit_mask = static_cast(bit_mask << 1); + --remaining; + } + *cur++ = current_byte; + } + + int64_t remaining_bytes = remaining / 8; + while (remaining_bytes-- > 0) { + current_byte = 0; + current_byte = g() ? current_byte | 0x01 : current_byte; + current_byte = g() ? current_byte | 0x02 : current_byte; + current_byte = g() ? current_byte | 0x04 : current_byte; + current_byte = g() ? current_byte | 0x08 : current_byte; + current_byte = g() ? current_byte | 0x10 : current_byte; + current_byte = g() ? current_byte | 0x20 : current_byte; + current_byte = g() ? current_byte | 0x40 : current_byte; + current_byte = g() ? current_byte | 0x80 : current_byte; + *cur++ = current_byte; + } + + int64_t remaining_bits = remaining % 8; + if (remaining_bits) { + current_byte = 0; + bit_mask = 0x01; + while (remaining_bits-- > 0) { + current_byte = g() ? (current_byte | bit_mask) : current_byte; + bit_mask = static_cast(bit_mask << 1); + } + *cur++ = current_byte; + } +} } // namespace internal