Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 13 additions & 41 deletions cpp/src/arrow/builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ void ArrayBuilder::UnsafeSetNotNull(int64_t length) {
memset(null_bitmap_data_ + ((length_ + pad_to_byte) / 8), 0xFF,
static_cast<size_t>(fast_length));

// Trailing bytes
// Trailing bits
for (int64_t i = length_ + pad_to_byte + (fast_length * 8); i < new_length; ++i) {
BitUtil::SetBit(null_bitmap_data_, i);
}
Expand Down Expand Up @@ -775,16 +775,9 @@ Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length,
const uint8_t* valid_bytes) {
RETURN_NOT_OK(Reserve(length));

internal::FirstTimeBitmapWriter bit_writer(raw_data_, length_, length);
for (int64_t i = 0; i < length; ++i) {
if (values[i] != 0) {
bit_writer.Set();
} else {
bit_writer.Clear();
}
bit_writer.Next();
}
bit_writer.Finish();
int64_t i = 0;
internal::GenerateBitsUnrolled(raw_data_, length_, length,
[values, &i]() -> bool { return values[i++] != 0; });
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have often wondered whether lambda functions have much overhead vs. inlined functions — is there a good reference on how the various compilers behave?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A bit of Googling suggests that in instances like this (where the type of the lambda is a template argument), the lambda will be inlined (https://www.quora.com/Are-C++-lambda-functions-always-inlined). If you passed a lambda into a function accepting a std::function of some kind, it wouldn't necessarily be inlined.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, apparently the recommended idiom is to let the callable argument be a template parameter so as to select a favorable specialization.


// this updates length_
ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
Expand All @@ -801,16 +794,9 @@ Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length,
RETURN_NOT_OK(Reserve(length));
DCHECK_EQ(length, static_cast<int64_t>(is_valid.size()));

internal::FirstTimeBitmapWriter bit_writer(raw_data_, length_, length);
for (int64_t i = 0; i < length; ++i) {
if (values[i]) {
bit_writer.Set();
} else {
bit_writer.Clear();
}
bit_writer.Next();
}
bit_writer.Finish();
int64_t i = 0;
internal::GenerateBitsUnrolled(raw_data_, length_, length,
[values, &i]() -> bool { return values[i++]; });

// this updates length_
ArrayBuilder::UnsafeAppendToBitmap(is_valid);
Expand Down Expand Up @@ -846,16 +832,9 @@ Status BooleanBuilder::AppendValues(const std::vector<bool>& values,
RETURN_NOT_OK(Reserve(length));
DCHECK_EQ(length, static_cast<int64_t>(is_valid.size()));

internal::FirstTimeBitmapWriter bit_writer(raw_data_, length_, length);
for (int64_t i = 0; i < length; ++i) {
if (values[i]) {
bit_writer.Set();
} else {
bit_writer.Clear();
}
bit_writer.Next();
}
bit_writer.Finish();
int64_t i = 0;
internal::GenerateBitsUnrolled(raw_data_, length_, length,
[values, &i]() -> bool { return values[i++]; });

// this updates length_
ArrayBuilder::UnsafeAppendToBitmap(is_valid);
Expand All @@ -871,16 +850,9 @@ Status BooleanBuilder::AppendValues(const std::vector<bool>& values) {
const int64_t length = static_cast<int64_t>(values.size());
RETURN_NOT_OK(Reserve(length));

internal::FirstTimeBitmapWriter bit_writer(raw_data_, length_, length);
for (int64_t i = 0; i < length; ++i) {
if (values[i]) {
bit_writer.Set();
} else {
bit_writer.Clear();
}
bit_writer.Next();
}
bit_writer.Finish();
int64_t i = 0;
internal::GenerateBitsUnrolled(raw_data_, length_, length,
[values, &i]() -> bool { return values[i++]; });

// this updates length_
ArrayBuilder::UnsafeSetNotNull(length);
Expand Down
15 changes: 3 additions & 12 deletions cpp/src/arrow/compute/kernels/cast.cc
Original file line number Diff line number Diff line change
Expand Up @@ -200,18 +200,9 @@ struct CastFunctor<O, I,
void operator()(FunctionContext* ctx, const CastOptions& options,
const ArrayData& input, ArrayData* output) {
auto in_data = GetValues<typename I::c_type>(input, 1);
internal::FirstTimeBitmapWriter writer(output->buffers[1]->mutable_data(),
output->offset, input.length);

for (int64_t i = 0; i < input.length; ++i) {
if (*in_data++ != 0) {
writer.Set();
} else {
writer.Clear();
}
writer.Next();
}
writer.Finish();
const auto generate = [&in_data]() -> bool { return *in_data++ != 0; };
internal::GenerateBitsUnrolled(output->buffers[1]->mutable_data(), output->offset,
input.length, generate);
}
};

Expand Down
92 changes: 77 additions & 15 deletions cpp/src/arrow/util/bit-util-benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -126,26 +126,48 @@ static void BenchmarkBitmapWriter(benchmark::State& state, int64_t nbytes) {

const int64_t num_bits = nbytes * 8;
uint8_t* bitmap = buffer->mutable_data();
const bool pattern[] = {false, false, false, true, true, true};

while (state.KeepRunning()) {
{
BitmapWriterType writer(bitmap, 0, num_bits);
for (int64_t i = 0; i < num_bits; i++) {
int64_t pattern_index = 0;
BitmapWriterType writer(bitmap, 0, num_bits);
for (int64_t i = 0; i < num_bits; i++) {
if (pattern[pattern_index++]) {
writer.Set();
writer.Next();
}
writer.Finish();
benchmark::ClobberMemory();
}
{
BitmapWriterType writer(bitmap, 0, num_bits);
for (int64_t i = 0; i < num_bits; i++) {
} else {
writer.Clear();
writer.Next();
}
writer.Finish();
benchmark::ClobberMemory();
if (pattern_index == sizeof(pattern) / sizeof(bool)) {
pattern_index = 0;
}
writer.Next();
}
writer.Finish();
benchmark::ClobberMemory();
}
state.SetBytesProcessed(int64_t(state.iterations()) * nbytes);
}

template <typename GenerateBitsFunctorType>
static void BenchmarkGenerateBits(benchmark::State& state, int64_t nbytes) {
  std::shared_ptr<Buffer> buffer = CreateRandomBuffer(nbytes);

  const int64_t num_bits = nbytes * 8;
  uint8_t* bitmap = buffer->mutable_data();
  // Keep this pattern identical to the one used in BenchmarkBitmapWriter so
  // that timings remain comparable between the two benchmark families.
  const bool pattern[] = {false, false, false, true, true, true};
  const int64_t pattern_size = static_cast<int64_t>(sizeof(pattern) / sizeof(pattern[0]));

  while (state.KeepRunning()) {
    int64_t position = 0;
    // Generator cycling endlessly through `pattern`.
    const auto next_bit = [&]() -> bool {
      const bool bit = pattern[position];
      ++position;
      if (position == pattern_size) {
        position = 0;
      }
      return bit;
    };
    GenerateBitsFunctorType()(bitmap, 0, num_bits, next_bit);
    benchmark::ClobberMemory();
  }
  // NOTE(review): the bytes-processed figure is scaled by 2 — presumably to
  // stay comparable with a writer benchmark that touched the buffer twice per
  // iteration; confirm this factor is intended.
  state.SetBytesProcessed(2 * int64_t(state.iterations()) * nbytes);
}
Expand All @@ -170,6 +192,28 @@ static void BM_FirstTimeBitmapWriter(benchmark::State& state) {
BenchmarkBitmapWriter<internal::FirstTimeBitmapWriter>(state, state.range(0));
}

// Adapter selecting internal::GenerateBits as a template argument
// for BenchmarkGenerateBits.
struct GenerateBitsFunctor {
  template <class Generator>
  void operator()(uint8_t* bitmap, int64_t start_offset, int64_t length,
                  Generator&& generator) {
    internal::GenerateBits(bitmap, start_offset, length, generator);
  }
};

// Adapter selecting internal::GenerateBitsUnrolled as a template argument
// for BenchmarkGenerateBits.
struct GenerateBitsUnrolledFunctor {
  template <class Generator>
  void operator()(uint8_t* bitmap, int64_t start_offset, int64_t length,
                  Generator&& generator) {
    internal::GenerateBitsUnrolled(bitmap, start_offset, length, generator);
  }
};

// Benchmark entry point for the straightforward (non-unrolled) generator.
static void BM_GenerateBits(benchmark::State& state) {
  const int64_t nbytes = state.range(0);
  BenchmarkGenerateBits<GenerateBitsFunctor>(state, nbytes);
}

// Benchmark entry point for the unrolled generator variant.
static void BM_GenerateBitsUnrolled(benchmark::State& state) {
  const int64_t nbytes = state.range(0);
  BenchmarkGenerateBits<GenerateBitsUnrolledFunctor>(state, nbytes);
}

static void BM_CopyBitmap(benchmark::State& state) { // NOLINT non-const reference
const int kBufferSize = state.range(0);
std::shared_ptr<Buffer> buffer = CreateRandomBuffer(kBufferSize);
Expand Down Expand Up @@ -201,13 +245,31 @@ BENCHMARK(BM_BitmapReader)->Args({100000})->MinTime(1.0)->Unit(benchmark::kMicro

BENCHMARK(BM_NaiveBitmapWriter)
->Args({100000})
->Repetitions(2)
->MinTime(1.0)
->Unit(benchmark::kMicrosecond);

BENCHMARK(BM_BitmapWriter)->Args({100000})->MinTime(1.0)->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_BitmapWriter)
->Args({100000})
->Repetitions(2)
->MinTime(1.0)
->Unit(benchmark::kMicrosecond);

// Register the bitmap-writer and bit-generation benchmarks. All use the same
// 100000-byte buffer size and 2 repetitions so results are directly comparable.
BENCHMARK(BM_FirstTimeBitmapWriter)
->Args({100000})
->Repetitions(2)
->MinTime(1.0)
->Unit(benchmark::kMicrosecond);

BENCHMARK(BM_GenerateBits)
->Args({100000})
->Repetitions(2)
->MinTime(1.0)
->Unit(benchmark::kMicrosecond);

BENCHMARK(BM_GenerateBitsUnrolled)
->Args({100000})
->Repetitions(2)
->MinTime(1.0)
->Unit(benchmark::kMicrosecond);

Expand Down
74 changes: 74 additions & 0 deletions cpp/src/arrow/util/bit-util-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,80 @@ TEST(FirstTimeBitmapWriter, NormalOperation) {
}
}

// Tests for GenerateBits and GenerateBitsUnrolled

// Test adapter allowing internal::GenerateBits to be exercised through a
// typed-test TypeParam.
struct GenerateBitsFunctor {
  template <class Generator>
  void operator()(uint8_t* bitmap, int64_t start_offset, int64_t length,
                  Generator&& generator) {
    internal::GenerateBits(bitmap, start_offset, length, generator);
  }
};

// Test adapter allowing internal::GenerateBitsUnrolled to be exercised
// through a typed-test TypeParam.
struct GenerateBitsUnrolledFunctor {
  template <class Generator>
  void operator()(uint8_t* bitmap, int64_t start_offset, int64_t length,
                  Generator&& generator) {
    internal::GenerateBitsUnrolled(bitmap, start_offset, length, generator);
  }
};

// Typed test fixture parameterized over the two bit-generation strategies
// (plain and unrolled), so both share the same test body below.
template <typename T>
class TestGenerateBits : public ::testing::Test {};

// Prefer a `using` alias over `typedef` for readability.
using GenerateBitsTypes = ::testing::Types<GenerateBitsFunctor, GenerateBitsUnrolledFunctor>;
TYPED_TEST_CASE(TestGenerateBits, GenerateBitsTypes);

TYPED_TEST(TestGenerateBits, NormalOperation) {
  // Exercise the generator over many (start offset, length, prefill) combos:
  // the generated bits must mirror `source`, and bits before the written
  // range must keep their prefilled value.
  const int kSourceSize = 256;
  uint8_t source[kSourceSize];
  test::random_bytes(kSourceSize, 0, source);

  const int64_t start_offsets[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 21, 31, 32};
  const int64_t lengths[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 16,
                             17, 21, 31, 32, 100, 201, 202, 203, 204, 205, 206, 207};
  const uint8_t fill_bytes[] = {0x00, 0xff};

  for (const int64_t start_offset : start_offsets) {
    for (const int64_t length : lengths) {
      for (const uint8_t fill_byte : fill_bytes) {
        uint8_t bitmap[kSourceSize];
        memset(bitmap, fill_byte, kSourceSize);
        // First call the generator under test; also count invocations to
        // check the generator callback is called exactly `length` times.
        {
          int64_t ncalled = 0;
          internal::BitmapReader reader(source, 0, length);
          TypeParam()(bitmap, start_offset, length, [&]() -> bool {
            bool b = reader.IsSet();
            reader.Next();
            ++ncalled;
            return b;
          });
          ASSERT_EQ(ncalled, length);
        }
        // Then check generated contents bit-by-bit against `source`
        {
          internal::BitmapReader source_reader(source, 0, length);
          internal::BitmapReader result_reader(bitmap, start_offset, length);
          for (int64_t i = 0; i < length; ++i) {
            ASSERT_EQ(source_reader.IsSet(), result_reader.IsSet())
                << "mismatch at bit #" << i;
            source_reader.Next();
            result_reader.Next();
          }
        }
        // Check bits preceding the generated contents weren't clobbered.
        // (Bits following the generated range are not checked here —
        // presumably the trailing partial byte may be rewritten; confirm
        // before tightening this test.)
        {
          internal::BitmapReader reader_before(bitmap, 0, start_offset);
          for (int64_t i = 0; i < start_offset; ++i) {
            ASSERT_EQ(reader_before.IsSet(), fill_byte == 0xff)
                << "mismatch at preceding bit #" << start_offset - i;
            // BUGFIX: advance the reader; without this the loop re-checked
            // bit 0 `start_offset` times instead of each preceding bit.
            reader_before.Next();
          }
        }
      }
    }
  }
}

TEST(BitmapAnd, Aligned) {
std::shared_ptr<Buffer> left, right, out;
int64_t length;
Expand Down
Loading