diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index d32e1d57de4..32007f810e9 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -31,6 +31,7 @@ #include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_reader.h" #include "arrow/util/logging.h" namespace arrow { @@ -268,18 +269,21 @@ std::shared_ptr RandomArrayGenerator::StringWithRepeats(int64_t size, std::shared_ptr RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset, int32_t last_offset, - double null_probability) { + double null_probability, + bool force_empty_nulls) { using GenOpt = GenerateOptions>; GenOpt options(seed(), first_offset, last_offset, null_probability); BufferVector buffers{2}; int64_t null_count = 0; + buffers[0] = *AllocateEmptyBitmap(size); - options.GenerateBitmap(buffers[0]->mutable_data(), size, &null_count); + uint8_t* null_bitmap = buffers[0]->mutable_data(); + options.GenerateBitmap(null_bitmap, size, &null_count); // Make sure the first and last entry are non-null - arrow::BitUtil::SetBit(buffers[0]->mutable_data(), 0); - arrow::BitUtil::SetBit(buffers[0]->mutable_data(), size - 1); + arrow::BitUtil::SetBit(null_bitmap, 0); + arrow::BitUtil::SetBit(null_bitmap, size - 1); buffers[1] = *AllocateBuffer(sizeof(int32_t) * size); auto data = reinterpret_cast(buffers[1]->mutable_data()); @@ -292,10 +296,31 @@ std::shared_ptr RandomArrayGenerator::Offsets(int64_t size, int32_t first data[0] = first_offset; data[size - 1] = last_offset; + if (force_empty_nulls) { + arrow::internal::BitmapReader reader(null_bitmap, 0, size); + for (int64_t i = 0; i < size; ++i) { + if (reader.IsNotSet()) { + // Ensure a null entry corresponds to a 0-sized list extent + // (note this can be neither the first nor the last list entry, see above) + data[i + 1] = data[i]; + } + reader.Next(); + } + } + auto array_data = ArrayData::Make(int32(), size, buffers, null_count); return std::make_shared(array_data); } +std::shared_ptr RandomArrayGenerator::List(const Array& values, int64_t size, + double null_probability, + bool force_empty_nulls) { + auto offsets = Offsets(size, static_cast(values.offset()), + static_cast(values.offset() + values.length()), + null_probability, force_empty_nulls); + return *::arrow::ListArray::FromArrays(*offsets, values); +} + namespace { struct RandomArrayGeneratorOfImpl { diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index 1fb656334bb..6f04d31bd2e 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -229,10 +229,12 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { /// \param[in] first_offset the first offset value (usually 0) /// \param[in] last_offset the last offset value (usually the size of the child array) /// \param[in] null_probability the probability of an offset being null + /// \param[in] force_empty_nulls if true, null offsets must have 0 "length" /// /// \return a generated Array std::shared_ptr Offsets(int64_t size, int32_t first_offset, int32_t last_offset, - double null_probability = 0); + double null_probability = 0, + bool force_empty_nulls = false); /// \brief Generate a random StringArray /// @@ -281,7 +283,18 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { int32_t min_length, int32_t max_length, double null_probability = 0); - /// \brief Randomly generate an Array of the specified type, size, and null_probability. + /// \brief Generate a random ListArray + /// + /// \param[in] values The underlying values array + /// \param[in] size The size of the generated list array + /// \param[in] null_probability the probability of a list value being null + /// \param[in] force_empty_nulls if true, null list entries must have 0 length + /// + /// \return a generated Array + std::shared_ptr List(const Array& values, int64_t size, double null_probability, + bool force_empty_nulls = false); + + /// \brief Generate a random Array of the specified type, size, and null_probability. /// /// Generation parameters other than size and null_probability are determined based on /// the type of Array to be generated. diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc index 3bb1e755dd9..134cedce6d3 100644 --- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc +++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc @@ -30,9 +30,13 @@ #include "arrow/api.h" #include "arrow/testing/random.h" +#include "arrow/util/bitmap_ops.h" #include "arrow/util/logging.h" +using arrow::Array; +using arrow::ArrayVector; using arrow::BooleanBuilder; +using arrow::FieldVector; using arrow::NumericBuilder; #define EXIT_NOT_OK(s) \ @@ -223,6 +227,17 @@ static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& } } +static void BenchmarkReadArray(::benchmark::State& state, + const std::shared_ptr& array, bool nullable, + int64_t num_values = -1, int64_t bytes_per_value = -1) { + auto schema = ::arrow::schema({field("s", array->type(), nullable)}); + auto table = ::arrow::Table::Make(schema, {array}, array->length()); + + EXIT_NOT_OK(table->Validate()); + + BenchmarkReadTable(state, *table, num_values, bytes_per_value); +} + // // Benchmark reading a primitive column // @@ -302,6 +317,54 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType) // Benchmark reading a nested column // +const std::vector kNestedNullPercents = {0, 1, 50, 99}; + +// XXX We can use ArgsProduct() starting from Benchmark 1.5.2 +static void NestedReadArguments(::benchmark::internal::Benchmark* b) { + for (const auto null_percentage : kNestedNullPercents) { + b->Arg(null_percentage); + } +} + +static std::shared_ptr MakeStructArray(::arrow::random::RandomArrayGenerator* rng, + const ArrayVector& children, + double null_probability, + bool propagate_validity = false) { + ARROW_CHECK_GT(children.size(), 0); + const int64_t length = children[0]->length(); + + std::shared_ptr<::arrow::Buffer> null_bitmap; + if (null_probability > 0.0) { + null_bitmap = rng->NullBitmap(length, null_probability); + if (propagate_validity) { + // HACK: the Parquet writer currently doesn't allow non-empty list + // entries where a parent node is null (for instance, a struct-of-list + // where the outer struct is marked null but the inner list value is + // non-empty). + for (const auto& child : children) { + null_bitmap = *::arrow::internal::BitmapOr( + ::arrow::default_memory_pool(), null_bitmap->data(), 0, + child->null_bitmap_data(), 0, length, 0); + } + } + } + FieldVector fields(children.size()); + char field_name = 'a'; + for (size_t i = 0; i < children.size(); ++i) { + fields[i] = field(std::string{field_name++}, children[i]->type(), + /*nullable=*/null_probability > 0.0); + } + return *::arrow::StructArray::Make(children, std::move(fields), null_bitmap); +} + +// Make a (int32, int64) struct array +static std::shared_ptr MakeStructArray(::arrow::random::RandomArrayGenerator* rng, + int64_t size, double null_probability) { + auto values1 = rng->Int32(size, -5, 5, null_probability); + auto values2 = rng->Int64(size, -12345678912345LL, 12345678912345LL, null_probability); + return MakeStructArray(rng, {values1, values2}, null_probability); +} + static void BM_ReadStructColumn(::benchmark::State& state) { constexpr int64_t kNumValues = BENCHMARK_SIZE / 10; const double null_probability = static_cast(state.range(0)) / 100.0; @@ -309,38 +372,79 @@ static void BM_ReadStructColumn(::benchmark::State& state) { ARROW_CHECK_GE(null_probability, 0.0); + const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t); + + ::arrow::random::RandomArrayGenerator rng(42); + auto array = MakeStructArray(&rng, kNumValues, null_probability); + + BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue); +} + +BENCHMARK(BM_ReadStructColumn)->Apply(NestedReadArguments); + +static void BM_ReadStructOfStructColumn(::benchmark::State& state) { + constexpr int64_t kNumValues = BENCHMARK_SIZE / 10; + const double null_probability = static_cast(state.range(0)) / 100.0; + const bool nullable = (null_probability != 0.0); + + ARROW_CHECK_GE(null_probability, 0.0); + + const int64_t kBytesPerValue = 2 * (sizeof(int32_t) + sizeof(int64_t)); + ::arrow::random::RandomArrayGenerator rng(42); + auto values1 = MakeStructArray(&rng, kNumValues, null_probability); + auto values2 = MakeStructArray(&rng, kNumValues, null_probability); + auto array = MakeStructArray(&rng, {values1, values2}, null_probability); + + BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue); +} + +BENCHMARK(BM_ReadStructOfStructColumn)->Apply(NestedReadArguments); + +static void BM_ReadStructOfListColumn(::benchmark::State& state) { + constexpr int64_t kNumValues = BENCHMARK_SIZE / 10; + const double null_probability = static_cast(state.range(0)) / 100.0; + const bool nullable = (null_probability != 0.0); + + ARROW_CHECK_GE(null_probability, 0.0); + + ::arrow::random::RandomArrayGenerator rng(42); + + const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t); auto values1 = rng.Int32(kNumValues, -5, 5, null_probability); auto values2 = rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL, null_probability); + auto list1 = rng.List(*values1, kNumValues / 10, null_probability); + auto list2 = rng.List(*values2, kNumValues / 10, null_probability); + auto array = MakeStructArray(&rng, {list1, list2}, null_probability, + /*propagate_validity =*/true); - const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t); + BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue); +} - std::shared_ptr<::arrow::Buffer> null_bitmap; - if (nullable) { - null_bitmap = rng.NullBitmap(kNumValues, null_probability); - } - auto array = *::arrow::StructArray::Make( - {values1, values2}, - ::arrow::FieldVector{field("a", values1->type(), nullable), - field("b", values2->type(), nullable)}, - null_bitmap); - auto schema = ::arrow::schema({field("s", array->type(), nullable)}); - auto table = ::arrow::Table::Make(schema, {array}, array->length()); +BENCHMARK(BM_ReadStructOfListColumn)->Apply(NestedReadArguments); - EXIT_NOT_OK(table->Validate()); +static void BM_ReadListColumn(::benchmark::State& state) { + constexpr int64_t kNumValues = BENCHMARK_SIZE / 10; + const double null_probability = static_cast(state.range(0)) / 100.0; + const bool nullable = (null_probability != 0.0); + + ARROW_CHECK_GE(null_probability, 0.0); + + ::arrow::random::RandomArrayGenerator rng(42); + + auto values = rng.Int64(kNumValues, /*min=*/-5, /*max=*/5, null_probability); + const int64_t kBytesPerValue = sizeof(int64_t); + + auto array = rng.List(*values, kNumValues / 10, null_probability); - BenchmarkReadTable(state, *table, kNumValues, kBytesPerValue); + BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue); } -BENCHMARK(BM_ReadStructColumn) - ->Arg(/*null_percentage=*/0) - ->Arg(/*null_percentage=*/1) - ->Arg(/*null_percentage=*/50) - ->Arg(/*null_percentage=*/99); +BENCHMARK(BM_ReadListColumn)->Apply(NestedReadArguments); -static void BM_ReadListColumn(::benchmark::State& state) { +static void BM_ReadListOfStructColumn(::benchmark::State& state) { constexpr int64_t kNumValues = BENCHMARK_SIZE / 10; const double null_probability = static_cast(state.range(0)) / 100.0; const bool nullable = (null_probability != 0.0); @@ -349,26 +453,35 @@ static void BM_ReadListColumn(::benchmark::State& state) { ::arrow::random::RandomArrayGenerator rng(42); - auto values = rng.Int64(kNumValues, /*min=*/-5, /*max=*/5, null_probability); - auto offsets = rng.Offsets(kNumValues / 10, 0, static_cast(values->length()), - null_probability); + auto values = MakeStructArray(&rng, kNumValues, null_probability); + const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t); - const int64_t kBytesPerValue = sizeof(int64_t); + auto array = rng.List(*values, kNumValues / 10, null_probability); - auto array = *::arrow::ListArray::FromArrays(*offsets, *values); - auto schema = ::arrow::schema({field("s", array->type(), nullable)}); - auto table = ::arrow::Table::Make(schema, {array}, array->length()); + BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue); +} - EXIT_NOT_OK(table->Validate()); +BENCHMARK(BM_ReadListOfStructColumn)->Apply(NestedReadArguments); + +static void BM_ReadListOfListColumn(::benchmark::State& state) { + constexpr int64_t kNumValues = BENCHMARK_SIZE / 10; + const double null_probability = static_cast(state.range(0)) / 100.0; + const bool nullable = (null_probability != 0.0); + + ARROW_CHECK_GE(null_probability, 0.0); + + ::arrow::random::RandomArrayGenerator rng(42); + + auto values = rng.Int64(kNumValues, /*min=*/-5, /*max=*/5, null_probability); + const int64_t kBytesPerValue = sizeof(int64_t); + + auto inner = rng.List(*values, kNumValues / 10, null_probability); + auto array = rng.List(*inner, kNumValues / 100, null_probability); - BenchmarkReadTable(state, *table, kNumValues, kBytesPerValue); + BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue); } -BENCHMARK(BM_ReadListColumn) - ->Arg(/*null_percentage=*/0) - ->Arg(/*null_percentage=*/1) - ->Arg(/*null_percentage=*/50) - ->Arg(/*null_percentage=*/99); +BENCHMARK(BM_ReadListOfListColumn)->Apply(NestedReadArguments); // // Benchmark different ways of reading select row groups