Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 29 additions & 4 deletions cpp/src/arrow/testing/random.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_reader.h"
#include "arrow/util/logging.h"

namespace arrow {
Expand Down Expand Up @@ -268,18 +269,21 @@ std::shared_ptr<Array> RandomArrayGenerator::StringWithRepeats(int64_t size,

std::shared_ptr<Array> RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset,
int32_t last_offset,
double null_probability) {
double null_probability,
bool force_empty_nulls) {
using GenOpt = GenerateOptions<int32_t, std::uniform_int_distribution<int32_t>>;
GenOpt options(seed(), first_offset, last_offset, null_probability);

BufferVector buffers{2};

int64_t null_count = 0;

buffers[0] = *AllocateEmptyBitmap(size);
options.GenerateBitmap(buffers[0]->mutable_data(), size, &null_count);
uint8_t* null_bitmap = buffers[0]->mutable_data();
options.GenerateBitmap(null_bitmap, size, &null_count);
// Make sure the first and last entry are non-null
arrow::BitUtil::SetBit(buffers[0]->mutable_data(), 0);
arrow::BitUtil::SetBit(buffers[0]->mutable_data(), size - 1);
arrow::BitUtil::SetBit(null_bitmap, 0);
arrow::BitUtil::SetBit(null_bitmap, size - 1);

buffers[1] = *AllocateBuffer(sizeof(int32_t) * size);
auto data = reinterpret_cast<int32_t*>(buffers[1]->mutable_data());
Expand All @@ -292,10 +296,31 @@ std::shared_ptr<Array> RandomArrayGenerator::Offsets(int64_t size, int32_t first
data[0] = first_offset;
data[size - 1] = last_offset;

if (force_empty_nulls) {
arrow::internal::BitmapReader reader(null_bitmap, 0, size);
for (int64_t i = 0; i < size; ++i) {
if (reader.IsNotSet()) {
// Ensure a null entry corresponds to a 0-sized list extent
// (note this can be neither the first nor the last list entry, see above)
data[i + 1] = data[i];
}
reader.Next();
}
}

auto array_data = ArrayData::Make(int32(), size, buffers, null_count);
return std::make_shared<Int32Array>(array_data);
}

// Generate a random ListArray with `size` entries laid over `values`.
//
// The offsets run from the start of `values` to its end, so the generated
// lists together cover the whole values array. `null_probability` and
// `force_empty_nulls` are forwarded unchanged to Offsets().
std::shared_ptr<Array> RandomArrayGenerator::List(const Array& values, int64_t size,
                                                  double null_probability,
                                                  bool force_empty_nulls) {
  const auto values_begin = static_cast<int32_t>(values.offset());
  const auto values_end = static_cast<int32_t>(values.offset() + values.length());
  // A list array with N entries is described by N + 1 offsets (entry i spans
  // [offsets[i], offsets[i + 1])), hence one more offset than `size`.
  auto offsets = Offsets(size + 1, values_begin, values_end, null_probability,
                         force_empty_nulls);
  return *::arrow::ListArray::FromArrays(*offsets, values);
}

namespace {

struct RandomArrayGeneratorOfImpl {
Expand Down
17 changes: 15 additions & 2 deletions cpp/src/arrow/testing/random.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,10 +229,12 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
/// \param[in] first_offset the first offset value (usually 0)
/// \param[in] last_offset the last offset value (usually the size of the child array)
/// \param[in] null_probability the probability of an offset being null
/// \param[in] force_empty_nulls if true, null offsets must have 0 "length"
///
/// \return a generated Array
std::shared_ptr<Array> Offsets(int64_t size, int32_t first_offset, int32_t last_offset,
double null_probability = 0);
double null_probability = 0,
bool force_empty_nulls = false);

/// \brief Generate a random StringArray
///
Expand Down Expand Up @@ -281,7 +283,18 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
int32_t min_length, int32_t max_length,
double null_probability = 0);

/// \brief Randomly generate an Array of the specified type, size, and null_probability.
/// \brief Generate a random ListArray
///
/// The list offsets are generated with Offsets(), starting at the beginning of
/// `values` and ending at its end, so the lists together span all of `values`.
///
/// \param[in] values The underlying values array
/// \param[in] size The size of the generated list array
/// \param[in] null_probability the probability of a list value being null
/// \param[in] force_empty_nulls if true, null list entries must have 0 length
///            (forwarded to Offsets())
///
/// \return a generated Array (built with ListArray::FromArrays)
std::shared_ptr<Array> List(const Array& values, int64_t size, double null_probability,
bool force_empty_nulls = false);

/// \brief Generate a random Array of the specified type, size, and null_probability.
///
/// Generation parameters other than size and null_probability are determined based on
/// the type of Array to be generated.
Expand Down
181 changes: 147 additions & 34 deletions cpp/src/parquet/arrow/reader_writer_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,13 @@

#include "arrow/api.h"
#include "arrow/testing/random.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/logging.h"

using arrow::Array;
using arrow::ArrayVector;
using arrow::BooleanBuilder;
using arrow::FieldVector;
using arrow::NumericBuilder;

#define EXIT_NOT_OK(s) \
Expand Down Expand Up @@ -223,6 +227,17 @@ static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table&
}
}

// Wrap `array` into a one-column table (column name "s") and hand it to
// BenchmarkReadTable().
//
// `nullable` sets the nullability of the column's field. `num_values` and
// `bytes_per_value` are passed through to BenchmarkReadTable() untouched.
// The table is validated first; EXIT_NOT_OK handles a failed validation.
static void BenchmarkReadArray(::benchmark::State& state,
                               const std::shared_ptr<Array>& array, bool nullable,
                               int64_t num_values = -1, int64_t bytes_per_value = -1) {
  const auto column_field = field("s", array->type(), nullable);
  const auto table_schema = ::arrow::schema({column_field});
  const auto single_column_table =
      ::arrow::Table::Make(table_schema, {array}, array->length());

  EXIT_NOT_OK(single_column_table->Validate());

  BenchmarkReadTable(state, *single_column_table, num_values, bytes_per_value);
}

//
// Benchmark reading a primitive column
//
Expand Down Expand Up @@ -302,45 +317,134 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
// Benchmark reading a nested column
//

const std::vector<int64_t> kNestedNullPercents = {0, 1, 50, 99};

// XXX We can use ArgsProduct() starting from Benchmark 1.5.2
// Register one benchmark argument per entry of kNestedNullPercents; each
// argument is a null percentage.
static void NestedReadArguments(::benchmark::internal::Benchmark* b) {
  for (auto it = kNestedNullPercents.begin(); it != kNestedNullPercents.end(); ++it) {
    b->Arg(*it);
  }
}

// Assemble a StructArray from `children`, with fields named 'a', 'b', ...
//
// When null_probability > 0, a random validity bitmap is requested from `rng`
// and every field is declared nullable; otherwise the struct gets no validity
// bitmap and the fields are non-nullable. The struct length is taken from the
// first child. If `propagate_validity` is set, the struct's bitmap is
// additionally OR-ed with each child's own validity bitmap.
static std::shared_ptr<Array> MakeStructArray(::arrow::random::RandomArrayGenerator* rng,
                                              const ArrayVector& children,
                                              double null_probability,
                                              bool propagate_validity = false) {
  ARROW_CHECK_GT(children.size(), 0);
  const int64_t num_rows = children[0]->length();
  const bool with_nulls = null_probability > 0.0;

  std::shared_ptr<::arrow::Buffer> validity;
  if (with_nulls) {
    validity = rng->NullBitmap(num_rows, null_probability);
  }
  if (with_nulls && propagate_validity) {
    // HACK: the Parquet writer currently doesn't allow non-empty list
    // entries where a parent node is null (for instance, a struct-of-list
    // where the outer struct is marked null but the inner list value is
    // non-empty).
    for (const auto& child : children) {
      validity = *::arrow::internal::BitmapOr(::arrow::default_memory_pool(),
                                              validity->data(), 0,
                                              child->null_bitmap_data(), 0, num_rows, 0);
    }
  }

  FieldVector struct_fields;
  struct_fields.reserve(children.size());
  char next_name = 'a';
  for (const auto& child : children) {
    struct_fields.push_back(
        field(std::string{next_name}, child->type(), /*nullable=*/with_nulls));
    ++next_name;
  }
  return *::arrow::StructArray::Make(children, std::move(struct_fields), validity);
}

// Make a (int32, int64) struct array of `size` rows.
//
// The int32 child is drawn from [-5, 5] and the int64 child from
// [-12345678912345, 12345678912345]; `null_probability` is applied to both
// children and to the struct itself.
static std::shared_ptr<Array> MakeStructArray(::arrow::random::RandomArrayGenerator* rng,
                                              int64_t size, double null_probability) {
  // Keep the two draws as separate statements, int32 child first.
  const auto int32_child = rng->Int32(size, -5, 5, null_probability);
  const auto int64_child =
      rng->Int64(size, -12345678912345LL, 12345678912345LL, null_probability);
  const ArrayVector struct_children{int32_child, int64_child};
  return MakeStructArray(rng, struct_children, null_probability);
}

// Benchmark reading a struct<int32, int64> column.
//
// state.range(0) is the null percentage (0-100); the column is declared
// nullable whenever that percentage is non-zero.
static void BM_ReadStructColumn(::benchmark::State& state) {
  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
  // One int32 plus one int64 per struct row.
  const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);

  const auto null_percent = state.range(0);
  const double null_probability = static_cast<double>(null_percent) / 100.0;
  ARROW_CHECK_GE(null_probability, 0.0);
  const bool nullable = (null_probability != 0.0);

  ::arrow::random::RandomArrayGenerator rng(42);
  const auto struct_array = MakeStructArray(&rng, kNumValues, null_probability);

  BenchmarkReadArray(state, struct_array, nullable, kNumValues, kBytesPerValue);
}

BENCHMARK(BM_ReadStructColumn)->Apply(NestedReadArguments);

// Benchmark reading a struct<struct<int32, int64>, struct<int32, int64>> column.
//
// state.range(0) is the null percentage (0-100); it is applied at every
// nesting level, and the column is nullable whenever it is non-zero.
static void BM_ReadStructOfStructColumn(::benchmark::State& state) {
  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
  // Two inner structs, each holding one int32 and one int64 per row.
  const int64_t kBytesPerValue = 2 * (sizeof(int32_t) + sizeof(int64_t));

  const auto null_percent = state.range(0);
  const double null_probability = static_cast<double>(null_percent) / 100.0;
  ARROW_CHECK_GE(null_probability, 0.0);
  const bool nullable = (null_probability != 0.0);

  ::arrow::random::RandomArrayGenerator rng(42);
  const auto first_inner = MakeStructArray(&rng, kNumValues, null_probability);
  const auto second_inner = MakeStructArray(&rng, kNumValues, null_probability);
  const ArrayVector inner_structs{first_inner, second_inner};
  const auto outer_struct = MakeStructArray(&rng, inner_structs, null_probability);

  BenchmarkReadArray(state, outer_struct, nullable, kNumValues, kBytesPerValue);
}

BENCHMARK(BM_ReadStructOfStructColumn)->Apply(NestedReadArguments);

// Benchmark reading a struct<list<int32>, list<int64>> column.
//
// state.range(0) is the null percentage (0-100); it is applied to the leaf
// values, the lists and the struct, and the column is nullable whenever it is
// non-zero.
static void BM_ReadStructOfListColumn(::benchmark::State& state) {
  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
  const bool nullable = (null_probability != 0.0);

  ARROW_CHECK_GE(null_probability, 0.0);

  ::arrow::random::RandomArrayGenerator rng(42);

  // Declared exactly once: a second declaration in this scope is a
  // redefinition error.
  const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);

  auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
  auto values2 =
      rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL, null_probability);
  auto list1 = rng.List(*values1, kNumValues / 10, null_probability);
  auto list2 = rng.List(*values2, kNumValues / 10, null_probability);
  // The struct validity is OR-ed with the children's validity so that the
  // struct is not marked null where a child list entry is valid.
  auto array = MakeStructArray(&rng, {list1, list2}, null_probability,
                               /*propagate_validity =*/true);

  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
}

// Run once per null percentage listed in kNestedNullPercents.
BENCHMARK(BM_ReadStructOfListColumn)->Apply(NestedReadArguments);

// Benchmark reading a list<int64> column.
//
// state.range(0) is the null percentage (0-100); it is applied to both the
// leaf values and the lists, and the column is nullable whenever it is
// non-zero.
static void BM_ReadListColumn(::benchmark::State& state) {
  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
  const bool nullable = (null_probability != 0.0);

  ARROW_CHECK_GE(null_probability, 0.0);

  ::arrow::random::RandomArrayGenerator rng(42);

  auto values = rng.Int64(kNumValues, /*min=*/-5, /*max=*/5, null_probability);
  const int64_t kBytesPerValue = sizeof(int64_t);

  auto array = rng.List(*values, kNumValues / 10, null_probability);

  // BenchmarkReadArray() builds and validates the table itself, so no table
  // is constructed here.
  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
}

// Run once per null percentage listed in kNestedNullPercents.
BENCHMARK(BM_ReadListColumn)->Apply(NestedReadArguments);

static void BM_ReadListColumn(::benchmark::State& state) {
static void BM_ReadListOfStructColumn(::benchmark::State& state) {
constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
const double null_probability = static_cast<double>(state.range(0)) / 100.0;
const bool nullable = (null_probability != 0.0);
Expand All @@ -349,26 +453,35 @@ static void BM_ReadListColumn(::benchmark::State& state) {

::arrow::random::RandomArrayGenerator rng(42);

auto values = rng.Int64(kNumValues, /*min=*/-5, /*max=*/5, null_probability);
auto offsets = rng.Offsets(kNumValues / 10, 0, static_cast<int32_t>(values->length()),
null_probability);
auto values = MakeStructArray(&rng, kNumValues, null_probability);
const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);

const int64_t kBytesPerValue = sizeof(int64_t);
auto array = rng.List(*values, kNumValues / 10, null_probability);

auto array = *::arrow::ListArray::FromArrays(*offsets, *values);
auto schema = ::arrow::schema({field("s", array->type(), nullable)});
auto table = ::arrow::Table::Make(schema, {array}, array->length());
BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
}

EXIT_NOT_OK(table->Validate());
BENCHMARK(BM_ReadListOfStructColumn)->Apply(NestedReadArguments);

// Benchmark reading a list<list<int64>> column.
//
// state.range(0) is the null percentage (0-100); it is applied to the leaf
// values and to both list levels, and the column is nullable whenever it is
// non-zero.
static void BM_ReadListOfListColumn(::benchmark::State& state) {
  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
  const bool nullable = (null_probability != 0.0);

  ARROW_CHECK_GE(null_probability, 0.0);

  ::arrow::random::RandomArrayGenerator rng(42);

  auto values = rng.Int64(kNumValues, /*min=*/-5, /*max=*/5, null_probability);
  const int64_t kBytesPerValue = sizeof(int64_t);

  // Each nesting level is requested with a tenth as many entries as the level below.
  auto inner = rng.List(*values, kNumValues / 10, null_probability);
  auto array = rng.List(*inner, kNumValues / 100, null_probability);

  // BenchmarkReadArray() builds and validates the table itself, so no table
  // is constructed here.
  BenchmarkReadArray(state, array, nullable, kNumValues, kBytesPerValue);
}

// Run once per null percentage listed in kNestedNullPercents.
BENCHMARK(BM_ReadListOfListColumn)->Apply(NestedReadArguments);

//
// Benchmark different ways of reading select row groups
Expand Down