diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index e6a3c2c58ca..405549054e0 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -569,6 +569,133 @@ BENCHMARK(BM_DeltaBitPackingDecode_Int64_Narrow)->Range(MIN_RANGE, MAX_RANGE); BENCHMARK(BM_DeltaBitPackingDecode_Int32_Wide)->Range(MIN_RANGE, MAX_RANGE); BENCHMARK(BM_DeltaBitPackingDecode_Int64_Wide)->Range(MIN_RANGE, MAX_RANGE); +static void ByteArrayCustomArguments(benchmark::internal::Benchmark* b) { + b->ArgsProduct({{8, 64, 1024}, {512, 2048}}) + ->ArgNames({"max-string-length", "batch-size"}); +} + +void EncodingByteArrayBenchmark(benchmark::State& state, Encoding::type encoding) { + ::arrow::random::RandomArrayGenerator rag(0); + // Using arrow generator to generate random data. + int32_t max_length = static_cast(state.range(0)); + int32_t array_size = static_cast(state.range(1)); + auto array = + rag.String(/* size */ array_size, /* min_length */ 0, /* max_length */ max_length, + /* null_probability */ 0); + const auto array_actual = + ::arrow::internal::checked_pointer_cast<::arrow::StringArray>(array); + auto encoder = MakeTypedEncoder(encoding); + std::vector values; + for (int i = 0; i < array_actual->length(); ++i) { + values.emplace_back(array_actual->GetView(i)); + } + + for (auto _ : state) { + encoder->Put(values.data(), static_cast(values.size())); + encoder->FlushValues(); + } + state.SetItemsProcessed(state.iterations() * array_actual->length()); + state.SetBytesProcessed(state.iterations() * (array_actual->value_data()->size() + + array_actual->value_offsets()->size())); +} + +static void BM_DeltaLengthEncodingByteArray(benchmark::State& state) { + EncodingByteArrayBenchmark(state, Encoding::DELTA_LENGTH_BYTE_ARRAY); +} + +static void BM_PlainEncodingByteArray(benchmark::State& state) { + EncodingByteArrayBenchmark(state, Encoding::PLAIN); +} + +void DecodingByteArrayBenchmark(benchmark::State& state, Encoding::type encoding) { + ::arrow::random::RandomArrayGenerator rag(0); + int32_t max_length = static_cast(state.range(0)); + int32_t array_size = static_cast(state.range(1)); + // Using arrow to write, because we just benchmark decoding here. + auto array = + rag.String(/* size */ array_size, /* min_length */ 0, /* max_length */ max_length, + /* null_probability */ 0); + const auto array_actual = + ::arrow::internal::checked_pointer_cast<::arrow::StringArray>(array); + auto encoder = MakeTypedEncoder(encoding); + encoder->Put(*array); + std::shared_ptr buf = encoder->FlushValues(); + + std::vector values; + values.resize(array->length()); + for (auto _ : state) { + auto decoder = MakeTypedDecoder(encoding); + decoder->SetData(static_cast(array->length()), buf->data(), + static_cast(buf->size())); + decoder->Decode(values.data(), static_cast(values.size())); + ::benchmark::DoNotOptimize(values); + } + state.SetItemsProcessed(state.iterations() * array->length()); + state.SetBytesProcessed(state.iterations() * (array_actual->value_data()->size() + + array_actual->value_offsets()->size())); +} + +static void BM_PlainDecodingByteArray(benchmark::State& state) { + DecodingByteArrayBenchmark(state, Encoding::PLAIN); +} + +static void BM_DeltaLengthDecodingByteArray(benchmark::State& state) { + DecodingByteArrayBenchmark(state, Encoding::DELTA_LENGTH_BYTE_ARRAY); +} + +BENCHMARK(BM_PlainEncodingByteArray)->Apply(ByteArrayCustomArguments); +BENCHMARK(BM_DeltaLengthEncodingByteArray)->Apply(ByteArrayCustomArguments); +BENCHMARK(BM_PlainDecodingByteArray)->Apply(ByteArrayCustomArguments); +BENCHMARK(BM_DeltaLengthDecodingByteArray)->Apply(ByteArrayCustomArguments); + +static void BM_DecodingByteArraySpaced(benchmark::State& state, Encoding::type encoding) { + const double null_percent = 0.02; + + auto rand = ::arrow::random::RandomArrayGenerator(0); + int32_t max_length = static_cast(state.range(0)); + int32_t num_values = static_cast(state.range(1)); + const auto array = rand.String(num_values, /* min_length */ 0, + /* max_length */ max_length, null_percent); + const auto valid_bits = array->null_bitmap_data(); + const int null_count = static_cast(array->null_count()); + const auto array_actual = + ::arrow::internal::checked_pointer_cast<::arrow::StringArray>(array); + + std::vector byte_arrays; + byte_arrays.reserve(array_actual->length()); + for (int i = 0; i < array_actual->length(); ++i) { + byte_arrays.emplace_back(array_actual->GetView(i)); + } + + auto encoder = MakeTypedEncoder(encoding); + encoder->PutSpaced(byte_arrays.data(), num_values, valid_bits, 0); + std::shared_ptr buf = encoder->FlushValues(); + + auto decoder = MakeTypedDecoder(encoding); + std::vector decode_values(num_values * sizeof(ByteArray)); + auto decode_buf = reinterpret_cast(decode_values.data()); + for (auto _ : state) { + decoder->SetData(num_values - null_count, buf->data(), static_cast(buf->size())); + decoder->DecodeSpaced(decode_buf, num_values, null_count, valid_bits, 0); + ::benchmark::DoNotOptimize(decode_buf); + } + state.counters["null_percent"] = null_percent * 100; + state.SetItemsProcessed(state.iterations() * array_actual->length()); + state.SetBytesProcessed(state.iterations() * (array_actual->value_data()->size() + + array_actual->value_offsets()->size())); +} + +static void BM_PlainDecodingSpacedByteArray(benchmark::State& state) { + BM_DecodingByteArraySpaced(state, Encoding::PLAIN); +} + +static void BM_DeltaLengthDecodingSpacedByteArray(benchmark::State& state) { + BM_DecodingByteArraySpaced(state, Encoding::DELTA_LENGTH_BYTE_ARRAY); +} + +BENCHMARK(BM_PlainDecodingSpacedByteArray)->Apply(ByteArrayCustomArguments); +BENCHMARK(BM_DeltaLengthDecodingSpacedByteArray)->Apply(ByteArrayCustomArguments); + template static void DecodeDict(std::vector& values, benchmark::State& state) { @@ -634,6 +761,29 @@ static void BM_DictDecodingInt64_literals(benchmark::State& state) { BENCHMARK(BM_DictDecodingInt64_literals)->Range(MIN_RANGE, MAX_RANGE); +static void BM_DictDecodingByteArray(benchmark::State& state) { + ::arrow::random::RandomArrayGenerator rag(0); + // Using arrow generator to generate random data. + int32_t max_length = static_cast(state.range(0)); + int32_t array_size = static_cast(state.range(1)); + auto array = + rag.String(/* size */ array_size, /* min_length */ 0, /* max_length */ max_length, + /* null_probability */ 0); + const auto array_actual = + ::arrow::internal::checked_pointer_cast<::arrow::StringArray>(array); + auto encoder = MakeDictDecoder(); + std::vector values; + for (int i = 0; i < array_actual->length(); ++i) { + values.emplace_back(array_actual->GetView(i)); + } + DecodeDict(values, state); + state.SetItemsProcessed(state.iterations() * array_actual->length()); + state.SetBytesProcessed(state.iterations() * (array_actual->value_data()->size() + + array_actual->value_offsets()->size())); +} + +BENCHMARK(BM_DictDecodingByteArray)->Apply(ByteArrayCustomArguments); + // ---------------------------------------------------------------------- // Shared benchmarks for decoding using arrow builders