diff --git a/velox/dwio/common/CMakeLists.txt b/velox/dwio/common/CMakeLists.txt index 2ce0e670ad6..165a5b41d25 100644 --- a/velox/dwio/common/CMakeLists.txt +++ b/velox/dwio/common/CMakeLists.txt @@ -60,6 +60,7 @@ velox_add_library( SelectiveStructColumnReader.cpp SortingWriter.cpp SortingWriter.h + StatisticsBuilder.cpp Throttler.cpp TypeUtils.cpp TypeWithId.cpp @@ -76,6 +77,7 @@ velox_link_libraries( velox_common_io velox_common_compression velox_common_config + velox_common_hyperloglog velox_dwio_common_encryption velox_dwio_common_exception velox_exception diff --git a/velox/dwio/common/Statistics.h b/velox/dwio/common/Statistics.h index bffaaaaea3d..2498ca6f71d 100644 --- a/velox/dwio/common/Statistics.h +++ b/velox/dwio/common/Statistics.h @@ -142,6 +142,12 @@ class ColumnStatistics { numDistinct_ = count; } + /// Returns true if there are no non-null values (value count is known to be + /// zero). + bool isAllNull() const { + return valueCount_.has_value() && valueCount_.value() == 0; + } + /** * return string representation of this stats object */ diff --git a/velox/dwio/common/StatisticsBuilder.cpp b/velox/dwio/common/StatisticsBuilder.cpp new file mode 100644 index 00000000000..9d0b41cc6d5 --- /dev/null +++ b/velox/dwio/common/StatisticsBuilder.cpp @@ -0,0 +1,450 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/dwio/common/StatisticsBuilder.h" + +namespace facebook::velox::dwio::stats { + +// Import column statistics types from dwio::common. +using common::BinaryColumnStatistics; +using common::BooleanColumnStatistics; +using common::ColumnStatistics; +using common::DoubleColumnStatistics; +using common::IntegerColumnStatistics; +using common::StringColumnStatistics; + +namespace { + +template +void addWithOverflowCheck(std::optional& to, T value, uint64_t count) { + if (to.has_value()) { + T result; + auto overflow = __builtin_mul_overflow(value, count, &result); + if (!overflow) { + overflow = __builtin_add_overflow(to.value(), result, &to.value()); + } + if (overflow) { + to.reset(); + } + } +} + +template +void mergeWithOverflowCheck( + std::optional& to, + const std::optional& from) { + if (to.has_value()) { + if (from.has_value()) { + auto overflow = + __builtin_add_overflow(to.value(), from.value(), &to.value()); + if (overflow) { + to.reset(); + } + } else { + to.reset(); + } + } +} + +template +void mergeCount(std::optional& to, const std::optional& from) { + if (to.has_value()) { + if (from.has_value()) { + to.value() += from.value(); + } else { + to.reset(); + } + } +} + +template +void mergeMin(std::optional& to, const std::optional& from) { + if (to.has_value()) { + if (!from.has_value()) { + to.reset(); + } else if (from.value() < to.value()) { + to = from; + } + } +} + +template +void mergeMax(std::optional& to, const std::optional& from) { + if (to.has_value()) { + if (!from.has_value()) { + to.reset(); + } else if (from.value() > to.value()) { + to = from; + } + } +} + +bool isValidLength(const std::optional& length) { + return length.has_value() && + length.value() <= std::numeric_limits::max(); +} + +bool shouldKeepString( + const std::optional& val, + uint32_t lengthLimit) { + return val.has_value() && val.value().size() <= lengthLimit; +} + +} // namespace + +std::unique_ptr StatisticsBuilder::build() const { + auto result = std::make_unique( + valueCount_, hasNull_, rawSize_, size_, estimateNumDistinct()); + + // For the base builder, there are no typed stats to add. + return result; +} + +void StatisticsBuilder::incrementSize(uint64_t size) { + if (LIKELY(size_.has_value())) { + addWithOverflowCheck(size_, size, /*count=*/1); + } +} + +void StatisticsBuilder::merge(const ColumnStatistics& other, bool ignoreSize) { + mergeCount(valueCount_, other.getNumberOfValues()); + + if (!hasNull_.has_value() || !hasNull_.value()) { + auto otherHasNull = other.hasNull(); + if (otherHasNull.has_value()) { + if (otherHasNull.value()) { + hasNull_ = true; + } + } else if (hasNull_.has_value()) { + hasNull_.reset(); + } + } + mergeCount(rawSize_, other.getRawSize()); + if (!ignoreSize) { + mergeCount(size_, other.getSize()); + } + if (hll_) { + auto* otherBuilder = dynamic_cast(&other); + VELOX_CHECK_NOT_NULL(otherBuilder); + VELOX_CHECK_NOT_NULL(otherBuilder->hll_); + hll_->mergeWith(*otherBuilder->hll_); + } +} + +std::unique_ptr StatisticsBuilder::create( + const Type& type, + const StatisticsBuilderOptions& options) { + switch (type.kind()) { + case TypeKind::BOOLEAN: + return std::make_unique(options); + case TypeKind::TINYINT: + case TypeKind::SMALLINT: + case TypeKind::INTEGER: + case TypeKind::BIGINT: + return std::make_unique(options); + case TypeKind::REAL: + case TypeKind::DOUBLE: + return std::make_unique(options); + case TypeKind::VARCHAR: + return std::make_unique(options); + case TypeKind::VARBINARY: + return std::make_unique(options); + default: + return std::make_unique(options); + } +} + +void StatisticsBuilder::createTree( + std::vector>& statBuilders, + const Type& type, + const StatisticsBuilderOptions& options) { + auto kind = type.kind(); + switch (kind) { + case TypeKind::BOOLEAN: + case TypeKind::TINYINT: + case TypeKind::SMALLINT: + case TypeKind::INTEGER: + case TypeKind::BIGINT: + case TypeKind::REAL: + case TypeKind::DOUBLE: + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: + case TypeKind::TIMESTAMP: + statBuilders.push_back(StatisticsBuilder::create(type, options)); + break; + + case TypeKind::ARRAY: { + statBuilders.push_back(StatisticsBuilder::create(type, options)); + const auto& arrayType = dynamic_cast(type); + createTree(statBuilders, *arrayType.elementType(), options); + break; + } + + case TypeKind::MAP: { + statBuilders.push_back(StatisticsBuilder::create(type, options)); + const auto& mapType = dynamic_cast(type); + createTree(statBuilders, *mapType.keyType(), options); + createTree(statBuilders, *mapType.valueType(), options); + break; + } + + case TypeKind::ROW: { + statBuilders.push_back(StatisticsBuilder::create(type, options)); + const auto& rowType = dynamic_cast(type); + for (const auto& childType : rowType.children()) { + createTree(statBuilders, *childType, options); + } + break; + } + default: + VELOX_FAIL("Not supported type: {}", kind); + break; + } +} + +void BooleanStatisticsBuilder::addValues(bool value, uint64_t count) { + increaseValueCount(count); + if (trueCount_.has_value() && value) { + trueCount_.value() += count; + } +} + +void BooleanStatisticsBuilder::merge( + const ColumnStatistics& other, + bool ignoreSize) { + StatisticsBuilder::merge(other, ignoreSize); + auto stats = dynamic_cast(&other); + if (!stats) { + if (!other.isAllNull() && trueCount_.has_value()) { + trueCount_.reset(); + } + return; + } + mergeCount(trueCount_, stats->getTrueCount()); +} + +std::unique_ptr BooleanStatisticsBuilder::build() const { + auto trueCount = isAllNull() ? std::nullopt : trueCount_; + auto result = std::make_unique( + static_cast(*this), trueCount); + if (auto numDistinct = estimateNumDistinct()) { + result->setNumDistinct(*numDistinct); + } + return result; +} + +void IntegerStatisticsBuilder::addValues(int64_t value, uint64_t count) { + increaseValueCount(count); + if (min_.has_value() && value < min_.value()) { + min_ = value; + } + if (max_.has_value() && value > max_.value()) { + max_ = value; + } + addWithOverflowCheck(sum_, value, count); + addHash(value); +} + +void IntegerStatisticsBuilder::merge( + const ColumnStatistics& other, + bool ignoreSize) { + StatisticsBuilder::merge(other, ignoreSize); + auto stats = dynamic_cast(&other); + if (!stats) { + if (!other.isAllNull()) { + min_.reset(); + max_.reset(); + sum_.reset(); + } + return; + } + mergeMin(min_, stats->getMinimum()); + mergeMax(max_, stats->getMaximum()); + mergeWithOverflowCheck(sum_, stats->getSum()); +} + +std::unique_ptr IntegerStatisticsBuilder::build() const { + auto min = isAllNull() ? std::nullopt : min_; + auto max = isAllNull() ? std::nullopt : max_; + auto sum = isAllNull() ? std::nullopt : sum_; + auto result = std::make_unique( + static_cast(*this), min, max, sum); + if (auto numDistinct = estimateNumDistinct()) { + result->setNumDistinct(*numDistinct); + } + return result; +} + +void DoubleStatisticsBuilder::addValues(double value, uint64_t count) { + increaseValueCount(count); + if (std::isnan(value)) { + clear(); + return; + } + + if (min_.has_value() && value < min_.value()) { + min_ = value; + } + if (max_.has_value() && value > max_.value()) { + max_ = value; + } + addHash(value); + if (sum_.has_value()) { + for (uint64_t i = 0; i < count; ++i) { + sum_.value() += value; + } + if (std::isnan(sum_.value())) { + sum_.reset(); + } + } +} + +void DoubleStatisticsBuilder::merge( + const ColumnStatistics& other, + bool ignoreSize) { + StatisticsBuilder::merge(other, ignoreSize); + auto stats = dynamic_cast(&other); + if (!stats) { + if (!other.isAllNull()) { + clear(); + } + return; + } + mergeMin(min_, stats->getMinimum()); + mergeMax(max_, stats->getMaximum()); + mergeCount(sum_, stats->getSum()); + if (sum_.has_value() && std::isnan(sum_.value())) { + sum_.reset(); + } +} + +std::unique_ptr DoubleStatisticsBuilder::build() const { + auto min = isAllNull() ? std::nullopt : min_; + auto max = isAllNull() ? std::nullopt : max_; + auto sum = isAllNull() ? std::nullopt : sum_; + auto result = std::make_unique( + static_cast(*this), min, max, sum); + if (auto numDistinct = estimateNumDistinct()) { + result->setNumDistinct(*numDistinct); + } + return result; +} + +void StringStatisticsBuilder::addValues( + std::string_view value, + uint64_t count) { + auto isSelfEmpty = isAllNull(); + increaseValueCount(count); + if (isSelfEmpty) { + min_ = value; + max_ = value; + } else { + if (min_.has_value() && value < std::string_view{min_.value()}) { + min_ = value; + } + if (max_.has_value() && value > std::string_view{max_.value()}) { + max_ = value; + } + } + addHash(value); + + addWithOverflowCheck(length_, value.size(), count); +} + +void StringStatisticsBuilder::merge( + const ColumnStatistics& other, + bool ignoreSize) { + auto isSelfEmpty = isAllNull(); + StatisticsBuilder::merge(other, ignoreSize); + auto stats = dynamic_cast(&other); + if (!stats) { + if (!other.isAllNull()) { + min_.reset(); + max_.reset(); + length_.reset(); + } + return; + } + + if (other.isAllNull()) { + return; + } + + if (isSelfEmpty) { + min_ = stats->getMinimum(); + max_ = stats->getMaximum(); + } else { + mergeMin(min_, stats->getMinimum()); + mergeMax(max_, stats->getMaximum()); + } + + mergeWithOverflowCheck(length_, stats->getTotalLength()); +} + +std::unique_ptr StringStatisticsBuilder::build() const { + std::optional min; + std::optional max; + std::optional length; + if (!isAllNull()) { + if (shouldKeepString(min_, lengthLimit_)) { + min = min_; + } + if (shouldKeepString(max_, lengthLimit_)) { + max = max_; + } + if (isValidLength(length_)) { + length = length_.value(); + } + } + auto result = std::make_unique( + static_cast(*this), min, max, length); + if (auto numDistinct = estimateNumDistinct()) { + result->setNumDistinct(*numDistinct); + } + return result; +} + +void BinaryStatisticsBuilder::addValues(uint64_t length, uint64_t count) { + increaseValueCount(count); + addWithOverflowCheck(length_, length, count); +} + +void BinaryStatisticsBuilder::merge( + const ColumnStatistics& other, + bool ignoreSize) { + StatisticsBuilder::merge(other, ignoreSize); + auto stats = dynamic_cast(&other); + if (!stats) { + if (!other.isAllNull() && length_.has_value()) { + length_.reset(); + } + return; + } + mergeWithOverflowCheck(length_, stats->getTotalLength()); +} + +std::unique_ptr BinaryStatisticsBuilder::build() const { + auto length = + (!isAllNull() && isValidLength(length_)) ? length_ : std::nullopt; + auto result = std::make_unique( + static_cast(*this), length); + if (auto numDistinct = estimateNumDistinct()) { + result->setNumDistinct(*numDistinct); + } + return result; +} + +} // namespace facebook::velox::dwio::stats diff --git a/velox/dwio/common/StatisticsBuilder.h b/velox/dwio/common/StatisticsBuilder.h new file mode 100644 index 00000000000..61319f73364 --- /dev/null +++ b/velox/dwio/common/StatisticsBuilder.h @@ -0,0 +1,334 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "velox/common/base/Exceptions.h" +#include "velox/common/hyperloglog/SparseHll.h" +#include "velox/dwio/common/Statistics.h" +#include "velox/type/Type.h" + +namespace facebook::velox::dwio::stats { + +// Import column statistics types from dwio::common. +using common::BinaryColumnStatistics; +using common::BooleanColumnStatistics; +using common::ColumnStatistics; +using common::DoubleColumnStatistics; +using common::IntegerColumnStatistics; +using common::StringColumnStatistics; + +/// Options for creating StatisticsBuilder instances. +struct StatisticsBuilderOptions { + explicit StatisticsBuilderOptions( + uint32_t stringLengthLimit, + std::optional initialSize = std::nullopt, + bool countDistincts = false, + HashStringAllocator* allocator = nullptr) + : stringLengthLimit{stringLengthLimit}, + initialSize{initialSize}, + countDistincts(countDistincts), + allocator(allocator) {} + + /// Maximum length of min/max string values to track. Strings longer than + /// this limit are dropped from statistics. + uint32_t stringLengthLimit; + + /// Initial value for the size statistic (total stream bytes). Nullopt means + /// size tracking is disabled until ensureSize() is called. + std::optional initialSize; + + /// Whether to count approximate distinct values using HyperLogLog. Requires + /// 'allocator' to be set. + bool countDistincts{false}; + + /// Allocator for HyperLogLog distinct counting. Required if 'countDistincts' + /// is true. + HashStringAllocator* allocator; + + /// Returns a copy with distinct counting disabled. + StatisticsBuilderOptions dropNumDistinct() const { + return StatisticsBuilderOptions(stringLengthLimit, initialSize); + } +}; + +/// Base class for stats builder. Stats builder is used in writer and file merge +/// to collect and merge stats. +/// It can also be used for gathering stats in ad hoc sampling. In this case it +/// may also count distinct values if enabled in 'options'. +class StatisticsBuilder : public virtual ColumnStatistics { + public: + explicit StatisticsBuilder(const StatisticsBuilderOptions& options) + : options_{options} { + VELOX_CHECK( + !options.countDistincts || options.allocator != nullptr, + "allocator is required when countDistincts is true"); + init(); + } + + ~StatisticsBuilder() override = default; + + void setHasNull() { + hasNull_ = true; + } + + void increaseValueCount(uint64_t count = 1) { + if (valueCount_.has_value()) { + valueCount_.value() += count; + } + } + + void increaseRawSize(uint64_t rawSize) { + if (rawSize_.has_value()) { + rawSize_.value() += rawSize; + } + } + + void clearRawSize() { + rawSize_.reset(); + } + + void ensureSize() { + if (!size_.has_value()) { + size_ = 0; + } + } + + void incrementSize(uint64_t size); + + template + void addHash(const T& data) { + if (hll_) { + hll_->insertHash(folly::hasher()(data)); + } + } + + int64_t cardinality() const { + VELOX_CHECK_NOT_NULL(hll_); + return hll_->cardinality(); + } + + /// Returns estimated number of distinct values if distinct counting is + /// enabled, or std::nullopt otherwise. + std::optional estimateNumDistinct() const { + if (hll_) { + return hll_->cardinality(); + } + return std::nullopt; + } + + /// Merges stats of same type. Used in writer to aggregate file level stats. + virtual void merge(const ColumnStatistics& other, bool ignoreSize = false); + + /// Resets to initial state. Used where row index entry level stats is + /// captured. + virtual void reset() { + init(); + } + + /// Builds a read-only ColumnStatistics snapshot. Typed stats (min/max/sum) + /// are omitted when isAllNull(). String min/max are omitted when they exceed + /// the string length limit. + virtual std::unique_ptr build() const; + + /// Creates a StatisticsBuilder for the given type. For MAP type, creates a + /// base StatisticsBuilder (not a MapStatisticsBuilder, which stays in DWRF). + static std::unique_ptr create( + const Type& type, + const StatisticsBuilderOptions& options); + + /// For the given type tree, creates a list of stat builders. + static void createTree( + std::vector>& statBuilders, + const Type& type, + const StatisticsBuilderOptions& options); + + private: + void init() { + valueCount_ = 0; + hasNull_ = false; + rawSize_ = 0; + size_ = options_.initialSize; + if (options_.countDistincts) { + hll_ = + std::make_shared>(options_.allocator); + } + } + + protected: + StatisticsBuilderOptions options_; + std::shared_ptr> hll_; +}; + +class BooleanStatisticsBuilder : public virtual StatisticsBuilder, + public BooleanColumnStatistics { + public: + explicit BooleanStatisticsBuilder(const StatisticsBuilderOptions& options) + : StatisticsBuilder{options.dropNumDistinct()} { + init(); + } + + ~BooleanStatisticsBuilder() override = default; + + void addValues(bool value, uint64_t count = 1); + + std::unique_ptr build() const override; + + void merge(const ColumnStatistics& other, bool ignoreSize = false) override; + + void reset() override { + StatisticsBuilder::reset(); + init(); + } + + private: + void init() { + trueCount_ = 0; + } +}; + +class IntegerStatisticsBuilder : public virtual StatisticsBuilder, + public IntegerColumnStatistics { + public: + explicit IntegerStatisticsBuilder(const StatisticsBuilderOptions& options) + : StatisticsBuilder{options} { + init(); + } + + ~IntegerStatisticsBuilder() override = default; + + void addValues(int64_t value, uint64_t count = 1); + + std::unique_ptr build() const override; + + void merge(const ColumnStatistics& other, bool ignoreSize = false) override; + + void reset() override { + StatisticsBuilder::reset(); + init(); + } + + private: + void init() { + min_ = std::numeric_limits::max(); + max_ = std::numeric_limits::min(); + sum_ = 0; + } +}; + +static_assert( + std::numeric_limits::has_infinity, + "infinity not defined"); + +class DoubleStatisticsBuilder : public virtual StatisticsBuilder, + public DoubleColumnStatistics { + public: + explicit DoubleStatisticsBuilder(const StatisticsBuilderOptions& options) + : StatisticsBuilder{options} { + init(); + } + + ~DoubleStatisticsBuilder() override = default; + + void addValues(double value, uint64_t count = 1); + + std::unique_ptr build() const override; + + void merge(const ColumnStatistics& other, bool ignoreSize = false) override; + + void reset() override { + StatisticsBuilder::reset(); + init(); + } + + private: + void init() { + min_ = std::numeric_limits::infinity(); + max_ = -std::numeric_limits::infinity(); + sum_ = 0; + } + + void clear() { + min_.reset(); + max_.reset(); + sum_.reset(); + } +}; + +class StringStatisticsBuilder : public virtual StatisticsBuilder, + public StringColumnStatistics { + public: + explicit StringStatisticsBuilder(const StatisticsBuilderOptions& options) + : StatisticsBuilder{options}, lengthLimit_{options.stringLengthLimit} { + init(); + } + + ~StringStatisticsBuilder() override = default; + + void addValues(std::string_view value, uint64_t count = 1); + + std::unique_ptr build() const override; + + void merge(const ColumnStatistics& other, bool ignoreSize = false) override; + + void reset() override { + StatisticsBuilder::reset(); + init(); + } + + protected: + uint32_t lengthLimit_; + + bool shouldKeep(const std::optional& val) const { + return val.has_value() && val.value().size() <= lengthLimit_; + } + + private: + void init() { + min_.reset(); + max_.reset(); + length_ = 0; + } +}; + +class BinaryStatisticsBuilder : public virtual StatisticsBuilder, + public BinaryColumnStatistics { + public: + explicit BinaryStatisticsBuilder(const StatisticsBuilderOptions& options) + : StatisticsBuilder{options.dropNumDistinct()} { + init(); + } + + ~BinaryStatisticsBuilder() override = default; + + void addValues(uint64_t length, uint64_t count = 1); + + std::unique_ptr build() const override; + + void merge(const ColumnStatistics& other, bool ignoreSize = false) override; + + void reset() override { + StatisticsBuilder::reset(); + init(); + } + + private: + void init() { + length_ = 0; + } +}; + +} // namespace facebook::velox::dwio::stats diff --git a/velox/dwio/dwrf/common/Statistics.h b/velox/dwio/dwrf/common/Statistics.h index 864fb08db37..1fb1e78c2b3 100644 --- a/velox/dwio/dwrf/common/Statistics.h +++ b/velox/dwio/dwrf/common/Statistics.h @@ -19,7 +19,6 @@ #include "velox/dwio/common/Statistics.h" #include "velox/dwio/dwrf/common/Common.h" #include "velox/dwio/dwrf/common/FileMetadata.h" -#include "velox/dwio/dwrf/common/wrap/dwrf-proto-wrapper.h" namespace facebook::velox::dwrf { diff --git a/velox/dwio/dwrf/test/E2EWriterTest.cpp b/velox/dwio/dwrf/test/E2EWriterTest.cpp index 771a8078f70..6a448bea0dd 100644 --- a/velox/dwio/dwrf/test/E2EWriterTest.cpp +++ b/velox/dwio/dwrf/test/E2EWriterTest.cpp @@ -30,6 +30,7 @@ #include "velox/dwio/dwrf/reader/DwrfReader.h" #include "velox/dwio/dwrf/test/OrcTest.h" #include "velox/dwio/dwrf/test/utils/E2EWriterTestUtil.h" +#include "velox/dwio/dwrf/writer/StatisticsBuilder.h" #include "velox/type/fbhive/HiveTypeParser.h" #include "velox/vector/fuzzer/VectorFuzzer.h" #include "velox/vector/tests/utils/VectorMaker.h" @@ -205,7 +206,7 @@ class E2EWriterTest : public testing::Test { dwrf::EncodingKey seqEk(valueTypeId, sequence); const auto& keyInfo = stripeStreams.getEncoding(seqEk).key(); - auto key = dwrf::constructKey(keyInfo); + auto key = dwrf::MapStatisticsBuilder::constructKey(keyInfo); sequenceToKey.emplace(sequence, key); }); @@ -235,7 +236,8 @@ class E2EWriterTest : public testing::Test { const auto& entry = stats.mapStatistics().stats(i); ASSERT_TRUE(entry.stats().has_size()); EXPECT_EQ( - featureStreamSizes.at(dwrf::constructKey(entry.key())), + featureStreamSizes.at( + dwrf::MapStatisticsBuilder::constructKey(entry.key())), entry.stats().size()); } } diff --git a/velox/dwio/dwrf/writer/ColumnWriter.h b/velox/dwio/dwrf/writer/ColumnWriter.h index ec7c8f09ab1..9fde39f331a 100644 --- a/velox/dwio/dwrf/writer/ColumnWriter.h +++ b/velox/dwio/dwrf/writer/ColumnWriter.h @@ -192,7 +192,7 @@ class BaseColumnWriter : public ColumnWriter { createBooleanRleEncoder(newStream(StreamKind::StreamKind_PRESENT)); } const auto options = - StatisticsBuilderOptions::fromConfig(context.getConfigs()); + StatisticsBuilder::optionsFromConfig(context.getConfigs()); indexStatsBuilder_ = StatisticsBuilder::create(*type.type(), options); fileStatsBuilder_ = StatisticsBuilder::create(*type.type(), options); } diff --git a/velox/dwio/dwrf/writer/FlatMapColumnWriter.cpp b/velox/dwio/dwrf/writer/FlatMapColumnWriter.cpp index 65bea8d379f..30ffefce382 100644 --- a/velox/dwio/dwrf/writer/FlatMapColumnWriter.cpp +++ b/velox/dwio/dwrf/writer/FlatMapColumnWriter.cpp @@ -61,7 +61,7 @@ FlatMapColumnWriter::FlatMapColumnWriter( valueType_{*type.childAt(1)}, maxKeyCount_{context_.getConfig(Config::MAP_FLAT_MAX_KEYS)}, collectMapStats_{context.getConfig(Config::MAP_STATISTICS)} { - auto options = StatisticsBuilderOptions::fromConfig(context.getConfigs()); + auto options = StatisticsBuilder::optionsFromConfig(context.getConfigs()); keyFileStatsBuilder_ = std::unique_ptr::StatisticsBuilder>( dynamic_cast::StatisticsBuilder*>( diff --git a/velox/dwio/dwrf/writer/FlatMapColumnWriter.h b/velox/dwio/dwrf/writer/FlatMapColumnWriter.h index ab92e520b76..3394ab8a9bc 100644 --- a/velox/dwio/dwrf/writer/FlatMapColumnWriter.h +++ b/velox/dwio/dwrf/writer/FlatMapColumnWriter.h @@ -38,7 +38,7 @@ class ValueStatisticsBuilder { static std::unique_ptr create( WriterContext& context, const dwio::common::TypeWithId& root) { - auto options = StatisticsBuilderOptions::fromConfig(context.getConfigs()); + auto options = StatisticsBuilder::optionsFromConfig(context.getConfigs()); return create_(context, root, options); } diff --git a/velox/dwio/dwrf/writer/StatisticsBuilder.cpp b/velox/dwio/dwrf/writer/StatisticsBuilder.cpp index b716d1aa71a..20d6989d9ef 100644 --- a/velox/dwio/dwrf/writer/StatisticsBuilder.cpp +++ b/velox/dwio/dwrf/writer/StatisticsBuilder.cpp @@ -24,100 +24,33 @@ using dwio::common::ArenaCreate; namespace { -static bool isValidLength(const std::optional& length) { +bool isValidLength(const std::optional& length) { return length.has_value() && length.value() <= std::numeric_limits::max(); } -template -static void mergeCount(std::optional& to, const std::optional& from) { - if (to.has_value()) { - if (from.has_value()) { - to.value() += from.value(); - } else { - to.reset(); - } +// Serializes base ColumnStatistics fields to proto. +void baseToProto( + const dwio::common::ColumnStatistics& builder, + ColumnStatisticsWriteWrapper& stats) { + if (builder.hasNull().has_value()) { + stats.setHasNull(builder.hasNull().value()); } -} - -template -static void mergeMin(std::optional& to, const std::optional& from) { - if (to.has_value()) { - if (!from.has_value()) { - to.reset(); - } else if (from.value() < to.value()) { - to = from; - } + if (builder.getNumberOfValues().has_value()) { + stats.setNumberOfValues(builder.getNumberOfValues().value()); } -} - -template -static void mergeMax(std::optional& to, const std::optional& from) { - if (to.has_value()) { - if (!from.has_value()) { - to.reset(); - } else if (from.value() > to.value()) { - to = from; - } + if (builder.getRawSize().has_value()) { + stats.setRawSize(builder.getRawSize().value()); + } + if (builder.getSize().has_value()) { + stats.setSize(builder.getSize().value()); } } } // namespace -void StatisticsBuilder::merge( - const dwio::common::ColumnStatistics& other, - bool ignoreSize) { - // Merge valueCount_ only if both sides have it. Otherwise, reset. - mergeCount(valueCount_, other.getNumberOfValues()); - - // Merge hasNull_. Follow below rule: - // self / other => result - // true / any => true - // unknown / true => true - // unknown / unknown or false => unknown - // false / unknown => unknown - // false / false => false - // false / true => true - if (!hasNull_.has_value() || !hasNull_.value()) { - auto otherHasNull = other.hasNull(); - if (otherHasNull.has_value()) { - if (otherHasNull.value()) { - // other is true, set to true - hasNull_ = true; - } - // when other is false, no change is needed - } else if (hasNull_.has_value()) { - // self value is false and other is unknown, set to unknown - hasNull_.reset(); - } - } - // Merge rawSize_ the way similar to valueCount_ - mergeCount(rawSize_, other.getRawSize()); - if (!ignoreSize) { - // Merge size - mergeCount(size_, other.getSize()); - } - if (hll_) { - auto* otherBuilder = dynamic_cast(&other); - VELOX_CHECK_NOT_NULL(otherBuilder); - VELOX_CHECK_NOT_NULL(otherBuilder->hll_); - hll_->mergeWith(*otherBuilder->hll_); - } -} - void StatisticsBuilder::toProto(ColumnStatisticsWriteWrapper& stats) const { - if (hasNull_.has_value()) { - stats.setHasNull(hasNull_.value()); - } - if (valueCount_.has_value()) { - stats.setNumberOfValues(valueCount_.value()); - } - if (rawSize_.has_value()) { - stats.setRawSize(rawSize_.value()); - } - if (size_.has_value()) { - stats.setSize(size_.value()); - } + baseToProto(*this, stats); } std::unique_ptr StatisticsBuilder::build() @@ -214,67 +147,22 @@ void StatisticsBuilder::createTree( DWIO_RAISE("Not supported type: ", kind); break; } - return; -}; - -void BooleanStatisticsBuilder::merge( - const dwio::common::ColumnStatistics& other, - bool ignoreSize) { - StatisticsBuilder::merge(other, ignoreSize); - auto stats = - dynamic_cast(&other); - if (!stats) { - // We only care about the case when type specific stats is missing yet - // it has non-null values. - if (!isEmpty(other) && trueCount_.has_value()) { - trueCount_.reset(); - } - return; - } - - // Now the case when both sides have type specific stats - mergeCount(trueCount_, stats->getTrueCount()); } void BooleanStatisticsBuilder::toProto( ColumnStatisticsWriteWrapper& stats) const { - StatisticsBuilder::toProto(stats); - // Serialize type specific stats only if there is non-null values - if (!isEmpty(*this) && trueCount_.has_value()) { + baseToProto(*this, stats); + if (!isAllNull() && trueCount_.has_value()) { auto bStats = stats.mutableBucketStatistics(); DWIO_ENSURE_EQ(bStats.countSize(), 0); bStats.addCount(trueCount_.value()); } } -void IntegerStatisticsBuilder::merge( - const dwio::common::ColumnStatistics& other, - bool ignoreSize) { - StatisticsBuilder::merge(other, ignoreSize); - auto stats = - dynamic_cast(&other); - if (!stats) { - // We only care about the case when type specific stats is missing yet - // it has non-null values. - if (!isEmpty(other)) { - min_.reset(); - max_.reset(); - sum_.reset(); - } - return; - } - - // Now the case when both sides have type specific stats - mergeMin(min_, stats->getMinimum()); - mergeMax(max_, stats->getMaximum()); - mergeWithOverflowCheck(sum_, stats->getSum()); -} - void IntegerStatisticsBuilder::toProto( ColumnStatisticsWriteWrapper& stats) const { - StatisticsBuilder::toProto(stats); - // Serialize type specific stats only if there is non-null values - if (!isEmpty(*this) && + baseToProto(*this, stats); + if (!isAllNull() && (min_.has_value() || max_.has_value() || sum_.has_value())) { auto iStats = stats.mutableIntegerStatistics(); if (min_.has_value()) { @@ -289,35 +177,10 @@ void IntegerStatisticsBuilder::toProto( } } -void DoubleStatisticsBuilder::merge( - const dwio::common::ColumnStatistics& other, - bool ignoreSize) { - StatisticsBuilder::merge(other, ignoreSize); - auto stats = - dynamic_cast(&other); - if (!stats) { - // We only care about the case when type specific stats is missing yet - // it has non-null values. - if (!isEmpty(other)) { - clear(); - } - return; - } - - // Now the case when both sides have type specific stats - mergeMin(min_, stats->getMinimum()); - mergeMax(max_, stats->getMaximum()); - mergeCount(sum_, stats->getSum()); - if (sum_.has_value() && std::isnan(sum_.value())) { - sum_.reset(); - } -} - void DoubleStatisticsBuilder::toProto( ColumnStatisticsWriteWrapper& stats) const { - StatisticsBuilder::toProto(stats); - // Serialize type specific stats only if there is non-null values - if (!isEmpty(*this) && + baseToProto(*this, stats); + if (!isAllNull() && (min_.has_value() || max_.has_value() || sum_.has_value())) { auto dStats = stats.mutableDoubleStatistics(); if (min_.has_value()) { @@ -332,49 +195,10 @@ void DoubleStatisticsBuilder::toProto( } } -void StringStatisticsBuilder::merge( - const dwio::common::ColumnStatistics& other, - bool ignoreSize) { - // min_/max_ is not initialized with default that can be compared against - // easily. So we need to capture whether self is empty and handle - // differently. - auto isSelfEmpty = isEmpty(*this); - StatisticsBuilder::merge(other, ignoreSize); - auto stats = - dynamic_cast(&other); - if (!stats) { - // We only care about the case when type specific stats is missing yet - // it has non-null values. - if (!isEmpty(other)) { - min_.reset(); - max_.reset(); - length_.reset(); - } - return; - } - - // If the other stats is empty, there is nothing to merge at string stats - // level. - if (isEmpty(other)) { - return; - } - - if (isSelfEmpty) { - min_ = stats->getMinimum(); - max_ = stats->getMaximum(); - } else { - mergeMin(min_, stats->getMinimum()); - mergeMax(max_, stats->getMaximum()); - } - - mergeWithOverflowCheck(length_, stats->getTotalLength()); -} - void StringStatisticsBuilder::toProto( ColumnStatisticsWriteWrapper& stats) const { - StatisticsBuilder::toProto(stats); - // If string value is too long, drop it and fall back to basic stats - if (!isEmpty(*this) && + baseToProto(*this, stats); + if (!isAllNull() && (shouldKeep(min_) || shouldKeep(max_) || isValidLength(length_))) { auto dStats = stats.mutableStringStatistics(); if (isValidLength(length_)) { @@ -391,43 +215,47 @@ void StringStatisticsBuilder::toProto( } } -void BinaryStatisticsBuilder::merge( - const dwio::common::ColumnStatistics& other, - bool ignoreSize) { - StatisticsBuilder::merge(other, ignoreSize); - auto stats = - dynamic_cast(&other); - if (!stats) { - // We only care about the case when type specific stats is missing yet - // it has non-null values. - if (!isEmpty(other) && length_.has_value()) { - length_.reset(); - } - return; - } - - mergeWithOverflowCheck(length_, stats->getTotalLength()); -} - void BinaryStatisticsBuilder::toProto( ColumnStatisticsWriteWrapper& stats) const { - StatisticsBuilder::toProto(stats); - // Serialize type specific stats only if there is non-null values - if (!isEmpty(*this) && isValidLength(length_)) { + baseToProto(*this, stats); + if (!isAllNull() && isValidLength(length_)) { auto bStats = stats.mutableBinaryStatistics(); bStats.setSum(length_.value()); } } +dwio::common::KeyInfo MapStatisticsBuilder::constructKey( + const dwrf::proto::KeyInfo& keyInfo) { + if (keyInfo.has_intkey()) { + return dwio::common::KeyInfo{keyInfo.intkey()}; + } else if (keyInfo.has_byteskey()) { + return dwio::common::KeyInfo{keyInfo.byteskey()}; + } + VELOX_UNREACHABLE("Illegal null key info"); +} + +void MapStatisticsBuilder::addValues( + const dwrf::proto::KeyInfo& keyInfo, + const StatisticsBuilder& stats) { + auto& keyStats = getKeyStats(MapStatisticsBuilder::constructKey(keyInfo)); + keyStats.merge(stats, /*ignoreSize=*/true); +} + +void MapStatisticsBuilder::incrementSize( + const dwrf::proto::KeyInfo& keyInfo, + uint64_t size) { + auto& keyStats = getKeyStats(MapStatisticsBuilder::constructKey(keyInfo)); + keyStats.ensureSize(); + keyStats.incrementSize(size); +} + void MapStatisticsBuilder::merge( const dwio::common::ColumnStatistics& other, bool ignoreSize) { StatisticsBuilder::merge(other, ignoreSize); auto stats = dynamic_cast(&other); if (!stats) { - // We only care about the case when type specific stats is missing yet - // it has non-null values. - if (!isEmpty(other) && !entryStatistics_.empty()) { + if (!other.isAllNull() && !entryStatistics_.empty()) { entryStatistics_.clear(); } return; @@ -440,12 +268,11 @@ void MapStatisticsBuilder::merge( void MapStatisticsBuilder::toProto(ColumnStatisticsWriteWrapper& stats) const { StatisticsBuilder::toProto(stats); - if (!isEmpty(*this) && !entryStatistics_.empty()) { + if (!isAllNull() && !entryStatistics_.empty()) { auto mapStats = stats.mutableMapStatistics(); for (const auto& entry : entryStatistics_) { auto entryStatistics = mapStats->add_stats(); const auto& key = entry.first; - // Sets the corresponding key. Leave null keys null. if (key.intKey.has_value()) { entryStatistics->mutable_key()->set_intkey(key.intKey.value()); } else if (key.bytesKey.has_value()) { diff --git a/velox/dwio/dwrf/writer/StatisticsBuilder.h b/velox/dwio/dwrf/writer/StatisticsBuilder.h index 497d7837746..7d278163455 100644 --- a/velox/dwio/dwrf/writer/StatisticsBuilder.h +++ b/velox/dwio/dwrf/writer/StatisticsBuilder.h @@ -16,426 +16,136 @@ #pragma once -#include -#include +#include "velox/dwio/common/StatisticsBuilder.h" #include "velox/dwio/dwrf/common/Config.h" #include "velox/dwio/dwrf/common/Statistics.h" #include "velox/dwio/dwrf/common/wrap/dwrf-proto-wrapper.h" -#include "velox/type/Type.h" namespace facebook::velox::dwrf { -namespace { -inline bool isEmpty(const dwio::common::ColumnStatistics& stats) { - auto valueCount = stats.getNumberOfValues(); - return valueCount.has_value() && valueCount.value() == 0; -} - -template -static void -addWithOverflowCheck(std::optional& to, T value, uint64_t count) { - if (to.has_value()) { - // check overflow. Value is only valid when not overflow - T result; - auto overflow = __builtin_mul_overflow(value, count, &result); - if (!overflow) { - overflow = __builtin_add_overflow(to.value(), result, &to.value()); - } - if (overflow) { - to.reset(); - } - } -} - -template -static void mergeWithOverflowCheck( - std::optional& to, - const std::optional& from) { - if (to.has_value()) { - if (from.has_value()) { - auto overflow = - __builtin_add_overflow(to.value(), from.value(), &to.value()); - if (overflow) { - to.reset(); - } - } else { - to.reset(); - } - } -} - -inline dwio::common::KeyInfo constructKey(const dwrf::proto::KeyInfo& keyInfo) { - if (keyInfo.has_intkey()) { - return dwio::common::KeyInfo{keyInfo.intkey()}; - } else if (keyInfo.has_byteskey()) { - return dwio::common::KeyInfo{keyInfo.byteskey()}; - } - VELOX_UNREACHABLE("Illegal null key info"); -} -} // namespace - -struct StatisticsBuilderOptions { - explicit StatisticsBuilderOptions( - uint32_t stringLengthLimit, - std::optional initialSize = std::nullopt, - bool countDistincts = false, - HashStringAllocator* allocator = nullptr) - : stringLengthLimit{stringLengthLimit}, - initialSize{initialSize}, - countDistincts(countDistincts), - allocator(allocator) {} - - uint32_t stringLengthLimit; - std::optional initialSize; - bool countDistincts{false}; - HashStringAllocator* allocator; - - StatisticsBuilderOptions withoutNumDistinct() const { - return StatisticsBuilderOptions(stringLengthLimit, initialSize); - } - - static StatisticsBuilderOptions fromConfig(const Config& config) { - return StatisticsBuilderOptions{config.get(Config::STRING_STATS_LIMIT)}; - } -}; +// Re-export common types into dwrf namespace for backward compatibility. +using dwio::stats::StatisticsBuilderOptions; -/* - * Base class for stats builder. Stats builder is used in writer and file merge - * to collect and merge stats. - * It can also be used for gathering stats in ad hoc sampling. In this case it - * may also count distinct values if enabled in 'options'. - */ -class StatisticsBuilder : public virtual dwio::common::ColumnStatistics { +/// DWRF-specific StatisticsBuilder that adds proto serialization and +/// proto-based build() on top of the common StatisticsBuilder. +class StatisticsBuilder : public virtual dwio::stats::StatisticsBuilder { public: - /// Constructs with 'options'. explicit StatisticsBuilder(const StatisticsBuilderOptions& options) - : options_{options}, arena_(std::make_unique()) { - init(); - } + : dwio::stats::StatisticsBuilder(options), + arena_(std::make_unique()) {} ~StatisticsBuilder() override = default; - void setHasNull() { - hasNull_ = true; - } - - void increaseValueCount(uint64_t count = 1) { - if (valueCount_.has_value()) { - valueCount_.value() += count; - } - } - - void increaseRawSize(uint64_t rawSize) { - if (rawSize_.has_value()) { - rawSize_.value() += rawSize; - } - } - - void clearRawSize() { - rawSize_.reset(); - } - - void ensureSize() { - if (!size_.has_value()) { - size_ = 0; - } - } - - void incrementSize(uint64_t size) { - if (LIKELY(size_.has_value())) { - addWithOverflowCheck(size_, size, /*count=*/1); - } - } - - template - void addHash(const T& data) { - if (hll_) { - hll_->insertHash(folly::hasher()(data)); - } - } - - int64_t cardinality() const { - VELOX_CHECK_NOT_NULL(hll_); - return hll_->cardinality(); - } - - /* - * Merge stats of same type. This is used in writer to aggregate file level - * stats. - */ - virtual void merge( - const dwio::common::ColumnStatistics& other, - bool ignoreSize = false); - - /* - * Reset. Used in the place where row index entry level stats in captured. - */ - virtual void reset() { - init(); - } - - /* - * Write stats to proto - */ + /// Serializes statistics to a proto wrapper. virtual void toProto(ColumnStatisticsWriteWrapper& stats) const; - std::unique_ptr build() const; + /// Builds a read-only ColumnStatistics by round-tripping through proto. + std::unique_ptr build() const override; + /// Creates a DWRF-specific StatisticsBuilder for the given type. For MAP + /// type, returns a MapStatisticsBuilder. static std::unique_ptr create( const Type& type, const StatisticsBuilderOptions& options); - // for the given type tree, create the a list of stat builders + /// For the given type tree, creates a list of DWRF stat builders. static void createTree( std::vector>& statBuilders, const Type& type, const StatisticsBuilderOptions& options); - private: - void init() { - valueCount_ = 0; - hasNull_ = false; - rawSize_ = 0; - size_ = options_.initialSize; - if (options_.countDistincts) { - hll_ = std::make_shared>(options_.allocator); - } + /// Creates StatisticsBuilderOptions from a DWRF Config. + static StatisticsBuilderOptions optionsFromConfig(const Config& config) { + return StatisticsBuilderOptions{config.get(Config::STRING_STATS_LIMIT)}; } - protected: - StatisticsBuilderOptions options_; - std::shared_ptr> hll_; + private: std::unique_ptr arena_; }; class BooleanStatisticsBuilder : public StatisticsBuilder, - public dwio::common::BooleanColumnStatistics { + public dwio::stats::BooleanStatisticsBuilder { public: explicit BooleanStatisticsBuilder(const StatisticsBuilderOptions& options) - : StatisticsBuilder{options.withoutNumDistinct()} { - init(); - } + : dwio::stats::StatisticsBuilder{options.dropNumDistinct()}, + StatisticsBuilder{options.dropNumDistinct()}, + dwio::stats::BooleanStatisticsBuilder{options} {} ~BooleanStatisticsBuilder() override = default; - void addValues(bool value, uint64_t count = 1) { - increaseValueCount(count); - if (trueCount_.has_value() && value) { - trueCount_.value() += count; - } - } - - void merge( - const dwio::common::ColumnStatistics& other, - bool ignoreSize = false) override; - - void reset() override { - StatisticsBuilder::reset(); - init(); + std::unique_ptr build() const override { + return StatisticsBuilder::build(); } void toProto(ColumnStatisticsWriteWrapper& stats) const override; - - private: - void init() { - trueCount_ = 0; - } }; class IntegerStatisticsBuilder : public StatisticsBuilder, - public dwio::common::IntegerColumnStatistics { + public dwio::stats::IntegerStatisticsBuilder { public: explicit IntegerStatisticsBuilder(const StatisticsBuilderOptions& options) - : StatisticsBuilder{options} { - init(); - } + : dwio::stats::StatisticsBuilder{options}, + StatisticsBuilder{options}, + dwio::stats::IntegerStatisticsBuilder{options} {} ~IntegerStatisticsBuilder() override = default; - void addValues(int64_t value, uint64_t count = 1) { - increaseValueCount(count); - if (min_.has_value() && value < min_.value()) { - min_ = value; - } - if (max_.has_value() && value > max_.value()) { - max_ = value; - } - addWithOverflowCheck(sum_, value, count); - addHash(value); - } - - void merge( - const dwio::common::ColumnStatistics& other, - bool ignoreSize = false) override; - - void reset() override { - StatisticsBuilder::reset(); - init(); + std::unique_ptr build() const override { + return StatisticsBuilder::build(); } void toProto(ColumnStatisticsWriteWrapper& stats) const override; - - private: - void init() { - min_ = std::numeric_limits::max(); - max_ = std::numeric_limits::min(); - sum_ = 0; - } }; -static_assert( - std::numeric_limits::has_infinity, - "infinity not defined"); - class DoubleStatisticsBuilder : public StatisticsBuilder, - public dwio::common::DoubleColumnStatistics { + public dwio::stats::DoubleStatisticsBuilder { public: explicit DoubleStatisticsBuilder(const StatisticsBuilderOptions& options) - : StatisticsBuilder{options} { - init(); - } + : dwio::stats::StatisticsBuilder{options}, + StatisticsBuilder{options}, + dwio::stats::DoubleStatisticsBuilder{options} {} ~DoubleStatisticsBuilder() override = default; - void addValues(double value, uint64_t count = 1) { - increaseValueCount(count); - // min/max/sum is defined only when none of the values added is NaN - if (std::isnan(value)) { - clear(); - return; - } - - if (min_.has_value() && value < min_.value()) { - min_ = value; - } - if (max_.has_value() && value > max_.value()) { - max_ = value; - } - addHash(value); - // value * count sometimes is not same as adding values (count) times. So - // add in a loop - if (sum_.has_value()) { - for (uint64_t i = 0; i < count; ++i) { - sum_.value() += value; - } - if (std::isnan(sum_.value())) { - sum_.reset(); - } - } - } - - void merge( - const dwio::common::ColumnStatistics& other, - bool ignoreSize = false) override; - - void reset() override { - StatisticsBuilder::reset(); - init(); + std::unique_ptr build() const override { + return StatisticsBuilder::build(); } void toProto(ColumnStatisticsWriteWrapper& stats) const override; - - private: - void init() { - min_ = std::numeric_limits::infinity(); - max_ = -std::numeric_limits::infinity(); - sum_ = 0; - } - - void clear() { - min_.reset(); - max_.reset(); - sum_.reset(); - } }; class StringStatisticsBuilder : public StatisticsBuilder, - public dwio::common::StringColumnStatistics { + public dwio::stats::StringStatisticsBuilder { public: explicit StringStatisticsBuilder(const StatisticsBuilderOptions& options) - : StatisticsBuilder{options}, lengthLimit_{options.stringLengthLimit} { - init(); - } + : dwio::stats::StatisticsBuilder{options}, + StatisticsBuilder{options}, + dwio::stats::StringStatisticsBuilder{options} {} ~StringStatisticsBuilder() override = default; - void addValues(std::string_view value, uint64_t count = 1) { - // min_/max_ is not initialized with default that can be compared against - // easily. So we need to capture whether self is empty and handle - // differently. - auto isSelfEmpty = isEmpty(*this); - increaseValueCount(count); - if (isSelfEmpty) { - min_ = value; - max_ = value; - } else { - if (min_.has_value() && value < std::string_view{min_.value()}) { - min_ = value; - } - if (max_.has_value() && value > std::string_view{max_.value()}) { - max_ = value; - } - } - addHash(value); - - addWithOverflowCheck(length_, value.size(), count); - } - - void merge( - const dwio::common::ColumnStatistics& other, - bool ignoreSize = false) override; - - void reset() override { - StatisticsBuilder::reset(); - init(); + std::unique_ptr build() const override { + return StatisticsBuilder::build(); } void toProto(ColumnStatisticsWriteWrapper& stats) const override; - - private: - uint32_t lengthLimit_; - - void init() { - min_.reset(); - max_.reset(); - length_ = 0; - } - - bool shouldKeep(const std::optional& val) const { - return val.has_value() && val.value().size() <= lengthLimit_; - } }; class BinaryStatisticsBuilder : public StatisticsBuilder, - public dwio::common::BinaryColumnStatistics { + public dwio::stats::BinaryStatisticsBuilder { public: explicit BinaryStatisticsBuilder(const StatisticsBuilderOptions& options) - : StatisticsBuilder{options.withoutNumDistinct()} { - init(); - } + : dwio::stats::StatisticsBuilder{options.dropNumDistinct()}, + StatisticsBuilder{options.dropNumDistinct()}, + dwio::stats::BinaryStatisticsBuilder{options} {} ~BinaryStatisticsBuilder() override = default; - void addValues(uint64_t length, uint64_t count = 1) { - increaseValueCount(count); - addWithOverflowCheck(length_, length, count); - } - - void merge( - const dwio::common::ColumnStatistics& other, - bool ignoreSize = false) override; - - void reset() override { - StatisticsBuilder::reset(); - init(); + std::unique_ptr build() const override { + return StatisticsBuilder::build(); } void toProto(ColumnStatisticsWriteWrapper& stats) const override; - - private: - void init() { - length_ = 0; - } }; class MapStatisticsBuilder : public StatisticsBuilder, @@ -444,7 +154,8 @@ class MapStatisticsBuilder : public StatisticsBuilder, MapStatisticsBuilder( const Type& type, const StatisticsBuilderOptions& options) - : StatisticsBuilder{options}, + : dwio::stats::StatisticsBuilder{options}, + StatisticsBuilder{options}, valueType_{type.as().valueType()} { init(); hll_.reset(); @@ -454,20 +165,9 @@ class MapStatisticsBuilder : public StatisticsBuilder, void addValues( const dwrf::proto::KeyInfo& keyInfo, - const StatisticsBuilder& stats) { - // Since addValues is called once per key info per stride, - // it's ok to just construct the key struct per call. - auto& keyStats = getKeyStats(constructKey(keyInfo)); - keyStats.merge(stats, /*ignoreSize=*/true); - } + const StatisticsBuilder& stats); - void incrementSize(const dwrf::proto::KeyInfo& keyInfo, uint64_t size) { - // Since incrementSize is called once per key info per stripe, - // it's ok to just construct the key struct per call. - auto& keyStats = getKeyStats(constructKey(keyInfo)); - keyStats.ensureSize(); - keyStats.incrementSize(size); - } + void incrementSize(const dwrf::proto::KeyInfo& keyInfo, uint64_t size); void merge( const dwio::common::ColumnStatistics& other, @@ -480,6 +180,10 @@ class MapStatisticsBuilder : public StatisticsBuilder, void toProto(ColumnStatisticsWriteWrapper& stats) const override; + /// Converts a proto KeyInfo to a dwio::common::KeyInfo. + static dwio::common::KeyInfo constructKey( + const dwrf::proto::KeyInfo& keyInfo); + private: void init() { entryStatistics_.clear();