From 4a34b15dc2fb1a2bf4312fd0d946e1b8a9573b5f Mon Sep 17 00:00:00 2001 From: Eric Jia Date: Tue, 18 Nov 2025 11:12:42 -0800 Subject: [PATCH] feat: Add bool AND/OR and bit AND/OR/XOR aggregation feature to TableEvolutionFuzzer (#15528) Summary: Add support for the bool_and and bool_or aggregation functions and the bitwise_and_agg, bitwise_or_agg, and bitwise_xor_agg aggregation functions in the TableEvolutionFuzzer's aggregation pushdown testing. ## Changes Extended the aggregation pushdown configuration generator to support boolean aggregation functions and bitwise aggregation functions in addition to the existing numeric aggregation functions: - Split `supportedAggs` into three separate lists: - `supportedNumericAggs`: {min, max} for numeric columns (INTEGER, BIGINT, REAL, DOUBLE, etc.) - `supportedBooleanAggs`: {bool_and, bool_or} for boolean columns - `supportedIntegerAggs`: {bitwise_and_agg, bitwise_or_agg, bitwise_xor_agg} for integer columns - Updated column collection logic to separately track numeric, integer, and boolean columns that are eligible for aggregation; each integer column is randomly assigned to either the min/max group or the bitwise group - Modified aggregation selection to independently generate numeric, boolean, and bitwise aggregations, emitting each candidate aggregate with 50% probability, allowing for more diverse test coverage This change enables more comprehensive testing of aggregation pushdown functionality by including boolean and bitwise aggregate functions, ensuring better coverage of different data types and aggregation scenarios. 
Reviewed By: Yuhta Differential Revision: D87204100 --- velox/exec/tests/TableEvolutionFuzzer.cpp | 101 +++++++++++++++------- 1 file changed, 72 insertions(+), 29 deletions(-) diff --git a/velox/exec/tests/TableEvolutionFuzzer.cpp b/velox/exec/tests/TableEvolutionFuzzer.cpp index d176c341c4f..247eeef35b2 100644 --- a/velox/exec/tests/TableEvolutionFuzzer.cpp +++ b/velox/exec/tests/TableEvolutionFuzzer.cpp @@ -187,6 +187,43 @@ TableEvolutionFuzzer::parseFileFormats(std::string input) { namespace { +// Helper function to randomly select aggregates from available columns +// without replacement. Appends up to 5 expressions (possibly none) to 'aggregates'. +void generateAggregatesForColumns( + const std::vector& availableColumns, + const std::vector& supportedAggFuncs, + const RowTypePtr& schema, + FuzzerGenerator& rng, + std::vector& aggregates) { + if (availableColumns.empty()) { + return; + } + + int numAggregates = std::min( + static_cast(availableColumns.size()), + std::min( + static_cast(5), + static_cast( + folly::Random::rand32(1, availableColumns.size() + 1, rng)))); + + std::unordered_set selectedIndices; + for (int i = 0; i < numAggregates; ++i) { + if (folly::Random::oneIn(2, rng)) { + int randomIdx; + do { + randomIdx = folly::Random::rand32(availableColumns.size(), rng); + } while (selectedIndices.count(randomIdx) > 0); + selectedIndices.insert(randomIdx); + + int colIdx = availableColumns[randomIdx]; + std::string aggFunc = supportedAggFuncs[folly::Random::rand32( + supportedAggFuncs.size(), rng)]; + aggregates.push_back( + fmt::format("{}({})", aggFunc, schema->nameOf(colIdx))); + } + } +} + std::vector> runTaskCursors( const std::vector>& cursors, folly::Executor& executor) { @@ -410,7 +447,7 @@ fuzzer::ExpressionFuzzer::FuzzedExpressionData generateRemainingFilters( // Generate random aggregation configuration for pushdown testing. 
// Only generates aggregations that are eligible for pushdown: -// - Supported aggregate functions: min, max, sum +// - Supported aggregate functions: min, max, bool_and, bool_or, bitwise_*_agg // - Each column can only be used by at most one aggregate // - Grouping keys are optional (can be empty for global aggregation) // - Columns with filters (subfield or remaining) are excluded to enable @@ -421,7 +458,11 @@ std::optional generateAggregationConfig( const std::unordered_set& filteredColumns) { // List of aggregate functions that support pushdown // Note: Excluding 'sum' to avoid integer overflow in fuzzer with random data - static const std::vector supportedAggs = {"min", "max"}; + static const std::vector supportedNumericAggs = {"min", "max"}; + static const std::vector supportedBooleanAggs = { + "bool_and", "bool_or"}; + static const std::vector supportedIntegerAggs = { + "bitwise_and_agg", "bitwise_or_agg", "bitwise_xor_agg"}; // Randomly decide number of grouping keys (0 to 2) int numGroupingKeys = folly::Random::rand32(3, rng); @@ -441,7 +482,9 @@ std::optional generateAggregationConfig( // For aggregation pushdown to work, each column should only be used once // and columns with filters should be excluded std::vector aggregates; - std::vector availableColumns; + std::vector availableNumericColumns; + std::vector availableIntegerColumns; + std::vector availableBooleanColumns; for (int i = 0; i < schema->size(); ++i) { if (usedColumnIndices.count(i) == 0) { auto columnName = schema->nameOf(i); @@ -451,41 +494,41 @@ std::optional generateAggregationConfig( } auto type = schema->childAt(i); - // Only numeric types support min/max/sum - // Check if it's a primitive numeric type (not decimal) + // Integer types: randomly choose between min/max or bitwise aggregations + // Note: Exclude DATE type as it doesn't support bitwise aggregations if ((type->isInteger() || type->isBigint() || type->isSmallint() || - type->isTinyint() || type->isReal() || type->isDouble()) && - 
!type->isDecimal()) { - availableColumns.push_back(i); + type->isTinyint()) && + !type->isDate()) { + if (folly::Random::oneIn(2, rng)) { + availableIntegerColumns.push_back(i); + } else { + availableNumericColumns.push_back(i); + } + } + // Float types support min/max only + else if ((type->isReal() || type->isDouble()) && !type->isDecimal()) { + availableNumericColumns.push_back(i); + } + // Boolean types support bool_and/bool_or + else if (type->isBoolean()) { + availableBooleanColumns.push_back(i); } } } - if (availableColumns.empty()) { + // Need at least one column to aggregate + if (availableNumericColumns.empty() && availableBooleanColumns.empty() && + availableIntegerColumns.empty()) { return std::nullopt; } - // Randomly select 1-5 aggregates - int numAggregates = std::min( - static_cast(5), - static_cast( - folly::Random::rand32(1, availableColumns.size() + 1, rng))); - // Randomly pick columns for aggregates without replacement - std::unordered_set selectedIndices; - for (int i = 0; i < numAggregates; ++i) { - int randomIdx; - do { - randomIdx = folly::Random::rand32(availableColumns.size(), rng); - } while (selectedIndices.count(randomIdx) > 0); - selectedIndices.insert(randomIdx); - - int colIdx = availableColumns[randomIdx]; - std::string aggFunc = - supportedAggs[folly::Random::rand32(supportedAggs.size(), rng)]; - aggregates.push_back( - fmt::format("{}({})", aggFunc, schema->nameOf(colIdx))); - } + generateAggregatesForColumns( + availableNumericColumns, supportedNumericAggs, schema, rng, aggregates); + generateAggregatesForColumns( + availableBooleanColumns, supportedBooleanAggs, schema, rng, aggregates); + generateAggregatesForColumns( + availableIntegerColumns, supportedIntegerAggs, schema, rng, aggregates); if (aggregates.empty()) { return std::nullopt;