diff --git a/axiom/connectors/hive/LocalHiveConnectorMetadata.cpp b/axiom/connectors/hive/LocalHiveConnectorMetadata.cpp index 4088077cf..7e4d5fca4 100644 --- a/axiom/connectors/hive/LocalHiveConnectorMetadata.cpp +++ b/axiom/connectors/hive/LocalHiveConnectorMetadata.cpp @@ -214,6 +214,24 @@ int64_t estimateNdv(int64_t ndv, uint64_t sampleRows, uint64_t totalRows) { return std::max(1, static_cast(estimated)); } +// Creates an integer Variant with the TypeKind matching the column type. +// IntegerColumnStatistics::getMinimum/getMaximum return int64_t regardless of +// the actual integer width. Constructing Variant(int64_t) always produces +// BIGINT, which causes a type mismatch when the column is TINYINT, SMALLINT, or +// INTEGER. This helper narrows the value to the correct type. +velox::Variant makeIntegerVariant(velox::TypeKind kind, int64_t value) { + switch (kind) { + case velox::TypeKind::TINYINT: + return velox::Variant(static_cast(value)); + case velox::TypeKind::SMALLINT: + return velox::Variant(static_cast(value)); + case velox::TypeKind::INTEGER: + return velox::Variant(static_cast(value)); + default: + return velox::Variant(value); + } +} + // Aggregates per-column stats across selected files. Columns missing from a // file's columnStats are treated as all-null (numValues = 0, no min/max). This // handles schema evolution where a column was added after the file was written. @@ -623,13 +641,8 @@ LocalHiveTableLayout::co_estimateStats( requestedColumns.push_back(column); } - // Aggregate per-file column stats. Skip for Parquet because its - // Reader::columnStatistics() is not implemented and returns nullptr. - std::vector columnStats; - if (fileFormat() != velox::dwio::common::FileFormat::PARQUET) { - columnStats = aggregateColumnStats( - selectedFiles, requestedColumns, totalRows, table().numRows()); - } + auto columnStats = aggregateColumnStats( + selectedFiles, requestedColumns, totalRows, table().numRows()); co_return FilteredTableStats{ totalRows, std::move(columnStats), std::move(rejectedFilterIndices)}; @@ -1194,8 +1207,8 @@ void LocalHiveConnectorMetadata::loadTable( const auto* column = table->findColumn(name); VELOX_CHECK_NOT_NULL(column, "Column not found: {}", name); - // Node ID 0 is the root RowType; top-level columns start at 1. - if (auto readerStats = reader->columnStatistics(i + 1)) { + const auto& typeWithId = reader->typeWithId()->childByName(name); + if (auto readerStats = reader->columnStatistics(typeWithId->id())) { auto* stats = const_cast(column)->mutableStats(); stats->numValues += readerStats->getNumberOfValues().value_or(0); @@ -1212,11 +1225,14 @@ void LocalHiveConnectorMetadata::loadTable( if (auto* intStats = dynamic_cast< const velox::dwio::common::IntegerColumnStatistics*>( readerStats.get())) { + auto columnKind = column->type()->kind(); if (intStats->getMinimum().has_value()) { - fileColStats.min = velox::Variant(intStats->getMinimum().value()); + fileColStats.min = + makeIntegerVariant(columnKind, intStats->getMinimum().value()); } if (intStats->getMaximum().has_value()) { - fileColStats.max = velox::Variant(intStats->getMaximum().value()); + fileColStats.max = + makeIntegerVariant(columnKind, intStats->getMaximum().value()); } } else if ( auto* dblStats = dynamic_cast< diff --git a/axiom/optimizer/tests/FilteredTableStatsTest.cpp b/axiom/optimizer/tests/FilteredTableStatsTest.cpp index f8a9989de..d8cd639d1 100644 --- a/axiom/optimizer/tests/FilteredTableStatsTest.cpp +++ b/axiom/optimizer/tests/FilteredTableStatsTest.cpp @@ -31,10 +31,6 @@ class FilteredTableStatsTest : public test::HiveQueriesTestBase { protected: static void SetUpTestCase() { test::HiveQueriesTestBase::SetUpTestCase(); - // Use DWRF format because Parquet reader's columnStatistics() is not - // implemented and returns nullptr, preventing per-file stats collection. - LocalRunnerTestBase::localFileFormat_ = - velox::dwio::common::FileFormat::DWRF; createTpchTables({velox::tpch::Table::TBL_NATION}); } diff --git a/axiom/optimizer/tests/HiveQueriesTestBase.h b/axiom/optimizer/tests/HiveQueriesTestBase.h index e16d46118..20ad7b1b1 100644 --- a/axiom/optimizer/tests/HiveQueriesTestBase.h +++ b/axiom/optimizer/tests/HiveQueriesTestBase.h @@ -30,11 +30,16 @@ class HiveQueriesTestBase : public QueryTestBase { static const inline std::string kDefaultSchema{ connector::hive::LocalHiveConnectorMetadata::kDefaultSchema}; + /// Initializes a temporary data directory for Parquet test data. Subclasses + /// should call this, then use createTpchTables() to populate test tables. static void SetUpTestCase(); + /// Enables reading and writing Hive tables in Parquet and DWRF formats + /// and makes prestoParser(), hiveConnector(), and hiveMetadata() available. void SetUp() override; - /// Creates specified TPC-H tables in the temp directory. + /// Generates TPC-H data for the specified tables using the file format + /// configured in SetUpTestCase(). static void createTpchTables(const std::vector& tables); void TearDown() override; diff --git a/velox b/velox index 344087e4f..91b2e4585 160000 --- a/velox +++ b/velox @@ -1 +1 @@ -Subproject commit 344087e4ff0f8bcd36f2d2799bba75eed5e8b699 +Subproject commit 91b2e458585eb7d1667263c70ac50b92476d7278