Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 27 additions & 11 deletions axiom/connectors/hive/LocalHiveConnectorMetadata.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,24 @@ int64_t estimateNdv(int64_t ndv, uint64_t sampleRows, uint64_t totalRows) {
return std::max<int64_t>(1, static_cast<int64_t>(estimated));
}

// Creates an integer Variant with the TypeKind matching the column type.
// IntegerColumnStatistics::getMinimum/getMaximum return int64_t regardless of
// the actual integer width. Constructing Variant(int64_t) always produces
// BIGINT, which causes a type mismatch when the column is TINYINT, SMALLINT, or
// INTEGER. This helper narrows the value to the correct type.
velox::Variant makeIntegerVariant(velox::TypeKind kind, int64_t value) {
switch (kind) {
case velox::TypeKind::TINYINT:
return velox::Variant(static_cast<int8_t>(value));
case velox::TypeKind::SMALLINT:
return velox::Variant(static_cast<int16_t>(value));
case velox::TypeKind::INTEGER:
return velox::Variant(static_cast<int32_t>(value));
default:
return velox::Variant(value);
}
}

// Aggregates per-column stats across selected files. Columns missing from a
// file's columnStats are treated as all-null (numValues = 0, no min/max). This
// handles schema evolution where a column was added after the file was written.
Expand Down Expand Up @@ -623,13 +641,8 @@ LocalHiveTableLayout::co_estimateStats(
requestedColumns.push_back(column);
}

// Aggregate per-file column stats. Skip for Parquet because its
// Reader::columnStatistics() is not implemented and returns nullptr.
std::vector<ColumnStatistics> columnStats;
if (fileFormat() != velox::dwio::common::FileFormat::PARQUET) {
columnStats = aggregateColumnStats(
selectedFiles, requestedColumns, totalRows, table().numRows());
}
auto columnStats = aggregateColumnStats(
selectedFiles, requestedColumns, totalRows, table().numRows());

co_return FilteredTableStats{
totalRows, std::move(columnStats), std::move(rejectedFilterIndices)};
Expand Down Expand Up @@ -1194,8 +1207,8 @@ void LocalHiveConnectorMetadata::loadTable(
const auto* column = table->findColumn(name);
VELOX_CHECK_NOT_NULL(column, "Column not found: {}", name);

// Node ID 0 is the root RowType; top-level columns start at 1.
if (auto readerStats = reader->columnStatistics(i + 1)) {
const auto& typeWithId = reader->typeWithId()->childByName(name);
if (auto readerStats = reader->columnStatistics(typeWithId->id())) {
auto* stats = const_cast<Column*>(column)->mutableStats();
stats->numValues += readerStats->getNumberOfValues().value_or(0);

Expand All @@ -1212,11 +1225,14 @@ void LocalHiveConnectorMetadata::loadTable(
if (auto* intStats = dynamic_cast<
const velox::dwio::common::IntegerColumnStatistics*>(
readerStats.get())) {
auto columnKind = column->type()->kind();
if (intStats->getMinimum().has_value()) {
fileColStats.min = velox::Variant(intStats->getMinimum().value());
fileColStats.min =
makeIntegerVariant(columnKind, intStats->getMinimum().value());
}
if (intStats->getMaximum().has_value()) {
fileColStats.max = velox::Variant(intStats->getMaximum().value());
fileColStats.max =
makeIntegerVariant(columnKind, intStats->getMaximum().value());
}
} else if (
auto* dblStats = dynamic_cast<
Expand Down
4 changes: 0 additions & 4 deletions axiom/optimizer/tests/FilteredTableStatsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,6 @@ class FilteredTableStatsTest : public test::HiveQueriesTestBase {
protected:
static void SetUpTestCase() {
test::HiveQueriesTestBase::SetUpTestCase();
// Use DWRF format because Parquet reader's columnStatistics() is not
// implemented and returns nullptr, preventing per-file stats collection.
LocalRunnerTestBase::localFileFormat_ =
velox::dwio::common::FileFormat::DWRF;
createTpchTables({velox::tpch::Table::TBL_NATION});
}

Expand Down
7 changes: 6 additions & 1 deletion axiom/optimizer/tests/HiveQueriesTestBase.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,16 @@ class HiveQueriesTestBase : public QueryTestBase {
static const inline std::string kDefaultSchema{
connector::hive::LocalHiveConnectorMetadata::kDefaultSchema};

/// Initializes a temporary data directory for Parquet test data. Subclasses
/// should call this, then use createTpchTables() to populate test tables.
static void SetUpTestCase();

/// Enables reading and writing Hive tables in Parquet and DWRF formats
/// and makes prestoParser(), hiveConnector(), and hiveMetadata() available.
void SetUp() override;

/// Creates specified TPC-H tables in the temp directory.
/// Generates TPC-H data for the specified tables using the file format
/// configured in SetUpTestCase().
static void createTpchTables(const std::vector<velox::tpch::Table>& tables);

void TearDown() override;
Expand Down
2 changes: 1 addition & 1 deletion velox
Submodule velox updated 34 files
+0 −137 .github/workflows/benchmark.yml
+0 −160 scripts/ci/benchmark-alert.py
+9 −0 velox/core/QueryConfig.h
+4 −0 velox/docs/configs.rst
+233 −139 velox/docs/designs/column-extraction-pushdown.md
+2 −0 velox/dwio/common/CMakeLists.txt
+12 −5 velox/dwio/common/Reader.h
+6 −0 velox/dwio/common/Statistics.h
+450 −0 velox/dwio/common/StatisticsBuilder.cpp
+334 −0 velox/dwio/common/StatisticsBuilder.h
+0 −1 velox/dwio/dwrf/common/Statistics.h
+4 −2 velox/dwio/dwrf/test/E2EWriterTest.cpp
+1 −1 velox/dwio/dwrf/writer/ColumnWriter.h
+1 −1 velox/dwio/dwrf/writer/FlatMapColumnWriter.cpp
+1 −1 velox/dwio/dwrf/writer/FlatMapColumnWriter.h
+52 −225 velox/dwio/dwrf/writer/StatisticsBuilder.cpp
+56 −352 velox/dwio/dwrf/writer/StatisticsBuilder.h
+56 −0 velox/dwio/parquet/reader/ParquetReader.cpp
+1 −3 velox/dwio/parquet/reader/ParquetReader.h
+121 −0 velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp
+10 −0 velox/exec/HashProbe.cpp
+1 −1 velox/exec/MergeSource.cpp
+1 −1 velox/exec/SortBuffer.cpp
+12 −1 velox/exec/VectorHasher.cpp
+9 −0 velox/exec/VectorHasher.h
+224 −0 velox/exec/tests/HashJoinTest.cpp
+116 −0 velox/exec/tests/VectorHasherTest.cpp
+2 −2 velox/experimental/cudf/CudfConfig.h
+4 −0 velox/experimental/cudf/exec/CudfHashAggregation.cpp
+0 −1 velox/experimental/cudf/expression/ExpressionEvaluator.cpp
+16 −3 velox/experimental/cudf/tests/ExpressionEvaluatorSelectionTest.cpp
+1 −1 velox/experimental/cudf/tests/ToCudfSelectionTest.cpp
+12 −1 velox/experimental/cudf/tests/sparksql/FilterProjectTest.cpp
+400 −436 velox/functions/prestosql/tests/GeometryFunctionsTest.cpp