From 692cad81648309aa5b07ca064d8be5319679dd65 Mon Sep 17 00:00:00 2001 From: Kamil Endruszkiewicz Date: Wed, 7 Jun 2023 09:25:52 +0200 Subject: [PATCH] Fix underestimating size of partitioned Glue hive tables --- .../metastore/glue/GlueHiveMetastore.java | 6 +-- .../metastore/glue/TestHiveGlueMetastore.java | 44 ++++++++++++++++++- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/glue/GlueHiveMetastore.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/glue/GlueHiveMetastore.java index 55e0a82a42ac..f7b58ed6934a 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/glue/GlueHiveMetastore.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/glue/GlueHiveMetastore.java @@ -383,12 +383,12 @@ private void updatePartitionStatisticsBatch(Table table, Map partitions = batchGetPartition(table, ImmutableList.copyOf(updates.keySet())); - Map> statisticsPerPartition = columnStatisticsProvider.getPartitionColumnStatistics(partitions); + Map partitionsStatistics = getPartitionStatistics(table, partitions); - statisticsPerPartition.forEach((partition, columnStatistics) -> { + partitions.forEach(partition -> { Function update = updates.get(partitionValuesToName.get(partition.getValues())); - PartitionStatistics currentStatistics = new PartitionStatistics(getHiveBasicStatistics(partition.getParameters()), columnStatistics); + PartitionStatistics currentStatistics = partitionsStatistics.get(makePartitionName(table, partition)); PartitionStatistics updatedStatistics = update.apply(currentStatistics); Map updatedStatisticsParameters = updateStatisticsParameters(partition.getParameters(), updatedStatistics.getBasicStatistics()); diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/metastore/glue/TestHiveGlueMetastore.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/metastore/glue/TestHiveGlueMetastore.java index 03bdffa53904..89da15287220 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/metastore/glue/TestHiveGlueMetastore.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/metastore/glue/TestHiveGlueMetastore.java @@ -297,11 +297,53 @@ public void testUpdatePartitionColumnStatisticsEmptyOptionalFields() // in order to avoid incorrect data we skip writes for statistics with min/max = null } + @Override + public void testUpdateBasicPartitionStatistics() + throws Exception + { + SchemaTableName tableName = temporaryTable("update_basic_partition_statistics"); + try { + createDummyPartitionedTable(tableName, STATISTICS_PARTITIONED_TABLE_COLUMNS); + testUpdatePartitionStatistics( + tableName, + PartitionStatistics.empty(), + ImmutableList.of(BASIC_STATISTICS_1, BASIC_STATISTICS_2), + ImmutableList.of(BASIC_STATISTICS_2, BASIC_STATISTICS_1)); + } + finally { + dropTable(tableName); + } + } + + @Override + public void testUpdatePartitionColumnStatistics() + throws Exception + { + SchemaTableName tableName = temporaryTable("update_partition_column_statistics"); + try { + createDummyPartitionedTable(tableName, STATISTICS_PARTITIONED_TABLE_COLUMNS); + // When the table has partitions, but row count statistics are set to zero, we treat this case as empty + // statistics to avoid underestimation in the CBO. This scenario may be caused when other engines are + // used to ingest data into partitioned hive tables. + testUpdatePartitionStatistics( + tableName, + PartitionStatistics.empty(), + ImmutableList.of(STATISTICS_1_1, STATISTICS_1_2, STATISTICS_2), + ImmutableList.of(STATISTICS_1_2, STATISTICS_1_1, STATISTICS_2)); + } + finally { + dropTable(tableName); + } + } + @Override public void testStorePartitionWithStatistics() throws Exception { - testStorePartitionWithStatistics(STATISTICS_PARTITIONED_TABLE_COLUMNS, BASIC_STATISTICS_1, BASIC_STATISTICS_2, BASIC_STATISTICS_1, ZERO_TABLE_STATISTICS); + // When the table has partitions, but row count statistics are set to zero, we treat this case as empty + // statistics to avoid underestimation in the CBO. This scenario may be caused when other engines are + // used to ingest data into partitioned hive tables. + testStorePartitionWithStatistics(STATISTICS_PARTITIONED_TABLE_COLUMNS, BASIC_STATISTICS_1, BASIC_STATISTICS_2, BASIC_STATISTICS_1, PartitionStatistics.empty()); } @Override