From d5bb8c75dd48403acc9b0a7a4f2b772bd9ade46e Mon Sep 17 00:00:00 2001
From: Sourabh Badhya <iamsbadhya@gmail.com>
Date: Thu, 8 Jun 2023 19:56:33 +0530
Subject: [PATCH] HIVE-27421: Do not set stats in metastore when non-native
 table can store stats in its own format

---
 .../test/queries/positive/analyze_col_stats.q |  19 ++
 .../positive/llap/analyze_col_stats.q.out     | 258 ++++++++++++++++++
 .../hive/ql/stats/ColStatsProcessor.java      |   6 +-
 3 files changed, 281 insertions(+), 2 deletions(-)
 create mode 100644 iceberg/iceberg-handler/src/test/queries/positive/analyze_col_stats.q
 create mode 100644 iceberg/iceberg-handler/src/test/results/positive/llap/analyze_col_stats.q.out

diff --git a/iceberg/iceberg-handler/src/test/queries/positive/analyze_col_stats.q b/iceberg/iceberg-handler/src/test/queries/positive/analyze_col_stats.q
new file mode 100644
index 000000000000..ecdb42128762
--- /dev/null
+++ b/iceberg/iceberg-handler/src/test/queries/positive/analyze_col_stats.q
@@ -0,0 +1,19 @@
+set hive.stats.autogather=false;
+
+create external table test_iceberg_stats (strcol string, intcol integer) partitioned by (pcol int) stored by iceberg;
+
+insert into table test_iceberg_stats values ('abc', 1, 1);
+insert into table test_iceberg_stats values ('def', 2, 2);
+insert into table test_iceberg_stats values ('ghi', 3, 3);
+
+set hive.iceberg.stats.source=iceberg;
+-- No column stats have been written to puffin files yet, so the plan should report "Column stats: NONE".
+explain analyze table test_iceberg_stats compute statistics for columns;
+
+-- Running analyze writes the column stats to puffin files, so the next plan should report "Column stats: COMPLETE".
+analyze table test_iceberg_stats compute statistics for columns;
+explain analyze table test_iceberg_stats compute statistics for columns;
+
+set hive.iceberg.stats.source=metastore;
+-- No column stats should be visible when the stats source is the metastore.
+explain analyze table test_iceberg_stats compute statistics for columns;
diff --git a/iceberg/iceberg-handler/src/test/results/positive/llap/analyze_col_stats.q.out b/iceberg/iceberg-handler/src/test/results/positive/llap/analyze_col_stats.q.out
new file mode 100644
index 000000000000..4becee42877d
--- /dev/null
+++ b/iceberg/iceberg-handler/src/test/results/positive/llap/analyze_col_stats.q.out
@@ -0,0 +1,258 @@
+PREHOOK: query: create external table test_iceberg_stats (strcol string, intcol integer) partitioned by (pcol int) stored by iceberg
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test_iceberg_stats
+POSTHOOK: query: create external table test_iceberg_stats (strcol string, intcol integer) partitioned by (pcol int) stored by iceberg
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test_iceberg_stats
+PREHOOK: query: insert into table test_iceberg_stats values ('abc', 1, 1)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test_iceberg_stats
+POSTHOOK: query: insert into table test_iceberg_stats values ('abc', 1, 1)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test_iceberg_stats
+PREHOOK: query: insert into table test_iceberg_stats values ('def', 2, 2)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test_iceberg_stats
+POSTHOOK: query: insert into table test_iceberg_stats values ('def', 2, 2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test_iceberg_stats
+PREHOOK: query: insert into table test_iceberg_stats values ('ghi', 3, 3)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test_iceberg_stats
+POSTHOOK: query: insert into table test_iceberg_stats values ('ghi', 3, 3)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test_iceberg_stats
+PREHOOK: query: explain analyze table test_iceberg_stats compute statistics for columns
+PREHOOK: type: ANALYZE_TABLE
+PREHOOK: Input: default@test_iceberg_stats
+PREHOOK: Output: default@test_iceberg_stats
+#### A masked pattern was here ####
+POSTHOOK: query: explain analyze table test_iceberg_stats compute statistics for columns
+POSTHOOK: type: ANALYZE_TABLE
+POSTHOOK: Input: default@test_iceberg_stats
+POSTHOOK: Output: default@test_iceberg_stats
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+  Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-0
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: test_iceberg_stats
+                  Statistics: Num rows: 3 Data size: 576 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: strcol (type: string), intcol (type: int), pcol (type: int)
+                    outputColumnNames: strcol, intcol, pcol
+                    Statistics: Num rows: 3 Data size: 576 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      aggregations: max(length(strcol)), avg(COALESCE(length(strcol),0)), count(1), count(strcol), compute_bit_vector_hll(strcol), min(intcol), max(intcol), count(intcol), compute_bit_vector_hll(intcol), min(pcol), max(pcol), count(pcol), compute_bit_vector_hll(pcol)
+                      minReductionHashAggr: 0.99
+                      mode: hash
+                      outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12
+                      Statistics: Num rows: 1 Data size: 752 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        null sort order: 
+                        sort order: 
+                        Statistics: Num rows: 1 Data size: 752 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col0 (type: int), _col1 (type: struct<count:bigint,sum:double,input:int>), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: binary), _col5 (type: int), _col6 (type: int), _col7 (type: bigint), _col8 (type: binary), _col9 (type: int), _col10 (type: int), _col11 (type: bigint), _col12 (type: binary)
+            Execution mode: llap
+            LLAP IO: no inputs
+        Reducer 2 
+            Execution mode: vectorized, llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: max(VALUE._col0), avg(VALUE._col1), count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4), min(VALUE._col5), max(VALUE._col6), count(VALUE._col7), compute_bit_vector_hll(VALUE._col8), min(VALUE._col9), max(VALUE._col10), count(VALUE._col11), compute_bit_vector_hll(VALUE._col12)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12
+                Statistics: Num rows: 1 Data size: 752 Basic stats: COMPLETE Column stats: NONE
+                Select Operator
+                  expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col0,0)) (type: bigint), COALESCE(_col1,0) (type: double), (_col2 - _col3) (type: bigint), COALESCE(ndv_compute_bit_vector(_col4),0) (type: bigint), _col4 (type: binary), 'LONG' (type: string), UDFToLong(_col5) (type: bigint), UDFToLong(_col6) (type: bigint), (_col2 - _col7) (type: bigint), COALESCE(ndv_compute_bit_vector(_col8),0) (type: bigint), _col8 (type: binary), 'LONG' (type: string), UDFToLong(_col9) (type: bigint), UDFToLong(_col10) (type: bigint), (_col2 - _col11) (type: bigint), COALESCE(ndv_compute_bit_vector(_col12),0) (type: bigint), _col12 (type: binary)
+                  outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17
+                  Statistics: Num rows: 1 Data size: 752 Basic stats: COMPLETE Column stats: NONE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 1 Data size: 752 Basic stats: COMPLETE Column stats: NONE
+                    table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-2
+    Stats Work
+      Basic Stats Work:
+      Column Stats Desc:
+          Columns: strcol, intcol, pcol
+          Column Types: string, int, int
+          Table: default.test_iceberg_stats
+
+PREHOOK: query: analyze table test_iceberg_stats compute statistics for columns
+PREHOOK: type: ANALYZE_TABLE
+PREHOOK: Input: default@test_iceberg_stats
+PREHOOK: Output: default@test_iceberg_stats
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table test_iceberg_stats compute statistics for columns
+POSTHOOK: type: ANALYZE_TABLE
+POSTHOOK: Input: default@test_iceberg_stats
+POSTHOOK: Output: default@test_iceberg_stats
+#### A masked pattern was here ####
+PREHOOK: query: explain analyze table test_iceberg_stats compute statistics for columns
+PREHOOK: type: ANALYZE_TABLE
+PREHOOK: Input: default@test_iceberg_stats
+PREHOOK: Output: default@test_iceberg_stats
+#### A masked pattern was here ####
+POSTHOOK: query: explain analyze table test_iceberg_stats compute statistics for columns
+POSTHOOK: type: ANALYZE_TABLE
+POSTHOOK: Input: default@test_iceberg_stats
+POSTHOOK: Output: default@test_iceberg_stats
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+  Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-0
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: test_iceberg_stats
+                  Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE
+                  Select Operator
+                    expressions: strcol (type: string), intcol (type: int), pcol (type: int)
+                    outputColumnNames: strcol, intcol, pcol
+                    Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE
+                    Group By Operator
+                      aggregations: max(length(strcol)), avg(COALESCE(length(strcol),0)), count(1), count(strcol), compute_bit_vector_hll(strcol), min(intcol), max(intcol), count(intcol), compute_bit_vector_hll(intcol), min(pcol), max(pcol), count(pcol), compute_bit_vector_hll(pcol)
+                      minReductionHashAggr: 0.6666666
+                      mode: hash
+                      outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12
+                      Statistics: Num rows: 1 Data size: 560 Basic stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        null sort order: 
+                        sort order: 
+                        Statistics: Num rows: 1 Data size: 560 Basic stats: COMPLETE Column stats: COMPLETE
+                        value expressions: _col0 (type: int), _col1 (type: struct<count:bigint,sum:double,input:int>), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: binary), _col5 (type: int), _col6 (type: int), _col7 (type: bigint), _col8 (type: binary), _col9 (type: int), _col10 (type: int), _col11 (type: bigint), _col12 (type: binary)
+            Execution mode: llap
+            LLAP IO: no inputs
+        Reducer 2 
+            Execution mode: vectorized, llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: max(VALUE._col0), avg(VALUE._col1), count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4), min(VALUE._col5), max(VALUE._col6), count(VALUE._col7), compute_bit_vector_hll(VALUE._col8), min(VALUE._col9), max(VALUE._col10), count(VALUE._col11), compute_bit_vector_hll(VALUE._col12)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12
+                Statistics: Num rows: 1 Data size: 492 Basic stats: COMPLETE Column stats: COMPLETE
+                Select Operator
+                  expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col0,0)) (type: bigint), COALESCE(_col1,0) (type: double), (_col2 - _col3) (type: bigint), COALESCE(ndv_compute_bit_vector(_col4),0) (type: bigint), _col4 (type: binary), 'LONG' (type: string), UDFToLong(_col5) (type: bigint), UDFToLong(_col6) (type: bigint), (_col2 - _col7) (type: bigint), COALESCE(ndv_compute_bit_vector(_col8),0) (type: bigint), _col8 (type: binary), 'LONG' (type: string), UDFToLong(_col9) (type: bigint), UDFToLong(_col10) (type: bigint), (_col2 - _col11) (type: bigint), COALESCE(ndv_compute_bit_vector(_col12),0) (type: bigint), _col12 (type: binary)
+                  outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17
+                  Statistics: Num rows: 1 Data size: 794 Basic stats: COMPLETE Column stats: COMPLETE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 1 Data size: 794 Basic stats: COMPLETE Column stats: COMPLETE
+                    table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-2
+    Stats Work
+      Basic Stats Work:
+      Column Stats Desc:
+          Columns: strcol, intcol, pcol
+          Column Types: string, int, int
+          Table: default.test_iceberg_stats
+
+PREHOOK: query: explain analyze table test_iceberg_stats compute statistics for columns
+PREHOOK: type: ANALYZE_TABLE
+PREHOOK: Input: default@test_iceberg_stats
+PREHOOK: Output: default@test_iceberg_stats
+#### A masked pattern was here ####
+POSTHOOK: query: explain analyze table test_iceberg_stats compute statistics for columns
+POSTHOOK: type: ANALYZE_TABLE
+POSTHOOK: Input: default@test_iceberg_stats
+POSTHOOK: Output: default@test_iceberg_stats
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+  Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-0
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: test_iceberg_stats
+                  Statistics: Num rows: 3 Data size: 576 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: strcol (type: string), intcol (type: int), pcol (type: int)
+                    outputColumnNames: strcol, intcol, pcol
+                    Statistics: Num rows: 3 Data size: 576 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      aggregations: max(length(strcol)), avg(COALESCE(length(strcol),0)), count(1), count(strcol), compute_bit_vector_hll(strcol), min(intcol), max(intcol), count(intcol), compute_bit_vector_hll(intcol), min(pcol), max(pcol), count(pcol), compute_bit_vector_hll(pcol)
+                      minReductionHashAggr: 0.99
+                      mode: hash
+                      outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12
+                      Statistics: Num rows: 1 Data size: 752 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        null sort order: 
+                        sort order: 
+                        Statistics: Num rows: 1 Data size: 752 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col0 (type: int), _col1 (type: struct<count:bigint,sum:double,input:int>), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: binary), _col5 (type: int), _col6 (type: int), _col7 (type: bigint), _col8 (type: binary), _col9 (type: int), _col10 (type: int), _col11 (type: bigint), _col12 (type: binary)
+            Execution mode: llap
+            LLAP IO: no inputs
+        Reducer 2 
+            Execution mode: vectorized, llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: max(VALUE._col0), avg(VALUE._col1), count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4), min(VALUE._col5), max(VALUE._col6), count(VALUE._col7), compute_bit_vector_hll(VALUE._col8), min(VALUE._col9), max(VALUE._col10), count(VALUE._col11), compute_bit_vector_hll(VALUE._col12)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12
+                Statistics: Num rows: 1 Data size: 752 Basic stats: COMPLETE Column stats: NONE
+                Select Operator
+                  expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col0,0)) (type: bigint), COALESCE(_col1,0) (type: double), (_col2 - _col3) (type: bigint), COALESCE(ndv_compute_bit_vector(_col4),0) (type: bigint), _col4 (type: binary), 'LONG' (type: string), UDFToLong(_col5) (type: bigint), UDFToLong(_col6) (type: bigint), (_col2 - _col7) (type: bigint), COALESCE(ndv_compute_bit_vector(_col8),0) (type: bigint), _col8 (type: binary), 'LONG' (type: string), UDFToLong(_col9) (type: bigint), UDFToLong(_col10) (type: bigint), (_col2 - _col11) (type: bigint), COALESCE(ndv_compute_bit_vector(_col12),0) (type: bigint), _col12 (type: binary)
+                  outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17
+                  Statistics: Num rows: 1 Data size: 752 Basic stats: COMPLETE Column stats: NONE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 1 Data size: 752 Basic stats: COMPLETE Column stats: NONE
+                    table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-2
+    Stats Work
+      Basic Stats Work:
+      Column Stats Desc:
+          Columns: strcol, intcol, pcol
+          Column Types: string, int, int
+          Table: default.test_iceberg_stats
+
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/ColStatsProcessor.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/ColStatsProcessor.java
index e2777a128bc4..79b66a622ca7 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/ColStatsProcessor.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/ColStatsProcessor.java
@@ -226,9 +226,11 @@ public int persistColumnStats(Hive db, Table tbl) throws HiveException, MetaExce
         if (!(tbl.isMaterializedView() || tbl.isView() || tbl.isTemporary())) {
           setOrRemoveColumnStatsAccurateProperty(db, tbl, colStatDesc.getColName(), success);
         }
+        tbl.getStorageHandler().setColStatistics(tbl, colStats);
+      } else {
+        // Set table or partition column statistics in metastore.
+        db.setPartitionColumnStatistics(request);
       }
-      // TODO: Write stats for native tables only (See HIVE-27421)
-      db.setPartitionColumnStatistics(request);
       end = System.currentTimeMillis();
       LOG.info("Time taken to update " + colStats.size() + " stats : " + ((end - start)/1000F) + " seconds.");
     }