@@ -19,8 +19,6 @@

package org.apache.iceberg.mr.hive;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
@@ -34,14 +32,12 @@
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.hive.conf.Constants;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
@@ -50,12 +46,8 @@
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.metastore.partition.spec.PartitionSpecProxy;
import org.apache.hadoop.hive.metastore.utils.FileUtils;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.mapred.JobConf;
import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DeleteFiles;
@@ -281,32 +273,4 @@ public static boolean isCtas(Properties properties) {
return Boolean.parseBoolean(properties.getProperty(hive_metastoreConstants.TABLE_IS_CTAS));
}

static Properties getSerializationProps() {
Properties props = new Properties();
props.put(serdeConstants.SERIALIZATION_FORMAT, "" + Utilities.tabCode);
props.put(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL");
return props;
}

static String getParseData(String parseData, String specId, ObjectMapper mapper, Integer currentSpecId)
throws JsonProcessingException {
Map<String, String> map = mapper.readValue(parseData, Map.class);
String partString =
map.entrySet().stream()
.filter(entry -> entry.getValue() != null)
.map(java.lang.Object::toString)
.collect(Collectors.joining("/"));
String currentSpecMarker = currentSpecId.toString().equals(specId) ? "current-" : "";
return String.format("%sspec-id=%s/%s", currentSpecMarker, specId, partString);
}

static JobConf getPartJobConf(Configuration confs, org.apache.hadoop.hive.ql.metadata.Table tbl) {
JobConf job = new JobConf(confs);
job.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, Constants.ICEBERG_PARTITION_COLUMNS);
job.set(InputFormatConfig.TABLE_LOCATION, tbl.getPath().toString());
job.set(InputFormatConfig.TABLE_IDENTIFIER, tbl.getFullyQualifiedName() + ".partitions");
HiveConf.setVar(job, HiveConf.ConfVars.HIVE_FETCH_OUTPUT_SERDE, Constants.DELIMITED_JSON_SERDE);
HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, false);
return job;
}
}
@@ -22,6 +22,7 @@
import java.io.IOException;
import java.time.ZoneId;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@@ -473,50 +474,56 @@ public static List<FieldSchema> getPartitionKeys(Table table, int specId) {
public static List<PartitionField> getPartitionFields(Table table, boolean latestSpecOnly) {
return latestSpecOnly ? table.spec().fields() :
table.specs().values().stream()
.flatMap(spec -> spec.fields().stream()).distinct()
.flatMap(spec -> spec.fields().stream()
.filter(f -> !f.transform().isVoid()))
.distinct()
.collect(Collectors.toList());
}

/**
* Returns a Map of PartitionData as the keys and partition spec ids as the values
* @param icebergTable Iceberg table
* Returns a list of partition names satisfying the provided partition spec.
* @param table Iceberg table
* @param partSpecMap Partition Spec used as the criteria for filtering
* @param allowPartialSpec When true, must return partitions which match partSpecMap exactly, otherwise partially
* @param latestSpecOnly When true, returns partitions with the latest partition spec only, otherwise with any specs
* @return Map of PartitionData and partition spec found based on the specified constraints
* @param latestSpecOnly when true, returns only partitions with the current spec, otherwise with any spec
* @return List of partition names
*/
public static Map<PartitionData, Integer> getPartitionInfo(Table icebergTable, Map<String, String> partSpecMap,
boolean allowPartialSpec, boolean latestSpecOnly) throws SemanticException, IOException {
public static List<String> getPartitionNames(Table table, Map<String, String> partSpecMap,
boolean latestSpecOnly) throws SemanticException {
Expression expression = IcebergTableUtil.generateExpressionFromPartitionSpec(
icebergTable, partSpecMap, latestSpecOnly);
table, partSpecMap, latestSpecOnly);
PartitionsTable partitionsTable = (PartitionsTable) MetadataTableUtils.createMetadataTableInstance(
icebergTable, MetadataTableType.PARTITIONS);
table, MetadataTableType.PARTITIONS);

Map<PartitionData, Integer> result = Maps.newLinkedHashMap();
try (CloseableIterable<FileScanTask> fileScanTasks = partitionsTable.newScan().planFiles()) {
fileScanTasks.forEach(task ->
CloseableIterable.filter(
CloseableIterable.transform(task.asDataTask().rows(), row -> {
StructProjection data = row.get(IcebergTableUtil.PART_IDX, StructProjection.class);
Integer specId = row.get(IcebergTableUtil.SPEC_IDX, Integer.class);
return Maps.immutableEntry(IcebergTableUtil.toPartitionData(data,
Partitioning.partitionType(icebergTable), icebergTable.specs().get(specId).partitionType()),
specId);
}), entry -> {
ResidualEvaluator resEval = ResidualEvaluator.of(icebergTable.specs().get(entry.getValue()),
expression, false);
return resEval.residualFor(entry.getKey()).isEquivalentTo(Expressions.alwaysTrue()) &&
(entry.getKey().size() == partSpecMap.size() || allowPartialSpec) &&
(entry.getValue() == icebergTable.spec().specId() || !latestSpecOnly);
}).forEach(entry -> result.put(entry.getKey(), entry.getValue())));
}
return FluentIterable.from(fileScanTasks)
.transformAndConcat(task -> task.asDataTask().rows())
.transform(row -> {
StructLike data = row.get(IcebergTableUtil.PART_IDX, StructProjection.class);
PartitionSpec spec = table.specs().get(row.get(IcebergTableUtil.SPEC_IDX, Integer.class));
return Maps.immutableEntry(
IcebergTableUtil.toPartitionData(
data, Partitioning.partitionType(table), spec.partitionType()),
spec);
}).filter(e -> {
ResidualEvaluator resEval = ResidualEvaluator.of(e.getValue(),
expression, false);
return e.getValue().isPartitioned() &&
resEval.residualFor(e.getKey()).isEquivalentTo(Expressions.alwaysTrue()) &&
(e.getValue().specId() == table.spec().specId() || !latestSpecOnly);

}).transform(e -> e.getValue().partitionToPath(e.getKey())).toSortedList(
Comparator.naturalOrder());

return result;
} catch (IOException e) {
throw new SemanticException(
String.format("Error while fetching the partitions due to: %s", e));
}
}

public static long getPartitionHash(Table icebergTable, String partitionPath) throws IOException {
PartitionsTable partitionsTable = (PartitionsTable) MetadataTableUtils
.createMetadataTableInstance(icebergTable, MetadataTableType.PARTITIONS);

try (CloseableIterable<FileScanTask> fileScanTasks = partitionsTable.newScan().planFiles()) {
return FluentIterable.from(fileScanTasks)
.transformAndConcat(task -> task.asDataTask().rows())
@@ -534,19 +541,10 @@ public static long getPartitionHash(Table icebergTable, String partitionPath) throws IOException {
}
}

public static List<String> getPartitionNames(Table icebergTable, Map<String, String> partitionSpec,
boolean latestSpecOnly) throws SemanticException {
try {
return IcebergTableUtil
.getPartitionInfo(icebergTable, partitionSpec, true, latestSpecOnly).entrySet().stream()
.map(e -> {
PartitionData partitionData = e.getKey();
int specId = e.getValue();
return icebergTable.specs().get(specId).partitionToPath(partitionData);
}).collect(Collectors.toList());
} catch (IOException e) {
throw new SemanticException(String.format("Error while fetching the partitions due to: %s", e));
}
public static TransformSpec getTransformSpec(Table table, String transformName, int sourceId) {
TransformSpec spec = TransformSpec.fromString(transformName.toUpperCase(),
table.schema().findColumnName(sourceId));
return spec;
}

}
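
For context, a caller-side sketch of the reworked partition lookup above. The table handle, logger, and partition values here are illustrative assumptions, not part of this change:

    // List partition paths matching dept=hr under the current spec only.
    Map<String, String> partSpec = ImmutableMap.of("dept", "hr");
    try {
      List<String> names = IcebergTableUtil.getPartitionNames(icebergTable, partSpec, /* latestSpecOnly */ true);
      names.forEach(name -> LOG.info("Matched partition: {}", name)); // e.g. "dept=hr"
    } catch (SemanticException e) {
      // An IOException from planFiles() is wrapped into SemanticException by getPartitionNames.
      throw new RuntimeException(e);
    }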
@@ -126,6 +126,10 @@ static TestTables testTables(TestHiveShell shell, TestTables.TestTableType testT
return testTableType.instance(shell.metastore().hiveConf(), temp, catalogName);
}

static void init(TestHiveShell shell, TestTables testTables, TemporaryFolder temp) {
init(shell, testTables, temp, "tez");
}

static void init(TestHiveShell shell, TestTables testTables, TemporaryFolder temp, String engine) {
shell.getSession();

@@ -33,8 +33,8 @@
import org.apache.iceberg.SnapshotSummary;
import org.apache.iceberg.Table;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.hive.HiveVersion;
import org.apache.iceberg.mr.TestHelper;
import org.apache.iceberg.mr.hive.TestTables.TestTableType;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
@@ -53,6 +53,8 @@
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;

import static org.apache.iceberg.mr.hive.TestTables.ALL_TABLE_TYPES;
import static org.apache.iceberg.mr.hive.TestTables.TestTableType.HIVE_CATALOG;
import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;
import static org.junit.Assume.assumeTrue;
@@ -62,8 +64,6 @@
@RunWith(Parameterized.class)
public abstract class HiveIcebergStorageHandlerWithEngineBase {

protected static final String[] EXECUTION_ENGINES = new String[] {"tez"};

public static final String RETRY_STRATEGIES =
"overlay,reoptimize,reexecute_lost_am,dagsubmit,recompile_without_cbo,write_conflict";

@@ -108,34 +108,25 @@ public abstract class HiveIcebergStorageHandlerWithEngineBase {
StatsSetupConst.TOTAL_SIZE, SnapshotSummary.TOTAL_FILE_SIZE_PROP
);

@Parameters(name = "fileFormat={0}, engine={1}, catalog={2}, isVectorized={3}, formatVersion={4}")
@Parameters(name = "fileFormat={0}, catalog={1}, isVectorized={2}, formatVersion={3}")
public static Collection<Object[]> parameters() {
Collection<Object[]> testParams = Lists.newArrayList();
String javaVersion = System.getProperty("java.specification.version");

// Run tests with every FileFormat for a single Catalog (HiveCatalog)
for (FileFormat fileFormat : HiveIcebergStorageHandlerTestUtils.FILE_FORMATS) {
for (String engine : EXECUTION_ENGINES) {
IntStream.of(2, 1).forEach(formatVersion -> {
// include Tez tests only for Java 8
if (javaVersion.equals("1.8")) {
testParams.add(new Object[]{fileFormat, engine, TestTables.TestTableType.HIVE_CATALOG, false,
formatVersion});
// test for vectorization=ON in case of ORC and PARQUET format with Tez engine
if (fileFormat != FileFormat.METADATA && "tez".equals(engine) && HiveVersion.min(HiveVersion.HIVE_3)) {
testParams.add(new Object[]{fileFormat, engine, TestTables.TestTableType.HIVE_CATALOG, true,
formatVersion});
}
}
});
}
IntStream.of(2, 1).forEach(formatVersion -> {
testParams.add(new Object[]{fileFormat, HIVE_CATALOG, false, formatVersion});
// test for vectorization=ON in case of ORC and PARQUET format
if (fileFormat != FileFormat.METADATA) {
testParams.add(new Object[]{fileFormat, HIVE_CATALOG, true, formatVersion});
}
});
}

// Run tests for every Catalog for a single FileFormat (PARQUET) and execution engine (tez)
// skip HiveCatalog tests as they are added before
for (TestTables.TestTableType testTableType : TestTables.ALL_TABLE_TYPES) {
if (!TestTables.TestTableType.HIVE_CATALOG.equals(testTableType)) {
testParams.add(new Object[]{FileFormat.PARQUET, "tez", testTableType, false, 1});
// Run tests for every Catalog for a single FileFormat (PARQUET), skip HiveCatalog tests as they are added before
for (TestTableType testTableType : ALL_TABLE_TYPES) {
if (testTableType != HIVE_CATALOG) {
testParams.add(new Object[]{FileFormat.PARQUET, testTableType, false, 1});
}
}

@@ -150,15 +141,12 @@ public static Collection<Object[]> parameters() {
public FileFormat fileFormat;

@Parameter(1)
public String executionEngine;
public TestTableType testTableType;

@Parameter(2)
public TestTables.TestTableType testTableType;

@Parameter(3)
public boolean isVectorized;

@Parameter(4)
@Parameter(3)
public Integer formatVersion;

@Rule
@@ -181,7 +169,7 @@ public static void afterClass() throws Exception {
public void before() throws IOException {
validateTestParams();
testTables = HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType, temp);
HiveIcebergStorageHandlerTestUtils.init(shell, testTables, temp, executionEngine);
HiveIcebergStorageHandlerTestUtils.init(shell, testTables, temp);
HiveConf.setBoolVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, isVectorized);
// Fetch task conversion might kick in for certain queries, preventing the vectorization code path from being used,
// so we turn it off explicitly to achieve better coverage.
@@ -98,7 +98,7 @@ public void testSingleFilterUpdate() {
try {
Tasks.range(2).executeWith(Executors.newFixedThreadPool(2)).run(i -> {
TestUtilPhaser.getInstance().getPhaser().register();
init(shell, testTables, temp, executionEngine);
init(shell, testTables, temp);
HiveConf.setBoolVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, isVectorized);
HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_FETCH_TASK_CONVERSION, "none");
HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_QUERY_REEXECUTION_STRATEGIES,
@@ -141,7 +141,7 @@ public void testMultiFiltersUpdate() {
try {
Tasks.range(2).executeWith(Executors.newFixedThreadPool(2)).run(i -> {
TestUtilPhaser.getInstance().getPhaser().register();
init(shell, testTables, temp, executionEngine);
init(shell, testTables, temp);
HiveConf.setBoolVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, isVectorized);
HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_FETCH_TASK_CONVERSION, "none");
HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_QUERY_REEXECUTION_STRATEGIES,
@@ -188,7 +188,7 @@ public void testDeleteFilters() {
try {
Tasks.range(3).executeWith(Executors.newFixedThreadPool(3)).run(i -> {
TestUtilPhaser.getInstance().getPhaser().register();
init(shell, testTables, temp, executionEngine);
init(shell, testTables, temp);
HiveConf.setBoolVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, isVectorized);
HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_FETCH_TASK_CONVERSION, "none");
HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_QUERY_REEXECUTION_STRATEGIES,
@@ -230,7 +230,7 @@ public void testConflictingUpdates() {
try {
Tasks.range(2).executeWith(Executors.newFixedThreadPool(2)).run(i -> {
TestUtilPhaser.getInstance().getPhaser().register();
init(shell, testTables, temp, executionEngine);
init(shell, testTables, temp);
HiveConf.setBoolVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, isVectorized);
HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_FETCH_TASK_CONVERSION, "none");
HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_QUERY_REEXECUTION_STRATEGIES,
@@ -299,7 +299,7 @@ public void testConcurrentInsertAndInsertOverwrite() {
throw new RuntimeException(e);
}
}
init(shell, testTables, temp, executionEngine);
init(shell, testTables, temp);
HiveConf.setBoolVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, isVectorized);
HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_FETCH_TASK_CONVERSION, "none");

@@ -616,7 +616,7 @@ public void testConcurrent2Deletes() {
Tasks.range(2)
.executeWith(Executors.newFixedThreadPool(2))
.run(i -> {
init(shell, testTables, temp, executionEngine);
init(shell, testTables, temp);
HiveConf.setBoolVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, isVectorized);
HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_FETCH_TASK_CONVERSION, "none");
HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_QUERY_REEXECUTION_STRATEGIES,
@@ -647,7 +647,7 @@ public void testConcurrent2Updates() {
Tasks.range(2)
.executeWith(Executors.newFixedThreadPool(2))
.run(i -> {
init(shell, testTables, temp, executionEngine);
init(shell, testTables, temp);
HiveConf.setBoolVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, isVectorized);
HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_FETCH_TASK_CONVERSION, "none");
HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_QUERY_REEXECUTION_STRATEGIES,
@@ -682,7 +682,7 @@ public void testConcurrentUpdateAndDelete() {
Tasks.range(2)
.executeWith(Executors.newFixedThreadPool(2))
.run(i -> {
init(shell, testTables, temp, executionEngine);
init(shell, testTables, temp);
HiveConf.setBoolVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, isVectorized);
HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_FETCH_TASK_CONVERSION, "none");
HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_QUERY_REEXECUTION_STRATEGIES,
@@ -717,7 +717,7 @@ public void testConcurrent2MergeInserts() {
Tasks.range(2)
.executeWith(Executors.newFixedThreadPool(2))
.run(i -> {
init(shell, testTables, temp, executionEngine);
init(shell, testTables, temp);
HiveConf.setBoolVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, isVectorized);
HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_FETCH_TASK_CONVERSION, "none");
HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_QUERY_REEXECUTION_STRATEGIES,