apache · aokolnychyi · Aug 14, 2021 · Aug 4, 2021 · Aug 10, 2021 · Aug 10, 2021
diff --git a/api/src/main/java/org/apache/iceberg/PartitionField.java b/api/src/main/java/org/apache/iceberg/PartitionField.java
@@ -22,6 +22,7 @@
 import java.io.Serializable;
 import org.apache.iceberg.relocated.com.google.common.base.Objects;
 import org.apache.iceberg.transforms.Transform;
+import org.apache.iceberg.transforms.Transforms;
 
 /**
  * Represents a single field in a {@link PartitionSpec}.
@@ -67,6 +68,20 @@ public String name() {
     return transform;
   }
 
+  /**
+   * Checks whether this partition field can coexist with another partition field.
+   * <p>
+   * Two partition fields are compatible when their source IDs match, their field IDs match, and
+   * their transforms are either equal or at least one of them is the always-null transform.
+   *
+   * @param other the partition field to compare against
+   * @return true if the two fields are compatible, false otherwise
+   */
+  boolean compatibleWith(PartitionField other) {
+    // fields that disagree on either ID can never be compatible, regardless of their transforms
+    if (sourceId != other.sourceId || fieldId != other.fieldId) {
+      return false;
+    }
+
+    return compatibleTransforms(transform, other.transform);
+  }
+
+  // Transforms are compatible when they are equal or when either one is the always-null transform.
+  private boolean compatibleTransforms(Transform<?, ?> t1, Transform<?, ?> t2) {
+    if (t1.equals(t2)) {
+      return true;
+    }
+
+    Transform<?, ?> alwaysNull = Transforms.alwaysNull();
+    return t1.equals(alwaysNull) || t2.equals(alwaysNull);
+  }
+
   @Override
   public String toString() {
     return fieldId + ": " + name + ": " + transform + "(" + sourceId + ")";

diff --git a/core/src/main/java/org/apache/iceberg/AllDataFilesTable.java b/core/src/main/java/org/apache/iceberg/AllDataFilesTable.java
@@ -29,6 +29,7 @@
 import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
 import org.apache.iceberg.relocated.com.google.common.collect.Sets;
 import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.types.Types.StructType;
 import org.apache.iceberg.util.ParallelIterable;
 import org.apache.iceberg.util.ThreadPools;
 
@@ -56,8 +57,9 @@ public TableScan newScan() {
 
   @Override
   public Schema schema() {
-    Schema schema = new Schema(DataFile.getType(table().spec().partitionType()).fields());
-    if (table().spec().fields().size() < 1) {
+    StructType partitionType = Partitioning.partitionType(table());
+    Schema schema = new Schema(DataFile.getType(partitionType).fields());
+    if (partitionType.fields().size() < 1) {
       // avoid returning an empty struct, which is not always supported. instead, drop the partition field (id 102)
       return TypeUtil.selectNot(schema, Sets.newHashSet(102));
     } else {

diff --git a/core/src/main/java/org/apache/iceberg/AllEntriesTable.java b/core/src/main/java/org/apache/iceberg/AllEntriesTable.java
@@ -29,6 +29,7 @@
 import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
 import org.apache.iceberg.relocated.com.google.common.collect.Sets;
 import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.types.Types.StructType;
 import org.apache.iceberg.util.ParallelIterable;
 import org.apache.iceberg.util.ThreadPools;
 
@@ -55,8 +56,9 @@ public TableScan newScan() {
 
   @Override
   public Schema schema() {
-    Schema schema = ManifestEntry.getSchema(table().spec().partitionType());
-    if (table().spec().fields().size() < 1) {
+    StructType partitionType = Partitioning.partitionType(table());
+    Schema schema = ManifestEntry.getSchema(partitionType);
+    if (partitionType.fields().size() < 1) {
       // avoid returning an empty struct, which is not always supported. instead, drop the partition field (id 102)
       return TypeUtil.selectNot(schema, Sets.newHashSet(102));
     } else {

diff --git a/core/src/main/java/org/apache/iceberg/DataFilesTable.java b/core/src/main/java/org/apache/iceberg/DataFilesTable.java
@@ -27,6 +27,7 @@
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
 import org.apache.iceberg.relocated.com.google.common.collect.Sets;
 import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.types.Types.StructType;
 
 /**
  * A {@link Table} implementation that exposes a table's data files as rows.
@@ -48,8 +49,9 @@ public TableScan newScan() {
 
   @Override
   public Schema schema() {
-    Schema schema = new Schema(DataFile.getType(table().spec().partitionType()).fields());
-    if (table().spec().fields().size() < 1) {
+    StructType partitionType = Partitioning.partitionType(table());
+    Schema schema = new Schema(DataFile.getType(partitionType).fields());
+    if (partitionType.fields().size() < 1) {
       // avoid returning an empty struct, which is not always supported. instead, drop the partition field
       return TypeUtil.selectNot(schema, Sets.newHashSet(DataFile.PARTITION_ID));
     } else {

diff --git a/core/src/main/java/org/apache/iceberg/ManifestEntriesTable.java b/core/src/main/java/org/apache/iceberg/ManifestEntriesTable.java
@@ -29,6 +29,7 @@
 import org.apache.iceberg.relocated.com.google.common.collect.Sets;
 import org.apache.iceberg.types.Type;
 import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.types.Types.StructType;
 import org.apache.iceberg.util.StructProjection;
 
 /**
@@ -54,8 +55,9 @@ public TableScan newScan() {
 
   @Override
   public Schema schema() {
-    Schema schema = ManifestEntry.getSchema(table().spec().partitionType());
-    if (table().spec().fields().size() < 1) {
+    StructType partitionType = Partitioning.partitionType(table());
+    Schema schema = ManifestEntry.getSchema(partitionType);
+    if (partitionType.fields().size() < 1) {
       // avoid returning an empty struct, which is not always supported. instead, drop the partition field (id 102)
       return TypeUtil.selectNot(schema, Sets.newHashSet(102));
     } else {

diff --git a/core/src/main/java/org/apache/iceberg/Partitioning.java b/core/src/main/java/org/apache/iceberg/Partitioning.java
@@ -19,9 +19,18 @@
 
 package org.apache.iceberg;
 
+import java.util.Collections;
+import java.util.Comparator;
 import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import org.apache.iceberg.exceptions.ValidationException;
 import org.apache.iceberg.expressions.Expressions;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
 import org.apache.iceberg.transforms.PartitionSpecVisitor;
+import org.apache.iceberg.transforms.Transforms;
+import org.apache.iceberg.types.Types.NestedField;
+import org.apache.iceberg.types.Types.StructType;
 
 public class Partitioning {
   private Partitioning() {
@@ -177,4 +186,52 @@ public Void alwaysNull(int fieldId, String sourceName, int sourceId) {
       return null;
     }
   }
+
+  /**
+   * Builds a common partition type for all specs in a table.
+   * <p>
+   * Whenever a table has multiple specs, the partition type is a struct containing
+   * all columns that have ever been a part of any spec in the table.
+   * <p>
+   * Field names are taken from the most recent spec that contains the field. Field types are taken
+   * from the most recent spec in which the field has a transform other than the always-null one,
+   * so that a field replaced with a void transform (as done instead of dropping fields in v1 tables)
+   * keeps the result type of its original transform. If a field only ever had the always-null
+   * transform, the type reported by the most recent spec is used.
+   *
+   * @param table a table with one or many specs
+   * @return the constructed common partition type, with fields ordered by field ID
+   * @throws ValidationException if two specs define incompatible fields with the same field ID
+   */
+  public static StructType partitionType(Table table) {
+    Map<Integer, PartitionSpec> specs = table.specs();
+    if (specs.size() == 1) {
+      return table.spec().partitionType();
+    }
+
+    // the partition field that currently defines each field ID
+    Map<Integer, PartitionField> fieldMap = Maps.newHashMap();
+    // the struct field (name and type) chosen so far for each field ID
+    Map<Integer, NestedField> structFieldMap = Maps.newHashMap();
+
+    // sort the spec IDs in descending order to pick up the most recent field names
+    List<Integer> specIds = specs.keySet().stream()
+        .sorted(Collections.reverseOrder())
+        .collect(Collectors.toList());
+
+    for (Integer specId : specIds) {
+      PartitionSpec spec = specs.get(specId);
+      // compute the spec's partition type once instead of once per field
+      StructType specType = spec.partitionType();
+
+      for (PartitionField field : spec.fields()) {
+        int fieldId = field.fieldId();
+        NestedField structField = specType.field(fieldId);
+        PartitionField existingField = fieldMap.get(fieldId);
+
+        if (existingField == null) {
+          fieldMap.put(fieldId, field);
+          structFieldMap.put(fieldId, structField);
+        } else {
+          // verify the fields are compatible as they may conflict in v1 tables
+          ValidationException.check(field.compatibleWith(existingField),
+              "Conflicting partition fields: ['%s', '%s']",
+              field, existingField);
+
+          // a void transform does not carry the result type of the transform it replaced, so take
+          // the type from the older, non-void definition while keeping the most recent name;
+          // tracking the non-void field also makes later specs validate against the real transform
+          if (isVoidTransform(existingField) && !isVoidTransform(field)) {
+            fieldMap.put(fieldId, field);
+            String latestName = structFieldMap.get(fieldId).name();
+            structFieldMap.put(fieldId, NestedField.optional(fieldId, latestName, structField.type()));
+          }
+        }
+      }
+    }
+
+    List<NestedField> sortedStructFields = Lists.newArrayList(structFieldMap.values());
+    sortedStructFields.sort(Comparator.comparingInt(NestedField::fieldId));
+    return StructType.of(sortedStructFields);
+  }
+
+  // Returns true if the partition field uses the always-null (void) transform.
+  private static boolean isVoidTransform(PartitionField field) {
+    return field.transform().equals(Transforms.alwaysNull());
+  }
 }
diff --git a/core/src/test/java/org/apache/iceberg/TestPartitioning.java b/core/src/test/java/org/apache/iceberg/TestPartitioning.java
@@ -0,0 +1,196 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg;
+
+import java.io.File;
+import java.io.IOException;
+import org.apache.iceberg.exceptions.ValidationException;
+import org.apache.iceberg.expressions.Expressions;
+import org.apache.iceberg.types.Types;
+import org.apache.iceberg.types.Types.NestedField;
+import org.apache.iceberg.types.Types.StructType;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.apache.iceberg.types.Types.NestedField.required;
+
+/**
+ * Tests for {@link Partitioning#partitionType(Table)} across partition spec evolution
+ * in v1 and v2 tables.
+ */
+public class TestPartitioning {
+
+  private static final int V1_FORMAT_VERSION = 1;
+  private static final int V2_FORMAT_VERSION = 2;
+  private static final Schema SCHEMA = new Schema(
+      required(1, "id", Types.IntegerType.get()),
+      required(2, "data", Types.StringType.get()),
+      required(3, "category", Types.StringType.get())
+  );
+
+  @Rule
+  public TemporaryFolder temp = new TemporaryFolder();
+  private File tableDir = null;
+
+  @Before
+  public void setupTableDir() throws IOException {
+    this.tableDir = temp.newFolder();
+  }
+
+  @After
+  public void cleanupTables() {
+    TestTables.clearTables();
+  }
+
+  // Creates the test table in the temporary directory with the given spec and format version.
+  private TestTables.TestTable createTable(PartitionSpec spec, int formatVersion) {
+    return TestTables.create(tableDir, "test", SCHEMA, spec, formatVersion);
+  }
+
+  // Creates the test table partitioned by identity("data") with the given format version.
+  private TestTables.TestTable createTablePartitionedByData(int formatVersion) {
+    PartitionSpec dataSpec = PartitionSpec.builderFor(SCHEMA)
+        .identity("data")
+        .build();
+    return createTable(dataSpec, formatVersion);
+  }
+
+  // Asserts that the common partition type of the table consists of exactly the given fields.
+  private static void assertPartitionType(Table table, NestedField... expectedFields) {
+    StructType expected = StructType.of(expectedFields);
+    StructType actual = Partitioning.partitionType(table);
+    Assert.assertEquals("Types must match", expected, actual);
+  }
+
+  @Test
+  public void testPartitionTypeWithSpecEvolutionInV1Tables() {
+    TestTables.TestTable table = createTablePartitionedByData(V1_FORMAT_VERSION);
+
+    table.updateSpec()
+        .addField(Expressions.bucket("category", 8))
+        .commit();
+
+    Assert.assertEquals("Should have 2 specs", 2, table.specs().size());
+
+    assertPartitionType(table,
+        NestedField.optional(1000, "data", Types.StringType.get()),
+        NestedField.optional(1001, "category_bucket_8", Types.IntegerType.get()));
+  }
+
+  @Test
+  public void testPartitionTypeWithSpecEvolutionInV2Tables() {
+    TestTables.TestTable table = createTablePartitionedByData(V2_FORMAT_VERSION);
+
+    table.updateSpec()
+        .removeField("data")
+        .addField("category")
+        .commit();
+
+    Assert.assertEquals("Should have 2 specs", 2, table.specs().size());
+
+    assertPartitionType(table,
+        NestedField.optional(1000, "data", Types.StringType.get()),
+        NestedField.optional(1001, "category", Types.StringType.get()));
+  }
+
+  @Test
+  public void testPartitionTypeWithRenamesInV1Table() {
+    PartitionSpec renamedSpec = PartitionSpec.builderFor(SCHEMA)
+        .identity("data", "p1")
+        .build();
+    TestTables.TestTable table = createTable(renamedSpec, V1_FORMAT_VERSION);
+
+    table.updateSpec()
+        .addField("category")
+        .commit();
+
+    table.updateSpec()
+        .renameField("p1", "p2")
+        .commit();
+
+    assertPartitionType(table,
+        NestedField.optional(1000, "p2", Types.StringType.get()),
+        NestedField.optional(1001, "category", Types.StringType.get()));
+  }
+
+  @Test
+  public void testPartitionTypeWithAddingBackSamePartitionFieldInV1Table() {
+    TestTables.TestTable table = createTablePartitionedByData(V1_FORMAT_VERSION);
+
+    table.updateSpec()
+        .removeField("data")
+        .commit();
+
+    table.updateSpec()
+        .addField("data")
+        .commit();
+
+    // in v1, we use void transforms instead of dropping partition fields
+    assertPartitionType(table,
+        NestedField.optional(1000, "data_1000", Types.StringType.get()),
+        NestedField.optional(1001, "data", Types.StringType.get()));
+  }
+
+  @Test
+  public void testPartitionTypeWithAddingBackSamePartitionFieldInV2Table() {
+    TestTables.TestTable table = createTablePartitionedByData(V2_FORMAT_VERSION);
+
+    table.updateSpec()
+        .removeField("data")
+        .commit();
+
+    table.updateSpec()
+        .addField("data")
+        .commit();
+
+    // in v2, we should be able to reuse the original partition spec
+    assertPartitionType(table,
+        NestedField.optional(1000, "data", Types.StringType.get()));
+  }
+
+  @Test
+  public void testPartitionTypeWithIncompatibleSpecEvolution() {
+    TestTables.TestTable table = createTablePartitionedByData(V1_FORMAT_VERSION);
+
+    PartitionSpec conflictingSpec = PartitionSpec.builderFor(table.schema())
+        .identity("category")
+        .build();
+
+    // commit the new spec directly through the table operations rather than through updateSpec()
+    TableOperations ops = ((HasTableOperations) table).operations();
+    TableMetadata base = ops.current();
+    TableMetadata updated = base.updatePartitionSpec(conflictingSpec);
+    ops.commit(base, updated);
+
+    Assert.assertEquals("Should have 2 specs", 2, table.specs().size());
+
+    AssertHelpers.assertThrows("Should complain about incompatible specs",
+        ValidationException.class, "Conflicting partition fields",
+        () -> Partitioning.partitionType(table));
+  }
+}