apache · rdblue · Feb 23, 2021 · Jan 14, 2021 · Feb 2, 2021 · Feb 4, 2021
diff --git a/api/src/main/java/org/apache/iceberg/Schema.java b/api/src/main/java/org/apache/iceberg/Schema.java
@@ -40,12 +40,17 @@
 
 /**
  * The schema of a data table.
+ * <p>
+ * Schema ID will only be populated when reading from/writing to table metadata,
+ * otherwise it will be default to 0.
  */
 public class Schema implements Serializable {
   private static final Joiner NEWLINE = Joiner.on('\n');
   private static final String ALL_COLUMNS = "*";
+  private static final int DEFAULT_SCHEMA_ID = 0;
 
   private final StructType struct;
+  private final int schemaId;
   private transient BiMap<String, Integer> aliasToId = null;
   private transient Map<Integer, NestedField> idToField = null;
   private transient Map<String, Integer> nameToId = null;
@@ -54,6 +59,7 @@ public class Schema implements Serializable {
   private transient Map<Integer, String> idToName = null;
 
   public Schema(List<NestedField> columns, Map<String, Integer> aliases) {
+    this.schemaId = DEFAULT_SCHEMA_ID;
     this.struct = StructType.of(columns);
     this.aliasToId = aliases != null ? ImmutableBiMap.copyOf(aliases) : null;
 
@@ -62,12 +68,21 @@ public Schema(List<NestedField> columns, Map<String, Integer> aliases) {
   }
 
   public Schema(List<NestedField> columns) {
+    this(DEFAULT_SCHEMA_ID, columns);
+  }
+
+  public Schema(int schemaId, List<NestedField> columns) {
+    this.schemaId = schemaId;
     this.struct = StructType.of(columns);
     lazyIdToName();
   }
 
   public Schema(NestedField... columns) {
-    this(Arrays.asList(columns));
+    this(DEFAULT_SCHEMA_ID, Arrays.asList(columns));
+  }
+
+  public Schema(int schemaId, NestedField... columns) {
+    this(schemaId, Arrays.asList(columns));
   }
 
   private Map<Integer, NestedField> lazyIdToField() {
@@ -105,6 +120,16 @@ private Map<Integer, Accessor<StructLike>> lazyIdToAccessor() {
     return idToAccessor;
   }
 
+  /**
+   * Returns the schema ID for this schema.
+   * <p>
+   * Note that schema ID will only be populated when reading from/writing to table metadata,
+   * otherwise it will be default to 0.
+   */
+  public int schemaId() {
+    return this.schemaId;
+  }
+
   /**
    * Returns an alias map for this schema, if set.
    * <p>

diff --git a/api/src/main/java/org/apache/iceberg/types/TypeUtil.java b/api/src/main/java/org/apache/iceberg/types/TypeUtil.java
@@ -157,6 +157,21 @@ public static Schema assignFreshIds(Schema schema, NextID nextId) {
         .fields());
   }
 
+  /**
+   * Assigns fresh ids from the {@link NextID nextId function} for all fields in a schema.
+   *
+   * @param schemaId an ID assigned to this schema
+   * @param schema a schema
+   * @param nextId an id assignment function
+   * @return a structurally identical schema with new ids assigned by the nextId function
+   */
+  public static Schema assignFreshIds(int schemaId, Schema schema, NextID nextId) {
+    return new Schema(schemaId, TypeUtil
+        .visit(schema.asStruct(), new AssignFreshIds(nextId))
+        .asNestedType()
+        .fields());
+  }
+
   /**
    * Assigns ids to match a given schema, and fresh ids from the {@link NextID nextId function} for all other fields.
    *

diff --git a/api/src/test/java/org/apache/iceberg/TestHelpers.java b/api/src/test/java/org/apache/iceberg/TestHelpers.java
@@ -28,6 +28,7 @@
 import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
+import java.util.stream.IntStream;
 import org.apache.iceberg.expressions.BoundPredicate;
 import org.apache.iceberg.expressions.BoundSetPredicate;
 import org.apache.iceberg.expressions.Expression;
@@ -84,6 +85,23 @@ public static <T> T roundTripSerialize(T type) throws IOException, ClassNotFound
     }
   }
 
+  public static void assertSameSchemaList(List<Schema> list1, List<Schema> list2) {
+    if (list1.size() != list2.size()) {
+      Assert.fail("Should have same number of schemas in both lists");
+    }
+
+    IntStream.range(0, list1.size()).forEach(
+        index -> {
+          Schema schema1 = list1.get(index);
+          Schema schema2 = list2.get(index);
+          Assert.assertEquals("Should have matching schema id",
+              schema1.schemaId(), schema2.schemaId());
+          Assert.assertEquals("Should have matching schema struct",
+              schema1.asStruct(), schema2.asStruct());
+        }
+    );
+  }
+
   private static class CheckReferencesBound extends ExpressionVisitors.ExpressionVisitor<Void> {
     private final String message;
 

diff --git a/core/src/main/java/org/apache/iceberg/SchemaParser.java b/core/src/main/java/org/apache/iceberg/SchemaParser.java
@@ -39,6 +39,7 @@ public class SchemaParser {
   private SchemaParser() {
   }
 
+  private static final String SCHEMA_ID = "schema-id";
   private static final String TYPE = "type";
   private static final String STRUCT = "struct";
   private static final String LIST = "list";
@@ -57,10 +58,18 @@ private SchemaParser() {
   private static final String ELEMENT_REQUIRED = "element-required";
   private static final String VALUE_REQUIRED = "value-required";
 
-  static void toJson(Types.StructType struct, JsonGenerator generator) throws IOException {
+  private static void toJson(Types.StructType struct, JsonGenerator generator) throws IOException {
+    toJson(struct, null, generator);
+  }
+
+  private static void toJson(Types.StructType struct, Integer schemaId, JsonGenerator generator) throws IOException {
     generator.writeStartObject();
 
     generator.writeStringField(TYPE, STRUCT);
+    if (schemaId != null) {
+      generator.writeNumberField(SCHEMA_ID, schemaId);
+    }
+
     generator.writeArrayFieldStart(FIELDS);
     for (Types.NestedField field : struct.fields()) {
       generator.writeStartObject();
@@ -135,7 +144,7 @@ static void toJson(Type type, JsonGenerator generator) throws IOException {
   }
 
   public static void toJson(Schema schema, JsonGenerator generator) throws IOException {
-    toJson(schema.asStruct(), generator);
+    toJson(schema.asStruct(), schema.schemaId(), generator);
   }
 
   public static String toJson(Schema schema) {
@@ -237,7 +246,13 @@ public static Schema fromJson(JsonNode json) {
     Type type  = typeFromJson(json);
     Preconditions.checkArgument(type.isNestedType() && type.asNestedType().isStructType(),
         "Cannot create schema, not a struct type: %s", type);
-    return new Schema(type.asNestedType().asStructType().fields());
+    Integer schemaId = JsonUtil.getIntOrNull(SCHEMA_ID, json);
+
+    if (schemaId == null) {
+      return new Schema(type.asNestedType().asStructType().fields());
+    } else {
+      return new Schema(schemaId, type.asNestedType().asStructType().fields());
+    }
   }
 
   private static final Cache<String, Schema> SCHEMA_CACHE = Caffeine.newBuilder()