21 changes: 21 additions & 0 deletions core/src/main/java/org/apache/iceberg/avro/AvroSchemaUtil.java
@@ -154,6 +154,27 @@ public static boolean isOptionSchema(Schema schema) {
return false;
}

/**
 * Returns whether the given schema is a union that is both complex and optional.
 *
 * Complex union: a union with more than two options
 * Optional: null is one of the options
 *
 * @param schema input schema
 * @return true if the schema is a complex union that includes null
 */
public static boolean isOptionalComplexUnion(Schema schema) {
if (schema.getType() == UNION && schema.getTypes().size() > 2) {
for (Schema type : schema.getTypes()) {
if (type.getType() == Schema.Type.NULL) {
return true;
}
}
}

return false;
}

static Schema toOption(Schema schema) {
if (schema.getType() == UNION) {
Preconditions.checkArgument(isOptionSchema(schema),
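The classification implemented by `isOptionalComplexUnion` can be sketched with plain Java, using branch type names as a hypothetical stand-in for Avro `Schema` objects (the real method walks `schema.getTypes()` and checks `Schema.Type.NULL`):

```java
import java.util.List;

public class OptionalComplexUnionSketch {
  // A union is "complex" when it has more than two branches, and "optional"
  // when one of those branches is null. ["null", "int"] is a plain option
  // schema; ["null", "int", "string"] is an optional complex union.
  static boolean isOptionalComplexUnion(List<String> branchTypes) {
    return branchTypes.size() > 2 && branchTypes.contains("null");
  }

  public static void main(String[] args) {
    System.out.println(isOptionalComplexUnion(List.of("null", "int", "string"))); // true
    System.out.println(isOptionalComplexUnion(List.of("null", "int")));           // false: simple option
    System.out.println(isOptionalComplexUnion(List.of("int", "string", "float"))); // false: no null branch
  }
}
```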
17 changes: 15 additions & 2 deletions core/src/main/java/org/apache/iceberg/avro/AvroSchemaVisitor.java
@@ -52,8 +52,21 @@ public static <T> T visit(Schema schema, AvroSchemaVisitor<T> visitor) {
case UNION:
List<Schema> types = schema.getTypes();
List<T> options = Lists.newArrayListWithExpectedSize(types.size());
if (AvroSchemaUtil.isOptionSchema(schema)) {
for (Schema type : types) {
options.add(visit(type, visitor));
}
} else {
// complex union case
int nonNullIdx = 0;
for (Schema type : types) {
if (type.getType() != Schema.Type.NULL) {
options.add(visitWithName("field" + nonNullIdx, type, visitor));
nonNullIdx += 1;
} else {
options.add(visit(type, visitor));
Contributor: Why not visit with the field name?

}
}
}
return visitor.union(schema, options);

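The branch-naming scheme in the complex-union case above can be sketched in isolation — strings stand in for Avro schemas here; the actual visitor calls `visitWithName` for non-null branches:

```java
import java.util.ArrayList;
import java.util.List;

public class BranchNaming {
  // Non-null branches are numbered field0, field1, ... in order of appearance;
  // the null branch is visited without a name.
  static List<String> nameBranches(List<String> branchTypes) {
    List<String> names = new ArrayList<>();
    int nonNullIdx = 0;
    for (String type : branchTypes) {
      if (!"null".equals(type)) {
        names.add("field" + nonNullIdx++);
      } else {
        names.add("(unnamed null)");
      }
    }
    return names;
  }

  public static void main(String[] args) {
    System.out.println(nameBranches(List.of("int", "null", "string")));
    // [field0, (unnamed null), field1]
  }
}
```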
@@ -79,11 +79,30 @@ private static <T> T visitRecord(Types.StructType struct, Schema record, AvroSchemaWithTypeVisitor<T> visitor) {
private static <T> T visitUnion(Type type, Schema union, AvroSchemaWithTypeVisitor<T> visitor) {
List<Schema> types = union.getTypes();
List<T> options = Lists.newArrayListWithExpectedSize(types.size());

// simple union case
if (AvroSchemaUtil.isOptionSchema(union)) {
for (Schema branch : types) {
if (branch.getType() == Schema.Type.NULL) {
options.add(visit((Type) null, branch, visitor));
} else {
options.add(visit(type, branch, visitor));
}
}
} else { // complex union case
Preconditions.checkArgument(type instanceof Types.StructType,
"Cannot visit invalid Iceberg type: %s for Avro complex union type: %s", type, union);

Contributor @rdblue (May 11, 2022):

I think this should also check whether the schema with type visitor has the tag field. There's no guarantee that it does.

Along the same lines, what happens if the struct is projected or out of order? I'd prefer to look up the struct field for each option in the union by field ID, just like we do with struct fields. For a struct field, we get the field ID from the Avro schema and use that to find the corresponding field in the Iceberg struct.

If you end up using field IDs, I think the challenge is getting those field IDs in the Avro schema. I'm assuming that you're using NameMapping to work with the incoming Avro schemas, right? Can NameMapping be updated to map union fields?

If not, I think you'd want to align fields by using the field name from the Iceberg struct. For example, field1 would be the second branch (getTypes().get(1)) in the union.

Contributor:

How about aligning by the type? field_i of type x aligns to the option of type x, regardless of the order? Else, we can mandate that the struct is in the same order as the options order (and the types match), and throw an exception here if not. I think both require recursively visiting the types to check for equality, but should be doable. The latter is kind of implemented here already, but I guess it will fail when trying to match the children as opposed to failing when trying to match the union itself.

Contributor:

This PR is relevant too (still internal but will be brought upstream soon): linkedin/iceberg#108

Contributor:

> How about aligning by the type?

I think that's what we will need to do at some point, but this visitor assumes that both schemas have field IDs. I think for this, the right way to handle it is to get the field ID from the union type. It would mean rewriting the Avro schema ahead of time to look like this:

[
  "null",
  {"type": "int", "field-id": 34},
  {"type": "string", "field-id": 35}
]

That's why I'm wondering about how to attach the field IDs in the name mapping. In the name mapping, we could allow a nested level to represent the union. Names in that level could be types rather than names, so the mapping to produce the union above would be [ { "field-id": 34, "names": ["int"] }, { "field-id": 35, "names": ["string"] } ]. That works for simple types. For record, map, and array types we can use the simple type name as well, "record" or "map" or "array". That would support any union with just one option of each nested type. If you had more than one map in the union, it would fail. I think that's a reasonable starting place, though.

@wmoustafa, what do you think?

Contributor:

What about the second option, where we expect them to be in the same order? (see this PR to support missing fields in the case of projection pruning)? This approach will also make sure that the deep types also match.
For the above suggestion, I am a bit worried about using only the top level types since it will fail in unexpected places and could lead to cryptic error messages if things go wrong. Also, can we list the whole type hierarchy instead?

That said, does it need to be retrofit into name mapping? I feel we could implement it without extending the name mapping as long as we have functions to deeply compare types.

I think as a starting point, we could assume they must align in terms of order (but do not necessarily be the same, as we could skip some in the struct), and later we can implement deep type comparisons, which will relax the ordering expectation (which is also forward compatible with the type-based check).

Contributor:

> I am a bit worried about using only the top level types since it will fail in unexpected places and could lead to cryptic error messages if things go wrong

We can fail gracefully. For example, if there are two records, we throw an exception in name mapping that multiple records aren't supported. If you think this is a common case, we can go further to find a solution for arbitrary nested types. That isn't too hard, actually. We could do it based on the child field set, which would have to match or at least have some overlap.

I don't think that doing this by order is a good idea. That could easily lead to worse cases where we're returning the wrong data.

Contributor:

> I don't think that doing this by order is a good idea. That could easily lead to worse cases where we're returning the wrong data.

Is the concern because of an out-of-order schema (e.g., the reader schema or the expected schema)? @yiqiangin has tried both cases and an out of order reader schema throws an exception and an out of order expected schema still returns correct results (both after applying this patch to address missing fields/projection pruning, so we may need to take that into account).

Contributor:

The problem is that there can be identical branches in a union and order-based resolution could do this incorrectly. The name mapping approach allows this to be entirely by ID here, and the name mapping could do deeper validation because it has child field IDs for all nested types.


List<Types.NestedField> fields = type.asStructType().fields();
// start index from 1 because 0 is the tag field which doesn't exist in the original Avro schema
int index = 1;
for (Schema branch : types) {
if (branch.getType() == Schema.Type.NULL) {
options.add(visit((Type) null, branch, visitor));
} else {
options.add(visit(fields.get(index).type(), branch, visitor));
index += 1;
}
}
}
return visitor.union(type, union, options);
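The alignment rule in `visitUnion` — the i-th non-null branch reads into struct field i+1, skipping the synthetic tag field — can be sketched standalone (strings stand in for Avro schemas and Iceberg types):

```java
import java.util.ArrayList;
import java.util.List;

public class BranchAlignment {
  // Pair each non-null union branch with the struct field it reads into.
  // Field 0 of the struct is the synthetic "tag" field, so the first non-null
  // branch aligns with field 1, the second with field 2, and so on.
  static List<String> align(List<String> branchTypes, List<String> structFields) {
    List<String> pairs = new ArrayList<>();
    int index = 1; // skip the tag field
    for (String branch : branchTypes) {
      if ("null".equals(branch)) {
        pairs.add("null -> (none)");
      } else {
        pairs.add(branch + " -> " + structFields.get(index++));
      }
    }
    return pairs;
  }

  public static void main(String[] args) {
    List<String> struct = List.of("tag", "field0", "field1");
    System.out.println(align(List.of("null", "int", "string"), struct));
    // [null -> (none), int -> field0, string -> field1]
  }
}
```

As the review thread below discusses, this positional alignment assumes the struct fields appear in the same order as the union branches; aligning by field ID or by type would relax that assumption.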
@@ -19,6 +19,7 @@

package org.apache.iceberg.avro;

import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
@@ -154,16 +155,48 @@ public Schema.Field field(Schema.Field field, Supplier<Schema> fieldResult) {

@Override
public Schema union(Schema union, Iterable<Schema> options) {
if (AvroSchemaUtil.isOptionSchema(union)) {

Contributor:

Should this be using the AvroSchemaWithType visitor? I don't think that was written when this was added, but I like the idea of aligning the types with the right visitor, rather than having so much logic here.

Author:

Are you suggesting refactoring BuildAvroProjection to use AvroSchemaWithTypeVisitor instead of AvroCustomOrderSchemaVisitor? If so, I agree. It will make projection implementation simpler. Do you think this can be done as a separate PR as it involves re-implementing logic out of scope of union support?

Schema nonNullOriginal = AvroSchemaUtil.fromOption(union);
Schema nonNullResult = AvroSchemaUtil.fromOptions(Lists.newArrayList(options));

if (!Objects.equals(nonNullOriginal, nonNullResult)) {
return AvroSchemaUtil.toOption(nonNullResult);
}

return union;
} else { // Complex union
Preconditions.checkArgument(current instanceof Types.StructType,
"Incompatible projected type: %s for Avro complex union type: %s", current, union);

Types.StructType asStructType = current.asStructType();

long nonNullBranchesCount = union.getTypes().stream()
.filter(branch -> branch.getType() != Schema.Type.NULL).count();
Preconditions.checkState(asStructType.fields().size() > nonNullBranchesCount,
"Column projection on struct converted from Avro complex union type: %s is not supported", union);

Iterator<Schema> resultBranchIterator = options.iterator();

// start index at 1 because field 0 is the tag field, which doesn't exist in the original Avro schema
int index = 1;
List<Schema> resultBranches = Lists.newArrayListWithExpectedSize(union.getTypes().size());

try {
for (Schema originalBranch : union.getTypes()) {
if (originalBranch.getType() == Schema.Type.NULL) {
resultBranches.add(resultBranchIterator.next());
} else {
this.current = asStructType.fields().get(index).type();
resultBranches.add(resultBranchIterator.next());
index += 1;
}
}

return Schema.createUnion(resultBranches);
} finally {
this.current = asStructType;
}
}
}

@Override
49 changes: 33 additions & 16 deletions core/src/main/java/org/apache/iceberg/avro/PruneColumns.java
@@ -119,25 +119,27 @@ public Schema record(Schema record, List<String> names, List<Schema> fields) {

@Override
public Schema union(Schema union, List<Schema> options) {
if (AvroSchemaUtil.isOptionSchema(union)) {
// case option union
Schema pruned = null;
if (options.get(0) != null) {
pruned = options.get(0);
} else if (options.get(1) != null) {
pruned = options.get(1);
}

if (pruned != null) {
if (!Objects.equals(pruned, AvroSchemaUtil.fromOption(union))) {
return AvroSchemaUtil.toOption(pruned);
}
return union;
}

return null;
} else {
// Complex union case
return copyUnion(union, options);
}
}

@Override
@@ -323,4 +325,19 @@ private static Schema.Field copyField(Schema.Field field, Schema newSchema, Inte
private static boolean isOptionSchemaWithNonNullFirstOption(Schema schema) {
return AvroSchemaUtil.isOptionSchema(schema) && schema.getTypes().get(0).getType() != Schema.Type.NULL;
}

// For primitive types the visit result is null, so we reuse the primitive type from the original
// schema; for nested types we use the visit result, because it carries content from the
// recursive calls.
private static Schema copyUnion(Schema record, List<Schema> visitResults) {
List<Schema> branches = Lists.newArrayListWithExpectedSize(visitResults.size());
for (int i = 0; i < visitResults.size(); i++) {
if (visitResults.get(i) == null) {
branches.add(record.getTypes().get(i));
} else {
branches.add(visitResults.get(i));
}
}
return Schema.createUnion(branches);
}
}
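The element-wise fallback in `copyUnion` can be sketched standalone, with lists of strings standing in for Avro schemas:

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class CopyUnionSketch {
  // Prefer the visit result when present; fall back to the original branch
  // when the visitor returned null (the primitive-type case noted above).
  static List<String> copyUnion(List<String> originalBranches, List<String> visitResults) {
    List<String> branches = new ArrayList<>();
    for (int i = 0; i < visitResults.size(); i++) {
      branches.add(visitResults.get(i) != null ? visitResults.get(i) : originalBranches.get(i));
    }
    return branches;
  }

  public static void main(String[] args) {
    List<String> original = Arrays.asList("int", "record-old", "string");
    List<String> visited = Arrays.asList(null, "record-pruned", null); // nulls for primitives
    System.out.println(copyUnion(original, visited));
    // [int, record-pruned, string]
  }
}
```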
28 changes: 21 additions & 7 deletions core/src/main/java/org/apache/iceberg/avro/SchemaToType.java
@@ -93,7 +93,7 @@ public Type record(Schema record, List<String> names, List<Type> fieldTypes) {
Type fieldType = fieldTypes.get(i);
int fieldId = getId(field);

if (AvroSchemaUtil.isOptionSchema(field.schema()) || AvroSchemaUtil.isOptionalComplexUnion(field.schema())) {
newFields.add(Types.NestedField.optional(fieldId, field.name(), fieldType, field.doc()));
} else {
newFields.add(Types.NestedField.required(fieldId, field.name(), fieldType, field.doc()));
@@ -105,13 +105,27 @@ public Type record(Schema record, List<String> names, List<Type> fieldTypes) {

@Override
public Type union(Schema union, List<Type> options) {
if (AvroSchemaUtil.isOptionSchema(union)) {
// Optional simple union
// records, arrays, and maps will check nullability later
if (options.get(0) == null) {
return options.get(1);
} else {
return options.get(0);
}
} else {
// Complex union
List<Types.NestedField> newFields = Lists.newArrayList();
newFields.add(Types.NestedField.required(allocateId(), "tag", Types.IntegerType.get()));

int tagIndex = 0;
for (Type type : options) {
if (type != null) {
newFields.add(Types.NestedField.optional(allocateId(), "field" + tagIndex++, type));
}
}

return Types.StructType.of(newFields);
}
}
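For example, the complex union `["null", "int", "string"]` converts to the Iceberg struct `struct<tag: int required, field0: int optional, field1: string optional>`. A stdlib-only sketch of the mapping in `union` above, with branch type names standing in for the real schema and type objects:

```java
import java.util.ArrayList;
import java.util.List;

public class ComplexUnionToStruct {
  // Mirrors the complex-union branch of SchemaToType#union: a required tag
  // field followed by one optional fieldN per non-null branch, in order.
  static List<String> toStructFields(List<String> branchTypes) {
    List<String> fields = new ArrayList<>();
    fields.add("tag: int required");
    int tagIndex = 0;
    for (String type : branchTypes) {
      if (!"null".equals(type)) {
        fields.add("field" + tagIndex++ + ": " + type + " optional");
      }
    }
    return fields;
  }

  public static void main(String[] args) {
    System.out.println(toStructFields(List.of("null", "int", "string")));
    // [tag: int required, field0: int optional, field1: string optional]
  }
}
```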

@@ -256,4 +256,121 @@ public void projectMapWithLessFieldInValueSchema() {
assertEquals("Unexpected value ID discovered on the projected map schema",
1, Integer.valueOf(actual.getProp(AvroSchemaUtil.VALUE_ID_PROP)).intValue());
}

@Test
public void projectUnionWithBranchSchemaUnchanged() {

final Type icebergType = Types.StructType.of(
Types.NestedField.required(0, "tag", Types.IntegerType.get()),
Types.NestedField.optional(1, "field0", Types.IntegerType.get()),
Types.NestedField.optional(2, "field1", Types.StringType.get())
);

final org.apache.avro.Schema expected = SchemaBuilder.unionOf()
.intType()
.and()
.stringType()
.endUnion();

final BuildAvroProjection testSubject = new BuildAvroProjection(icebergType, Collections.emptyMap());

final Iterable<org.apache.avro.Schema> branches = expected.getTypes();

final org.apache.avro.Schema actual = testSubject.union(expected, branches);

assertEquals("Union projection produced undesired union schema",
expected, actual);
}

@Test
public void projectUnionWithTypePromotion() {

final Type icebergType = Types.StructType.of(
Types.NestedField.required(0, "tag", Types.IntegerType.get()),
Types.NestedField.optional(1, "field0", Types.LongType.get()),
Types.NestedField.optional(2, "field1", Types.StringType.get())
);

final org.apache.avro.Schema originalSchema = SchemaBuilder.unionOf()
.intType()
.and()
.stringType()
.endUnion();

// once projected onto iceberg schema, first branch of Avro union schema will be promoted from int to long
final org.apache.avro.Schema expected = SchemaBuilder.unionOf()
.longType()
.and()
.stringType()
.endUnion();

final BuildAvroProjection testSubject = new BuildAvroProjection(icebergType, Collections.emptyMap());

final Iterable<org.apache.avro.Schema> branches = expected.getTypes();

final org.apache.avro.Schema actual = testSubject.union(originalSchema, branches);

assertEquals("Union projection produced undesired union schema",
expected, actual);
}

@Test
public void projectUnionWithExtraFieldInNestedType() {

final Type icebergType = Types.StructType.of(
Types.NestedField.required(0, "tag", Types.IntegerType.get()),
Types.NestedField.optional(1, "field0", Types.StringType.get()),
Types.NestedField.optional(2, "field1", Types.StructType.of(
Types.NestedField.optional(3, "c1", Types.IntegerType.get()),
Types.NestedField.optional(4, "c2", Types.StringType.get()),
Types.NestedField.optional(5, "c3", Types.StringType.get())
))
);

final org.apache.avro.Schema originalSchema = SchemaBuilder.unionOf()
.stringType()
.and()
.record("r")
.fields()
.name("c1")
.type()
.intType()
.noDefault()
.name("c2")
.type()
.stringType()
.noDefault()
.endRecord()
.endUnion();

// once projected onto iceberg schema, the avro schema will have an extra string column in struct within union
final org.apache.avro.Schema expected = SchemaBuilder.unionOf()
.stringType()
.and()
.record("r")
.fields()
.name("c1")
.type()
.intType()
.noDefault()
.name("c2")
.type()
.stringType()
.noDefault()
.name("c3")
.type()
.stringType()
.noDefault()
.endRecord()
.endUnion();

final BuildAvroProjection testSubject = new BuildAvroProjection(icebergType, Collections.emptyMap());

final Iterable<org.apache.avro.Schema> branches = expected.getTypes();

final org.apache.avro.Schema actual = testSubject.union(originalSchema, branches);

assertEquals("Union projection produced undesired union schema",
expected, actual);
}
}