apache · sudssf · May 18, 2020 · May 19, 2020 · May 21, 2020 · May 24, 2020
diff --git a/api/src/main/java/org/apache/iceberg/types/Types.java b/api/src/main/java/org/apache/iceberg/types/Types.java
@@ -514,28 +514,37 @@ public int hashCode() {
   public static class StructType extends NestedType {
     private static final Joiner FIELD_SEP = Joiner.on(", ");
 
+    public static StructType of(boolean isUnion, NestedField... fields) {
+      return new StructType(Arrays.asList(fields), isUnion);  // isUnion is stored as convertedFromUnionSchema
+    }
+
     public static StructType of(NestedField... fields) {
       return of(Arrays.asList(fields));
     }
 
     public static StructType of(List<NestedField> fields) {
-      return new StructType(fields);
+      return of(fields, false);  // plain struct: delegate with convertedFromUnionSchema = false
     }
 
-    private final NestedField[] fields;
+    public static StructType of(List<NestedField> fields, boolean convertedFromUnionSchema) {
+      return new StructType(fields, convertedFromUnionSchema);  // flag exposed by isConvertedFromUnionSchema()
+    }
 
+    private final NestedField[] fields;
+    private final boolean convertedFromUnionSchema;
     // lazy values
     private transient List<NestedField> fieldList = null;
     private transient Map<String, NestedField> fieldsByName = null;
     private transient Map<String, NestedField> fieldsByLowerCaseName = null;
     private transient Map<Integer, NestedField> fieldsById = null;
 
-    private StructType(List<NestedField> fields) {
+    private StructType(List<NestedField> fields, boolean convertedFromUnionSchema) {
       Preconditions.checkNotNull(fields, "Field list cannot be null");
       this.fields = new NestedField[fields.size()];
       for (int i = 0; i < this.fields.length; i += 1) {
         this.fields[i] = fields.get(i);
       }
+      this.convertedFromUnionSchema = convertedFromUnionSchema;  // NOTE(review): confirm equals/hashCode cover it
     }
 
     @Override
@@ -641,6 +650,13 @@ private Map<Integer, NestedField> lazyFieldsById() {
       }
       return fieldsById;
     }
+
+    /**
+     * @return the flag given at creation: true if this struct stands in for a union schema converted to a struct
+     */
+    public boolean isConvertedFromUnionSchema() {
+      return convertedFromUnionSchema;
+    }
   }
 
   public static class ListType extends NestedType {

diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroSchemaUtil.java b/core/src/main/java/org/apache/iceberg/avro/AvroSchemaUtil.java
@@ -47,6 +47,7 @@ private AvroSchemaUtil() {}
   public static final String VALUE_ID_PROP = "value-id";
   public static final String ELEMENT_ID_PROP = "element-id";
   public static final String ADJUST_TO_UTC_PROP = "adjust-to-utc";
+  public static final String UNION_SCHEMA_TO_RECORD = "union-schema-to-record";
 
   private static final Schema NULL = Schema.create(Schema.Type.NULL);
   private static final Schema.Type MAP = Schema.Type.MAP;
@@ -120,10 +121,10 @@ public static boolean isTimestamptz(Schema schema) {
   }
 
   public static boolean isOptionSchema(Schema schema) {
-    if (schema.getType() == UNION && schema.getTypes().size() == 2) {
+    if (schema.getType() == UNION && schema.getTypes().size() >= 2) {
       if (schema.getTypes().get(0).getType() == Schema.Type.NULL) {
         return true;
-      } else if (schema.getTypes().get(1).getType() == Schema.Type.NULL) {
+      } else if (schema.getTypes().size() == 2 && schema.getTypes().get(1).getType() == Schema.Type.NULL) {
         return true;
       }
     }

diff --git a/core/src/main/java/org/apache/iceberg/avro/GenericAvroWriter.java b/core/src/main/java/org/apache/iceberg/avro/GenericAvroWriter.java
@@ -21,10 +21,13 @@
 
 import com.google.common.base.Preconditions;
 import java.io.IOException;
+import java.lang.reflect.Array;
 import java.util.List;
+import java.util.stream.Collectors;
 import org.apache.avro.LogicalType;
 import org.apache.avro.LogicalTypes;
 import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericData;
 import org.apache.avro.io.DatumWriter;
 import org.apache.avro.io.Encoder;
 
@@ -52,7 +55,12 @@ private WriteBuilder() {
 
     @Override
     public ValueWriter<?> record(Schema record, List<String> names, List<ValueWriter<?>> fields) {
-      return ValueWriters.record(fields);
+      // Boolean.TRUE.equals (not a cast) so a missing or non-Boolean property selects the plain record writer
+      if (Boolean.TRUE.equals(record.getObjectProp(AvroSchemaUtil.UNION_SCHEMA_TO_RECORD))) {
+        return new UnionSchemaWriter<>(record, fields);
+      } else {
+        return ValueWriters.record(fields);
+      }
     }
 
     @Override
@@ -133,4 +141,38 @@ public ValueWriter<?> primitive(Schema primitive) {
       }
     }
   }
+
+  /**
+   * Writer for a record flagged as converted from a union: resolves which union branch a datum belongs
+   * to and writes the datum through that branch's field writer and null through every other writer.
+   */
+  public static class UnionSchemaWriter<V extends Object> implements ValueWriter<V> {
+    private final Schema schema; // union rebuilt from the record's fields, used to resolve a datum's branch
+    private final ValueWriter<Object>[] writers; // field writers, in the order they were passed in
+
+    @SuppressWarnings("unchecked")
+    protected UnionSchemaWriter(Schema schema, List<ValueWriter<?>> writers) {
+      // getTypes() is called on every field schema, so each field is expected to be a union
+      this.schema = Schema.createUnion(schema.getFields().stream()
+          .flatMap(field -> field.schema().getTypes().stream())
+          .filter(branch -> branch.getType() != Schema.Type.NULL) // only process non-null types
+          .collect(Collectors.toList()));
+      this.writers = (ValueWriter<Object>[]) Array.newInstance(ValueWriter.class, writers.size());
+      for (int pos = 0; pos < this.writers.length; pos += 1) {
+        this.writers[pos] = (ValueWriter<Object>) writers.get(pos);
+      }
+    }
+
+    public ValueWriter<?> writer(int pos) {
+      return writers[pos];
+    }
+
+    @Override
+    public void write(V row, Encoder encoder) throws IOException {
+      int selected = GenericData.get().resolveUnion(schema, row);
+      for (int pos = 0; pos < writers.length; pos += 1) {
+        writers[pos].write(pos == selected ? row : null, encoder);
+      }
+    }
+  }
 }
diff --git a/core/src/main/java/org/apache/iceberg/avro/SchemaToType.java b/core/src/main/java/org/apache/iceberg/avro/SchemaToType.java
@@ -106,11 +106,27 @@ public Type record(Schema record, List<String> names, List<Type> fieldTypes) {
   public Type union(Schema union, List<Type> options) {
     Preconditions.checkArgument(AvroSchemaUtil.isOptionSchema(union),
         "Unsupported type: non-option union: %s", union);
-    // records, arrays, and maps will check nullability later
-    if (options.get(0) == null) {
-      return options.get(1);
-    } else {
+    if (options.size() == 2) {
+      // plain option (null plus one type): unwrap it to the non-null branch's type
+      if (options.get(0) == null) {
+        return options.get(1);
+      }
       return options.get(0);
+    } else {
+      // The isOptionSchema check above only passes unions with two or more types, so this is a complex
+      // union. Convert it to a struct with one optional field per non-null option. A field is named
+      // "member" + the option's position in the union, so a skipped null option still uses up an index.
+      // NOTE(review): Spark's Avro SchemaConverters seems to number members after dropping null; confirm
+      List<Types.NestedField> fields = Lists.newArrayListWithExpectedSize(options.size());
+      for (int i = 0; i < options.size(); i += 1) {
+        Type fieldType = options.get(i);
+        if (fieldType == null) {
+          continue;
+        }
+        // All fields are optional because only one of them is set at a time
+        fields.add(Types.NestedField.optional(allocateId(), "member" + i, fieldType));
+      }
+      return Types.StructType.of(fields, true);
     }
   }
 

diff --git a/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java b/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java
@@ -112,7 +112,9 @@ public Schema struct(Types.StructType struct, List<Schema> fieldSchemas) {
     }
 
     recordSchema = Schema.createRecord(recordName, null, null, false, fields);
-
+    if (struct.isConvertedFromUnionSchema()) {
+      recordSchema.addProp(AvroSchemaUtil.UNION_SCHEMA_TO_RECORD, true);
+    }
     results.put(struct, recordSchema);
 
     return recordSchema;
@@ -160,7 +162,6 @@ public Schema map(Types.MapType map, Schema keySchema, Schema valueSchema) {
           map.isValueOptional() ? AvroSchemaUtil.toOption(valueSchema) : valueSchema);
       mapSchema.addProp(AvroSchemaUtil.KEY_ID_PROP, map.keyId());
       mapSchema.addProp(AvroSchemaUtil.VALUE_ID_PROP, map.valueId());
-
     } else {
       mapSchema = AvroSchemaUtil.createMap(map.keyId(), keySchema,
           map.valueId(), map.isValueOptional() ? AvroSchemaUtil.toOption(valueSchema) : valueSchema);

diff --git a/core/src/test/java/org/apache/iceberg/avro/AvroDataUnionRecordTest.java b/core/src/test/java/org/apache/iceberg/avro/AvroDataUnionRecordTest.java
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.avro;
+
+import com.google.common.collect.Lists;
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericRecordBuilder;
+import org.apache.iceberg.Files;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.io.FileAppender;
+import org.junit.Assert;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+public class AvroDataUnionRecordTest {
+
+  @Rule
+  public TemporaryFolder temp = new TemporaryFolder();
+
+  /**
+   * Writes {@code actualWrite} to a temporary Avro file using {@code icebergSchema}, reads the file
+   * back projecting the same schema, and compares the rows with {@code expectedRead}.
+   */
+  protected void writeAndValidate(List<GenericData.Record> actualWrite, List<GenericData.Record> expectedRead,
+      Schema icebergSchema) throws IOException {
+    File testFile = temp.newFile();
+    Assert.assertTrue("Delete should succeed", testFile.delete());
+
+    try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(testFile))
+        .schema(icebergSchema).named("test").build()) {
+      for (GenericData.Record rec : actualWrite) {
+        writer.add(rec);
+      }
+    }
+
+    List<GenericData.Record> rows;
+    try (AvroIterable<GenericData.Record> reader = Avro.read(Files.localInput(testFile))
+        .project(icebergSchema).build()) {
+      rows = Lists.newArrayList(reader);
+    }
+    // fail with a clear message instead of an IndexOutOfBoundsException, and catch extra rows too
+    Assert.assertEquals("Number of rows read should match", expectedRead.size(), rows.size());
+    for (int i = 0; i < expectedRead.size(); i += 1) {
+      AvroTestHelpers.assertEquals(icebergSchema.asStruct(), expectedRead.get(i), rows.get(i));
+    }
+  }
+
+  @Test
+  public void testMapOfUnionValues() throws IOException {
+    // optional map whose values are a union of null and seven primitive types
+    String mapOfUnionJson = "{\n" +
+        "  \"name\": \"MapOfUnion\",\n" +
+        "  \"type\": \"record\",\n" +
+        "  \"fields\": [\n" +
+        "    {\n" +
+        "      \"name\": \"map\",\n" +
+        "      \"type\": [\n" +
+        "        \"null\",\n" +
+        "        {\n" +
+        "          \"type\": \"map\",\n" +
+        "          \"values\": [\n" +
+        "            \"null\",\n" +
+        "            \"boolean\",\n" +
+        "            \"int\",\n" +
+        "            \"long\",\n" +
+        "            \"float\",\n" +
+        "            \"double\",\n" +
+        "            \"bytes\",\n" +
+        "            \"string\"\n" +
+        "          ]\n" +
+        "        }\n" +
+        "      ]\n" +
+        "    }\n" +
+        "  ]\n" +
+        "}";
+    org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(mapOfUnionJson);
+    org.apache.iceberg.Schema icebergSchema = AvroSchemaUtil.toIceberg(avroSchema);
+
+    // convert back to Avro and pull out the record schema that replaced the union of map values
+    org.apache.avro.Schema convertedSchema = AvroSchemaUtil.convert(icebergSchema, "test");
+    org.apache.avro.Schema convertedMap = convertedSchema.getFields().get(0).schema().getTypes().get(1);
+    org.apache.avro.Schema unionRecordSchema = convertedMap.getValueType().getTypes().get(1);
+
+    List<GenericData.Record> written = new ArrayList<>();
+    List<GenericData.Record> expected = new ArrayList<>();
+
+    for (long i = 0; i < 10; i++) {
+      // each value is written as a plain datum and expected back wrapped in the union record
+      Map<String, Object> values = new HashMap<>();
+      Map<String, Object> readValues = new HashMap<>();
+      updateMapsForUnionSchema(unionRecordSchema, values, readValues, i);
+      GenericData.Record expectedRecord = new GenericRecordBuilder(avroSchema).set("map", readValues).build();
+      GenericData.Record writtenRecord = new GenericRecordBuilder(avroSchema).set("map", values).build();
+      written.add(writtenRecord);
+      expected.add(expectedRecord);
+    }
+    writeAndValidate(written, expected, icebergSchema);
+  }
+
+  /**
+   * Fills {@code map} with one raw value per union branch, keyed by the branch's type name, and
+   * {@code mapRead} with the same values wrapped in {@code unionRecordSchema} records.
+   */
+  private void updateMapsForUnionSchema(org.apache.avro.Schema unionRecordSchema, Map<String, Object> map,
+      Map<String, Object> mapRead, Long index) {
+    map.put("boolean", index % 2 == 0);
+    map.put("int", index.intValue());
+    map.put("long", index);
+    map.put("float", index.floatValue());
+    map.put("double", index.doubleValue());
+    // explicit charset so the bytes do not depend on the platform default
+    map.put("bytes", ByteBuffer.wrap(("bytes_" + index).getBytes(java.nio.charset.StandardCharsets.UTF_8)));
+    map.put("string", "string_" + index);
+
+    for (String key : map.keySet()) {
+      mapRead.put(key, getGenericRecordForUnionType(unionRecordSchema, map, key));
+    }
+  }
+
+  /**
+   * Wraps the value stored under {@code key} in a record of {@code unionRecordSchema}, setting only
+   * the member field that this method maps the key to.
+   * @throws IllegalStateException if {@code key} has no member field mapping
+   */
+  private GenericData.Record getGenericRecordForUnionType(
+      org.apache.avro.Schema unionRecordSchema,
+      Map<String, Object> map,
+      String key) {
+    GenericRecordBuilder builder = new GenericRecordBuilder(unionRecordSchema);
+    String memberField;
+    switch (key) {
+      case "boolean":
+        memberField = "member1";
+        break;
+      case "int":
+        memberField = "member2";
+        break;
+      case "long":
+        memberField = "member3";
+        break;
+      case "float":
+        memberField = "member4";
+        break;
+      case "double":
+        memberField = "member5";
+        break;
+      case "bytes":
+        memberField = "member6";
+        break;
+      case "string":
+        memberField = "member7";
+        break;
+      default:
+        throw new IllegalStateException("key mapping not found for " + key);
+    }
+    return builder.set(memberField, map.get(key)).build();
+  }
+}