diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroSchemaUtil.java b/core/src/main/java/org/apache/iceberg/avro/AvroSchemaUtil.java index 15caea5d1a13..e9cc5ca4a073 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroSchemaUtil.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroSchemaUtil.java @@ -37,6 +37,8 @@ public class AvroSchemaUtil { private AvroSchemaUtil() {} + // Original Iceberg field name corresponding to a sanitized Avro name + public static final String ICEBERG_FIELD_NAME_PROP = "iceberg-field-name"; public static final String FIELD_ID_PROP = "field-id"; public static final String KEY_ID_PROP = "key-id"; public static final String VALUE_ID_PROP = "value-id"; @@ -274,4 +276,49 @@ static Schema.Field copyField(Schema.Field field, Schema newSchema, String newNa return copy; } + + static boolean validAvroName(String name) { + int length = name.length(); + Preconditions.checkArgument(length > 0, "Empty name"); + char first = name.charAt(0); + if (!(Character.isLetter(first) || first == '_')) { + return false; + } + + for (int i = 1; i < length; i++) { + char character = name.charAt(i); + if (!(Character.isLetterOrDigit(character) || character == '_')) { + return false; + } + } + return true; + } + + static String sanitize(String name) { + int length = name.length(); + StringBuilder sb = new StringBuilder(name.length()); + char first = name.charAt(0); + if (!(Character.isLetter(first) || first == '_')) { + sb.append(sanitize(first)); + } else { + sb.append(first); + } + + for (int i = 1; i < length; i++) { + char character = name.charAt(i); + if (!(Character.isLetterOrDigit(character) || character == '_')) { + sb.append(sanitize(character)); + } else { + sb.append(character); + } + } + return sb.toString(); + } + + private static String sanitize(char character) { + if (Character.isDigit(character)) { + return "_" + character; + } + return "_x" + Integer.toHexString(character).toUpperCase(); + } } diff --git a/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java b/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java index 50d041c9b2af..7a79c2a07f18 100644 --- a/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java +++ b/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java @@ -86,9 +86,15 @@ public Schema struct(Types.StructType struct, List fieldSchemas) { List fields = Lists.newArrayListWithExpectedSize(fieldSchemas.size()); for (int i = 0; i < structFields.size(); i += 1) { Types.NestedField structField = structFields.get(i); + String origFieldName = structField.name(); + boolean isValidFieldName = AvroSchemaUtil.validAvroName(origFieldName); + String fieldName = isValidFieldName ? origFieldName : AvroSchemaUtil.sanitize(origFieldName); Schema.Field field = new Schema.Field( - structField.name(), fieldSchemas.get(i), null, + fieldName, fieldSchemas.get(i), null, structField.isOptional() ? JsonProperties.NULL_VALUE : null); + if (!isValidFieldName) { + field.addProp(AvroSchemaUtil.ICEBERG_FIELD_NAME_PROP, origFieldName); + } field.addProp(AvroSchemaUtil.FIELD_ID_PROP, structField.fieldId()); fields.add(field); } diff --git a/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java b/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java index f027f3c07c95..88320dbff45d 100644 --- a/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java +++ b/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java @@ -19,6 +19,7 @@ package org.apache.iceberg.avro; +import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import java.util.List; import org.apache.avro.LogicalTypes; @@ -277,4 +278,25 @@ public void testComplexSchema() { AvroSchemaUtil.convert(schema, "newTableName").toString(true); } + @Test + public void testSpecialChars() { + List names = Lists.newArrayList("9x", "x_", "a.b", "☃", "a#b"); + org.apache.iceberg.Schema schema = new org.apache.iceberg.Schema( + required(1, names.get(0), Types.IntegerType.get()), + required(2, names.get(1), Types.StringType.get()), + required(3, names.get(2), Types.IntegerType.get()), + required(4, names.get(3), Types.IntegerType.get()), + required(5, names.get(4), Types.IntegerType.get())); + + Schema avroSchema = AvroSchemaUtil.convert(schema.asStruct()); + List sanitizedNames = Lists.newArrayList(Iterables.transform(avroSchema.getFields(), Schema.Field::name)); + List expectedSanitizedNames = Lists.newArrayList("_9x", "x_", "a_x2Eb", "_x2603", "a_x23b"); + Assert.assertEquals(expectedSanitizedNames, sanitizedNames); + + List origNames = Lists.newArrayList( + Iterables.transform(avroSchema.getFields(), f -> f.getProp(AvroSchemaUtil.ICEBERG_FIELD_NAME_PROP))); + List expectedOrigNames = Lists.newArrayList(names); + expectedOrigNames.set(1, null); // Name at pos 1 is valid so ICEBERG_FIELD_NAME_PROP is not set + Assert.assertEquals(expectedOrigNames, origNames); + } }