Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions core/src/main/java/org/apache/iceberg/avro/AvroSchemaUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ public class AvroSchemaUtil {

private AvroSchemaUtil() {}

// Original Iceberg field name corresponding to a sanitized Avro name
public static final String ICEBERG_FIELD_NAME_PROP = "iceberg-field-name";
public static final String FIELD_ID_PROP = "field-id";
public static final String KEY_ID_PROP = "key-id";
public static final String VALUE_ID_PROP = "value-id";
Expand Down Expand Up @@ -274,4 +276,49 @@ static Schema.Field copyField(Schema.Field field, Schema newSchema, String newNa

return copy;
}

static boolean validAvroName(String name) {
int length = name.length();
Preconditions.checkArgument(length > 0, "Empty name");
char first = name.charAt(0);
if (!(Character.isLetter(first) || first == '_')) {
return false;
}

for (int i = 1; i < length; i++) {
char character = name.charAt(i);
if (!(Character.isLetterOrDigit(character) || character == '_')) {
return false;
}
}
return true;
}

static String sanitize(String name) {
int length = name.length();
StringBuilder sb = new StringBuilder(name.length());
char first = name.charAt(0);
if (!(Character.isLetter(first) || first == '_')) {
sb.append(sanitize(first));
} else {
sb.append(first);
}

for (int i = 1; i < length; i++) {
char character = name.charAt(i);
if (!(Character.isLetterOrDigit(character) || character == '_')) {
sb.append(sanitize(character));
} else {
sb.append(character);
}
}
return sb.toString();
}

private static String sanitize(char character) {
if (Character.isDigit(character)) {
return "_" + character;
}
return "_x" + Integer.toHexString(character).toUpperCase();
}
}
8 changes: 7 additions & 1 deletion core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,15 @@ public Schema struct(Types.StructType struct, List<Schema> fieldSchemas) {
List<Schema.Field> fields = Lists.newArrayListWithExpectedSize(fieldSchemas.size());
for (int i = 0; i < structFields.size(); i += 1) {
Types.NestedField structField = structFields.get(i);
String origFieldName = structField.name();
boolean isValidFieldName = AvroSchemaUtil.validAvroName(origFieldName);
String fieldName = isValidFieldName ? origFieldName : AvroSchemaUtil.sanitize(origFieldName);
Schema.Field field = new Schema.Field(
structField.name(), fieldSchemas.get(i), null,
fieldName, fieldSchemas.get(i), null,
structField.isOptional() ? JsonProperties.NULL_VALUE : null);
if (!isValidFieldName) {
field.addProp(AvroSchemaUtil.ICEBERG_FIELD_NAME_PROP, origFieldName);
}
field.addProp(AvroSchemaUtil.FIELD_ID_PROP, structField.fieldId());
fields.add(field);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

package org.apache.iceberg.avro;

import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import java.util.List;
import org.apache.avro.LogicalTypes;
Expand Down Expand Up @@ -277,4 +278,25 @@ public void testComplexSchema() {
AvroSchemaUtil.convert(schema, "newTableName").toString(true);
}

@Test
public void testSpecialChars() {
List<String> names = Lists.newArrayList("9x", "x_", "a.b", "☃", "a#b");
org.apache.iceberg.Schema schema = new org.apache.iceberg.Schema(
required(1, names.get(0), Types.IntegerType.get()),
required(2, names.get(1), Types.StringType.get()),
required(3, names.get(2), Types.IntegerType.get()),
required(4, names.get(3), Types.IntegerType.get()),
required(5, names.get(4), Types.IntegerType.get()));

Schema avroSchema = AvroSchemaUtil.convert(schema.asStruct());
List<String> sanitizedNames = Lists.newArrayList(Iterables.transform(avroSchema.getFields(), Schema.Field::name));
List<String> expectedSanitizedNames = Lists.newArrayList("_9x", "x_", "a_x2Eb", "_x2603", "a_x23b");
Assert.assertEquals(expectedSanitizedNames, sanitizedNames);

List<String> origNames = Lists.newArrayList(
Iterables.transform(avroSchema.getFields(), f -> f.getProp(AvroSchemaUtil.ICEBERG_FIELD_NAME_PROP)));
List<String> expectedOrigNames = Lists.newArrayList(names);
expectedOrigNames.set(1, null); // Name at pos 1 is valid so ICEBERG_FIELD_NAME_PROP is not set
Assert.assertEquals(expectedOrigNames, origNames);
}
}