diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/parquet/ParquetTypeUtils.java b/presto-hive/src/main/java/com/facebook/presto/hive/parquet/ParquetTypeUtils.java index 4c0f2ba4635c4..ddf6b037e40bf 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/parquet/ParquetTypeUtils.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/parquet/ParquetTypeUtils.java @@ -71,6 +71,13 @@ public static GroupColumnIO getMapKeyValueColumn(GroupColumnIO groupColumnIO) return groupColumnIO; } + /* For backward-compatibility, the type of elements in LIST-annotated structures should always be determined by the following rules: + * 1. If the repeated field is not a group, then its type is the element type and elements are required. + * 2. If the repeated field is a group with multiple fields, then its type is the element type and elements are required. + * 3. If the repeated field is a group with one field and is named either array or uses the LIST-annotated group's name with _tuple appended then the repeated type is the element type and elements are required. + * 4. Otherwise, the repeated field's type is the element type with the repeated field's repetition. + * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists + */ public static ColumnIO getArrayElementColumn(ColumnIO columnIO) { while (columnIO instanceof GroupColumnIO && !columnIO.getType().isRepetition(REPEATED)) { @@ -86,7 +93,9 @@ public static ColumnIO getArrayElementColumn(ColumnIO columnIO) */ if (columnIO instanceof GroupColumnIO && columnIO.getType().getOriginalType() == null && - ((GroupColumnIO) columnIO).getChildrenCount() == 1) { + ((GroupColumnIO) columnIO).getChildrenCount() == 1 && + !columnIO.getName().equals("array") && + !columnIO.getName().equals(columnIO.getParent().getName() + "_tuple")) { return ((GroupColumnIO) columnIO).getChild(0); } diff --git a/presto-hive/src/main/java/parquet/io/ColumnIOConverter.java b/presto-hive/src/main/java/parquet/io/ColumnIOConverter.java index 1a7e34fd0bfa0..5f8b758350751 100644 --- a/presto-hive/src/main/java/parquet/io/ColumnIOConverter.java +++ b/presto-hive/src/main/java/parquet/io/ColumnIOConverter.java @@ -54,7 +54,7 @@ public static Optional constructField(Type type, ColumnIO columnIO) if (ROW.equals(type.getTypeSignature().getBase())) { GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO; List parameters = type.getTypeParameters(); - ImmutableList.Builder> fileldsBuilder = ImmutableList.builder(); + ImmutableList.Builder> fieldsBuilder = ImmutableList.builder(); List fields = type.getTypeSignature().getParameters(); boolean structHasParameters = false; for (int i = 0; i < fields.size(); i++) { @@ -62,10 +62,10 @@ public static Optional constructField(Type type, ColumnIO columnIO) String name = namedTypeSignature.getName().get().toLowerCase(Locale.ENGLISH); Optional field = constructField(parameters.get(i), groupColumnIO.getChild(name)); structHasParameters |= field.isPresent(); - fileldsBuilder.add(field); + fieldsBuilder.add(field); } if (structHasParameters) { - return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, fileldsBuilder.build())); + return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, fieldsBuilder.build())); } return Optional.empty(); } @@ -73,7 +73,7 @@ else if (MAP.equals(type.getTypeSignature().getBase())) { GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO; MapType mapType = (MapType) type; GroupColumnIO keyValueColumnIO = getMapKeyValueColumn(groupColumnIO); - if (keyValueColumnIO.getChildrenCount() < 2) { + if (keyValueColumnIO.getChildrenCount() != 2) { return Optional.empty(); } Optional keyField = constructField(mapType.getKeyType(), keyValueColumnIO.getChild(0)); @@ -83,7 +83,7 @@ else if (MAP.equals(type.getTypeSignature().getBase())) { else if (ARRAY.equals(type.getTypeSignature().getBase())) { GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO; List types = type.getTypeParameters(); - if (groupColumnIO.getChildrenCount() == 0) { + if (groupColumnIO.getChildrenCount() != 1) { return Optional.empty(); } Optional field = constructField(types.get(0), getArrayElementColumn(groupColumnIO.getChild(0))); diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/parquet/AbstractTestParquetReader.java b/presto-hive/src/test/java/com/facebook/presto/hive/parquet/AbstractTestParquetReader.java index 909816901c55a..8039f475a3904 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/parquet/AbstractTestParquetReader.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/parquet/AbstractTestParquetReader.java @@ -206,7 +206,7 @@ public void testCustomSchemaArrayOfStucts() Iterable> values = createTestArrays(structs); List structFieldNames = asList("a", "b", "c"); Type structType = RowType.from(asList(field("a", BIGINT), field("b", BOOLEAN), field("c", VARCHAR))); - tester.testRoundTrip( + tester.testSingleLevelArrayRoundTrip( getStandardListObjectInspector(getStandardStructObjectInspector(structFieldNames, asList(javaLongObjectInspector, javaBooleanObjectInspector, javaStringObjectInspector))), values, values, new ArrayType(structType), Optional.of(customSchemaArrayOfStucts)); } @@ -379,6 +379,44 @@ public void testSingleLevelArrayOfMapOfStruct() values, values, new ArrayType(mapType(INTEGER, structType))); } + @Test + public void testSingleLevelArrayOfStructOfSingleElement() + throws Exception + { + Iterable structs = createTestStructs(transform(intsBetween(0, 31_234), Object::toString)); + Iterable> values = createTestArrays(structs); + List structFieldNames = singletonList("test"); + Type structType = RowType.from(singletonList(field("test", VARCHAR))); + tester.testRoundTrip( + getStandardListObjectInspector(getStandardStructObjectInspector(structFieldNames, singletonList(javaStringObjectInspector))), + values, values, new ArrayType(structType)); + tester.testSingleLevelArraySchemaRoundTrip( + getStandardListObjectInspector(getStandardStructObjectInspector(structFieldNames, singletonList(javaStringObjectInspector))), + values, values, new ArrayType(structType)); + } + + @Test + public void testSingleLevelArrayOfStructOfStructOfSingleElement() + throws Exception + { + Iterable structs = createTestStructs(transform(intsBetween(0, 31_234), Object::toString)); + Iterable structsOfStructs = createTestStructs(structs); + Iterable> values = createTestArrays(structsOfStructs); + List structFieldNames = singletonList("test"); + List structsOfStructsFieldNames = singletonList("test"); + Type structType = RowType.from(singletonList(field("test", VARCHAR))); + Type structsOfStructsType = RowType.from(singletonList(field("test", structType))); + ObjectInspector structObjectInspector = getStandardStructObjectInspector(structFieldNames, singletonList(javaStringObjectInspector)); + tester.testRoundTrip( + getStandardListObjectInspector( + getStandardStructObjectInspector(structsOfStructsFieldNames, singletonList(structObjectInspector))), + values, values, new ArrayType(structsOfStructsType)); + tester.testSingleLevelArraySchemaRoundTrip( + getStandardListObjectInspector( + getStandardStructObjectInspector(structsOfStructsFieldNames, singletonList(structObjectInspector))), + values, values, new ArrayType(structsOfStructsType)); + } + @Test public void testArrayOfMapOfArray() throws Exception @@ -1085,7 +1123,7 @@ public void testSchemaWithRequiredOptionalRequired2Fields() ObjectInspector eInspector = getStandardStructObjectInspector(singletonList("f"), singletonList(fInspector)); tester.testRoundTrip(asList(aInspector, eInspector), new Iterable[] {aValues, eValues}, new Iterable[] {aValues, eValues}, - asList("a", "e"), asList(aType, eType), Optional.of(parquetSchema)); + asList("a", "e"), asList(aType, eType), Optional.of(parquetSchema), false); } @Test @@ -1098,7 +1136,7 @@ public void testOldAvroArray() " }" + "} "); Iterable> nonNullArrayElements = createTestArrays(intsBetween(0, 31_234)); - tester.testRoundTrip(getStandardListObjectInspector(javaIntObjectInspector), nonNullArrayElements, nonNullArrayElements, new ArrayType(INTEGER), Optional.of(parquetMrAvroSchema)); + tester.testSingleLevelArrayRoundTrip(getStandardListObjectInspector(javaIntObjectInspector), nonNullArrayElements, nonNullArrayElements, new ArrayType(INTEGER), Optional.of(parquetMrAvroSchema)); } @Test diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/parquet/ParquetTester.java b/presto-hive/src/test/java/com/facebook/presto/hive/parquet/ParquetTester.java index 172a83fe91f32..e48457fa59bbb 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/parquet/ParquetTester.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/parquet/ParquetTester.java @@ -152,10 +152,10 @@ public void testSingleLevelArraySchemaRoundTrip(ObjectInspector objectInspector, { ArrayList typeInfos = TypeInfoUtils.getTypeInfosFromTypeString(objectInspector.getTypeName()); MessageType schema = SingleLevelArraySchemaConverter.convert(TEST_COLUMN, typeInfos); - testRoundTrip(objectInspector, writeValues, readValues, type, Optional.of(schema)); + testSingleLevelArrayRoundTrip(objectInspector, writeValues, readValues, type, Optional.of(schema)); if (objectInspector.getTypeName().contains("map<")) { schema = SingleLevelArrayMapKeyValuesSchemaConverter.convert(TEST_COLUMN, typeInfos); - testRoundTrip(objectInspector, writeValues, readValues, type, Optional.of(schema)); + testSingleLevelArrayRoundTrip(objectInspector, writeValues, readValues, type, Optional.of(schema)); } } @@ -163,7 +163,8 @@ public void testRoundTrip(ObjectInspector objectInspector, Iterable writeValu throws Exception { // just the values - testRoundTripType(singletonList(objectInspector), new Iterable[] {writeValues}, new Iterable[] {readValues}, TEST_COLUMN, singletonList(type), Optional.empty()); + testRoundTripType(singletonList(objectInspector), new Iterable[] {writeValues}, + new Iterable[] {readValues}, TEST_COLUMN, singletonList(type), Optional.empty(), false); // all nulls assertRoundTrip(singletonList(objectInspector), new Iterable[] {transform(writeValues, constant(null))}, @@ -173,7 +174,7 @@ public void testRoundTrip(ObjectInspector objectInspector, Iterable writeValu MessageType schema = MapKeyValuesSchemaConverter.convert(TEST_COLUMN, typeInfos); // just the values testRoundTripType(singletonList(objectInspector), new Iterable[] {writeValues}, new Iterable[] { - readValues}, TEST_COLUMN, singletonList(type), Optional.of(schema)); + readValues}, TEST_COLUMN, singletonList(type), Optional.of(schema), false); // all nulls assertRoundTrip(singletonList(objectInspector), new Iterable[] {transform(writeValues, constant(null))}, @@ -184,34 +185,40 @@ public void testRoundTrip(ObjectInspector objectInspector, Iterable writeValu public void testRoundTrip(ObjectInspector objectInspector, Iterable writeValues, Iterable readValues, Type type, Optional parquetSchema) throws Exception { - testRoundTrip(singletonList(objectInspector), new Iterable[] {writeValues}, new Iterable[] {readValues}, TEST_COLUMN, singletonList(type), parquetSchema); + testRoundTrip(singletonList(objectInspector), new Iterable[] {writeValues}, new Iterable[] {readValues}, TEST_COLUMN, singletonList(type), parquetSchema, false); } - public void testRoundTrip(List objectInspectors, Iterable[] writeValues, Iterable[] readValues, List columnNames, List columnTypes, Optional parquetSchema) + public void testSingleLevelArrayRoundTrip(ObjectInspector objectInspector, Iterable writeValues, Iterable readValues, Type type, Optional parquetSchema) + throws Exception + { + testRoundTrip(singletonList(objectInspector), new Iterable[] {writeValues}, new Iterable[] {readValues}, TEST_COLUMN, singletonList(type), parquetSchema, true); + } + + public void testRoundTrip(List objectInspectors, Iterable[] writeValues, Iterable[] readValues, List columnNames, List columnTypes, Optional parquetSchema, boolean singleLevelArray) throws Exception { // just the values - testRoundTripType(objectInspectors, writeValues, readValues, columnNames, columnTypes, parquetSchema); + testRoundTripType(objectInspectors, writeValues, readValues, columnNames, columnTypes, parquetSchema, singleLevelArray); // all nulls - assertRoundTrip(objectInspectors, transformToNulls(writeValues), transformToNulls(readValues), columnNames, columnTypes, parquetSchema); + assertRoundTrip(objectInspectors, transformToNulls(writeValues), transformToNulls(readValues), columnNames, columnTypes, parquetSchema, singleLevelArray); } private void testRoundTripType(List objectInspectors, Iterable[] writeValues, Iterable[] readValues, - List columnNames, List columnTypes, Optional parquetSchema) + List columnNames, List columnTypes, Optional parquetSchema, boolean singleLevelArray) throws Exception { // forward order - assertRoundTrip(objectInspectors, writeValues, readValues, columnNames, columnTypes, parquetSchema); + assertRoundTrip(objectInspectors, writeValues, readValues, columnNames, columnTypes, parquetSchema, singleLevelArray); // reverse order - assertRoundTrip(objectInspectors, reverse(writeValues), reverse(readValues), columnNames, columnTypes, parquetSchema); + assertRoundTrip(objectInspectors, reverse(writeValues), reverse(readValues), columnNames, columnTypes, parquetSchema, singleLevelArray); // forward order with nulls - assertRoundTrip(objectInspectors, insertNullEvery(5, writeValues), insertNullEvery(5, readValues), columnNames, columnTypes, parquetSchema); + assertRoundTrip(objectInspectors, insertNullEvery(5, writeValues), insertNullEvery(5, readValues), columnNames, columnTypes, parquetSchema, singleLevelArray); // reverse order with nulls - assertRoundTrip(objectInspectors, insertNullEvery(5, reverse(writeValues)), insertNullEvery(5, reverse(readValues)), columnNames, columnTypes, parquetSchema); + assertRoundTrip(objectInspectors, insertNullEvery(5, reverse(writeValues)), insertNullEvery(5, reverse(readValues)), columnNames, columnTypes, parquetSchema, singleLevelArray); } void assertRoundTrip(List objectInspectors, @@ -221,6 +228,18 @@ void assertRoundTrip(List objectInspectors, List columnTypes, Optional parquetSchema) throws Exception + { + assertRoundTrip(objectInspectors, writeValues, readValues, columnNames, columnTypes, parquetSchema, false); + } + + void assertRoundTrip(List objectInspectors, + Iterable[] writeValues, + Iterable[] readValues, + List columnNames, + List columnTypes, + Optional parquetSchema, + boolean singleLevelArray) + throws Exception { for (WriterVersion version : versions) { for (CompressionCodecName compressionCodecName : compressions) { @@ -236,7 +255,8 @@ void assertRoundTrip(List objectInspectors, createTableProperties(columnNames, objectInspectors), getStandardStructObjectInspector(columnNames, objectInspectors), getIterators(writeValues), - parquetSchema); + parquetSchema, + singleLevelArray); assertFileContents( tempFile.getFile(), getIterators(readValues), @@ -383,10 +403,11 @@ private static DataSize writeParquetColumn(JobConf jobConf, Properties tableProperties, SettableStructObjectInspector objectInspector, Iterator[] valuesByField, - Optional parquetSchema) + Optional parquetSchema, + boolean singleLevelArray) throws Exception { - RecordWriter recordWriter = new TestMapredParquetOutputFormat(parquetSchema) + RecordWriter recordWriter = new TestMapredParquetOutputFormat(parquetSchema, singleLevelArray) .getHiveRecordWriter( jobConf, new Path(outputFile.toURI()), diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/parquet/write/SingleLevelArraySchemaConverter.java b/presto-hive/src/test/java/com/facebook/presto/hive/parquet/write/SingleLevelArraySchemaConverter.java index 3f6514a1923ce..946520190eaa8 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/parquet/write/SingleLevelArraySchemaConverter.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/parquet/write/SingleLevelArraySchemaConverter.java @@ -162,7 +162,7 @@ else if (typeInfo.getCategory().equals(Category.UNION)) { private static GroupType convertArrayType(final String name, final ListTypeInfo typeInfo, final Repetition repetition) { final TypeInfo subType = typeInfo.getListElementTypeInfo(); - return listWrapper(name, OriginalType.LIST, convertType("array_element", subType, Repetition.REPEATED), repetition); + return listWrapper(name, OriginalType.LIST, convertType("array", subType, Repetition.REPEATED), repetition); } // An optional group containing multiple elements diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/parquet/write/TestDataWritableWriteSupport.java b/presto-hive/src/test/java/com/facebook/presto/hive/parquet/write/TestDataWritableWriteSupport.java index 0e605ca7eed0a..85b980009c3fb 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/parquet/write/TestDataWritableWriteSupport.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/parquet/write/TestDataWritableWriteSupport.java @@ -31,6 +31,12 @@ class TestDataWritableWriteSupport { private TestDataWritableWriter writer; private MessageType schema; + private boolean singleLevelArray; + + public TestDataWritableWriteSupport(boolean singleLevelArray) + { + this.singleLevelArray = singleLevelArray; + } @Override public WriteContext init(final Configuration configuration) @@ -42,7 +48,7 @@ public WriteContext init(final Configuration configuration) @Override public void prepareForWrite(final RecordConsumer recordConsumer) { - writer = new TestDataWritableWriter(recordConsumer, schema); + writer = new TestDataWritableWriter(recordConsumer, schema, singleLevelArray); } @Override diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/parquet/write/TestDataWritableWriter.java b/presto-hive/src/test/java/com/facebook/presto/hive/parquet/write/TestDataWritableWriter.java index cae5db6059548..c9ffd88f26de6 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/parquet/write/TestDataWritableWriter.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/parquet/write/TestDataWritableWriter.java @@ -54,7 +54,7 @@ /** * This class is copied from org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriter * and extended to support empty arrays and maps (HIVE-13632). - * Additionally, there is a support for arrays without include an inner element layer and + * Additionally, there is a support for arrays without an inner element layer and * support for maps where MAP_KEY_VALUE is incorrectly used in place of MAP * for backward-compatibility rules testing (https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists) */ @@ -63,11 +63,13 @@ public class TestDataWritableWriter private static final Logger log = Logger.get(DataWritableWriter.class); private final RecordConsumer recordConsumer; private final GroupType schema; + private final boolean singleLevelArray; - public TestDataWritableWriter(final RecordConsumer recordConsumer, final GroupType schema) + public TestDataWritableWriter(final RecordConsumer recordConsumer, final GroupType schema, boolean singleLevelArray) { this.recordConsumer = recordConsumer; this.schema = schema; + this.singleLevelArray = singleLevelArray; } /** @@ -139,7 +141,12 @@ private void writeValue(final Object value, final ObjectInspector inspector, fin if (originalType != null && originalType.equals(OriginalType.LIST)) { checkInspectorCategory(inspector, ObjectInspector.Category.LIST); - writeArray(value, (ListObjectInspector) inspector, groupType); + if (singleLevelArray) { + writeSingleLevelArray(value, (ListObjectInspector) inspector, groupType); + } + else { + writeArray(value, (ListObjectInspector) inspector, groupType); + } } else if (originalType != null && (originalType.equals(OriginalType.MAP) || originalType.equals(OriginalType.MAP_KEY_VALUE))) { checkInspectorCategory(inspector, ObjectInspector.Category.MAP); @@ -198,16 +205,8 @@ private void writeGroup(final Object value, final StructObjectInspector inspecto */ private void writeArray(final Object value, final ListObjectInspector inspector, final GroupType type) { - if (type.getType(0).isPrimitive()) { - writeSingleLevelArray(value, inspector, type); - return; - } // Get the internal array structure GroupType repeatedType = type.getType(0).asGroupType(); - if (repeatedType.getOriginalType() != null || repeatedType.getFieldCount() > 1) { - writeSingleLevelArray(value, inspector, type); - return; - } recordConsumer.startGroup(); List arrayValues = inspector.getList(value); diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/parquet/write/TestMapredParquetOutputFormat.java b/presto-hive/src/test/java/com/facebook/presto/hive/parquet/write/TestMapredParquetOutputFormat.java index db58de6bed07c..6757a702ee463 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/parquet/write/TestMapredParquetOutputFormat.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/parquet/write/TestMapredParquetOutputFormat.java @@ -41,9 +41,9 @@ public class TestMapredParquetOutputFormat { private final Optional schema; - public TestMapredParquetOutputFormat(Optional schema) + public TestMapredParquetOutputFormat(Optional schema, boolean singleLevelArray) { - super(new ParquetOutputFormat<>(new TestDataWritableWriteSupport())); + super(new ParquetOutputFormat<>(new TestDataWritableWriteSupport(singleLevelArray))); this.schema = requireNonNull(schema, "schema is null"); }