diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/RemoveIds.java b/parquet/src/main/java/org/apache/iceberg/parquet/RemoveIds.java index 6096f0f5b1dc..f1ff2cecf10d 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/RemoveIds.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/RemoveIds.java @@ -48,16 +48,19 @@ public Type struct(GroupType struct, List fields) { @Override public Type list(GroupType array, Type item) { - return Types.list(array.getRepetition()) - .element(item) + GroupType repeatedElement = array.getType(0).asGroupType(); + return Types.buildGroup(array.getRepetition()) + .as(array.getLogicalTypeAnnotation()) + .addField(repeatedElement.withNewFields(item)) .named(array.getName()); } @Override public Type map(GroupType map, Type key, Type value) { - return Types.map(map.getRepetition()) - .key(key) - .value(value) + GroupType repeatedKeyValue = map.getType(0).asGroupType(); + return Types.buildGroup(map.getRepetition()) + .as(map.getLogicalTypeAnnotation()) + .addField(repeatedKeyValue.withNewFields(key, value)) .named(map.getName()); } diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetTypeVisitor.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetTypeVisitor.java new file mode 100644 index 000000000000..c0a65125662a --- /dev/null +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetTypeVisitor.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.parquet; + +import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Types.DoubleType; +import org.apache.iceberg.types.Types.ListType; +import org.apache.iceberg.types.Types.MapType; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.types.Types.StringType; +import org.apache.iceberg.types.Types.StructType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; +import org.junit.Assert; +import org.junit.Test; + +public class TestParquetTypeVisitor { + MessageType mapSchema = Types.buildMessage() + .addField(Types.buildGroup(Type.Repetition.OPTIONAL) + .addField(Types.buildGroup(Type.Repetition.REPEATED) + .addField(Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.stringType()) + .id(2) + .named("key")) + .addField(Types.buildGroup(Type.Repetition.OPTIONAL) + .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(4).named("x")) + .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(5).named("y")) + .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(6).named("z")) + .id(3) + .named("value")) + .named("custom_key_value_name")) + .as(LogicalTypeAnnotation.mapType()) + .id(1) + .named("m")) + .named("table"); + + MessageType mapSchemaWithoutIds = Types.buildMessage() + .addField(Types.buildGroup(Type.Repetition.OPTIONAL) + .addField(Types.buildGroup(Type.Repetition.REPEATED) + .addField(Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.stringType()) + .named("key")) + .addField(Types.buildGroup(Type.Repetition.OPTIONAL) + .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).named("x")) + .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).named("y")) + .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).named("z")) + .named("value")) + .named("custom_key_value_name")) + .as(LogicalTypeAnnotation.mapType()) + .named("m")) + .named("table"); + + MessageType listSchema = Types.buildMessage() + .addField(Types.buildGroup(Type.Repetition.OPTIONAL) + .addField(Types.buildGroup(Type.Repetition.REPEATED) + .addField(Types.buildGroup(Type.Repetition.OPTIONAL) + .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(4).named("x")) + .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(5).named("y")) + .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(6).named("z")) + .id(3) + .named("custom_element_name")) + .named("custom_repeated_name")) + .as(LogicalTypeAnnotation.listType()) + .id(1) + .named("m")) + .named("table"); + + MessageType listSchemaWithoutIds = Types.buildMessage() + .addField(Types.buildGroup(Type.Repetition.OPTIONAL) + .addField(Types.buildGroup(Type.Repetition.REPEATED) + .addField(Types.buildGroup(Type.Repetition.OPTIONAL) + .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).named("x")) + .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).named("y")) + .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).named("z")) + .named("custom_element_name")) + .named("custom_repeated_name")) + .as(LogicalTypeAnnotation.listType()) + .named("m")) + .named("table"); + + @Test + public void testRemoveIdsMap() { + MessageType actual = RemoveIds.removeIds(mapSchema); + Assert.assertEquals("RemoveIds should not rename repeated struct", mapSchemaWithoutIds, actual); + } + + @Test + public void testRemoveIdsList() { + MessageType actual = RemoveIds.removeIds(listSchema); + Assert.assertEquals("RemoveIds should not rename repeated struct", listSchemaWithoutIds, actual); + } + + @Test + public void testPruneColumnsMap() { + // project map.value.x and map.value.y + Schema projection = new Schema( + NestedField.optional(1, "m", MapType.ofOptional(2, 3, + StringType.get(), + StructType.of( + NestedField.required(4, "x", DoubleType.get()), + NestedField.required(5, "y", DoubleType.get()) + ) + )) + ); + + MessageType expected = Types.buildMessage() + .addField(Types.buildGroup(Type.Repetition.OPTIONAL) + .addField(Types.buildGroup(Type.Repetition.REPEATED) + .addField(Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.stringType()) + .id(2) + .named("key")) + .addField(Types.buildGroup(Type.Repetition.OPTIONAL) + .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(4).named("x")) + .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(5).named("y")) + .id(3) + .named("value")) + .named("custom_key_value_name")) + .as(LogicalTypeAnnotation.mapType()) + .id(1) + .named("m")) + .named("table"); + + MessageType actual = ParquetSchemaUtil.pruneColumns(mapSchema, projection); + Assert.assertEquals("Pruned schema should not rename repeated struct", expected, actual); + } + + @Test + public void testPruneColumnsList() { + // project map.value.x and map.value.y + Schema projection = new Schema( + NestedField.optional(1, "m", ListType.ofOptional(3, + StructType.of( + NestedField.required(4, "x", DoubleType.get()), + NestedField.required(5, "y", DoubleType.get()) + ) + )) + ); + + MessageType expected = Types.buildMessage() + .addField(Types.buildGroup(Type.Repetition.OPTIONAL) + .addField(Types.buildGroup(Type.Repetition.REPEATED) + .addField(Types.buildGroup(Type.Repetition.OPTIONAL) + .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(4).named("x")) + .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(5).named("y")) + .id(3) + .named("custom_element_name")) + .named("custom_repeated_name")) + .as(LogicalTypeAnnotation.listType()) + .id(1) + .named("m")) + .named("table"); + + MessageType actual = ParquetSchemaUtil.pruneColumns(listSchema, projection); + Assert.assertEquals("Pruned schema should not rename repeated struct", expected, actual); + } +} diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java deleted file mode 100644 index dfa7e64a4758..000000000000 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.iceberg.parquet; - -import org.apache.iceberg.Schema; -import org.apache.iceberg.types.Types.DoubleType; -import org.apache.iceberg.types.Types.ListType; -import org.apache.iceberg.types.Types.MapType; -import org.apache.iceberg.types.Types.NestedField; -import org.apache.iceberg.types.Types.StringType; -import org.apache.iceberg.types.Types.StructType; -import org.apache.parquet.schema.LogicalTypeAnnotation; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; -import org.apache.parquet.schema.Type; -import org.apache.parquet.schema.Types; -import org.junit.Assert; -import org.junit.Test; - -public class TestPruneColumns { - @Test - public void testMapKeyValueName() { - MessageType fileSchema = Types.buildMessage() - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.buildGroup(Type.Repetition.REPEATED) - .addField(Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) - .as(LogicalTypeAnnotation.stringType()) - .id(2) - .named("key")) - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(4).named("x")) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(5).named("y")) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(6).named("z")) - .id(3) - .named("value")) - .named("custom_key_value_name")) - .as(LogicalTypeAnnotation.mapType()) - .id(1) - .named("m")) - .named("table"); - - // project map.value.x and map.value.y - Schema projection = new Schema( - NestedField.optional(1, "m", MapType.ofOptional(2, 3, - StringType.get(), - StructType.of( - NestedField.required(4, "x", DoubleType.get()), - NestedField.required(5, "y", DoubleType.get()) - ) - )) - ); - - MessageType expected = Types.buildMessage() - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.buildGroup(Type.Repetition.REPEATED) - .addField(Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) - .as(LogicalTypeAnnotation.stringType()) - .id(2) - .named("key")) - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(4).named("x")) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(5).named("y")) - .id(3) - .named("value")) - .named("custom_key_value_name")) - .as(LogicalTypeAnnotation.mapType()) - .id(1) - .named("m")) - .named("table"); - - MessageType actual = ParquetSchemaUtil.pruneColumns(fileSchema, projection); - Assert.assertEquals("Pruned schema should not rename repeated struct", expected, actual); - } - - @Test - public void testListElementName() { - MessageType fileSchema = Types.buildMessage() - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.buildGroup(Type.Repetition.REPEATED) - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(4).named("x")) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(5).named("y")) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(6).named("z")) - .id(3) - .named("custom_element_name")) - .named("custom_repeated_name")) - .as(LogicalTypeAnnotation.listType()) - .id(1) - .named("m")) - .named("table"); - - // project map.value.x and map.value.y - Schema projection = new Schema( - NestedField.optional(1, "m", ListType.ofOptional(3, - StructType.of( - NestedField.required(4, "x", DoubleType.get()), - NestedField.required(5, "y", DoubleType.get()) - ) - )) - ); - - MessageType expected = Types.buildMessage() - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.buildGroup(Type.Repetition.REPEATED) - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(4).named("x")) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(5).named("y")) - .id(3) - .named("custom_element_name")) - .named("custom_repeated_name")) - .as(LogicalTypeAnnotation.listType()) - .id(1) - .named("m")) - .named("table"); - - MessageType actual = ParquetSchemaUtil.pruneColumns(fileSchema, projection); - Assert.assertEquals("Pruned schema should not rename repeated struct", expected, actual); - } -}