diff --git a/api/src/main/java/org/apache/iceberg/types/PruneColumns.java b/api/src/main/java/org/apache/iceberg/types/PruneColumns.java index f58670365ea4..2944ec7bb5c0 100644 --- a/api/src/main/java/org/apache/iceberg/types/PruneColumns.java +++ b/api/src/main/java/org/apache/iceberg/types/PruneColumns.java @@ -24,13 +24,26 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types.ListType; +import org.apache.iceberg.types.Types.MapType; +import org.apache.iceberg.types.Types.StructType; class PruneColumns extends TypeUtil.SchemaVisitor { private final Set selected; + private final boolean selectFullTypes; - PruneColumns(Set selected) { + /** + * Visits a schema and returns only the fields selected by the id set. + *

+ * When selectFullTypes is false selecting list or map types is undefined and forbidden. + * + * @param selected ids of elements to return + * @param selectFullTypes whether to select all subfields of a selected nested type + */ + PruneColumns(Set selected, boolean selectFullTypes) { Preconditions.checkNotNull(selected, "Selected field ids cannot be null"); this.selected = selected; + this.selectFullTypes = selectFullTypes; } @Override @@ -77,10 +90,19 @@ public Type struct(Types.StructType struct, List fieldResults) { @Override public Type field(Types.NestedField field, Type fieldResult) { if (selected.contains(field.fieldId())) { - return field.type(); + if (selectFullTypes) { + return field.type(); + } else if (field.type().isStructType()) { + return projectSelectedStruct(fieldResult); + } else { + Preconditions.checkArgument(!field.type().isNestedType(), + "Cannot explicitly project List or Map types, %s:%s of type %s was selected", + field.fieldId(), field.name(), field.type()); + // Selected non-struct field + return field.type(); + } } else if (fieldResult != null) { - // this isn't necessarily the same as field.type() because a struct may not have all - // fields selected. 
+ // This field wasn't selected but a subfield was so include that return fieldResult; } return null; @@ -89,15 +111,19 @@ public Type field(Types.NestedField field, Type fieldResult) { @Override public Type list(Types.ListType list, Type elementResult) { if (selected.contains(list.elementId())) { - return list; - } else if (elementResult != null) { - if (list.elementType() == elementResult) { + if (selectFullTypes) { return list; - } else if (list.isElementOptional()) { - return Types.ListType.ofOptional(list.elementId(), elementResult); + } else if (list.elementType().isStructType()) { + StructType projectedStruct = projectSelectedStruct(elementResult); + return projectList(list, projectedStruct); } else { - return Types.ListType.ofRequired(list.elementId(), elementResult); + Preconditions.checkArgument(list.elementType().isPrimitiveType(), + "Cannot explicitly project List or Map types, List element %s of type %s was selected", + list.elementId(), list.elementType()); + return list; } + } else if (elementResult != null) { + return projectList(list, elementResult); } return null; } @@ -105,15 +131,19 @@ public Type list(Types.ListType list, Type elementResult) { @Override public Type map(Types.MapType map, Type ignored, Type valueResult) { if (selected.contains(map.valueId())) { - return map; - } else if (valueResult != null) { - if (map.valueType() == valueResult) { + if (selectFullTypes) { return map; - } else if (map.isValueOptional()) { - return Types.MapType.ofOptional(map.keyId(), map.valueId(), map.keyType(), valueResult); + } else if (map.valueType().isStructType()) { + Type projectedStruct = projectSelectedStruct(valueResult); + return projectMap(map, projectedStruct); } else { - return Types.MapType.ofRequired(map.keyId(), map.valueId(), map.keyType(), valueResult); + Preconditions.checkArgument(map.valueType().isPrimitiveType(), + "Cannot explicitly project List or Map types, Map value %s of type %s was selected", + map.valueId(), map.valueType()); + 
return map; } + } else if (valueResult != null) { + return projectMap(map, valueResult); } else if (selected.contains(map.keyId())) { // right now, maps can't be selected without values return map; @@ -125,4 +155,44 @@ public Type map(Types.MapType map, Type ignored, Type valueResult) { public Type primitive(Type.PrimitiveType primitive) { return null; } + + private ListType projectList(ListType list, Type elementResult) { + Preconditions.checkArgument(elementResult != null, "Cannot project a list when the element result is null"); + if (list.elementType() == elementResult) { + return list; + } else if (list.isElementOptional()) { + return Types.ListType.ofOptional(list.elementId(), elementResult); + } else { + return Types.ListType.ofRequired(list.elementId(), elementResult); + } + } + + private MapType projectMap(MapType map, Type valueResult) { + Preconditions.checkArgument(valueResult != null, "Attempted to project a map without a defined map value type"); + if (map.valueType() == valueResult) { + return map; + } else if (map.isValueOptional()) { + return Types.MapType.ofOptional(map.keyId(), map.valueId(), map.keyType(), valueResult); + } else { + return Types.MapType.ofRequired(map.keyId(), map.valueId(), map.keyType(), valueResult); + } + } + + /** + * If select full types is disabled we need to recreate the struct with only the selected + * subfields. If no subfields are selected we return an empty struct. 
+ * @param projectedField subfields already selected in this projection + * @return projected struct + */ + private StructType projectSelectedStruct(Type projectedField) { + Preconditions.checkArgument(projectedField == null || projectedField.isStructType()); + // the struct was selected, ensure at least an empty struct is returned + if (projectedField == null) { + // no sub-fields were selected but the struct was, return an empty struct + return Types.StructType.of(); + } else { + // sub-fields were selected so return the projected struct + return projectedField.asStructType(); + } + } } diff --git a/api/src/main/java/org/apache/iceberg/types/TypeUtil.java b/api/src/main/java/org/apache/iceberg/types/TypeUtil.java index 5185038f66fb..e93b10afa818 100644 --- a/api/src/main/java/org/apache/iceberg/types/TypeUtil.java +++ b/api/src/main/java/org/apache/iceberg/types/TypeUtil.java @@ -19,6 +19,7 @@ package org.apache.iceberg.types; +import java.util.Collections; import java.util.List; import java.util.Locale; import java.util.Map; @@ -42,6 +43,46 @@ public class TypeUtil { private TypeUtil() { } + /** + * Project extracts particular fields from a schema by ID. + *

+ * Unlike {@link TypeUtil#select(Schema, Set)}, project will pick out only the fields enumerated. Structs that are + * explicitly projected are empty unless sub-fields are explicitly projected. Maps and lists cannot be explicitly + * selected in fieldIds. + * @param schema to project fields from + * @param fieldIds list of explicit fields to extract + * @return the schema with all fields not selected removed + */ + public static Schema project(Schema schema, Set fieldIds) { + Preconditions.checkNotNull(schema, "Schema cannot be null"); + + Types.StructType result = project(schema.asStruct(), fieldIds); + if (schema.asStruct().equals(result)) { + return schema; + } else if (result != null) { + if (schema.getAliases() != null) { + return new Schema(result.fields(), schema.getAliases()); + } else { + return new Schema(result.fields()); + } + } + return new Schema(Collections.emptyList(), schema.getAliases()); + } + + public static Types.StructType project(Types.StructType struct, Set fieldIds) { + Preconditions.checkNotNull(struct, "Struct cannot be null"); + Preconditions.checkNotNull(fieldIds, "Field ids cannot be null"); + + Type result = visit(struct, new PruneColumns(fieldIds, false)); + if (struct.equals(result)) { + return struct; + } else if (result != null) { + return result.asStructType(); + } + + return Types.StructType.of(); + } + public static Schema select(Schema schema, Set fieldIds) { Preconditions.checkNotNull(schema, "Schema cannot be null"); @@ -63,8 +104,8 @@ public static Types.StructType select(Types.StructType struct, Set fiel Preconditions.checkNotNull(struct, "Struct cannot be null"); Preconditions.checkNotNull(fieldIds, "Field ids cannot be null"); - Type result = visit(struct, new PruneColumns(fieldIds)); - if (struct == result) { + Type result = visit(struct, new PruneColumns(fieldIds, true)); + if (struct.equals(result)) { return struct; } else if (result != null) { return result.asStructType(); diff --git 
a/api/src/test/java/org/apache/iceberg/types/TestTypeUtil.java b/api/src/test/java/org/apache/iceberg/types/TestTypeUtil.java index c11c859edacf..f9d9ef13e565 100644 --- a/api/src/test/java/org/apache/iceberg/types/TestTypeUtil.java +++ b/api/src/test/java/org/apache/iceberg/types/TestTypeUtil.java @@ -20,12 +20,15 @@ package org.apache.iceberg.types; +import org.apache.iceberg.AssertHelpers; import org.apache.iceberg.Schema; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.Types.IntegerType; import org.junit.Assert; import org.junit.Test; +import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; @@ -103,6 +106,350 @@ public void testAssignIncreasingFreshIdNewIdentifier() { Sets.newHashSet(sourceSchema.findField("a").fieldId()), actualSchema.identifierFieldIds()); } + @Test + public void testProject() { + Schema schema = new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required(11, "A", Types.IntegerType.get()), + required(12, "someStruct", Types.StructType.of( + required(13, "b", Types.IntegerType.get()), + required(14, "B", Types.IntegerType.get()), + required(15, "anotherStruct", Types.StructType.of( + required(16, "c", Types.IntegerType.get()), + required(17, "C", Types.IntegerType.get())) + ))))); + + Schema expectedTop = new Schema( + Lists.newArrayList( + required(11, "A", Types.IntegerType.get()))); + + Schema actualTop = TypeUtil.project(schema, Sets.newHashSet(11)); + Assert.assertEquals(expectedTop.asStruct(), actualTop.asStruct()); + + Schema expectedDepthOne = new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required(12, "someStruct", Types.StructType.of( + required(13, "b", Types.IntegerType.get()))))); + + Schema actualDepthOne = TypeUtil.project(schema, Sets.newHashSet(10, 12, 13)); + 
Assert.assertEquals(expectedDepthOne.asStruct(), actualDepthOne.asStruct()); + + Schema expectedDepthTwo = new Schema( + Lists.newArrayList( + required(11, "A", Types.IntegerType.get()), + required(12, "someStruct", Types.StructType.of( + required(15, "anotherStruct", Types.StructType.of( + required(17, "C", Types.IntegerType.get())) + ))))); + + Schema actualDepthTwo = TypeUtil.project(schema, Sets.newHashSet(11, 12, 15, 17)); + Schema actualDepthTwoChildren = TypeUtil.project(schema, Sets.newHashSet(11, 17)); + Assert.assertEquals(expectedDepthTwo.asStruct(), actualDepthTwo.asStruct()); + Assert.assertEquals(expectedDepthTwo.asStruct(), actualDepthTwoChildren.asStruct()); + } + + @Test + public void testProjectNaturallyEmpty() { + Schema schema = new Schema( + Lists.newArrayList( + required(12, "someStruct", Types.StructType.of( + required(15, "anotherStruct", Types.StructType.of( + required(20, "empty", Types.StructType.of()) + )))))); + + Schema expectedDepthOne = new Schema( + Lists.newArrayList( + required(12, "someStruct", Types.StructType.of()))); + + Schema actualDepthOne = TypeUtil.project(schema, Sets.newHashSet(12)); + Assert.assertEquals(expectedDepthOne.asStruct(), actualDepthOne.asStruct()); + + Schema expectedDepthTwo = new Schema( + Lists.newArrayList( + required(12, "someStruct", Types.StructType.of( + required(15, "anotherStruct", Types.StructType.of()))))); + + Schema actualDepthTwo = TypeUtil.project(schema, Sets.newHashSet(12, 15)); + Assert.assertEquals(expectedDepthTwo.asStruct(), actualDepthTwo.asStruct()); + + Schema expectedDepthThree = new Schema( + Lists.newArrayList( + required(12, "someStruct", Types.StructType.of( + required(15, "anotherStruct", Types.StructType.of( + required(20, "empty", Types.StructType.of()) + )))))); + + Schema actualDepthThree = TypeUtil.project(schema, Sets.newHashSet(12, 15, 20)); + Schema actualDepthThreeChildren = TypeUtil.project(schema, Sets.newHashSet(20)); + 
Assert.assertEquals(expectedDepthThree.asStruct(), actualDepthThree.asStruct()); + Assert.assertEquals(expectedDepthThree.asStruct(), actualDepthThreeChildren.asStruct()); + } + + @Test + public void testProjectEmpty() { + Schema schema = new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required(11, "A", Types.IntegerType.get()), + required(12, "someStruct", Types.StructType.of( + required(13, "b", Types.IntegerType.get()), + required(14, "B", Types.IntegerType.get()), + required(15, "anotherStruct", Types.StructType.of( + required(16, "c", Types.IntegerType.get()), + required(17, "C", Types.IntegerType.get())) + ))))); + + Schema expectedDepthOne = new Schema( + Lists.newArrayList( + required(12, "someStruct", Types.StructType.of()))); + + Schema actualDepthOne = TypeUtil.project(schema, Sets.newHashSet(12)); + Assert.assertEquals(expectedDepthOne.asStruct(), actualDepthOne.asStruct()); + + Schema expectedDepthTwo = new Schema( + Lists.newArrayList( + required(12, "someStruct", Types.StructType.of( + required(15, "anotherStruct", Types.StructType.of()))))); + + Schema actualDepthTwo = TypeUtil.project(schema, Sets.newHashSet(12, 15)); + Assert.assertEquals(expectedDepthTwo.asStruct(), actualDepthTwo.asStruct()); + } + + @Test + public void testSelect() { + Schema schema = new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required(11, "A", Types.IntegerType.get()), + required(12, "someStruct", Types.StructType.of( + required(13, "b", Types.IntegerType.get()), + required(14, "B", Types.IntegerType.get()), + required(15, "anotherStruct", Types.StructType.of( + required(16, "c", Types.IntegerType.get()), + required(17, "C", Types.IntegerType.get())) + ))))); + + Schema expectedTop = new Schema( + Lists.newArrayList( + required(11, "A", Types.IntegerType.get()))); + + Schema actualTop = TypeUtil.select(schema, Sets.newHashSet(11)); + Assert.assertEquals(expectedTop.asStruct(), actualTop.asStruct()); + + 
Schema expectedDepthOne = new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required(12, "someStruct", Types.StructType.of( + required(13, "b", Types.IntegerType.get()), + required(14, "B", Types.IntegerType.get()), + required(15, "anotherStruct", Types.StructType.of( + required(16, "c", Types.IntegerType.get()), + required(17, "C", Types.IntegerType.get()))))))); + + Schema actualDepthOne = TypeUtil.select(schema, Sets.newHashSet(10, 12)); + Assert.assertEquals(expectedDepthOne.asStruct(), actualDepthOne.asStruct()); + + Schema expectedDepthTwo = new Schema( + Lists.newArrayList( + required(11, "A", Types.IntegerType.get()), + required(12, "someStruct", Types.StructType.of( + required(15, "anotherStruct", Types.StructType.of( + required(17, "C", Types.IntegerType.get())) + ))))); + + Schema actualDepthTwo = TypeUtil.select(schema, Sets.newHashSet(11, 17)); + Assert.assertEquals(expectedDepthTwo.asStruct(), actualDepthTwo.asStruct()); + } + + @Test + public void testProjectMap() { + // We can't partially project keys because it changes key equality + Schema schema = new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required(11, "A", Types.IntegerType.get()), + required(12, "map", Types.MapType.ofRequired(13, 14, + Types.StructType.of( + optional(100, "x", Types.IntegerType.get()), + optional(101, "y", Types.IntegerType.get())), + Types.StructType.of( + required(200, "z", Types.IntegerType.get()), + optional(201, "innerMap", Types.MapType.ofOptional(202, 203, + Types.IntegerType.get(), + Types.StructType.of( + required(300, "foo", Types.IntegerType.get()), + required(301, "bar", Types.IntegerType.get()))))))))); + + Assert.assertThrows("Cannot project maps explicitly", IllegalArgumentException.class, + () -> TypeUtil.project(schema, Sets.newHashSet(12))); + + Assert.assertThrows("Cannot project maps explicitly", IllegalArgumentException.class, + () -> TypeUtil.project(schema, Sets.newHashSet(201))); + + 
Schema expectedTopLevel = new Schema( + Lists.newArrayList(required(10, "a", Types.IntegerType.get()))); + Schema actualTopLevel = TypeUtil.project(schema, Sets.newHashSet(10)); + Assert.assertEquals(expectedTopLevel.asStruct(), actualTopLevel.asStruct()); + + Schema expectedDepthOne = new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required(12, "map", Types.MapType.ofRequired(13, 14, + Types.StructType.of( + optional(100, "x", Types.IntegerType.get()), + optional(101, "y", Types.IntegerType.get())), + Types.StructType.of())))); + Schema actualDepthOne = TypeUtil.project(schema, Sets.newHashSet(10, 13, 14, 100, 101)); + Schema actualDepthOneNoKeys = TypeUtil.project(schema, Sets.newHashSet(10, 13, 14)); + Assert.assertEquals(expectedDepthOne.asStruct(), actualDepthOne.asStruct()); + Assert.assertEquals(expectedDepthOne.asStruct(), actualDepthOneNoKeys.asStruct()); + + Schema expectedDepthTwo = new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required(12, "map", Types.MapType.ofRequired(13, 14, + Types.StructType.of( + optional(100, "x", Types.IntegerType.get()), + optional(101, "y", Types.IntegerType.get())), + Types.StructType.of( + required(200, "z", Types.IntegerType.get()), + optional(201, "innerMap", Types.MapType.ofOptional(202, 203, + Types.IntegerType.get(), + Types.StructType.of()))))))); + Schema actualDepthTwo = TypeUtil.project(schema, Sets.newHashSet(10, 13, 14, 100, 101, 200, 202, 203)); + Assert.assertEquals(expectedDepthTwo.asStruct(), actualDepthTwo.asStruct()); + } + + @Test + public void testProjectList() { + Schema schema = new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required(11, "A", Types.IntegerType.get()), + required(12, "list", Types.ListType.ofRequired(13, + Types.StructType.of( + optional(20, "foo", Types.IntegerType.get()), + required(21, "subList", Types.ListType.ofRequired(14, + Types.StructType.of( + required(15, "x", 
Types.IntegerType.get()), + required(16, "y", Types.IntegerType.get()), + required(17, "z", Types.IntegerType.get()))))))))); + + + AssertHelpers.assertThrows("Cannot explicitly project List", + IllegalArgumentException.class, + () -> TypeUtil.project(schema, Sets.newHashSet(12)) + ); + + AssertHelpers.assertThrows("Cannot explicitly project List", + IllegalArgumentException.class, + () -> TypeUtil.project(schema, Sets.newHashSet(21)) + ); + + Schema expectedDepthOne = new Schema( + Lists.newArrayList( + required(12, "list", Types.ListType.ofRequired(13, + Types.StructType.of())))); + Schema actualDepthOne = TypeUtil.project(schema, Sets.newHashSet(13)); + Assert.assertEquals(expectedDepthOne.asStruct(), actualDepthOne.asStruct()); + + Schema expectedDepthTwo = new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required(12, "list", Types.ListType.ofRequired(13, + Types.StructType.of( + optional(20, "foo", Types.IntegerType.get()), + required(21, "subList", Types.ListType.ofRequired(14, + Types.StructType.of()))))))); + Schema actualDepthTwo = TypeUtil.project(schema, Sets.newHashSet(10, 13, 20, 14)); + Assert.assertEquals(expectedDepthTwo.asStruct(), actualDepthTwo.asStruct()); + } + + @Test + public void testProjectListNested() { + Schema schema = new Schema( + Lists.newArrayList( + required(12, "list", Types.ListType.ofRequired(13, + Types.ListType.ofRequired(14, + Types.MapType.ofRequired(15, 16, + IntegerType.get(), + Types.StructType.of( + required(17, "x", Types.IntegerType.get()), + required(18, "y", Types.IntegerType.get()) + ))))))); + + AssertHelpers.assertThrows("Cannot explicitly project List", + IllegalArgumentException.class, + () -> TypeUtil.project(schema, Sets.newHashSet(12)) + ); + + AssertHelpers.assertThrows("Cannot explicitly project List", + IllegalArgumentException.class, + () -> TypeUtil.project(schema, Sets.newHashSet(13)) + ); + + AssertHelpers.assertThrows("Cannot explicitly project Map", + 
IllegalArgumentException.class, + () -> TypeUtil.project(schema, Sets.newHashSet(14)) + ); + + Schema expected = new Schema( + Lists.newArrayList( + required(12, "list", Types.ListType.ofRequired(13, + Types.ListType.ofRequired(14, + Types.MapType.ofRequired(15, 16, + IntegerType.get(), + Types.StructType.of())))))); + + Schema actual = TypeUtil.project(schema, Sets.newHashSet(16)); + Assert.assertEquals(expected.asStruct(), actual.asStruct()); + } + + @Test + public void testProjectMapNested() { + Schema schema = new Schema( + Lists.newArrayList( + required(12, "map", Types.MapType.ofRequired(13, 14, + Types.IntegerType.get(), + Types.MapType.ofRequired(15, 16, + Types.IntegerType.get(), + Types.ListType.ofRequired(17, + Types.StructType.of( + required(18, "x", Types.IntegerType.get()), + required(19, "y", Types.IntegerType.get()) + ))))))); + + + AssertHelpers.assertThrows("Cannot explicitly project Map", + IllegalArgumentException.class, + () -> TypeUtil.project(schema, Sets.newHashSet(12)) + ); + + AssertHelpers.assertThrows("Cannot explicitly project Map", + IllegalArgumentException.class, + () -> TypeUtil.project(schema, Sets.newHashSet(14)) + ); + + AssertHelpers.assertThrows("Cannot explicitly project List", + IllegalArgumentException.class, + () -> TypeUtil.project(schema, Sets.newHashSet(16)) + ); + + Schema expected = new Schema( + Lists.newArrayList( + required(12, "map", Types.MapType.ofRequired(13, 14, + Types.IntegerType.get(), + Types.MapType.ofRequired(15, 16, + Types.IntegerType.get(), + Types.ListType.ofRequired(17, + Types.StructType.of())))))); + + Schema actual = TypeUtil.project(schema, Sets.newHashSet(17)); + Assert.assertEquals(expected.asStruct(), actual.asStruct()); + } + @Test(expected = IllegalArgumentException.class) public void testReassignIdsIllegalArgumentException() { Schema schema = new Schema(