Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,8 @@
import java.util.Comparator;
import java.util.Deque;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.iceberg.expressions.PathUtil;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
Expand Down Expand Up @@ -403,19 +400,14 @@ public Type array(VariantArray array, List<Type> elementResults) {
return null;
}

// Choose most common type as shredding type and build 3-level list
Type defaultTYpe = elementResults.get(0);
Type shredType =
elementResults.stream()
.filter(Objects::nonNull)
.collect(Collectors.groupingBy(Function.identity(), Collectors.counting()))
.entrySet()
.stream()
.max(Map.Entry.comparingByValue())
.map(Map.Entry::getKey)
.orElse(defaultTYpe);

return list(shredType);
// Shred if all the elements are of a uniform type and build 3-level list
Type shredType = elementResults.get(0);
if (shredType != null
&& elementResults.stream().allMatch(type -> Objects.equals(type, shredType))) {
return list(shredType);
}

return null;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think there are cases where we want to shred an array of mixed types, with each value stored in value as an encoded variant? I'm not sure.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we will have the case that all the elements will be stored in value since I would expect engines will at least choose a shredding type that matches some of the elements. I think this is more common: by sampling the elements, we choose INT32 as the shredding type while some are strings. If all of theme are in value and they are binary, that doesn't seem to be useful.

}

private static GroupType list(Type shreddedType) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ public class TestVariantWriters {
private static final ByteBuffer NESTED_ARRAY_BUFFER =
VariantTestUtil.createArray(
array(Variants.of("string"), Variants.of("iceberg")),
array(Variants.of("string"), Variants.of("iceberg")));
array(Variants.of("apple"), Variants.of("banana")));
private static final ByteBuffer MIXED_NESTED_ARRAY_BUFFER =
VariantTestUtil.createArray(
array(Variants.of("string"), Variants.of("iceberg"), Variants.of(34)),
Expand Down