From 7759b3e28e10780640204a18da42c91d8e13b96f Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Mon, 26 Nov 2018 09:35:05 +0100 Subject: [PATCH] Make array_distinct compare elements with "is distinct" --- .../presto/operator/aggregation/TypedSet.java | 31 +++++++++++++++++-- .../scalar/ArrayDistinctFunction.java | 19 ++++++++---- .../com/facebook/presto/type/TypeUtils.java | 17 ++++++++++ .../presto/type/TestArrayOperators.java | 10 ++++++ 4 files changed, 68 insertions(+), 9 deletions(-) diff --git a/presto-main/src/main/java/com/facebook/presto/operator/aggregation/TypedSet.java b/presto-main/src/main/java/com/facebook/presto/operator/aggregation/TypedSet.java index ac5eec5bf886e..6d4e6020dcbf1 100644 --- a/presto-main/src/main/java/com/facebook/presto/operator/aggregation/TypedSet.java +++ b/presto-main/src/main/java/com/facebook/presto/operator/aggregation/TypedSet.java @@ -22,9 +22,13 @@ import it.unimi.dsi.fastutil.ints.IntArrayList; import org.openjdk.jol.info.ClassLayout; +import java.lang.invoke.MethodHandle; +import java.util.Optional; + import static com.facebook.presto.spi.StandardErrorCode.EXCEEDED_FUNCTION_MEMORY_LIMIT; import static com.facebook.presto.spi.StandardErrorCode.GENERIC_INSUFFICIENT_RESOURCES; import static com.facebook.presto.type.TypeUtils.hashPosition; +import static com.facebook.presto.type.TypeUtils.positionDistinctFromPosition; import static com.facebook.presto.type.TypeUtils.positionEqualsPosition; import static com.google.common.base.Preconditions.checkArgument; import static io.airlift.units.DataSize.Unit.MEGABYTE; @@ -43,6 +47,7 @@ public class TypedSet static final long FOUR_MEGABYTES = MAX_FUNCTION_MEMORY.toBytes(); private final Type elementType; + private final Optional elementIsDistinctFrom; private final IntArrayList blockPositionByHash; private final BlockBuilder elementBlock; private final String functionName; @@ -55,9 +60,21 @@ public class TypedSet private boolean containsNullElement; public TypedSet(Type elementType, int expectedSize, String functionName) + { + // TODO revise other usages of TypedSet and determine whether they should use equality or distinctness semantics + this(elementType, Optional.empty(), expectedSize, functionName); + } + + public TypedSet(Type elementType, MethodHandle elementIsDistinctFrom, int expectedSize, String functionName) + { + this(elementType, Optional.of(elementIsDistinctFrom), expectedSize, functionName); + } + + private TypedSet(Type elementType, Optional elementIsDistinctFrom, int expectedSize, String functionName) { checkArgument(expectedSize >= 0, "expectedSize must not be negative"); this.elementType = requireNonNull(elementType, "elementType must not be null"); + this.elementIsDistinctFrom = requireNonNull(elementIsDistinctFrom, "elementIsDistinctFrom is null"); this.elementBlock = elementType.createBlockBuilder(null, expectedSize); this.functionName = functionName; @@ -126,12 +143,12 @@ private int getHashPositionOfElement(Block block, int position) int hashPosition = getMaskedHash(hashPosition(elementType, block, position)); while (true) { int blockPosition = blockPositionByHash.get(hashPosition); - // Doesn't have this element if (blockPosition == EMPTY_SLOT) { + // Doesn't have this element return hashPosition; } - // Already has this element - else if (positionEqualsPosition(elementType, elementBlock, blockPosition, block, position)) { + if (isContainedAt(block, position, blockPosition)) { + // Already has this element return hashPosition; } @@ -139,6 +156,14 @@ else if (positionEqualsPosition(elementType, elementBlock, blockPosition, block, } } + private boolean isContainedAt(Block block, int position, int atPosition) + { + if (elementIsDistinctFrom.isPresent()) { + return !positionDistinctFromPosition(elementType, elementBlock, atPosition, block, position, elementIsDistinctFrom.get()); + } + return positionEqualsPosition(elementType, elementBlock, atPosition, block, position); + } + private void addNewElement(int hashPosition, Block block, int position) { elementType.appendTo(block, position, elementBlock); diff --git a/presto-main/src/main/java/com/facebook/presto/operator/scalar/ArrayDistinctFunction.java b/presto-main/src/main/java/com/facebook/presto/operator/scalar/ArrayDistinctFunction.java index a5e6a8d4ce9fe..c8b1cccc55276 100644 --- a/presto-main/src/main/java/com/facebook/presto/operator/scalar/ArrayDistinctFunction.java +++ b/presto-main/src/main/java/com/facebook/presto/operator/scalar/ArrayDistinctFunction.java @@ -18,6 +18,7 @@ import com.facebook.presto.spi.block.Block; import com.facebook.presto.spi.block.BlockBuilder; import com.facebook.presto.spi.function.Description; +import com.facebook.presto.spi.function.OperatorDependency; import com.facebook.presto.spi.function.ScalarFunction; import com.facebook.presto.spi.function.SqlType; import com.facebook.presto.spi.function.TypeParameter; @@ -26,7 +27,12 @@ import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import it.unimi.dsi.fastutil.longs.LongSet; +import java.lang.invoke.MethodHandle; + +import static com.facebook.presto.spi.function.OperatorType.IS_DISTINCT_FROM; import static com.facebook.presto.spi.type.BigintType.BIGINT; +import static com.facebook.presto.spi.type.StandardTypes.BOOLEAN; +import static com.facebook.presto.type.TypeUtils.positionDistinctFromPosition; @ScalarFunction("array_distinct") @Description("Remove duplicate values from the given array") @@ -42,22 +48,23 @@ public ArrayDistinctFunction(@TypeParameter("E") Type elementType) @TypeParameter("E") @SqlType("array(E)") - public Block distinct(@TypeParameter("E") Type type, @SqlType("array(E)") Block array) + public Block distinct( + @TypeParameter("E") Type type, + @OperatorDependency(operator = IS_DISTINCT_FROM, returnType = BOOLEAN, argumentTypes = {"E", "E"}) MethodHandle elementIsDistinctFrom, + @SqlType("array(E)") Block array) { if (array.getPositionCount() < 2) { return array; } if (array.getPositionCount() == 2) { - if (type.equalTo(array, 0, array, 1)) { - return array.getSingleValueBlock(0); - } - else { + if (positionDistinctFromPosition(type, array, 0, array, 1, elementIsDistinctFrom)) { return array; } + return array.getSingleValueBlock(0); } - TypedSet typedSet = new TypedSet(type, array.getPositionCount(), "array_distinct"); + TypedSet typedSet = new TypedSet(type, elementIsDistinctFrom, array.getPositionCount(), "array_distinct"); int distinctCount = 0; if (pageBuilder.isFull()) { diff --git a/presto-main/src/main/java/com/facebook/presto/type/TypeUtils.java b/presto-main/src/main/java/com/facebook/presto/type/TypeUtils.java index 73d14115bdc6b..5f39cda7095ae 100644 --- a/presto-main/src/main/java/com/facebook/presto/type/TypeUtils.java +++ b/presto-main/src/main/java/com/facebook/presto/type/TypeUtils.java @@ -31,6 +31,9 @@ import static com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED; import static com.facebook.presto.spi.type.BigintType.BIGINT; +import static com.facebook.presto.spi.type.TypeUtils.readNativeValue; +import static com.facebook.presto.util.Failures.internalError; +import static com.google.common.base.Defaults.defaultValue; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Throwables.throwIfUnchecked; import static com.google.common.collect.ImmutableList.toImmutableList; @@ -100,6 +103,20 @@ public static boolean positionEqualsPosition(Type type, Block leftBlock, int lef return type.equalTo(leftBlock, leftPosition, rightBlock, rightPosition); } + public static boolean positionDistinctFromPosition(Type type, Block leftBlock, int leftPosition, Block rightBlock, int rightPosition, MethodHandle isDistinctFrom) + { + boolean firstValueNull = leftBlock.isNull(leftPosition); + Object firstValue = firstValueNull ? defaultValue(type.getJavaType()) : readNativeValue(type, leftBlock, leftPosition); + boolean secondValueNull = rightBlock.isNull(rightPosition); + Object secondValue = secondValueNull ? defaultValue(type.getJavaType()) : readNativeValue(type, rightBlock, rightPosition); + try { + return (boolean) isDistinctFrom.invoke(firstValue, firstValueNull, secondValue, secondValueNull); + } + catch (Throwable t) { + throw internalError(t); + } + } + public static Type resolveType(TypeSignature typeName, TypeManager typeManager) { return typeManager.getType(typeName); diff --git a/presto-main/src/test/java/com/facebook/presto/type/TestArrayOperators.java b/presto-main/src/test/java/com/facebook/presto/type/TestArrayOperators.java index d87419fb30243..760b48c7da63e 100644 --- a/presto-main/src/test/java/com/facebook/presto/type/TestArrayOperators.java +++ b/presto-main/src/test/java/com/facebook/presto/type/TestArrayOperators.java @@ -992,6 +992,16 @@ public void testDistinct() assertFunction("ARRAY_DISTINCT(ARRAY [NULL, NULL])", new ArrayType(UNKNOWN), asList((Object) null)); assertFunction("ARRAY_DISTINCT(ARRAY [NULL, NULL, NULL])", new ArrayType(UNKNOWN), asList((Object) null)); + // Indeterminate values + assertFunction( + "ARRAY_DISTINCT(ARRAY[(123, 'abc'), (123, NULL)])", + new ArrayType(RowType.anonymous(ImmutableList.of(INTEGER, createVarcharType(3)))), + ImmutableList.of(asList(123, "abc"), asList(123, null))); + assertFunction( + "ARRAY_DISTINCT(ARRAY[(NULL, NULL), (42, 'def'), (NULL, 'abc'), (123, NULL), (42, 'def'), (NULL, NULL), (NULL, 'abc'), (123, NULL)])", + new ArrayType(RowType.anonymous(ImmutableList.of(INTEGER, createVarcharType(3)))), + ImmutableList.of(asList(null, null), asList(42, "def"), asList(null, "abc"), asList(123, null))); + // Test for BIGINT-optimized implementation assertFunction("ARRAY_DISTINCT(ARRAY [CAST(5 AS BIGINT), NULL, CAST(12 AS BIGINT), NULL])", new ArrayType(BIGINT), asList(5L, null, 12L)); assertFunction("ARRAY_DISTINCT(ARRAY [CAST(100 AS BIGINT), NULL, CAST(100 AS BIGINT), NULL, 0, -2, 0])", new ArrayType(BIGINT), asList(100L, null, 0L, -2L));