prestodb · rschlussel · Jun 25, 2024 · Jun 5, 2024 · Jun 5, 2024 · Jun 5, 2024
@@ -184,13 +184,23 @@ General Aggregate Functions
     Returns an array created from the distinct input ``x`` elements.
 
     If the input includes ``NULL``, ``NULL`` will be included in the returned array.
+    If the input includes arrays with ``NULL`` elements or rows with ``NULL`` fields, they will
+    be included in the returned array.  This function uses ``IS DISTINCT FROM`` to determine
+    distinctness. ::
+
+        SELECT set_agg(x) FROM (VALUES(1), (2), (null), (2), (null)) t(x) -- ARRAY[1, 2, null]
+        SELECT set_agg(x) FROM (VALUES(ROW(ROW(1, null))), ROW((ROW(2, 'a'))), ROW((ROW(1, null))), (null)) t(x) -- ARRAY[ROW(1, null), ROW(2, 'a'), null]
+
 
 .. function:: set_union(array(T)) -> array(T)
 
     Returns an array of all the distinct values contained in each array of the input.
 
     When all inputs are ``NULL``, this function returns an empty array. If ``NULL`` is
     an element of one of the input arrays, ``NULL`` will be included in the returned array.
+    If the input includes arrays with ``NULL`` elements or rows with ``NULL`` fields, they will
+    be included in the returned array.  This function uses ``IS DISTINCT FROM`` to determine
+    distinctness.
 
     Example::
 

@@ -50,14 +50,25 @@ Array Functions
 .. function:: array_distinct(x) -> array
 
     Remove duplicate values from the array ``x``.
+    This function uses ``IS DISTINCT FROM`` to determine the distinct elements. ::
+
+        SELECT array_distinct(ARRAY [1, 2, null, null, 2]) -- ARRAY[1, 2, null]
+        SELECT array_distinct(ARRAY [ROW(1, null), ROW (1, null)]) -- ARRAY[ROW(1, null)]
 
 .. function:: array_duplicates(array(T)) -> array(bigint/varchar)
 
     Returns a set of elements that occur more than once in ``array``.
+    Throws an exception if any of the elements are rows or arrays that contain nulls. ::
+
+        SELECT array_duplicates(ARRAY[1, 2, null, 1, null, 3]) -- ARRAY[1, null]
+        SELECT array_duplicates(ARRAY[ROW(1, null), ROW(1, null)]) -- "map key cannot be null or contain nulls"
 
 .. function:: array_except(x, y) -> array
 
     Returns an array of elements in ``x`` but not in ``y``, without duplicates.
+    This function uses ``IS NOT DISTINCT FROM`` to determine which elements are the same. ::
+
+        SELECT array_except(ARRAY[1, 3, 3, 2, null], ARRAY[1,2, 2, 4]) -- ARRAY[3, null]
 
 .. function:: array_frequency(array(E)) -> map(E, int)
 
@@ -67,31 +78,47 @@ Array Functions
 .. function:: array_has_duplicates(array(T)) -> boolean
 
     Returns a boolean: whether ``array`` has any elements that occur more than once.
+    Throws an exception if any of the elements are rows or arrays that contain nulls. ::
+
+        SELECT array_has_duplicates(ARRAY[1, 2, null, 1, null, 3]) -- true
+        SELECT array_has_duplicates(ARRAY[ROW(1, null), ROW(1, null)]) -- "map key cannot be null or contain nulls"
 
 .. function:: array_intersect(x, y) -> array
 
     Returns an array of the elements in the intersection of ``x`` and ``y``, without duplicates.
+    This function uses ``IS NOT DISTINCT FROM`` to determine which elements are the same. ::
+
+        SELECT array_intersect(ARRAY[1, 2, 3, 2, null], ARRAY[1,2, 2, 4, null]) -- ARRAY[1, 2, null]
 
 .. function:: array_intersect(array(array(E))) -> array(E)
 
     Returns an array of the elements in the intersection of all arrays in the given array, without duplicates.
+    This function uses ``IS NOT DISTINCT FROM`` to determine which elements are the same. ::
+
+        SELECT array_intersect(ARRAY[ARRAY[1, 2, 3, 2, null], ARRAY[1,2,2, 4, null], ARRAY [1, 2, 3, 4, null]])  -- ARRAY[1, 2, null]
 
 .. function:: array_join(x, delimiter, null_replacement) -> varchar
 
     Concatenates the elements of the given array using the delimiter and an optional string to replace nulls.
 
 .. function:: array_least_frequent(array(T)) -> array(T)
 
-    Returns the least frequent element of an array. If there are multiple elements with same frequency, the function returns the smallest element. ::
+    Returns the least frequent non-null element of an array. If there are multiple elements with the same frequency, the function returns the smallest element.
+    If the array has more than one element and any elements are of type ``ROW`` with null fields or ``ARRAY`` with null elements, an exception is thrown. ::
 
         SELECT array_least_frequent(ARRAY[1, 0 , 5])  -- ARRAY[0]
+        SELECT array_least_frequent(ARRAY[1, null, 1]) -- ARRAY[1]
+        SELECT array_least_frequent(ARRAY[ROW(1,null), ROW(1, null)]) -- "map key cannot be null or contain nulls"
 
 .. function:: array_least_frequent(array(T), n) -> array(T)
 
-    Returns ``n`` least frequent elements of an array. The elements are ordered in increasing order of their frequencies.
-    If two elements have same frequency, smaller elements will appear first. ::
+    Returns ``n`` least frequent non-null elements of an array. The elements are ordered in increasing order of their frequencies.
+    If two elements have the same frequency, smaller elements will appear first.
+    If the array has more than one element and any elements are of type ``ROW`` with null fields or ``ARRAY`` with null elements, an exception is thrown. ::
 
         SELECT array_least_frequent(ARRAY[3, 2, 2, 6, 6, 1, 1], 3) -- ARRAY[3, 1, 2]
+        SELECT array_least_frequent(ARRAY[1, null, 1], 2) -- ARRAY[1]
+        SELECT array_least_frequent(ARRAY[ROW(1,null), ROW(1, null)], 2) -- "map key cannot be null or contain nulls"
 
 .. function:: array_max(x) -> x
 
@@ -139,7 +166,7 @@ Array Functions
 .. function:: array_sort(x) -> array
 
     Sorts and returns the array ``x``. The elements of ``x`` must be orderable.
-    Null elements will be placed at the end of the returned array.
+    Null elements are placed at the end of the returned array.
 
 .. function:: array_sort(array(T), function(T,T,int)) -> array(T)
 
@@ -174,7 +201,7 @@ Array Functions
 .. function:: array_sort_desc(x) -> array
 
     Returns the ``array`` sorted in the descending order. Elements of the ``array`` must be orderable.
-    Null elements will be placed at the end of the returned array.::
+    Null elements are placed at the end of the returned array. ::
 
         SELECT array_sort_desc(ARRAY [100, 1, 10, 50]); -- [100, 50, 10, 1]
         SELECT array_sort_desc(ARRAY [null, 100, null, 1, 10, 50]); -- [100, 50, 10, 1, null, null]
@@ -201,10 +228,19 @@ Array Functions
 
     Tests if arrays ``x`` and ``y`` have any non-null elements in common.
     Returns null if there are no non-null elements in common but either array contains null.
+    Throws a ``NOT_SUPPORTED`` exception on elements of ``ROW`` or ``ARRAY`` type that contain null values. ::
+
+        SELECT arrays_overlap(ARRAY [1, 2, null], ARRAY [2, 3, null]) -- true
+        SELECT arrays_overlap(ARRAY [1, 2], ARRAY [3, 4]) -- false
+        SELECT arrays_overlap(ARRAY [1, null], ARRAY[2]) -- null
+        SELECT arrays_overlap(ARRAY[ROW(1, null)], ARRAY[1, 2]) -- "ROW comparison not supported for fields with null elements"
 
 .. function:: array_union(x, y) -> array
 
     Returns an array of the elements in the union of ``x`` and ``y``, without duplicates.
+    This function uses ``IS NOT DISTINCT FROM`` to determine which elements are the same. ::
+
+        SELECT array_union(ARRAY[1, 2, 3, 2, null], ARRAY[1,2, 2, 4, null]) -- ARRAY[1, 2, 3, 4, null]
 
 .. function:: cardinality(x) -> bigint
 

@@ -90,8 +90,6 @@
 import com.facebook.presto.operator.aggregation.VarianceAggregation;
 import com.facebook.presto.operator.aggregation.approxmostfrequent.ApproximateMostFrequent;
 import com.facebook.presto.operator.aggregation.arrayagg.ArrayAggregationFunction;
-import com.facebook.presto.operator.aggregation.arrayagg.SetAggregationFunction;
-import com.facebook.presto.operator.aggregation.arrayagg.SetUnionFunction;
 import com.facebook.presto.operator.aggregation.differentialentropy.DifferentialEntropyAggregation;
 import com.facebook.presto.operator.aggregation.histogram.Histogram;
 import com.facebook.presto.operator.aggregation.multimapagg.AlternativeMultimapAggregationFunction;
@@ -372,6 +370,8 @@
 import static com.facebook.presto.operator.aggregation.TDigestAggregationFunction.TDIGEST_AGG;
 import static com.facebook.presto.operator.aggregation.TDigestAggregationFunction.TDIGEST_AGG_WITH_WEIGHT;
 import static com.facebook.presto.operator.aggregation.TDigestAggregationFunction.TDIGEST_AGG_WITH_WEIGHT_AND_COMPRESSION;
+import static com.facebook.presto.operator.aggregation.arrayagg.SetAggregationFunction.SET_AGG;
+import static com.facebook.presto.operator.aggregation.arrayagg.SetUnionFunction.SET_UNION;
 import static com.facebook.presto.operator.aggregation.minmaxby.AlternativeMaxByAggregationFunction.ALTERNATIVE_MAX_BY;
 import static com.facebook.presto.operator.aggregation.minmaxby.AlternativeMinByAggregationFunction.ALTERNATIVE_MIN_BY;
 import static com.facebook.presto.operator.aggregation.minmaxby.MaxByAggregationFunction.MAX_BY;
@@ -932,8 +932,7 @@ private List<? extends SqlFunction> getBuiltInFunctions(FeaturesConfig featuresC
                 .function(ARRAY_FLATTEN_FUNCTION)
                 .function(ARRAY_CONCAT_FUNCTION)
                 .functions(ARRAY_CONSTRUCTOR, ARRAY_SUBSCRIPT, ARRAY_TO_JSON, JSON_TO_ARRAY, JSON_STRING_TO_ARRAY)
-                .aggregate(SetAggregationFunction.class)
-                .aggregate(SetUnionFunction.class)
+                .functions(SET_AGG, SET_UNION)
                 .function(new ArrayAggregationFunction(featuresConfig.isLegacyArrayAgg(), featuresConfig.getArrayAggGroupImplementation()))
                 .functions(new MapSubscriptOperator(featuresConfig.isLegacyMapSubscript()))
                 .functions(MAP_CONSTRUCTOR, MAP_TO_JSON, JSON_TO_MAP, JSON_STRING_TO_MAP)

@@ -203,6 +203,7 @@ private DynamicFilterSourceOperator(
             this.blockBuilders[channelIndex] = type.createBlockBuilder(null, EXPECTED_BLOCK_BUILDER_SIZE);
             this.valueSets[channelIndex] = new TypedSet(
                     type,
+                    Optional.empty(),
                     blockBuilders[channelIndex],
                     EXPECTED_BLOCK_BUILDER_SIZE,
                     String.format("DynamicFilterSourceOperator_%s_%d", planNodeId, channelIndex),

@@ -21,14 +21,19 @@
 import com.facebook.presto.operator.project.SelectedPositions;
 import org.openjdk.jol.info.ClassLayout;
 
+import java.lang.invoke.MethodHandle;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.Optional;
 
 import static com.facebook.presto.common.array.Arrays.ensureCapacity;
+import static com.facebook.presto.common.type.TypeUtils.readNativeValue;
 import static com.facebook.presto.operator.project.SelectedPositions.positionsList;
 import static com.facebook.presto.type.TypeUtils.hashPosition;
 import static com.facebook.presto.type.TypeUtils.positionEqualsPosition;
+import static com.facebook.presto.util.Failures.internalError;
+import static com.google.common.base.Defaults.defaultValue;
 import static com.google.common.base.Preconditions.checkArgument;
 import static io.airlift.slice.SizeOf.sizeOf;
 import static it.unimi.dsi.fastutil.HashCommon.arraySize;
@@ -47,6 +52,7 @@ public class OptimizedTypedSet
     private static final SelectedPositions EMPTY_SELECTED_POSITIONS = positionsList(new int[0], 0, 0);
 
     private final Type elementType;
+    private final Optional<MethodHandle> elementIsDistinctFrom;
     private final int hashCapacity;
     private final int hashMask;
 
@@ -56,17 +62,18 @@ public class OptimizedTypedSet
     private long[] blockPositionByHash;  // Each 64-bit long is 32-bit index for blocks + 32-bit position within block
     private int currentBlockIndex = -1;  // The index into the blocks array and positionsForBlocks list
 
-    public OptimizedTypedSet(Type elementType, int maxPositionCount)
+    public OptimizedTypedSet(Type elementType, MethodHandle elementIsDistinctFrom, int maxPositionCount)
     {
-        this(elementType, INITIAL_BLOCK_COUNT, maxPositionCount);
+        this(elementType, Optional.of(elementIsDistinctFrom), INITIAL_BLOCK_COUNT, maxPositionCount);
     }
 
-    public OptimizedTypedSet(Type elementType, int expectedBlockCount, int maxPositionCount)
+    public OptimizedTypedSet(Type elementType, Optional<MethodHandle> elementIsDistinctFrom, int expectedBlockCount, int maxPositionCount)
     {
         checkArgument(expectedBlockCount >= 0, "expectedBlockCount must not be negative");
         checkArgument(maxPositionCount >= 0, "maxPositionCount must not be negative");
 
         this.elementType = requireNonNull(elementType, "elementType must not be null");
+        this.elementIsDistinctFrom = requireNonNull(elementIsDistinctFrom, "elementIsDistinctFrom is null");
         this.hashCapacity = arraySize(maxPositionCount, FILL_RATIO);
         this.hashMask = hashCapacity - 1;
 
@@ -293,14 +300,31 @@ private int getInsertPosition(long[] hashtable, int hashPosition, Block block, i
             // Already has this element
             int blockIndex = (int) ((blockPosition & 0xffff_ffff_0000_0000L) >> 32);
             int positionWithinBlock = (int) (blockPosition & 0xffff_ffff);
-            if (positionEqualsPosition(elementType, blocks[blockIndex], positionWithinBlock, block, position)) {
+            if (isContainedAt(blocks[blockIndex], positionWithinBlock, block, position)) {
                 return INVALID_POSITION;
             }
 
             hashPosition = getMaskedHash(hashPosition + 1);
         }
     }
 
+    private boolean isContainedAt(Block firstBlock, int positionWithinFirstBlock, Block secondBlock, int positionWithinSecondBlock)  // true when both positions hold the same element, i.e. the probed value is already in the set
+    {
+        if (elementIsDistinctFrom.isPresent()) {  // a distinctness handle was supplied at construction: compare through it
+            boolean firstValueNull = firstBlock.isNull(positionWithinFirstBlock);
+            Object firstValue = firstValueNull ? defaultValue(elementType.getJavaType()) : readNativeValue(elementType, firstBlock, positionWithinFirstBlock);  // for a null slot, pass the Java type's default as a placeholder; the null flag is passed separately
+            boolean secondValueNull = secondBlock.isNull(positionWithinSecondBlock);
+            Object secondValue = secondValueNull ? defaultValue(elementType.getJavaType()) : readNativeValue(elementType, secondBlock, positionWithinSecondBlock);  // same placeholder handling for the second operand
+            try {
+                return !(boolean) elementIsDistinctFrom.get().invoke(firstValue, firstValueNull, secondValue, secondValueNull);  // the handle answers "is distinct"; negate to get "is the same element"
+            }
+            catch (Throwable t) {
+                throw internalError(t);  // MethodHandle.invoke is declared to throw Throwable; rethrow through Failures.internalError
+            }
+        }
+        return positionEqualsPosition(elementType, firstBlock, positionWithinFirstBlock, secondBlock, positionWithinSecondBlock);  // no handle supplied: fall back to TypeUtils.positionEqualsPosition
+    }
+
     /**
      * Add an element to the hash table if it's not already existed.
      *
@@ -322,7 +346,7 @@ private boolean addElement(long[] hashtable, int hashPosition, Block block, int
             // Already has this element
             int blockIndex = (int) ((blockPosition & 0xffff_ffff_0000_0000L) >> 32);
             int positionWithinBlock = (int) (blockPosition & 0xffff_ffff);
-            if (positionEqualsPosition(elementType, blocks[blockIndex], positionWithinBlock, block, position)) {
+            if (isContainedAt(blocks[blockIndex], positionWithinBlock, block, position)) {
                 return false;
             }
 

@@ -19,12 +19,15 @@
 import com.facebook.presto.spi.PrestoException;
 import org.openjdk.jol.info.ClassLayout;
 
+import java.lang.invoke.MethodHandle;
 import java.util.Arrays;
 
+import static com.facebook.presto.common.type.TypeUtils.readNativeValue;
 import static com.facebook.presto.spi.StandardErrorCode.GENERIC_INSUFFICIENT_RESOURCES;
 import static com.facebook.presto.type.TypeUtils.expectedValueSize;
 import static com.facebook.presto.type.TypeUtils.hashPosition;
-import static com.facebook.presto.type.TypeUtils.positionEqualsPosition;
+import static com.facebook.presto.util.Failures.internalError;
+import static com.google.common.base.Defaults.defaultValue;
 import static com.google.common.base.Preconditions.checkArgument;
 import static io.airlift.slice.SizeOf.sizeOf;
 import static it.unimi.dsi.fastutil.HashCommon.arraySize;
@@ -40,15 +43,17 @@ public final class SetOfValues
 
     private final BlockBuilder valueBlockBuilder;
     private final Type valueType;
+    MethodHandle elementIsDistinctFrom;
 
     private int[] valuePositionByHash;
     private int hashCapacity;
     private int maxFill;
     private int hashMask;
 
-    public SetOfValues(Type valueType)
+    public SetOfValues(Type valueType, MethodHandle elementIsDistinctFrom)
     {
         this.valueType = requireNonNull(valueType, "valueType is null");
+        this.elementIsDistinctFrom = requireNonNull(elementIsDistinctFrom, "elementIsDistinctFrom is null");
         valueBlockBuilder = this.valueType.createBlockBuilder(null, EXPECTED_ENTRIES, expectedValueSize(valueType, EXPECTED_ENTRY_SIZE));
         hashCapacity = arraySize(EXPECTED_ENTRIES, FILL_RATIO);
         this.maxFill = calculateMaxFill(hashCapacity);
@@ -57,9 +62,9 @@ public SetOfValues(Type valueType)
         Arrays.fill(valuePositionByHash, EMPTY_SLOT);
     }
 
-    public SetOfValues(Block serialized, Type elementType)
+    public SetOfValues(Block serialized, Type elementType, MethodHandle elementIsDistinctFrom)
     {
-        this(elementType);
+        this(elementType, elementIsDistinctFrom);
         deserialize(requireNonNull(serialized, "serialized is null"));
     }
 
@@ -111,13 +116,27 @@ private int getHashPositionOfValue(Block value, int position)
             if (valuePositionByHash[hashPosition] == EMPTY_SLOT) {
                 return hashPosition;
             }
-            else if (positionEqualsPosition(valueType, valueBlockBuilder, valuePositionByHash[hashPosition], value, position)) {
+            else if (isContainedAt(valueBlockBuilder, valuePositionByHash[hashPosition], value, position)) {
                 return hashPosition;
             }
             hashPosition = getMaskedHash(hashPosition + 1);
         }
     }
 
+    private boolean isContainedAt(Block firstBlock, int positionWithinFirstBlock, Block secondBlock, int positionWithinSecondBlock)  // true when both positions hold the same value according to the elementIsDistinctFrom handle
+    {
+        boolean firstValueNull = firstBlock.isNull(positionWithinFirstBlock);
+        Object firstValue = firstValueNull ? defaultValue(valueType.getJavaType()) : readNativeValue(valueType, firstBlock, positionWithinFirstBlock);  // for a null slot, pass the Java type's default as a placeholder; the null flag is passed separately
+        boolean secondValueNull = secondBlock.isNull(positionWithinSecondBlock);
+        Object secondValue = secondValueNull ? defaultValue(valueType.getJavaType()) : readNativeValue(valueType, secondBlock, positionWithinSecondBlock);  // same placeholder handling for the second operand
+        try {
+            return !(boolean) elementIsDistinctFrom.invoke(firstValue, firstValueNull, secondValue, secondValueNull);  // the handle answers "is distinct"; negate to get "is the same value"
+        }
+        catch (Throwable t) {
+            throw internalError(t);  // MethodHandle.invoke is declared to throw Throwable; rethrow through Failures.internalError
+        }
+    }
+
     private void rehash()
     {
         long newCapacityLong = hashCapacity * 2L;