-
Notifications
You must be signed in to change notification settings - Fork 5.5k
Use distinct semantics for distincting array functions #22938
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
03a544f
49c6260
3d962ba
c4fa9a3
ae90200
600d474
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,14 +21,19 @@ | |
| import com.facebook.presto.operator.project.SelectedPositions; | ||
| import org.openjdk.jol.info.ClassLayout; | ||
|
|
||
| import java.lang.invoke.MethodHandle; | ||
| import java.util.ArrayList; | ||
| import java.util.Arrays; | ||
| import java.util.List; | ||
| import java.util.Optional; | ||
|
|
||
| import static com.facebook.presto.common.array.Arrays.ensureCapacity; | ||
| import static com.facebook.presto.common.type.TypeUtils.readNativeValue; | ||
| import static com.facebook.presto.operator.project.SelectedPositions.positionsList; | ||
| import static com.facebook.presto.type.TypeUtils.hashPosition; | ||
| import static com.facebook.presto.type.TypeUtils.positionEqualsPosition; | ||
| import static com.facebook.presto.util.Failures.internalError; | ||
| import static com.google.common.base.Defaults.defaultValue; | ||
| import static com.google.common.base.Preconditions.checkArgument; | ||
| import static io.airlift.slice.SizeOf.sizeOf; | ||
| import static it.unimi.dsi.fastutil.HashCommon.arraySize; | ||
|
|
@@ -47,6 +52,7 @@ public class OptimizedTypedSet | |
| private static final SelectedPositions EMPTY_SELECTED_POSITIONS = positionsList(new int[0], 0, 0); | ||
|
|
||
| private final Type elementType; | ||
| private final Optional<MethodHandle> elementIsDistinctFrom; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How difficult would it be to make it required and always use
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't do it because this is also used for MapConcatFunction, and maps don't support indeterminate types, but it's pretty easy to change, and the maps should fail later anyway. I can make this change.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. actually, map_concat doesn't otherwise check that map keys are not indeterminate or null (caused failing tests because of this), and it's a bit challenging to get it to do that because it's a varargs function. I'm going to put this back to the way it was, and look into improving it as a follow up. |
||
| private final int hashCapacity; | ||
| private final int hashMask; | ||
|
|
||
|
|
@@ -56,17 +62,18 @@ public class OptimizedTypedSet | |
| private long[] blockPositionByHash; // Each 64-bit long is 32-bit index for blocks + 32-bit position within block | ||
| private int currentBlockIndex = -1; // The index into the blocks array and positionsForBlocks list | ||
|
|
||
| public OptimizedTypedSet(Type elementType, int maxPositionCount) | ||
| public OptimizedTypedSet(Type elementType, MethodHandle elementIsDistinctFrom, int maxPositionCount) | ||
| { | ||
| this(elementType, INITIAL_BLOCK_COUNT, maxPositionCount); | ||
| this(elementType, Optional.of(elementIsDistinctFrom), INITIAL_BLOCK_COUNT, maxPositionCount); | ||
| } | ||
|
|
||
| public OptimizedTypedSet(Type elementType, int expectedBlockCount, int maxPositionCount) | ||
| public OptimizedTypedSet(Type elementType, Optional<MethodHandle> elementIsDistinctFrom, int expectedBlockCount, int maxPositionCount) | ||
| { | ||
| checkArgument(expectedBlockCount >= 0, "expectedBlockCount must not be negative"); | ||
| checkArgument(maxPositionCount >= 0, "maxPositionCount must not be negative"); | ||
|
|
||
| this.elementType = requireNonNull(elementType, "elementType must not be null"); | ||
| this.elementIsDistinctFrom = requireNonNull(elementIsDistinctFrom, "elementIsDistinctFrom is null"); | ||
| this.hashCapacity = arraySize(maxPositionCount, FILL_RATIO); | ||
| this.hashMask = hashCapacity - 1; | ||
|
|
||
|
|
@@ -293,14 +300,31 @@ private int getInsertPosition(long[] hashtable, int hashPosition, Block block, i | |
| // Already has this element | ||
| int blockIndex = (int) ((blockPosition & 0xffff_ffff_0000_0000L) >> 32); | ||
| int positionWithinBlock = (int) (blockPosition & 0xffff_ffff); | ||
| if (positionEqualsPosition(elementType, blocks[blockIndex], positionWithinBlock, block, position)) { | ||
| if (isContainedAt(blocks[blockIndex], positionWithinBlock, block, position)) { | ||
| return INVALID_POSITION; | ||
| } | ||
|
|
||
| hashPosition = getMaskedHash(hashPosition + 1); | ||
| } | ||
| } | ||
|
|
||
| private boolean isContainedAt(Block firstBlock, int positionWithinFirstBlock, Block secondBlock, int positionWithinSecondBlock) | ||
| { | ||
| if (elementIsDistinctFrom.isPresent()) { | ||
| boolean firstValueNull = firstBlock.isNull(positionWithinFirstBlock); | ||
| Object firstValue = firstValueNull ? defaultValue(elementType.getJavaType()) : readNativeValue(elementType, firstBlock, positionWithinFirstBlock); | ||
| boolean secondValueNull = secondBlock.isNull(positionWithinSecondBlock); | ||
| Object secondValue = secondValueNull ? defaultValue(elementType.getJavaType()) : readNativeValue(elementType, secondBlock, positionWithinSecondBlock); | ||
| try { | ||
| return !(boolean) elementIsDistinctFrom.get().invoke(firstValue, firstValueNull, secondValue, secondValueNull); | ||
| } | ||
| catch (Throwable t) { | ||
| throw internalError(t); | ||
| } | ||
| } | ||
| return positionEqualsPosition(elementType, firstBlock, positionWithinFirstBlock, secondBlock, positionWithinSecondBlock); | ||
| } | ||
|
|
||
| /** | ||
| * Add an element to the hash table if it's not already existed. | ||
| * | ||
|
|
@@ -322,7 +346,7 @@ private boolean addElement(long[] hashtable, int hashPosition, Block block, int | |
| // Already has this element | ||
| int blockIndex = (int) ((blockPosition & 0xffff_ffff_0000_0000L) >> 32); | ||
| int positionWithinBlock = (int) (blockPosition & 0xffff_ffff); | ||
| if (positionEqualsPosition(elementType, blocks[blockIndex], positionWithinBlock, block, position)) { | ||
| if (isContainedAt(blocks[blockIndex], positionWithinBlock, block, position)) { | ||
| return false; | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we ever collect dynamic filters for complex types (e.g.: rows?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure, but dynamic filter needs to match join semantics, and joins fail on indeterminate arrays/rows, so dynamic filter should do the same.
examples: