-
Notifications
You must be signed in to change notification settings - Fork 3.5k
Compute min/max for too long orc string columns #11652
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,8 +22,13 @@ | |
| import static com.google.common.base.Preconditions.checkArgument; | ||
| import static com.google.common.base.Preconditions.checkState; | ||
| import static com.google.common.base.Verify.verify; | ||
| import static io.airlift.slice.SliceUtf8.codePointToUtf8; | ||
| import static io.airlift.slice.SliceUtf8.getCodePointAt; | ||
| import static io.airlift.slice.Slices.EMPTY_SLICE; | ||
| import static io.trino.orc.metadata.statistics.StringStatistics.STRING_VALUE_BYTES_OVERHEAD; | ||
| import static java.lang.Character.MAX_CODE_POINT; | ||
| import static java.lang.Math.addExact; | ||
| import static java.lang.System.arraycopy; | ||
| import static java.util.Objects.requireNonNull; | ||
|
|
||
| public class StringStatisticsBuilder | ||
|
|
@@ -36,20 +41,34 @@ public class StringStatisticsBuilder | |
| private Slice maximum; | ||
| private long sum; | ||
| private final BloomFilterBuilder bloomFilterBuilder; | ||
| private final boolean shouldCompactMinMax; | ||
|
|
||
| public StringStatisticsBuilder(int stringStatisticsLimitInBytes, BloomFilterBuilder bloomFilterBuilder) | ||
| { | ||
| this(stringStatisticsLimitInBytes, 0, null, null, 0, bloomFilterBuilder); | ||
| this(stringStatisticsLimitInBytes, 0, null, null, 0, bloomFilterBuilder, true); | ||
| } | ||
|
|
||
| private StringStatisticsBuilder(int stringStatisticsLimitInBytes, long nonNullValueCount, Slice minimum, Slice maximum, long sum, BloomFilterBuilder bloomFilterBuilder) | ||
| public StringStatisticsBuilder(int stringStatisticsLimitInBytes, BloomFilterBuilder bloomFilterBuilder, boolean shouldCompactMinMax) | ||
| { | ||
| this(stringStatisticsLimitInBytes, 0, null, null, 0, bloomFilterBuilder, shouldCompactMinMax); | ||
| } | ||
|
|
||
| private StringStatisticsBuilder( | ||
| int stringStatisticsLimitInBytes, | ||
| long nonNullValueCount, | ||
| Slice minimum, | ||
| Slice maximum, | ||
| long sum, | ||
| BloomFilterBuilder bloomFilterBuilder, | ||
| boolean shouldCompactMinMax) | ||
| { | ||
| this.stringStatisticsLimitInBytes = stringStatisticsLimitInBytes; | ||
| this.nonNullValueCount = nonNullValueCount; | ||
| this.minimum = minimum; | ||
| this.maximum = maximum; | ||
| this.sum = sum; | ||
| this.bloomFilterBuilder = requireNonNull(bloomFilterBuilder, "bloomFilterBuilder"); | ||
| this.shouldCompactMinMax = shouldCompactMinMax; | ||
| } | ||
|
|
||
| public long getNonNullValueCount() | ||
|
|
@@ -112,8 +131,8 @@ private Optional<StringStatistics> buildStringStatistics() | |
| if (nonNullValueCount == 0) { | ||
| return Optional.empty(); | ||
| } | ||
| minimum = dropStringMinMaxIfNecessary(minimum); | ||
| maximum = dropStringMinMaxIfNecessary(maximum); | ||
| minimum = computeStringMinMax(minimum, true); | ||
| maximum = computeStringMinMax(maximum, false); | ||
| if (minimum == null && maximum == null) { | ||
| // Create string stats only when min or max is not null. | ||
| // This corresponds to the behavior of metadata reader. | ||
|
|
@@ -158,16 +177,87 @@ public static Optional<StringStatistics> mergeStringStatistics(List<ColumnStatis | |
| return stringStatisticsBuilder.buildStringStatistics(); | ||
| } | ||
|
|
||
| private Slice dropStringMinMaxIfNecessary(Slice minOrMax) | ||
| private Slice computeStringMinMax(Slice minOrMax, boolean isMin) | ||
| { | ||
| if (minOrMax == null || minOrMax.length() > stringStatisticsLimitInBytes) { | ||
| if (minOrMax == null || (!shouldCompactMinMax && minOrMax.length() > stringStatisticsLimitInBytes)) { | ||
| return null; | ||
| } | ||
| if (minOrMax.length() > stringStatisticsLimitInBytes) { | ||
| if (isMin) { | ||
| return StringCompactor.truncateMin(minOrMax, stringStatisticsLimitInBytes); | ||
| } | ||
| else { | ||
| return StringCompactor.truncateMax(minOrMax, stringStatisticsLimitInBytes); | ||
| } | ||
| } | ||
|
|
||
| // Do not hold the entire slice where the actual stats could be small | ||
| if (minOrMax.isCompact()) { | ||
| return minOrMax; | ||
| } | ||
| return Slices.copyOf(minOrMax); | ||
| } | ||
|
|
||
| /** | ||
| * Shortens over-long UTF-8 string statistics to at most a given number of bytes, | ||
| * always cutting on a character boundary. A shortened minimum is a prefix of the | ||
| * original value; a shortened maximum is a prefix whose last character is replaced | ||
| * by its successor code point, so that it sorts after the original value. | ||
| */ | ||
| static final class StringCompactor | ||
| { | ||
| private static final int INDEX_NOT_FOUND = -1; | ||
|
|
||
| private StringCompactor() {} | ||
|
|
||
| /** | ||
| * Returns the longest prefix of {@code slice} that ends on a character boundary | ||
| * and is at most {@code maxBytes} bytes long. Returns {@code EMPTY_SLICE} when no | ||
| * character start is found at or before index {@code maxBytes}. | ||
| * | ||
| * @throws IllegalArgumentException if {@code slice} is not longer than {@code maxBytes} | ||
| */ | ||
| public static Slice truncateMin(Slice slice, int maxBytes) | ||
| { | ||
| checkArgument(slice.length() > maxBytes); | ||
| int lastIndex = findLastCharacterInRange(slice, maxBytes); | ||
| if (lastIndex == INDEX_NOT_FOUND) { | ||
| return EMPTY_SLICE; | ||
| } | ||
| return slice.slice(0, lastIndex); | ||
| } | ||
|
|
||
| /** | ||
| * Builds a value of at most {@code maxBytes} bytes that sorts after {@code slice}: | ||
| * a character-aligned prefix of {@code slice} whose last character is replaced by | ||
| * the next encodable code point. Characters are dropped from the end of the prefix | ||
| * until one is found that can be incremented (it is not {@code MAX_CODE_POINT}) | ||
| * and whose incremented encoding still fits within {@code maxBytes}. | ||
| * Returns {@code EMPTY_SLICE} when no such character exists. | ||
| * | ||
| * @throws IllegalArgumentException if {@code slice} is not longer than {@code maxBytes} | ||
| */ | ||
| public static Slice truncateMax(Slice slice, int maxBytes) | ||
| { | ||
| checkArgument(slice.length() > maxBytes); | ||
| int firstRemovedCharacterIndex = findLastCharacterInRange(slice, maxBytes); | ||
| if (firstRemovedCharacterIndex == INDEX_NOT_FOUND) { | ||
| return EMPTY_SLICE; | ||
| } | ||
|
|
||
| int candidateIndex = findLastCharacterInRange(slice, firstRemovedCharacterIndex - 1); | ||
| while (candidateIndex != INDEX_NOT_FOUND) { | ||
| int codePoint = getCodePointAt(slice, candidateIndex); | ||
| if (codePoint != MAX_CODE_POINT) { | ||
| Slice sliceToAppend = codePointToUtf8(nextCodePoint(codePoint)); | ||
| // the successor may need one more byte than the original character (e.g. U+007F -> U+0080), | ||
| // so accept it only when the result still respects the limit | ||
| if (candidateIndex + sliceToAppend.length() <= maxBytes) { | ||
| byte[] result = new byte[candidateIndex + sliceToAppend.length()]; | ||
| arraycopy(slice.byteArray(), slice.byteArrayOffset(), result, 0, candidateIndex); | ||
| arraycopy(sliceToAppend.byteArray(), sliceToAppend.byteArrayOffset(), result, candidateIndex, sliceToAppend.length()); | ||
| return Slices.wrappedBuffer(result); | ||
| } | ||
| } | ||
| // this character cannot be incremented within the limit; drop it and try the previous one | ||
| candidateIndex = findLastCharacterInRange(slice, candidateIndex - 1); | ||
| } | ||
| // no retained character can be incremented, so no upper bound within maxBytes exists | ||
| return EMPTY_SLICE; | ||
| } | ||
|
|
||
| /** | ||
| * Returns the code point following {@code codePoint}, skipping the surrogate range | ||
| * U+D800..U+DFFF, which cannot be encoded as well-formed UTF-8. | ||
| * The caller must not pass {@code MAX_CODE_POINT}. | ||
| */ | ||
| private static int nextCodePoint(int codePoint) | ||
| { | ||
| int next = codePoint + 1; | ||
| if (next >= Character.MIN_SURROGATE && next <= Character.MAX_SURROGATE) { | ||
| return Character.MAX_SURROGATE + 1; | ||
| } | ||
| return next; | ||
| } | ||
|
|
||
| /** | ||
| * Scans backwards from {@code toInclusive} and returns the index of the first byte | ||
| * that starts a UTF-8 character, or {@code INDEX_NOT_FOUND} when there is none | ||
| * (including when {@code toInclusive} is negative). | ||
| */ | ||
| private static int findLastCharacterInRange(Slice slice, int toInclusive) | ||
| { | ||
| int pos = toInclusive; | ||
| while (pos >= 0) { | ||
| if (isUtfBlockStartChar(slice.getByte(pos))) { | ||
| return pos; | ||
| } | ||
| pos--; | ||
| } | ||
| return INDEX_NOT_FOUND; | ||
| } | ||
|
|
||
| /** | ||
| * Returns true unless {@code b} is a UTF-8 continuation byte (bit pattern 10xxxxxx). | ||
| */ | ||
| private static boolean isUtfBlockStartChar(byte b) | ||
| { | ||
| return (b & 0xC0) != 0x80; | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this will throw (`IllegalArgumentException("Provided byte array is not a valid utf8 string")`) when the input is e.g. a 4-byte UTF-8 sequence and maxBytes=3 (please add a test). Let's:
- have `findLastCharacterInRange` return `-1` (or perhaps `Optional.empty`) in such a case
- remove the `if (maxBytes == 0)` check above, as it becomes obsolete

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok but this will look very ugly
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
propose something else?