diff --git a/lib/trino-orc/src/main/java/io/trino/orc/OrcWriter.java b/lib/trino-orc/src/main/java/io/trino/orc/OrcWriter.java index f0a1496d1df2..49e6e08b1a76 100644 --- a/lib/trino-orc/src/main/java/io/trino/orc/OrcWriter.java +++ b/lib/trino-orc/src/main/java/io/trino/orc/OrcWriter.java @@ -183,7 +183,8 @@ public OrcWriter( compression, maxCompressionBufferSize, options.getMaxStringStatisticsLimit(), - getBloomFilterBuilder(options, columnNames.get(fieldId))); + getBloomFilterBuilder(options, columnNames.get(fieldId)), + options.isShouldCompactMinMax()); columnWriters.add(columnWriter); if (columnWriter instanceof SliceDictionaryColumnWriter) { diff --git a/lib/trino-orc/src/main/java/io/trino/orc/OrcWriterOptions.java b/lib/trino-orc/src/main/java/io/trino/orc/OrcWriterOptions.java index 0f60910d0ed4..01dc8aee67f0 100644 --- a/lib/trino-orc/src/main/java/io/trino/orc/OrcWriterOptions.java +++ b/lib/trino-orc/src/main/java/io/trino/orc/OrcWriterOptions.java @@ -61,6 +61,7 @@ public enum WriterIdentification private final DataSize maxCompressionBufferSize; private final Set bloomFilterColumns; private final double bloomFilterFpp; + private final boolean shouldCompactMinMax; public OrcWriterOptions() { @@ -74,7 +75,8 @@ public OrcWriterOptions() DEFAULT_MAX_STRING_STATISTICS_LIMIT, DEFAULT_MAX_COMPRESSION_BUFFER_SIZE, ImmutableSet.of(), - DEFAULT_BLOOM_FILTER_FPP); + DEFAULT_BLOOM_FILTER_FPP, + true); } private OrcWriterOptions( @@ -87,7 +89,8 @@ private OrcWriterOptions( DataSize maxStringStatisticsLimit, DataSize maxCompressionBufferSize, Set bloomFilterColumns, - double bloomFilterFpp) + double bloomFilterFpp, + boolean shouldCompactMinMax) { requireNonNull(stripeMinSize, "stripeMinSize is null"); requireNonNull(stripeMaxSize, "stripeMaxSize is null"); @@ -109,6 +112,7 @@ private OrcWriterOptions( this.maxCompressionBufferSize = maxCompressionBufferSize; this.bloomFilterColumns = ImmutableSet.copyOf(bloomFilterColumns); this.bloomFilterFpp = bloomFilterFpp; + this.shouldCompactMinMax = shouldCompactMinMax; } public WriterIdentification getWriterIdentification() @@ -231,6 +235,18 @@ public OrcWriterOptions withBloomFilterFpp(double bloomFilterFpp) .build(); } + public boolean isShouldCompactMinMax() + { + return shouldCompactMinMax; + } + + public OrcWriterOptions withShouldCompactMinMax(boolean shouldCompactMinMax) + { + return builderFrom(this) + .setShouldCompactMinMax(shouldCompactMinMax) + .build(); + } + @Override public String toString() { @@ -269,6 +285,7 @@ public static final class Builder private DataSize maxCompressionBufferSize; private Set bloomFilterColumns; private double bloomFilterFpp; + private boolean shouldCompactMinMax; private Builder(OrcWriterOptions options) { @@ -284,6 +301,7 @@ private Builder(OrcWriterOptions options) this.maxCompressionBufferSize = options.maxCompressionBufferSize; this.bloomFilterColumns = ImmutableSet.copyOf(options.bloomFilterColumns); this.bloomFilterFpp = options.bloomFilterFpp; + this.shouldCompactMinMax = options.shouldCompactMinMax; } public Builder setWriterIdentification(WriterIdentification writerIdentification) @@ -346,6 +364,12 @@ public Builder setBloomFilterFpp(double bloomFilterFpp) return this; } + public Builder setShouldCompactMinMax(boolean shouldCompactMinMax) + { + this.shouldCompactMinMax = shouldCompactMinMax; + return this; + } + public OrcWriterOptions build() { return new OrcWriterOptions( @@ -358,7 +382,8 @@ public OrcWriterOptions build() maxStringStatisticsLimit, maxCompressionBufferSize, bloomFilterColumns, - bloomFilterFpp); + bloomFilterFpp, + shouldCompactMinMax); } } } diff --git a/lib/trino-orc/src/main/java/io/trino/orc/metadata/statistics/StringStatisticsBuilder.java b/lib/trino-orc/src/main/java/io/trino/orc/metadata/statistics/StringStatisticsBuilder.java index b60907a2451e..57ba350d4ee9 100644 --- a/lib/trino-orc/src/main/java/io/trino/orc/metadata/statistics/StringStatisticsBuilder.java +++ b/lib/trino-orc/src/main/java/io/trino/orc/metadata/statistics/StringStatisticsBuilder.java @@ -22,8 +22,13 @@ import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; import static com.google.common.base.Verify.verify; +import static io.airlift.slice.SliceUtf8.codePointToUtf8; +import static io.airlift.slice.SliceUtf8.getCodePointAt; +import static io.airlift.slice.Slices.EMPTY_SLICE; import static io.trino.orc.metadata.statistics.StringStatistics.STRING_VALUE_BYTES_OVERHEAD; +import static java.lang.Character.MAX_CODE_POINT; import static java.lang.Math.addExact; +import static java.lang.System.arraycopy; import static java.util.Objects.requireNonNull; public class StringStatisticsBuilder @@ -36,13 +41,26 @@ public class StringStatisticsBuilder private Slice maximum; private long sum; private final BloomFilterBuilder bloomFilterBuilder; + private final boolean shouldCompactMinMax; public StringStatisticsBuilder(int stringStatisticsLimitInBytes, BloomFilterBuilder bloomFilterBuilder) { - this(stringStatisticsLimitInBytes, 0, null, null, 0, bloomFilterBuilder); + this(stringStatisticsLimitInBytes, 0, null, null, 0, bloomFilterBuilder, true); } - private StringStatisticsBuilder(int stringStatisticsLimitInBytes, long nonNullValueCount, Slice minimum, Slice maximum, long sum, BloomFilterBuilder bloomFilterBuilder) + public StringStatisticsBuilder(int stringStatisticsLimitInBytes, BloomFilterBuilder bloomFilterBuilder, boolean shouldCompactMinMax) + { + this(stringStatisticsLimitInBytes, 0, null, null, 0, bloomFilterBuilder, shouldCompactMinMax); + } + + private StringStatisticsBuilder( + int stringStatisticsLimitInBytes, + long nonNullValueCount, + Slice minimum, + Slice maximum, + long sum, + BloomFilterBuilder bloomFilterBuilder, + boolean shouldCompactMinMax) { this.stringStatisticsLimitInBytes = stringStatisticsLimitInBytes; this.nonNullValueCount = nonNullValueCount; @@ -50,6 +68,7 @@ private StringStatisticsBuilder(int stringStatisticsLimitInBytes, long nonNullVa this.maximum = maximum; this.sum = sum; this.bloomFilterBuilder = requireNonNull(bloomFilterBuilder, "bloomFilterBuilder"); + this.shouldCompactMinMax = shouldCompactMinMax; } public long getNonNullValueCount() @@ -112,8 +131,8 @@ private Optional buildStringStatistics() if (nonNullValueCount == 0) { return Optional.empty(); } - minimum = dropStringMinMaxIfNecessary(minimum); - maximum = dropStringMinMaxIfNecessary(maximum); + minimum = computeStringMinMax(minimum, true); + maximum = computeStringMinMax(maximum, false); if (minimum == null && maximum == null) { // Create string stats only when min or max is not null. // This corresponds to the behavior of metadata reader. @@ -158,11 +177,19 @@ public static Optional mergeStringStatistics(List stringStatisticsLimitInBytes) { + if (minOrMax == null || (!shouldCompactMinMax && minOrMax.length() > stringStatisticsLimitInBytes)) { return null; } + if (minOrMax.length() > stringStatisticsLimitInBytes) { + if (isMin) { + return StringCompactor.truncateMin(minOrMax, stringStatisticsLimitInBytes); + } + else { + return StringCompactor.truncateMax(minOrMax, stringStatisticsLimitInBytes); + } + } // Do not hold the entire slice where the actual stats could be small if (minOrMax.isCompact()) { @@ -170,4 +197,67 @@ private Slice dropStringMinMaxIfNecessary(Slice minOrMax) } return Slices.copyOf(minOrMax); } + + static final class StringCompactor + { + private static final int INDEX_NOT_FOUND = -1; + + private StringCompactor() {} + + public static Slice truncateMin(Slice slice, int maxBytes) + { + checkArgument(slice.length() > maxBytes); + int lastIndex = findLastCharacterInRange(slice, maxBytes); + if (lastIndex == INDEX_NOT_FOUND) { + return EMPTY_SLICE; + } + return slice.slice(0, lastIndex); + } + + public static Slice truncateMax(Slice slice, int maxBytes) + { + int firstRemovedCharacterIndex = findLastCharacterInRange(slice, maxBytes); + int lastRetainedCharacterIndex = findLastCharacterInRange(slice, firstRemovedCharacterIndex - 1); + if (firstRemovedCharacterIndex == INDEX_NOT_FOUND || lastRetainedCharacterIndex == INDEX_NOT_FOUND) { + return EMPTY_SLICE; + } + int lastRetainedCharacter = getCodePointAt(slice, lastRetainedCharacterIndex); + while (lastRetainedCharacter == MAX_CODE_POINT && lastRetainedCharacterIndex > 0) { + lastRetainedCharacterIndex = findLastCharacterInRange(slice, lastRetainedCharacterIndex - 1); + if (lastRetainedCharacterIndex == INDEX_NOT_FOUND) { + return EMPTY_SLICE; + } + lastRetainedCharacter = getCodePointAt(slice, lastRetainedCharacterIndex); + } + + if (lastRetainedCharacterIndex == 0 && lastRetainedCharacter == MAX_CODE_POINT) { + // whole string is made of MAX_CODE_POINT characters, we cannot provide upper bound that is shorter than maxBytes + return EMPTY_SLICE; + } + + lastRetainedCharacter++; + Slice sliceToAppend = codePointToUtf8(lastRetainedCharacter); + byte[] result = new byte[lastRetainedCharacterIndex + sliceToAppend.length()]; + arraycopy(slice.byteArray(), slice.byteArrayOffset(), result, 0, lastRetainedCharacterIndex); + arraycopy(sliceToAppend.byteArray(), 0, result, lastRetainedCharacterIndex, sliceToAppend.length()); + return Slices.wrappedBuffer(result); + } + + private static int findLastCharacterInRange(Slice slice, int toInclusive) + { + int pos = toInclusive; + while (pos >= 0) { + if (isUtfBlockStartChar(slice.getByte(pos))) { + return pos; + } + pos--; + } + return INDEX_NOT_FOUND; + } + + private static boolean isUtfBlockStartChar(byte b) + { + return (b & 0xC0) != 0x80; + } + } } diff --git a/lib/trino-orc/src/main/java/io/trino/orc/writer/ColumnWriters.java b/lib/trino-orc/src/main/java/io/trino/orc/writer/ColumnWriters.java index 3d1fdc301900..172dd8b2325c 100644 --- a/lib/trino-orc/src/main/java/io/trino/orc/writer/ColumnWriters.java +++ b/lib/trino-orc/src/main/java/io/trino/orc/writer/ColumnWriters.java @@ -47,7 +47,8 @@ public static ColumnWriter createColumnWriter( CompressionKind compression, int bufferSize, DataSize stringStatisticsLimit, - Supplier bloomFilterBuilder) + Supplier bloomFilterBuilder, + boolean shouldCompactMinMax) { requireNonNull(type, "type is null"); OrcType orcType = orcTypes.get(columnId); @@ -92,12 +93,12 @@ public static ColumnWriter createColumnWriter( case CHAR: case VARCHAR: case STRING: - return new SliceDictionaryColumnWriter(columnId, type, compression, bufferSize, () -> new StringStatisticsBuilder(toIntExact(stringStatisticsLimit.toBytes()), bloomFilterBuilder.get())); + return new SliceDictionaryColumnWriter(columnId, type, compression, bufferSize, () -> new StringStatisticsBuilder(toIntExact(stringStatisticsLimit.toBytes()), bloomFilterBuilder.get(), shouldCompactMinMax)); case LIST: { OrcColumnId fieldColumnIndex = orcType.getFieldTypeIndex(0); Type fieldType = type.getTypeParameters().get(0); - ColumnWriter elementWriter = createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, bufferSize, stringStatisticsLimit, bloomFilterBuilder); + ColumnWriter elementWriter = createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, bufferSize, stringStatisticsLimit, bloomFilterBuilder, shouldCompactMinMax); return new ListColumnWriter(columnId, compression, bufferSize, elementWriter); } @@ -109,7 +110,8 @@ public static ColumnWriter createColumnWriter( compression, bufferSize, stringStatisticsLimit, - bloomFilterBuilder); + bloomFilterBuilder, + shouldCompactMinMax); ColumnWriter valueWriter = createColumnWriter( orcType.getFieldTypeIndex(1), orcTypes, @@ -117,7 +119,8 @@ public static ColumnWriter createColumnWriter( compression, bufferSize, stringStatisticsLimit, - bloomFilterBuilder); + bloomFilterBuilder, + shouldCompactMinMax); return new MapColumnWriter(columnId, compression, bufferSize, keyWriter, valueWriter); } @@ -126,7 +129,7 @@ public static ColumnWriter createColumnWriter( for (int fieldId = 0; fieldId < orcType.getFieldCount(); fieldId++) { OrcColumnId fieldColumnIndex = orcType.getFieldTypeIndex(fieldId); Type fieldType = type.getTypeParameters().get(fieldId); - fieldWriters.add(createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, bufferSize, stringStatisticsLimit, bloomFilterBuilder)); + fieldWriters.add(createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, bufferSize, stringStatisticsLimit, bloomFilterBuilder, shouldCompactMinMax)); } return new StructColumnWriter(columnId, compression, bufferSize, fieldWriters.build()); } diff --git a/lib/trino-orc/src/test/java/io/trino/orc/metadata/statistics/TestStringStatisticsBuilder.java b/lib/trino-orc/src/test/java/io/trino/orc/metadata/statistics/TestStringStatisticsBuilder.java index 50e115562f9d..2a087fee8992 100644 --- a/lib/trino-orc/src/test/java/io/trino/orc/metadata/statistics/TestStringStatisticsBuilder.java +++ b/lib/trino-orc/src/test/java/io/trino/orc/metadata/statistics/TestStringStatisticsBuilder.java @@ -16,6 +16,7 @@ import com.google.common.collect.ImmutableList; import io.airlift.slice.Slice; import io.airlift.slice.Slices; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.ArrayList; @@ -27,6 +28,10 @@ import static io.trino.orc.metadata.statistics.AbstractStatisticsBuilderTest.StatisticsType.STRING; import static io.trino.orc.metadata.statistics.ColumnStatistics.mergeColumnStatistics; import static io.trino.orc.metadata.statistics.StringStatistics.STRING_VALUE_BYTES_OVERHEAD; +import static io.trino.orc.metadata.statistics.StringStatisticsBuilder.StringCompactor.truncateMax; +import static io.trino.orc.metadata.statistics.StringStatisticsBuilder.StringCompactor.truncateMin; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNotNull; @@ -164,7 +169,7 @@ public void testMixingAddValueAndMergeWithLimit() // max merged to null List statisticsList = new ArrayList<>(); - StringStatisticsBuilder statisticsBuilder = new StringStatisticsBuilder(7, new NoOpBloomFilterBuilder()); + StringStatisticsBuilder statisticsBuilder = new StringStatisticsBuilder(7, new NoOpBloomFilterBuilder(), false); statisticsList.add(statisticsBuilder.buildColumnStatistics()); assertMergedStringStatistics(statisticsList, 0, 0); @@ -191,7 +196,7 @@ public void testMixingAddValueAndMergeWithLimit() // min merged to null statisticsList = new ArrayList<>(); - statisticsBuilder = new StringStatisticsBuilder(7, new NoOpBloomFilterBuilder()); + statisticsBuilder = new StringStatisticsBuilder(7, new NoOpBloomFilterBuilder(), false); statisticsList.add(statisticsBuilder.buildColumnStatistics()); assertMergedStringStatistics(statisticsList, 0, 0); @@ -218,7 +223,7 @@ public void testMixingAddValueAndMergeWithLimit() // min and max both merged to null statisticsList = new ArrayList<>(); - statisticsBuilder = new StringStatisticsBuilder(7, new NoOpBloomFilterBuilder()); + statisticsBuilder = new StringStatisticsBuilder(7, new NoOpBloomFilterBuilder(), false); statisticsBuilder.addValue(MEDIUM_BOTTOM_VALUE); statisticsList.add(statisticsBuilder.buildColumnStatistics()); assertMinMax(mergeColumnStatistics(statisticsList).getStringStatistics(), MEDIUM_BOTTOM_VALUE, MEDIUM_BOTTOM_VALUE); @@ -270,7 +275,7 @@ private void assertMergedStringStatistics(List statisticsList, private static void assertMinMaxValuesWithLimit(Slice expectedMin, Slice expectedMax, List values, int limit) { checkArgument(values != null && !values.isEmpty()); - StringStatisticsBuilder builder = new StringStatisticsBuilder(limit, new NoOpBloomFilterBuilder()); + StringStatisticsBuilder builder = new StringStatisticsBuilder(limit, new NoOpBloomFilterBuilder(), false); for (Slice value : values) { builder.addValue(value); } @@ -331,4 +336,66 @@ public void testBloomFilter() assertTrue(bloomFilter.testSlice(MEDIUM_BOTTOM_VALUE)); assertFalse(bloomFilter.testSlice(LOW_TOP_VALUE)); } + + @DataProvider(name = "computeMin") + public static Object[][] computeMinProvider() + { + return new Object[][] { + {"simple/case", "simple", 6}, + {"simple/ƒ", "simple/", 8}, + {"simple/語", "simple/", 9}, + {"simple/語", "simple/", 8}, + {"simple/\uD80C\uDE02", "simple/", 10}, + {"simple/\uD80C\uDE02", "simple/", 9}, + {"simple/\uD80C\uDE02", "simple/", 8}, + {"\uDBFF\uDFFF", "", 3}, + }; + } + + @Test(dataProvider = "computeMin") + public void testComputeMin(String input, String expected, int maxLength) + { + assertThat(truncateMin(Slices.wrappedBuffer(input.getBytes(UTF_8)), maxLength).getBytes()).containsExactly(expected.getBytes(UTF_8)); + } + + @DataProvider(name = "computeMax") + public static Object[][] computeMaxProvider() + { + return new Object[][] { + {"simple/case", "simplf", 6}, + {"simple/ƒ", "simple0", 8}, + {"simple/語", "simple0", 9}, + {"simple/語", "simple0", 8}, + {"simple/\uD80C\uDE02", "simple0", 10}, + {"simple/\uD80C\uDE02", "simple0", 9}, + {"simple/\uD80C\uDE02", "simple0", 8}, + {"simple/ƒƒ", "simple/Ɠ", 10}, + {"\uDBFF\uDFFF", "", 3}, + }; + } + + @Test(dataProvider = "computeMax") + public void testComputeMax(String input, String expected, int maxLength) + { + assertThat(truncateMax(Slices.wrappedBuffer(input.getBytes(UTF_8)), maxLength).getBytes()).containsExactly(expected.getBytes(UTF_8)); + } + + @Test + public void testComputeMaxForMaxUtf8Chars() + { + assertThat(truncateMax(Slices.wrappedBuffer("\uDBFF\uDFFF\uDBFF\uDFFF\uDBFF\uDFFF".getBytes(UTF_8)), 11)).isEqualTo(EMPTY_SLICE); + } + + @Test + public void testComputeMaxSkipsMaxUtf8Chars() + { + assertThat(truncateMax(Slices.wrappedBuffer("a\uDBFF\uDFFF\uDBFF\uDFFF\uDBFF\uDFFF".getBytes(UTF_8)), 12).getBytes()).containsExactly("b".getBytes(UTF_8)); + } + + @Test + public void testComputeMixMaxReturnsNullForMaxLengthEquals0() + { + assertThat(truncateMin(Slices.wrappedBuffer("simple/test".getBytes(UTF_8)), 0)).isEqualTo(EMPTY_SLICE); + assertThat(truncateMax(Slices.wrappedBuffer("simple/test".getBytes(UTF_8)), 0)).isEqualTo(EMPTY_SLICE); + } }