From 1585b4a80d820c73f04c790956a85c1d70727335 Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Fri, 17 Feb 2023 15:48:37 -0800 Subject: [PATCH 1/3] Add more benchmarks for LIKE --- .../trino/operator/scalar/BenchmarkLike.java | 119 ++++++++++-------- 1 file changed, 70 insertions(+), 49 deletions(-) diff --git a/core/trino-main/src/test/java/io/trino/operator/scalar/BenchmarkLike.java b/core/trino-main/src/test/java/io/trino/operator/scalar/BenchmarkLike.java index 7aad0cfce2dd..b7fe29d521f8 100644 --- a/core/trino-main/src/test/java/io/trino/operator/scalar/BenchmarkLike.java +++ b/core/trino-main/src/test/java/io/trino/operator/scalar/BenchmarkLike.java @@ -40,6 +40,7 @@ import java.util.Optional; +import static com.google.common.base.Strings.repeat; import static io.airlift.joni.constants.MetaChar.INEFFECTIVE_META_CHAR; import static io.airlift.joni.constants.SyntaxProperties.OP_ASTERISK_ZERO_INF; import static io.airlift.joni.constants.SyntaxProperties.OP_DOT_ANYCHAR; @@ -60,6 +61,12 @@ @Measurement(iterations = 30, time = 500, timeUnit = MILLISECONDS) public class BenchmarkLike { + private static final String LONG_STRING = repeat("a", 100) + + repeat("b", 100) + + repeat("a", 100) + + repeat("b", 100) + + "the quick brown fox jumps over the lazy dog"; + private static final Syntax SYNTAX = new Syntax( OP_DOT_ANYCHAR | OP_ASTERISK_ZERO_INF | OP_LINE_ANCHOR, 0, @@ -73,51 +80,65 @@ public class BenchmarkLike INEFFECTIVE_META_CHAR, /* one or more time '+' */ INEFFECTIVE_META_CHAR)); /* anychar anytime */ + public enum BenchmarkCase + { + ANY("%", LONG_STRING), + WILDCARD_PREFIX("_%", LONG_STRING), + WILDCARD_SUFFIX("%_", LONG_STRING), + PREFIX("the%", "the quick brown fox jumps over the lazy dog"), + SUFFIX("%dog", "the quick brown fox jumps over the lazy dog"), + FIXED_WILDCARD("_____", "abcdef"), + SHORT_TOKENS_1("%a%b%a%b%", LONG_STRING), + SHORT_TOKENS_2("%the%quick%brown%fox%jumps%over%the%lazy%dog%", LONG_STRING), + SHORT_TOKEN("%the%", LONG_STRING), + 
LONG_TOKENS_1("%aaaaaaaaab%bbbbbbbbba%aaaaaaaaab%bbbbbbbbbt%", LONG_STRING), + LONG_TOKENS_2("%aaaaaaaaaaaaaaaaaaaaaaaaaa%aaaaaaaaaaaaaaaaaaaaaaaaaathe%", LONG_STRING), + LONG_TOKEN_1("%bbbbbbbbbbbbbbbthe%", LONG_STRING), + LONG_TOKEN_2("%the quick brown fox%", LONG_STRING), + LONG_TOKEN_3("%aaaaaaaxaaaaaa%", LONG_STRING), + SHORT_TOKENS_WITH_LONG_SKIP("%the%dog%", LONG_STRING); + + private final String pattern; + private final String text; + + BenchmarkCase(String pattern, String text) + { + this.pattern = pattern; + this.text = text; + } + + public String pattern() + { + return pattern; + } + + public String text() + { + return text; + } + } + @State(Thread) public static class Data { - @Param({ - "%", - "_%", - "%_", - "abc%", - "%abc", - "_____", - "abc%def%ghi", - "%abc%def%", - "%a%a%a%a%", - "%aaaaaaaaaaaaaaaaaaaaaaaaaa%" - }) - private String pattern; + @Param + private BenchmarkCase benchmarkCase; private Slice data; private byte[] bytes; private JoniRegexp joniPattern; - private LikeMatcher dfaMatcher; - private LikeMatcher nfaMatcher; + private LikeMatcher optimizedMatcher; + private LikeMatcher nonOptimizedMatcher; @Setup public void setup() { - data = Slices.utf8Slice( - switch (pattern) { - case "%" -> "qeroighqeorhgqerhb2eriuyerqiubgierubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhet"; - case "_%", "%_" -> "qeroighqeorhgqerhb2eriuyerqiubgierubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhet"; - case "abc%" -> "abcqeroighqeorhgqerhb2eriuyerqiubgierubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhet"; - case "%abc" -> "qeroighqeorhgqerhb2eriuyerqiubgierubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhetabc"; - case "_____" -> "abcde"; - case "abc%def%ghi" -> "abc qeroighqeorhgqerhb2eriuyerqiubgier def ubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhet ghi"; - case "%abc%def%" -> "fdnbqerbfklerqbgqjerbgkr abc qeroighqeorhgqerhb2eriuyerqiubgier def ubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhet"; - case "%a%a%a%a%" -> 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - case "%aaaaaaaaaaaaaaaaaaaaaaaaaa%" -> "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - default -> throw new IllegalArgumentException("Unknown pattern: " + pattern); - }); - - dfaMatcher = LikeMatcher.compile(pattern, Optional.empty(), true); - nfaMatcher = LikeMatcher.compile(pattern, Optional.empty(), false); - joniPattern = compileJoni(Slices.utf8Slice(pattern).toStringUtf8(), '0', false); - - bytes = data.getBytes(); + optimizedMatcher = LikeMatcher.compile(benchmarkCase.pattern(), Optional.empty(), true); + nonOptimizedMatcher = LikeMatcher.compile(benchmarkCase.pattern(), Optional.empty(), false); + joniPattern = compileJoni(benchmarkCase.pattern(), '0', false); + + bytes = benchmarkCase.text().getBytes(UTF_8); + data = Slices.wrappedBuffer(bytes); } } @@ -128,52 +149,52 @@ public boolean matchJoni(Data data) } @Benchmark - public boolean matchDfa(Data data) + public boolean matchOptimized(Data data) { - return data.dfaMatcher.match(data.bytes, 0, data.bytes.length); + return data.optimizedMatcher.match(data.bytes, 0, data.bytes.length); } @Benchmark - public boolean matchNfa(Data data) + public boolean matchNonOptimized(Data data) { - return data.nfaMatcher.match(data.bytes, 0, data.bytes.length); + return data.nonOptimizedMatcher.match(data.bytes, 0, data.bytes.length); } @Benchmark public JoniRegexp compileJoni(Data data) { - return compileJoni(data.pattern, (char) 0, false); + return compileJoni(data.benchmarkCase.pattern(), (char) 0, false); } @Benchmark - public LikeMatcher compileDfa(Data data) + public LikeMatcher compileOptimized(Data data) { - return LikeMatcher.compile(data.pattern, Optional.empty(), true); + return LikeMatcher.compile(data.benchmarkCase.pattern(), Optional.empty(), true); } @Benchmark - public LikeMatcher compileNfa(Data data) + public LikeMatcher compileNonOptimized(Data data) { - return 
LikeMatcher.compile(data.pattern, Optional.empty(), false); + return LikeMatcher.compile(data.benchmarkCase.pattern(), Optional.empty(), false); } @Benchmark - public boolean allJoni(Data data) + public boolean dynamicJoni(Data data) { - return likeVarchar(data.data, compileJoni(Slices.utf8Slice(data.pattern).toStringUtf8(), '0', false)); + return likeVarchar(data.data, compileJoni(Slices.utf8Slice(data.benchmarkCase.pattern()).toStringUtf8(), '0', false)); } @Benchmark - public boolean allDfa(Data data) + public boolean dynamicOptimized(Data data) { - return LikeMatcher.compile(data.pattern, Optional.empty(), true) + return LikeMatcher.compile(data.benchmarkCase.pattern(), Optional.empty(), true) .match(data.bytes, 0, data.bytes.length); } @Benchmark - public boolean allNfa(Data data) + public boolean dynamicNonOptimized(Data data) { - return LikeMatcher.compile(data.pattern, Optional.empty(), false) + return LikeMatcher.compile(data.benchmarkCase.pattern(), Optional.empty(), false) .match(data.bytes, 0, data.bytes.length); } From c2bcdd15955d4530ffd3dc7f78b639ff847409f4 Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Fri, 17 Feb 2023 15:50:00 -0800 Subject: [PATCH 2/3] Improve performance for LIKE patterns involving % MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the pattern contains only literals and %, use substring search for each of the tokens, via an implementation of the FJS algorithm: https://cgjennings.ca/articles/fjs/ Benchmark results follow: * dynamicXXX measures the end-to-end performance of compiling the matcher and calling it. 
* matchXXX measures the performance of the match call alone, after the matcher has already been compiled. * xxxNonOptimized vs. xxxOptimized compare the performance when LikeMatcher is constructed with optimize = false vs. optimize = true, respectively. Benchmark (case) Before After dynamicNonOptimized SHORT_TOKENS_1 3206.181 ± 16.858 ns/op 1301.583 ± 6.762 ns/op dynamicNonOptimized SHORT_TOKENS_2 3534.404 ± 20.939 ns/op 2073.400 ± 17.597 ns/op dynamicNonOptimized SHORT_TOKEN 2568.900 ± 24.562 ns/op 582.184 ± 2.452 ns/op dynamicNonOptimized LONG_TOKENS_1 12055.974 ± 72.518 ns/op 1594.760 ± 8.006 ns/op dynamicNonOptimized LONG_TOKENS_2 17133.678 ± 119.793 ns/op 700.485 ± 3.883 ns/op dynamicNonOptimized LONG_TOKEN_1 7152.323 ± 54.488 ns/op 451.341 ± 2.386 ns/op dynamicNonOptimized LONG_TOKEN_2 2852.432 ± 29.256 ns/op 342.418 ± 3.757 ns/op dynamicNonOptimized LONG_TOKEN_3 5238.197 ± 46.751 ns/op 933.180 ± 5.290 ns/op dynamicNonOptimized SHORT_TOKENS_WITH_LONG_SKIP 3063.792 ± 37.088 ns/op 833.256 ± 26.775 ns/op dynamicOptimized SHORT_TOKENS_1 283428.816 ± 1611.467 ns/op 1305.750 ± 9.497 ns/op dynamicOptimized SHORT_TOKENS_2 10059684.325 ± 44593.208 ns/op 2013.463 ± 15.444 ns/op dynamicOptimized SHORT_TOKEN 81244.561 ± 339.620 ns/op 586.187 ± 2.540 ns/op dynamicOptimized LONG_TOKENS_1 4733209.512 ± 30825.948 ns/op 1603.712 ± 15.636 ns/op dynamicOptimized LONG_TOKENS_2 6875531.823 ± 33728.556 ns/op 707.062 ± 3.214 ns/op dynamicOptimized LONG_TOKEN_1 665877.955 ± 30123.355 ns/op 453.508 ± 2.343 ns/op dynamicOptimized LONG_TOKEN_2 370405.576 ± 2891.106 ns/op 342.558 ± 2.781 ns/op dynamicOptimized LONG_TOKEN_3 402514.307 ± 1920.966 ns/op 932.587 ± 4.264 ns/op dynamicOptimized SHORT_TOKENS_WITH_LONG_SKIP 254232.154 ± 1114.968 ns/op 821.808 ± 4.116 ns/op matchNonOptimized SHORT_TOKENS_1 2833.111 ± 13.485 ns/op 701.785 ± 3.181 ns/op matchNonOptimized SHORT_TOKENS_2 3221.687 ± 20.231 ns/op 543.724 ± 2.822 ns/op matchNonOptimized SHORT_TOKEN 2311.488 ± 11.088 ns/op 458.462 ± 1.643 ns/op matchNonOptimized LONG_TOKENS_1 
11778.521 ± 52.387 ns/op 865.535 ± 3.973 ns/op matchNonOptimized LONG_TOKENS_2 16922.399 ± 72.356 ns/op 193.247 ± 0.574 ns/op matchNonOptimized LONG_TOKEN_1 6871.454 ± 35.185 ns/op 259.938 ± 1.161 ns/op matchNonOptimized LONG_TOKEN_2 2517.248 ± 13.335 ns/op 151.030 ± 0.579 ns/op matchNonOptimized LONG_TOKEN_3 5021.075 ± 39.784 ns/op 709.089 ± 3.854 ns/op matchNonOptimized SHORT_TOKENS_WITH_LONG_SKIP 2757.342 ± 16.299 ns/op 504.451 ± 1.964 ns/op matchOptimized SHORT_TOKENS_1 783.268 ± 3.646 ns/op 702.478 ± 3.716 ns/op matchOptimized SHORT_TOKENS_2 1147.895 ± 4.307 ns/op 543.043 ± 2.447 ns/op matchOptimized SHORT_TOKEN 1044.000 ± 4.159 ns/op 458.934 ± 2.049 ns/op matchOptimized LONG_TOKENS_1 1044.809 ± 5.375 ns/op 867.075 ± 4.226 ns/op matchOptimized LONG_TOKENS_2 1062.192 ± 5.323 ns/op 193.253 ± 0.678 ns/op matchOptimized LONG_TOKEN_1 1045.351 ± 4.702 ns/op 259.962 ± 1.199 ns/op matchOptimized LONG_TOKEN_2 1084.966 ± 3.921 ns/op 150.928 ± 0.652 ns/op matchOptimized LONG_TOKEN_3 1061.450 ± 3.678 ns/op 707.735 ± 3.565 ns/op matchOptimized SHORT_TOKENS_WITH_LONG_SKIP 1148.827 ± 8.071 ns/op 504.854 ± 2.521 ns/op --- .../java/io/trino/likematcher/FjsMatcher.java | 210 ++++++++++++++++++ .../io/trino/likematcher/LikeMatcher.java | 19 +- .../io/trino/likematcher/TestLikeMatcher.java | 12 + 3 files changed, 238 insertions(+), 3 deletions(-) create mode 100644 core/trino-main/src/main/java/io/trino/likematcher/FjsMatcher.java diff --git a/core/trino-main/src/main/java/io/trino/likematcher/FjsMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/FjsMatcher.java new file mode 100644 index 000000000000..f2f4fbe81a4a --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/likematcher/FjsMatcher.java @@ -0,0 +1,210 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.likematcher; + +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; + +public class FjsMatcher + implements Matcher +{ + private final List pattern; + private final int start; + private final int end; + private final boolean exact; + + private volatile Fjs matcher; + + public FjsMatcher(List pattern, int start, int end, boolean exact) + { + this.pattern = requireNonNull(pattern, "pattern is null"); + this.start = start; + this.end = end; + this.exact = exact; + } + + @Override + public boolean match(byte[] input, int offset, int length) + { + Fjs matcher = this.matcher; + if (matcher == null) { + matcher = new Fjs(pattern, start, end, exact); + this.matcher = matcher; + } + + return matcher.match(input, offset, length); + } + + private static class Fjs + { + private final boolean exact; + private final List patterns = new ArrayList<>(); + private final List bmsShifts = new ArrayList<>(); + private final List kmpShifts = new ArrayList<>(); + + public Fjs(List pattern, int start, int end, boolean exact) + { + this.exact = exact; + + for (int i = start; i <= end; i++) { + Pattern element = pattern.get(i); + + if (element instanceof Pattern.Literal literal) { + checkArgument(i == 0 || !(pattern.get(i - 1) instanceof Pattern.Literal), "Multiple consecutive literals found"); + byte[] bytes = literal.value().getBytes(StandardCharsets.UTF_8); + patterns.add(bytes); + 
bmsShifts.add(computeBmsShifts(bytes)); + kmpShifts.add(computeKmpShifts(bytes)); + } + else if (element instanceof Pattern.Any) { + throw new IllegalArgumentException("'any' pattern not supported"); + } + } + } + + private static int[] computeKmpShifts(byte[] pattern) + { + int[] result = new int[pattern.length + 1]; + result[0] = -1; + + int j = -1; + for (int i = 1; i < result.length; i++) { + while (j >= 0 && pattern[i - 1] != pattern[j]) { + j = result[j]; + } + j++; + result[i] = j; + } + + return result; + } + + private static int[] computeBmsShifts(byte[] pattern) + { + int[] result = new int[256]; + + for (int i = 0; i < pattern.length; i++) { + result[pattern[i] & 0xFF] = i + 1; + } + + return result; + } + + private static int find(byte[] input, final int offset, final int length, byte[] pattern, int[] bmsShifts, int[] kmpShifts) + { + if (pattern.length > length || pattern.length == 0) { + return -1; + } + + final int inputLimit = offset + length; + + int i = offset; + while (true) { + // Attempt to match the last position of the pattern + // As long as it doesn't match, skip ahead based on the Boyer-Moore-Sunday heuristic + int matchEnd = i + pattern.length - 1; + while (matchEnd < inputLimit - 1 && input[matchEnd] != pattern[pattern.length - 1]) { + int shift = pattern.length + 1 - bmsShifts[input[matchEnd + 1] & 0xFF]; + matchEnd += shift; + } + + if (matchEnd == inputLimit - 1 && match(input, inputLimit - pattern.length, pattern)) { + return inputLimit - pattern.length; + } + else if (matchEnd >= inputLimit - 1) { + return -1; + } + + // At this point, we know the last position of the pattern matches with some + // position in the input text given by "matchEnd" + // Use KMP to match the first length-1 characters of the pattern + + i = matchEnd - (pattern.length - 1); + + int j = findLongestMatch(input, i, pattern, 0, pattern.length - 1); + i += j; + + if (j == pattern.length - 1) { + return i - j; + } + + j = kmpShifts[j]; + + // Continue to match 
the whole pattern using KMP + while (j > 0) { + int x = findLongestMatch(input, i, pattern, j, pattern.length); + i += x; + j += x; + + if (j == pattern.length) { + return i - j; + } + + j = kmpShifts[j]; + } + + i++; + } + } + + private static int findLongestMatch(byte[] input, int inputOffset, byte[] pattern, int patternOffset, int patternLimit) + { + int k = 0; + while (patternOffset + k < patternLimit && input[inputOffset + k] == pattern[patternOffset + k]) { + k++; + } + return k; + } + + private static boolean match(byte[] input, int offset, byte[] pattern) + { + for (int i = 0; i < pattern.length; i++) { + if (input[offset + i] != pattern[i]) { + return false; + } + } + + return true; + } + + public boolean match(byte[] input, int offset, int length) + { + int start = offset; + int remaining = length; + + for (int i = 0; i < patterns.size(); i++) { + if (remaining == 0) { + return false; + } + + byte[] term = patterns.get(i); + + int position = find(input, start, remaining, term, bmsShifts.get(i), kmpShifts.get(i)); + if (position == -1) { + return false; + } + + position += term.length; + remaining -= position - start; + start = position; + } + + return !exact || remaining == 0; + } + } +} diff --git a/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java index 575cb25f2d7b..6cd7961665c7 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java @@ -132,11 +132,24 @@ else if (expression instanceof Any any) { Optional matcher = Optional.empty(); if (patternStart <= patternEnd) { - if (optimize) { - matcher = Optional.of(new DenseDfaMatcher(parsed, patternStart, patternEnd, exact)); + boolean hasAny = false; + for (int i = patternStart; i <= patternEnd; i++) { + if (parsed.get(i) instanceof Any) { + hasAny = true; + break; + } + } + + if (hasAny) { + if (optimize) { + matcher = 
Optional.of(new DenseDfaMatcher(parsed, patternStart, patternEnd, exact)); + } + else { + matcher = Optional.of(new NfaMatcher(parsed, patternStart, patternEnd, exact)); + } } else { - matcher = Optional.of(new NfaMatcher(parsed, patternStart, patternEnd, exact)); + matcher = Optional.of(new FjsMatcher(parsed, patternStart, patternEnd, exact)); } } diff --git a/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java b/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java index a02b21aa0ac3..2334ed6369dc 100644 --- a/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java +++ b/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java @@ -13,6 +13,7 @@ */ package io.trino.likematcher; +import com.google.common.base.Strings; import org.junit.jupiter.api.Test; import java.nio.charset.StandardCharsets; @@ -80,6 +81,17 @@ public void test() assertTrue(match("%aaaa%bbbb%aaaa%bbbb%aaaa%bbbb%", "aaaabbbbaaaabbbbaaaabbbb")); assertTrue(match("%aaaaaaaaaaaaaaaaaaaaaaaaaa%", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")); + assertTrue(match("%aab%bba%aab%bba%", "aaaabbbbaaaabbbbaaaa")); + assertFalse(match("%aab%bba%aab%bba%", "aaaabbbbaaaabbbbcccc")); + assertTrue(match("%abaca%", "abababababacabababa")); + assertFalse(match("%bcccccccca%", "bbbbbbbbxax")); + assertFalse(match("%bbxxxxxa%", "bbbxxxxaz")); + assertFalse(match("%aaaaaaxaaaaaa%", Strings.repeat("a", 20) + + Strings.repeat("b", 20) + + Strings.repeat("a", 20) + + Strings.repeat("b", 20) + + "the quick brown fox jumps over the lazy dog")); + // utf-8 LikeMatcher singleOptimized = LikeMatcher.compile("_", Optional.empty(), true); LikeMatcher multipleOptimized = LikeMatcher.compile("_a%b_", Optional.empty(), true); // prefix and suffix with _a and b_ to avoid optimizations From 6bbf1ca968691ed4d1bfafbb4c9892955bd598b5 Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Wed, 5 Apr 2023 11:50:08 -0700 Subject: [PATCH 3/3] fixup! 
address review comments --- .../java/io/trino/likematcher/FjsMatcher.java | 22 ++++++++++--------- .../io/trino/likematcher/TestLikeMatcher.java | 2 ++ 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/core/trino-main/src/main/java/io/trino/likematcher/FjsMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/FjsMatcher.java index f2f4fbe81a4a..24fee50d8072 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/FjsMatcher.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/FjsMatcher.java @@ -137,19 +137,19 @@ else if (matchEnd >= inputLimit - 1) { i = matchEnd - (pattern.length - 1); int j = findLongestMatch(input, i, pattern, 0, pattern.length - 1); - i += j; if (j == pattern.length - 1) { - return i - j; + return i; } + i += j; j = kmpShifts[j]; // Continue to match the whole pattern using KMP while (j > 0) { - int x = findLongestMatch(input, i, pattern, j, pattern.length); - i += x; - j += x; + int size = findLongestMatch(input, i, pattern, j, Math.min(inputLimit - i, pattern.length - j)); + i += size; + j += size; if (j == pattern.length) { return i - j; @@ -162,13 +162,15 @@ else if (matchEnd >= inputLimit - 1) { } } - private static int findLongestMatch(byte[] input, int inputOffset, byte[] pattern, int patternOffset, int patternLimit) + private static int findLongestMatch(byte[] input, int inputOffset, byte[] pattern, int patternOffset, int length) { - int k = 0; - while (patternOffset + k < patternLimit && input[inputOffset + k] == pattern[patternOffset + k]) { - k++; + for (int i = 0; i < length; i++) { + if (input[inputOffset + i] != pattern[patternOffset + i]) { + return i; + } } - return k; + + return length; } private static boolean match(byte[] input, int offset, byte[] pattern) diff --git a/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java b/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java index 2334ed6369dc..1c9d0af4fb2a 100644 --- 
a/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java +++ b/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java @@ -92,6 +92,8 @@ public void test() Strings.repeat("b", 20) + "the quick brown fox jumps over the lazy dog")); + assertFalse(match("%abaaa%", "ababaa")); + // utf-8 LikeMatcher singleOptimized = LikeMatcher.compile("_", Optional.empty(), true); LikeMatcher multipleOptimized = LikeMatcher.compile("_a%b_", Optional.empty(), true); // prefix and suffix with _a and b_ to avoid optimizations