diff --git a/core/trino-main/src/main/java/io/trino/likematcher/FjsMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/FjsMatcher.java
new file mode 100644
index 000000000000..24fee50d8072
--- /dev/null
+++ b/core/trino-main/src/main/java/io/trino/likematcher/FjsMatcher.java
@@ -0,0 +1,250 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.trino.likematcher;
+
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static java.util.Objects.requireNonNull;
+
+/**
+ * Matcher for a range of pattern elements that contains literals but no {@code Pattern.Any}.
+ * The literals must occur in the input in order; each one is located with a substring search
+ * that combines a Boyer-Moore-Sunday skip loop with Knuth-Morris-Pratt matching.
+ */
+public class FjsMatcher
+        implements Matcher
+{
+    private final List<Pattern> pattern;
+    private final int start;
+    private final int end;
+    private final boolean exact;
+
+    // Built on the first call to match(), which defers the table construction until needed
+    private volatile Fjs matcher;
+
+    public FjsMatcher(List<Pattern> pattern, int start, int end, boolean exact)
+    {
+        this.pattern = requireNonNull(pattern, "pattern is null");
+        this.start = start;
+        this.end = end;
+        this.exact = exact;
+    }
+
+    @Override
+    public boolean match(byte[] input, int offset, int length)
+    {
+        Fjs matcher = this.matcher;
+        if (matcher == null) {
+            matcher = new Fjs(pattern, start, end, exact);
+            this.matcher = matcher;
+        }
+
+        return matcher.match(input, offset, length);
+    }
+
+    private static class Fjs
+    {
+        private final boolean exact;
+        // One entry per literal, in pattern order; the three lists are index-aligned
+        private final List<byte[]> patterns = new ArrayList<>();
+        private final List<int[]> bmsShifts = new ArrayList<>();
+        private final List<int[]> kmpShifts = new ArrayList<>();
+
+        public Fjs(List<Pattern> pattern, int start, int end, boolean exact)
+        {
+            this.exact = exact;
+
+            for (int i = start; i <= end; i++) {
+                Pattern element = pattern.get(i);
+
+                if (element instanceof Pattern.Literal literal) {
+                    checkArgument(i == 0 || !(pattern.get(i - 1) instanceof Pattern.Literal), "Multiple consecutive literals found");
+                    byte[] bytes = literal.value().getBytes(StandardCharsets.UTF_8);
+                    patterns.add(bytes);
+                    bmsShifts.add(computeBmsShifts(bytes));
+                    kmpShifts.add(computeKmpShifts(bytes));
+                }
+                else if (element instanceof Pattern.Any) {
+                    throw new IllegalArgumentException("'any' pattern not supported");
+                }
+                // Any other element kind contributes nothing to the search terms
+            }
+        }
+
+        /**
+         * Builds the KMP failure table. For {@code i > 0}, {@code result[i]} is the length of the
+         * longest proper prefix of the first {@code i} pattern bytes that is also a suffix of
+         * them; {@code result[0]} is the sentinel {@code -1}.
+         */
+        private static int[] computeKmpShifts(byte[] pattern)
+        {
+            int[] result = new int[pattern.length + 1];
+            result[0] = -1;
+
+            int j = -1;
+            for (int i = 1; i < result.length; i++) {
+                while (j >= 0 && pattern[i - 1] != pattern[j]) {
+                    j = result[j];
+                }
+                j++;
+                result[i] = j;
+            }
+
+            return result;
+        }
+
+        /**
+         * Builds the Boyer-Moore-Sunday table: for every byte value, the 1-based position of its
+         * last occurrence in the pattern, or 0 if the byte does not occur.
+         */
+        private static int[] computeBmsShifts(byte[] pattern)
+        {
+            int[] result = new int[256];
+
+            for (int i = 0; i < pattern.length; i++) {
+                result[pattern[i] & 0xFF] = i + 1;
+            }
+
+            return result;
+        }
+
+        /**
+         * Finds the first occurrence of {@code pattern} in {@code input[offset, offset + length)}.
+         *
+         * @return the index in {@code input} at which the occurrence starts, or -1 if there is
+         * none, the pattern is empty, or the pattern is longer than the region
+         */
+        private static int find(byte[] input, final int offset, final int length, byte[] pattern, int[] bmsShifts, int[] kmpShifts)
+        {
+            if (pattern.length > length || pattern.length == 0) {
+                return -1;
+            }
+
+            final int inputLimit = offset + length;
+
+            int i = offset;
+            while (true) {
+                // Attempt to match the last position of the pattern
+                // As long as it doesn't match, skip ahead based on the Boyer-Moore-Sunday heuristic
+                int matchEnd = i + pattern.length - 1;
+                while (matchEnd < inputLimit - 1 && input[matchEnd] != pattern[pattern.length - 1]) {
+                    int shift = pattern.length + 1 - bmsShifts[input[matchEnd + 1] & 0xFF];
+                    matchEnd += shift;
+                }
+
+                if (matchEnd == inputLimit - 1 && match(input, inputLimit - pattern.length, pattern)) {
+                    return inputLimit - pattern.length;
+                }
+                else if (matchEnd >= inputLimit - 1) {
+                    return -1;
+                }
+
+                // At this point, we know the last position of the pattern matches with some
+                // position in the input text given by "matchEnd"
+                // Use KMP to match the first length-1 characters of the pattern
+
+                i = matchEnd - (pattern.length - 1);
+
+                int j = findLongestMatch(input, i, pattern, 0, pattern.length - 1);
+
+                if (j == pattern.length - 1) {
+                    return i;
+                }
+
+                i += j;
+                j = kmpShifts[j];
+
+                // Continue to match the whole pattern using KMP.
+                // The loop has to run for j == 0 as well: in that case input[i] has only been
+                // compared with a later pattern byte, so an occurrence may still start at i
+                // (e.g. "abb" in "aabbx"). j drops to -1 only once pattern[0] itself failed at i.
+                while (j >= 0) {
+                    int size = findLongestMatch(input, i, pattern, j, Math.min(inputLimit - i, pattern.length - j));
+                    i += size;
+                    j += size;
+
+                    if (j == pattern.length) {
+                        return i - j;
+                    }
+
+                    j = kmpShifts[j];
+                }
+
+                i++;
+            }
+        }
+
+        /**
+         * Returns how many leading bytes of {@code input} from {@code inputOffset} and of
+         * {@code pattern} from {@code patternOffset} are equal, comparing at most {@code length}.
+         */
+        private static int findLongestMatch(byte[] input, int inputOffset, byte[] pattern, int patternOffset, int length)
+        {
+            for (int i = 0; i < length; i++) {
+                if (input[inputOffset + i] != pattern[patternOffset + i]) {
+                    return i;
+                }
+            }
+
+            return length;
+        }
+
+        /**
+         * Returns whether the whole {@code pattern} equals the input bytes starting at {@code offset}.
+         */
+        private static boolean match(byte[] input, int offset, byte[] pattern)
+        {
+            for (int i = 0; i < pattern.length; i++) {
+                if (input[offset + i] != pattern[i]) {
+                    return false;
+                }
+            }
+
+            return true;
+        }
+
+        /**
+         * Searches for every literal in order, each one starting right after the end of the
+         * previous occurrence. When {@code exact} is set, nothing may remain in the region
+         * after the last occurrence.
+         */
+        public boolean match(byte[] input, int offset, int length)
+        {
+            int start = offset;
+            int remaining = length;
+
+            for (int i = 0; i < patterns.size(); i++) {
+                if (remaining == 0) {
+                    return false;
+                }
+
+                byte[] term = patterns.get(i);
+
+                int position = find(input, start, remaining, term, bmsShifts.get(i), kmpShifts.get(i));
+                if (position == -1) {
+                    return false;
+                }
+
+                position += term.length;
+                remaining -= position - start;
+                start = position;
+            }
+
+            return !exact || remaining == 0;
+        }
+    }
+}
diff --git a/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java
index 575cb25f2d7b..6cd7961665c7 100644
--- a/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java
+++ 
b/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java @@ -132,11 +132,24 @@ else if (expression instanceof Any any) { Optional matcher = Optional.empty(); if (patternStart <= patternEnd) { - if (optimize) { - matcher = Optional.of(new DenseDfaMatcher(parsed, patternStart, patternEnd, exact)); + boolean hasAny = false; + for (int i = patternStart; i <= patternEnd; i++) { + if (parsed.get(i) instanceof Any) { + hasAny = true; + break; + } + } + + if (hasAny) { + if (optimize) { + matcher = Optional.of(new DenseDfaMatcher(parsed, patternStart, patternEnd, exact)); + } + else { + matcher = Optional.of(new NfaMatcher(parsed, patternStart, patternEnd, exact)); + } } else { - matcher = Optional.of(new NfaMatcher(parsed, patternStart, patternEnd, exact)); + matcher = Optional.of(new FjsMatcher(parsed, patternStart, patternEnd, exact)); } } diff --git a/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java b/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java index a02b21aa0ac3..1c9d0af4fb2a 100644 --- a/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java +++ b/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java @@ -13,6 +13,7 @@ */ package io.trino.likematcher; +import com.google.common.base.Strings; import org.junit.jupiter.api.Test; import java.nio.charset.StandardCharsets; @@ -80,6 +81,19 @@ public void test() assertTrue(match("%aaaa%bbbb%aaaa%bbbb%aaaa%bbbb%", "aaaabbbbaaaabbbbaaaabbbb")); assertTrue(match("%aaaaaaaaaaaaaaaaaaaaaaaaaa%", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")); + assertTrue(match("%aab%bba%aab%bba%", "aaaabbbbaaaabbbbaaaa")); + assertFalse(match("%aab%bba%aab%bba%", "aaaabbbbaaaabbbbcccc")); + assertTrue(match("%abaca%", "abababababacabababa")); + assertFalse(match("%bcccccccca%", "bbbbbbbbxax")); + assertFalse(match("%bbxxxxxa%", "bbbxxxxaz")); + assertFalse(match("%aaaaaaxaaaaaa%", Strings.repeat("a", 20) + + Strings.repeat("b", 20) + + 
Strings.repeat("a", 20) + + Strings.repeat("b", 20) + + "the quick brown fox jumps over the lazy dog")); + + assertFalse(match("%abaaa%", "ababaa")); + // utf-8 LikeMatcher singleOptimized = LikeMatcher.compile("_", Optional.empty(), true); LikeMatcher multipleOptimized = LikeMatcher.compile("_a%b_", Optional.empty(), true); // prefix and suffix with _a and b_ to avoid optimizations diff --git a/core/trino-main/src/test/java/io/trino/operator/scalar/BenchmarkLike.java b/core/trino-main/src/test/java/io/trino/operator/scalar/BenchmarkLike.java index 7aad0cfce2dd..b7fe29d521f8 100644 --- a/core/trino-main/src/test/java/io/trino/operator/scalar/BenchmarkLike.java +++ b/core/trino-main/src/test/java/io/trino/operator/scalar/BenchmarkLike.java @@ -40,6 +40,7 @@ import java.util.Optional; +import static com.google.common.base.Strings.repeat; import static io.airlift.joni.constants.MetaChar.INEFFECTIVE_META_CHAR; import static io.airlift.joni.constants.SyntaxProperties.OP_ASTERISK_ZERO_INF; import static io.airlift.joni.constants.SyntaxProperties.OP_DOT_ANYCHAR; @@ -60,6 +61,12 @@ @Measurement(iterations = 30, time = 500, timeUnit = MILLISECONDS) public class BenchmarkLike { + private static final String LONG_STRING = repeat("a", 100) + + repeat("b", 100) + + repeat("a", 100) + + repeat("b", 100) + + "the quick brown fox jumps over the lazy dog"; + private static final Syntax SYNTAX = new Syntax( OP_DOT_ANYCHAR | OP_ASTERISK_ZERO_INF | OP_LINE_ANCHOR, 0, @@ -73,51 +80,65 @@ public class BenchmarkLike INEFFECTIVE_META_CHAR, /* one or more time '+' */ INEFFECTIVE_META_CHAR)); /* anychar anytime */ + public enum BenchmarkCase + { + ANY("%", LONG_STRING), + WILDCARD_PREFIX("_%", LONG_STRING), + WILDCARD_SUFFIX("%_", LONG_STRING), + PREFIX("the%", "the quick brown fox jumps over the lazy dog"), + SUFFIX("%dog", "the quick brown fox jumps over the lazy dog"), + FIXED_WILDCARD("_____", "abcdef"), + SHORT_TOKENS_1("%a%b%a%b%", LONG_STRING), + 
SHORT_TOKENS_2("%the%quick%brown%fox%jumps%over%the%lazy%dog%", LONG_STRING), + SHORT_TOKEN("%the%", LONG_STRING), + LONG_TOKENS_1("%aaaaaaaaab%bbbbbbbbba%aaaaaaaaab%bbbbbbbbbt%", LONG_STRING), + LONG_TOKENS_2("%aaaaaaaaaaaaaaaaaaaaaaaaaa%aaaaaaaaaaaaaaaaaaaaaaaaaathe%", LONG_STRING), + LONG_TOKEN_1("%bbbbbbbbbbbbbbbthe%", LONG_STRING), + LONG_TOKEN_2("%the quick brown fox%", LONG_STRING), + LONG_TOKEN_3("%aaaaaaaxaaaaaa%", LONG_STRING), + SHORT_TOKENS_WITH_LONG_SKIP("%the%dog%", LONG_STRING); + + private final String pattern; + private final String text; + + BenchmarkCase(String pattern, String text) + { + this.pattern = pattern; + this.text = text; + } + + public String pattern() + { + return pattern; + } + + public String text() + { + return text; + } + } + @State(Thread) public static class Data { - @Param({ - "%", - "_%", - "%_", - "abc%", - "%abc", - "_____", - "abc%def%ghi", - "%abc%def%", - "%a%a%a%a%", - "%aaaaaaaaaaaaaaaaaaaaaaaaaa%" - }) - private String pattern; + @Param + private BenchmarkCase benchmarkCase; private Slice data; private byte[] bytes; private JoniRegexp joniPattern; - private LikeMatcher dfaMatcher; - private LikeMatcher nfaMatcher; + private LikeMatcher optimizedMatcher; + private LikeMatcher nonOptimizedMatcher; @Setup public void setup() { - data = Slices.utf8Slice( - switch (pattern) { - case "%" -> "qeroighqeorhgqerhb2eriuyerqiubgierubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhet"; - case "_%", "%_" -> "qeroighqeorhgqerhb2eriuyerqiubgierubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhet"; - case "abc%" -> "abcqeroighqeorhgqerhb2eriuyerqiubgierubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhet"; - case "%abc" -> "qeroighqeorhgqerhb2eriuyerqiubgierubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhetabc"; - case "_____" -> "abcde"; - case "abc%def%ghi" -> "abc qeroighqeorhgqerhb2eriuyerqiubgier def ubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhet ghi"; - case "%abc%def%" -> "fdnbqerbfklerqbgqjerbgkr abc qeroighqeorhgqerhb2eriuyerqiubgier def 
ubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhet"; - case "%a%a%a%a%" -> "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - case "%aaaaaaaaaaaaaaaaaaaaaaaaaa%" -> "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - default -> throw new IllegalArgumentException("Unknown pattern: " + pattern); - }); - - dfaMatcher = LikeMatcher.compile(pattern, Optional.empty(), true); - nfaMatcher = LikeMatcher.compile(pattern, Optional.empty(), false); - joniPattern = compileJoni(Slices.utf8Slice(pattern).toStringUtf8(), '0', false); - - bytes = data.getBytes(); + optimizedMatcher = LikeMatcher.compile(benchmarkCase.pattern(), Optional.empty(), true); + nonOptimizedMatcher = LikeMatcher.compile(benchmarkCase.pattern(), Optional.empty(), false); + joniPattern = compileJoni(benchmarkCase.pattern(), '0', false); + + bytes = benchmarkCase.text().getBytes(UTF_8); + data = Slices.wrappedBuffer(bytes); } } @@ -128,52 +149,52 @@ public boolean matchJoni(Data data) } @Benchmark - public boolean matchDfa(Data data) + public boolean matchOptimized(Data data) { - return data.dfaMatcher.match(data.bytes, 0, data.bytes.length); + return data.optimizedMatcher.match(data.bytes, 0, data.bytes.length); } @Benchmark - public boolean matchNfa(Data data) + public boolean matchNonOptimized(Data data) { - return data.nfaMatcher.match(data.bytes, 0, data.bytes.length); + return data.nonOptimizedMatcher.match(data.bytes, 0, data.bytes.length); } @Benchmark public JoniRegexp compileJoni(Data data) { - return compileJoni(data.pattern, (char) 0, false); + return compileJoni(data.benchmarkCase.pattern(), (char) 0, false); } @Benchmark - public LikeMatcher compileDfa(Data data) + public LikeMatcher compileOptimized(Data data) { - return LikeMatcher.compile(data.pattern, Optional.empty(), true); + return LikeMatcher.compile(data.benchmarkCase.pattern(), Optional.empty(), true); } @Benchmark - public LikeMatcher compileNfa(Data data) + public 
LikeMatcher compileNonOptimized(Data data) { - return LikeMatcher.compile(data.pattern, Optional.empty(), false); + return LikeMatcher.compile(data.benchmarkCase.pattern(), Optional.empty(), false); } @Benchmark - public boolean allJoni(Data data) + public boolean dynamicJoni(Data data) { - return likeVarchar(data.data, compileJoni(Slices.utf8Slice(data.pattern).toStringUtf8(), '0', false)); + return likeVarchar(data.data, compileJoni(Slices.utf8Slice(data.benchmarkCase.pattern()).toStringUtf8(), '0', false)); } @Benchmark - public boolean allDfa(Data data) + public boolean dynamicOptimized(Data data) { - return LikeMatcher.compile(data.pattern, Optional.empty(), true) + return LikeMatcher.compile(data.benchmarkCase.pattern(), Optional.empty(), true) .match(data.bytes, 0, data.bytes.length); } @Benchmark - public boolean allNfa(Data data) + public boolean dynamicNonOptimized(Data data) { - return LikeMatcher.compile(data.pattern, Optional.empty(), false) + return LikeMatcher.compile(data.benchmarkCase.pattern(), Optional.empty(), false) .match(data.bytes, 0, data.bytes.length); }