From 3e2001e2c2d884ac301380da8f5a24eb64582c7e Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Mon, 6 Feb 2023 16:20:56 -0800 Subject: [PATCH 01/11] Benchmark LIKE pattern compilation --- .../io/trino/operator/scalar/BenchmarkLike.java | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/core/trino-main/src/test/java/io/trino/operator/scalar/BenchmarkLike.java b/core/trino-main/src/test/java/io/trino/operator/scalar/BenchmarkLike.java index 778fe40f676c..ec5e0b551b7a 100644 --- a/core/trino-main/src/test/java/io/trino/operator/scalar/BenchmarkLike.java +++ b/core/trino-main/src/test/java/io/trino/operator/scalar/BenchmarkLike.java @@ -85,6 +85,8 @@ public static class Data "_____", "abc%def%ghi", "%abc%def%", + "%a%a%a%a%", + "%aaaaaaaaaaaaaaaaaaaaaaaaaa%" }) private String pattern; @@ -105,6 +107,8 @@ public void setup() case "_____" -> "abcde"; case "abc%def%ghi" -> "abc qeroighqeorhgqerhb2eriuyerqiubgier def ubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhet ghi"; case "%abc%def%" -> "fdnbqerbfklerqbgqjerbgkr abc qeroighqeorhgqerhb2eriuyerqiubgier def ubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhet"; + case "%a%a%a%a%" -> "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + case "%aaaaaaaaaaaaaaaaaaaaaaaaaa%" -> "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; default -> throw new IllegalArgumentException("Unknown pattern: " + pattern); }); @@ -127,6 +131,18 @@ public boolean benchmarkCurrent(Data data) return data.matcher.match(data.bytes, 0, data.bytes.length); } + @Benchmark + public JoniRegexp compileJoni(Data data) + { + return compileJoni(data.pattern, (char) 0, false); + } + + @Benchmark + public LikeMatcher compile(Data data) + { + return LikeMatcher.compile(data.pattern, Optional.empty()); + } + public static boolean likeVarchar(Slice value, JoniRegexp pattern) { Matcher matcher; From ba5f1a1c2ccf7481e712ec2f4f4a642e3d58ca55 Mon Sep 17 00:00:00 2001 From: Martin Traverso 
Date: Mon, 6 Feb 2023 16:10:37 -0800 Subject: [PATCH 02/11] Simplify construction of LIKE matcher NFA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of using epsilon transitions for %, as in: e ┌─────────────────┐ ▼ │ ┌─────┐ ┌┴────┐ │ │ │ │ ──►│ 0 ├──────────────►│ 1 ├──► │ │ │ │ └───┬─┘ └─────┘ │ ▲ └──────────────────┘ e it can be modeled as a state with a loopback match on any input: ┌───┐ │ │ ▼ │ ┌────┴┐ │ │ ───►│ 0 ├──► │ │ └─────┘ This removes the need to calculate the transitive closure of the states during the transformation to a DFA, and results in a minor performance improvement for pattern compilation. (pattern) Before After % 26.327 ± 0.239 ns/op 26.216 ± 0.353 ns/op _% 198676.805 ± 3001.159 ns/op 137534.589 ± 937.491 ns/op %_ 233316.578 ± 3901.336 ns/op 148844.829 ± 1654.260 ns/op abc% 144.492 ± 4.819 ns/op 131.837 ± 3.707 ns/op %abc 101.722 ± 1.923 ns/op 124.846 ± 1.603 ns/op _____ 1088049.595 ± 9539.347 ns/op 682908.928 ± 8803.749 ns/op abc%def%ghi 502509.362 ± 5676.648 ns/op 273538.092 ± 1928.285 ns/op %abc%def% 1460704.116 ± 24356.287 ns/op 756419.174 ± 9816.760 ns/op --- .../io/trino/likematcher/LikeMatcher.java | 33 +++++---------- .../main/java/io/trino/likematcher/NFA.java | 41 ++----------------- 2 files changed, 15 insertions(+), 59 deletions(-) diff --git a/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java index 560f04ddce8f..0873ba064018 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java @@ -313,21 +313,14 @@ private static NFA makeNfa(List pattern) } } else if (item instanceof Any any) { - NFA.State previous; - int i = 0; - do { - previous = state; - state = matchSingleUtf8(builder, state); - i++; - } - while (i < any.min()); - - if (any.min() == 0) { - builder.addTransition(previous, new 
NFA.Epsilon(), state); + for (int i = 0; i < any.min(); i++) { + NFA.State next = builder.addState(); + matchSingleUtf8(builder, state, next); + state = next; } if (any.unbounded()) { - builder.addTransition(state, new NFA.Epsilon(), previous); + matchSingleUtf8(builder, state, state); } } else { @@ -347,7 +340,7 @@ private static NFA.State matchByte(NFA.Builder builder, NFA.State state, byte va return next; } - private static NFA.State matchSingleUtf8(NFA.Builder builder, NFA.State start) + private static void matchSingleUtf8(NFA.Builder builder, NFA.State from, NFA.State to) { /* Implements a state machine to recognize UTF-8 characters. @@ -365,22 +358,18 @@ private static NFA.State matchSingleUtf8(NFA.Builder builder, NFA.State start) 0xxxxxxx */ - NFA.State next = builder.addState(); - - builder.addTransition(start, new NFA.Prefix(0, 1), next); + builder.addTransition(from, new NFA.Prefix(0, 1), to); NFA.State state1 = builder.addState(); NFA.State state2 = builder.addState(); NFA.State state3 = builder.addState(); - builder.addTransition(start, new NFA.Prefix(0b11110, 5), state1); - builder.addTransition(start, new NFA.Prefix(0b1110, 4), state2); - builder.addTransition(start, new NFA.Prefix(0b110, 3), state3); + builder.addTransition(from, new NFA.Prefix(0b11110, 5), state1); + builder.addTransition(from, new NFA.Prefix(0b1110, 4), state2); + builder.addTransition(from, new NFA.Prefix(0b110, 3), state3); builder.addTransition(state1, new NFA.Prefix(0b10, 2), state2); builder.addTransition(state2, new NFA.Prefix(0b10, 2), state3); - builder.addTransition(state3, new NFA.Prefix(0b10, 2), next); - - return next; + builder.addTransition(state3, new NFA.Prefix(0b10, 2), to); } } diff --git a/core/trino-main/src/main/java/io/trino/likematcher/NFA.java b/core/trino-main/src/main/java/io/trino/likematcher/NFA.java index 70316f2eb79d..926f7a30d070 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/NFA.java +++ 
b/core/trino-main/src/main/java/io/trino/likematcher/NFA.java @@ -48,7 +48,7 @@ public DFA toDfa() builder.addTransition(failed, i, failed); } - Set initial = transitiveClosure(Set.of(this.start)); + Set initial = Set.of(this.start); Queue> queue = new ArrayDeque<>(); queue.add(initial); @@ -85,9 +85,8 @@ else if (condition instanceof Prefix prefixTransition) { DFA.State from = activeStates.get(current); DFA.State to = failed; if (!next.isEmpty()) { - Set closure = transitiveClosure(next); - to = activeStates.computeIfAbsent(closure, nfaStates -> builder.addState(makeLabel(nfaStates), nfaStates.contains(accept))); - queue.add(closure); + to = activeStates.computeIfAbsent(next, nfaStates -> builder.addState(makeLabel(nfaStates), nfaStates.contains(accept))); + queue.add(next); } builder.addTransition(from, byteValue, to); } @@ -101,35 +100,6 @@ private List transitions(State state) return transitions.getOrDefault(state.id(), ImmutableList.of()); } - /** - * Traverse epsilon transitions to compute the reachable set of states - */ - private Set transitiveClosure(Set states) - { - Set result = new HashSet<>(); - - Queue queue = new ArrayDeque<>(states); - while (!queue.isEmpty()) { - State state = queue.poll(); - - if (result.contains(state)) { - continue; - } - - transitions(state).stream() - .filter(transition -> transition.condition() instanceof Epsilon) - .forEach(transition -> { - State target = this.states.get(transition.target()); - result.add(target); - queue.add(target); - }); - } - - result.addAll(states); - - return result; - } - private String makeLabel(Set states) { return "{" + states.stream() @@ -191,13 +161,10 @@ public String toString() record Transition(int target, Condition condition) {} sealed interface Condition - permits Epsilon, Value, Prefix + permits Value, Prefix { } - record Epsilon() - implements Condition {} - record Value(byte value) implements Condition {} From 1fb8e1d16eeb363113a6c58c1d2e208574fb7047 Mon Sep 17 00:00:00 2001 From: Martin 
Traverso Date: Mon, 6 Feb 2023 16:35:44 -0800 Subject: [PATCH 03/11] Encapsulate translation of pattern into DFA This is in preparation for introducing alternative matching strategies that are not so expensive to compile. --- .../io/trino/likematcher/DenseDfaMatcher.java | 84 ++++++++++++++++++- .../io/trino/likematcher/LikeMatcher.java | 78 +---------------- 2 files changed, 84 insertions(+), 78 deletions(-) diff --git a/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java index ab70e14e7896..7a3546d0999f 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java @@ -13,6 +13,11 @@ */ package io.trino.likematcher; +import java.util.List; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.nio.charset.StandardCharsets.UTF_8; + class DenseDfaMatcher { // The DFA is encoded as a sequence of transitions for each possible byte value for each state. 
@@ -35,8 +40,10 @@ class DenseDfaMatcher /** * @param exact whether to match to the end of the input */ - public static DenseDfaMatcher newInstance(DFA dfa, boolean exact) + public static DenseDfaMatcher newInstance(List pattern, boolean exact) { + DFA dfa = makeNfa(pattern).toDfa(); + int[] transitions = new int[dfa.states().size() * 256]; boolean[] accept = new boolean[dfa.states().size()]; @@ -111,4 +118,79 @@ private boolean prefixMatch(byte[] input, int offset, int length) return accept[state >>> 8]; } + + private static NFA makeNfa(List pattern) + { + checkArgument(!pattern.isEmpty(), "pattern is empty"); + + NFA.Builder builder = new NFA.Builder(); + + NFA.State state = builder.addStartState(); + + for (Pattern item : pattern) { + if (item instanceof Pattern.Literal literal) { + for (byte current : literal.value().getBytes(UTF_8)) { + state = matchByte(builder, state, current); + } + } + else if (item instanceof Pattern.Any any) { + for (int i = 0; i < any.min(); i++) { + NFA.State next = builder.addState(); + matchSingleUtf8(builder, state, next); + state = next; + } + + if (any.unbounded()) { + matchSingleUtf8(builder, state, state); + } + } + else { + throw new UnsupportedOperationException("Not supported: " + item.getClass().getName()); + } + } + + builder.setAccept(state); + + return builder.build(); + } + + private static NFA.State matchByte(NFA.Builder builder, NFA.State state, byte value) + { + NFA.State next = builder.addState(); + builder.addTransition(state, new NFA.Value(value), next); + return next; + } + + private static void matchSingleUtf8(NFA.Builder builder, NFA.State from, NFA.State to) + { + /* + Implements a state machine to recognize UTF-8 characters. 
+ + 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + O ───────────► O ───────────► O ───────────► O ───────────► O + │ ▲ ▲ ▲ + ├─────────────────────────────┘ │ │ + │ 1110xxxx │ │ + │ │ │ + ├────────────────────────────────────────────┘ │ + │ 110xxxxx │ + │ │ + └───────────────────────────────────────────────────────────┘ + 0xxxxxxx + */ + + builder.addTransition(from, new NFA.Prefix(0, 1), to); + + NFA.State state1 = builder.addState(); + NFA.State state2 = builder.addState(); + NFA.State state3 = builder.addState(); + + builder.addTransition(from, new NFA.Prefix(0b11110, 5), state1); + builder.addTransition(from, new NFA.Prefix(0b1110, 4), state2); + builder.addTransition(from, new NFA.Prefix(0b110, 3), state3); + + builder.addTransition(state1, new NFA.Prefix(0b10, 2), state2); + builder.addTransition(state2, new NFA.Prefix(0b10, 2), state3); + builder.addTransition(state3, new NFA.Prefix(0b10, 2), to); + } } diff --git a/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java index 0873ba064018..a71627581652 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java @@ -21,7 +21,6 @@ import java.util.Optional; import java.util.OptionalInt; -import static com.google.common.base.Preconditions.checkArgument; import static java.nio.charset.StandardCharsets.UTF_8; public class LikeMatcher @@ -146,7 +145,7 @@ else if (i == optimized.size() - 1) { Optional matcher = Optional.empty(); if (!middle.isEmpty()) { - matcher = Optional.of(DenseDfaMatcher.newInstance(makeNfa(middle).toDfa(), exact)); + matcher = Optional.of(DenseDfaMatcher.newInstance(middle, exact)); } return new LikeMatcher( @@ -297,79 +296,4 @@ private static Any collapse(List pattern, int start, int end) return new Any(min, unbounded); } - - private static NFA makeNfa(List pattern) - { - checkArgument(!pattern.isEmpty(), "pattern is empty"); 
- - NFA.Builder builder = new NFA.Builder(); - - NFA.State state = builder.addStartState(); - - for (Pattern item : pattern) { - if (item instanceof Literal literal) { - for (byte current : literal.value().getBytes(UTF_8)) { - state = matchByte(builder, state, current); - } - } - else if (item instanceof Any any) { - for (int i = 0; i < any.min(); i++) { - NFA.State next = builder.addState(); - matchSingleUtf8(builder, state, next); - state = next; - } - - if (any.unbounded()) { - matchSingleUtf8(builder, state, state); - } - } - else { - throw new UnsupportedOperationException("Not supported: " + item.getClass().getName()); - } - } - - builder.setAccept(state); - - return builder.build(); - } - - private static NFA.State matchByte(NFA.Builder builder, NFA.State state, byte value) - { - NFA.State next = builder.addState(); - builder.addTransition(state, new NFA.Value(value), next); - return next; - } - - private static void matchSingleUtf8(NFA.Builder builder, NFA.State from, NFA.State to) - { - /* - Implements a state machine to recognize UTF-8 characters. 
- - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - O ───────────► O ───────────► O ───────────► O ───────────► O - │ ▲ ▲ ▲ - ├─────────────────────────────┘ │ │ - │ 1110xxxx │ │ - │ │ │ - ├────────────────────────────────────────────┘ │ - │ 110xxxxx │ - │ │ - └───────────────────────────────────────────────────────────┘ - 0xxxxxxx - */ - - builder.addTransition(from, new NFA.Prefix(0, 1), to); - - NFA.State state1 = builder.addState(); - NFA.State state2 = builder.addState(); - NFA.State state3 = builder.addState(); - - builder.addTransition(from, new NFA.Prefix(0b11110, 5), state1); - builder.addTransition(from, new NFA.Prefix(0b1110, 4), state2); - builder.addTransition(from, new NFA.Prefix(0b110, 3), state3); - - builder.addTransition(state1, new NFA.Prefix(0b10, 2), state2); - builder.addTransition(state2, new NFA.Prefix(0b10, 2), state3); - builder.addTransition(state3, new NFA.Prefix(0b10, 2), to); - } } From cf48b71addcc02bd23349ea7a6cb4835054f3adb Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Mon, 6 Feb 2023 16:40:11 -0800 Subject: [PATCH 04/11] Use ordered list instead of map for transitions --- .../src/main/java/io/trino/likematcher/DFA.java | 13 +++++-------- .../src/main/java/io/trino/likematcher/NFA.java | 13 ++++++------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/core/trino-main/src/main/java/io/trino/likematcher/DFA.java b/core/trino-main/src/main/java/io/trino/likematcher/DFA.java index 79aaebc9ede6..832303c7bd2a 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/DFA.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/DFA.java @@ -14,25 +14,22 @@ package io.trino.likematcher; import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; import static com.google.common.base.Preconditions.checkState; import static java.lang.String.format; import static 
java.util.Objects.requireNonNull; -record DFA(State start, State failed, List states, Map> transitions) +record DFA(State start, State failed, List states, List> transitions) { DFA { requireNonNull(start, "start is null"); requireNonNull(failed, "failed is null"); states = ImmutableList.copyOf(states); - transitions = ImmutableMap.copyOf(transitions); + transitions = ImmutableList.copyOf(transitions); } public List transitions(State state) @@ -67,12 +64,13 @@ public static class Builder private State start; private State failed; private final List states = new ArrayList<>(); - private final Map> transitions = new HashMap<>(); + private final List> transitions = new ArrayList<>(); public State addState(String label, boolean accept) { State state = new State(nextId++, label, accept); states.add(state); + transitions.add(new ArrayList<>()); return state; } @@ -94,8 +92,7 @@ public State addFailState() public void addTransition(State from, int value, State to) { - transitions.computeIfAbsent(from.id(), key -> new ArrayList<>()) - .add(new Transition(value, to)); + transitions.get(from.id()).add(new Transition(value, to)); } public DFA build() diff --git a/core/trino-main/src/main/java/io/trino/likematcher/NFA.java b/core/trino-main/src/main/java/io/trino/likematcher/NFA.java index 926f7a30d070..cb01053d3d1c 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/NFA.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/NFA.java @@ -14,7 +14,6 @@ package io.trino.likematcher; import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; import java.util.ArrayDeque; import java.util.ArrayList; @@ -29,13 +28,13 @@ import static com.google.common.base.Preconditions.checkState; import static java.util.Objects.requireNonNull; -record NFA(State start, State accept, List states, Map> transitions) +record NFA(State start, State accept, List states, List> transitions) { NFA { requireNonNull(start, "start is null"); 
requireNonNull(accept, "accept is null"); states = ImmutableList.copyOf(states); - transitions = ImmutableMap.copyOf(transitions); + transitions = ImmutableList.copyOf(transitions); } public DFA toDfa() @@ -97,7 +96,7 @@ else if (condition instanceof Prefix prefixTransition) { private List transitions(State state) { - return transitions.getOrDefault(state.id(), ImmutableList.of()); + return transitions.get(state.id()); } private String makeLabel(Set states) @@ -115,12 +114,13 @@ public static class Builder private State start; private State accept; private final List states = new ArrayList<>(); - private final Map> transitions = new HashMap<>(); + private final List> transitions = new ArrayList<>(); public State addState() { State state = new State(nextId++); states.add(state); + transitions.add(new ArrayList<>()); return state; } @@ -139,8 +139,7 @@ public void setAccept(State state) public void addTransition(State from, Condition condition, State to) { - transitions.computeIfAbsent(from.id(), key -> new ArrayList<>()) - .add(new Transition(to.id(), condition)); + transitions.get(from.id()).add(new Transition(to.id(), condition)); } public NFA build() From abe6a6289586f5de8a959d09884cc616aee6a106 Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Mon, 6 Feb 2023 16:42:39 -0800 Subject: [PATCH 05/11] Convert NFA to class To have more control over how instances are created. 
--- .../main/java/io/trino/likematcher/NFA.java | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/core/trino-main/src/main/java/io/trino/likematcher/NFA.java b/core/trino-main/src/main/java/io/trino/likematcher/NFA.java index cb01053d3d1c..ab57638b8823 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/NFA.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/NFA.java @@ -13,8 +13,6 @@ */ package io.trino.likematcher; -import com.google.common.collect.ImmutableList; - import java.util.ArrayDeque; import java.util.ArrayList; import java.util.HashMap; @@ -28,18 +26,24 @@ import static com.google.common.base.Preconditions.checkState; import static java.util.Objects.requireNonNull; -record NFA(State start, State accept, List states, List> transitions) +final class NFA { - NFA { - requireNonNull(start, "start is null"); - requireNonNull(accept, "accept is null"); - states = ImmutableList.copyOf(states); - transitions = ImmutableList.copyOf(transitions); + private final State start; + private final State accept; + private final List states; + private final List> transitions; + + private NFA(State start, State accept, List states, List> transitions) + { + this.start = requireNonNull(start, "start is null"); + this.accept = requireNonNull(accept, "accept is null"); + this.states = requireNonNull(states, "states is null"); + this.transitions = requireNonNull(transitions, "transitions is null"); } public DFA toDfa() { - Map, DFA.State> activeStates = new HashMap<>(); + Map, DFA.State> activeStates = new HashMap<>(); DFA.Builder builder = new DFA.Builder(); DFA.State failed = builder.addFailState(); @@ -47,16 +51,16 @@ public DFA toDfa() builder.addTransition(failed, i, failed); } - Set initial = Set.of(this.start); - Queue> queue = new ArrayDeque<>(); + Set initial = Set.of(this.start); + Queue> queue = new ArrayDeque<>(); queue.add(initial); DFA.State dfaStartState = builder.addStartState(makeLabel(initial), 
initial.contains(accept)); activeStates.put(initial, dfaStartState); - Set> visited = new HashSet<>(); + Set> visited = new HashSet<>(); while (!queue.isEmpty()) { - Set current = queue.poll(); + Set current = queue.poll(); if (!visited.add(current)) { continue; @@ -64,8 +68,8 @@ public DFA toDfa() // For each possible byte value... for (int byteValue = 0; byteValue < 256; byteValue++) { - Set next = new HashSet<>(); - for (NFA.State nfaState : current) { + Set next = new HashSet<>(); + for (State nfaState : current) { for (Transition transition : transitions(nfaState)) { Condition condition = transition.condition(); State target = states.get(transition.target()); @@ -99,10 +103,10 @@ private List transitions(State state) return transitions.get(state.id()); } - private String makeLabel(Set states) + private String makeLabel(Set states) { return "{" + states.stream() - .map(NFA.State::id) + .map(State::id) .map(Object::toString) .sorted() .collect(Collectors.joining(",")) + "}"; From 59ff843b09e0cd29ae24a298009cea447cf680ec Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Mon, 6 Feb 2023 16:55:21 -0800 Subject: [PATCH 06/11] Build NFA/DFA with more efficient data structures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use fastutil sets and primitive ints instead of HashSet. 
Before After % 26.274 ± 0.176 ns/op 26.496 ± 0.081 ns/op _% 129592.079 ± 505.848 ns/op 47977.429 ± 181.518 ns/op %_ 128077.197 ± 483.910 ns/op 44113.168 ± 270.582 ns/op abc% 129.875 ± 0.854 ns/op 142.563 ± 0.719 ns/op %abc 93.918 ± 0.826 ns/op 124.741 ± 0.545 ns/op _____ 597602.869 ± 2663.150 ns/op 195144.440 ± 733.982 ns/op abc%def%ghi 258900.167 ± 1012.373 ns/op 94865.609 ± 5586.094 ns/op %abc%def% 675161.396 ± 3570.877 ns/op 271948.762 ± 1989.130 ns/op --- .../main/java/io/trino/likematcher/DFA.java | 63 +++++-------- .../io/trino/likematcher/DenseDfaMatcher.java | 34 +++---- .../main/java/io/trino/likematcher/NFA.java | 89 +++++++------------ 3 files changed, 71 insertions(+), 115 deletions(-) diff --git a/core/trino-main/src/main/java/io/trino/likematcher/DFA.java b/core/trino-main/src/main/java/io/trino/likematcher/DFA.java index 832303c7bd2a..b667b2b14b9a 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/DFA.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/DFA.java @@ -14,42 +14,23 @@ package io.trino.likematcher; import com.google.common.collect.ImmutableList; +import it.unimi.dsi.fastutil.ints.IntArrayList; import java.util.ArrayList; import java.util.List; -import static com.google.common.base.Preconditions.checkState; import static java.lang.String.format; import static java.util.Objects.requireNonNull; -record DFA(State start, State failed, List states, List> transitions) +record DFA(int start, int failed, IntArrayList acceptStates, List> transitions) { DFA { - requireNonNull(start, "start is null"); - requireNonNull(failed, "failed is null"); - states = ImmutableList.copyOf(states); + requireNonNull(acceptStates, "acceptStates is null"); transitions = ImmutableList.copyOf(transitions); } - public List transitions(State state) - { - return transitions.get(state.id); - } - - record State(int id, String label, boolean accept) - { - @Override - public String toString() - { - return "%s:%s%s".formatted( - id, - accept ? 
"*" : "", - label); - } - } - - record Transition(int value, State target) + record Transition(int value, int target) { @Override public String toString() @@ -61,43 +42,41 @@ public String toString() public static class Builder { private int nextId; - private State start; - private State failed; - private final List states = new ArrayList<>(); + private int start; + private int failed; + private final IntArrayList acceptStates = new IntArrayList(); private final List> transitions = new ArrayList<>(); - public State addState(String label, boolean accept) + public int addState(boolean accept) { - State state = new State(nextId++, label, accept); - states.add(state); + int state = nextId++; transitions.add(new ArrayList<>()); + if (accept) { + acceptStates.add(state); + } return state; } - public State addStartState(String label, boolean accept) + public int addStartState(boolean accept) { - checkState(start == null, "Start state already set"); - State state = addState(label, accept); - start = state; - return state; + start = addState(accept); + return start; } - public State addFailState() + public int addFailState() { - checkState(failed == null, "Fail state already set"); - State state = addState("fail", false); - failed = state; - return state; + failed = addState(false); + return failed; } - public void addTransition(State from, int value, State to) + public void addTransition(int from, int value, int to) { - transitions.get(from.id()).add(new Transition(value, to)); + transitions.get(from).add(new Transition(value, to)); } public DFA build() { - return new DFA(start, failed, states, transitions); + return new DFA(start, failed, acceptStates, transitions); } } } diff --git a/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java index 7a3546d0999f..e6c5f78f730f 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java +++ 
b/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java @@ -44,20 +44,20 @@ public static DenseDfaMatcher newInstance(List pattern, boolean exact) { DFA dfa = makeNfa(pattern).toDfa(); - int[] transitions = new int[dfa.states().size() * 256]; - boolean[] accept = new boolean[dfa.states().size()]; + int[] transitions = new int[dfa.transitions().size() * 256]; - for (DFA.State state : dfa.states()) { - for (DFA.Transition transition : dfa.transitions(state)) { - transitions[state.id() * 256 + transition.value()] = transition.target().id() * 256; + for (int state = 0; state < dfa.transitions().size(); state++) { + for (DFA.Transition transition : dfa.transitions().get(state)) { + transitions[state * 256 + transition.value()] = transition.target() * 256; } + } - if (state.accept()) { - accept[state.id()] = true; - } + boolean[] accept = new boolean[dfa.transitions().size()]; + for (int state : dfa.acceptStates()) { + accept[state] = true; } - return new DenseDfaMatcher(transitions, dfa.start().id(), accept, 0, exact); + return new DenseDfaMatcher(transitions, dfa.start(), accept, 0, exact); } private DenseDfaMatcher(int[] transitions, int start, boolean[] accept, int fail, boolean exact) @@ -125,7 +125,7 @@ private static NFA makeNfa(List pattern) NFA.Builder builder = new NFA.Builder(); - NFA.State state = builder.addStartState(); + int state = builder.addStartState(); for (Pattern item : pattern) { if (item instanceof Pattern.Literal literal) { @@ -135,7 +135,7 @@ private static NFA makeNfa(List pattern) } else if (item instanceof Pattern.Any any) { for (int i = 0; i < any.min(); i++) { - NFA.State next = builder.addState(); + int next = builder.addState(); matchSingleUtf8(builder, state, next); state = next; } @@ -154,14 +154,14 @@ else if (item instanceof Pattern.Any any) { return builder.build(); } - private static NFA.State matchByte(NFA.Builder builder, NFA.State state, byte value) + private static int matchByte(NFA.Builder builder, int state, 
byte value) { - NFA.State next = builder.addState(); + int next = builder.addState(); builder.addTransition(state, new NFA.Value(value), next); return next; } - private static void matchSingleUtf8(NFA.Builder builder, NFA.State from, NFA.State to) + private static void matchSingleUtf8(NFA.Builder builder, int from, int to) { /* Implements a state machine to recognize UTF-8 characters. @@ -181,9 +181,9 @@ private static void matchSingleUtf8(NFA.Builder builder, NFA.State from, NFA.Sta builder.addTransition(from, new NFA.Prefix(0, 1), to); - NFA.State state1 = builder.addState(); - NFA.State state2 = builder.addState(); - NFA.State state3 = builder.addState(); + int state1 = builder.addState(); + int state2 = builder.addState(); + int state3 = builder.addState(); builder.addTransition(from, new NFA.Prefix(0b11110, 5), state1); builder.addTransition(from, new NFA.Prefix(0b1110, 4), state2); diff --git a/core/trino-main/src/main/java/io/trino/likematcher/NFA.java b/core/trino-main/src/main/java/io/trino/likematcher/NFA.java index ab57638b8823..067c81d5c554 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/NFA.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/NFA.java @@ -13,6 +13,9 @@ */ package io.trino.likematcher; +import it.unimi.dsi.fastutil.ints.IntArraySet; +import it.unimi.dsi.fastutil.ints.IntSet; + import java.util.ArrayDeque; import java.util.ArrayList; import java.util.HashMap; @@ -21,46 +24,43 @@ import java.util.Map; import java.util.Queue; import java.util.Set; -import java.util.stream.Collectors; -import static com.google.common.base.Preconditions.checkState; import static java.util.Objects.requireNonNull; final class NFA { - private final State start; - private final State accept; - private final List states; + private final int start; + private final int accept; private final List> transitions; - private NFA(State start, State accept, List states, List> transitions) + private NFA(int start, int accept, List> transitions) { - 
this.start = requireNonNull(start, "start is null"); - this.accept = requireNonNull(accept, "accept is null"); - this.states = requireNonNull(states, "states is null"); + this.start = start; + this.accept = accept; this.transitions = requireNonNull(transitions, "transitions is null"); } public DFA toDfa() { - Map, DFA.State> activeStates = new HashMap<>(); + Map activeStates = new HashMap<>(); DFA.Builder builder = new DFA.Builder(); - DFA.State failed = builder.addFailState(); + int failed = builder.addFailState(); for (int i = 0; i < 256; i++) { builder.addTransition(failed, i, failed); } - Set initial = Set.of(this.start); - Queue> queue = new ArrayDeque<>(); + IntSet initial = new IntArraySet(); + initial.add(start); + Queue queue = new ArrayDeque<>(); queue.add(initial); - DFA.State dfaStartState = builder.addStartState(makeLabel(initial), initial.contains(accept)); + int dfaStartState = builder.addStartState(initial.contains(accept)); activeStates.put(initial, dfaStartState); - Set> visited = new HashSet<>(); + Set visited = new HashSet<>(); while (!queue.isEmpty()) { - Set current = queue.poll(); + IntSet current = queue.poll(); if (!visited.add(current)) { continue; @@ -68,11 +68,11 @@ public DFA toDfa() // For each possible byte value... 
for (int byteValue = 0; byteValue < 256; byteValue++) { - Set next = new HashSet<>(); - for (State nfaState : current) { + IntSet next = new IntArraySet(); + for (int nfaState : current) { for (Transition transition : transitions(nfaState)) { Condition condition = transition.condition(); - State target = states.get(transition.target()); + int target = transition.target(); if (condition instanceof Value valueTransition && valueTransition.value() == (byte) byteValue) { next.add(target); @@ -85,10 +85,10 @@ else if (condition instanceof Prefix prefixTransition) { } } - DFA.State from = activeStates.get(current); - DFA.State to = failed; + int from = activeStates.get(current); + int to = failed; if (!next.isEmpty()) { - to = activeStates.computeIfAbsent(next, nfaStates -> builder.addState(makeLabel(nfaStates), nfaStates.contains(accept))); + to = activeStates.computeIfAbsent(next, nfaStates -> builder.addState(nfaStates.contains(accept))); queue.add(next); } builder.addTransition(from, byteValue, to); @@ -98,66 +98,43 @@ else if (condition instanceof Prefix prefixTransition) { return builder.build(); } - private List transitions(State state) + private List transitions(int state) { - return transitions.get(state.id()); - } - - private String makeLabel(Set states) - { - return "{" + states.stream() - .map(State::id) - .map(Object::toString) - .sorted() - .collect(Collectors.joining(",")) + "}"; + return transitions.get(state); } public static class Builder { private int nextId; - private State start; - private State accept; - private final List states = new ArrayList<>(); + private int start; + private int accept; private final List> transitions = new ArrayList<>(); - public State addState() + public int addState() { - State state = new State(nextId++); - states.add(state); transitions.add(new ArrayList<>()); - return state; + return nextId++; } - public State addStartState() + public int addStartState() { - checkState(start == null, "Start state is already set"); start 
= addState(); return start; } - public void setAccept(State state) + public void setAccept(int state) { - checkState(accept == null, "Accept state is already set"); accept = state; } - public void addTransition(State from, Condition condition, State to) + public void addTransition(int from, Condition condition, int to) { - transitions.get(from.id()).add(new Transition(to.id(), condition)); + transitions.get(from).add(new Transition(to, condition)); } public NFA build() { - return new NFA(start, accept, states, transitions); - } - } - - public record State(int id) - { - @Override - public String toString() - { - return "(" + id + ")"; + return new NFA(start, accept, transitions); } } From eb3c15c6d329133db4d46034a9c069df08275606 Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Mon, 6 Feb 2023 17:15:01 -0800 Subject: [PATCH 07/11] Do not create a failed state Instead, use a hard-coded state id to represent the failed state. This helps reduce the complexity of the NFA and DFA by removing unnecessary transitions. 
--- .../src/main/java/io/trino/likematcher/DFA.java | 11 ++--------- .../io/trino/likematcher/DenseDfaMatcher.java | 16 ++++++++-------- .../src/main/java/io/trino/likematcher/NFA.java | 12 ++++-------- 3 files changed, 14 insertions(+), 25 deletions(-) diff --git a/core/trino-main/src/main/java/io/trino/likematcher/DFA.java b/core/trino-main/src/main/java/io/trino/likematcher/DFA.java index b667b2b14b9a..2eec9c32dda8 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/DFA.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/DFA.java @@ -22,7 +22,7 @@ import static java.lang.String.format; import static java.util.Objects.requireNonNull; -record DFA(int start, int failed, IntArrayList acceptStates, List> transitions) +record DFA(int start, IntArrayList acceptStates, List> transitions) { DFA { @@ -43,7 +43,6 @@ public static class Builder { private int nextId; private int start; - private int failed; private final IntArrayList acceptStates = new IntArrayList(); private final List> transitions = new ArrayList<>(); @@ -63,12 +62,6 @@ public int addStartState(boolean accept) return start; } - public int addFailState() - { - failed = addState(false); - return failed; - } - public void addTransition(int from, int value, int to) { transitions.get(from).add(new Transition(value, to)); @@ -76,7 +69,7 @@ public void addTransition(int from, int value, int to) public DFA build() { - return new DFA(start, failed, acceptStates, transitions); + return new DFA(start, acceptStates, transitions); } } } diff --git a/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java index e6c5f78f730f..d79bf6c5ab90 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java @@ -13,6 +13,7 @@ */ package io.trino.likematcher; +import java.util.Arrays; import java.util.List; import static 
com.google.common.base.Preconditions.checkArgument; @@ -20,6 +21,8 @@ class DenseDfaMatcher { + public static final int FAIL_STATE = -1; + // The DFA is encoded as a sequence of transitions for each possible byte value for each state. // I.e., 256 transitions per state. // The content of the transitions array is the base offset into @@ -32,9 +35,6 @@ class DenseDfaMatcher // For each state, whether it's an accepting state private final boolean[] accept; - // Artificial state to sink all invalid matches - private final int fail; - private final boolean exact; /** @@ -45,6 +45,7 @@ public static DenseDfaMatcher newInstance(List pattern, boolean exact) DFA dfa = makeNfa(pattern).toDfa(); int[] transitions = new int[dfa.transitions().size() * 256]; + Arrays.fill(transitions, FAIL_STATE); for (int state = 0; state < dfa.transitions().size(); state++) { for (DFA.Transition transition : dfa.transitions().get(state)) { @@ -57,15 +58,14 @@ public static DenseDfaMatcher newInstance(List pattern, boolean exact) accept[state] = true; } - return new DenseDfaMatcher(transitions, dfa.start(), accept, 0, exact); + return new DenseDfaMatcher(transitions, dfa.start(), accept, exact); } - private DenseDfaMatcher(int[] transitions, int start, boolean[] accept, int fail, boolean exact) + private DenseDfaMatcher(int[] transitions, int start, boolean[] accept, boolean exact) { this.transitions = transitions; this.start = start; this.accept = accept; - this.fail = fail; this.exact = exact; } @@ -88,7 +88,7 @@ private boolean exactMatch(byte[] input, int offset, int length) byte inputByte = input[i]; state = transitions[state | (inputByte & 0xFF)]; - if (state == fail) { + if (state == FAIL_STATE) { return false; } } @@ -107,7 +107,7 @@ private boolean prefixMatch(byte[] input, int offset, int length) byte inputByte = input[i]; state = transitions[state | (inputByte & 0xFF)]; - if (state == fail) { + if (state == FAIL_STATE) { return false; } diff --git 
a/core/trino-main/src/main/java/io/trino/likematcher/NFA.java b/core/trino-main/src/main/java/io/trino/likematcher/NFA.java index 067c81d5c554..f06e954a389a 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/NFA.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/NFA.java @@ -45,10 +45,6 @@ public DFA toDfa() Map activeStates = new HashMap<>(); DFA.Builder builder = new DFA.Builder(); - int failed = builder.addFailState(); - for (int i = 0; i < 256; i++) { - builder.addTransition(failed, i, failed); - } IntSet initial = new IntArraySet(); initial.add(start); @@ -85,13 +81,13 @@ else if (condition instanceof Prefix prefixTransition) { } } - int from = activeStates.get(current); - int to = failed; if (!next.isEmpty()) { - to = activeStates.computeIfAbsent(next, nfaStates -> builder.addState(nfaStates.contains(accept))); + int from = activeStates.get(current); + int to = activeStates.computeIfAbsent(next, nfaStates -> builder.addState(nfaStates.contains(accept))); + builder.addTransition(from, byteValue, to); + queue.add(next); } - builder.addTransition(from, byteValue, to); } } From 505ad91a0c249199440991bd84ba0686eac8141b Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Mon, 6 Feb 2023 18:22:01 -0800 Subject: [PATCH 08/11] Add NFA-based matcher for dynamic LIKE patterns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The matcher is much cheaper to construct as it doesn't require translating the NFA to a DFA. 
Benchmark including compilation and matching times for the "%abc%def%" pattern: dfa 257710.213 ± 22629.891 ns/op nfa 571.737 ± 5.254 ns/op joni 1146.428 ± 14.036 ns/op Benchmark just for the matching portion: dfa 148.657 ± 0.772 ns/op nfa 259.458 ± 4.031 ns/op joni 447.760 ± 2.812 ns/op --- .../io/trino/likematcher/DenseDfaMatcher.java | 2 + .../io/trino/likematcher/LikeMatcher.java | 20 ++- .../java/io/trino/likematcher/Matcher.java | 19 ++ .../java/io/trino/likematcher/NfaMatcher.java | 162 ++++++++++++++++++ .../java/io/trino/type/LikeFunctions.java | 4 +- .../io/trino/likematcher/TestLikeMatcher.java | 27 ++- .../trino/operator/scalar/BenchmarkLike.java | 48 +++++- 7 files changed, 263 insertions(+), 19 deletions(-) create mode 100644 core/trino-main/src/main/java/io/trino/likematcher/Matcher.java create mode 100644 core/trino-main/src/main/java/io/trino/likematcher/NfaMatcher.java diff --git a/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java index d79bf6c5ab90..0121390ace27 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java @@ -20,6 +20,7 @@ import static java.nio.charset.StandardCharsets.UTF_8; class DenseDfaMatcher + implements Matcher { public static final int FAIL_STATE = -1; @@ -69,6 +70,7 @@ private DenseDfaMatcher(int[] transitions, int start, boolean[] accept, boolean this.exact = exact; } + @Override public boolean match(byte[] input, int offset, int length) { if (exact) { diff --git a/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java index a71627581652..4ce6f0597684 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java @@ -32,7 +32,7 @@ public class 
LikeMatcher private final OptionalInt maxSize; private final byte[] prefix; private final byte[] suffix; - private final Optional matcher; + private final Optional matcher; private LikeMatcher( String pattern, @@ -41,7 +41,7 @@ private LikeMatcher( OptionalInt maxSize, byte[] prefix, byte[] suffix, - Optional matcher) + Optional matcher) { this.pattern = pattern; this.escape = escape; @@ -64,10 +64,15 @@ public Optional getEscape() public static LikeMatcher compile(String pattern) { - return compile(pattern, Optional.empty()); + return compile(pattern, Optional.empty(), true); } public static LikeMatcher compile(String pattern, Optional escape) + { + return compile(pattern, escape, true); + } + + public static LikeMatcher compile(String pattern, Optional escape, boolean optimize) { List parsed = parse(pattern, escape); List optimized = optimize(parsed); @@ -143,9 +148,14 @@ else if (i == optimized.size() - 1) { } } - Optional matcher = Optional.empty(); + Optional matcher = Optional.empty(); if (!middle.isEmpty()) { - matcher = Optional.of(DenseDfaMatcher.newInstance(middle, exact)); + if (optimize) { + matcher = Optional.of(DenseDfaMatcher.newInstance(middle, exact)); + } + else { + matcher = Optional.of(new NfaMatcher(middle, exact)); + } } return new LikeMatcher( diff --git a/core/trino-main/src/main/java/io/trino/likematcher/Matcher.java b/core/trino-main/src/main/java/io/trino/likematcher/Matcher.java new file mode 100644 index 000000000000..1ca657cad848 --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/likematcher/Matcher.java @@ -0,0 +1,19 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.likematcher; + +public interface Matcher +{ + boolean match(byte[] input, int offset, int length); +} diff --git a/core/trino-main/src/main/java/io/trino/likematcher/NfaMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/NfaMatcher.java new file mode 100644 index 000000000000..122556f9d795 --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/likematcher/NfaMatcher.java @@ -0,0 +1,162 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.likematcher; + +import java.util.Arrays; +import java.util.List; + +final class NfaMatcher + implements Matcher +{ + private static final int ANY = -1; + private static final int NONE = -2; + private static final int INVALID_CODEPOINT = -1; + + private final boolean exact; + + private final boolean[] loopback; + private final int[] match; + private final int acceptState; + private final int stateCount; + + public NfaMatcher(List pattern, boolean exact) + { + this.exact = exact; + + stateCount = calculateStateCount(pattern); + + loopback = new boolean[stateCount]; + match = new int[stateCount]; + Arrays.fill(match, NONE); + acceptState = stateCount - 1; + + int state = 0; + for (Pattern element : pattern) { + if (element instanceof Pattern.Literal literal) { + for (int i = 0; i < literal.value().length(); i++) { + match[state++] = literal.value().charAt(i); + } + } + else if (element instanceof Pattern.Any any) { + for (int i = 0; i < any.min(); i++) { + match[state++] = ANY; + } + + if (any.unbounded()) { + loopback[state] = true; + } + } + } + } + + private static int calculateStateCount(List pattern) + { + int states = 1; + for (Pattern element : pattern) { + if (element instanceof Pattern.Literal literal) { + states += literal.value().length(); + } + else if (element instanceof Pattern.Any any) { + states += any.min(); + } + } + return states; + } + + @Override + public boolean match(byte[] input, int offset, int length) + { + boolean[] seen = new boolean[stateCount + 1]; + int[] currentStates = new int[stateCount]; + int[] nextStates = new int[stateCount]; + int currentStatesIndex = 0; + int nextStatesIndex; + + currentStates[currentStatesIndex++] = 0; + + int limit = offset + length; + int current = offset; + boolean accept = false; + while (current < limit) { + int codepoint = INVALID_CODEPOINT; + + // decode the next UTF-8 codepoint + int header = input[current] & 0xFF; + if (header < 0x80) { + // normal ASCII + // 0xxx_xxxx + 
codepoint = header; + current++; + } + else if ((header & 0b1110_0000) == 0b1100_0000) { + // 110x_xxxx 10xx_xxxx + if (current + 1 < limit) { + codepoint = ((header & 0b0001_1111) << 6) | (input[current + 1] & 0b0011_1111); + current += 2; + } + } + else if ((header & 0b1111_0000) == 0b1110_0000) { + // 1110_xxxx 10xx_xxxx 10xx_xxxx + if (current + 2 < limit) { + codepoint = ((header & 0b0000_1111) << 12) | ((input[current + 1] & 0b0011_1111) << 6) | (input[current + 2] & 0b0011_1111); + current += 3; + } + } + else if ((header & 0b1111_1000) == 0b1111_0000) { + // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + if (current + 3 < limit) { + codepoint = ((header & 0b0000_0111) << 18) | ((input[current + 1] & 0b0011_1111) << 12) | ((input[current + 2] & 0b0011_1111) << 6) | (input[current + 3] & 0b0011_1111); + current += 4; + } + } + + if (codepoint == INVALID_CODEPOINT) { + return false; + } + + accept = false; + nextStatesIndex = 0; + Arrays.fill(seen, false); + for (int i = 0; i < currentStatesIndex; i++) { + int state = currentStates[i]; + if (!seen[state] && loopback[state]) { + nextStates[nextStatesIndex++] = state; + accept |= state == acceptState; + seen[state] = true; + } + int next = state + 1; + if (!seen[next] && (match[state] == ANY || match[state] == codepoint)) { + nextStates[nextStatesIndex++] = next; + accept |= next == acceptState; + seen[next] = true; + } + } + + if (nextStatesIndex == 0) { + return false; + } + + if (!exact && accept) { + return true; + } + + int[] tmp = currentStates; + currentStates = nextStates; + nextStates = tmp; + currentStatesIndex = nextStatesIndex; + } + + return accept; + } +} diff --git a/core/trino-main/src/main/java/io/trino/type/LikeFunctions.java b/core/trino-main/src/main/java/io/trino/type/LikeFunctions.java index 96936c9b27d6..839bbcedc50d 100644 --- a/core/trino-main/src/main/java/io/trino/type/LikeFunctions.java +++ b/core/trino-main/src/main/java/io/trino/type/LikeFunctions.java @@ -61,7 +61,7 @@ public static 
boolean likeVarchar(@SqlType("varchar") Slice value, @SqlType(Like @SqlType(LikePatternType.NAME) public static LikeMatcher likePattern(@SqlType("varchar") Slice pattern) { - return LikeMatcher.compile(pattern.toStringUtf8(), Optional.empty()); + return LikeMatcher.compile(pattern.toStringUtf8(), Optional.empty(), false); } @ScalarFunction(value = LIKE_PATTERN_FUNCTION_NAME, hidden = true) @@ -69,7 +69,7 @@ public static LikeMatcher likePattern(@SqlType("varchar") Slice pattern) public static LikeMatcher likePattern(@SqlType("varchar") Slice pattern, @SqlType("varchar") Slice escape) { try { - return LikeMatcher.compile(pattern.toStringUtf8(), getEscapeCharacter(Optional.of(escape))); + return LikeMatcher.compile(pattern.toStringUtf8(), getEscapeCharacter(Optional.of(escape)), false); } catch (RuntimeException e) { throw new TrinoException(INVALID_FUNCTION_ARGUMENT, e); diff --git a/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java b/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java index 539e77279258..3becc3c8ccf3 100644 --- a/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java +++ b/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java @@ -74,13 +74,23 @@ public void test() // optimization of consecutive _ and % assertTrue(match("_%_%_%_%", "abcdefghij")); + assertTrue(match("%a%a%a%a%a%a%", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")); + assertTrue(match("%a%a%a%a%a%a%", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab")); + assertTrue(match("%a%b%a%b%a%b%", "aabbaabbaabbaabbaabbaabbaabbaabbaabbaabbaabbaabbaabbaabb")); + assertTrue(match("%aaaa%bbbb%aaaa%bbbb%aaaa%bbbb%", "aaaabbbbaaaabbbbaaaabbbb")); + assertTrue(match("%aaaaaaaaaaaaaaaaaaaaaaaaaa%", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")); + // utf-8 - LikeMatcher single = LikeMatcher.compile("_"); - LikeMatcher multiple = LikeMatcher.compile("_a%b_"); // prefix and suffix with _a and 
b_ to avoid optimizations + LikeMatcher singleOptimized = LikeMatcher.compile("_", Optional.empty(), true); + LikeMatcher multipleOptimized = LikeMatcher.compile("_a%b_", Optional.empty(), true); // prefix and suffix with _a and b_ to avoid optimizations + LikeMatcher single = LikeMatcher.compile("_", Optional.empty(), false); + LikeMatcher multiple = LikeMatcher.compile("_a%b_", Optional.empty(), false); // prefix and suffix with _a and b_ to avoid optimizations for (int i = 0; i < Character.MAX_CODE_POINT; i++) { + assertTrue(singleOptimized.match(Character.toString(i).getBytes(StandardCharsets.UTF_8))); assertTrue(single.match(Character.toString(i).getBytes(StandardCharsets.UTF_8))); String value = "aa" + (char) i + "bb"; + assertTrue(multipleOptimized.match(value.getBytes(StandardCharsets.UTF_8))); assertTrue(multiple.match(value.getBytes(StandardCharsets.UTF_8))); } } @@ -109,10 +119,17 @@ private static boolean match(String pattern, String value, Optional e String padded = padding + value + padding; byte[] bytes = padded.getBytes(StandardCharsets.UTF_8); - boolean withoutPadding = LikeMatcher.compile(pattern, escape).match(value.getBytes(StandardCharsets.UTF_8)); - boolean withPadding = LikeMatcher.compile(pattern, escape).match(bytes, padding.length(), bytes.length - padding.length() * 2); // exclude padding + boolean optimizedWithoutPadding = LikeMatcher.compile(pattern, escape, true).match(value.getBytes(StandardCharsets.UTF_8)); + + boolean optimizedWithPadding = LikeMatcher.compile(pattern, escape, true).match(bytes, padding.length(), bytes.length - padding.length() * 2); // exclude padding + assertEquals(optimizedWithoutPadding, optimizedWithPadding); + + boolean withoutPadding = LikeMatcher.compile(pattern, escape, false).match(value.getBytes(StandardCharsets.UTF_8)); + assertEquals(optimizedWithoutPadding, withoutPadding); + + boolean withPadding = LikeMatcher.compile(pattern, escape, false).match(bytes, padding.length(), bytes.length - 
padding.length() * 2); // exclude padding + assertEquals(optimizedWithoutPadding, withPadding); - assertEquals(withoutPadding, withPadding); return withPadding; } } diff --git a/core/trino-main/src/test/java/io/trino/operator/scalar/BenchmarkLike.java b/core/trino-main/src/test/java/io/trino/operator/scalar/BenchmarkLike.java index ec5e0b551b7a..7aad0cfce2dd 100644 --- a/core/trino-main/src/test/java/io/trino/operator/scalar/BenchmarkLike.java +++ b/core/trino-main/src/test/java/io/trino/operator/scalar/BenchmarkLike.java @@ -93,7 +93,8 @@ public static class Data private Slice data; private byte[] bytes; private JoniRegexp joniPattern; - private LikeMatcher matcher; + private LikeMatcher dfaMatcher; + private LikeMatcher nfaMatcher; @Setup public void setup() @@ -112,7 +113,8 @@ public void setup() default -> throw new IllegalArgumentException("Unknown pattern: " + pattern); }); - matcher = LikeMatcher.compile(pattern, Optional.empty()); + dfaMatcher = LikeMatcher.compile(pattern, Optional.empty(), true); + nfaMatcher = LikeMatcher.compile(pattern, Optional.empty(), false); joniPattern = compileJoni(Slices.utf8Slice(pattern).toStringUtf8(), '0', false); bytes = data.getBytes(); @@ -120,15 +122,21 @@ public void setup() } @Benchmark - public boolean benchmarkJoni(Data data) + public boolean matchJoni(Data data) { return likeVarchar(data.data, data.joniPattern); } @Benchmark - public boolean benchmarkCurrent(Data data) + public boolean matchDfa(Data data) { - return data.matcher.match(data.bytes, 0, data.bytes.length); + return data.dfaMatcher.match(data.bytes, 0, data.bytes.length); + } + + @Benchmark + public boolean matchNfa(Data data) + { + return data.nfaMatcher.match(data.bytes, 0, data.bytes.length); } @Benchmark @@ -138,9 +146,35 @@ public JoniRegexp compileJoni(Data data) } @Benchmark - public LikeMatcher compile(Data data) + public LikeMatcher compileDfa(Data data) + { + return LikeMatcher.compile(data.pattern, Optional.empty(), true); + } + + @Benchmark + 
public LikeMatcher compileNfa(Data data) + { + return LikeMatcher.compile(data.pattern, Optional.empty(), false); + } + + @Benchmark + public boolean allJoni(Data data) + { + return likeVarchar(data.data, compileJoni(Slices.utf8Slice(data.pattern).toStringUtf8(), '0', false)); + } + + @Benchmark + public boolean allDfa(Data data) + { + return LikeMatcher.compile(data.pattern, Optional.empty(), true) + .match(data.bytes, 0, data.bytes.length); + } + + @Benchmark + public boolean allNfa(Data data) { - return LikeMatcher.compile(data.pattern, Optional.empty()); + return LikeMatcher.compile(data.pattern, Optional.empty(), false) + .match(data.bytes, 0, data.bytes.length); } public static boolean likeVarchar(Slice value, JoniRegexp pattern) From 5509719a489ad5589255d39dd526bf2d8210f316 Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Tue, 7 Feb 2023 14:13:39 -0800 Subject: [PATCH 09/11] Construct dense DFA lazily --- .../io/trino/likematcher/DenseDfaMatcher.java | 246 +++++++++--------- .../io/trino/likematcher/LikeMatcher.java | 2 +- 2 files changed, 131 insertions(+), 117 deletions(-) diff --git a/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java index 0121390ace27..2d96b3d6497f 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java @@ -18,154 +18,167 @@ import static com.google.common.base.Preconditions.checkArgument; import static java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Objects.requireNonNull; class DenseDfaMatcher implements Matcher { public static final int FAIL_STATE = -1; - // The DFA is encoded as a sequence of transitions for each possible byte value for each state. - // I.e., 256 transitions per state. - // The content of the transitions array is the base offset into - // the next state to follow. 
I.e., the desired state * 256 - private final int[] transitions; - - // The starting state - private final int start; - - // For each state, whether it's an accepting state - private final boolean[] accept; - + private final List pattern; private final boolean exact; - /** - * @param exact whether to match to the end of the input - */ - public static DenseDfaMatcher newInstance(List pattern, boolean exact) - { - DFA dfa = makeNfa(pattern).toDfa(); - - int[] transitions = new int[dfa.transitions().size() * 256]; - Arrays.fill(transitions, FAIL_STATE); - - for (int state = 0; state < dfa.transitions().size(); state++) { - for (DFA.Transition transition : dfa.transitions().get(state)) { - transitions[state * 256 + transition.value()] = transition.target() * 256; - } - } - - boolean[] accept = new boolean[dfa.transitions().size()]; - for (int state : dfa.acceptStates()) { - accept[state] = true; - } - - return new DenseDfaMatcher(transitions, dfa.start(), accept, exact); - } + private volatile DenseDfa matcher; - private DenseDfaMatcher(int[] transitions, int start, boolean[] accept, boolean exact) + public DenseDfaMatcher(List pattern, boolean exact) { - this.transitions = transitions; - this.start = start; - this.accept = accept; + this.pattern = requireNonNull(pattern, "pattern is null"); this.exact = exact; } @Override public boolean match(byte[] input, int offset, int length) { + DenseDfa matcher = this.matcher; + if (matcher == null) { + matcher = DenseDfa.newInstance(pattern); + this.matcher = matcher; + } + if (exact) { - return exactMatch(input, offset, length); + return matcher.exactMatch(input, offset, length); } - return prefixMatch(input, offset, length); + return matcher.prefixMatch(input, offset, length); } - /** - * Returns a positive match when the final state after all input has been consumed is an accepting state - */ - private boolean exactMatch(byte[] input, int offset, int length) + private static class DenseDfa { - int state = start << 8; - for 
(int i = offset; i < offset + length; i++) { - byte inputByte = input[i]; - state = transitions[state | (inputByte & 0xFF)]; + // The DFA is encoded as a sequence of transitions for each possible byte value for each state. + // I.e., 256 transitions per state. + // The content of the transitions array is the base offset into + // the next state to follow. I.e., the desired state * 256 + private final int[] transitions; - if (state == FAIL_STATE) { - return false; - } - } + // The starting state + private final int start; - return accept[state >>> 8]; - } + // For each state, whether it's an accepting state + private final boolean[] accept; - /** - * Returns a positive match as soon as the DFA reaches an accepting state, regardless of whether - * the whole input has been consumed - */ - private boolean prefixMatch(byte[] input, int offset, int length) - { - int state = start << 8; - for (int i = offset; i < offset + length; i++) { - byte inputByte = input[i]; - state = transitions[state | (inputByte & 0xFF)]; + public static DenseDfa newInstance(List pattern) + { + DFA dfa = makeNfa(pattern).toDfa(); - if (state == FAIL_STATE) { - return false; - } + int[] transitions = new int[dfa.transitions().size() * 256]; + Arrays.fill(transitions, FAIL_STATE); - if (accept[state >>> 8]) { - return true; + for (int state = 0; state < dfa.transitions().size(); state++) { + for (DFA.Transition transition : dfa.transitions().get(state)) { + transitions[state * 256 + transition.value()] = transition.target() * 256; + } + } + boolean[] accept = new boolean[dfa.transitions().size()]; + for (int state : dfa.acceptStates()) { + accept[state] = true; } + + return new DenseDfa(transitions, dfa.start(), accept); } - return accept[state >>> 8]; - } + private DenseDfa(int[] transitions, int start, boolean[] accept) + { + this.transitions = transitions; + this.start = start; + this.accept = accept; + } - private static NFA makeNfa(List pattern) - { - checkArgument(!pattern.isEmpty(), 
"pattern is empty"); + /** + * Returns a positive match when the final state after all input has been consumed is an accepting state + */ + public boolean exactMatch(byte[] input, int offset, int length) + { + int state = start << 8; + for (int i = offset; i < offset + length; i++) { + byte inputByte = input[i]; + state = transitions[state | (inputByte & 0xFF)]; + + if (state == FAIL_STATE) { + return false; + } + } - NFA.Builder builder = new NFA.Builder(); + return accept[state >>> 8]; + } - int state = builder.addStartState(); + /** + * Returns a positive match as soon as the DFA reaches an accepting state, regardless of whether + * the whole input has been consumed + */ + public boolean prefixMatch(byte[] input, int offset, int length) + { + int state = start << 8; + for (int i = offset; i < offset + length; i++) { + byte inputByte = input[i]; + state = transitions[state | (inputByte & 0xFF)]; + + if (state == FAIL_STATE) { + return false; + } - for (Pattern item : pattern) { - if (item instanceof Pattern.Literal literal) { - for (byte current : literal.value().getBytes(UTF_8)) { - state = matchByte(builder, state, current); + if (accept[state >>> 8]) { + return true; } } - else if (item instanceof Pattern.Any any) { - for (int i = 0; i < any.min(); i++) { - int next = builder.addState(); - matchSingleUtf8(builder, state, next); - state = next; - } - if (any.unbounded()) { - matchSingleUtf8(builder, state, state); + return accept[state >>> 8]; + } + + private static NFA makeNfa(List pattern) + { + checkArgument(!pattern.isEmpty(), "pattern is empty"); + + NFA.Builder builder = new NFA.Builder(); + + int state = builder.addStartState(); + + for (Pattern item : pattern) { + if (item instanceof Pattern.Literal literal) { + for (byte current : literal.value().getBytes(UTF_8)) { + state = matchByte(builder, state, current); + } + } + else if (item instanceof Pattern.Any any) { + for (int i = 0; i < any.min(); i++) { + int next = builder.addState(); + 
matchSingleUtf8(builder, state, next); + state = next; + } + + if (any.unbounded()) { + matchSingleUtf8(builder, state, state); + } + } + else { + throw new UnsupportedOperationException("Not supported: " + item.getClass().getName()); } } - else { - throw new UnsupportedOperationException("Not supported: " + item.getClass().getName()); - } - } - builder.setAccept(state); + builder.setAccept(state); - return builder.build(); - } + return builder.build(); + } - private static int matchByte(NFA.Builder builder, int state, byte value) - { - int next = builder.addState(); - builder.addTransition(state, new NFA.Value(value), next); - return next; - } + private static int matchByte(NFA.Builder builder, int state, byte value) + { + int next = builder.addState(); + builder.addTransition(state, new NFA.Value(value), next); + return next; + } - private static void matchSingleUtf8(NFA.Builder builder, int from, int to) - { - /* + private static void matchSingleUtf8(NFA.Builder builder, int from, int to) + { + /* Implements a state machine to recognize UTF-8 characters. 
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx @@ -179,20 +192,21 @@ private static void matchSingleUtf8(NFA.Builder builder, int from, int to) │ │ └───────────────────────────────────────────────────────────┘ 0xxxxxxx - */ + */ - builder.addTransition(from, new NFA.Prefix(0, 1), to); + builder.addTransition(from, new NFA.Prefix(0, 1), to); - int state1 = builder.addState(); - int state2 = builder.addState(); - int state3 = builder.addState(); + int state1 = builder.addState(); + int state2 = builder.addState(); + int state3 = builder.addState(); - builder.addTransition(from, new NFA.Prefix(0b11110, 5), state1); - builder.addTransition(from, new NFA.Prefix(0b1110, 4), state2); - builder.addTransition(from, new NFA.Prefix(0b110, 3), state3); + builder.addTransition(from, new NFA.Prefix(0b11110, 5), state1); + builder.addTransition(from, new NFA.Prefix(0b1110, 4), state2); + builder.addTransition(from, new NFA.Prefix(0b110, 3), state3); - builder.addTransition(state1, new NFA.Prefix(0b10, 2), state2); - builder.addTransition(state2, new NFA.Prefix(0b10, 2), state3); - builder.addTransition(state3, new NFA.Prefix(0b10, 2), to); + builder.addTransition(state1, new NFA.Prefix(0b10, 2), state2); + builder.addTransition(state2, new NFA.Prefix(0b10, 2), state3); + builder.addTransition(state3, new NFA.Prefix(0b10, 2), to); + } } } diff --git a/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java index 4ce6f0597684..2add8d94366d 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java @@ -151,7 +151,7 @@ else if (i == optimized.size() - 1) { Optional matcher = Optional.empty(); if (!middle.isEmpty()) { if (optimize) { - matcher = Optional.of(DenseDfaMatcher.newInstance(middle, exact)); + matcher = Optional.of(new DenseDfaMatcher(middle, exact)); } else { matcher = Optional.of(new NfaMatcher(middle, 
exact)); From 0b74c4cadfd8dcd626fcf4a9341fab8440527bc0 Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Fri, 10 Feb 2023 16:44:08 -0500 Subject: [PATCH 10/11] Simplify parsing and optimization of pattern The parser now produces an optimized pattern consisting of a sequence of literals and compacted "any" elements. Previously, the sequence would contain one "any" element for each _ and %, which would later be coalesced by the optimize() method. --- .../io/trino/likematcher/LikeMatcher.java | 83 ++++++------------- .../io/trino/likematcher/TestLikeMatcher.java | 2 + 2 files changed, 27 insertions(+), 58 deletions(-) diff --git a/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java index 2add8d94366d..91cd545c7896 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java @@ -75,7 +75,6 @@ public static LikeMatcher compile(String pattern, Optional escape) public static LikeMatcher compile(String pattern, Optional escape, boolean optimize) { List parsed = parse(pattern, escape); - List optimized = optimize(parsed); // Calculate minimum and maximum size for candidate strings // This is used for short-circuiting the match if the size of @@ -83,7 +82,7 @@ public static LikeMatcher compile(String pattern, Optional escape, bo int minSize = 0; int maxSize = 0; boolean unbounded = false; - for (Pattern expression : optimized) { + for (Pattern expression : parsed) { if (expression instanceof Literal literal) { int length = literal.value().getBytes(UTF_8).length; minSize += length; @@ -107,8 +106,8 @@ else if (expression instanceof Any any) { byte[] prefix = new byte[0]; byte[] suffix = new byte[0]; List middle = new ArrayList<>(); - for (int i = 0; i < optimized.size(); i++) { - Pattern expression = optimized.get(i); + for (int i = 0; i < parsed.size(); i++) { + Pattern expression = 
parsed.get(i); if (i == 0) { if (expression instanceof Literal literal) { @@ -116,7 +115,7 @@ else if (expression instanceof Any any) { continue; } } - else if (i == optimized.size() - 1) { + else if (i == parsed.size() - 1) { if (expression instanceof Literal literal) { suffix = literal.value().getBytes(UTF_8); continue; @@ -209,11 +208,13 @@ private boolean startsWith(byte[] pattern, byte[] input, int offset) return true; } - private static List parse(String pattern, Optional escape) + static List parse(String pattern, Optional escape) { List result = new ArrayList<>(); StringBuilder literal = new StringBuilder(); + int anyCount = 0; + boolean anyUnbounded = false; boolean inEscape = false; for (int i = 0; i < pattern.length(); i++) { char character = pattern.charAt(i); @@ -222,26 +223,39 @@ private static List parse(String pattern, Optional escape) if (character != '%' && character != '_' && character != escape.get()) { throw new IllegalArgumentException("Escape character must be followed by '%', '_' or the escape character itself"); } + literal.append(character); inEscape = false; } else if (escape.isPresent() && character == escape.get()) { inEscape = true; + + if (anyUnbounded || anyCount != 0) { + result.add(new Any(anyCount, anyUnbounded)); + anyCount = 0; + anyUnbounded = false; + } } else if (character == '%' || character == '_') { if (literal.length() != 0) { result.add(new Literal(literal.toString())); - literal = new StringBuilder(); + literal.setLength(0); } if (character == '%') { - result.add(new Any(0, true)); + anyUnbounded = true; } else { - result.add(new Any(1, false)); + anyCount++; } } else { + if (anyUnbounded || anyCount != 0) { + result.add(new Any(anyCount, anyUnbounded)); + anyCount = 0; + anyUnbounded = false; + } + literal.append(character); } } @@ -253,57 +267,10 @@ else if (character == '%' || character == '_') { if (literal.length() != 0) { result.add(new Literal(literal.toString())); } - - return result; - } - - private static List 
optimize(List pattern) - { - if (pattern.isEmpty()) { - return pattern; - } - - List result = new ArrayList<>(); - - int anyPatternStart = -1; - for (int i = 0; i < pattern.size(); i++) { - Pattern current = pattern.get(i); - - if (anyPatternStart == -1 && current instanceof Any) { - anyPatternStart = i; - } - else if (current instanceof Literal) { - if (anyPatternStart != -1) { - result.add(collapse(pattern, anyPatternStart, i)); - } - - result.add(current); - anyPatternStart = -1; - } - } - - if (anyPatternStart != -1) { - result.add(collapse(pattern, anyPatternStart, pattern.size())); + else if (anyUnbounded || anyCount != 0) { + result.add(new Any(anyCount, anyUnbounded)); } return result; } - - /** - * Collapses a sequence of consecutive Any items - */ - private static Any collapse(List pattern, int start, int end) - { - int min = 0; - boolean unbounded = false; - - for (int i = start; i < end; i++) { - Any any = (Any) pattern.get(i); - - min += any.min(); - unbounded = unbounded || any.unbounded(); - } - - return new Any(min, unbounded); - } } diff --git a/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java b/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java index 3becc3c8ccf3..a02b21aa0ac3 100644 --- a/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java +++ b/core/trino-main/src/test/java/io/trino/likematcher/TestLikeMatcher.java @@ -101,6 +101,8 @@ public void testEscape() assertTrue(match("-%", "%", '-')); assertTrue(match("-_", "_", '-')); assertTrue(match("--", "-", '-')); + + assertTrue(match("%$_%", "xxxxx_xxxxx", '$')); } private static boolean match(String pattern, String value) From 021441e2437a26f4c4b552a9c58107acdf26c63e Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Fri, 10 Feb 2023 17:19:22 -0500 Subject: [PATCH 11/11] Avoid creating intermediate pattern list when constructing matcher --- .../io/trino/likematcher/DenseDfaMatcher.java | 26 +++--- 
.../io/trino/likematcher/LikeMatcher.java | 85 +++++++++---------- .../java/io/trino/likematcher/NfaMatcher.java | 23 ++--- .../java/io/trino/likematcher/Pattern.java | 21 +++-- 4 files changed, 84 insertions(+), 71 deletions(-) diff --git a/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java index 2d96b3d6497f..d5aa193fbabf 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/DenseDfaMatcher.java @@ -26,13 +26,17 @@ class DenseDfaMatcher public static final int FAIL_STATE = -1; private final List pattern; + private final int start; + private final int end; private final boolean exact; private volatile DenseDfa matcher; - public DenseDfaMatcher(List pattern, boolean exact) + public DenseDfaMatcher(List pattern, int start, int end, boolean exact) { this.pattern = requireNonNull(pattern, "pattern is null"); + this.start = start; + this.end = end; this.exact = exact; } @@ -41,7 +45,7 @@ public boolean match(byte[] input, int offset, int length) { DenseDfa matcher = this.matcher; if (matcher == null) { - matcher = DenseDfa.newInstance(pattern); + matcher = DenseDfa.newInstance(pattern, start, end); this.matcher = matcher; } @@ -66,9 +70,9 @@ private static class DenseDfa // For each state, whether it's an accepting state private final boolean[] accept; - public static DenseDfa newInstance(List pattern) + public static DenseDfa newInstance(List pattern, int start, int end) { - DFA dfa = makeNfa(pattern).toDfa(); + DFA dfa = makeNfa(pattern, start, end).toDfa(); int[] transitions = new int[dfa.transitions().size() * 256]; Arrays.fill(transitions, FAIL_STATE); @@ -134,7 +138,7 @@ public boolean prefixMatch(byte[] input, int offset, int length) return accept[state >>> 8]; } - private static NFA makeNfa(List pattern) + private static NFA makeNfa(List pattern, int start, int end) { 
checkArgument(!pattern.isEmpty(), "pattern is empty"); @@ -142,22 +146,22 @@ private static NFA makeNfa(List pattern) int state = builder.addStartState(); - for (Pattern item : pattern) { + for (int e = start; e <= end; e++) { + Pattern item = pattern.get(e); if (item instanceof Pattern.Literal literal) { for (byte current : literal.value().getBytes(UTF_8)) { state = matchByte(builder, state, current); } } else if (item instanceof Pattern.Any any) { - for (int i = 0; i < any.min(); i++) { + for (int i = 0; i < any.length(); i++) { int next = builder.addState(); matchSingleUtf8(builder, state, next); state = next; } - - if (any.unbounded()) { - matchSingleUtf8(builder, state, state); - } + } + else if (item instanceof Pattern.ZeroOrMore) { + matchSingleUtf8(builder, state, state); } else { throw new UnsupportedOperationException("Not supported: " + item.getClass().getName()); diff --git a/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java index 91cd545c7896..575cb25f2d7b 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/LikeMatcher.java @@ -88,12 +88,13 @@ public static LikeMatcher compile(String pattern, Optional escape, bo minSize += length; maxSize += length; } + else if (expression instanceof Pattern.ZeroOrMore) { + unbounded = true; + } else if (expression instanceof Any any) { - int length = any.min(); + int length = any.length(); minSize += length; maxSize += length * 4; // at most 4 bytes for a single UTF-8 codepoint - - unbounded = unbounded || any.unbounded(); } else { throw new UnsupportedOperationException("Not supported: " + expression.getClass().getName()); @@ -105,24 +106,17 @@ else if (expression instanceof Any any) { // exact match to short-circuit DFA evaluation byte[] prefix = new byte[0]; byte[] suffix = new byte[0]; - List middle = new ArrayList<>(); - for (int i = 0; i < 
parsed.size(); i++) { - Pattern expression = parsed.get(i); - - if (i == 0) { - if (expression instanceof Literal literal) { - prefix = literal.value().getBytes(UTF_8); - continue; - } - } - else if (i == parsed.size() - 1) { - if (expression instanceof Literal literal) { - suffix = literal.value().getBytes(UTF_8); - continue; - } - } - middle.add(expression); + int patternStart = 0; + int patternEnd = parsed.size() - 1; + if (parsed.size() > 0 && parsed.get(0) instanceof Literal literal) { + prefix = literal.value().getBytes(UTF_8); + patternStart++; + } + + if (parsed.size() > 1 && parsed.get(parsed.size() - 1) instanceof Literal literal) { + suffix = literal.value().getBytes(UTF_8); + patternEnd--; } // If the pattern (after excluding constant prefix/suffixes) ends with an unbounded match (i.e., %) @@ -130,30 +124,19 @@ else if (i == parsed.size() - 1) { // is no need to consume the remaining input // This section determines whether the pattern is a candidate for non-exact match. boolean exact = true; // whether to match to the end of the input - if (!middle.isEmpty()) { - // guaranteed to be Any because any Literal would've been turned into a suffix above - Any last = (Any) middle.get(middle.size() - 1); - if (last.unbounded()) { - exact = false; - - // Since the matcher will stop early, no need for an unbounded matcher (it produces a simpler DFA) - if (last.min() == 0) { - // We'd end up with an empty string match at the end, so just remove it - middle.remove(middle.size() - 1); - } - else { - middle.set(middle.size() - 1, new Any(last.min(), false)); - } - } + if (patternStart <= patternEnd && parsed.get(patternEnd) instanceof Pattern.ZeroOrMore) { + // guaranteed to be Any or ZeroOrMore because any Literal would've been turned into a suffix above + exact = false; + patternEnd--; } Optional matcher = Optional.empty(); - if (!middle.isEmpty()) { + if (patternStart <= patternEnd) { if (optimize) { - matcher = Optional.of(new DenseDfaMatcher(middle, exact)); + 
matcher = Optional.of(new DenseDfaMatcher(parsed, patternStart, patternEnd, exact)); } else { - matcher = Optional.of(new NfaMatcher(middle, exact)); + matcher = Optional.of(new NfaMatcher(parsed, patternStart, patternEnd, exact)); } } @@ -230,9 +213,13 @@ static List parse(String pattern, Optional escape) else if (escape.isPresent() && character == escape.get()) { inEscape = true; - if (anyUnbounded || anyCount != 0) { - result.add(new Any(anyCount, anyUnbounded)); + if (anyCount != 0) { + result.add(new Any(anyCount)); anyCount = 0; + } + + if (anyUnbounded) { + result.add(new Pattern.ZeroOrMore()); anyUnbounded = false; } } @@ -250,9 +237,13 @@ else if (character == '%' || character == '_') { } } else { - if (anyUnbounded || anyCount != 0) { - result.add(new Any(anyCount, anyUnbounded)); + if (anyCount != 0) { + result.add(new Any(anyCount)); anyCount = 0; + } + + if (anyUnbounded) { + result.add(new Pattern.ZeroOrMore()); anyUnbounded = false; } @@ -267,8 +258,14 @@ else if (character == '%' || character == '_') { if (literal.length() != 0) { result.add(new Literal(literal.toString())); } - else if (anyUnbounded || anyCount != 0) { - result.add(new Any(anyCount, anyUnbounded)); + else { + if (anyCount != 0) { + result.add(new Any(anyCount)); + } + + if (anyUnbounded) { + result.add(new Pattern.ZeroOrMore()); + } } return result; diff --git a/core/trino-main/src/main/java/io/trino/likematcher/NfaMatcher.java b/core/trino-main/src/main/java/io/trino/likematcher/NfaMatcher.java index 122556f9d795..c5beff515924 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/NfaMatcher.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/NfaMatcher.java @@ -30,11 +30,11 @@ final class NfaMatcher private final int acceptState; private final int stateCount; - public NfaMatcher(List pattern, boolean exact) + public NfaMatcher(List pattern, int start, int end, boolean exact) { this.exact = exact; - stateCount = calculateStateCount(pattern); + stateCount = 
calculateStateCount(pattern, start, end); loopback = new boolean[stateCount]; match = new int[stateCount]; @@ -42,33 +42,34 @@ public NfaMatcher(List pattern, boolean exact) acceptState = stateCount - 1; int state = 0; - for (Pattern element : pattern) { + for (int j = start; j <= end; j++) { + Pattern element = pattern.get(j); if (element instanceof Pattern.Literal literal) { for (int i = 0; i < literal.value().length(); i++) { match[state++] = literal.value().charAt(i); } } else if (element instanceof Pattern.Any any) { - for (int i = 0; i < any.min(); i++) { + for (int i = 0; i < any.length(); i++) { match[state++] = ANY; } - - if (any.unbounded()) { - loopback[state] = true; - } + } + else if (element instanceof Pattern.ZeroOrMore) { + loopback[state] = true; } } } - private static int calculateStateCount(List pattern) + private static int calculateStateCount(List pattern, int start, int end) { int states = 1; - for (Pattern element : pattern) { + for (int i = start; i <= end; i++) { + Pattern element = pattern.get(i); if (element instanceof Pattern.Literal literal) { states += literal.value().length(); } else if (element instanceof Pattern.Any any) { - states += any.min(); + states += any.length(); } } return states; diff --git a/core/trino-main/src/main/java/io/trino/likematcher/Pattern.java b/core/trino-main/src/main/java/io/trino/likematcher/Pattern.java index dbf6cea13cbd..cd9b6947539c 100644 --- a/core/trino-main/src/main/java/io/trino/likematcher/Pattern.java +++ b/core/trino-main/src/main/java/io/trino/likematcher/Pattern.java @@ -13,11 +13,12 @@ */ package io.trino.likematcher; +import com.google.common.base.Strings; + import static com.google.common.base.Preconditions.checkArgument; -import static java.lang.String.format; sealed interface Pattern - permits Pattern.Any, Pattern.Literal + permits Pattern.Any, Pattern.Literal, Pattern.ZeroOrMore { record Literal(String value) implements Pattern @@ -29,18 +30,28 @@ public String toString() } } - record 
Any(int min, boolean unbounded) + record ZeroOrMore() + implements Pattern + { + @Override + public String toString() + { + return "%"; + } + } + + record Any(int length) implements Pattern { public Any { - checkArgument(min > 0 || unbounded, "Any must be unbounded or require at least 1 character"); + checkArgument(length > 0, "Length must be > 0"); } @Override public String toString() { - return format("{%s%s}", min, unbounded ? "+" : ""); + return Strings.repeat("_", length); } } }