diff --git a/presto-main/src/main/java/com/facebook/presto/likematcher/DFA.java b/presto-main/src/main/java/com/facebook/presto/likematcher/DFA.java new file mode 100644 index 0000000000000..868b32fd7132a --- /dev/null +++ b/presto-main/src/main/java/com/facebook/presto/likematcher/DFA.java @@ -0,0 +1,152 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.likematcher; + +import it.unimi.dsi.fastutil.ints.IntArrayList; + +import java.util.ArrayList; +import java.util.List; + +public class DFA +{ + private final int start; + private final IntArrayList acceptStates; + private final List> transitions; + + // Constructor + public DFA(int start, IntArrayList acceptStates, List> transitions) + { + this.start = start; + this.acceptStates = acceptStates; + this.transitions = transitions; + } + + // Getters + public int getStart() + { + return start; + } + + public IntArrayList getAcceptStates() + { + return acceptStates; + } + + public List> getTransitions() + { + return transitions; + } + + public List transitions(State state) + { + return transitions.get(state.getId()); + } + + public static class State + { + private final int id; + private final String label; + private final boolean accept; + + public State(int id, String label, boolean accept) + { + this.id = id; + this.label = label; + this.accept = accept; + } + + public int getId() + { + return id; + } + + public String getLabel() + { + return label; + } + + public boolean 
isAccept() + { + return accept; + } + + @Override + public String toString() + { + return String.format("%d:%s%s", id, accept ? "*" : "", label); + } + } + + public static class Transition + { + private final int value; + private final int target; + + public Transition(int value, int target) + { + this.value = value; + this.target = target; + } + + public int getValue() + { + return value; + } + + public int getTarget() + { + return target; + } + + @Override + public String toString() + { + return String.format("-[%s]-> %s", value, target); + } + } + + public static class Builder + { + private int nextId; + private int start; + private final IntArrayList acceptStates = new IntArrayList(); + private final List> transitions = new ArrayList<>(); + + public int addState(boolean accept) + { + int state = nextId++; + transitions.add(new ArrayList<>()); + if (accept) { + acceptStates.add(state); + } + return state; + } + + public int addStartState(boolean accept) + { + start = addState(accept); + return start; + } + + public void addTransition(int from, int value, int to) + { + transitions.get(from).add(new Transition(value, to)); + } + + public DFA build() + { + return new DFA(start, acceptStates, transitions); + } + } +} diff --git a/presto-main/src/main/java/com/facebook/presto/likematcher/DenseDfaMatcher.java b/presto-main/src/main/java/com/facebook/presto/likematcher/DenseDfaMatcher.java new file mode 100644 index 0000000000000..693124ffee211 --- /dev/null +++ b/presto-main/src/main/java/com/facebook/presto/likematcher/DenseDfaMatcher.java @@ -0,0 +1,218 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.likematcher; + +import java.util.Arrays; +import java.util.List; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Objects.requireNonNull; + +class DenseDfaMatcher + implements Matcher +{ + public static final int FAIL_STATE = -1; + + private final List pattern; + private final int start; + private final int end; + private final boolean exact; + + private volatile DenseDfa matcher; + + public DenseDfaMatcher(List pattern, int start, int end, boolean exact) + { + this.pattern = requireNonNull(pattern, "pattern is null"); + this.start = start; + this.end = end; + this.exact = exact; + } + + @Override + public boolean match(byte[] input, int offset, int length) + { + DenseDfa matcher = this.matcher; + if (matcher == null) { + matcher = DenseDfa.newInstance(pattern, start, end); + this.matcher = matcher; + } + + if (exact) { + return matcher.exactMatch(input, offset, length); + } + + return matcher.prefixMatch(input, offset, length); + } + + private static class DenseDfa + { + // The DFA is encoded as a sequence of transitions for each possible byte value for each state. + // I.e., 256 transitions per state. + // The content of the transitions array is the base offset into + // the next state to follow. 
I.e., the desired state * 256 + private final int[] transitions; + + // The starting state + private final int start; + + // For each state, whether it's an accepting state + private final boolean[] accept; + + public static DenseDfa newInstance(List pattern, int start, int end) + { + DFA dfa = makeNfa(pattern, start, end).toDfa(); + + int[] transitions = new int[dfa.getTransitions().size() * 256]; + Arrays.fill(transitions, FAIL_STATE); + + for (int state = 0; state < dfa.getTransitions().size(); state++) { + for (DFA.Transition transition : dfa.getTransitions().get(state)) { + transitions[state * 256 + transition.getValue()] = transition.getTarget() * 256; + } + } + boolean[] accept = new boolean[dfa.getTransitions().size()]; + for (int state : dfa.getAcceptStates()) { + accept[state] = true; + } + + return new DenseDfa(transitions, dfa.getStart(), accept); + } + + private DenseDfa(int[] transitions, int start, boolean[] accept) + { + this.transitions = transitions; + this.start = start; + this.accept = accept; + } + + /** + * Returns a positive match when the final state after all input has been consumed is an accepting state + */ + public boolean exactMatch(byte[] input, int offset, int length) + { + int state = start << 8; + for (int i = offset; i < offset + length; i++) { + byte inputByte = input[i]; + state = transitions[state | (inputByte & 0xFF)]; + + if (state == FAIL_STATE) { + return false; + } + } + + return accept[state >>> 8]; + } + + /** + * Returns a positive match as soon as the DFA reaches an accepting state, regardless of whether + * the whole input has been consumed + */ + public boolean prefixMatch(byte[] input, int offset, int length) + { + int state = start << 8; + for (int i = offset; i < offset + length; i++) { + byte inputByte = input[i]; + state = transitions[state | (inputByte & 0xFF)]; + + if (state == FAIL_STATE) { + return false; + } + + if (accept[state >>> 8]) { + return true; + } + } + + return accept[state >>> 8]; + } + + private 
static NFA makeNfa(List pattern, int start, int end) + { + checkArgument(!pattern.isEmpty(), "pattern is empty"); + + NFA.Builder builder = new NFA.Builder(); + + int state = builder.addStartState(); + + for (int e = start; e <= end; e++) { + Pattern item = pattern.get(e); + if (item instanceof Pattern.Literal) { + Pattern.Literal literal = (Pattern.Literal) item; + for (byte current : literal.getValue().getBytes(UTF_8)) { + state = matchByte(builder, state, current); + } + } + else if (item instanceof Pattern.Any) { + Pattern.Any any = (Pattern.Any) item; + for (int i = 0; i < any.getLength(); i++) { + int next = builder.addState(); + matchSingleUtf8(builder, state, next); + state = next; + } + } + else if (item instanceof Pattern.ZeroOrMore) { + matchSingleUtf8(builder, state, state); + } + else { + throw new UnsupportedOperationException("Not supported: " + item.getClass().getName()); + } + } + + builder.setAccept(state); + + return builder.build(); + } + + private static int matchByte(NFA.Builder builder, int state, byte value) + { + int next = builder.addState(); + builder.addTransition(state, new NFA.Value(value), next); + return next; + } + + private static void matchSingleUtf8(NFA.Builder builder, int from, int to) + { + /* + Implements a state machine to recognize UTF-8 characters. 
+ + 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + O ───────────► O ───────────► O ───────────► O ───────────► O + │ ▲ ▲ ▲ + ├─────────────────────────────┘ │ │ + │ 1110xxxx │ │ + │ │ │ + ├────────────────────────────────────────────┘ │ + │ 110xxxxx │ + │ │ + └───────────────────────────────────────────────────────────┘ + 0xxxxxxx + */ + + builder.addTransition(from, new NFA.Prefix(0, 1), to); + + int state1 = builder.addState(); + int state2 = builder.addState(); + int state3 = builder.addState(); + + builder.addTransition(from, new NFA.Prefix(0b11110, 5), state1); + builder.addTransition(from, new NFA.Prefix(0b1110, 4), state2); + builder.addTransition(from, new NFA.Prefix(0b110, 3), state3); + + builder.addTransition(state1, new NFA.Prefix(0b10, 2), state2); + builder.addTransition(state2, new NFA.Prefix(0b10, 2), state3); + builder.addTransition(state3, new NFA.Prefix(0b10, 2), to); + } + } +} diff --git a/presto-main/src/main/java/com/facebook/presto/likematcher/LikeMatcher.java b/presto-main/src/main/java/com/facebook/presto/likematcher/LikeMatcher.java new file mode 100644 index 0000000000000..7487e2f5202bf --- /dev/null +++ b/presto-main/src/main/java/com/facebook/presto/likematcher/LikeMatcher.java @@ -0,0 +1,274 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.facebook.presto.likematcher; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.OptionalInt; + +import static java.nio.charset.StandardCharsets.UTF_8; + +public class LikeMatcher +{ + private final String pattern; + private final Optional escape; + + private final int minSize; + private final OptionalInt maxSize; + private final byte[] prefix; + private final byte[] suffix; + private final Optional matcher; + + private LikeMatcher( + String pattern, + Optional escape, + int minSize, + OptionalInt maxSize, + byte[] prefix, + byte[] suffix, + Optional matcher) + { + this.pattern = pattern; + this.escape = escape; + this.minSize = minSize; + this.maxSize = maxSize; + this.prefix = prefix; + this.suffix = suffix; + this.matcher = matcher; + } + + public String getPattern() + { + return pattern; + } + + public Optional getEscape() + { + return escape; + } + + public static LikeMatcher compile(String pattern) + { + return compile(pattern, Optional.empty(), true); + } + + public static LikeMatcher compile(String pattern, Optional escape) + { + return compile(pattern, escape, true); + } + + public static LikeMatcher compile(String pattern, Optional escape, boolean optimize) + { + List parsed = parse(pattern, escape); + + // Calculate minimum and maximum size for candidate strings + // This is used for short-circuiting the match if the size of + // the input is outside those bounds + int minSize = 0; + int maxSize = 0; + boolean unbounded = false; + for (Pattern expression : parsed) { + if (expression instanceof Pattern.Literal) { + Pattern.Literal literal = (Pattern.Literal) expression; + int length = literal.getValue().getBytes(UTF_8).length; + minSize += length; + maxSize += length; + } + else if (expression instanceof Pattern.ZeroOrMore) { + unbounded = true; + } + else if (expression instanceof Pattern.Any) { + Pattern.Any any = (Pattern.Any) expression; + int length = any.getLength(); + minSize += 
length; + maxSize += length * 4; // at most 4 bytes for a single UTF-8 codepoint + } + else { + throw new UnsupportedOperationException("Not supported: " + expression.getClass().getName()); + } + } + + // Calculate exact match prefix and suffix + // If the pattern starts and ends with a literal, we can perform a quick + // exact match to short-circuit DFA evaluation + byte[] prefix = new byte[0]; + byte[] suffix = new byte[0]; + + int patternStart = 0; + int patternEnd = parsed.size() - 1; + if (parsed.size() > 0 && parsed.get(0) instanceof Pattern.Literal) { + Pattern.Literal literal = (Pattern.Literal) parsed.get(0); + prefix = literal.getValue().getBytes(UTF_8); + patternStart++; + } + + if (parsed.size() > 1 && parsed.get(parsed.size() - 1) instanceof Pattern.Literal) { + Pattern.Literal literal = (Pattern.Literal) parsed.get(parsed.size() - 1); + suffix = literal.getValue().getBytes(UTF_8); + patternEnd--; + } + + // If the pattern (after excluding constant prefix/suffixes) ends with an unbounded match (i.e., %) + // we can perform a non-exact match and end as soon as the DFA reaches an accept state -- there + // is no need to consume the remaining input + // This section determines whether the pattern is a candidate for non-exact match. + boolean exact = true; // whether to match to the end of the input + if (patternStart <= patternEnd && parsed.get(patternEnd) instanceof Pattern.ZeroOrMore) { + // guaranteed to be Any or ZeroOrMore because any Literal would've been turned into a suffix above + exact = false; + patternEnd--; + } + + Optional matcher = Optional.empty(); + if (patternStart <= patternEnd) { + if (optimize) { + matcher = Optional.of(new DenseDfaMatcher(parsed, patternStart, patternEnd, exact)); + } + else { + matcher = Optional.of(new NfaMatcher(parsed, patternStart, patternEnd, exact)); + } + } + + return new LikeMatcher( + pattern, + escape, + minSize, + unbounded ? 
OptionalInt.empty() : OptionalInt.of(maxSize), + prefix, + suffix, + matcher); + } + + public boolean match(byte[] input) + { + return match(input, 0, input.length); + } + + public boolean match(byte[] input, int offset, int length) + { + if (length < minSize) { + return false; + } + + if (maxSize.isPresent() && length > maxSize.getAsInt()) { + return false; + } + + if (!startsWith(prefix, input, offset)) { + return false; + } + + if (!startsWith(suffix, input, offset + length - suffix.length)) { + return false; + } + + if (matcher.isPresent()) { + return matcher.get().match(input, offset + prefix.length, length - suffix.length - prefix.length); + } + + return true; + } + + private boolean startsWith(byte[] pattern, byte[] input, int offset) + { + for (int i = 0; i < pattern.length; i++) { + if (pattern[i] != input[offset + i]) { + return false; + } + } + + return true; + } + + static List parse(String pattern, Optional escape) + { + List result = new ArrayList<>(); + + StringBuilder literal = new StringBuilder(); + int anyCount = 0; + boolean anyUnbounded = false; + boolean inEscape = false; + for (int i = 0; i < pattern.length(); i++) { + char character = pattern.charAt(i); + + if (inEscape) { + if (character != '%' && character != '_' && character != escape.get()) { + throw new IllegalArgumentException("Escape character must be followed by '%', '_' or the escape character itself"); + } + + literal.append(character); + inEscape = false; + } + else if (escape.isPresent() && character == escape.get()) { + inEscape = true; + + if (anyCount != 0) { + result.add(new Pattern.Any(anyCount)); + anyCount = 0; + } + + if (anyUnbounded) { + result.add(new Pattern.ZeroOrMore()); + anyUnbounded = false; + } + } + else if (character == '%' || character == '_') { + if (literal.length() != 0) { + result.add(new Pattern.Literal(literal.toString())); + literal.setLength(0); + } + + if (character == '%') { + anyUnbounded = true; + } + else { + anyCount++; + } + } + else { + if 
(anyCount != 0) { + result.add(new Pattern.Any(anyCount)); + anyCount = 0; + } + + if (anyUnbounded) { + result.add(new Pattern.ZeroOrMore()); + anyUnbounded = false; + } + + literal.append(character); + } + } + + if (inEscape) { + throw new IllegalArgumentException("Escape character must be followed by '%', '_' or the escape character itself"); + } + + if (literal.length() != 0) { + result.add(new Pattern.Literal(literal.toString())); + } + else { + if (anyCount != 0) { + result.add(new Pattern.Any(anyCount)); + } + + if (anyUnbounded) { + result.add(new Pattern.ZeroOrMore()); + } + } + + return result; + } +} diff --git a/presto-main/src/main/java/com/facebook/presto/likematcher/Matcher.java b/presto-main/src/main/java/com/facebook/presto/likematcher/Matcher.java new file mode 100644 index 0000000000000..f417c2eeb4d93 --- /dev/null +++ b/presto-main/src/main/java/com/facebook/presto/likematcher/Matcher.java @@ -0,0 +1,19 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Strategy for testing a range of bytes against a compiled pattern.
 */
public interface Matcher
{
    /**
     * Tests {@code length} bytes of {@code input}, beginning at index {@code offset}.
     *
     * @param input the array holding the bytes to test
     * @param offset index of the first byte to test
     * @param length number of bytes to test
     * @return {@code true} if the given byte range matches
     */
    boolean match(byte[] input, int offset, int length);
}
+ */ +package com.facebook.presto.likematcher; + +import it.unimi.dsi.fastutil.ints.IntArraySet; +import it.unimi.dsi.fastutil.ints.IntSet; + +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; + +import static java.util.Objects.requireNonNull; + +final class NFA +{ + private final int start; + private final int accept; + private final List> transitions; + + private NFA(int start, int accept, List> transitions) + { + this.start = start; + this.accept = accept; + this.transitions = requireNonNull(transitions, "transitions is null"); + } + + public DFA toDfa() + { + Map activeStates = new HashMap<>(); + + DFA.Builder builder = new DFA.Builder(); + + IntSet initial = new IntArraySet(); + initial.add(start); + Queue queue = new ArrayDeque<>(); + queue.add(initial); + + int dfaStartState = builder.addStartState(initial.contains(accept)); + activeStates.put(initial, dfaStartState); + + Set visited = new HashSet<>(); + while (!queue.isEmpty()) { + IntSet current = queue.poll(); + + if (!visited.add(current)) { + continue; + } + + // For each possible byte value... 
+ for (int byteValue = 0; byteValue < 256; byteValue++) { + IntSet next = new IntArraySet(); + for (int nfaState : current) { + for (Transition transition : transitions(nfaState)) { + Condition condition = transition.getCondition(); + int target = transition.getTarget(); + + if (condition instanceof Value && ((Value) condition).getValue() == (byte) byteValue) { + next.add(target); + } + else if (condition instanceof Prefix) { + Prefix prefixTransition = (Prefix) condition; + if (byteValue >>> (8 - prefixTransition.getBits()) == prefixTransition.getPrefix()) { + next.add(target); + } + } + } + } + + if (!next.isEmpty()) { + int from = activeStates.get(current); + int to = activeStates.computeIfAbsent(next, nfaStates -> builder.addState(nfaStates.contains(accept))); + builder.addTransition(from, byteValue, to); + + queue.add(next); + } + } + } + + return builder.build(); + } + + private List transitions(int state) + { + return transitions.get(state); + } + + public static class Builder + { + private int nextId; + private int start; + private int accept; + private final List> transitions = new ArrayList<>(); + + public int addState() + { + transitions.add(new ArrayList<>()); + return nextId++; + } + + public int addStartState() + { + start = addState(); + return start; + } + + public void setAccept(int state) + { + accept = state; + } + + public void addTransition(int from, Condition condition, int to) + { + transitions.get(from).add(new Transition(to, condition)); + } + + public NFA build() + { + return new NFA(start, accept, transitions); + } + } + + public static class Transition + { + private final int target; + private final Condition condition; + + public Transition(int target, Condition condition) + { + this.target = target; + this.condition = condition; + } + + public int getTarget() + { + return target; + } + + public Condition getCondition() + { + return condition; + } + + // Implement equals(), hashCode(), and toString() as needed + // ... 
+ } + + public interface Condition + { + // methods, if any + } + + public static class Value + implements Condition + { + private final byte value; + + public Value(byte value) + { + this.value = value; + } + + public byte getValue() + { + return value; + } + + // Implement equals(), hashCode(), and toString() as needed + // ... + } + + public static class Prefix + implements Condition + { + private final int prefix; + private final int bits; + + public Prefix(int prefix, int bits) + { + this.prefix = prefix; + this.bits = bits; + } + + public int getPrefix() + { + return prefix; + } + + public int getBits() + { + return bits; + } + + // Implement equals(), hashCode(), and toString() as needed + // ... + } +} diff --git a/presto-main/src/main/java/com/facebook/presto/likematcher/NfaMatcher.java b/presto-main/src/main/java/com/facebook/presto/likematcher/NfaMatcher.java new file mode 100644 index 0000000000000..0593d43452ca1 --- /dev/null +++ b/presto-main/src/main/java/com/facebook/presto/likematcher/NfaMatcher.java @@ -0,0 +1,167 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.facebook.presto.likematcher; + +import java.util.Arrays; +import java.util.List; + +final class NfaMatcher + implements Matcher +{ + private static final int ANY = -1; + private static final int NONE = -2; + private static final int INVALID_CODEPOINT = -1; + + private final boolean exact; + + private final boolean[] loopback; + private final int[] match; + private final int acceptState; + private final int stateCount; + + public NfaMatcher(List pattern, int start, int end, boolean exact) + { + this.exact = exact; + + stateCount = calculateStateCount(pattern, start, end); + + loopback = new boolean[stateCount]; + match = new int[stateCount]; + Arrays.fill(match, NONE); + acceptState = stateCount - 1; + + int state = 0; + for (int j = start; j <= end; j++) { + Pattern element = pattern.get(j); + if (element instanceof Pattern.Literal) { + Pattern.Literal literal = (Pattern.Literal) element; + for (int i = 0; i < literal.getValue().length(); i++) { + match[state++] = literal.getValue().charAt(i); + } + } + else if (element instanceof Pattern.Any) { + Pattern.Any any = (Pattern.Any) element; + for (int i = 0; i < any.getLength(); i++) { + match[state++] = ANY; + } + } + else if (element instanceof Pattern.ZeroOrMore) { + loopback[state] = true; + } + } + } + + private static int calculateStateCount(List pattern, int start, int end) + { + int states = 1; + for (int i = start; i <= end; i++) { + Pattern element = pattern.get(i); + if (element instanceof Pattern.Literal) { + Pattern.Literal literal = (Pattern.Literal) element; + states += literal.getValue().length(); + } + else if (element instanceof Pattern.Any) { + Pattern.Any any = (Pattern.Any) element; + states += any.getLength(); + } + } + return states; + } + + @Override + public boolean match(byte[] input, int offset, int length) + { + boolean[] seen = new boolean[stateCount + 1]; + int[] currentStates = new int[stateCount]; + int[] nextStates = new int[stateCount]; + int currentStatesIndex = 0; 
+ int nextStatesIndex; + + currentStates[currentStatesIndex++] = 0; + + int limit = offset + length; + int current = offset; + boolean accept = false; + while (current < limit) { + int codepoint = INVALID_CODEPOINT; + + // decode the next UTF-8 codepoint + int header = input[current] & 0xFF; + if (header < 0x80) { + // normal ASCII + // 0xxx_xxxx + codepoint = header; + current++; + } + else if ((header & 0b1110_0000) == 0b1100_0000) { + // 110x_xxxx 10xx_xxxx + if (current + 1 < limit) { + codepoint = ((header & 0b0001_1111) << 6) | (input[current + 1] & 0b0011_1111); + current += 2; + } + } + else if ((header & 0b1111_0000) == 0b1110_0000) { + // 1110_xxxx 10xx_xxxx 10xx_xxxx + if (current + 2 < limit) { + codepoint = ((header & 0b0000_1111) << 12) | ((input[current + 1] & 0b0011_1111) << 6) | (input[current + 2] & 0b0011_1111); + current += 3; + } + } + else if ((header & 0b1111_1000) == 0b1111_0000) { + // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + if (current + 3 < limit) { + codepoint = ((header & 0b0000_0111) << 18) | ((input[current + 1] & 0b0011_1111) << 12) | ((input[current + 2] & 0b0011_1111) << 6) | (input[current + 3] & 0b0011_1111); + current += 4; + } + } + + if (codepoint == INVALID_CODEPOINT) { + return false; + } + + accept = false; + nextStatesIndex = 0; + Arrays.fill(seen, false); + for (int i = 0; i < currentStatesIndex; i++) { + int state = currentStates[i]; + if (!seen[state] && loopback[state]) { + nextStates[nextStatesIndex++] = state; + accept |= state == acceptState; + seen[state] = true; + } + int next = state + 1; + if (!seen[next] && (match[state] == ANY || match[state] == codepoint)) { + nextStates[nextStatesIndex++] = next; + accept |= next == acceptState; + seen[next] = true; + } + } + + if (nextStatesIndex == 0) { + return false; + } + + if (!exact && accept) { + return true; + } + + int[] tmp = currentStates; + currentStates = nextStates; + nextStates = tmp; + currentStatesIndex = nextStatesIndex; + } + + return accept; + } +} 
diff --git a/presto-main/src/main/java/com/facebook/presto/likematcher/Pattern.java b/presto-main/src/main/java/com/facebook/presto/likematcher/Pattern.java new file mode 100644 index 0000000000000..e17c6050cdb85 --- /dev/null +++ b/presto-main/src/main/java/com/facebook/presto/likematcher/Pattern.java @@ -0,0 +1,102 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.likematcher; + +import com.google.common.base.Strings; + +import java.util.Objects; + +import static com.google.common.base.Preconditions.checkArgument; + +public interface Pattern +{ + class Literal + implements Pattern + { + private final String value; + + public Literal(String value) + { + this.value = value; + } + + public String getValue() + { + return value; + } + + @Override + public String toString() + { + return value; + } + + // You can add equals, hashCode, and other utility methods if required + } + + public class ZeroOrMore + implements Pattern + { + @Override + public String toString() + { + return "%"; + } + + // Equals and hashCode methods might be required based on usage, + // the record provided them by default + @Override + public boolean equals(Object obj) + { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + return true; + } + + @Override + public int hashCode() + { + return Objects.hash(); // No fields to hash in this case + } + } + + class Any + implements Pattern + { + 
private final int length; + + public Any(int length) + { + checkArgument(length > 0, "Length must be > 0"); + this.length = length; + } + + public int getLength() + { + return length; + } + + @Override + public String toString() + { + return Strings.repeat("_", length); + } + + // You can add equals, hashCode, and other utility methods if required + } +} diff --git a/presto-main/src/main/java/com/facebook/presto/sql/planner/ExpressionInterpreter.java b/presto-main/src/main/java/com/facebook/presto/sql/planner/ExpressionInterpreter.java index c644ce5152cbd..05393ecd92401 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/planner/ExpressionInterpreter.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/planner/ExpressionInterpreter.java @@ -29,6 +29,7 @@ import com.facebook.presto.common.type.Type; import com.facebook.presto.common.type.TypeUtils; import com.facebook.presto.expressions.DynamicFilters; +import com.facebook.presto.likematcher.LikeMatcher; import com.facebook.presto.metadata.FunctionAndTypeManager; import com.facebook.presto.metadata.Metadata; import com.facebook.presto.operator.scalar.ArraySubscriptOperator; @@ -98,7 +99,6 @@ import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.primitives.Primitives; -import io.airlift.joni.Regex; import io.airlift.slice.Slice; import java.lang.invoke.MethodHandle; @@ -176,7 +176,7 @@ public class ExpressionInterpreter private final Visitor visitor; // identity-based cache for LIKE expressions with constant pattern and escape char - private final IdentityHashMap likePatternCache = new IdentityHashMap<>(); + private final IdentityHashMap likePatternCache = new IdentityHashMap<>(); private final IdentityHashMap> inListCache = new IdentityHashMap<>(); public static ExpressionInterpreter expressionInterpreter(Expression expression, Metadata metadata, Session session, Map, Type> expressionTypes) @@ -1061,15 +1061,15 @@ protected Object 
visitLikePredicate(LikePredicate node, Object context) if (value instanceof Slice && pattern instanceof Slice && (escape == null || escape instanceof Slice)) { - Regex regex; + LikeMatcher matcher; if (escape == null) { - regex = LikeFunctions.likePattern((Slice) pattern); + matcher = LikeMatcher.compile(((Slice) pattern).toStringUtf8(), Optional.empty()); } else { - regex = LikeFunctions.likePattern((Slice) pattern, (Slice) escape); + matcher = LikeFunctions.likePattern((Slice) pattern, (Slice) escape); } - return interpretLikePredicate(type(node.getValue()), (Slice) value, regex); + return interpretLikePredicate(type(node.getValue()), (Slice) value, matcher); } // if pattern is a constant without % or _ replace with a comparison @@ -1103,9 +1103,9 @@ protected Object visitLikePredicate(LikePredicate node, Object context) optimizedEscape); } - private Regex getConstantPattern(LikePredicate node) + private LikeMatcher getConstantPattern(LikePredicate node) { - Regex result = likePatternCache.get(node); + LikeMatcher result = likePatternCache.get(node); if (result == null) { StringLiteral pattern = (StringLiteral) node.getPattern(); @@ -1115,7 +1115,7 @@ private Regex getConstantPattern(LikePredicate node) result = LikeFunctions.likePattern(pattern.getSlice(), escape); } else { - result = LikeFunctions.likePattern(pattern.getSlice()); + result = LikeMatcher.compile(pattern.getValue(), Optional.empty()); } likePatternCache.put(node, result); diff --git a/presto-main/src/main/java/com/facebook/presto/sql/planner/Interpreters.java b/presto-main/src/main/java/com/facebook/presto/sql/planner/Interpreters.java index a20601473c632..39660d859c84e 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/planner/Interpreters.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/planner/Interpreters.java @@ -17,9 +17,9 @@ import com.facebook.presto.common.type.CharType; import com.facebook.presto.common.type.Type; import 
com.facebook.presto.common.type.VarcharType; +import com.facebook.presto.likematcher.LikeMatcher; import com.facebook.presto.spi.relation.VariableReferenceExpression; import com.facebook.presto.type.LikeFunctions; -import io.airlift.joni.Regex; import io.airlift.slice.Slice; import java.util.Map; @@ -58,14 +58,14 @@ else if (!javaType.isPrimitive()) { throw new UnsupportedOperationException("Dereference a unsupported primitive type: " + javaType.getName()); } - static boolean interpretLikePredicate(Type valueType, Slice value, Regex regex) + static boolean interpretLikePredicate(Type valueType, Slice value, LikeMatcher matcher) { if (valueType instanceof VarcharType) { - return LikeFunctions.likeVarchar(value, regex); + return LikeFunctions.likeVarchar(value, matcher); } checkState(valueType instanceof CharType, "LIKE value is neither VARCHAR or CHAR"); - return LikeFunctions.likeChar((long) ((CharType) valueType).getLength(), value, regex); + return LikeFunctions.likeChar((long) ((CharType) valueType).getLength(), value, matcher); } public static class LambdaVariableResolver diff --git a/presto-main/src/main/java/com/facebook/presto/sql/planner/RowExpressionInterpreter.java b/presto-main/src/main/java/com/facebook/presto/sql/planner/RowExpressionInterpreter.java index a4ee88c1a327e..43e9f9d421013 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/planner/RowExpressionInterpreter.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/planner/RowExpressionInterpreter.java @@ -23,6 +23,7 @@ import com.facebook.presto.common.type.RowType; import com.facebook.presto.common.type.Type; import com.facebook.presto.common.type.TypeSignature; +import com.facebook.presto.likematcher.LikeMatcher; import com.facebook.presto.metadata.FunctionAndTypeManager; import com.facebook.presto.metadata.Metadata; import com.facebook.presto.spi.ConnectorSession; @@ -47,7 +48,6 @@ import com.facebook.presto.util.Failures; import com.google.common.collect.ImmutableList; 
import com.google.common.primitives.Primitives; -import io.airlift.joni.Regex; import io.airlift.slice.Slice; import java.lang.invoke.MethodHandle; @@ -904,8 +904,8 @@ private SpecialCallResult tryHandleLike(CallExpression callExpression, List getEscapeChar(Slice escape) { String escapeString = escape.toStringUtf8(); if (escapeString.isEmpty()) { // escaping disabled - return (char) -1; // invalid character + return Optional.empty(); // no escape character } if (escapeString.length() == 1) { - return escapeString.charAt(0); + return Optional.of(escapeString.charAt(0)); } throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Escape string must be a single character"); } diff --git a/presto-main/src/main/java/com/facebook/presto/type/LikePatternType.java b/presto-main/src/main/java/com/facebook/presto/type/LikePatternType.java index 7d1ab5ff7264e..5e7035ea443bf 100644 --- a/presto-main/src/main/java/com/facebook/presto/type/LikePatternType.java +++ b/presto-main/src/main/java/com/facebook/presto/type/LikePatternType.java @@ -19,8 +19,8 @@ import com.facebook.presto.common.function.SqlFunctionProperties; import com.facebook.presto.common.type.AbstractPrimitiveType; import com.facebook.presto.common.type.TypeSignature; +import com.facebook.presto.likematcher.LikeMatcher; import com.facebook.presto.spi.PrestoException; -import io.airlift.joni.Regex; import static com.facebook.presto.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR; @@ -32,7 +32,7 @@ public class LikePatternType public LikePatternType() { - super(new TypeSignature(NAME), Regex.class); + super(new TypeSignature(NAME), LikeMatcher.class); } @Override diff --git a/presto-main/src/test/java/com/facebook/presto/sql/BenchmarkLike.java b/presto-main/src/test/java/com/facebook/presto/sql/BenchmarkLike.java new file mode 100644 index 0000000000000..c4eb9ba3d8c99 --- /dev/null +++ b/presto-main/src/test/java/com/facebook/presto/sql/BenchmarkLike.java @@ -0,0 +1,205 @@ +/* + * Licensed under the Apache License, Version
2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//package com.facebook.presto.sql; +// +//import com.facebook.presto.likematcher.LikeMatcher; +//import io.airlift.jcodings.specific.NonStrictUTF8Encoding; +//import io.airlift.joni.Matcher; +//import io.airlift.joni.Option; +//import io.airlift.joni.Regex; +//import io.airlift.joni.Syntax; +//import io.airlift.slice.Slice; +//import io.airlift.slice.Slices; +//import io.trino.likematcher.LikeMatcher; +//import io.trino.type.JoniRegexp; +//import org.openjdk.jmh.annotations.Benchmark; +//import org.openjdk.jmh.annotations.BenchmarkMode; +//import org.openjdk.jmh.annotations.Fork; +//import org.openjdk.jmh.annotations.Measurement; +//import org.openjdk.jmh.annotations.OutputTimeUnit; +//import org.openjdk.jmh.annotations.Param; +//import org.openjdk.jmh.annotations.Setup; +//import org.openjdk.jmh.annotations.State; +//import org.openjdk.jmh.annotations.Warmup; +//import org.openjdk.jmh.results.format.ResultFormatType; +//import org.openjdk.jmh.runner.Runner; +//import org.openjdk.jmh.runner.RunnerException; +//import org.openjdk.jmh.runner.options.Options; +//import org.openjdk.jmh.runner.options.OptionsBuilder; +//import org.openjdk.jmh.runner.options.VerboseMode; +// +//import java.util.Optional; +// +//import static io.airlift.joni.constants.MetaChar.INEFFECTIVE_META_CHAR; +//import static io.airlift.joni.constants.SyntaxProperties.OP_ASTERISK_ZERO_INF; +//import static io.airlift.joni.constants.SyntaxProperties.OP_DOT_ANYCHAR; +//import static 
io.airlift.joni.constants.SyntaxProperties.OP_LINE_ANCHOR; +//import static java.nio.charset.StandardCharsets.UTF_8; +//import static java.util.concurrent.TimeUnit.MILLISECONDS; +//import static java.util.concurrent.TimeUnit.NANOSECONDS; +//import static org.openjdk.jmh.annotations.Mode.AverageTime; +//import static org.openjdk.jmh.annotations.Scope.Thread; +// +//@State(Thread) +//@OutputTimeUnit(NANOSECONDS) +//@BenchmarkMode(AverageTime) +//@Fork(3) +//@Warmup(iterations = 10, time = 500, timeUnit = MILLISECONDS) +//@Measurement(iterations = 30, time = 500, timeUnit = MILLISECONDS) +//public class BenchmarkLike +//{ +// private static final Syntax SYNTAX = new Syntax( +// OP_DOT_ANYCHAR | OP_ASTERISK_ZERO_INF | OP_LINE_ANCHOR, +// 0, +// 0, +// Option.NONE, +// new Syntax.MetaCharTable( +// '\\', /* esc */ +// INEFFECTIVE_META_CHAR, /* anychar '.' */ +// INEFFECTIVE_META_CHAR, /* anytime '*' */ +// INEFFECTIVE_META_CHAR, /* zero or one time '?' */ +// INEFFECTIVE_META_CHAR, /* one or more time '+' */ +// INEFFECTIVE_META_CHAR)); /* anychar anytime */ +// +// @State(Thread) +// public static class Data +// { +// @Param({ +// "%", +// "_%", +// "%_", +// "abc%", +// "%abc", +// "_____", +// "abc%def%ghi", +// "%abc%def%", +// }) +// private String pattern; +// +// private Slice data; +// private byte[] bytes; +// private Regex regex; +// private LikeMatcher matcher; +// +// @Setup +// public void setup() +// { +// data = Slices.utf8Slice( +// switch (pattern) { +// case "%" -> "qeroighqeorhgqerhb2eriuyerqiubgierubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhet"; +// case "_%", "%_" -> "qeroighqeorhgqerhb2eriuyerqiubgierubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhet"; +// case "abc%" -> "abcqeroighqeorhgqerhb2eriuyerqiubgierubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhet"; +// case "%abc" -> "qeroighqeorhgqerhb2eriuyerqiubgierubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhetabc"; +// case "_____" -> "abcde"; +// case "abc%def%ghi" -> "abc 
qeroighqeorhgqerhb2eriuyerqiubgier def ubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhet ghi"; +// case "%abc%def%" -> "fdnbqerbfklerqbgqjerbgkr abc qeroighqeorhgqerhb2eriuyerqiubgier def ubgleuqrbgilquebriuqebryqebrhqerhqsnajkbcowuhet"; +// default -> throw new IllegalArgumentException("Unknown pattern: " + pattern); +// }); +// +// matcher = LikeMatcher.compile(pattern, Optional.empty()); +// joniPattern = compileJoni(Slices.utf8Slice(pattern).toStringUtf8(), '0', false); +// +// bytes = data.getBytes(); +// } +// } +// +// @Benchmark +// public boolean benchmarkJoni(Data data) +// { +// return likeVarchar(data.data, data.joniPattern); +// } +// +// @Benchmark +// public boolean benchmarkCurrent(Data data) +// { +// return data.matcher.match(data.bytes, 0, data.bytes.length); +// } +// +// public static boolean likeVarchar(Slice value, JoniRegexp pattern) +// { +// Matcher matcher; +// int offset; +// if (value.hasByteArray()) { +// offset = value.byteArrayOffset(); +// matcher = pattern.regex().matcher(value.byteArray(), offset, offset + value.length()); +// } +// else { +// offset = 0; +// matcher = pattern.matcher(value.getBytes()); +// } +// return matcher.match(offset, offset + value.length(), Option.NONE) != -1; +// } +// +// private static JoniRegexp compileJoni(String patternString, char escapeChar, boolean shouldEscape) +// { +// byte[] bytes = likeToRegex(patternString, escapeChar, shouldEscape).getBytes(UTF_8); +// Regex joniRegex = new Regex(bytes, 0, bytes.length, Option.MULTILINE, NonStrictUTF8Encoding.INSTANCE, SYNTAX); +// return new JoniRegexp(Slices.wrappedBuffer(bytes), joniRegex); +// } +// +// private static String likeToRegex(String patternString, char escapeChar, boolean shouldEscape) +// { +// StringBuilder regex = new StringBuilder(patternString.length() * 2); +// +// regex.append('^'); +// boolean escaped = false; +// for (char currentChar : patternString.toCharArray()) { +// checkEscape(!escaped || currentChar == '%' || currentChar == '_' 
|| currentChar == escapeChar); +// if (shouldEscape && !escaped && (currentChar == escapeChar)) { +// escaped = true; +// } +// else { +// switch (currentChar) { +// case '%' -> { +// regex.append(escaped ? "%" : ".*"); +// escaped = false; +// } +// case '_' -> { +// regex.append(escaped ? "_" : "."); +// escaped = false; +// } +// default -> { +// // escape special regex characters +// switch (currentChar) { +// case '\\', '^', '$', '.', '*' -> regex.append('\\'); +// } +// regex.append(currentChar); +// escaped = false; +// } +// } +// } +// } +// checkEscape(!escaped); +// regex.append('$'); +// return regex.toString(); +// } +// +// private static void checkEscape(boolean condition) +// { +// checkCondition(condition, INVALID_FUNCTION_ARGUMENT, "Escape character must be followed by '%%', '_' or the escape character itself"); +// } +// +// public static void main(String[] args) +// throws RunnerException +// { +// Options options = new OptionsBuilder() +// .verbosity(VerboseMode.NORMAL) +// .include(".*" + BenchmarkLike.class.getSimpleName() + ".*") +// .resultFormat(ResultFormatType.JSON) +// .build(); +// +// new Runner(options).run(); +// } +//} diff --git a/presto-main/src/test/java/com/facebook/presto/sql/TestLikeFunctions.java b/presto-main/src/test/java/com/facebook/presto/sql/TestLikeFunctions.java index 496cd578519bf..b14611a1b432c 100644 --- a/presto-main/src/test/java/com/facebook/presto/sql/TestLikeFunctions.java +++ b/presto-main/src/test/java/com/facebook/presto/sql/TestLikeFunctions.java @@ -13,9 +13,9 @@ */ package com.facebook.presto.sql; +import com.facebook.presto.likematcher.LikeMatcher; import com.facebook.presto.operator.scalar.AbstractTestFunctions; import com.facebook.presto.spi.PrestoException; -import io.airlift.joni.Regex; import io.airlift.slice.Slice; import io.airlift.slice.Slices; import org.testng.annotations.Test; @@ -47,9 +47,9 @@ private static Slice offsetHeapSlice(String value) @Test public void testLikeBasic() { - Regex 
regex = likePattern(utf8Slice("f%b__")); - assertTrue(likeVarchar(utf8Slice("foobar"), regex)); - assertTrue(likeVarchar(offsetHeapSlice("foobar"), regex)); + LikeMatcher matcher = likePattern(utf8Slice("f%b__")); + assertTrue(likeVarchar(utf8Slice("foobar"), matcher)); + assertTrue(likeVarchar(offsetHeapSlice("foobar"), matcher)); assertFunction("'foob' LIKE 'f%b__'", BOOLEAN, false); assertFunction("'foob' LIKE 'f%b'", BOOLEAN, true); @@ -58,13 +58,13 @@ public void testLikeBasic() @Test public void testLikeChar() { - Regex regex = likePattern(utf8Slice("f%b__")); - assertTrue(likeChar(6L, utf8Slice("foobar"), regex)); - assertTrue(likeChar(6L, offsetHeapSlice("foobar"), regex)); - assertTrue(likeChar(6L, utf8Slice("foob"), regex)); - assertTrue(likeChar(6L, offsetHeapSlice("foob"), regex)); - assertFalse(likeChar(7L, utf8Slice("foob"), regex)); - assertFalse(likeChar(7L, offsetHeapSlice("foob"), regex)); + LikeMatcher matcher = likePattern(utf8Slice("f%b__")); + assertTrue(likeChar(6L, utf8Slice("foobar"), matcher)); + assertTrue(likeChar(6L, offsetHeapSlice("foobar"), matcher)); + assertTrue(likeChar(6L, utf8Slice("foob"), matcher)); + assertTrue(likeChar(6L, offsetHeapSlice("foob"), matcher)); + assertFalse(likeChar(7L, utf8Slice("foob"), matcher)); + assertFalse(likeChar(7L, offsetHeapSlice("foob"), matcher)); assertFunction("cast('foob' as char(6)) LIKE 'f%b__'", BOOLEAN, true); assertFunction("cast('foob' as char(7)) LIKE 'f%b__'", BOOLEAN, false); @@ -73,41 +73,41 @@ public void testLikeChar() @Test public void testLikeSpacesInPattern() { - Regex regex = likePattern(utf8Slice("ala ")); - assertTrue(likeVarchar(utf8Slice("ala "), regex)); - assertFalse(likeVarchar(utf8Slice("ala"), regex)); + LikeMatcher matcher = likePattern(utf8Slice("ala ")); + assertTrue(likeVarchar(utf8Slice("ala "), matcher)); + assertFalse(likeVarchar(utf8Slice("ala"), matcher)); - regex = castCharToLikePattern(5L, utf8Slice("ala")); - assertTrue(likeVarchar(utf8Slice("ala "), 
regex)); - assertFalse(likeVarchar(utf8Slice("ala"), regex)); + matcher = castCharToLikePattern(5L, utf8Slice("ala")); + assertTrue(likeVarchar(utf8Slice("ala "), matcher)); + assertFalse(likeVarchar(utf8Slice("ala"), matcher)); } @Test public void testLikeNewlineInPattern() { - Regex regex = likePattern(utf8Slice("%o\nbar")); - assertTrue(likeVarchar(utf8Slice("foo\nbar"), regex)); + LikeMatcher matcher = likePattern(utf8Slice("%o\nbar")); + assertTrue(likeVarchar(utf8Slice("foo\nbar"), matcher)); } @Test public void testLikeNewlineBeforeMatch() { - Regex regex = likePattern(utf8Slice("%b%")); - assertTrue(likeVarchar(utf8Slice("foo\nbar"), regex)); + LikeMatcher matcher = likePattern(utf8Slice("%b%")); + assertTrue(likeVarchar(utf8Slice("foo\nbar"), matcher)); } @Test public void testLikeNewlineInMatch() { - Regex regex = likePattern(utf8Slice("f%b%")); - assertTrue(likeVarchar(utf8Slice("foo\nbar"), regex)); + LikeMatcher matcher = likePattern(utf8Slice("f%b%")); + assertTrue(likeVarchar(utf8Slice("foo\nbar"), matcher)); } @Test(timeOut = 1000) public void testLikeUtf8Pattern() { - Regex regex = likePattern(utf8Slice("%\u540d\u8a89%"), utf8Slice("\\")); - assertFalse(likeVarchar(utf8Slice("foo"), regex)); + LikeMatcher matcher = likePattern(utf8Slice("%\u540d\u8a89%"), utf8Slice("\\")); + assertFalse(likeVarchar(utf8Slice("foo"), matcher)); } @SuppressWarnings("NumericCastThatLosesPrecision") @@ -115,29 +115,29 @@ public void testLikeUtf8Pattern() public void testLikeInvalidUtf8Value() { Slice value = Slices.wrappedBuffer(new byte[] {'a', 'b', 'c', (byte) 0xFF, 'x', 'y'}); - Regex regex = likePattern(utf8Slice("%b%"), utf8Slice("\\")); - assertTrue(likeVarchar(value, regex)); + LikeMatcher matcher = likePattern(utf8Slice("%b%"), utf8Slice("\\")); + assertTrue(likeVarchar(value, matcher)); } @Test public void testBackslashesNoSpecialTreatment() { - Regex regex = likePattern(utf8Slice("\\abc\\/\\\\")); - assertTrue(likeVarchar(utf8Slice("\\abc\\/\\\\"), regex)); + 
LikeMatcher matcher = likePattern(utf8Slice("\\abc\\/\\\\")); + assertTrue(likeVarchar(utf8Slice("\\abc\\/\\\\"), matcher)); } @Test public void testSelfEscaping() { - Regex regex = likePattern(utf8Slice("\\\\abc\\%"), utf8Slice("\\")); - assertTrue(likeVarchar(utf8Slice("\\abc%"), regex)); + LikeMatcher matcher = likePattern(utf8Slice("\\\\abc\\%"), utf8Slice("\\")); + assertTrue(likeVarchar(utf8Slice("\\abc%"), matcher)); } @Test public void testAlternateEscapedCharacters() { - Regex regex = likePattern(utf8Slice("xxx%x_abcxx"), utf8Slice("x")); - assertTrue(likeVarchar(utf8Slice("x%_abcx"), regex)); + LikeMatcher matcher = likePattern(utf8Slice("xxx%x_abcxx"), utf8Slice("x")); + assertTrue(likeVarchar(utf8Slice("x%_abcx"), matcher)); } @Test @@ -172,4 +172,44 @@ public void testUnescapeValidLikePattern() assertEquals(unescapeLiteralLikePattern(utf8Slice("a##bc#_"), utf8Slice("#")), utf8Slice("a#bc_")); assertEquals(unescapeLiteralLikePattern(utf8Slice("a###_bc"), utf8Slice("#")), utf8Slice("a#_bc")); } + + @Test + public void testSimplifiedLikePattern() + { + // simplify the successive wildcards into one + LikeMatcher matcher; + + matcher = likePattern(utf8Slice("%%%%%%%%%%%%%%%%%%%%%bounce")); + assertFalse(likeVarchar(utf8Slice("xzsadfjasdkfjsadsfasgsdfgsdfgsdfgsdfgfsdgsdfgsdgsdfg"), matcher)); + assertTrue(likeVarchar(utf8Slice("xzsadfjasdkfjsadsfasgsdfgsdfgsdfgsdfgfsdgsdfgsdgsdfgbounce"), matcher)); + assertFalse(likeVarchar(utf8Slice("xzsadfjasdkfjsadsfasgsdfgsdfgsdfgsdfgfsdgsdfgsdgsdfgbouncexxxx"), matcher)); + + matcher = likePattern(utf8Slice("%%%%%%%%%%%%%%%%%%%bounce%%%%%%%%%%%%%")); + assertFalse(likeVarchar(utf8Slice("xzsadfjasdkfjsadsfasgsdfgsdfgsdfgsdfgfsdgsdfgsdgsdfg"), matcher)); + assertTrue(likeVarchar(utf8Slice("xzsadfjasdkfjsadsfasgsdfgsdfgsdfgsdfgfsdgsdfgsdgsdfgbounce"), matcher)); + assertTrue(likeVarchar(utf8Slice("xzsadfjasdkfjsadsfasgsdfgsdfgsdfgsdfgfsdgsdfgsdgsdfgbouncexxxx"), matcher)); + + matcher = 
likePattern(utf8Slice("xzsad%%%%%%%%%%%%%%%%%%%%bounce%%%%%%%%%%%%%x")); + assertFalse(likeVarchar(utf8Slice("xzsadfjasdkfjsadsfasgsdfgsdfgsdfgsdfgfsdgsdfgsdgsdfg"), matcher)); + assertFalse(likeVarchar(utf8Slice("xzsadfjasdkfjsadsfasgsdfgsdfgsdfgsdfgfsdgsdfgsdgsdfgbounce"), matcher)); + assertTrue(likeVarchar(utf8Slice("xzsadfjasdkfjsadsfasgsdfgsdfgsdfgsdfgfsdgsdfgsdgsdfgbouncexxxx"), matcher)); + + matcher = likePattern(utf8Slice("xz%%%%%.*%%%%%bounce%%%%%%%")); + assertTrue(likeVarchar(utf8Slice("xzPPPP.*bounce"), matcher)); + assertFalse(likeVarchar(utf8Slice("xzPPPP.bounce"), matcher)); + assertTrue(likeVarchar(utf8Slice("xz.*bounce"), matcher)); + assertFalse(likeVarchar(utf8Slice("xzPPPP*bounce"), matcher)); + + for (String escapeChar : new String[] {"%", "#"}) { + // xz%%bounce%%" + matcher = likePattern(utf8Slice("xz" + escapeChar + "%bounce" + escapeChar + "%"), utf8Slice(escapeChar)); + assertTrue(likeVarchar(utf8Slice("xz%bounce%"), matcher)); + assertFalse(likeVarchar(utf8Slice("xz%bounceff%"), matcher)); + + // xz%%bou_n%_ce%%" + matcher = likePattern(utf8Slice("xz" + escapeChar + "%bou_n" + escapeChar + "_ce" + escapeChar + "%"), utf8Slice(escapeChar)); + assertTrue(likeVarchar(utf8Slice("xz%bouXn_ce%"), matcher)); + assertFalse(likeVarchar(utf8Slice("xz%bouXnXce%"), matcher)); + } + } } diff --git a/presto-main/src/test/java/com/facebook/presto/sql/TestLikeMatcher.java b/presto-main/src/test/java/com/facebook/presto/sql/TestLikeMatcher.java new file mode 100644 index 0000000000000..dad138325bfc5 --- /dev/null +++ b/presto-main/src/test/java/com/facebook/presto/sql/TestLikeMatcher.java @@ -0,0 +1,137 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.sql; + +import com.facebook.presto.likematcher.LikeMatcher; +import org.testng.annotations.Test; + +import java.nio.charset.StandardCharsets; +import java.util.Optional; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +public class TestLikeMatcher +{ + @Test + public void test() + { + // min length short-circuit + assertFalse(match("__", "a")); + + // max length short-circuit + assertFalse(match("__", "abcdefghi")); + + // prefix short-circuit + assertFalse(match("a%", "xyz")); + + // prefix match + assertTrue(match("a%", "a")); + assertTrue(match("a%", "ab")); + assertTrue(match("a_", "ab")); + + // suffix short-circuit + assertFalse(match("%a", "xyz")); + + // suffix match + assertTrue(match("%z", "z")); + assertTrue(match("%z", "yz")); + assertTrue(match("_z", "yz")); + + // match literal + assertTrue(match("abcd", "abcd")); + + // match one + assertFalse(match("_", "")); + assertTrue(match("_", "a")); + assertFalse(match("_", "ab")); + + // match zero or more + assertTrue(match("%", "")); + assertTrue(match("%", "a")); + assertTrue(match("%", "ab")); + + // non-strict matching + assertTrue(match("_%", "abcdefg")); + assertFalse(match("_a%", "abcdefg")); + + // strict matching + assertTrue(match("_ab_", "xabc")); + assertFalse(match("_ab_", "xyxw")); + assertTrue(match("_a%b_", "xaxxxbx")); + + // optimization of consecutive _ and % + assertTrue(match("_%_%_%_%", "abcdefghij")); + + assertTrue(match("%a%a%a%a%a%a%", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")); + assertTrue(match("%a%a%a%a%a%a%", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab")); + assertTrue(match("%a%b%a%b%a%b%", "aabbaabbaabbaabbaabbaabbaabbaabbaabbaabbaabbaabbaabbaabb")); + assertTrue(match("%aaaa%bbbb%aaaa%bbbb%aaaa%bbbb%", "aaaabbbbaaaabbbbaaaabbbb")); + assertTrue(match("%aaaaaaaaaaaaaaaaaaaaaaaaaa%", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")); + + // utf-8 + LikeMatcher singleOptimized = LikeMatcher.compile("_", Optional.empty(), true); + LikeMatcher multipleOptimized = LikeMatcher.compile("_a%b_", Optional.empty(), true); // prefix and suffix with _a and b_ to avoid optimizations + LikeMatcher single = LikeMatcher.compile("_", Optional.empty(), false); + LikeMatcher multiple = LikeMatcher.compile("_a%b_", Optional.empty(), false); // prefix and suffix with _a and b_ to avoid optimizations + for (int i = 0; i < Character.MAX_CODE_POINT; i++) { + assertTrue(singleOptimized.match(Character.toString(i).getBytes(StandardCharsets.UTF_8))); + assertTrue(single.match(Character.toString(i).getBytes(StandardCharsets.UTF_8))); + + String value = "aa" + (char) i + "bb"; + assertTrue(multipleOptimized.match(value.getBytes(StandardCharsets.UTF_8))); + assertTrue(multiple.match(value.getBytes(StandardCharsets.UTF_8))); + } + } + + @Test + public void testEscape() + { + assertTrue(match("-%", "%", '-')); + assertTrue(match("-_", "_", '-')); + assertTrue(match("--", "-", '-')); + assertTrue(match("%$_%", "xxxxx_xxxxx", '$')); + } + + private static boolean match(String pattern, String value) + { + return match(pattern, value, Optional.empty()); + } + + private static boolean match(String pattern, String value, char escape) + { + return match(pattern, value, Optional.of(escape)); + } + + private static boolean match(String pattern, String value, Optional escape) + { + String padding = "++++"; + String padded = padding + value + padding; + byte[] bytes = 
padded.getBytes(StandardCharsets.UTF_8); + + boolean optimizedWithoutPadding = LikeMatcher.compile(pattern, escape, true).match(value.getBytes(StandardCharsets.UTF_8)); + + boolean optimizedWithPadding = LikeMatcher.compile(pattern, escape, true).match(bytes, padding.length(), bytes.length - padding.length() * 2); // exclude padding + assertEquals(optimizedWithoutPadding, optimizedWithPadding); + + boolean withoutPadding = LikeMatcher.compile(pattern, escape, false).match(value.getBytes(StandardCharsets.UTF_8)); + assertEquals(optimizedWithoutPadding, withoutPadding); + + boolean withPadding = LikeMatcher.compile(pattern, escape, false).match(bytes, padding.length(), bytes.length - padding.length() * 2); // exclude padding + assertEquals(optimizedWithoutPadding, withPadding); + + return withPadding; + } +} diff --git a/presto-main/src/test/java/com/facebook/presto/sql/gen/TestExpressionCompiler.java b/presto-main/src/test/java/com/facebook/presto/sql/gen/TestExpressionCompiler.java index dfd268a014ec7..2b4baf83682b9 100644 --- a/presto-main/src/test/java/com/facebook/presto/sql/gen/TestExpressionCompiler.java +++ b/presto-main/src/test/java/com/facebook/presto/sql/gen/TestExpressionCompiler.java @@ -22,6 +22,7 @@ import com.facebook.presto.common.type.TimeZoneKey; import com.facebook.presto.common.type.Type; import com.facebook.presto.common.type.VarcharType; +import com.facebook.presto.likematcher.LikeMatcher; import com.facebook.presto.operator.scalar.BitwiseFunctions; import com.facebook.presto.operator.scalar.DateTimeFunctions; import com.facebook.presto.operator.scalar.FunctionAssertions; @@ -41,7 +42,6 @@ import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.ListeningExecutorService; -import io.airlift.joni.Regex; import io.airlift.slice.Slice; import io.airlift.slice.Slices; import io.airlift.units.Duration; @@ -1553,7 +1553,7 @@ public void testLike() for (String 
pattern : stringLefts) { Boolean expected = null; if (value != null && pattern != null) { - Regex regex = LikeFunctions.likePattern(utf8Slice(pattern), utf8Slice("\\")); + LikeMatcher regex = LikeFunctions.likePattern(utf8Slice(pattern), utf8Slice("\\")); expected = LikeFunctions.likeVarchar(utf8Slice(value), regex); } assertExecute(generateExpression("%s like %s", value, pattern), BOOLEAN, expected);