-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Build complex automatons more efficiently #66724
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,6 +8,8 @@ | |
| import org.apache.lucene.util.automaton.Automata; | ||
| import org.apache.lucene.util.automaton.Automaton; | ||
| import org.apache.lucene.util.automaton.CharacterRunAutomaton; | ||
| import org.apache.lucene.util.automaton.MinimizationOperations; | ||
| import org.apache.lucene.util.automaton.Operations; | ||
| import org.apache.lucene.util.automaton.RegExp; | ||
| import org.elasticsearch.common.cache.Cache; | ||
| import org.elasticsearch.common.cache.CacheBuilder; | ||
|
|
@@ -19,11 +21,13 @@ | |
| import java.util.ArrayList; | ||
| import java.util.Arrays; | ||
| import java.util.Collection; | ||
| import java.util.HashSet; | ||
| import java.util.List; | ||
| import java.util.Set; | ||
| import java.util.concurrent.ExecutionException; | ||
| import java.util.function.Function; | ||
| import java.util.function.Predicate; | ||
|
|
||
| import static org.apache.lucene.util.automaton.MinimizationOperations.minimize; | ||
| import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES; | ||
| import static org.apache.lucene.util.automaton.Operations.concatenate; | ||
| import static org.apache.lucene.util.automaton.Operations.intersection; | ||
|
|
@@ -84,10 +88,74 @@ public static Automaton patterns(Collection<String> patterns) { | |
| } | ||
|
|
||
| private static Automaton buildAutomaton(Collection<String> patterns) { | ||
| List<Automaton> automata = new ArrayList<>(patterns.size()); | ||
| for (String pattern : patterns) { | ||
| final Automaton patternAutomaton = pattern(pattern); | ||
| automata.add(patternAutomaton); | ||
| if (patterns.size() == 1) { | ||
| return minimize(pattern(patterns.iterator().next())); | ||
| } | ||
|
|
||
| final Function<Collection<String>, Automaton> build = strings -> { | ||
| List<Automaton> automata = new ArrayList<>(strings.size()); | ||
| for (String pattern : strings) { | ||
| final Automaton patternAutomaton = pattern(pattern); | ||
| automata.add(patternAutomaton); | ||
| } | ||
| return unionAndMinimize(automata); | ||
| }; | ||
|
|
||
| // We originally just compiled each automaton separately and then unioned them all. | ||
| // However, that approach can be quite slow, and very memory intensive. | ||
| // It is far more efficient if | ||
| // 1. we strip leading/trailing "*" | ||
| // 2. union the automaton produced from the remaining text | ||
| // 3. append/prepend MatchAnyString automatons as appropriate | ||
| // That is: | ||
| // - `MATCH_ALL + (bullseye|daredevil) + MATCH_ALL` | ||
| // can be determinized more efficiently than | ||
| // - `(MATCH_ALL + bullseye + MATCH_ALL)|(MATCH_ALL + daredevil + MATCH_ALL)` | ||
|
|
||
| final Set<String> prefix = new HashSet<>(); | ||
| final Set<String> infix = new HashSet<>(); | ||
| final Set<String> suffix = new HashSet<>(); | ||
| final Set<String> misc = new HashSet<>(); | ||
|
|
||
| for (String p : patterns) { | ||
| final char first = p.charAt(0); | ||
| final char last = p.charAt(p.length() - 1); | ||
|
Comment on lines
+127
to
+128
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we guard these with a check for |
||
| if (p.length() <= 1 || first == '/') { | ||
| // Single character strings (like "x" or "*") or regex ("/something/") | ||
| misc.add(p); | ||
| } else if (first == '*') { | ||
| if (last == '*') { | ||
| // *something* | ||
| infix.add(p.substring(1, p.length() - 1)); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is possible that the pattern is |
||
| } else { | ||
| // *something | ||
| suffix.add(p.substring(1)); | ||
| } | ||
| } else if (last == '*' && p.indexOf('*') != p.length() - 1) { | ||
| // some*thing* | ||
| // For simple prefix patterns ("something*") it's more efficient to do a single pass | ||
| // Lucene handles the shared trailing '*' on an accept state well, | ||
| // and performing 2 minimizes (one for the union of strings, then one again after concatenating MATCH_ANY) is slower. | |
|
||
| // But, that's not true if the string has an embedded '*' in it - in that case, we should handle them in this special way. | ||
| prefix.add(p.substring(0, p.length() - 1)); | ||
| } else { | ||
| // some*thing / some?thing / etc | ||
|
||
| misc.add(p); | ||
| } | ||
| } | ||
|
|
||
| final List<Automaton> automata = new ArrayList<>(); | ||
| if (prefix.isEmpty() == false) { | ||
| automata.add(Operations.concatenate(build.apply(prefix), Automata.makeAnyString())); | ||
| } | ||
| if (suffix.isEmpty() == false) { | ||
| automata.add(Operations.concatenate(Automata.makeAnyString(), build.apply(suffix))); | ||
| } | ||
| if (infix.isEmpty() == false) { | ||
| automata.add(Operations.concatenate(List.of(Automata.makeAnyString(), build.apply(infix), Automata.makeAnyString()))); | ||
| } | ||
| if (misc.isEmpty() == false) { | ||
| automata.add(build.apply(misc)); | ||
| } | ||
| return unionAndMinimize(automata); | ||
| } | ||
|
|
@@ -172,18 +240,22 @@ static Automaton wildcard(String text) { | |
| } | ||
|
|
||
| public static Automaton unionAndMinimize(Collection<Automaton> automata) { | ||
| Automaton res = union(automata); | ||
| return minimize(res, maxDeterminizedStates); | ||
| Automaton res = automata.size() == 1 ? automata.iterator().next() : union(automata); | ||
| return minimize(res); | ||
| } | ||
|
|
||
| public static Automaton minusAndMinimize(Automaton a1, Automaton a2) { | ||
| Automaton res = minus(a1, a2, maxDeterminizedStates); | ||
| return minimize(res, maxDeterminizedStates); | ||
| return minimize(res); | ||
| } | ||
|
|
||
| public static Automaton intersectAndMinimize(Automaton a1, Automaton a2) { | ||
| Automaton res = intersection(a1, a2); | ||
| return minimize(res, maxDeterminizedStates); | ||
| return minimize(res); | ||
| } | ||
|
|
||
| private static Automaton minimize(Automaton automaton) { | ||
| return MinimizationOperations.minimize(automaton, maxDeterminizedStates); | ||
| } | ||
|
|
||
| public static Predicate<String> predicate(String... patterns) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: I'd prefer to name this variable with a noun, something like
buildFunc.