Build complex automatons more efficiently (#66901)

tvernum · web-flow · commit 5dfaae85eff0 · 2020-12-31T17:19:14.000+11:00
This change substantially reduces the CPU and Heap usage of StringMatcher when processing large complex patterns. The improvement is achieved by switching the order in which we perform concatenation and union for common styles of wildcard patterns. Given a set of wildcard strings: - "*-logs-*" - "*-metrics-*" - "web-*-prod-*" - "web-*-staging-*" The old implementation would perform steps roughly like: minimize { union { concatenate { MATCH_ANY, "-logs-", MATCH_ANY } concatenate { MATCH_ANY, "-metrics-", MATCH_ANY } concatenate { "web-", MATCH_ANY, "prod-", MATCH_ANY } concatenate { "web-", MATCH_ANY, "staging-", MATCH_ANY } } } The outer minimize would require determinizing the automaton, which was highly inefficient The new implementation is: minimize { union { concatenate { MATCH_ANY , minimize { union { "-logs-", "-metrics"- } } MATCH_ANY } concatenate { minimize { union { concatenate { "web-", MATCH_ANY, "prod-" } concatenate { "web-", MATCH_ANY, "staging-" } } } MATCH_ANY } } } By performing a union of the inner strings before concatenating the MATCH_ANY ("*") the time & heap space spent on determinizing the automaton is greatly reduced. Backport of: #66724
diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/Automatons.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/Automatons.java
@@ -8,6 +8,8 @@
 import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.MinimizationOperations;
+import org.apache.lucene.util.automaton.Operations;
 import org.apache.lucene.util.automaton.RegExp;
 import org.elasticsearch.common.cache.Cache;
 import org.elasticsearch.common.cache.CacheBuilder;
@@ -19,11 +21,13 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import java.util.concurrent.ExecutionException;
+import java.util.function.Function;
 import java.util.function.Predicate;
 
-import static org.apache.lucene.util.automaton.MinimizationOperations.minimize;
 import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
 import static org.apache.lucene.util.automaton.Operations.concatenate;
 import static org.apache.lucene.util.automaton.Operations.intersection;
@@ -84,10 +88,82 @@ public static Automaton patterns(Collection<String> patterns) {
     }
 
     private static Automaton buildAutomaton(Collection<String> patterns) {
-        List<Automaton> automata = new ArrayList<>(patterns.size());
-        for (String pattern : patterns) {
-            final Automaton patternAutomaton = pattern(pattern);
-            automata.add(patternAutomaton);
+        if (patterns.size() == 1) {
+            return minimize(pattern(patterns.iterator().next()));
+        }
+
+        final Function<Collection<String>, Automaton> build = strings -> {
+            List<Automaton> automata = new ArrayList<>(strings.size());
+            for (String pattern : strings) {
+                final Automaton patternAutomaton = pattern(pattern);
+                automata.add(patternAutomaton);
+            }
+            return unionAndMinimize(automata);
+        };
+
+        // We originally just compiled each automaton separately and then unioned them all.
+        // However, that approach can be quite slow, and very memory intensive.
+        // It is far more efficient if
+        //   1. we strip leading/trailing "*"
+        //   2. union the automaton produced from the remaining text
+        //   3. append/prepend MatchAnyString automatons as appropriate
+        // That is:
+        //  - `MATCH_ALL + (bullseye|daredevil) + MATCH_ALL`
+        //  can be determinized more efficiently than
+        //  - `(MATCH_ALL + bullseye + MATCH_ALL)|(MATCH_ALL + daredevil + MATCH_ALL)`
+
+        final Set<String> prefix = new HashSet<>();
+        final Set<String> infix = new HashSet<>();
+        final Set<String> suffix = new HashSet<>();
+        final Set<String> misc = new HashSet<>();
+
+        for (String p : patterns) {
+            if (p.length() <= 1) {
+                // Single character strings (like "x" or "*"), or stray empty strings
+                misc.add(p);
+                continue;
+            }
+
+            final char first = p.charAt(0);
+            final char last = p.charAt(p.length() - 1);
+            if (first == '/') {
+                // regex ("/something/")
+                misc.add(p);
+            } else if (first == '*') {
+                if (last == '*') {
+                    // *something*
+                    infix.add(p.substring(1, p.length() - 1));
+                } else {
+                    // *something
+                    suffix.add(p.substring(1));
+                }
+            } else if (last == '*' && p.indexOf('*') != p.length() - 1) {
+                // some*thing*
+                // For simple prefix patterns ("something*") it's more efficient to do a single pass
+                // Lucene can efficiently determinize automata that share a trailing MATCH_ANY accept state,
+                // If we were to handle them here, we would run 2 minimize operations (one for the union of strings,
+                // then another after concatenating MATCH_ANY), which is substantially slower.
+                // However, that's not true if the string has an embedded '*' in it - in that case it is more efficient to determinize
+                //   the set of prefixes (with the embedded MATCH_ANY) and then concatenate another MATCH_ANY and minimize.
+                prefix.add(p.substring(0, p.length() - 1));
+            } else {
+                // something* / some*thing / some?thing / etc
+                misc.add(p);
+            }
+        }
+
+        final List<Automaton> automata = new ArrayList<>();
+        if (prefix.isEmpty() == false) {
+            automata.add(Operations.concatenate(build.apply(prefix), Automata.makeAnyString()));
+        }
+        if (suffix.isEmpty() == false) {
+            automata.add(Operations.concatenate(Automata.makeAnyString(), build.apply(suffix)));
+        }
+        if (infix.isEmpty() == false) {
+            automata.add(Operations.concatenate(Arrays.asList(Automata.makeAnyString(), build.apply(infix), Automata.makeAnyString())));
+        }
+        if (misc.isEmpty() == false) {
+            automata.add(build.apply(misc));
         }
         return unionAndMinimize(automata);
     }
@@ -172,18 +248,22 @@ static Automaton wildcard(String text) {
     }
 
     public static Automaton unionAndMinimize(Collection<Automaton> automata) {
-        Automaton res = union(automata);
-        return minimize(res, maxDeterminizedStates);
+        Automaton res = automata.size() == 1 ? automata.iterator().next() : union(automata);
+        return minimize(res);
     }
 
     public static Automaton minusAndMinimize(Automaton a1, Automaton a2) {
         Automaton res = minus(a1, a2, maxDeterminizedStates);
-        return minimize(res, maxDeterminizedStates);
+        return minimize(res);
     }
 
     public static Automaton intersectAndMinimize(Automaton a1, Automaton a2) {
         Automaton res = intersection(a1, a2);
-        return minimize(res, maxDeterminizedStates);
+        return minimize(res);
+    }
+
+    private static Automaton minimize(Automaton automaton) {
+        return MinimizationOperations.minimize(automaton, maxDeterminizedStates);
     }
 
     public static Predicate<String> predicate(String... patterns) {
diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/StringMatcher.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/StringMatcher.java
@@ -162,7 +162,7 @@ private static Predicate<String> buildAutomataPredicate(Collection<String> patte
                 if (description.length() > 80) {
                     description = Strings.cleanTruncate(description, 80) + "...";
                 }
-                throw new ElasticsearchSecurityException("The set patterns [{}] is too complex to evaluate", e, description);
+                throw new ElasticsearchSecurityException("The set of patterns [{}] is too complex to evaluate", e, description);
             }
         }
     }

Original file line number	Diff line number	Diff line change
`@@ -162,7 +162,7 @@ private static Predicate<String> buildAutomataPredicate(Collection<String> patte`
`162`	`162`	`if (description.length() > 80) {`
`163`	`163`	`description = Strings.cleanTruncate(description, 80) + "...";`
`164`	`164`	`}`
`165`		`- throw new ElasticsearchSecurityException("The set patterns [{}] is too complex to evaluate", e, description);`
	`165`	`+ throw new ElasticsearchSecurityException("The set of patterns [{}] is too complex to evaluate", e, description);`
`166`	`166`	`}`
`167`	`167`	`}`
`168`	`168`	`}`