Skip to content

Commit 5dfaae8

Browse files
authored
Build complex automatons more efficiently (#66901)
This change substantially reduces the CPU and Heap usage of StringMatcher when processing large complex patterns. The improvement is achieved by switching the order in which we perform concatenation and union for common styles of wildcard patterns. Given a set of wildcard strings: - "*-logs-*" - "*-metrics-*" - "web-*-prod-*" - "web-*-staging-*" The old implementation would perform steps roughly like: minimize { union { concatenate { MATCH_ANY, "-logs-", MATCH_ANY } concatenate { MATCH_ANY, "-metrics-", MATCH_ANY } concatenate { "web-", MATCH_ANY, "-prod-", MATCH_ANY } concatenate { "web-", MATCH_ANY, "-staging-", MATCH_ANY } } } The outer minimize would require determinizing the automaton, which was highly inefficient. The new implementation is: minimize { union { concatenate { MATCH_ANY, minimize { union { "-logs-", "-metrics-" } }, MATCH_ANY } concatenate { minimize { union { concatenate { "web-", MATCH_ANY, "-prod-" } concatenate { "web-", MATCH_ANY, "-staging-" } } }, MATCH_ANY } } } By performing a union of the inner strings before concatenating the MATCH_ANY ("*"), the time & heap space spent on determinizing the automaton is greatly reduced. Backport of: #66724
1 parent 288db85 commit 5dfaae8

File tree

2 files changed

+90
-10
lines changed

2 files changed

+90
-10
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/Automatons.java

Lines changed: 89 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
import org.apache.lucene.util.automaton.Automata;
99
import org.apache.lucene.util.automaton.Automaton;
1010
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
11+
import org.apache.lucene.util.automaton.MinimizationOperations;
12+
import org.apache.lucene.util.automaton.Operations;
1113
import org.apache.lucene.util.automaton.RegExp;
1214
import org.elasticsearch.common.cache.Cache;
1315
import org.elasticsearch.common.cache.CacheBuilder;
@@ -19,11 +21,13 @@
1921
import java.util.ArrayList;
2022
import java.util.Arrays;
2123
import java.util.Collection;
24+
import java.util.HashSet;
2225
import java.util.List;
26+
import java.util.Set;
2327
import java.util.concurrent.ExecutionException;
28+
import java.util.function.Function;
2429
import java.util.function.Predicate;
2530

26-
import static org.apache.lucene.util.automaton.MinimizationOperations.minimize;
2731
import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
2832
import static org.apache.lucene.util.automaton.Operations.concatenate;
2933
import static org.apache.lucene.util.automaton.Operations.intersection;
@@ -84,10 +88,82 @@ public static Automaton patterns(Collection<String> patterns) {
8488
}
8589

8690
private static Automaton buildAutomaton(Collection<String> patterns) {
87-
List<Automaton> automata = new ArrayList<>(patterns.size());
88-
for (String pattern : patterns) {
89-
final Automaton patternAutomaton = pattern(pattern);
90-
automata.add(patternAutomaton);
91+
if (patterns.size() == 1) {
92+
return minimize(pattern(patterns.iterator().next()));
93+
}
94+
95+
final Function<Collection<String>, Automaton> build = strings -> {
96+
List<Automaton> automata = new ArrayList<>(strings.size());
97+
for (String pattern : strings) {
98+
final Automaton patternAutomaton = pattern(pattern);
99+
automata.add(patternAutomaton);
100+
}
101+
return unionAndMinimize(automata);
102+
};
103+
104+
// We originally just compiled each automaton separately and then unioned them all.
105+
// However, that approach can be quite slow, and very memory intensive.
106+
// It is far more efficient if
107+
// 1. we strip leading/trailing "*"
108+
// 2. union the automaton produced from the remaining text
109+
// 3. append/prepend MatchAnyString automatons as appropriate
110+
// That is:
111+
// - `MATCH_ALL + (bullseye|daredevil) + MATCH_ALL`
112+
// can be determinized more efficiently than
113+
// - `(MATCH_ALL + bullseye + MATCH_ALL)|(MATCH_ALL + daredevil + MATCH_ALL)`
114+
115+
final Set<String> prefix = new HashSet<>();
116+
final Set<String> infix = new HashSet<>();
117+
final Set<String> suffix = new HashSet<>();
118+
final Set<String> misc = new HashSet<>();
119+
120+
for (String p : patterns) {
121+
if (p.length() <= 1) {
122+
// Single character strings (like "x" or "*"), or stray empty strings
123+
misc.add(p);
124+
continue;
125+
}
126+
127+
final char first = p.charAt(0);
128+
final char last = p.charAt(p.length() - 1);
129+
if (first == '/') {
130+
// regex ("/something/")
131+
misc.add(p);
132+
} else if (first == '*') {
133+
if (last == '*') {
134+
// *something*
135+
infix.add(p.substring(1, p.length() - 1));
136+
} else {
137+
// *something
138+
suffix.add(p.substring(1));
139+
}
140+
} else if (last == '*' && p.indexOf('*') != p.length() - 1) {
141+
// some*thing*
142+
// For simple prefix patterns ("something*") it's more efficient to do a single pass, because
143+
// Lucene can efficiently determinize automata that share a trailing MATCH_ANY accept state.
144+
// If we were to handle them here, we would run 2 minimize operations (one for the union of strings,
145+
// then another after concatenating MATCH_ANY), which is substantially slower.
146+
// However, that's not true if the string has an embedded '*' in it - in that case it is more efficient to determinize
147+
// the set of prefixes (with the embedded MATCH_ANY) and then concatenate another MATCH_ANY and minimize.
148+
prefix.add(p.substring(0, p.length() - 1));
149+
} else {
150+
// something* / some*thing / some?thing / etc
151+
misc.add(p);
152+
}
153+
}
154+
155+
final List<Automaton> automata = new ArrayList<>();
156+
if (prefix.isEmpty() == false) {
157+
automata.add(Operations.concatenate(build.apply(prefix), Automata.makeAnyString()));
158+
}
159+
if (suffix.isEmpty() == false) {
160+
automata.add(Operations.concatenate(Automata.makeAnyString(), build.apply(suffix)));
161+
}
162+
if (infix.isEmpty() == false) {
163+
automata.add(Operations.concatenate(Arrays.asList(Automata.makeAnyString(), build.apply(infix), Automata.makeAnyString())));
164+
}
165+
if (misc.isEmpty() == false) {
166+
automata.add(build.apply(misc));
91167
}
92168
return unionAndMinimize(automata);
93169
}
@@ -172,18 +248,22 @@ static Automaton wildcard(String text) {
172248
}
173249

174250
public static Automaton unionAndMinimize(Collection<Automaton> automata) {
175-
Automaton res = union(automata);
176-
return minimize(res, maxDeterminizedStates);
251+
Automaton res = automata.size() == 1 ? automata.iterator().next() : union(automata);
252+
return minimize(res);
177253
}
178254

179255
public static Automaton minusAndMinimize(Automaton a1, Automaton a2) {
180256
Automaton res = minus(a1, a2, maxDeterminizedStates);
181-
return minimize(res, maxDeterminizedStates);
257+
return minimize(res);
182258
}
183259

184260
public static Automaton intersectAndMinimize(Automaton a1, Automaton a2) {
185261
Automaton res = intersection(a1, a2);
186-
return minimize(res, maxDeterminizedStates);
262+
return minimize(res);
263+
}
264+
265+
private static Automaton minimize(Automaton automaton) {
266+
return MinimizationOperations.minimize(automaton, maxDeterminizedStates);
187267
}
188268

189269
public static Predicate<String> predicate(String... patterns) {

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/support/StringMatcher.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ private static Predicate<String> buildAutomataPredicate(Collection<String> patte
162162
if (description.length() > 80) {
163163
description = Strings.cleanTruncate(description, 80) + "...";
164164
}
165-
throw new ElasticsearchSecurityException("The set patterns [{}] is too complex to evaluate", e, description);
165+
throw new ElasticsearchSecurityException("The set of patterns [{}] is too complex to evaluate", e, description);
166166
}
167167
}
168168
}

0 commit comments

Comments
 (0)