Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import org.elasticsearch.common.cache.Cache;
import org.elasticsearch.common.cache.CacheBuilder;
Expand All @@ -19,11 +21,13 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.function.Function;
import java.util.function.Predicate;

import static org.apache.lucene.util.automaton.MinimizationOperations.minimize;
import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
import static org.apache.lucene.util.automaton.Operations.concatenate;
import static org.apache.lucene.util.automaton.Operations.intersection;
Expand Down Expand Up @@ -84,10 +88,74 @@ public static Automaton patterns(Collection<String> patterns) {
}

private static Automaton buildAutomaton(Collection<String> patterns) {
List<Automaton> automata = new ArrayList<>(patterns.size());
for (String pattern : patterns) {
final Automaton patternAutomaton = pattern(pattern);
automata.add(patternAutomaton);
if (patterns.size() == 1) {
return minimize(pattern(patterns.iterator().next()));
}

final Function<Collection<String>, Automaton> build = strings -> {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: I'd prefer to name this variable with a noun, something like buildFunc.

List<Automaton> automata = new ArrayList<>(strings.size());
for (String pattern : strings) {
final Automaton patternAutomaton = pattern(pattern);
automata.add(patternAutomaton);
}
return unionAndMinimize(automata);
};

// We originally just compiled each automaton separately and then unioned them all.
// However, that approach can be quite slow, and very memory intensive.
// It is far more efficient if
// 1. we strip leading/trailing "*"
// 2. union the automaton produced from the remaining text
// 3. append/prepend MatchAnyString automatons as appropriate
// That is:
// - `MATCH_ALL + (bullseye|daredevil) + MATCH_ALL`
// can be determinized more efficiently than
// - `(MATCH_ALL + bullseye + MATCH_ALL)|(MATCH_ALL + daredevil + MATCH_ALL)`

final Set<String> prefix = new HashSet<>();
final Set<String> infix = new HashSet<>();
final Set<String> suffix = new HashSet<>();
final Set<String> misc = new HashSet<>();

for (String p : patterns) {
final char first = p.charAt(0);
final char last = p.charAt(p.length() - 1);
Comment on lines +127 to +128
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shoud we guard these with a check for p.length() == 0?

if (p.length() <= 1 || first == '/') {
// Single character strings (like "x" or "*") or regex ("/something/")
misc.add(p);
} else if (first == '*') {
if (last == '*') {
// *something*
infix.add(p.substring(1, p.length() - 1));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is possible that the pattern is **. As such the infix is an empty string and can be skipped. But as discussed, it is probably better handled as part of compacting all consecutive *, which can be done in a separate PR.

} else {
// *something
suffix.add(p.substring(1));
}
} else if (last == '*' && p.indexOf('*') != p.length() - 1) {
// some*thing*
// For simple prefix patterns ("something*") it's more efficient to do a single pass
// Lucene handles the shared trailing '*' on an accept state well,
// and performing 2 minimizes (on for the union of strings, then on again after concatenating MATCH_ANY) is slower.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This sentence reads weird to me. Should it be something like ... (one for the union of strings, and another one after ...)

// But, that's not true if the string has an embedded '*' in it - in that case, we should handle them in this special way.
prefix.add(p.substring(0, p.length() - 1));
} else {
// some*thing / some?thing / etc
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the pattern like something* should also reach here and should be part of the comment.

misc.add(p);
}
}

final List<Automaton> automata = new ArrayList<>();
if (prefix.isEmpty() == false) {
automata.add(Operations.concatenate(build.apply(prefix), Automata.makeAnyString()));
}
if (suffix.isEmpty() == false) {
automata.add(Operations.concatenate(Automata.makeAnyString(), build.apply(suffix)));
}
if (infix.isEmpty() == false) {
automata.add(Operations.concatenate(List.of(Automata.makeAnyString(), build.apply(infix), Automata.makeAnyString())));
}
if (misc.isEmpty() == false) {
automata.add(build.apply(misc));
}
return unionAndMinimize(automata);
}
Expand Down Expand Up @@ -172,18 +240,22 @@ static Automaton wildcard(String text) {
}

public static Automaton unionAndMinimize(Collection<Automaton> automata) {
Automaton res = union(automata);
return minimize(res, maxDeterminizedStates);
Automaton res = automata.size() == 1 ? automata.iterator().next() : union(automata);
return minimize(res);
}

public static Automaton minusAndMinimize(Automaton a1, Automaton a2) {
Automaton res = minus(a1, a2, maxDeterminizedStates);
return minimize(res, maxDeterminizedStates);
return minimize(res);
}

public static Automaton intersectAndMinimize(Automaton a1, Automaton a2) {
Automaton res = intersection(a1, a2);
return minimize(res, maxDeterminizedStates);
return minimize(res);
}

private static Automaton minimize(Automaton automaton) {
return MinimizationOperations.minimize(automaton, maxDeterminizedStates);
}

public static Predicate<String> predicate(String... patterns) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ private static Predicate<String> buildAutomataPredicate(Collection<String> patte
if (description.length() > 80) {
description = Strings.cleanTruncate(description, 80) + "...";
}
throw new ElasticsearchSecurityException("The set patterns [{}] is too complex to evaluate", e, description);
throw new ElasticsearchSecurityException("The set of patterns [{}] is too complex to evaluate", e, description);
}
}
}
Expand Down