-
Notifications
You must be signed in to change notification settings - Fork 25.8k
Add lucene query for wildcards on high cardinality keyword fields. #139746
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,87 @@ | ||
| /* | ||
| * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
| * or more contributor license agreements. Licensed under the "Elastic License | ||
| * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side | ||
| * Public License v 1"; you may not use this file except in compliance with, at | ||
| * your election, the "Elastic License 2.0", the "GNU Affero General Public | ||
| * License v3.0 only", or the "Server Side Public License, v 1". | ||
| */ | ||
|
|
||
| package org.elasticsearch.lucene.queries; | ||
|
|
||
| import org.apache.lucene.index.BinaryDocValues; | ||
| import org.apache.lucene.index.DocValues; | ||
| import org.apache.lucene.index.LeafReaderContext; | ||
| import org.apache.lucene.search.ConstantScoreScorer; | ||
| import org.apache.lucene.search.ConstantScoreWeight; | ||
| import org.apache.lucene.search.IndexSearcher; | ||
| import org.apache.lucene.search.Query; | ||
| import org.apache.lucene.search.QueryVisitor; | ||
| import org.apache.lucene.search.ScoreMode; | ||
| import org.apache.lucene.search.ScorerSupplier; | ||
| import org.apache.lucene.search.TwoPhaseIterator; | ||
| import org.apache.lucene.search.Weight; | ||
| import org.apache.lucene.util.BytesRef; | ||
| import org.elasticsearch.index.mapper.blockloader.docvalues.CustomBinaryDocValuesReader; | ||
|
|
||
| import java.io.IOException; | ||
| import java.util.Objects; | ||
| import java.util.function.Predicate; | ||
|
|
||
| /** | ||
| * Base class for constant-score queries that inspect the binary doc values of a single field and keep the | ||
| * documents for which {@link CustomBinaryDocValuesReader#match} accepts the stored value with the supplied predicate. | ||
| * Matching is done lazily through a {@link TwoPhaseIterator} whose approximation is the doc values iterator itself. | ||
| */ | ||
| abstract class AbstractBinaryDocValuesQuery extends Query { | ||
|
|
||
| final String fieldName; | ||
| final Predicate<BytesRef> matcher; | ||
|
|
||
| /** | ||
| * @param fieldName name of the field whose binary doc values are read; must not be null | ||
| * @param matcher predicate handed to the doc values reader when verifying a document; must not be null | ||
| */ | ||
| AbstractBinaryDocValuesQuery(String fieldName, Predicate<BytesRef> matcher) { | ||
| this.fieldName = Objects.requireNonNull(fieldName); | ||
| this.matcher = Objects.requireNonNull(matcher); | ||
| } | ||
|
|
||
| /** | ||
| * @return the per-document verification cost reported by the two-phase iterator | ||
| */ | ||
| protected abstract float matchCost(); | ||
|
|
||
| @Override | ||
| public void visit(QueryVisitor visitor) { | ||
| if (visitor.acceptField(fieldName) == false) { | ||
| return; | ||
| } | ||
| visitor.visitLeaf(this); | ||
| } | ||
|
|
||
| @Override | ||
| public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { | ||
| // Resolved once per weight and shared by every per-segment iterator created below. | ||
| final float perDocumentCost = matchCost(); | ||
| return new ConstantScoreWeight(this, boost) { | ||
|
|
||
| @Override | ||
| public boolean isCacheable(LeafReaderContext ctx) { | ||
| return DocValues.isCacheable(ctx, fieldName); | ||
| } | ||
|
|
||
| @Override | ||
| public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException { | ||
| final BinaryDocValues docValues = context.reader().getBinaryDocValues(fieldName); | ||
| if (docValues == null) { | ||
| // No binary doc values for this field in the segment: nothing can match. | ||
| return null; | ||
| } | ||
| final TwoPhaseIterator twoPhase = new MatcherTwoPhaseIterator(docValues, matcher, perDocumentCost); | ||
| final ConstantScoreScorer scorer = new ConstantScoreScorer(score(), scoreMode, twoPhase); | ||
| return new DefaultScorerSupplier(scorer); | ||
| } | ||
| }; | ||
| } | ||
|
|
||
| /** | ||
| * Two-phase iterator that approximates with the doc values iterator and confirms each candidate | ||
| * by passing its binary value and the predicate to a {@link CustomBinaryDocValuesReader}. | ||
| */ | ||
| private static final class MatcherTwoPhaseIterator extends TwoPhaseIterator { | ||
|
|
||
| private final CustomBinaryDocValuesReader reader = new CustomBinaryDocValuesReader(); | ||
| private final BinaryDocValues docValues; | ||
| private final Predicate<BytesRef> predicate; | ||
| private final float cost; | ||
|
|
||
| MatcherTwoPhaseIterator(BinaryDocValues docValues, Predicate<BytesRef> predicate, float cost) { | ||
| super(docValues); | ||
| this.docValues = docValues; | ||
| this.predicate = predicate; | ||
| this.cost = cost; | ||
| } | ||
|
|
||
| @Override | ||
| public boolean matches() throws IOException { | ||
| return reader.match(docValues.binaryValue(), predicate); | ||
| } | ||
|
|
||
| @Override | ||
| public float matchCost() { | ||
| return cost; | ||
| } | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,89 @@ | ||
| /* | ||
| * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
| * or more contributor license agreements. Licensed under the "Elastic License | ||
| * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side | ||
| * Public License v 1"; you may not use this file except in compliance with, at | ||
| * your election, the "Elastic License 2.0", the "GNU Affero General Public | ||
| * License v3.0 only", or the "Server Side Public License, v 1". | ||
| */ | ||
|
|
||
| package org.elasticsearch.lucene.queries; | ||
|
|
||
| import org.apache.lucene.index.Term; | ||
| import org.apache.lucene.search.WildcardQuery; | ||
| import org.apache.lucene.util.automaton.Automaton; | ||
| import org.apache.lucene.util.automaton.ByteRunAutomaton; | ||
| import org.apache.lucene.util.automaton.Operations; | ||
| import org.elasticsearch.common.lucene.search.AutomatonQueries; | ||
|
|
||
| import java.util.Objects; | ||
|
|
||
| /** | ||
| * A query that matches documents whose binary doc values for a specific field match a wildcard pattern. | ||
| * The equivalent of {@link org.elasticsearch.search.runtime.StringScriptFieldWildcardQuery}, but without the scripting overhead and | ||
| * just for binary doc values. | ||
| * <p> | ||
| * This implementation is slow, because it potentially scans binary doc values for each document. | ||
| */ | ||
| // TODO: create abstract class for binary doc values based automaton queries in follow up, in order to support regex and fuzzy queries. | ||
| public final class SlowCustomBinaryDocValuesWildcardQuery extends AbstractBinaryDocValuesQuery { | ||
|
|
||
| private final String pattern; | ||
| private final boolean caseInsensitive; | ||
|
|
||
| /** | ||
| * @param fieldName name of the field whose binary doc values are matched; must not be null | ||
| * @param pattern the wildcard pattern; must not be null | ||
| * @param caseInsensitive whether the pattern is compiled into a case-insensitive automaton | ||
| */ | ||
| public SlowCustomBinaryDocValuesWildcardQuery(String fieldName, String pattern, boolean caseInsensitive) { | ||
| this(fieldName, pattern, caseInsensitive, buildByteRunAutomaton(fieldName, pattern, caseInsensitive)); | ||
| } | ||
|
|
||
| // The automaton is compiled once by the public constructor and captured by the matcher lambda. | ||
| private SlowCustomBinaryDocValuesWildcardQuery(String fieldName, String pattern, boolean caseInsensitive, ByteRunAutomaton automaton) { | ||
| super(fieldName, value -> automaton.run(value.bytes, value.offset, value.length)); | ||
| this.pattern = Objects.requireNonNull(pattern); | ||
| this.caseInsensitive = caseInsensitive; | ||
| } | ||
|
|
||
| /** | ||
| * Compiles the wildcard pattern into a {@link ByteRunAutomaton}, using the case-insensitive automaton | ||
| * builder when {@code caseInsensitive} is set and Lucene's {@link WildcardQuery#toAutomaton} otherwise. | ||
| */ | ||
| private static ByteRunAutomaton buildByteRunAutomaton(String fieldName, String pattern, boolean caseInsensitive) { | ||
| Term term = new Term(Objects.requireNonNull(fieldName), Objects.requireNonNull(pattern)); | ||
| Automaton automaton; | ||
| if (caseInsensitive) { | ||
| automaton = AutomatonQueries.toCaseInsensitiveWildcardAutomaton(term); | ||
| } else { | ||
| automaton = WildcardQuery.toAutomaton(term, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); | ||
| } | ||
| return new ByteRunAutomaton(automaton); | ||
| } | ||
|
|
||
| @Override | ||
| protected float matchCost() { | ||
| return 1000f; // This is just expensive, not sure what the actual cost is. | ||
| } | ||
|
|
||
| /** | ||
| * Renders this query for diagnostics. The {@code field} argument is the default field chosen by the caller | ||
| * (empty for a plain {@code toString()}), not the field this query targets, so the query's own | ||
| * {@code fieldName} is printed instead. | ||
| */ | ||
| @Override | ||
| public String toString(String field) { | ||
| return "SlowCustomBinaryDocValuesWildcardQuery(fieldName=" | ||
| + fieldName | ||
| + ",pattern=" | ||
| + pattern | ||
| + ",caseInsensitive=" | ||
| + caseInsensitive | ||
| + ")"; | ||
| } | ||
|
|
||
| @Override | ||
| public boolean equals(Object o) { | ||
| if (this == o) { | ||
| return true; | ||
| } | ||
| if (sameClassAs(o) == false) { | ||
| return false; | ||
| } | ||
| SlowCustomBinaryDocValuesWildcardQuery that = (SlowCustomBinaryDocValuesWildcardQuery) o; | ||
| return Objects.equals(fieldName, that.fieldName) | ||
| && Objects.equals(pattern, that.pattern) | ||
| && caseInsensitive == that.caseInsensitive; | ||
| } | ||
|
|
||
| @Override | ||
| public int hashCode() { | ||
| return Objects.hash(classHash(), fieldName, pattern, caseInsensitive); | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -54,6 +54,8 @@ | |
| import org.elasticsearch.index.analysis.TokenizerFactory; | ||
| import org.elasticsearch.index.mapper.KeywordFieldMapper.KeywordFieldType; | ||
| import org.elasticsearch.index.mapper.MappedFieldType.Relation; | ||
| import org.elasticsearch.lucene.queries.SlowCustomBinaryDocValuesTermQuery; | ||
| import org.elasticsearch.lucene.queries.SlowCustomBinaryDocValuesWildcardQuery; | ||
| import org.elasticsearch.script.ScriptCompiler; | ||
|
|
||
| import java.io.IOException; | ||
|
|
@@ -97,6 +99,20 @@ public void testTermQuery() { | |
| assertEquals("Cannot search on field [field] since it is not indexed nor has doc values.", e.getMessage()); | ||
| } | ||
|
|
||
| public void testTermQueryHighCardinality() { | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not related to the change, but this was missing from the previous change. |
||
| // A doc-values-only keyword field built with high-cardinality doc values is expected to | ||
| // answer term queries with the slow binary doc values term query. | ||
| KeywordFieldMapper.Builder highCardinalityBuilder = new KeywordFieldMapper.Builder("field", defaultIndexSettings()); | ||
| highCardinalityBuilder.docValues(FieldMapper.DocValuesParameter.Values.Cardinality.HIGH); | ||
| MappedFieldType fieldType = new KeywordFieldType( | ||
| "field", | ||
| IndexType.docValuesOnly(), | ||
| TextSearchInfo.SIMPLE_MATCH_ONLY, | ||
| null, | ||
| highCardinalityBuilder, | ||
| true | ||
| ); | ||
| assertEquals( | ||
| new SlowCustomBinaryDocValuesTermQuery("field", new BytesRef("foo")), | ||
| fieldType.termQuery("foo", MOCK_CONTEXT) | ||
| ); | ||
| } | ||
|
|
||
| public void testTermQueryWithNormalizer() { | ||
| Analyzer normalizer = new Analyzer() { | ||
| @Override | ||
|
|
@@ -177,6 +193,20 @@ public void testRangeQuery() { | |
| ); | ||
| } | ||
|
|
||
| public void testWildcardQueryHighCardinality() { | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note that |
||
| // A doc-values-only keyword field built with high-cardinality doc values is expected to | ||
| // answer wildcard queries with the slow binary doc values wildcard query (case sensitive here). | ||
| KeywordFieldMapper.Builder highCardinalityBuilder = new KeywordFieldMapper.Builder("field", defaultIndexSettings()); | ||
| highCardinalityBuilder.docValues(FieldMapper.DocValuesParameter.Values.Cardinality.HIGH); | ||
| MappedFieldType fieldType = new KeywordFieldType( | ||
| "field", | ||
| IndexType.docValuesOnly(), | ||
| TextSearchInfo.SIMPLE_MATCH_ONLY, | ||
| null, | ||
| highCardinalityBuilder, | ||
| true | ||
| ); | ||
| assertEquals( | ||
| new SlowCustomBinaryDocValuesWildcardQuery("field", "foo*", false), | ||
| fieldType.wildcardQuery("foo*", null, MOCK_CONTEXT) | ||
| ); | ||
| } | ||
|
|
||
| public void testRegexpQuery() { | ||
| MappedFieldType ft = new KeywordFieldType("field"); | ||
| assertEquals(new RegexpQuery(new Term("field", "foo.*")), ft.regexpQuery("foo.*", 0, 0, 10, null, MOCK_CONTEXT)); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
question: why not use
`SlowCustomBinaryDocValuesWildcardQuery` when case sensitive?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That is only used if
`storedInBinaryDocValues()` returns true, and if the code ends up here then we don't use binary doc values.
Currently the
`WildcardQuery(term, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT, MultiTermQuery.DOC_VALUES_REWRITE)` can't be used if `caseInsensitive` is true. I do think this is possible, but the `CaseInsensitiveWildcardQuery` needs to be extended to work with doc values rewrite?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am asking this since I see in
`buildByteRunAutomaton` the automaton is built based on the `caseInsensitive` flag, which suggests this query can handle both cases based on the boolean value of `caseInsensitive`. This pattern could potentially be applied to the `SortedSetDocValues` path as well, resulting in just 2 query classes (one per doc values format), each taking a `caseInsensitive` boolean, rather than 4 separate code paths. But maybe this is what you are planning to do as a follow-up.