diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java index 3bf030bca1096..1a45836848aca 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java @@ -72,6 +72,7 @@ import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.index.similarity.SimilarityProvider; import org.elasticsearch.lucene.queries.SlowCustomBinaryDocValuesTermQuery; +import org.elasticsearch.lucene.queries.SlowCustomBinaryDocValuesWildcardQuery; import org.elasticsearch.script.Script; import org.elasticsearch.script.ScriptCompiler; import org.elasticsearch.script.SortedBinaryDocValuesStringFieldScript; @@ -1080,15 +1081,16 @@ public Query wildcardQuery( value = indexedValueForSearch(value).utf8ToString(); } - if (caseInsensitive == false && storedInBinaryDocValues() == false) { + if (storedInBinaryDocValues()) { + return new SlowCustomBinaryDocValuesWildcardQuery(name(), value, caseInsensitive); + } + + if (caseInsensitive == false) { Term term = new Term(name(), value); return new WildcardQuery(term, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT, MultiTermQuery.DOC_VALUES_REWRITE); } - StringFieldScript.LeafFactory leafFactory = storedInBinaryDocValues() - ? 
ctx -> new SortedBinaryDocValuesStringFieldScript(name(), context.lookup(), ctx) - : ctx -> new SortedSetDocValuesStringFieldScript(name(), context.lookup(), ctx); - + StringFieldScript.LeafFactory leafFactory = ctx -> new SortedSetDocValuesStringFieldScript(name(), context.lookup(), ctx); return new StringScriptFieldWildcardQuery(new Script(""), leafFactory, name(), value, caseInsensitive); } } diff --git a/server/src/main/java/org/elasticsearch/lucene/queries/AbstractBinaryDocValuesQuery.java b/server/src/main/java/org/elasticsearch/lucene/queries/AbstractBinaryDocValuesQuery.java new file mode 100644 index 0000000000000..5ada4f8315ab7 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/lucene/queries/AbstractBinaryDocValuesQuery.java @@ -0,0 +1,87 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.lucene.queries; + +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.ConstantScoreScorer; +import org.apache.lucene.search.ConstantScoreWeight; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryVisitor; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.ScorerSupplier; +import org.apache.lucene.search.TwoPhaseIterator; +import org.apache.lucene.search.Weight; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.index.mapper.blockloader.docvalues.CustomBinaryDocValuesReader; + +import java.io.IOException; +import java.util.Objects; +import java.util.function.Predicate; + +abstract class AbstractBinaryDocValuesQuery extends Query { + + final String fieldName; + final Predicate matcher; + + AbstractBinaryDocValuesQuery(String fieldName, Predicate matcher) { + this.fieldName = Objects.requireNonNull(fieldName); + this.matcher = Objects.requireNonNull(matcher); + } + + @Override + public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { + float matchCost = matchCost(); + return new ConstantScoreWeight(this, boost) { + + @Override + public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException { + final BinaryDocValues values = context.reader().getBinaryDocValues(fieldName); + if (values == null) { + return null; + } + + final TwoPhaseIterator iterator = new TwoPhaseIterator(values) { + + final CustomBinaryDocValuesReader reader = new CustomBinaryDocValuesReader(); + + @Override + public boolean matches() throws IOException { + BytesRef binaryValue = values.binaryValue(); + return reader.match(binaryValue, matcher); + } + + @Override + public float matchCost() { + return matchCost; + } + }; + + return new DefaultScorerSupplier(new 
ConstantScoreScorer(score(), scoreMode, iterator)); + } + + @Override + public boolean isCacheable(LeafReaderContext ctx) { + return DocValues.isCacheable(ctx, fieldName); + } + }; + } + + protected abstract float matchCost(); + + @Override + public void visit(QueryVisitor visitor) { + if (visitor.acceptField(fieldName)) { + visitor.visitLeaf(this); + } + } +} diff --git a/server/src/main/java/org/elasticsearch/lucene/queries/SlowCustomBinaryDocValuesTermQuery.java b/server/src/main/java/org/elasticsearch/lucene/queries/SlowCustomBinaryDocValuesTermQuery.java index 901046d92713e..8fcbc3b14b550 100644 --- a/server/src/main/java/org/elasticsearch/lucene/queries/SlowCustomBinaryDocValuesTermQuery.java +++ b/server/src/main/java/org/elasticsearch/lucene/queries/SlowCustomBinaryDocValuesTermQuery.java @@ -9,22 +9,8 @@ package org.elasticsearch.lucene.queries; -import org.apache.lucene.index.BinaryDocValues; -import org.apache.lucene.index.DocValues; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.ConstantScoreScorer; -import org.apache.lucene.search.ConstantScoreWeight; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.QueryVisitor; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.search.ScorerSupplier; -import org.apache.lucene.search.TwoPhaseIterator; -import org.apache.lucene.search.Weight; import org.apache.lucene.util.BytesRef; -import org.elasticsearch.index.mapper.blockloader.docvalues.CustomBinaryDocValuesReader; -import java.io.IOException; import java.util.Objects; /** @@ -34,51 +20,18 @@ *

* This implementation is slow, because it potentially scans binary doc values for each document. */ -public final class SlowCustomBinaryDocValuesTermQuery extends Query { +public final class SlowCustomBinaryDocValuesTermQuery extends AbstractBinaryDocValuesQuery { - private final String fieldName; private final BytesRef term; public SlowCustomBinaryDocValuesTermQuery(String fieldName, BytesRef term) { - this.fieldName = Objects.requireNonNull(fieldName); + super(fieldName, term::equals); this.term = Objects.requireNonNull(term); } @Override - public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - return new ConstantScoreWeight(this, boost) { - - @Override - public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException { - final BinaryDocValues values = context.reader().getBinaryDocValues(fieldName); - if (values == null) { - return null; - } - - final TwoPhaseIterator iterator = new TwoPhaseIterator(values) { - - final CustomBinaryDocValuesReader reader = new CustomBinaryDocValuesReader(); - - @Override - public boolean matches() throws IOException { - BytesRef binaryValue = values.binaryValue(); - return reader.match(binaryValue, term::equals); - } - - @Override - public float matchCost() { - return 10; // because one comparison - } - }; - - return new DefaultScorerSupplier(new ConstantScoreScorer(score(), scoreMode, iterator)); - } - - @Override - public boolean isCacheable(LeafReaderContext ctx) { - return DocValues.isCacheable(ctx, fieldName); - } - }; + protected float matchCost() { + return 10; // because one comparison } @Override @@ -86,13 +39,6 @@ public String toString(String field) { return "SlowCustomBinaryDocValuesTermQuery(fieldName=" + field + ",term=" + term.utf8ToString() + ")"; } - @Override - public void visit(QueryVisitor visitor) { - if (visitor.acceptField(fieldName)) { - visitor.visitLeaf(this); - } - } - @Override public boolean equals(Object o) { if (this == o) { diff 
--git a/server/src/main/java/org/elasticsearch/lucene/queries/SlowCustomBinaryDocValuesWildcardQuery.java b/server/src/main/java/org/elasticsearch/lucene/queries/SlowCustomBinaryDocValuesWildcardQuery.java new file mode 100644 index 0000000000000..0d215d7bf6063 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/lucene/queries/SlowCustomBinaryDocValuesWildcardQuery.java @@ -0,0 +1,89 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.lucene.queries; + +import org.apache.lucene.index.Term; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.ByteRunAutomaton; +import org.apache.lucene.util.automaton.Operations; +import org.elasticsearch.common.lucene.search.AutomatonQueries; + +import java.util.Objects; + +/** + * A query for matching the values of a specific field against a wildcard pattern, optionally case-insensitively. + * The equivalent of {@link org.elasticsearch.search.runtime.StringScriptFieldWildcardQuery}, but without the scripting overhead and + * just for binary doc values. + * <p>

+ * This implementation is slow, because it potentially scans binary doc values for each document. + */ +// TODO: create abstract class for binary doc values based automaton queries in follow up, in order to support regex and fuzzy queries. +public final class SlowCustomBinaryDocValuesWildcardQuery extends AbstractBinaryDocValuesQuery { + + private final String pattern; + private final boolean caseInsensitive; + + public SlowCustomBinaryDocValuesWildcardQuery(String fieldName, String pattern, boolean caseInsensitive) { + this(fieldName, pattern, caseInsensitive, buildByteRunAutomaton(fieldName, pattern, caseInsensitive)); + } + + private SlowCustomBinaryDocValuesWildcardQuery(String fieldName, String pattern, boolean caseInsensitive, ByteRunAutomaton automaton) { + super(fieldName, value -> automaton.run(value.bytes, value.offset, value.length)); + this.pattern = Objects.requireNonNull(pattern); + this.caseInsensitive = caseInsensitive; + } + + private static ByteRunAutomaton buildByteRunAutomaton(String fieldName, String pattern, boolean caseInsensitive) { + Term term = new Term(Objects.requireNonNull(fieldName), Objects.requireNonNull(pattern)); + Automaton automaton; + if (caseInsensitive) { + automaton = AutomatonQueries.toCaseInsensitiveWildcardAutomaton(term); + } else { + automaton = WildcardQuery.toAutomaton(term, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); + } + return new ByteRunAutomaton(automaton); + } + + @Override + protected float matchCost() { + return 1000f; // This is just expensive, not sure what the actual cost is. 
+ } + + @Override + public String toString(String field) { + return "SlowCustomBinaryDocValuesWildcardQuery(fieldName=" + + field + + ",pattern=" + + pattern + + ",caseInsensitive=" + + caseInsensitive + + ")"; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (sameClassAs(o) == false) { + return false; + } + SlowCustomBinaryDocValuesWildcardQuery that = (SlowCustomBinaryDocValuesWildcardQuery) o; + return Objects.equals(fieldName, that.fieldName) + && Objects.equals(pattern, that.pattern) + && caseInsensitive == that.caseInsensitive; + } + + @Override + public int hashCode() { + return Objects.hash(classHash(), fieldName, pattern, caseInsensitive); + } +} diff --git a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java index 8c2f19d02e046..1afe72d222f1c 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java @@ -54,6 +54,8 @@ import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.index.mapper.KeywordFieldMapper.KeywordFieldType; import org.elasticsearch.index.mapper.MappedFieldType.Relation; +import org.elasticsearch.lucene.queries.SlowCustomBinaryDocValuesTermQuery; +import org.elasticsearch.lucene.queries.SlowCustomBinaryDocValuesWildcardQuery; import org.elasticsearch.script.ScriptCompiler; import java.io.IOException; @@ -97,6 +99,20 @@ public void testTermQuery() { assertEquals("Cannot search on field [field] since it is not indexed nor has doc values.", e.getMessage()); } + public void testTermQueryHighCardinality() { + KeywordFieldMapper.Builder builder = new KeywordFieldMapper.Builder("field", defaultIndexSettings()); + builder.docValues(FieldMapper.DocValuesParameter.Values.Cardinality.HIGH); + MappedFieldType ft = new KeywordFieldType( + "field", + 
IndexType.docValuesOnly(), + TextSearchInfo.SIMPLE_MATCH_ONLY, + null, + builder, + true + ); + assertEquals(new SlowCustomBinaryDocValuesTermQuery("field", new BytesRef("foo")), ft.termQuery("foo", MOCK_CONTEXT)); + } + public void testTermQueryWithNormalizer() { Analyzer normalizer = new Analyzer() { @Override @@ -177,6 +193,20 @@ public void testRangeQuery() { ); } + public void testWildcardQueryHighCardinality() { + KeywordFieldMapper.Builder builder = new KeywordFieldMapper.Builder("field", defaultIndexSettings()); + builder.docValues(FieldMapper.DocValuesParameter.Values.Cardinality.HIGH); + MappedFieldType ft = new KeywordFieldType( + "field", + IndexType.docValuesOnly(), + TextSearchInfo.SIMPLE_MATCH_ONLY, + null, + builder, + true + ); + assertEquals(new SlowCustomBinaryDocValuesWildcardQuery("field", "foo*", false), ft.wildcardQuery("foo*", null, MOCK_CONTEXT)); + } + public void testRegexpQuery() { MappedFieldType ft = new KeywordFieldType("field"); assertEquals(new RegexpQuery(new Term("field", "foo.*")), ft.regexpQuery("foo.*", 0, 0, 10, null, MOCK_CONTEXT)); diff --git a/server/src/test/java/org/elasticsearch/lucene/queries/SlowCustomBinaryDocValuesWildcardQueryTests.java b/server/src/test/java/org/elasticsearch/lucene/queries/SlowCustomBinaryDocValuesWildcardQueryTests.java new file mode 100644 index 0000000000000..5cf0e9f389f43 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/lucene/queries/SlowCustomBinaryDocValuesWildcardQueryTests.java @@ -0,0 +1,164 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ +package org.elasticsearch.lucene.queries; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.Operations; +import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat; +import org.elasticsearch.index.mapper.BinaryFieldMapper; +import org.elasticsearch.test.ESTestCase; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; + +public class SlowCustomBinaryDocValuesWildcardQueryTests extends ESTestCase { + + public void testBasics() throws Exception { + String fieldName = "field"; + try (Directory dir = newDirectory()) { + Map expectedCounts = new HashMap<>(); + expectedCounts.put("a", 2L); + expectedCounts.put("b", 5L); + expectedCounts.put("c", 1L); + expectedCounts.put("d", 3L); + expectedCounts.put("e", 10L); + try (RandomIndexWriter writer = newRandomIndexWriter(dir)) { + for (var entry : expectedCounts.entrySet()) { + for (int i = 0; i < entry.getValue(); i++) { + Document document = new Document(); + var field = new BinaryFieldMapper.CustomBinaryDocValuesField( + "field", + entry.getKey().getBytes(StandardCharsets.UTF_8) + ); + if (randomBoolean()) { + field.add("z".getBytes(StandardCharsets.UTF_8)); + } + 
document.add(field); + writer.addDocument(document); + } + } + + // search + try (IndexReader reader = writer.getReader()) { + IndexSearcher searcher = newSearcher(reader); + for (var entry : expectedCounts.entrySet()) { + long count = searcher.count(new SlowCustomBinaryDocValuesWildcardQuery(fieldName, entry.getKey() + "*", false)); + assertEquals(entry.getValue().longValue(), count); + } + } + } + } + } + + public void testNoField() throws IOException { + String fieldName = "field"; + + // no field in index + try (Directory dir = newDirectory()) { + try (RandomIndexWriter writer = newRandomIndexWriter(dir)) { + writer.addDocument(new Document()); + try (IndexReader reader = writer.getReader()) { + IndexSearcher searcher = newSearcher(reader); + Query query = new SlowCustomBinaryDocValuesWildcardQuery(fieldName, "a*", false); + assertEquals(0, searcher.count(query)); + } + } + } + + // no field in segment + try (Directory dir = newDirectory()) { + try (RandomIndexWriter writer = newRandomIndexWriter(dir)) { + Document document = new Document(); + document.add(new BinaryFieldMapper.CustomBinaryDocValuesField("field", "a".getBytes(StandardCharsets.UTF_8))); + writer.addDocument(document); + writer.commit(); + writer.addDocument(new Document()); + try (IndexReader reader = writer.getReader()) { + IndexSearcher searcher = newSearcher(reader); + Query query = new SlowCustomBinaryDocValuesWildcardQuery(fieldName, "a*", false); + assertEquals(1, searcher.count(query)); + } + } + } + } + + public void testAgainstWildcardQuery() throws IOException { + List randomValues = randomList(8, 32, () -> randomAlphaOfLength(8)); + try (Directory dir = newDirectory()) { + try (RandomIndexWriter writer = newRandomIndexWriter(dir)) { + for (String randomValue : randomValues) { + Document document = new Document(); + document.add(new SortedSetDocValuesField("baseline_field", new BytesRef(randomValue))); + var binaryDVField = new BinaryFieldMapper.CustomBinaryDocValuesField( + 
"contender_field", + randomValue.getBytes(StandardCharsets.UTF_8) + ); + document.add(binaryDVField); + if (randomBoolean()) { + String extraRandomValue = randomFrom(randomValues); + binaryDVField.add(extraRandomValue.getBytes(StandardCharsets.UTF_8)); + document.add(new SortedSetDocValuesField("baseline_field", new BytesRef(extraRandomValue))); + } + writer.addDocument(document); + } + + try (IndexReader reader = writer.getReader()) { + String randomWildcard = randomFrom(randomValues).substring(0, 2) + "*"; + IndexSearcher searcher = newSearcher(reader); + + Query baselineQuery = new WildcardQuery( + new Term("baseline_field", randomWildcard), + Operations.DEFAULT_DETERMINIZE_WORK_LIMIT, + MultiTermQuery.DOC_VALUES_REWRITE + ); + TopDocs baselineResults = searcher.search(baselineQuery, 32); + + Query contenderQuery = new SlowCustomBinaryDocValuesWildcardQuery("contender_field", randomWildcard, false); + TopDocs contenderResults = searcher.search(contenderQuery, 32); + + assertThat(contenderResults.totalHits, equalTo(baselineResults.totalHits)); + assertThat(baselineResults.scoreDocs.length, greaterThanOrEqualTo(1)); + assertThat(baselineResults.scoreDocs.length, equalTo(contenderResults.scoreDocs.length)); + for (int i = 0; i < baselineResults.scoreDocs.length; i++) { + assertThat(baselineResults.scoreDocs[i].doc, equalTo(contenderResults.scoreDocs[i].doc)); + assertThat(baselineResults.scoreDocs[i].score, equalTo(contenderResults.scoreDocs[i].score)); + } + } + } + } + } + + private static RandomIndexWriter newRandomIndexWriter(Directory dir) throws IOException { + IndexWriterConfig iwc = newIndexWriterConfig(); + if (randomBoolean()) { + iwc.setCodec(TestUtil.alwaysDocValuesFormat(new ES819TSDBDocValuesFormat())); + } + return new RandomIndexWriter(random(), dir, iwc); + } + +}