Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
import org.elasticsearch.index.query.SearchExecutionContext;
import org.elasticsearch.index.similarity.SimilarityProvider;
import org.elasticsearch.lucene.queries.SlowCustomBinaryDocValuesTermQuery;
import org.elasticsearch.lucene.queries.SlowCustomBinaryDocValuesWildcardQuery;
import org.elasticsearch.script.Script;
import org.elasticsearch.script.ScriptCompiler;
import org.elasticsearch.script.SortedBinaryDocValuesStringFieldScript;
Expand Down Expand Up @@ -1080,15 +1081,16 @@ public Query wildcardQuery(
value = indexedValueForSearch(value).utf8ToString();
}

if (caseInsensitive == false && storedInBinaryDocValues() == false) {
if (storedInBinaryDocValues()) {
return new SlowCustomBinaryDocValuesWildcardQuery(name(), value, caseInsensitive);
}

if (caseInsensitive == false) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: why not use SlowCustomBinaryDocValuesWildcardQuery when the query is case sensitive?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is only used if storedInBinaryDocValues() returns true; if the code ends up here, then we don't use binary doc values.

Currently the WildcardQuery(term, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT, MultiTermQuery.DOC_VALUES_REWRITE) can't be used if caseInsensitive is true. I do think this is possible, but the CaseInsensitiveWildcardQuery needs to be extended to work with doc values rewrite?

Copy link
Copy Markdown
Contributor

@salvatore-campagna salvatore-campagna Dec 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am asking this since I see in buildByteRunAutomaton the automaton is built based on the caseInsensitive flag which suggests this query can handle both based on the boolean value of caseInsensitive. This pattern could potentially be applied to the SortedSetDocValues path as well resulting in just 2 query classes (one per doc values format), each taking a caseInsensitive boolean, rather than 4 separate code paths. But maybe this is what you are planning to do as a followup.

Term term = new Term(name(), value);
return new WildcardQuery(term, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT, MultiTermQuery.DOC_VALUES_REWRITE);
}

StringFieldScript.LeafFactory leafFactory = storedInBinaryDocValues()
? ctx -> new SortedBinaryDocValuesStringFieldScript(name(), context.lookup(), ctx)
: ctx -> new SortedSetDocValuesStringFieldScript(name(), context.lookup(), ctx);

StringFieldScript.LeafFactory leafFactory = ctx -> new SortedSetDocValuesStringFieldScript(name(), context.lookup(), ctx);
return new StringScriptFieldWildcardQuery(new Script(""), leafFactory, name(), value, caseInsensitive);
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.lucene.queries;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.ScorerSupplier;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.index.mapper.blockloader.docvalues.CustomBinaryDocValuesReader;

import java.io.IOException;
import java.util.Objects;
import java.util.function.Predicate;

/**
 * Base class for constant-score queries that decide per document whether it matches by handing
 * the document's raw binary doc value and a {@link Predicate} to a {@link CustomBinaryDocValuesReader}.
 * <p>
 * Subclasses provide the predicate at construction time and report how expensive a single
 * match attempt is through {@link #matchCost()}.
 */
abstract class AbstractBinaryDocValuesQuery extends Query {

    /** Name of the field whose binary doc values are inspected. */
    final String fieldName;
    /** Predicate passed to the doc values reader together with each document's binary value. */
    final Predicate<BytesRef> matcher;

    /**
     * @param fieldName the field to read binary doc values from; must not be {@code null}
     * @param matcher   the predicate used for matching; must not be {@code null}
     * @throws NullPointerException if either argument is {@code null}
     */
    AbstractBinaryDocValuesQuery(String fieldName, Predicate<BytesRef> matcher) {
        this.fieldName = Objects.requireNonNull(fieldName);
        this.matcher = Objects.requireNonNull(matcher);
    }

    /**
     * Creates a constant-score weight whose scorers verify candidates in a two-phase iteration
     * over the field's binary doc values.
     */
    @Override
    public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
        // Ask the subclass for its cost once per weight; every two-phase iterator below reports this value.
        final float perMatchCost = matchCost();

        return new ConstantScoreWeight(this, boost) {

            @Override
            public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
                final BinaryDocValues docValues = context.reader().getBinaryDocValues(fieldName);
                if (docValues == null) {
                    // No binary doc values for this field in this segment, so there is no scorer.
                    return null;
                }

                final TwoPhaseIterator twoPhase = new TwoPhaseIterator(docValues) {

                    final CustomBinaryDocValuesReader valuesReader = new CustomBinaryDocValuesReader();

                    @Override
                    public boolean matches() throws IOException {
                        // The reader receives the current document's raw value and the query's predicate.
                        return valuesReader.match(docValues.binaryValue(), matcher);
                    }

                    @Override
                    public float matchCost() {
                        return perMatchCost;
                    }
                };

                final ConstantScoreScorer scorer = new ConstantScoreScorer(score(), scoreMode, twoPhase);
                return new DefaultScorerSupplier(scorer);
            }

            @Override
            public boolean isCacheable(LeafReaderContext ctx) {
                return DocValues.isCacheable(ctx, fieldName);
            }
        };
    }

    /**
     * @return the cost reported by the two-phase iterator for a single {@code matches()} call
     */
    protected abstract float matchCost();

    @Override
    public void visit(QueryVisitor visitor) {
        if (visitor.acceptField(fieldName) == false) {
            return;
        }
        visitor.visitLeaf(this);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,8 @@

package org.elasticsearch.lucene.queries;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.ScorerSupplier;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.index.mapper.blockloader.docvalues.CustomBinaryDocValuesReader;

import java.io.IOException;
import java.util.Objects;

/**
Expand All @@ -34,65 +20,25 @@
* <p>
* This implementation is slow, because it potentially scans binary doc values for each document.
*/
public final class SlowCustomBinaryDocValuesTermQuery extends Query {
public final class SlowCustomBinaryDocValuesTermQuery extends AbstractBinaryDocValuesQuery {

private final String fieldName;
private final BytesRef term;

public SlowCustomBinaryDocValuesTermQuery(String fieldName, BytesRef term) {
this.fieldName = Objects.requireNonNull(fieldName);
super(fieldName, term::equals);
this.term = Objects.requireNonNull(term);
}

@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
return new ConstantScoreWeight(this, boost) {

@Override
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
final BinaryDocValues values = context.reader().getBinaryDocValues(fieldName);
if (values == null) {
return null;
}

final TwoPhaseIterator iterator = new TwoPhaseIterator(values) {

final CustomBinaryDocValuesReader reader = new CustomBinaryDocValuesReader();

@Override
public boolean matches() throws IOException {
BytesRef binaryValue = values.binaryValue();
return reader.match(binaryValue, term::equals);
}

@Override
public float matchCost() {
return 10; // because one comparison
}
};

return new DefaultScorerSupplier(new ConstantScoreScorer(score(), scoreMode, iterator));
}

@Override
public boolean isCacheable(LeafReaderContext ctx) {
return DocValues.isCacheable(ctx, fieldName);
}
};
protected float matchCost() {
return 10; // because one comparison
}

/**
 * Renders this query for debugging and logging.
 * <p>
 * The {@code field} argument is the caller's default field, which callers such as
 * {@code Query#toString()} pass as an empty string. It is therefore not used as this query's
 * field name; the query's own {@code fieldName} is printed instead.
 *
 * @param field the default field assumed by the caller; not used in the output
 * @return a description containing the field name and the term as a UTF-8 string
 */
@Override
public String toString(String field) {
return "SlowCustomBinaryDocValuesTermQuery(fieldName=" + fieldName + ",term=" + term.utf8ToString() + ")";
}

@Override
public void visit(QueryVisitor visitor) {
if (visitor.acceptField(fieldName)) {
visitor.visitLeaf(this);
}
}

@Override
public boolean equals(Object o) {
if (this == o) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.lucene.queries;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.common.lucene.search.AutomatonQueries;

import java.util.Objects;

/**
 * A query that matches documents by running a wildcard pattern, optionally case-insensitively,
 * against the binary doc values of a specific field.
 * The equivalent of {@link org.elasticsearch.search.runtime.StringScriptFieldWildcardQuery}, but without the scripting overhead and
 * only for binary doc values.
 * <p>
 * This implementation is slow, because it potentially scans binary doc values for each document.
 */
// TODO: create abstract class for binary doc values based automaton queries in follow up, in order to support regex and fuzzy queries.
public final class SlowCustomBinaryDocValuesWildcardQuery extends AbstractBinaryDocValuesQuery {

    private final String pattern;
    private final boolean caseInsensitive;

    /**
     * @param fieldName       the field whose binary doc values are matched; must not be {@code null}
     * @param pattern         the wildcard pattern; must not be {@code null}
     * @param caseInsensitive whether the pattern is compiled into a case-insensitive automaton
     * @throws NullPointerException if {@code fieldName} or {@code pattern} is {@code null}
     */
    public SlowCustomBinaryDocValuesWildcardQuery(String fieldName, String pattern, boolean caseInsensitive) {
        this(fieldName, pattern, caseInsensitive, buildByteRunAutomaton(fieldName, pattern, caseInsensitive));
    }

    // The automaton is built before delegating here so the matcher lambda handed to super() can capture it.
    private SlowCustomBinaryDocValuesWildcardQuery(String fieldName, String pattern, boolean caseInsensitive, ByteRunAutomaton automaton) {
        super(fieldName, value -> automaton.run(value.bytes, value.offset, value.length));
        this.pattern = Objects.requireNonNull(pattern);
        this.caseInsensitive = caseInsensitive;
    }

    /**
     * Compiles the wildcard pattern into a byte-level run automaton, choosing the case-insensitive
     * or the regular wildcard automaton depending on {@code caseInsensitive}.
     */
    private static ByteRunAutomaton buildByteRunAutomaton(String fieldName, String pattern, boolean caseInsensitive) {
        Term term = new Term(Objects.requireNonNull(fieldName), Objects.requireNonNull(pattern));
        Automaton automaton;
        if (caseInsensitive) {
            automaton = AutomatonQueries.toCaseInsensitiveWildcardAutomaton(term);
        } else {
            automaton = WildcardQuery.toAutomaton(term, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
        }
        return new ByteRunAutomaton(automaton);
    }

    @Override
    protected float matchCost() {
        return 1000f; // This is just expensive, not sure what the actual cost is.
    }

    /**
     * Renders this query for debugging and logging.
     *
     * @param field the default field assumed by the caller; not used in the output
     * @return a description containing the field name, pattern and case-sensitivity flag
     */
    @Override
    public String toString(String field) {
        // Print this query's own field: the 'field' argument is only the caller's default field
        // (an empty string when coming from Query#toString()), not the field being searched.
        return "SlowCustomBinaryDocValuesWildcardQuery(fieldName="
            + fieldName
            + ",pattern="
            + pattern
            + ",caseInsensitive="
            + caseInsensitive
            + ")";
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (sameClassAs(o) == false) {
            return false;
        }
        SlowCustomBinaryDocValuesWildcardQuery that = (SlowCustomBinaryDocValuesWildcardQuery) o;
        return Objects.equals(fieldName, that.fieldName)
            && Objects.equals(pattern, that.pattern)
            && caseInsensitive == that.caseInsensitive;
    }

    @Override
    public int hashCode() {
        return Objects.hash(classHash(), fieldName, pattern, caseInsensitive);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.index.mapper.KeywordFieldMapper.KeywordFieldType;
import org.elasticsearch.index.mapper.MappedFieldType.Relation;
import org.elasticsearch.lucene.queries.SlowCustomBinaryDocValuesTermQuery;
import org.elasticsearch.lucene.queries.SlowCustomBinaryDocValuesWildcardQuery;
import org.elasticsearch.script.ScriptCompiler;

import java.io.IOException;
Expand Down Expand Up @@ -97,6 +99,20 @@ public void testTermQuery() {
assertEquals("Cannot search on field [field] since it is not indexed nor has doc values.", e.getMessage());
}

public void testTermQueryHighCardinality() {
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not related to the change, but this was missing from the previous change.

KeywordFieldMapper.Builder builder = new KeywordFieldMapper.Builder("field", defaultIndexSettings());
builder.docValues(FieldMapper.DocValuesParameter.Values.Cardinality.HIGH);
MappedFieldType ft = new KeywordFieldType(
"field",
IndexType.docValuesOnly(),
TextSearchInfo.SIMPLE_MATCH_ONLY,
null,
builder,
true
);
assertEquals(new SlowCustomBinaryDocValuesTermQuery("field", new BytesRef("foo")), ft.termQuery("foo", MOCK_CONTEXT));
}

public void testTermQueryWithNormalizer() {
Analyzer normalizer = new Analyzer() {
@Override
Expand Down Expand Up @@ -177,6 +193,20 @@ public void testRangeQuery() {
);
}

public void testWildcardQueryHighCardinality() {
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that 395_binary_doc_values_search.yml handles integration testing for this change, but this unit test just checks that we use the right lucene query if cardinality is set to high.

KeywordFieldMapper.Builder builder = new KeywordFieldMapper.Builder("field", defaultIndexSettings());
builder.docValues(FieldMapper.DocValuesParameter.Values.Cardinality.HIGH);
MappedFieldType ft = new KeywordFieldType(
"field",
IndexType.docValuesOnly(),
TextSearchInfo.SIMPLE_MATCH_ONLY,
null,
builder,
true
);
assertEquals(new SlowCustomBinaryDocValuesWildcardQuery("field", "foo*", false), ft.wildcardQuery("foo*", null, MOCK_CONTEXT));
}

public void testRegexpQuery() {
MappedFieldType ft = new KeywordFieldType("field");
assertEquals(new RegexpQuery(new Term("field", "foo.*")), ft.regexpQuery("foo.*", 0, 0, 10, null, MOCK_CONTEXT));
Expand Down
Loading