Skip to content

Commit 8ee9182

Browse files
authored
BlendedTermQuery should ignore fields that don't exist in the index (#41125)
Today the blended term query detects if a term exists in a field by looking at the term statistics in the index. However, the value used to indicate that a term has no occurrence in a field has changed in Lucene. A non-existing term now returns a doc frequency and a total term frequency of 0. Because of this discrepancy, the blended term query picks 0 as the minimum frequency for a term even if other fields have documents for this term. This confuses the term queries that the blending creates, since some of them contain a custom state that indicates a frequency of 0 even though the term has some occurrences in the field. For these terms an exception is thrown because the term query always checks that the term state's frequency is greater than 0 if there are documents associated with it. This change fixes this bug by ignoring terms with a doc freq of 0 when the blended term query picks the minimum term frequency among the requested fields. Closes #41118
1 parent 3c66cff commit 8ee9182

File tree

2 files changed

+64
-25
lines changed

2 files changed

+64
-25
lines changed

server/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java

Lines changed: 5 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -113,23 +113,17 @@ protected void blend(final TermStates[] contexts, int maxDoc, IndexReader reader
113113
// TODO: Maybe it could also make sense to assume independent distributions of documents and eg. have:
114114
// df = df1 + df2 - (df1 * df2 / maxDoc)?
115115
max = Math.max(df, max);
116-
if (minSumTTF != -1 && ctx.totalTermFreq() != -1) {
116+
if (ctx.totalTermFreq() > 0) {
117117
// we need to find out the minimum sumTTF to adjust the statistics
118118
// otherwise the statistics don't match
119119
minSumTTF = Math.min(minSumTTF, reader.getSumTotalTermFreq(terms[i].field()));
120-
} else {
121-
minSumTTF = -1;
122120
}
123121

124122
}
125-
if (minSumTTF != -1 && maxDoc > minSumTTF) {
126-
maxDoc = (int)minSumTTF;
127-
}
128-
129123
if (max == 0) {
130124
return; // we are done that term doesn't exist at all
131125
}
132-
long sumTTF = minSumTTF == -1 ? -1 : 0;
126+
long sumTTF = 0;
133127
final int[] tieBreak = new int[contexts.length];
134128
for (int i = 0; i < tieBreak.length; ++i) {
135129
tieBreak[i] = i;
@@ -165,29 +159,20 @@ protected int compare(int i, int j) {
165159
}
166160
contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(maxDoc, actualDf));
167161
prev = current;
168-
if (sumTTF >= 0 && ctx.totalTermFreq() >= 0) {
169-
sumTTF += ctx.totalTermFreq();
170-
} else {
171-
sumTTF = -1; // omit once TF is omitted anywhere!
172-
}
162+
sumTTF += ctx.totalTermFreq();
173163
}
174164
sumTTF = Math.min(sumTTF, minSumTTF);
175165
for (int i = 0; i < contexts.length; i++) {
176166
int df = contexts[i].docFreq();
177167
if (df == 0) {
178168
continue;
179169
}
180-
// the blended sumTTF can't be greater than the sumTTTF on the field
181-
final long fixedTTF = sumTTF == -1 ? -1 : sumTTF;
182-
contexts[i] = adjustTTF(reader.getContext(), contexts[i], fixedTTF);
170+
contexts[i] = adjustTTF(reader.getContext(), contexts[i], sumTTF);
183171
}
184172
}
185173

186174
private TermStates adjustTTF(IndexReaderContext readerContext, TermStates termContext, long sumTTF) throws IOException {
187175
assert termContext.wasBuiltFor(readerContext);
188-
if (sumTTF == -1 && termContext.totalTermFreq() == -1) {
189-
return termContext;
190-
}
191176
TermStates newTermContext = new TermStates(readerContext);
192177
List<LeafReaderContext> leaves = readerContext.leaves();
193178
final int len;
@@ -213,12 +198,7 @@ private TermStates adjustTTF(IndexReaderContext readerContext, TermStates termCo
213198
private static TermStates adjustDF(IndexReaderContext readerContext, TermStates ctx, int newDocFreq) throws IOException {
214199
assert ctx.wasBuiltFor(readerContext);
215200
// Use a value of ttf that is consistent with the doc freq (ie. gte)
216-
long newTTF;
217-
if (ctx.totalTermFreq() < 0) {
218-
newTTF = -1;
219-
} else {
220-
newTTF = Math.max(ctx.totalTermFreq(), newDocFreq);
221-
}
201+
long newTTF = Math.max(ctx.totalTermFreq(), newDocFreq);
222202
List<LeafReaderContext> leaves = readerContext.leaves();
223203
final int len;
224204
if (leaves == null) {

server/src/test/java/org/apache/lucene/queries/BlendedTermQueryTests.java

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,12 @@
2828
import org.apache.lucene.index.IndexWriter;
2929
import org.apache.lucene.index.MultiReader;
3030
import org.apache.lucene.index.Term;
31+
import org.apache.lucene.index.TermStates;
3132
import org.apache.lucene.search.BooleanClause;
3233
import org.apache.lucene.search.BooleanQuery;
3334
import org.apache.lucene.search.DisjunctionMaxQuery;
3435
import org.apache.lucene.search.IndexSearcher;
36+
import org.apache.lucene.search.Query;
3537
import org.apache.lucene.search.QueryUtils;
3638
import org.apache.lucene.search.ScoreDoc;
3739
import org.apache.lucene.search.ScoreMode;
@@ -52,6 +54,8 @@
5254

5355
import static org.hamcrest.Matchers.containsInAnyOrder;
5456
import static org.hamcrest.Matchers.equalTo;
57+
import static org.hamcrest.Matchers.greaterThan;
58+
import static org.hamcrest.Matchers.instanceOf;
5559

5660
public class BlendedTermQueryTests extends ESTestCase {
5761
public void testDismaxQuery() throws IOException {
@@ -114,6 +118,61 @@ public void testDismaxQuery() throws IOException {
114118
assertEquals(Integer.toString(1), reader.document(scoreDocs[0].doc).getField("id").stringValue());
115119

116120
}
121+
{
122+
// test with an unknown field
123+
String[] fields = new String[] {"username", "song", "unknown_field"};
124+
Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "foo"), 1.0f);
125+
Query rewrite = searcher.rewrite(query);
126+
assertThat(rewrite, instanceOf(BooleanQuery.class));
127+
for (BooleanClause clause : (BooleanQuery) rewrite) {
128+
assertThat(clause.getQuery(), instanceOf(TermQuery.class));
129+
TermQuery termQuery = (TermQuery) clause.getQuery();
130+
TermStates termStates = termQuery.getTermStates();
131+
if (termQuery.getTerm().field().equals("unknown_field")) {
132+
assertThat(termStates.docFreq(), equalTo(0));
133+
assertThat(termStates.totalTermFreq(), equalTo(0L));
134+
} else {
135+
assertThat(termStates.docFreq(), greaterThan(0));
136+
assertThat(termStates.totalTermFreq(), greaterThan(0L));
137+
}
138+
}
139+
assertThat(searcher.search(query, 10).totalHits.value, equalTo((long) iters + username.length));
140+
}
141+
{
142+
// test with an unknown field and an unknown term
143+
String[] fields = new String[] {"username", "song", "unknown_field"};
144+
Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "unknown_term"), 1.0f);
145+
Query rewrite = searcher.rewrite(query);
146+
assertThat(rewrite, instanceOf(BooleanQuery.class));
147+
for (BooleanClause clause : (BooleanQuery) rewrite) {
148+
assertThat(clause.getQuery(), instanceOf(TermQuery.class));
149+
TermQuery termQuery = (TermQuery) clause.getQuery();
150+
TermStates termStates = termQuery.getTermStates();
151+
assertThat(termStates.docFreq(), equalTo(0));
152+
assertThat(termStates.totalTermFreq(), equalTo(0L));
153+
}
154+
assertThat(searcher.search(query, 10).totalHits.value, equalTo(0L));
155+
}
156+
{
157+
// test with an unknown field and a term that is present in only one field
158+
String[] fields = new String[] {"username", "song", "id", "unknown_field"};
159+
Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "fan"), 1.0f);
160+
Query rewrite = searcher.rewrite(query);
161+
assertThat(rewrite, instanceOf(BooleanQuery.class));
162+
for (BooleanClause clause : (BooleanQuery) rewrite) {
163+
assertThat(clause.getQuery(), instanceOf(TermQuery.class));
164+
TermQuery termQuery = (TermQuery) clause.getQuery();
165+
TermStates termStates = termQuery.getTermStates();
166+
if (termQuery.getTerm().field().equals("username")) {
167+
assertThat(termStates.docFreq(), equalTo(1));
168+
assertThat(termStates.totalTermFreq(), equalTo(1L));
169+
} else {
170+
assertThat(termStates.docFreq(), equalTo(0));
171+
assertThat(termStates.totalTermFreq(), equalTo(0L));
172+
}
173+
}
174+
assertThat(searcher.search(query, 10).totalHits.value, equalTo(1L));
175+
}
117176
reader.close();
118177
w.close();
119178
dir.close();

0 commit comments

Comments
 (0)