Skip to content

Commit f6d38d4

Browse files
authored
Integrate UnifiedHighlighter (#21621)
* Integrate UnifiedHighlighter. This change integrates the Lucene highlighter called "unified" into the list of supported highlighters for ES. This highlighter can extract offsets from either postings, term vectors, or via re-analyzing text. The best strategy is picked automatically at query time and depends on the field and the query to highlight.
1 parent f90051e commit f6d38d4

File tree

18 files changed

+1448
-435
lines changed

18 files changed

+1448
-435
lines changed

core/src/main/java/org/apache/lucene/search/postingshighlight/Snippet.java renamed to core/src/main/java/org/apache/lucene/search/highlight/Snippet.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
* under the License.
1818
*/
1919

20-
package org.apache.lucene.search.postingshighlight;
20+
package org.apache.lucene.search.highlight;
2121

2222
/**
2323
* Represents a scored highlighted snippet.

core/src/main/java/org/apache/lucene/search/postingshighlight/CustomPassageFormatter.java

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
package org.apache.lucene.search.postingshighlight;
2121

22+
import org.apache.lucene.search.highlight.Snippet;
2223
import org.apache.lucene.search.highlight.Encoder;
2324
import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
2425

@@ -46,10 +47,10 @@ public Snippet[] format(Passage[] passages, String content) {
4647
for (int j = 0; j < passages.length; j++) {
4748
Passage passage = passages[j];
4849
StringBuilder sb = new StringBuilder();
49-
pos = passage.startOffset;
50-
for (int i = 0; i < passage.numMatches; i++) {
51-
int start = passage.matchStarts[i];
52-
int end = passage.matchEnds[i];
50+
pos = passage.getStartOffset();
51+
for (int i = 0; i < passage.getNumMatches(); i++) {
52+
int start = passage.getMatchStarts()[i];
53+
int end = passage.getMatchEnds()[i];
5354
// its possible to have overlapping terms
5455
if (start > pos) {
5556
append(sb, content, pos, start);
@@ -62,15 +63,15 @@ public Snippet[] format(Passage[] passages, String content) {
6263
}
6364
}
6465
// its possible a "term" from the analyzer could span a sentence boundary.
65-
append(sb, content, pos, Math.max(pos, passage.endOffset));
66+
append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
6667
//we remove the paragraph separator if present at the end of the snippet (we used it as separator between values)
6768
if (sb.charAt(sb.length() - 1) == HighlightUtils.PARAGRAPH_SEPARATOR) {
6869
sb.deleteCharAt(sb.length() - 1);
6970
} else if (sb.charAt(sb.length() - 1) == HighlightUtils.NULL_SEPARATOR) {
7071
sb.deleteCharAt(sb.length() - 1);
7172
}
7273
//and we trim the snippets too
73-
snippets[j] = new Snippet(sb.toString().trim(), passage.score, passage.numMatches > 0);
74+
snippets[j] = new Snippet(sb.toString().trim(), passage.getScore(), passage.getNumMatches() > 0);
7475
}
7576
return snippets;
7677
}

core/src/main/java/org/apache/lucene/search/postingshighlight/CustomPostingsHighlighter.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.apache.lucene.analysis.Analyzer;
2323
import org.apache.lucene.search.IndexSearcher;
2424
import org.apache.lucene.search.Query;
25+
import org.apache.lucene.search.highlight.Snippet;
2526

2627
import java.io.IOException;
2728
import java.text.BreakIterator;
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.lucene.search.uhighlight;
21+
22+
import org.apache.lucene.search.highlight.Encoder;
23+
import org.apache.lucene.search.highlight.Snippet;
24+
import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils;
25+
26+
/**
27+
* Custom passage formatter that allows us to:
28+
* 1) extract different snippets (instead of a single big string) together with their scores ({@link Snippet})
29+
* 2) use the {@link Encoder} implementations that are already used with the other highlighters
30+
*/
31+
public class CustomPassageFormatter extends PassageFormatter {
32+
33+
private final String preTag;
34+
private final String postTag;
35+
private final Encoder encoder;
36+
37+
public CustomPassageFormatter(String preTag, String postTag, Encoder encoder) {
38+
this.preTag = preTag;
39+
this.postTag = postTag;
40+
this.encoder = encoder;
41+
}
42+
43+
@Override
44+
public Snippet[] format(Passage[] passages, String content) {
45+
Snippet[] snippets = new Snippet[passages.length];
46+
int pos;
47+
for (int j = 0; j < passages.length; j++) {
48+
Passage passage = passages[j];
49+
StringBuilder sb = new StringBuilder();
50+
pos = passage.getStartOffset();
51+
for (int i = 0; i < passage.getNumMatches(); i++) {
52+
int start = passage.getMatchStarts()[i];
53+
int end = passage.getMatchEnds()[i];
54+
// its possible to have overlapping terms
55+
if (start > pos) {
56+
append(sb, content, pos, start);
57+
}
58+
if (end > pos) {
59+
sb.append(preTag);
60+
append(sb, content, Math.max(pos, start), end);
61+
sb.append(postTag);
62+
pos = end;
63+
}
64+
}
65+
// its possible a "term" from the analyzer could span a sentence boundary.
66+
append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
67+
//we remove the paragraph separator if present at the end of the snippet (we used it as separator between values)
68+
if (sb.charAt(sb.length() - 1) == HighlightUtils.PARAGRAPH_SEPARATOR) {
69+
sb.deleteCharAt(sb.length() - 1);
70+
} else if (sb.charAt(sb.length() - 1) == HighlightUtils.NULL_SEPARATOR) {
71+
sb.deleteCharAt(sb.length() - 1);
72+
}
73+
//and we trim the snippets too
74+
snippets[j] = new Snippet(sb.toString().trim(), passage.getScore(), passage.getNumMatches() > 0);
75+
}
76+
return snippets;
77+
}
78+
79+
private void append(StringBuilder dest, String content, int start, int end) {
80+
dest.append(encoder.encodeText(content.substring(start, end)));
81+
}
82+
}
Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.lucene.search.uhighlight;
21+
22+
import org.apache.lucene.analysis.Analyzer;
23+
import org.apache.lucene.index.Term;
24+
import org.apache.lucene.queries.CommonTermsQuery;
25+
import org.apache.lucene.search.DocIdSetIterator;
26+
import org.apache.lucene.search.IndexSearcher;
27+
import org.apache.lucene.search.PrefixQuery;
28+
import org.apache.lucene.search.Query;
29+
import org.apache.lucene.search.TermQuery;
30+
import org.apache.lucene.search.highlight.Snippet;
31+
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
32+
import org.apache.lucene.search.spans.SpanNearQuery;
33+
import org.apache.lucene.search.spans.SpanOrQuery;
34+
import org.apache.lucene.search.spans.SpanQuery;
35+
import org.apache.lucene.search.spans.SpanTermQuery;
36+
import org.elasticsearch.common.Nullable;
37+
import org.elasticsearch.common.lucene.all.AllTermQuery;
38+
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
39+
import org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery;
40+
import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
41+
42+
import java.io.IOException;
43+
import java.text.BreakIterator;
44+
import java.util.ArrayList;
45+
import java.util.Collection;
46+
import java.util.Collections;
47+
import java.util.List;
48+
import java.util.Locale;
49+
import java.util.Map;
50+
51+
/**
52+
* Subclass of the {@link UnifiedHighlighter} that works for a single field in a single document.
53+
* Uses a custom {@link PassageFormatter}. Accepts field content as a constructor
54+
* argument, given that loadings field value can be done reading from _source field.
55+
* Supports using different {@link BreakIterator} to break the text into fragments. Considers every distinct field
56+
* value as a discrete passage for highlighting (unless the whole content needs to be highlighted).
57+
* Supports both returning empty snippets and non highlighted snippets when no highlighting can be performed.
58+
*/
59+
public class CustomUnifiedHighlighter extends UnifiedHighlighter {
60+
private static final Snippet[] EMPTY_SNIPPET = new Snippet[0];
61+
62+
private final String fieldValue;
63+
private final PassageFormatter passageFormatter;
64+
private final BreakIterator breakIterator;
65+
private final boolean returnNonHighlightedSnippets;
66+
67+
/**
68+
* Creates a new instance of {@link CustomUnifiedHighlighter}
69+
*
70+
* @param analyzer the analyzer used for the field at index time, used for multi term queries internally
71+
* @param passageFormatter our own {@link CustomPassageFormatter}
72+
* which generates snippets in forms of {@link Snippet} objects
73+
* @param breakIterator the {@link BreakIterator} to use for dividing text into passages.
74+
* If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
75+
* @param fieldValue the original field values as constructor argument, loaded from the _source field or
76+
* the relevant stored field.
77+
* @param returnNonHighlightedSnippets whether non highlighted snippets should be
78+
* returned rather than empty snippets when no highlighting can be performed
79+
*/
80+
public CustomUnifiedHighlighter(IndexSearcher searcher,
81+
Analyzer analyzer,
82+
PassageFormatter passageFormatter,
83+
@Nullable BreakIterator breakIterator,
84+
String fieldValue,
85+
boolean returnNonHighlightedSnippets) {
86+
super(searcher, analyzer);
87+
this.breakIterator = breakIterator;
88+
this.passageFormatter = passageFormatter;
89+
this.fieldValue = fieldValue;
90+
this.returnNonHighlightedSnippets = returnNonHighlightedSnippets;
91+
}
92+
93+
/**
94+
* Highlights terms extracted from the provided query within the content of the provided field name
95+
*/
96+
public Snippet[] highlightField(String field, Query query, int docId, int maxPassages) throws IOException {
97+
Map<String, Object[]> fieldsAsObjects = super.highlightFieldsAsObjects(new String[]{field}, query,
98+
new int[]{docId}, new int[]{maxPassages});
99+
Object[] snippetObjects = fieldsAsObjects.get(field);
100+
if (snippetObjects != null) {
101+
//one single document at a time
102+
assert snippetObjects.length == 1;
103+
Object snippetObject = snippetObjects[0];
104+
if (snippetObject != null && snippetObject instanceof Snippet[]) {
105+
return (Snippet[]) snippetObject;
106+
}
107+
}
108+
return EMPTY_SNIPPET;
109+
}
110+
111+
@Override
112+
protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
113+
int cacheCharsThreshold) throws IOException {
114+
//we only highlight one field, one document at a time
115+
return Collections.singletonList(new String[]{fieldValue});
116+
}
117+
118+
@Override
119+
protected BreakIterator getBreakIterator(String field) {
120+
if (breakIterator != null) {
121+
return breakIterator;
122+
}
123+
return super.getBreakIterator(field);
124+
}
125+
126+
@Override
127+
protected PassageFormatter getFormatter(String field) {
128+
return passageFormatter;
129+
}
130+
131+
@Override
132+
protected int getMaxNoHighlightPassages(String field) {
133+
if (returnNonHighlightedSnippets) {
134+
return 1;
135+
}
136+
return 0;
137+
}
138+
139+
@Override
140+
protected Collection<Query> preMultiTermQueryRewrite(Query query) {
141+
return rewriteCustomQuery(query);
142+
}
143+
144+
@Override
145+
protected Collection<Query> preSpanQueryRewrite(Query query) {
146+
return rewriteCustomQuery(query);
147+
}
148+
149+
150+
/**
151+
* Translate custom queries in queries that are supported by the unified highlighter.
152+
*/
153+
private Collection<Query> rewriteCustomQuery(Query query) {
154+
if (query instanceof MultiPhrasePrefixQuery) {
155+
MultiPhrasePrefixQuery mpq = (MultiPhrasePrefixQuery) query;
156+
Term[][] terms = mpq.getTerms();
157+
int[] positions = mpq.getPositions();
158+
SpanQuery[] positionSpanQueries = new SpanQuery[positions.length];
159+
int sizeMinus1 = terms.length - 1;
160+
for (int i = 0; i < positions.length; i++) {
161+
SpanQuery[] innerQueries = new SpanQuery[terms[i].length];
162+
for (int j = 0; j < terms[i].length; j++) {
163+
if (i == sizeMinus1) {
164+
innerQueries[j] = new SpanMultiTermQueryWrapper(new PrefixQuery(terms[i][j]));
165+
} else {
166+
innerQueries[j] = new SpanTermQuery(terms[i][j]);
167+
}
168+
}
169+
if (innerQueries.length > 1) {
170+
positionSpanQueries[i] = new SpanOrQuery(innerQueries);
171+
} else {
172+
positionSpanQueries[i] = innerQueries[0];
173+
}
174+
}
175+
// sum position increments beyond 1
176+
int positionGaps = 0;
177+
if (positions.length >= 2) {
178+
// positions are in increasing order. max(0,...) is just a safeguard.
179+
positionGaps = Math.max(0, positions[positions.length - 1] - positions[0] - positions.length + 1);
180+
}
181+
182+
//if original slop is 0 then require inOrder
183+
boolean inorder = (mpq.getSlop() == 0);
184+
return Collections.singletonList(new SpanNearQuery(positionSpanQueries,
185+
mpq.getSlop() + positionGaps, inorder));
186+
} else if (query instanceof CommonTermsQuery) {
187+
CommonTermsQuery ctq = (CommonTermsQuery) query;
188+
List<Query> tqs = new ArrayList<> ();
189+
for (Term term : ctq.getTerms()) {
190+
tqs.add(new TermQuery(term));
191+
}
192+
return tqs;
193+
} else if (query instanceof AllTermQuery) {
194+
AllTermQuery atq = (AllTermQuery) query;
195+
return Collections.singletonList(new TermQuery(atq.getTerm()));
196+
} else if (query instanceof FunctionScoreQuery) {
197+
return Collections.singletonList(((FunctionScoreQuery) query).getSubQuery());
198+
} else if (query instanceof FiltersFunctionScoreQuery) {
199+
return Collections.singletonList(((FiltersFunctionScoreQuery) query).getSubQuery());
200+
} else {
201+
return null;
202+
}
203+
}
204+
}

core/src/main/java/org/elasticsearch/common/lucene/all/AllTermQuery.java

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232
import org.apache.lucene.search.DocIdSetIterator;
3333
import org.apache.lucene.search.Explanation;
3434
import org.apache.lucene.search.IndexSearcher;
35-
import org.apache.lucene.search.MatchNoDocsQuery;
3635
import org.apache.lucene.search.Query;
3736
import org.apache.lucene.search.Scorer;
3837
import org.apache.lucene.search.TermQuery;
@@ -87,21 +86,18 @@ public Query rewrite(IndexReader reader) throws IOException {
8786
if (rewritten != this) {
8887
return rewritten;
8988
}
90-
boolean fieldExists = false;
9189
boolean hasPayloads = false;
9290
for (LeafReaderContext context : reader.leaves()) {
9391
final Terms terms = context.reader().terms(term.field());
9492
if (terms != null) {
95-
fieldExists = true;
9693
if (terms.hasPayloads()) {
9794
hasPayloads = true;
9895
break;
9996
}
10097
}
10198
}
102-
if (fieldExists == false) {
103-
return new MatchNoDocsQuery();
104-
}
99+
// if the term does not exist we could return a MatchNoDocsQuery but this would break the unified highlighter
100+
// which rewrites queries with an empty reader.
105101
if (hasPayloads == false) {
106102
return new TermQuery(term);
107103
}

0 commit comments

Comments
 (0)