Skip to content

Commit 9f7e65c

Browse files
Limit the analyzed text for highlighting (#27934)
- Introduce an index-level setting to control the max number of characters to be analyzed for highlighting - Create a deprecation warning if analysis is required on a larger text Closes #27517
1 parent ed7021b commit 9f7e65c

File tree

10 files changed

+158
-4
lines changed

10 files changed

+158
-4
lines changed

core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,12 @@
3535
import org.apache.lucene.util.BytesRef;
3636
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
3737
import org.elasticsearch.common.Nullable;
38+
import org.elasticsearch.common.logging.DeprecationLogger;
39+
import org.elasticsearch.common.logging.Loggers;
3840
import org.elasticsearch.common.lucene.all.AllTermQuery;
3941
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
4042
import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
43+
import org.elasticsearch.index.IndexSettings;
4144
import org.elasticsearch.index.search.ESToParentBlockJoinQuery;
4245

4346
import java.io.IOException;
@@ -68,6 +71,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
6871
private final BreakIterator breakIterator;
6972
private final Locale breakIteratorLocale;
7073
private final int noMatchSize;
74+
private final int maxAnalyzedOffset;
7175

7276
/**
7377
* Creates a new instance of {@link CustomUnifiedHighlighter}
@@ -82,6 +86,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
8286
* If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
8387
* @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR.
8488
* @param noMatchSize The size of the text that should be returned when no highlighting can be performed.
89+
* @param maxAnalyzedOffset The maximum number of characters that will be analyzed for highlighting.
8590
*/
8691
public CustomUnifiedHighlighter(IndexSearcher searcher,
8792
Analyzer analyzer,
@@ -90,14 +95,16 @@ public CustomUnifiedHighlighter(IndexSearcher searcher,
9095
@Nullable Locale breakIteratorLocale,
9196
@Nullable BreakIterator breakIterator,
9297
String fieldValue,
93-
int noMatchSize) {
98+
int noMatchSize,
99+
int maxAnalyzedOffset) {
94100
super(searcher, analyzer);
95101
this.offsetSource = offsetSource;
96102
this.breakIterator = breakIterator;
97103
this.breakIteratorLocale = breakIteratorLocale == null ? Locale.ROOT : breakIteratorLocale;
98104
this.passageFormatter = passageFormatter;
99105
this.fieldValue = fieldValue;
100106
this.noMatchSize = noMatchSize;
107+
this.maxAnalyzedOffset = maxAnalyzedOffset;
101108
}
102109

103110
/**
@@ -121,6 +128,14 @@ public Snippet[] highlightField(String field, Query query, int docId, int maxPas
121128
@Override
122129
protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
123130
int cacheCharsThreshold) throws IOException {
131+
if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValue.length() > maxAnalyzedOffset)) {
132+
DeprecationLogger deprecationLogger = new DeprecationLogger(Loggers.getLogger(CustomUnifiedHighlighter.class));
133+
deprecationLogger.deprecated(
134+
"Deprecated large text to be analyzed for highlighting! The length has exceeded the allowed maximum of [" +
135+
maxAnalyzedOffset + "]. " + "This maximum can be set by changing the [" +
136+
IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " +
137+
"For large texts, indexing with offsets or term vectors is recommended!");
138+
}
124139
// we only highlight one field, one document at a time
125140
return Collections.singletonList(new String[]{fieldValue});
126141
}

core/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
118118
IndexSettings.MAX_SHINGLE_DIFF_SETTING,
119119
IndexSettings.MAX_RESCORE_WINDOW_SETTING,
120120
IndexSettings.MAX_ADJACENCY_MATRIX_FILTERS_SETTING,
121+
IndexSettings.MAX_ANALYZED_OFFSET_SETTING,
121122
IndexSettings.INDEX_TRANSLOG_SYNC_INTERVAL_SETTING,
122123
IndexSettings.DEFAULT_FIELD_SETTING,
123124
IndexSettings.QUERY_STRING_LENIENT_SETTING,

core/src/main/java/org/elasticsearch/index/IndexSettings.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,17 @@ public final class IndexSettings {
121121
public static final Setting<Integer> MAX_INNER_RESULT_WINDOW_SETTING =
122122
Setting.intSetting("index.max_inner_result_window", 100, 1, Property.Dynamic, Property.IndexScope);
123123

124+
125+
/**
126+
* A setting describing the maximum number of characters that will be analyzed for a highlight request.
127+
* This setting is only applicable when highlighting is requested on a text that was indexed without
128+
* offsets or term vectors.
129+
* The default maximum of 10000 characters is defensive as for highlighting larger texts,
130+
* indexing with offsets or term vectors is recommended.
131+
*/
132+
public static final Setting<Integer> MAX_ANALYZED_OFFSET_SETTING =
133+
Setting.intSetting("index.highlight.max_analyzed_offset", 10000, 1, Property.Dynamic, Property.IndexScope);
134+
124135
/**
125136
* Index setting describing for NGramTokenizer and NGramTokenFilter
126137
* the maximum difference between
@@ -276,6 +287,8 @@ public final class IndexSettings {
276287
private volatile int maxNgramDiff;
277288
private volatile int maxShingleDiff;
278289
private volatile boolean TTLPurgeDisabled;
290+
private volatile int maxAnalyzedOffset;
291+
279292
/**
280293
* The maximum number of refresh listeners allowed on this shard.
281294
*/
@@ -383,6 +396,7 @@ public IndexSettings(final IndexMetaData indexMetaData, final Settings nodeSetti
383396
TTLPurgeDisabled = scopedSettings.get(INDEX_TTL_DISABLE_PURGE_SETTING);
384397
maxRefreshListeners = scopedSettings.get(MAX_REFRESH_LISTENERS_PER_SHARD);
385398
maxSlicesPerScroll = scopedSettings.get(MAX_SLICES_PER_SCROLL);
399+
maxAnalyzedOffset = scopedSettings.get(MAX_ANALYZED_OFFSET_SETTING);
386400
this.mergePolicyConfig = new MergePolicyConfig(logger, this);
387401
this.indexSortConfig = new IndexSortConfig(this);
388402
singleType = INDEX_MAPPING_SINGLE_TYPE_SETTING.get(indexMetaData.getSettings()); // get this from metadata - it's not registered
@@ -423,6 +437,7 @@ public IndexSettings(final IndexMetaData indexMetaData, final Settings nodeSetti
423437
scopedSettings.addSettingsUpdateConsumer(INDEX_TRANSLOG_RETENTION_SIZE_SETTING, this::setTranslogRetentionSize);
424438
scopedSettings.addSettingsUpdateConsumer(INDEX_REFRESH_INTERVAL_SETTING, this::setRefreshInterval);
425439
scopedSettings.addSettingsUpdateConsumer(MAX_REFRESH_LISTENERS_PER_SHARD, this::setMaxRefreshListeners);
440+
scopedSettings.addSettingsUpdateConsumer(MAX_ANALYZED_OFFSET_SETTING, this::setHighlightMaxAnalyzedOffset);
426441
scopedSettings.addSettingsUpdateConsumer(MAX_SLICES_PER_SCROLL, this::setMaxSlicesPerScroll);
427442
scopedSettings.addSettingsUpdateConsumer(DEFAULT_FIELD_SETTING, this::setDefaultFields);
428443
}
@@ -695,6 +710,13 @@ private void setMaxDocvalueFields(int maxDocvalueFields) {
695710

696711
private void setMaxShingleDiff(int maxShingleDiff) { this.maxShingleDiff = maxShingleDiff; }
697712

713+
/**
714+
* Returns the maximum number of chars that will be analyzed in a highlight request
715+
*/
716+
public int getHighlightMaxAnalyzedOffset() { return this.maxAnalyzedOffset; }
717+
718+
private void setHighlightMaxAnalyzedOffset(int maxAnalyzedOffset) { this.maxAnalyzedOffset = maxAnalyzedOffset; }
719+
698720
/**
699721
* Returns the maximum number of allowed script_fields to retrieve in a search request
700722
*/

core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,10 @@
3434
import org.apache.lucene.util.BytesRefHash;
3535
import org.apache.lucene.util.CollectionUtil;
3636
import org.elasticsearch.ExceptionsHelper;
37+
import org.elasticsearch.common.logging.DeprecationLogger;
38+
import org.elasticsearch.common.logging.Loggers;
3739
import org.elasticsearch.common.text.Text;
40+
import org.elasticsearch.index.IndexSettings;
3841
import org.elasticsearch.index.mapper.FieldMapper;
3942
import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
4043
import org.elasticsearch.search.fetch.FetchSubPhase;
@@ -103,11 +106,22 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
103106
ArrayList<TextFragment> fragsList = new ArrayList<>();
104107
List<Object> textsToHighlight;
105108
Analyzer analyzer = getAnalyzer(context.mapperService().documentMapper(hitContext.hit().getType()), mapper.fieldType());
109+
final int maxAnalyzedOffset = context.indexShard().indexSettings().getHighlightMaxAnalyzedOffset();
110+
106111
try {
107112
textsToHighlight = HighlightUtils.loadFieldValues(field, mapper, context, hitContext);
108113

109114
for (Object textToHighlight : textsToHighlight) {
110115
String text = convertFieldValue(mapper.fieldType(), textToHighlight);
116+
if (text.length() > maxAnalyzedOffset) {
117+
DeprecationLogger deprecationLogger = new DeprecationLogger(Loggers.getLogger(PlainHighlighter.class));
118+
deprecationLogger.deprecated(
119+
"Deprecated large text to be analyzed for highlighting! The length has exceeded the allowed maximum of [" +
120+
maxAnalyzedOffset + "]. " + "This maximum can be set by changing the [" +
121+
IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " +
122+
"For large texts, indexing with offsets or term vectors, and highlighting with unified or " +
123+
"fvh highlighter is recommended!");
124+
}
111125

112126
try (TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().name(), text)) {
113127
if (!tokenStream.hasAttribute(CharTermAttribute.class) || !tokenStream.hasAttribute(OffsetAttribute.class)) {

core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
6767
Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
6868
CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0],
6969
field.fieldOptions().postTags()[0], encoder);
70+
final int maxAnalyzedOffset = context.indexShard().indexSettings().getHighlightMaxAnalyzedOffset();
7071

7172
List<Snippet> snippets = new ArrayList<>();
7273
int numberOfFragments;
@@ -88,14 +89,15 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
8889
// get back a snippet per value
8990
CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
9091
highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
91-
field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize());
92+
field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize(),
93+
maxAnalyzedOffset);
9294
numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value
9395
} else {
9496
//using paragraph separator we make sure that each field value holds a discrete passage for highlighting
9597
BreakIterator bi = getBreakIterator(field);
9698
highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
9799
field.fieldOptions().boundaryScannerLocale(), bi,
98-
fieldValue, field.fieldOptions().noMatchSize());
100+
fieldValue, field.fieldOptions().noMatchSize(), maxAnalyzedOffset);
99101
numberOfFragments = field.fieldOptions().numberOfFragments();
100102
}
101103

core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ private void assertHighlightOneDoc(String fieldName, String[] inputs, Analyzer a
7979
String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR));
8080
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, null,
8181
new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), locale,
82-
breakIterator, rawValue, noMatchSize);
82+
breakIterator, rawValue, noMatchSize, 10000);
8383
highlighter.setFieldMatcher((name) -> "text".equals(name));
8484
final Snippet[] snippets =
8585
highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);

docs/reference/index-modules.asciidoc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,11 @@ specific index module:
182182
Maximum number of refresh listeners available on each shard of the index.
183183
These listeners are used to implement <<docs-refresh,`refresh=wait_for`>>.
184184

185+
`index.highlight.max_analyzed_offset`::
186+
187+
The maximum number of characters that will be analyzed for a highlight request.
188+
This setting is only applicable when highlighting is requested on a text that was indexed without offsets or term vectors.
189+
Defaults to `10000`.
185190

186191
[float]
187192
=== Settings in other index modules

docs/reference/migration/migrate_6_0/analysis.asciidoc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,11 @@ and will be ignored when used in new indices. These parameters
1111
will continue to function as before when used in indices
1212
created in 5.x.
1313

14+
==== Limiting the length of an analyzed text during highlighting
15+
16+
Highlighting a text that was indexed without offsets or term vectors
17+
requires analysis of this text in memory, in real time, during the search request.
18+
For large texts, this analysis may take a substantial amount of time and memory.
19+
To protect against this, the maximum number of characters that will be analyzed has been
20+
limited to 10000. This default limit can be changed
21+
for a particular index with the index setting `index.highlight.max_analyzed_offset`.

docs/reference/search/request/highlighting.asciidoc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,12 @@ Lucene's query execution planner to get access to low-level match information on
104104
the current document. This is repeated for every field and every document that
105105
needs highlighting. The `plain` highlighter always uses plain highlighting.
106106

107+
[WARNING]
108+
Plain highlighting for large texts may require a substantial amount of time and memory.
109+
To protect against this, the maximum number of text characters that will be analyzed has been
110+
limited to 10000. This default limit can be changed
111+
for a particular index with the index setting `index.highlight.max_analyzed_offset`.
112+
107113
[[highlighting-settings]]
108114
==== Highlighting Settings
109115

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
---
2+
setup:
3+
- do:
4+
indices.create:
5+
index: test1
6+
body:
7+
settings:
8+
number_of_shards: 1
9+
index.highlight.max_analyzed_offset: 10
10+
mappings:
11+
test_type:
12+
properties:
13+
field1:
14+
type: text
15+
field2:
16+
type: text
17+
index_options: offsets
18+
19+
- do:
20+
index:
21+
index: test1
22+
type: test_type
23+
id: 1
24+
body:
25+
"field1" : "The quick brown fox went to the forest and saw another fox."
26+
"field2" : "The quick brown fox went to the forest and saw another fox."
27+
28+
- do:
29+
indices.refresh: {}
30+
31+
---
32+
"Unified highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
33+
- skip:
34+
version: " - 6.1.99"
35+
reason: index.highlight.max_analyzed_offset setting has been added in 6.2
36+
features: "warnings"
37+
- do:
38+
search:
39+
index: test1
40+
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
41+
warnings:
42+
- Deprecated large text to be analyzed for highlighting! The length has exceeded the allowed maximum of [10]. This maximum can be set by changing the [index.highlight.max_analyzed_offset] index level setting. For large texts, indexing with offsets or term vectors is recommended!
43+
44+
45+
---
46+
"Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
47+
- skip:
48+
version: " - 6.1.99"
49+
reason: index.highlight.max_analyzed_offset setting has been added in 6.2
50+
features: "warnings"
51+
- do:
52+
search:
53+
index: test1
54+
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field1" : {}}}}
55+
warnings:
56+
- Deprecated large text to be analyzed for highlighting! The length has exceeded the allowed maximum of [10]. This maximum can be set by changing the [index.highlight.max_analyzed_offset] index level setting. For large texts, indexing with offsets or term vectors, and highlighting with unified or fvh highlighter is recommended!
57+
58+
---
59+
"Unified highlighter on a field WITH OFFSETS exceeding index.highlight.max_analyzed_offset should SUCCEED":
60+
- skip:
61+
version: " - 6.1.99"
62+
reason: index.highlight.max_analyzed_offset setting has been added in 6.2
63+
- do:
64+
search:
65+
index: test1
66+
body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field2" : {}}}}
67+
- match: {hits.hits.0.highlight.field2.0: "The quick brown <em>fox</em> went to the forest and saw another <em>fox</em>."}
68+
69+
70+
---
71+
"Plain highlighter on a field WITH OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
72+
- skip:
73+
version: " - 6.1.99"
74+
reason: index.highlight.max_analyzed_offset setting has been added in 6.2
75+
features: "warnings"
76+
- do:
77+
search:
78+
index: test1
79+
body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field2" : {}}}}
80+
warnings:
81+
- Deprecated large text to be analyzed for highlighting! The length has exceeded the allowed maximum of [10]. This maximum can be set by changing the [index.highlight.max_analyzed_offset] index level setting. For large texts, indexing with offsets or term vectors, and highlighting with unified or fvh highlighter is recommended!

0 commit comments

Comments
 (0)