Limit the analyzed text for highlighting (#27934)

mayya-sharipova · mayya-sharipova · commit 9f7e65c0c1ad · 2017-12-27T16:22:04.000-05:00
- Introduce index level settings to control the max number of characters to be analyzed for highlighting - Create a deprecation warning if analysis is required on a larger text Closes #27517
diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java
@@ -35,9 +35,12 @@
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.elasticsearch.common.Nullable;
+import org.elasticsearch.common.logging.DeprecationLogger;
+import org.elasticsearch.common.logging.Loggers;
 import org.elasticsearch.common.lucene.all.AllTermQuery;
 import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
 import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
+import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.search.ESToParentBlockJoinQuery;
 
 import java.io.IOException;
@@ -68,6 +71,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
     private final BreakIterator breakIterator;
     private final Locale breakIteratorLocale;
     private final int noMatchSize;
+    private final int maxAnalyzedOffset;
 
     /**
      * Creates a new instance of {@link CustomUnifiedHighlighter}
@@ -82,6 +86,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
      *                    If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
      * @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR.
      * @param noMatchSize The size of the text that should be returned when no highlighting can be performed.
+     * @param maxAnalyzedOffset The maximum number of characters that will be analyzed for highlighting.
      */
     public CustomUnifiedHighlighter(IndexSearcher searcher,
                                     Analyzer analyzer,
@@ -90,14 +95,16 @@ public CustomUnifiedHighlighter(IndexSearcher searcher,
                                     @Nullable Locale breakIteratorLocale,
                                     @Nullable BreakIterator breakIterator,
                                     String fieldValue,
-                                    int noMatchSize) {
+                                    int noMatchSize,
+                                    int maxAnalyzedOffset) {
         super(searcher, analyzer);
         this.offsetSource = offsetSource;
         this.breakIterator = breakIterator;
         this.breakIteratorLocale = breakIteratorLocale == null ? Locale.ROOT : breakIteratorLocale;
         this.passageFormatter = passageFormatter;
         this.fieldValue = fieldValue;
         this.noMatchSize = noMatchSize;
+        this.maxAnalyzedOffset = maxAnalyzedOffset;
     }
 
     /**
@@ -121,6 +128,14 @@ public Snippet[] highlightField(String field, Query query, int docId, int maxPas
     @Override
     protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
                                                    int cacheCharsThreshold) throws IOException {
+        if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValue.length() > maxAnalyzedOffset)) {
+            DeprecationLogger deprecationLogger = new DeprecationLogger(Loggers.getLogger(CustomUnifiedHighlighter.class));
+            deprecationLogger.deprecated(
+                "Deprecated large text to be analyzed for highlighting! The length has exceeded the allowed maximum of [" +
+                    maxAnalyzedOffset + "]. " + "This maximum can be set by changing the [" +
+                    IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " +
+                    "For large texts, indexing with offsets or term vectors is recommended!");
+        }
         // we only highlight one field, one document at a time
         return Collections.singletonList(new String[]{fieldValue});
     }
diff --git a/core/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java b/core/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java
@@ -118,6 +118,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
         IndexSettings.MAX_SHINGLE_DIFF_SETTING,
         IndexSettings.MAX_RESCORE_WINDOW_SETTING,
         IndexSettings.MAX_ADJACENCY_MATRIX_FILTERS_SETTING,
+        IndexSettings.MAX_ANALYZED_OFFSET_SETTING,
         IndexSettings.INDEX_TRANSLOG_SYNC_INTERVAL_SETTING,
         IndexSettings.DEFAULT_FIELD_SETTING,
         IndexSettings.QUERY_STRING_LENIENT_SETTING,
diff --git a/core/src/main/java/org/elasticsearch/index/IndexSettings.java b/core/src/main/java/org/elasticsearch/index/IndexSettings.java
@@ -121,6 +121,17 @@ public final class IndexSettings {
     public static final Setting<Integer> MAX_INNER_RESULT_WINDOW_SETTING =
         Setting.intSetting("index.max_inner_result_window", 100, 1, Property.Dynamic, Property.IndexScope);
 
+
+    /**
+     * A setting describing the maximum number of characters that will be analyzed for a highlight request.
+     * This setting is only applicable when highlighting is requested on a text that was indexed without
+     * offsets or term vectors.
+     * The default maximum of 10000 characters is deliberately conservative: for highlighting larger texts,
+     * indexing with offsets or term vectors is recommended instead.
+     */
+    public static final Setting<Integer> MAX_ANALYZED_OFFSET_SETTING =
+        Setting.intSetting("index.highlight.max_analyzed_offset", 10000, 1, Property.Dynamic, Property.IndexScope);
+
     /**
      * Index setting describing for NGramTokenizer and NGramTokenFilter
      * the maximum difference between
@@ -276,6 +287,8 @@ public final class IndexSettings {
     private volatile int maxNgramDiff;
     private volatile int maxShingleDiff;
     private volatile boolean TTLPurgeDisabled;
+    private volatile int maxAnalyzedOffset;
+
     /**
      * The maximum number of refresh listeners allows on this shard.
      */
@@ -383,6 +396,7 @@ public IndexSettings(final IndexMetaData indexMetaData, final Settings nodeSetti
         TTLPurgeDisabled = scopedSettings.get(INDEX_TTL_DISABLE_PURGE_SETTING);
         maxRefreshListeners = scopedSettings.get(MAX_REFRESH_LISTENERS_PER_SHARD);
         maxSlicesPerScroll = scopedSettings.get(MAX_SLICES_PER_SCROLL);
+        maxAnalyzedOffset = scopedSettings.get(MAX_ANALYZED_OFFSET_SETTING);
         this.mergePolicyConfig = new MergePolicyConfig(logger, this);
         this.indexSortConfig = new IndexSortConfig(this);
         singleType = INDEX_MAPPING_SINGLE_TYPE_SETTING.get(indexMetaData.getSettings()); // get this from metadata - it's not registered
@@ -423,6 +437,7 @@ public IndexSettings(final IndexMetaData indexMetaData, final Settings nodeSetti
         scopedSettings.addSettingsUpdateConsumer(INDEX_TRANSLOG_RETENTION_SIZE_SETTING, this::setTranslogRetentionSize);
         scopedSettings.addSettingsUpdateConsumer(INDEX_REFRESH_INTERVAL_SETTING, this::setRefreshInterval);
         scopedSettings.addSettingsUpdateConsumer(MAX_REFRESH_LISTENERS_PER_SHARD, this::setMaxRefreshListeners);
+        scopedSettings.addSettingsUpdateConsumer(MAX_ANALYZED_OFFSET_SETTING, this::setHighlightMaxAnalyzedOffset);
         scopedSettings.addSettingsUpdateConsumer(MAX_SLICES_PER_SCROLL, this::setMaxSlicesPerScroll);
         scopedSettings.addSettingsUpdateConsumer(DEFAULT_FIELD_SETTING, this::setDefaultFields);
     }
@@ -695,6 +710,13 @@ private void setMaxDocvalueFields(int maxDocvalueFields) {
 
     private void setMaxShingleDiff(int maxShingleDiff) { this.maxShingleDiff = maxShingleDiff; }
 
+    /**
+     * Returns the maximum number of characters that will be analyzed in a highlight request.
+     */
+    public int getHighlightMaxAnalyzedOffset() { return this.maxAnalyzedOffset; }
+
+    private void setHighlightMaxAnalyzedOffset(int maxAnalyzedOffset) { this.maxAnalyzedOffset = maxAnalyzedOffset; }
+
     /**
      * Returns the maximum number of allowed script_fields to retrieve in a search request
      */
diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java
@@ -34,7 +34,10 @@
 import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.CollectionUtil;
 import org.elasticsearch.ExceptionsHelper;
+import org.elasticsearch.common.logging.DeprecationLogger;
+import org.elasticsearch.common.logging.Loggers;
 import org.elasticsearch.common.text.Text;
+import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
 import org.elasticsearch.search.fetch.FetchSubPhase;
@@ -103,11 +106,22 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
         ArrayList<TextFragment> fragsList = new ArrayList<>();
         List<Object> textsToHighlight;
         Analyzer analyzer = getAnalyzer(context.mapperService().documentMapper(hitContext.hit().getType()), mapper.fieldType());
+        final int maxAnalyzedOffset = context.indexShard().indexSettings().getHighlightMaxAnalyzedOffset();
+
         try {
             textsToHighlight = HighlightUtils.loadFieldValues(field, mapper, context, hitContext);
 
             for (Object textToHighlight : textsToHighlight) {
                 String text = convertFieldValue(mapper.fieldType(), textToHighlight);
+                if (text.length() > maxAnalyzedOffset) {
+                    DeprecationLogger deprecationLogger = new DeprecationLogger(Loggers.getLogger(PlainHighlighter.class));
+                    deprecationLogger.deprecated(
+                        "Deprecated large text to be analyzed for highlighting! The length has exceeded the allowed maximum of [" +
+                            maxAnalyzedOffset + "]. " + "This maximum can be set by changing the [" +
+                            IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " +
+                            "For large texts, indexing with offsets or term vectors, and highlighting with unified or " +
+                            "fvh highlighter is recommended!");
+                }
 
                 try (TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().name(), text)) {
                     if (!tokenStream.hasAttribute(CharTermAttribute.class) || !tokenStream.hasAttribute(OffsetAttribute.class)) {
diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java
@@ -67,6 +67,7 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
         Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
         CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0],
             field.fieldOptions().postTags()[0], encoder);
+        final int maxAnalyzedOffset = context.indexShard().indexSettings().getHighlightMaxAnalyzedOffset();
 
         List<Snippet> snippets = new ArrayList<>();
         int numberOfFragments;
@@ -88,14 +89,15 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
                 // get back a snippet per value
                 CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
                 highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
-                    field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize());
+                    field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize(),
+                    maxAnalyzedOffset);
                 numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value
             } else {
                 //using paragraph separator we make sure that each field value holds a discrete passage for highlighting
                 BreakIterator bi = getBreakIterator(field);
                 highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
                     field.fieldOptions().boundaryScannerLocale(), bi,
-                    fieldValue, field.fieldOptions().noMatchSize());
+                    fieldValue, field.fieldOptions().noMatchSize(), maxAnalyzedOffset);
                 numberOfFragments = field.fieldOptions().numberOfFragments();
             }
 
diff --git a/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java
@@ -79,7 +79,7 @@ private void assertHighlightOneDoc(String fieldName, String[] inputs, Analyzer a
         String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR));
         CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, null,
                 new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), locale,
-                breakIterator, rawValue, noMatchSize);
+                breakIterator, rawValue, noMatchSize, 10000);
         highlighter.setFieldMatcher((name) -> "text".equals(name));
         final Snippet[] snippets =
             highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);
diff --git a/docs/reference/index-modules.asciidoc b/docs/reference/index-modules.asciidoc
@@ -182,6 +182,11 @@ specific index module:
     Maximum number of refresh listeners available on each shard of the index.
     These listeners are used to implement <<docs-refresh,`refresh=wait_for`>>.
 
+ `index.highlight.max_analyzed_offset`::
+
+     The maximum number of characters that will be analyzed for a highlight request.
+     This setting is only applicable when highlighting is requested on a text that was indexed without offsets or term vectors.
+     Defaults to `10000`.
 
 [float]
 === Settings in other index modules
diff --git a/docs/reference/migration/migrate_6_0/analysis.asciidoc b/docs/reference/migration/migrate_6_0/analysis.asciidoc
@@ -11,3 +11,11 @@ and will be ignored when used in new indices.  These parameters
 will continue to function as before when used in indices
 created in 5.x.
 
+==== Limiting the length of an analyzed text during highlighting
+
+Highlighting a text that was indexed without offsets or term vectors
+requires analysis of this text in memory, in real time, during the search request.
+For large texts this analysis may take a substantial amount of time and memory.
+To protect against this, the maximum number of characters that will be analyzed has been
+limited to 10000. This default limit can be changed
+for a particular index with the index setting `index.highlight.max_analyzed_offset`.
diff --git a/docs/reference/search/request/highlighting.asciidoc b/docs/reference/search/request/highlighting.asciidoc
@@ -104,6 +104,12 @@ Lucene's query execution planner to get access to low-level match information on
 the current document. This is repeated for every field and every document that
 needs highlighting. The `plain` highlighter always uses plain highlighting.
 
+[WARNING]
+Plain highlighting for large texts may require a substantial amount of time and memory.
+To protect against this, the maximum number of text characters that will be analyzed has been
+limited to 10000. This default limit can be changed
+for a particular index with the index setting `index.highlight.max_analyzed_offset`.
+
 [[highlighting-settings]]
 ==== Highlighting Settings
 
diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/30_max_analyzed_offset.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search.highlight/30_max_analyzed_offset.yml
@@ -0,0 +1,81 @@
+---
+setup:
+  - do:
+      indices.create:
+          index: test1
+          body:
+              settings:
+                  number_of_shards: 1
+                  index.highlight.max_analyzed_offset: 10
+              mappings:
+                  test_type:
+                      properties:
+                          field1:
+                              type: text
+                          field2:
+                              type: text
+                              index_options: offsets
+
+  - do:
+      index:
+          index: test1
+          type:  test_type
+          id:    1
+          body:
+              "field1" : "The quick brown fox went to the forest and saw another fox."
+              "field2" : "The quick brown fox went to the forest and saw another fox."
+
+  - do:
+      indices.refresh: {}
+
+---
+"Unified highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
+  - skip:
+      version: " - 6.1.99"
+      reason: index.highlight.max_analyzed_offset setting has been added in 6.2
+      features: "warnings"
+  - do:
+      search:
+          index: test1
+          body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
+      warnings:
+          - Deprecated large text to be analyzed for highlighting! The length has exceeded the allowed maximum of [10]. This maximum can be set by changing the [index.highlight.max_analyzed_offset] index level setting. For large texts, indexing with offsets or term vectors is recommended!
+
+
+---
+"Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
+  - skip:
+      version: " - 6.1.99"
+      reason: index.highlight.max_analyzed_offset setting has been added in 6.2
+      features: "warnings"
+  - do:
+      search:
+          index: test1
+          body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field1" : {}}}}
+      warnings:
+          - Deprecated large text to be analyzed for highlighting! The length has exceeded the allowed maximum of [10]. This maximum can be set by changing the [index.highlight.max_analyzed_offset] index level setting. For large texts, indexing with offsets or term vectors, and highlighting with unified or fvh highlighter is recommended!
+
+---
+"Unified highlighter on a field WITH OFFSETS exceeding index.highlight.max_analyzed_offset should SUCCEED":
+  - skip:
+      version: " - 6.1.99"
+      reason: index.highlight.max_analyzed_offset setting has been added in 6.2
+  - do:
+      search:
+          index: test1
+          body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field2" : {}}}}
+  - match: {hits.hits.0.highlight.field2.0: "The quick brown <em>fox</em> went to the forest and saw another <em>fox</em>."}
+
+
+---
+"Plain highlighter on a field WITH OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
+  - skip:
+      version: " - 6.1.99"
+      reason: index.highlight.max_analyzed_offset setting has been added in 6.2
+      features: "warnings"
+  - do:
+      search:
+          index: test1
+          body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field2" : {}}}}
+      warnings:
+          - Deprecated large text to be analyzed for highlighting! The length has exceeded the allowed maximum of [10]. This maximum can be set by changing the [index.highlight.max_analyzed_offset] index level setting. For large texts, indexing with offsets or term vectors, and highlighting with unified or fvh highlighter is recommended!