
Commit af57575

Allow word_delimiter_graph_filter to not adjust internal offsets (#36699)
This commit adds an `adjust_offsets` parameter to the `word_delimiter_graph` token filter, defaulting to `true`. Most of the time you want the sub-tokens emitted by this filter to have offsets adjusted to their real positions in the token stream; however, some token filters (e.g. `trim`) can change the length or starting position of a token without changing its offset attributes, and this can lead to `word_delimiter_graph` emitting illegal offsets. Setting `adjust_offsets` to `false` in these cases allows indexing to succeed again.

Fixes #34741, #33710
1 parent 0ff1f1f commit af57575
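
To make the new behaviour concrete, here is a minimal, self-contained Lucene sketch (not part of this commit) that runs the filter over "PowerShot" with both settings. The flag combination mirrors the unit test added below, the five-argument WordDelimiterGraphFilter constructor is the one the factory change calls, and the offsets in the comments follow the test's assertions.

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AdjustOffsetsDemo {
    public static void main(String[] args) throws Exception {
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                | WordDelimiterGraphFilter.CATENATE_WORDS;
        for (boolean adjust : new boolean[] { true, false }) {
            Tokenizer tokenizer = new WhitespaceTokenizer();
            tokenizer.setReader(new StringReader("PowerShot"));
            // The second argument is the adjustInternalOffsets flag that the
            // new adjust_offsets setting maps onto.
            TokenStream ts = new WordDelimiterGraphFilter(tokenizer, adjust,
                    WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // adjust=true:  PowerShot [0,9], Power [0,5], Shot [5,9]
                // adjust=false: PowerShot [0,9], Power [0,9], Shot [0,9]
                System.out.printf("adjust=%b: %s [%d,%d]%n", adjust,
                        term, offset.startOffset(), offset.endOffset());
            }
            ts.end();
            ts.close();
        }
    }
}

With adjustment disabled, every sub-token simply inherits the offsets of the token it was split from, which is always legal even if upstream filters have rewritten the term text.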


4 files changed (+58, −2)


docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc

Lines changed: 9 additions & 0 deletions
@@ -71,6 +71,15 @@ Advance settings include:
     to a file configured with protected words (one on each line).
     Automatically resolves to `config/` based location if exists.

+`adjust_offsets`::
+    By default, the filter tries to output subtokens with adjusted offsets
+    to reflect their actual position in the token stream. However, when
+    used in combination with other filters that alter the length or starting
+    position of tokens without changing their offsets
+    (e.g. <<analysis-trim-tokenfilter,`trim`>>) this can cause tokens with
+    illegal offsets to be emitted. Setting `adjust_offsets` to false will
+    stop `word_delimiter_graph` from adjusting these internal offsets.
+
 `type_table`::
     A custom type mapping table, for example (when configured
     using `type_table_path`):
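
The failure mode described here can be sketched directly against the Lucene APIs. The snippet below is an illustrative example, not code from this commit: trim shortens the term produced by a keyword tokenizer but leaves its offset attribute untouched, so offset adjustment would derive sub-token offsets from positions in the trimmed term and point them at the wrong characters of the original input.

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;

public class TrimInteractionSketch {
    public static TokenStream build(String text) {
        // KeywordTokenizer emits the whole input as a single token, e.g.
        // " Power-Shot " with offsets [0,12].
        Tokenizer keyword = new KeywordTokenizer();
        keyword.setReader(new StringReader(text));
        // TrimFilter strips leading/trailing whitespace from the term text
        // but leaves startOffset/endOffset pointing at the untrimmed token.
        TokenStream trimmed = new TrimFilter(keyword);
        // With adjustInternalOffsets=true the sub-token "Power" would be
        // reported at [0,5], i.e. " Powe" in the original text; with false,
        // every sub-token keeps the parent offsets [0,12].
        return new WordDelimiterGraphFilter(trimmed, /* adjustInternalOffsets */ false,
                WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
                WordDelimiterGraphFilter.GENERATE_WORD_PARTS, null);
    }
}

Parent offsets are coarser, but they always stay within the bounds of the original token, which is what makes indexing safe again.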

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java

Lines changed: 3 additions & 1 deletion
@@ -55,6 +55,7 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
     private final byte[] charTypeTable;
     private final int flags;
     private final CharArraySet protoWords;
+    private final boolean adjustOffsets;

     public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env,
                                                 String name, Settings settings) {
@@ -95,11 +96,12 @@ public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environ
         Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
         this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
         this.flags = flags;
+        this.adjustOffsets = settings.getAsBoolean("adjust_offsets", true);
     }

     @Override
     public TokenStream create(TokenStream tokenStream) {
-        return new WordDelimiterGraphFilter(tokenStream, true, charTypeTable, flags, protoWords);
+        return new WordDelimiterGraphFilter(tokenStream, adjustOffsets, charTypeTable, flags, protoWords);
     }

     @Override

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java

Lines changed: 26 additions & 1 deletion
@@ -76,10 +76,35 @@ public void testPartsAndCatenate() throws IOException {
         String source = "PowerShot";
         int[] expectedIncr = new int[]{1, 0, 1};
         int[] expectedPosLen = new int[]{2, 1, 1};
+        int[] expectedStartOffsets = new int[]{0, 0, 5};
+        int[] expectedEndOffsets = new int[]{9, 5, 9};
         String[] expected = new String[]{"PowerShot", "Power", "Shot" };
         Tokenizer tokenizer = new WhitespaceTokenizer();
         tokenizer.setReader(new StringReader(source));
-        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null,
+            expectedIncr, expectedPosLen, null);
+    }
+
+    public void testAdjustingOffsets() throws IOException {
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+            Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put("index.analysis.filter.my_word_delimiter.type", type)
+                .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+                .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
+                .put("index.analysis.filter.my_word_delimiter.adjust_offsets", "false")
+                .build(),
+            new CommonAnalysisPlugin());
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
+        String source = "PowerShot";
+        int[] expectedIncr = new int[]{1, 0, 1};
+        int[] expectedPosLen = new int[]{2, 1, 1};
+        int[] expectedStartOffsets = new int[]{0, 0, 0};
+        int[] expectedEndOffsets = new int[]{9, 9, 9};
+        String[] expected = new String[]{"PowerShot", "Power", "Shot" };
+        Tokenizer tokenizer = new WhitespaceTokenizer();
+        tokenizer.setReader(new StringReader(source));
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null,
             expectedIncr, expectedPosLen, null);
     }
 }

modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml

Lines changed: 20 additions & 0 deletions
@@ -157,6 +157,26 @@
     - match: { tokens.2.token: brown }
     - match: { tokens.3.token: fox }

+    - do:
+        indices.analyze:
+          body:
+            text: the qu1ck brown fox
+            tokenizer: standard
+            filter:
+              - type: word_delimiter_graph
+                adjust_offsets: false
+    - length: { tokens: 6 }
+    - match: { tokens.0.token: the }
+    - match: { tokens.1.token: qu }
+    - match: { tokens.1.start_offset: 4 }
+    - match: { tokens.1.end_offset: 9 }
+    - match: { tokens.2.token: "1" }
+    - match: { tokens.2.start_offset: 4 }
+    - match: { tokens.2.end_offset: 9 }
+    - match: { tokens.3.token: ck }
+    - match: { tokens.3.start_offset: 4 }
+    - match: { tokens.3.end_offset: 9 }
+
     - do:
         indices.analyze:
           body:
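
For contrast: with the default `adjust_offsets: true`, the same request would be expected to report `qu` at [4,6], `1` at [6,7] and `ck` at [7,9], each sub-token's offsets shifted by its position inside `qu1ck` (which spans [4,9] in the input). With adjustment disabled, all three parts report the parent token's [4,9].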
