diff --git a/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc index 183d587090b96..4acd0163109a4 100644 --- a/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc @@ -71,6 +71,15 @@ Advance settings include: to a file configured with protected words (one on each line). Automatically resolves to `config/` based location if exists. +`adjust_offsets`:: + By default, the filter tries to output subtokens with adjusted offsets + to reflect their actual position in the token stream. However, when + used in combination with other filters that alter the length or starting + position of tokens without changing their offsets + (e.g. <<analysis-trim-tokenfilter,`trim`>>) this can cause tokens with + illegal offsets to be emitted. Setting `adjust_offsets` to false will + stop `word_delimiter_graph` from adjusting these internal offsets. 
+ `type_table`:: A custom type mapping table, for example (when configured using `type_table_path`): diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java index 205f4072579a3..e64bb96bfcfd4 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java @@ -55,6 +55,7 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac private final byte[] charTypeTable; private final int flags; private final CharArraySet protoWords; + private final boolean adjustOffsets; public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { @@ -95,11 +96,12 @@ public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environ Set protectedWords = Analysis.getWordSet(env, settings, "protected_words"); this.protoWords = protectedWords == null ? 
null : CharArraySet.copy(protectedWords); this.flags = flags; + this.adjustOffsets = settings.getAsBoolean("adjust_offsets", true); } @Override public TokenStream create(TokenStream tokenStream) { - return new WordDelimiterGraphFilter(tokenStream, true, charTypeTable, flags, protoWords); + return new WordDelimiterGraphFilter(tokenStream, adjustOffsets, charTypeTable, flags, protoWords); } @Override diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java index bd7ff2f0c0188..0a77c0c42d0a0 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java @@ -76,10 +76,35 @@ public void testPartsAndCatenate() throws IOException { String source = "PowerShot"; int[] expectedIncr = new int[]{1, 0, 1}; int[] expectedPosLen = new int[]{2, 1, 1}; + int[] expectedStartOffsets = new int[]{0, 0, 5}; + int[] expectedEndOffsets = new int[]{9, 5, 9}; String[] expected = new String[]{"PowerShot", "Power", "Shot" }; Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null, + assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null, + expectedIncr, expectedPosLen, null); + } + + public void testAdjustingOffsets() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + 
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true") + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") + .put("index.analysis.filter.my_word_delimiter.adjust_offsets", "false") + .build(), + new CommonAnalysisPlugin()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot"; + int[] expectedIncr = new int[]{1, 0, 1}; + int[] expectedPosLen = new int[]{2, 1, 1}; + int[] expectedStartOffsets = new int[]{0, 0, 0}; + int[] expectedEndOffsets = new int[]{9, 9, 9}; + String[] expected = new String[]{"PowerShot", "Power", "Shot" }; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null, expectedIncr, expectedPosLen, null); } } diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml index 4106237f2cca6..75658d9351f39 100644 --- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml @@ -157,6 +157,26 @@ - match: { tokens.2.token: brown } - match: { tokens.3.token: fox } + - do: + indices.analyze: + body: + text: the qu1ck brown fox + tokenizer: standard + filter: + - type: word_delimiter_graph + adjust_offsets: false + - length: { tokens: 6 } + - match: { tokens.0.token: the } + - match: { tokens.1.token: qu } + - match: { tokens.1.start_offset: 4 } + - match: { tokens.1.end_offset: 9 } + - match: { tokens.2.token: "1" } + - match: { tokens.2.start_offset: 4 } + - match: { tokens.2.end_offset: 9 } + - match: { tokens.3.token: ck } + - match: { tokens.3.start_offset: 4 } + - match: { tokens.3.end_offset: 9 } + - 
do: indices.analyze: body: