
Commit af57575

Allow word_delimiter_graph_filter to not adjust internal offsets (#36699)
This commit adds an `adjust_offsets` parameter to the `word_delimiter_graph` token filter, defaulting to `true`. Most of the time you want the sub-tokens emitted by this filter to have offsets adjusted to their real positions in the token stream; however, some token filters (e.g. `trim`) can change the length or starting position of a token without changing its offset attributes, and this can lead to `word_delimiter_graph` emitting illegal offsets. Setting `adjust_offsets` to `false` in these cases allows indexing to succeed again.

Fixes #34741, #33710
1 parent 0ff1f1f commit af57575
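
To make the new behaviour concrete, here is a minimal, self-contained Lucene sketch (not part of this commit) that runs the filter over "PowerShot" with both settings. The flag combination mirrors the unit test added below, the five-argument WordDelimiterGraphFilter constructor is the one the factory change calls, and the offsets in the comments follow the test's assertions.

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AdjustOffsetsDemo {
    public static void main(String[] args) throws Exception {
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                | WordDelimiterGraphFilter.CATENATE_WORDS;
        for (boolean adjust : new boolean[] { true, false }) {
            Tokenizer tokenizer = new WhitespaceTokenizer();
            tokenizer.setReader(new StringReader("PowerShot"));
            // The second argument is the adjustInternalOffsets flag that the
            // new adjust_offsets setting maps onto.
            TokenStream ts = new WordDelimiterGraphFilter(tokenizer, adjust,
                    WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // adjust=true:  PowerShot [0,9], Power [0,5], Shot [5,9]
                // adjust=false: PowerShot [0,9], Power [0,9], Shot [0,9]
                System.out.printf("adjust=%b: %s [%d,%d]%n", adjust,
                        term, offset.startOffset(), offset.endOffset());
            }
            ts.end();
            ts.close();
        }
    }
}

With adjustment disabled, every sub-token simply inherits the offsets of the token it was split from, which is always legal even if upstream filters have rewritten the term text.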


4 files changed (+58, −2)


docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc

Lines changed: 9 additions & 0 deletions
@@ -71,6 +71,15 @@ Advance settings include:
     to a file configured with protected words (one on each line).
     Automatically resolves to `config/` based location if exists.

+`adjust_offsets`::
+    By default, the filter tries to output subtokens with adjusted offsets
+    to reflect their actual position in the token stream. However, when
+    used in combination with other filters that alter the length or starting
+    position of tokens without changing their offsets
+    (e.g. <<analysis-trim-tokenfilter,`trim`>>) this can cause tokens with
+    illegal offsets to be emitted. Setting `adjust_offsets` to false will
+    stop `word_delimiter_graph` from adjusting these internal offsets.
+
 `type_table`::
     A custom type mapping table, for example (when configured
     using `type_table_path`):
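
The failure mode described here can be sketched directly against the Lucene APIs. The snippet below is an illustrative example, not code from this commit: trim shortens the term produced by a keyword tokenizer but leaves its offset attribute untouched, so offset adjustment would derive sub-token offsets from positions in the trimmed term and point them at the wrong characters of the original input.

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;

public class TrimInteractionSketch {
    public static TokenStream build(String text) {
        // KeywordTokenizer emits the whole input as a single token, e.g.
        // " Power-Shot " with offsets [0,12].
        Tokenizer keyword = new KeywordTokenizer();
        keyword.setReader(new StringReader(text));
        // TrimFilter strips leading/trailing whitespace from the term text
        // but leaves startOffset/endOffset pointing at the untrimmed token.
        TokenStream trimmed = new TrimFilter(keyword);
        // With adjustInternalOffsets=true the sub-token "Power" would be
        // reported at [0,5], i.e. " Powe" in the original text; with false,
        // every sub-token keeps the parent offsets [0,12].
        return new WordDelimiterGraphFilter(trimmed, /* adjustInternalOffsets */ false,
                WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
                WordDelimiterGraphFilter.GENERATE_WORD_PARTS, null);
    }
}

Parent offsets are coarser, but they always stay within the bounds of the original token, which is what makes indexing safe again.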

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java

Lines changed: 3 additions & 1 deletion
@@ -55,6 +55,7 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
     private final byte[] charTypeTable;
     private final int flags;
     private final CharArraySet protoWords;
+    private final boolean adjustOffsets;

     public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env,
                                                 String name, Settings settings) {
@@ -95,11 +96,12 @@ public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environ
         Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
         this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
         this.flags = flags;
+        this.adjustOffsets = settings.getAsBoolean("adjust_offsets", true);
     }

     @Override
     public TokenStream create(TokenStream tokenStream) {
-        return new WordDelimiterGraphFilter(tokenStream, true, charTypeTable, flags, protoWords);
+        return new WordDelimiterGraphFilter(tokenStream, adjustOffsets, charTypeTable, flags, protoWords);
     }

     @Override

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java

Lines changed: 26 additions & 1 deletion
@@ -76,10 +76,35 @@ public void testPartsAndCatenate() throws IOException {
         String source = "PowerShot";
         int[] expectedIncr = new int[]{1, 0, 1};
         int[] expectedPosLen = new int[]{2, 1, 1};
+        int[] expectedStartOffsets = new int[]{0, 0, 5};
+        int[] expectedEndOffsets = new int[]{9, 5, 9};
         String[] expected = new String[]{"PowerShot", "Power", "Shot" };
         Tokenizer tokenizer = new WhitespaceTokenizer();
         tokenizer.setReader(new StringReader(source));
-        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null,
+            expectedIncr, expectedPosLen, null);
+    }
+
+    public void testAdjustingOffsets() throws IOException {
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+            Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put("index.analysis.filter.my_word_delimiter.type", type)
+                .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+                .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
+                .put("index.analysis.filter.my_word_delimiter.adjust_offsets", "false")
+                .build(),
+            new CommonAnalysisPlugin());
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
+        String source = "PowerShot";
+        int[] expectedIncr = new int[]{1, 0, 1};
+        int[] expectedPosLen = new int[]{2, 1, 1};
+        int[] expectedStartOffsets = new int[]{0, 0, 0};
+        int[] expectedEndOffsets = new int[]{9, 9, 9};
+        String[] expected = new String[]{"PowerShot", "Power", "Shot" };
+        Tokenizer tokenizer = new WhitespaceTokenizer();
+        tokenizer.setReader(new StringReader(source));
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null,
             expectedIncr, expectedPosLen, null);
     }
 }

modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml

Lines changed: 20 additions & 0 deletions
@@ -157,6 +157,26 @@
     - match: { tokens.2.token: brown }
     - match: { tokens.3.token: fox }

+    - do:
+        indices.analyze:
+          body:
+            text: the qu1ck brown fox
+            tokenizer: standard
+            filter:
+              - type: word_delimiter_graph
+                adjust_offsets: false
+    - length: { tokens: 6 }
+    - match: { tokens.0.token: the }
+    - match: { tokens.1.token: qu }
+    - match: { tokens.1.start_offset: 4 }
+    - match: { tokens.1.end_offset: 9 }
+    - match: { tokens.2.token: "1" }
+    - match: { tokens.2.start_offset: 4 }
+    - match: { tokens.2.end_offset: 9 }
+    - match: { tokens.3.token: ck }
+    - match: { tokens.3.start_offset: 4 }
+    - match: { tokens.3.end_offset: 9 }
+
     - do:
         indices.analyze:
           body:
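
For contrast: with the default `adjust_offsets: true`, the same request would be expected to report `qu` at [4,6], `1` at [6,7] and `ck` at [7,9], each sub-token's offsets shifted by its position inside `qu1ck` (which spans [4,9] in the input). With adjustment disabled, all three parts report the parent token's [4,9].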
