Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,15 @@ Advanced settings include:
to a file configured with protected words (one on each line).
Automatically resolves to `config/` based location if exists.

`adjust_offsets`::
By default, the filter tries to output subtokens with adjusted offsets
to reflect their actual position in the token stream. However, when
used in combination with other filters that alter the length or starting
position of tokens without changing their offsets
(e.g. <<analysis-trim-tokenfilter,`trim`>>), this can cause tokens with
illegal offsets to be emitted. Setting `adjust_offsets` to `false` will
stop `word_delimiter_graph` from adjusting these internal offsets.

`type_table`::
A custom type mapping table, for example (when configured
using `type_table_path`):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
private final byte[] charTypeTable;
private final int flags;
private final CharArraySet protoWords;
private final boolean adjustOffsets;

public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env,
String name, Settings settings) {
Expand Down Expand Up @@ -95,11 +96,12 @@ public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environ
Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
this.flags = flags;
this.adjustOffsets = settings.getAsBoolean("adjust_offsets", true);
}

@Override
public TokenStream create(TokenStream tokenStream) {
    // NOTE(review): the diff span contained both the old call (hard-coded `true`)
    // and the new one; this is the merged version. The second argument controls
    // whether the filter adjusts subtoken offsets, driven by the `adjust_offsets`
    // setting read in the constructor (defaults to true).
    return new WordDelimiterGraphFilter(tokenStream, adjustOffsets, charTypeTable, flags, protoWords);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,35 @@ public void testPartsAndCatenate() throws IOException {
String source = "PowerShot";
int[] expectedIncr = new int[]{1, 0, 1};
int[] expectedPosLen = new int[]{2, 1, 1};
int[] expectedStartOffsets = new int[]{0, 0, 5};
int[] expectedEndOffsets = new int[]{9, 5, 9};
String[] expected = new String[]{"PowerShot", "Power", "Shot" };
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null,
expectedIncr, expectedPosLen, null);
}

public void testAdjustingOffsets() throws IOException {
    // With adjust_offsets=false, every subtoken keeps the offsets of the token
    // it was split from, so all three emitted tokens span the whole "PowerShot"
    // input (0..9) instead of pointing at their own character ranges.
    Settings filterSettings = Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .put("index.analysis.filter.my_word_delimiter.type", type)
        .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
        .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
        .put("index.analysis.filter.my_word_delimiter.adjust_offsets", "false")
        .build();
    ESTestCase.TestAnalysis testAnalysis =
        AnalysisTestsHelper.createTestAnalysisFromSettings(filterSettings, new CommonAnalysisPlugin());
    TokenFilterFactory factory = testAnalysis.tokenFilter.get("my_word_delimiter");

    String input = "PowerShot";
    String[] expectedTokens = new String[]{"PowerShot", "Power", "Shot" };
    int[] starts = new int[]{0, 0, 0};
    int[] ends = new int[]{9, 9, 9};
    int[] increments = new int[]{1, 0, 1};
    int[] positionLengths = new int[]{2, 1, 1};

    Tokenizer whitespace = new WhitespaceTokenizer();
    whitespace.setReader(new StringReader(input));
    assertTokenStreamContents(factory.create(whitespace), expectedTokens, starts, ends, null,
        increments, positionLengths, null);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,26 @@
- match: { tokens.2.token: brown }
- match: { tokens.3.token: fox }

- do:
indices.analyze:
body:
text: the qu1ck brown fox
tokenizer: standard
filter:
- type: word_delimiter_graph
adjust_offsets: false
- length: { tokens: 6 }
- match: { tokens.0.token: the }
- match: { tokens.1.token: qu }
- match: { tokens.1.start_offset: 4 }
- match: { tokens.1.end_offset: 9 }
- match: { tokens.2.token: "1" }
- match: { tokens.2.start_offset: 4 }
- match: { tokens.2.end_offset: 9 }
- match: { tokens.3.token: ck }
- match: { tokens.3.start_offset: 4 }
- match: { tokens.3.end_offset: 9 }

- do:
indices.analyze:
body:
Expand Down