Allow word_delimiter_graph_filter to not adjust internal offsets (#36699)

This commit adds an adjust_offsets parameter to the word_delimiter_graph token filter, defaulting
to true. Most of the time you'd want sub-tokens emitted by this filter to have offsets that are
adjusted to their real position in the token stream; however, some token filters can change the
length or starting position of a token (e.g. trim) without changing its offset attributes, and this
can lead to word_delimiter_graph emitting illegal offsets. Setting adjust_offsets to false in these
cases will allow indexing again.

Fixes #34741, #33710
Alan Woodward 2018-12-18 13:20:51 +00:00 committed by GitHub
parent 0ff1f1fa18
commit af57575838
4 changed files with 58 additions and 2 deletions
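
As a side note (not part of this commit), here is a minimal Lucene-level sketch of what the new switch changes, mirroring the unit test added below. The constructor, the flag constants and WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE are the same Lucene API the factory in this diff uses; the demo class and its printTokens helper are purely illustrative, and the expected offsets in the comments are taken from the test expectations further down.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AdjustOffsetsDemo {

    public static void main(String[] args) throws Exception {
        // "PowerShot" is a single whitespace token with offsets 0..9.
        printTokens(true);   // adjusted:   PowerShot [0..9], Power [0..5], Shot [5..9]
        printTokens(false);  // unadjusted: PowerShot [0..9], Power [0..9], Shot [0..9]
    }

    static void printTokens(boolean adjustOffsets) throws Exception {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("PowerShot"));
        // Roughly the flags the test below enables (word parts, case-change splits, catenation).
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
            | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
            | WordDelimiterGraphFilter.CATENATE_WORDS;
        // Same constructor the factory below calls; the second argument is the new switch.
        TokenStream ts = new WordDelimiterGraphFilter(tokenizer, adjustOffsets,
            WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString() + " [" + offset.startOffset() + ".." + offset.endOffset() + "]");
        }
        ts.end();
        ts.close();
    }
}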


@@ -71,6 +71,15 @@ Advance settings include:
to a file configured with protected words (one on each line).
Automatically resolves to `config/` based location if exists.
`adjust_offsets`::
By default, the filter tries to output subtokens with adjusted offsets
to reflect their actual position in the token stream. However, when
used in combination with other filters that alter the length or starting
position of tokens without changing their offsets
(e.g. <<analysis-trim-tokenfilter,`trim`>>), this can cause tokens with
illegal offsets to be emitted. Setting `adjust_offsets` to `false` will
stop `word_delimiter_graph` from adjusting these internal offsets.
`type_table`::
A custom type mapping table, for example (when configured
using `type_table_path`):
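
To make the `adjust_offsets` documentation entry above concrete, here is a hedged Lucene-level sketch of the `trim` interaction it warns about (illustrative only, not part of the change): `trim` shortens the term but leaves the offset attribute at the original 0..13, so offsets derived from positions inside the trimmed term would no longer line up with the source text; with adjustment disabled, every sub-token simply keeps the parent token's offsets. The demo class name is hypothetical; the filter classes and flags are Lucene's.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TrimThenWordDelimiterDemo {

    public static void main(String[] args) throws Exception {
        // One keyword token for the whole input: term "  PowerShot  ", offsets 0..13.
        KeywordTokenizer tokenizer = new KeywordTokenizer();
        tokenizer.setReader(new StringReader("  PowerShot  "));

        // trim shrinks the term to "PowerShot" (9 chars) but the offsets stay 0..13,
        // so character positions inside the term no longer map onto the original text.
        TokenStream trimmed = new TrimFilter(tokenizer);

        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
            | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
        // adjustInternalOffsets = false: "Power" and "Shot" both keep the parent offsets 0..13
        // rather than offsets computed from their position inside the trimmed term.
        TokenStream ts = new WordDelimiterGraphFilter(trimmed, false,
            WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);

        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString() + " [" + offset.startOffset() + ".." + offset.endOffset() + "]");
        }
        ts.end();
        ts.close();
    }
}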


@@ -55,6 +55,7 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
    private final byte[] charTypeTable;
    private final int flags;
    private final CharArraySet protoWords;
    private final boolean adjustOffsets;

    public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env,
                                                String name, Settings settings) {
@@ -95,11 +96,12 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
        Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
        this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
        this.flags = flags;
        this.adjustOffsets = settings.getAsBoolean("adjust_offsets", true);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new WordDelimiterGraphFilter(tokenStream, true, charTypeTable, flags, protoWords);
        return new WordDelimiterGraphFilter(tokenStream, adjustOffsets, charTypeTable, flags, protoWords);
    }

    @Override


@@ -76,10 +76,35 @@ public class WordDelimiterGraphTokenFilterFactoryTests
        String source = "PowerShot";
        int[] expectedIncr = new int[]{1, 0, 1};
        int[] expectedPosLen = new int[]{2, 1, 1};
        int[] expectedStartOffsets = new int[]{0, 0, 5};
        int[] expectedEndOffsets = new int[]{9, 5, 9};
        String[] expected = new String[]{"PowerShot", "Power", "Shot" };
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader(source));
        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null,
            expectedIncr, expectedPosLen, null);
    }

    public void testAdjustingOffsets() throws IOException {
        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
            Settings.builder()
                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
                .put("index.analysis.filter.my_word_delimiter.type", type)
                .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
                .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
                .put("index.analysis.filter.my_word_delimiter.adjust_offsets", "false")
                .build(),
            new CommonAnalysisPlugin());
        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
        String source = "PowerShot";
        int[] expectedIncr = new int[]{1, 0, 1};
        int[] expectedPosLen = new int[]{2, 1, 1};
        int[] expectedStartOffsets = new int[]{0, 0, 0};
        int[] expectedEndOffsets = new int[]{9, 9, 9};
        String[] expected = new String[]{"PowerShot", "Power", "Shot" };
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader(source));
        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null,
            expectedIncr, expectedPosLen, null);
    }
}


@@ -157,6 +157,26 @@
- match: { tokens.2.token: brown }
- match: { tokens.3.token: fox }
- do:
    indices.analyze:
      body:
        text: the qu1ck brown fox
        tokenizer: standard
        filter:
          - type: word_delimiter_graph
            adjust_offsets: false
- length: { tokens: 6 }
- match: { tokens.0.token: the }
- match: { tokens.1.token: qu }
- match: { tokens.1.start_offset: 4 }
- match: { tokens.1.end_offset: 9 }
- match: { tokens.2.token: "1" }
- match: { tokens.2.start_offset: 4 }
- match: { tokens.2.end_offset: 9 }
- match: { tokens.3.token: ck }
- match: { tokens.3.start_offset: 4 }
- match: { tokens.3.end_offset: 9 }
- do:
    indices.analyze:
      body: