Allow word_delimiter_graph_filter to not adjust internal offsets (#36699)
This commit adds an adjust_offsets parameter to the word_delimiter_graph token filter, defaulting to true. Most of the time you want the sub-tokens emitted by this filter to have offsets adjusted to their real position in the token stream; however, some token filters can change the length or starting position of a token (e.g. trim) without changing its offset attributes, and this can lead to word_delimiter_graph emitting illegal offsets. Setting adjust_offsets to false in these cases allows indexing to succeed again. Fixes #34741, #33710
This commit is contained in:
parent 0ff1f1fa18
commit af57575838
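Before the diff, a minimal sketch (not part of this commit) of what the new flag changes in practice: it runs "PowerShot" through Lucene's WordDelimiterGraphFilter with internal-offset adjustment switched on and off. It assumes the five-argument WordDelimiterGraphFilter constructor that the factory change below calls is public in the Lucene version on your classpath; the class name AdjustOffsetsDemo and its main method are made up for illustration.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AdjustOffsetsDemo {

    public static void main(String[] args) throws IOException {
        // Offsets recomputed per sub-token: Power [0,5], Shot [5,9]
        printTokens(true);
        // Sub-tokens keep the parent token's offsets: Power [0,9], Shot [0,9]
        printTokens(false);
    }

    static void printTokens(boolean adjustInternalOffsets) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("PowerShot"));
        // Split on the case change and also emit the catenated word,
        // mirroring the settings used in the new unit test below.
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                | WordDelimiterGraphFilter.CATENATE_WORDS;
        try (WordDelimiterGraphFilter filter = new WordDelimiterGraphFilter(
                tokenizer, adjustInternalOffsets,
                WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null)) {
            CharTermAttribute term = filter.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = filter.addAttribute(OffsetAttribute.class);
            filter.reset();
            while (filter.incrementToken()) {
                System.out.println(term.toString()
                        + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
            }
            filter.end();
        }
    }
}

With adjustment on, the catenated token PowerShot spans 0-9 while Power and Shot get 0-5 and 5-9; with adjustment off, all three keep the parent token's 0-9 offsets, matching the expectations added to the unit test further down.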
@@ -71,6 +71,15 @@ Advance settings include:
     to a file configured with protected words (one on each line).
     Automatically resolves to `config/` based location if exists.
 
+`adjust_offsets`::
+    By default, the filter tries to output subtokens with adjusted offsets
+    to reflect their actual position in the token stream. However, when
+    used in combination with other filters that alter the length or starting
+    position of tokens without changing their offsets
+    (e.g. <<analysis-trim-tokenfilter,`trim`>>) this can cause tokens with
+    illegal offsets to be emitted. Setting `adjust_offsets` to false will
+    stop `word_delimiter_graph` from adjusting these internal offsets.
+
 `type_table`::
     A custom type mapping table, for example (when configured
     using `type_table_path`):
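The documentation entry above singles out trim as a filter that changes a token's term without touching its offsets. As an illustration only, here is a hypothetical Lucene analyzer (the class name TrimThenSplitAnalyzer is made up; the Lucene classes themselves are real) that wires that combination together with the internal-offset adjustment turned off, which is what adjust_offsets: false on the Elasticsearch filter maps to in the factory change below.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;

public class TrimThenSplitAnalyzer extends Analyzer {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // KeywordTokenizer emits "  PowerShot  " as a single token with offsets 0..13.
        Tokenizer source = new KeywordTokenizer();
        // TrimFilter shortens the term to "PowerShot" but leaves the offsets alone,
        // so the term text no longer lines up with the recorded offsets.
        TokenStream stream = new TrimFilter(source);
        // With the second argument set to false (the adjust_offsets=false case),
        // sub-tokens keep the parent token's offsets instead of being recomputed
        // from the now-stale term text.
        stream = new WordDelimiterGraphFilter(stream, false,
                WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
                WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                        | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE,
                null);
        return new TokenStreamComponents(source, stream);
    }
}

In index settings this corresponds to a custom analyzer whose filter chain runs trim before a word_delimiter_graph filter configured with adjust_offsets: false.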
@@ -55,6 +55,7 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
     private final byte[] charTypeTable;
     private final int flags;
     private final CharArraySet protoWords;
+    private final boolean adjustOffsets;
 
     public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env,
                                                 String name, Settings settings) {
@@ -95,11 +96,12 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
         Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
         this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
         this.flags = flags;
+        this.adjustOffsets = settings.getAsBoolean("adjust_offsets", true);
     }
 
     @Override
     public TokenStream create(TokenStream tokenStream) {
-        return new WordDelimiterGraphFilter(tokenStream, true, charTypeTable, flags, protoWords);
+        return new WordDelimiterGraphFilter(tokenStream, adjustOffsets, charTypeTable, flags, protoWords);
     }
 
     @Override
@@ -76,10 +76,35 @@ public class WordDelimiterGraphTokenFilterFactoryTests
         String source = "PowerShot";
         int[] expectedIncr = new int[]{1, 0, 1};
         int[] expectedPosLen = new int[]{2, 1, 1};
+        int[] expectedStartOffsets = new int[]{0, 0, 5};
+        int[] expectedEndOffsets = new int[]{9, 5, 9};
         String[] expected = new String[]{"PowerShot", "Power", "Shot" };
         Tokenizer tokenizer = new WhitespaceTokenizer();
         tokenizer.setReader(new StringReader(source));
-        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null,
             expectedIncr, expectedPosLen, null);
     }
 
+    public void testAdjustingOffsets() throws IOException {
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+            Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put("index.analysis.filter.my_word_delimiter.type", type)
+                .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+                .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
+                .put("index.analysis.filter.my_word_delimiter.adjust_offsets", "false")
+                .build(),
+            new CommonAnalysisPlugin());
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
+        String source = "PowerShot";
+        int[] expectedIncr = new int[]{1, 0, 1};
+        int[] expectedPosLen = new int[]{2, 1, 1};
+        int[] expectedStartOffsets = new int[]{0, 0, 0};
+        int[] expectedEndOffsets = new int[]{9, 9, 9};
+        String[] expected = new String[]{"PowerShot", "Power", "Shot" };
+        Tokenizer tokenizer = new WhitespaceTokenizer();
+        tokenizer.setReader(new StringReader(source));
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, expectedStartOffsets, expectedEndOffsets, null,
+            expectedIncr, expectedPosLen, null);
+    }
 }
@@ -157,6 +157,26 @@
   - match: { tokens.2.token: brown }
   - match: { tokens.3.token: fox }
 
+  - do:
+      indices.analyze:
+        body:
+          text: the qu1ck brown fox
+          tokenizer: standard
+          filter:
+            - type: word_delimiter_graph
+              adjust_offsets: false
+  - length: { tokens: 6 }
+  - match: { tokens.0.token: the }
+  - match: { tokens.1.token: qu }
+  - match: { tokens.1.start_offset: 4 }
+  - match: { tokens.1.end_offset: 9 }
+  - match: { tokens.2.token: "1" }
+  - match: { tokens.2.start_offset: 4 }
+  - match: { tokens.2.end_offset: 9 }
+  - match: { tokens.3.token: ck }
+  - match: { tokens.3.start_offset: 4 }
+  - match: { tokens.3.end_offset: 9 }
+
   - do:
       indices.analyze:
        body: