LUCENE-8395: WordDelimiterGraphFilter can incorrectly add holes

This commit is contained in:
Alan Woodward 2018-07-12 13:12:23 +01:00
parent b68829e656
commit 8997d41357
3 changed files with 19 additions and 1 deletions

View File

@@ -154,6 +154,10 @@ Bug Fixes:
if the last token in the stream was subsequently dropped; FixedShingleFilter did
not set position increment in end() (Alan Woodward)
* LUCENE-8395: WordDelimiterGraphFilter would incorrectly insert a hole into a
TokenStream when it encountered a token consisting entirely of delimiter
characters while preserve_original was set. (Alan Woodward)
Changes in Runtime Behavior:
* LUCENE-7976: TieredMergePolicy now respects maxSegmentSizeMB by default when executing

View File

@@ -368,6 +368,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
if (has(PRESERVE_ORIGINAL) == false) {
continue;
} else {
accumPosInc = 0;
return true;
}
}

View File

@@ -213,6 +213,19 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
}
public void testPositionIncrements() throws Exception {
Analyzer a4 = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
final int flags = SPLIT_ON_NUMERICS | GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE;
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, CharArraySet.EMPTY_SET));
}
};
assertAnalyzesTo(a4, "SAL_S8371 - SAL",
new String[]{ "SAL_S8371", "SAL", "S", "8371", "-", "SAL"},
new int[]{ 1, 0, 1, 1, 1, 1});
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
@@ -327,7 +340,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
null,
false);
IOUtils.close(a, a2, a3);
IOUtils.close(a, a2, a3, a4);
}
public void testKeywordFilter() throws Exception {