LUCENE-8395: WordDelimiterGraphFilter can incorrectly add holes

This commit is contained in:
Alan Woodward 2018-07-12 13:12:23 +01:00
parent b68829e656
commit 8997d41357
3 changed files with 19 additions and 1 deletions

View File

@ -154,6 +154,10 @@ Bug Fixes:
if the last token in the stream was subsequently dropped; FixedShingleFilter did
not set position increment in end() (Alan Woodward)
* LUCENE-8395: WordDelimiterGraphFilter would incorrectly insert a hole into a
TokenStream when a token consisting entirely of delimiter characters was
encountered while preserve_original was set. (Alan Woodward)
Changes in Runtime Behavior:
* LUCENE-7976: TieredMergePolicy now respects maxSegmentSizeMB by default when executing

View File

@ -368,6 +368,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
if (has(PRESERVE_ORIGINAL) == false) { if (has(PRESERVE_ORIGINAL) == false) {
continue; continue;
} else { } else {
accumPosInc = 0;
return true; return true;
} }
} }

View File

@ -213,6 +213,19 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
} }
public void testPositionIncrements() throws Exception { public void testPositionIncrements() throws Exception {
// Analyzer added with the LUCENE-8395 change: a whitespace MockTokenizer feeding a
// WordDelimiterGraphFilter whose flags include PRESERVE_ORIGINAL.
Analyzer a4 = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
// PRESERVE_ORIGINAL is combined with the word/number part generation and splitting flags.
final int flags = SPLIT_ON_NUMERICS | GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE;
// CharArraySet.EMPTY_SET is the third constructor argument (presumably the protected-words set; confirm against the constructor).
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, CharArraySet.EMPTY_SET));
}
};
assertAnalyzesTo(a4, "SAL_S8371 - SAL",
new String[]{ "SAL_S8371", "SAL", "S", "8371", "-", "SAL"},
new int[]{ 1, 0, 1, 1, 1, 1});
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false); final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
@ -327,7 +340,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
null, null,
false); false);
IOUtils.close(a, a2, a3); IOUtils.close(a, a2, a3, a4);
} }
public void testKeywordFilter() throws Exception { public void testKeywordFilter() throws Exception {