mirror of https://github.com/apache/lucene.git
LUCENE-8395: WordDelimiterGraphFilter can incorrectly add holes
This commit is contained in:
parent
b68829e656
commit
8997d41357
|
@ -154,6 +154,10 @@ Bug Fixes:
|
|||
if the last token in the stream was subsequently dropped; FixedShingleFilter did
|
||||
not set position increment in end() (Alan Woodward)
|
||||
|
||||
* LUCENE-8395: WordDelimiterGraphFilter would incorrectly insert a hole into a
|
||||
TokenStream if a token consisting entirely of delimiter characters was
|
||||
encountered while preserve_original was set. (Alan Woodward)
|
||||
|
||||
Changes in Runtime Behavior:
|
||||
|
||||
* LUCENE-7976: TieredMergePolicy now respects maxSegmentSizeMB by default when executing
|
||||
|
|
|
@ -368,6 +368,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
|
|||
if (has(PRESERVE_ORIGINAL) == false) {
|
||||
continue;
|
||||
} else {
|
||||
accumPosInc = 0;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -213,6 +213,19 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testPositionIncrements() throws Exception {
|
||||
|
||||
Analyzer a4 = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
final int flags = SPLIT_ON_NUMERICS | GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE;
|
||||
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, CharArraySet.EMPTY_SET));
|
||||
}
|
||||
};
|
||||
assertAnalyzesTo(a4, "SAL_S8371 - SAL",
|
||||
new String[]{ "SAL_S8371", "SAL", "S", "8371", "-", "SAL"},
|
||||
new int[]{ 1, 0, 1, 1, 1, 1});
|
||||
|
||||
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||
final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
|
||||
|
||||
|
@ -327,7 +340,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
|||
null,
|
||||
false);
|
||||
|
||||
IOUtils.close(a, a2, a3);
|
||||
IOUtils.close(a, a2, a3, a4);
|
||||
}
|
||||
|
||||
public void testKeywordFilter() throws Exception {
|
||||
|
|
Loading…
Reference in New Issue