mirror of https://github.com/apache/lucene.git
LUCENE-8395: WordDelimiterGraphFilter can incorrectly add holes
This commit is contained in:
parent
b68829e656
commit
8997d41357
|
@ -154,6 +154,10 @@ Bug Fixes:
|
||||||
if the last token in the stream was subsequently dropped; FixedShingleFilter did
|
if the last token in the stream was subsequently dropped; FixedShingleFilter did
|
||||||
not set position increment in end() (Alan Woodward)
|
not set position increment in end() (Alan Woodward)
|
||||||
|
|
||||||
|
* LUCENE-8395: WordDelimiterGraphFilter would incorrectly insert a hole into a
|
||||||
|
TokenStream if a token consisting entirely of delimiter characters was
|
||||||
|
encountered while preserve_original was set. (Alan Woodward)
|
||||||
|
|
||||||
Changes in Runtime Behavior:
|
Changes in Runtime Behavior:
|
||||||
|
|
||||||
* LUCENE-7976: TieredMergePolicy now respects maxSegmentSizeMB by default when executing
|
* LUCENE-7976: TieredMergePolicy now respects maxSegmentSizeMB by default when executing
|
||||||
|
|
|
@ -368,6 +368,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
|
||||||
if (has(PRESERVE_ORIGINAL) == false) {
|
if (has(PRESERVE_ORIGINAL) == false) {
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
|
accumPosInc = 0;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -213,6 +213,19 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testPositionIncrements() throws Exception {
|
public void testPositionIncrements() throws Exception {
|
||||||
|
|
||||||
|
Analyzer a4 = new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
|
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
|
final int flags = SPLIT_ON_NUMERICS | GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE;
|
||||||
|
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, CharArraySet.EMPTY_SET));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
assertAnalyzesTo(a4, "SAL_S8371 - SAL",
|
||||||
|
new String[]{ "SAL_S8371", "SAL", "S", "8371", "-", "SAL"},
|
||||||
|
new int[]{ 1, 0, 1, 1, 1, 1});
|
||||||
|
|
||||||
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
|
||||||
final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
|
final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
|
||||||
|
|
||||||
|
@ -327,7 +340,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
|
||||||
null,
|
null,
|
||||||
false);
|
false);
|
||||||
|
|
||||||
IOUtils.close(a, a2, a3);
|
IOUtils.close(a, a2, a3, a4);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testKeywordFilter() throws Exception {
|
public void testKeywordFilter() throws Exception {
|
||||||
|
|
Loading…
Reference in New Issue