From 8997d41357ab95eaa88702b93c75de4b1813457b Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Thu, 12 Jul 2018 13:12:23 +0100 Subject: [PATCH] LUCENE-8395: WordDelimiterGraphFilter can incorrectly add holes --- lucene/CHANGES.txt | 4 ++++ .../miscellaneous/WordDelimiterGraphFilter.java | 1 + .../TestWordDelimiterGraphFilter.java | 15 ++++++++++++++- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 42a540521c5..01b32f60726 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -154,6 +154,10 @@ Bug Fixes: if the last token in the stream was subsequently dropped; FixedShingleFilter did not set position increment in end() (Alan Woodward) +* LUCENE-8395: WordDelimiterGraphFilter would incorrectly insert a hole into a + TokenStream if a token consisting entirely of delimiter characters was + encountered, but preserve_original was set. (Alan Woodward) + Changes in Runtime Behavior: * LUCENE-7976: TieredMergePolicy now respects maxSegmentSizeMB by default when executing diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java index 7d021c5a71c..a4382132f5a 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java @@ -368,6 +368,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter { if (has(PRESERVE_ORIGINAL) == false) { continue; } else { + accumPosInc = 0; return true; } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java index b5dc8a3e950..65d3b027df4 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java @@ -213,6 +213,19 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase { } public void testPositionIncrements() throws Exception { + + Analyzer a4 = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + final int flags = SPLIT_ON_NUMERICS | GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE; + return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, CharArraySet.EMPTY_SET)); + } + }; + assertAnalyzesTo(a4, "SAL_S8371 - SAL", + new String[]{ "SAL_S8371", "SAL", "S", "8371", "-", "SAL"}, + new int[]{ 1, 0, 1, 1, 1, 1}); + final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false); @@ -327,7 +340,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase { null, false); - IOUtils.close(a, a2, a3); + IOUtils.close(a, a2, a3, a4); } public void testKeywordFilter() throws Exception {