LUCENE-8395: WordDelimiterGraphFilter can incorrectly add holes

2018-07-12 13:12:23 +01:00 · 2018-07-12 13:12:23 +01:00 · 8997d41357
parent b68829e656
commit 8997d41357
3 changed files with 19 additions and 1 deletion
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -154,6 +154,10 @@ Bug Fixes:
  if the last token in the stream was subsequently dropped; FixedShingleFilter did
  not set position increment in end() (Alan Woodward)
 * LUCENE-8395: WordDelimiterGraphFilter would incorrectly insert a hole into a
  TokenStream if a token consisting entirely of delimiter characters was
  encountered while PRESERVE_ORIGINAL was set. (Alan Woodward)
 Changes in Runtime Behavior:
 * LUCENE-7976: TieredMergePolicy now respects maxSegmentSizeMB by default when executing
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
@@ -368,6 +368,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
          if (has(PRESERVE_ORIGINAL) == false) {
            continue;
          } else {
            accumPosInc = 0;
            return true;
          }
        }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
@@ -213,6 +213,19 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
  }
  public void testPositionIncrements() throws Exception {
    Analyzer a4 = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        final int flags = SPLIT_ON_NUMERICS | GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE;
        return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, CharArraySet.EMPTY_SET));
      }
    };
    assertAnalyzesTo(a4, "SAL_S8371 - SAL",
        new String[]{ "SAL_S8371", "SAL", "S", "8371", "-", "SAL"},
        new int[]{    1,            0,    1,    1,      1,    1});
    final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
@@ -327,7 +340,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
        null,
        false);
-    IOUtils.close(a, a2, a3);
+    IOUtils.close(a, a2, a3, a4);
  }
  public void testKeywordFilter() throws Exception {