From 8997d41357ab95eaa88702b93c75de4b1813457b Mon Sep 17 00:00:00 2001
From: Alan Woodward <romseygeek@apache.org>
Date: Thu, 12 Jul 2018 13:12:23 +0100
Subject: [PATCH] LUCENE-8395: WordDelimiterGraphFilter can incorrectly add
 holes

---
 lucene/CHANGES.txt                                |  4 ++++
 .../miscellaneous/WordDelimiterGraphFilter.java   |  1 +
 .../TestWordDelimiterGraphFilter.java             | 15 ++++++++++++++-
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 42a540521c5..01b32f60726 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -154,6 +154,10 @@ Bug Fixes:
   if the last token in the stream was subsequently dropped; FixedShingleFilter did
   not set position increment in end() (Alan Woodward)
 
+* LUCENE-8395: WordDelimiterGraphFilter would incorrectly insert a hole into a
+  TokenStream if a token consisting entirely of delimiter characters was
+  encountered while preserve_original was set. (Alan Woodward)
+
 Changes in Runtime Behavior:
 
 * LUCENE-7976: TieredMergePolicy now respects maxSegmentSizeMB by default when executing
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
index 7d021c5a71c..a4382132f5a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
@@ -368,6 +368,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
           if (has(PRESERVE_ORIGINAL) == false) {
             continue;
           } else {
+            accumPosInc = 0;
             return true;
           }
         }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
index b5dc8a3e950..65d3b027df4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
@@ -213,6 +213,19 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
   }
 
   public void testPositionIncrements() throws Exception {
+
+    Analyzer a4 = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        final int flags = SPLIT_ON_NUMERICS | GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE;
+        return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, CharArraySet.EMPTY_SET));
+      }
+    };
+    assertAnalyzesTo(a4, "SAL_S8371 - SAL",
+        new String[]{ "SAL_S8371", "SAL", "S", "8371", "-", "SAL"},
+        new int[]{    1,            0,    1,    1,      1,    1});
+
     final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
     
@@ -327,7 +340,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
         null,
         false);
 
-    IOUtils.close(a, a2, a3);
+    IOUtils.close(a, a2, a3, a4);
   }
   
   public void testKeywordFilter() throws Exception {