LUCENE-8730: WordDelimiterGraphFilter always emits its original token first

This commit is contained in:
Alan Woodward 2019-04-01 18:21:06 +01:00
parent 2480b74887
commit 3de0b36719
3 changed files with 33 additions and 4 deletions

View File

@@ -90,6 +90,11 @@ Changes in Runtime Behavior
* LUCENE-8671: Load FST off-heap also for ID-like fields if reader is not opened
from an IndexWriter. (Simon Willnauer)
* LUCENE-8730: WordDelimiterGraphFilter always emits its original token first. This
brings its behaviour into line with the deprecated WordDelimiterFilter, so that
the only difference in output between the two is in the position length
attribute. (Alan Woodward, Jim Ferenczi)
Other
* LUCENE-8680: Refactor EdgeTree#relateTriangle method. (Ignacio Vera)

View File

@@ -268,6 +268,12 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
lastConcatCount = 0;
wordPos = 0;
if (has(PRESERVE_ORIGINAL)) {
// add the original token now so that it is always emitted first
// we will edit the term length after all other parts have been buffered
buffer(0, 1, 0, savedTermLength);
}
if (iterator.isSingleWord()) {
buffer(wordPos, wordPos+1, iterator.current, iterator.end);
wordPos++;
@@ -320,15 +326,16 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
}
if (has(PRESERVE_ORIGINAL)) {
// we now know how many tokens need to be injected, so we can set the original
// token's position length
if (wordPos == 0) {
// can happen w/ strange flag combos and inputs :)
wordPos++;
}
// add the original token now so that we can set the correct end position
buffer(0, wordPos, 0, savedTermLength);
bufferedParts[1] = wordPos;
}
sorter.sort(0, bufferedLen);
sorter.sort(has(PRESERVE_ORIGINAL) ? 1 : 0, bufferedLen);
wordPos = 0;
// set back to 0 for iterating from the buffer

View File

@@ -380,6 +380,23 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
};
}
// LUCENE-8730: with PRESERVE_ORIGINAL set, the original (unsplit) token must be
// emitted before any parts generated from it. The expected-token array below
// shows each whitespace-delimited input token's original form first, followed
// by the catenated form and/or generated sub-parts.
public void testOriginalTokenEmittedFirst() throws Exception {
final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
/* analyzer that uses whitespace + wdf */
Analyzer a = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, true, DEFAULT_WORD_DELIM_TABLE, flags, null));
}
};
// e.g. "abc-def" (original) precedes "abcdef" (catenated) and "abc"/"def" (parts);
// for "abcDEF" the original and the catenation of the case-split parts are identical
assertAnalyzesTo(a, "abc-def abcDEF abc123",
new String[] { "abc-def", "abcdef", "abc", "def", "abcDEF", "abcDEF", "abc", "DEF", "abc123", "abc123", "abc", "123" });
a.close();
}
/** concat numbers + words + all */
public void testLotsOfConcatenating() throws Exception {
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
@@ -418,7 +435,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
};
assertAnalyzesTo(a, "abc-def-123-456",
new String[] { "abcdef123456", "abc-def-123-456", "abcdef", "abc", "def", "123456", "123", "456" },
new String[] { "abc-def-123-456", "abcdef123456", "abcdef", "abc", "def", "123456", "123", "456" },
new int[] { 0, 0, 0, 0, 0, 0, 0, 0 },
new int[] { 15, 15, 15, 15, 15, 15, 15, 15 },
null,