LUCENE-9006: WDGF catenateAll should come before parts

Fixes #953
Author: David Smiley
Date:   2019-10-15 12:22:53 -04:00
Commit: 517bfd0ab7 (parent 1d7cd61575)

4 changed files with 51 additions and 16 deletions

CHANGES.txt

@@ -75,8 +75,9 @@ New Features
 (No changes)
 
 Improvements
 ---------------------
-(No changes)
+* LUCENE-9006: WordDelimiterGraphFilter's catenateAll token is now ordered before any token parts, like WDF did.
+  (David Smiley)
 
 Optimizations
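
The practical effect: the catenate-all token now sorts ahead of the generated word parts. A minimal sketch using the commit's own test data (assumes an Analyzer a built around WordDelimiterGraphFilter with PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | CATENATE_ALL, as in the new test further down):

    // "8-other": the preserved original and the catenate-all token now
    // precede the word part "other"
    assertTokenStreamContents(a.tokenStream("dummy", "8-other"),
        new String[] { "8-other", "8other", "other" },  // token order
        new int[] { 0, 0, 2 },                          // start offsets
        new int[] { 7, 7, 7 });                         // end offsets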

WordDelimiterGraphFilter.java

@@ -447,7 +447,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
   private class PositionSorter extends InPlaceMergeSorter {
     @Override
     protected int compare(int i, int j) {
-      // sort by smaller start position
+      // smaller start position
      int iPosStart = bufferedParts[4*i];
      int jPosStart = bufferedParts[4*j];
      int cmp = Integer.compare(iPosStart, jPosStart);
@@ -455,10 +455,18 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
        return cmp;
      }
 
-      // tie break by longest pos length:
+      // longest pos length:
      int iPosEnd = bufferedParts[4*i+1];
      int jPosEnd = bufferedParts[4*j+1];
-      return Integer.compare(jPosEnd, iPosEnd);
+      cmp = Integer.compare(jPosEnd, iPosEnd);
+      if (cmp != 0) {
+        return cmp;
+      }
+
+      // smaller start offset
+      int iOff = bufferedParts[4*i + 2];
+      int jOff = bufferedParts[4*j + 2];
+      return Integer.compare(iOff, jOff);
    }
 
    @Override
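
For readability, here is the comparator as it stands after both hunks are applied (assembled from the diff above; the 4*i stride implies four ints per buffered part, of which slots 0, 1, and 2 are position start, position end, and start offset — slot 3 is presumably the end offset, though this hunk never touches it):

    private class PositionSorter extends InPlaceMergeSorter {
      @Override
      protected int compare(int i, int j) {
        // 1. smaller start position first
        int iPosStart = bufferedParts[4*i];
        int jPosStart = bufferedParts[4*j];
        int cmp = Integer.compare(iPosStart, jPosStart);
        if (cmp != 0) {
          return cmp;
        }
        // 2. longer position span first (operands reversed on purpose), so a
        //    catenated token spanning several parts sorts ahead of any one part
        int iPosEnd = bufferedParts[4*i+1];
        int jPosEnd = bufferedParts[4*j+1];
        cmp = Integer.compare(jPosEnd, iPosEnd);
        if (cmp != 0) {
          return cmp;
        }
        // 3. smaller start offset first -- the tie-break this commit adds; in
        //    the "8-other" test, "8other" (offset 0) now precedes "other" (offset 2)
        int iOff = bufferedParts[4*i + 2];
        int jOff = bufferedParts[4*j + 2];
        return Integer.compare(iOff, jOff);
      }
      // swap(int, int) and the rest of the class are unchanged
    }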

TestWordDelimiterGraphFilter.java

@@ -397,6 +397,34 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
    a.close();
  }
 
+  // https://issues.apache.org/jira/browse/LUCENE-9006
+  public void testCatenateAllEmittedBeforeParts() throws Exception {
+    // no number parts
+    final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | CATENATE_ALL;
+    //not using getAnalyzer because we want adjustInternalOffsets=true
+    Analyzer a = new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String field) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, true, DEFAULT_WORD_DELIM_TABLE, flags, null));
+      }
+    };
+
+    // input starts with a number, but we don't generate numbers.
+    // Nonetheless preserve-original and concatenate-all show up first.
+    assertTokenStreamContents(a.tokenStream("dummy", "8-other"),
+        new String[] { "8-other", "8other", "other" }, new int[]{0, 0, 2}, new int[]{7, 7, 7});
+
+    boolean useCharFilter = true;
+    boolean graphOffsetsAreCorrect = false; // note: could solve via always incrementing wordPos on first word ('8')
+    checkAnalysisConsistency(random(), a, useCharFilter, "8-other", graphOffsetsAreCorrect);
+
+    verify("8-other", flags); // uses getAnalyzer which uses adjustInternalOffsets=false which works
+
+    a.close();
+  }
+
  /** concat numbers + words + all */
  public void testLotsOfConcatenating() throws Exception {
    final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
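
The comments in the new test contrast its inline Analyzer (adjustInternalOffsets=true) with the test class's getAnalyzer helper (adjustInternalOffsets=false). That helper is not part of this diff; a hypothetical sketch of its likely shape, inferred only from those comments (name reuse aside, the body is assumed, not taken from the commit):

    // Hypothetical -- the real getAnalyzer lives elsewhere in this test class.
    private static Analyzer getAnalyzer(final int flags) {
      return new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String field) {
          Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
          // second argument is adjustInternalOffsets: false here,
          // true in the inline analyzer above
          return new TokenStreamComponents(tokenizer,
              new WordDelimiterGraphFilter(tokenizer, false, DEFAULT_WORD_DELIM_TABLE, flags, null));
        }
      };
    }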
@@ -947,6 +975,9 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
      fail(b.toString());
    }
+
+    boolean useCharFilter = true;
+    checkAnalysisConsistency(random(), getAnalyzer(flags), useCharFilter, text);
  }
 
  public void testOnlyNumbers() throws Exception {

BaseTokenStreamTestCase.java

@ -373,26 +373,26 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
checkResetException(a, input);
checkAnalysisConsistency(random(), a, true, input);
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
checkResetException(a, input);
checkAnalysisConsistency(random(), a, true, input);
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean graphOffsetsAreCorrect) throws IOException {
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), graphOffsetsAreCorrect);
checkResetException(a, input);
checkAnalysisConsistency(random(), a, true, input, graphOffsetsAreCorrect);
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), graphOffsetsAreCorrect);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean graphOffsetsAreCorrect, byte[][] payloads) throws IOException {
checkResetException(a, input);
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), null, null, graphOffsetsAreCorrect, payloads);
checkResetException(a, input);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
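
The net effect of the reordering: every overload now runs checkResetException (and, where applicable, checkAnalysisConsistency) before asserting stream contents, so contract violations surface before expectation mismatches. A usage sketch against the simplest overload (token expectations assume an analyzer configured with GENERATE_WORD_PARTS | CATENATE_ALL, per the ordering this commit establishes):

    // the catenate-all token "wifi" is expected first, then the parts
    assertAnalyzesTo(analyzer, "wi-fi", new String[] { "wifi", "wi", "fi" });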
@@ -948,13 +948,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
    w.close();
  }
 
-  static int[] toIntArray(List<Integer> list) {
-    int ret[] = new int[list.size()];
-    int offset = 0;
-    for (Integer i : list) {
-      ret[offset++] = i;
-    }
-    return ret;
+  private static int[] toIntArray(List<Integer> list) {
+    return list.stream().mapToInt(Integer::intValue).toArray();
  }
 
  protected static MockTokenizer whitespaceMockTokenizer(Reader input) throws IOException {
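
The streamlined helper is behavior-compatible with the loop it replaces; a quick illustration with arbitrary values:

    int[] offsets = toIntArray(Arrays.asList(0, 2, 7));  // -> {0, 2, 7}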