mirror of https://github.com/apache/lucene.git
SOLR-1362: fix WDF position increment bugs
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@808005 13f79535-47bb-0310-9956-ffa450edef68
parent 1ece23820f
commit b4dd3f78f4
@@ -509,6 +509,11 @@ Bug Fixes
     already been closed/destroyed; if it hasn't, a SEVERE error is
     logged first. (noble, hossman)
 
+60. SOLR-1362: WordDelimiterFilter had inconsistent behavior when setting
+    the position increment of tokens following a token consisting of all
+    delimiters, and could additionally lose big position increments.
+    (Robert Muir, yonik)
+
 Other Changes
 ----------------------
 1. Upgraded to Lucene 2.4.0 (yonik)

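The heart of the fix is an accumulate-and-carry pattern: when a filter swallows a token outright, that token's position increment must be folded into the next token it emits, so a large gap (say 10) survives instead of collapsing to 1. Here is a minimal standalone sketch of that pattern, using the same 2.9-era attribute API as the test below; the class name and isAllDelimiters helper are illustrative, not code from this commit, and the sketch omits WordDelimiterFilter's extra rule of removing a "normal" gap of 1:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    // Illustrative only: drops all-delimiter tokens but carries their
    // position increments forward -- the behavior this commit fixes
    // inside WordDelimiterFilter itself.
    final class DropAllDelimitersFilter extends TokenFilter {
      private final TermAttribute termAtt;
      private final PositionIncrementAttribute posIncAtt;

      DropAllDelimitersFilter(TokenStream input) {
        super(input);
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
        posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
      }

      @Override
      public boolean incrementToken() throws IOException {
        int accumulated = 0;                      // increments of swallowed tokens
        while (input.incrementToken()) {
          if (isAllDelimiters(termAtt.term())) {  // token yields nothing; drop it
            accumulated += posIncAtt.getPositionIncrement();
            continue;                             // ...but remember its gap
          }
          // fold the carried gap into the token actually emitted
          posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + accumulated);
          return true;
        }
        return false;
      }

      private static boolean isAllDelimiters(String term) {
        for (int i = 0; i < term.length(); i++) {
          if (Character.isLetterOrDigit(term.charAt(i))) return false;
        }
        return true;
      }
    }

The hunks below implement the same idea inside WordDelimiterFilter via its origPosIncrement accumulator.
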
@@ -384,13 +384,15 @@ final class WordDelimiterFilter extends TokenFilter {
       int start=0;
       if (len ==0) continue;
 
+      int posInc = t.getPositionIncrement();
+      origPosIncrement += posInc;
+
       //skip protected tokens
       if (protWords != null && protWords.contains(termBuffer, 0, len)) {
+        t.setPositionIncrement(origPosIncrement);
        return t;
       }
 
-      origPosIncrement += t.getPositionIncrement();
-
       // Avoid calling charType more than once for each char (basically
       // avoid any backtracking).
       // makes code slightly more difficult, but faster.

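As an aside on the comment above, a sketch of the one-pass classification it refers to (simplified stand-in types, not this commit's code): each character is classified exactly once, left to right, and subword boundaries fall out of type changes, so the filter never re-classifies (backtracks over) earlier characters.

    // Illustrative only; WordDelimiterFilter's real charType() distinguishes
    // more classes (e.g. subword delimiters) and uses bit flags.
    class CharTypeScan {
      static final int DELIM = 0, LOWER = 1, UPPER = 2, DIGIT = 4;

      static int charType(char ch) {
        if (Character.isLowerCase(ch)) return LOWER;
        if (Character.isUpperCase(ch)) return UPPER;
        if (Character.isDigit(ch))     return DIGIT;
        return DELIM;
      }

      // One forward pass, one charType() call per character.
      static void scan(char[] buf, int len) {
        if (len == 0) return;
        int lastType = charType(buf[0]);
        for (int i = 1; i < len; i++) {
          int type = charType(buf[i]);
          if (type != lastType) {
            System.out.println("boundary at " + i);  // e.g. "solR" -> boundary at 3
          }
          lastType = type;
        }
      }
    }
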
@@ -482,6 +484,7 @@ final class WordDelimiterFilter extends TokenFilter {
         if (start==0) {
           // the subword is the whole original token, so
           // return it unchanged.
+          t.setPositionIncrement(origPosIncrement);
           return t;
         }
 
@@ -492,6 +495,7 @@ final class WordDelimiterFilter extends TokenFilter {
           // of the original token
           t.setTermBuffer(termBuffer, start, len-start);
           t.setStartOffset(t.startOffset() + start);
+          t.setPositionIncrement(origPosIncrement);
           return t;
         }
 
@@ -524,6 +528,9 @@ final class WordDelimiterFilter extends TokenFilter {
           if (preserveOriginal != 0) {
             return t;
           }
+
+          // if this token had a "normal" gap of 1, remove it.
+          if (posInc==1) origPosIncrement-=1;
           continue;
         }
 
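A quick trace of this hunk combined with the first one, plugging in the numbers from the tests below (illustrative arithmetic, not code from this commit):

    // "/" produces no subwords, so its increment is accumulated, not emitted.
    int posInc = 10;                         // "/" after LargePosIncTokenFilter
    int origPosIncrement = posInc;           // accumulated by the first hunk
    if (posInc == 1) origPosIncrement -= 1;  // skipped: 10 is not a normal gap
    int emitted = origPosIncrement + 1;      // plus "SOLR"'s own increment of 1
    assert emitted == 11;                    // matches new int[] { 1, 11 } below

    // With plain whitespace analysis "/" arrives with posInc == 1, the normal
    // gap is removed, and "SOLR" keeps its increment of 1.
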
@@ -18,12 +18,21 @@
 package org.apache.solr.analysis;
 
 import org.apache.solr.util.AbstractSolrTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
+import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.Arrays;
+import java.util.HashSet;
 
 /**
  * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest

@@ -388,5 +397,113 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
     doSplitPossessive(1, "ra's", "ra");
     doSplitPossessive(0, "ra's", "ra", "s");
   }
 
+  /*
+   * Set a large position increment gap of 10 if the token is "largegap" or "/"
+   */
+  private final class LargePosIncTokenFilter extends TokenFilter {
+    private TermAttribute termAtt;
+    private PositionIncrementAttribute posIncAtt;
+
+    protected LargePosIncTokenFilter(TokenStream input) {
+      super(input);
+      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+      posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        if (termAtt.term().equals("largegap") || termAtt.term().equals("/"))
+          posIncAtt.setPositionIncrement(10);
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+
+  public void testPositionIncrements() throws Exception {
+    final CharArraySet protWords = new CharArraySet(new HashSet<String>(Arrays.asList("NUTCH")), false);
+
+    /* analyzer that uses whitespace + wdf */
+    Analyzer a = new Analyzer() {
+      public TokenStream tokenStream(String field, Reader reader) {
+        return new WordDelimiterFilter(
+            new WhitespaceTokenizer(reader),
+            1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
+      }
+    };
+
+    /* in this case, works as expected. */
+    assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
+        new int[] { 0, 9 },
+        new int[] { 6, 13 },
+        new int[] { 1, 1 });
+
+    /* only in this case, posInc of 2 ?! */
+    assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
+        new int[] { 0, 9, 12, 9 },
+        new int[] { 6, 12, 13, 13 },
+        new int[] { 1, 1, 1, 0 });
+
+    assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
+        new int[] { 0, 9, 15 },
+        new int[] { 6, 14, 19 },
+        new int[] { 1, 1, 1 });
+
+    /* analyzer that will consume tokens with large position increments */
+    Analyzer a2 = new Analyzer() {
+      public TokenStream tokenStream(String field, Reader reader) {
+        return new WordDelimiterFilter(
+            new LargePosIncTokenFilter(
+                new WhitespaceTokenizer(reader)),
+            1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
+      }
+    };
+
+    /* increment of "largegap" is preserved */
+    assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
+        new int[] { 0, 7, 16 },
+        new int[] { 6, 15, 20 },
+        new int[] { 1, 10, 1 });
+
+    /* the "/" had a position increment of 10, where did it go?!?!! */
+    assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
+        new int[] { 0, 9 },
+        new int[] { 6, 13 },
+        new int[] { 1, 11 });
+
+    /* in this case, the increment of 10 from the "/" is carried over */
+    assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
+        new int[] { 0, 9, 12, 9 },
+        new int[] { 6, 12, 13, 13 },
+        new int[] { 1, 11, 1, 0 });
+
+    assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
+        new int[] { 0, 9, 15 },
+        new int[] { 6, 14, 19 },
+        new int[] { 1, 11, 1 });
+  }
+
+  private void assertAnalyzesTo(Analyzer a, String input, String[] output,
+      int startOffsets[], int endOffsets[], int posIncs[]) throws Exception {
+
+    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+    TermAttribute termAtt = (TermAttribute) ts
+        .getAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) ts
+        .getAttribute(OffsetAttribute.class);
+    PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts
+        .getAttribute(PositionIncrementAttribute.class);
+    for (int i = 0; i < output.length; i++) {
+      assertTrue(ts.incrementToken());
+      assertEquals(output[i], termAtt.term());
+      assertEquals(startOffsets[i], offsetAtt.startOffset());
+      assertEquals(endOffsets[i], offsetAtt.endOffset());
+      assertEquals(posIncs[i], posIncAtt.getPositionIncrement());
+    }
+    assertFalse(ts.incrementToken());
+    ts.close();
+  }
 }