LUCENE-1903: Fix incorrect ShingleFilter behavior when outputUnigrams == false

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@812779 13f79535-47bb-0310-9956-ffa450edef68
2009-09-09 06:02:54 +00:00 · 2009-09-09 06:02:54 +00:00 · a8eb5c4b80
parent 3507752e42
commit a8eb5c4b80
3 changed files with 120 additions and 14 deletions
--- a/contrib/CHANGES.txt
+++ b/contrib/CHANGES.txt
@ -28,8 +28,10 @@ API Changes
 * LUCENE-1460: Change contrib TokenStreams/Filters to use the new
    TokenStream API. (Robert Muir, Michael Busch)

- * LUCENE-1775: Change remaining TokenFilters (shingle, prefix-suffix) to
-    use the new TokenStream API. (Robert Muir, Michael Busch)
+ * LUCENE-1775, LUCENE-1903: Change remaining TokenFilters (shingle, prefix-suffix)
+    to use the new TokenStream API. ShingleFilter is much more efficient now,
+    it clones much less often and computes the tokens mostly on the fly now.
+    Also added more tests. (Robert Muir, Michael Busch)
    
 * LUCENE-1685: The position aware SpanScorer has become the default scorer
    for Highlighting. The SpanScorer implementation has replaced QueryScorer
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
@ -88,7 +88,6 @@ public class ShingleFilter extends TokenFilter {
    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
    this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
    this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
-    
  }

  /**
@ -174,11 +173,15 @@ public class ShingleFilter extends TokenFilter {
      
      nextToken = (AttributeSource.State) shingleBuf.getFirst();
      
-      if (shingleBufferPosition == 0 && (! shingleBuf.isEmpty()) && outputUnigrams) {
-        restoreState(nextToken);
-        posIncrAtt.setPositionIncrement(1);
+      if (outputUnigrams) {
+        if (shingleBufferPosition == 0) {
+          restoreState(nextToken);
+          posIncrAtt.setPositionIncrement(1);
+          shingleBufferPosition++;
+          return true;
+        }
+      } else {
        shingleBufferPosition++;
-        return true;
      }
  
      if (shingleBufferPosition < shingleBuf.size()) {
@ -277,7 +280,7 @@ public class ShingleFilter extends TokenFilter {
        shingleBuf.add(captureState());
        if (shingleBuf.size() > maxShingleSize)
        {
-          shingleBuf.remove(0);
+          shingleBuf.removeFirst();
        }
        addedToken = true;
      } else {
@ -294,7 +297,7 @@ public class ShingleFilter extends TokenFilter {
     * the end of the input stream and have to discard the least recent token.
     */
    if (! addedToken) {
-      shingleBuf.remove(0);
+      shingleBuf.removeFirst();
    }
    
    if (shingleBuf.isEmpty()) {
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
@ -115,6 +115,60 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
  };

+  public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
+    createToken("please divide", 0, 13),
+    createToken("divide this", 7, 18),
+    createToken("this sentence", 14, 27),
+    createToken("sentence into", 19, 32),
+    createToken("into shingles", 28, 39),
+  };
+
+  public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
+    1, 1, 1, 1, 1
+  };
+
+  public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
+    "shingle", "shingle", "shingle", "shingle", "shingle"
+  };
+
+  public static final Token[] BI_GRAM_TOKENS_WITH_HOLES_WITHOUT_UNIGRAMS = new Token[] {
+    createToken("please divide", 0, 13),
+    createToken("divide _", 7, 19),
+    createToken("_ sentence", 19, 27),
+    createToken("sentence _", 19, 33),
+    createToken("_ shingles", 33, 39),
+  };
+
+  public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES_WITHOUT_UNIGRAMS = new int[] {
+    1, 1, 1, 1, 1, 1
+  };
+
+
+  public static final Token[] TEST_SINGLE_TOKEN = new Token[] {
+    createToken("please", 0, 6)
+  };
+
+  public static final Token[] SINGLE_TOKEN = new Token[] {
+    createToken("please", 0, 6)
+  };
+
+  public static final int[] SINGLE_TOKEN_INCREMENTS = new int[] {
+    1
+  };
+
+  public static final String[] SINGLE_TOKEN_TYPES = new String[] {
+    "word"
+  };
+
+  public static final Token[] EMPTY_TOKEN_ARRAY = new Token[] {
+  };
+
+  public static final int[] EMPTY_TOKEN_INCREMENTS_ARRAY = new int[] {
+  };
+
+  public static final String[] EMPTY_TOKEN_TYPES_ARRAY = new String[] {
+  };
+
  public static final Token[] TRI_GRAM_TOKENS = new Token[] {
    createToken("please", 0, 6),
    createToken("please divide", 0, 13),
@ -165,18 +219,59 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
   */
  public void testBiGramFilter() throws IOException {
    this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS,
-                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES);
+                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
+                           true);
  }

  public void testBiGramFilterWithHoles() throws IOException {
    this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES,
-                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES);
+                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
+                           true);
+  }
+
+  public void testBiGramFilterWithoutUnigrams() throws IOException {
+    this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
+                           BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
+                           false);
+  }
+
+  public void testBiGramFilterWithHolesWithoutUnigrams() throws IOException {
+    this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES_WITHOUT_UNIGRAMS,
+                           BI_GRAM_POSITION_INCREMENTS_WITH_HOLES_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
+                           false);
+  }
+
+  public void testBiGramFilterWithSingleToken() throws IOException {
+    this.shingleFilterTest(2, TEST_SINGLE_TOKEN, SINGLE_TOKEN,
+                           SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES,
+                           true);
+  }
+
+  public void testBiGramFilterWithSingleTokenWithoutUnigrams() throws IOException {
+    this.shingleFilterTest(2, TEST_SINGLE_TOKEN, EMPTY_TOKEN_ARRAY,
+                           EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
+                           false);
+  }
+
+  public void testBiGramFilterWithEmptyTokenStream() throws IOException {
+    this.shingleFilterTest(2, EMPTY_TOKEN_ARRAY, EMPTY_TOKEN_ARRAY,
+                           EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
+                           true);
+  }
+
+  public void testBiGramFilterWithEmptyTokenStreamWithoutUnigrams() throws IOException {
+    this.shingleFilterTest(2, EMPTY_TOKEN_ARRAY, EMPTY_TOKEN_ARRAY,
+                           EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
+                           false);
  }

  public void testTriGramFilter() throws IOException {
    this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS,
-                           TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES);
+                           TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES,
+                           true);
  }
+
+
  
  public void testReset() throws Exception {
    Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
@ -197,10 +292,13 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
  }
  
  protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
-                                   int[] positionIncrements, String[] types)
+                                   int[] positionIncrements, String[] types,
+                                   boolean outputUnigrams)
    throws IOException {

-    TokenStream filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+    ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+    filter.setOutputUnigrams(outputUnigrams);
+
    TermAttribute termAtt = (TermAttribute) filter.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) filter.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) filter.addAttribute(PositionIncrementAttribute.class);
@ -208,6 +306,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {

    int i = 0;
    while (filter.incrementToken()) {
+      assertTrue("ShingleFilter outputted more tokens than expected", i < tokensToCompare.length);
      String termText = termAtt.term();
      String goldText = tokensToCompare[i].term();
      assertEquals("Wrong termText", goldText, termText);
@ -220,6 +319,8 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
      assertEquals("Wrong type for token \"" + termText + "\"", types[i], typeAtt.type());
      i++;
    }
+    assertEquals("ShingleFilter outputted wrong # of tokens. (# output = " + i + "; # expected =" + tokensToCompare.length + ")",
+                 tokensToCompare.length, i);
  }

  private static Token createToken(String term, int start, int offset)