LUCENE-1903: Fix incorrect ShingleFilter behavior when outputUnigrams == false

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@812779 13f79535-47bb-0310-9956-ffa450edef68
2009-09-09 06:02:54 +00:00 · 2009-09-09 06:02:54 +00:00 · a8eb5c4b80
parent 3507752e42
commit a8eb5c4b80
3 changed files with 120 additions and 14 deletions
--- a/contrib/CHANGES.txt
+++ b/contrib/CHANGES.txt
@ -28,8 +28,10 @@ API Changes
 * LUCENE-1460: Change contrib TokenStreams/Filters to use the new
    TokenStream API. (Robert Muir, Michael Busch)
- * LUCENE-1775: Change remaining TokenFilters (shingle, prefix-suffix) to
+ * LUCENE-1775, LUCENE-1903: Change remaining TokenFilters (shingle, prefix-suffix)
-    use the new TokenStream API. (Robert Muir, Michael Busch)
+    to use the new TokenStream API. ShingleFilter is much more efficient now,
    it clones much less often and computes the tokens mostly on the fly now.
    Also added more tests. (Robert Muir, Michael Busch)
 * LUCENE-1685: The position aware SpanScorer has become the default scorer
    for Highlighting. The SpanScorer implementation has replaced QueryScorer
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
@ -88,7 +88,6 @@ public class ShingleFilter extends TokenFilter {
    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
    this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
    this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
  }
  /**
@ -174,11 +173,15 @@ public class ShingleFilter extends TokenFilter {
      nextToken = (AttributeSource.State) shingleBuf.getFirst();
-      if (shingleBufferPosition == 0 && (! shingleBuf.isEmpty()) && outputUnigrams) {
+      if (outputUnigrams) {
-        restoreState(nextToken);
+        if (shingleBufferPosition == 0) {
-        posIncrAtt.setPositionIncrement(1);
+          restoreState(nextToken);
          posIncrAtt.setPositionIncrement(1);
          shingleBufferPosition++;
          return true;
        }
      } else {
        shingleBufferPosition++;
        return true;
      }
      if (shingleBufferPosition < shingleBuf.size()) {
@ -277,7 +280,7 @@ public class ShingleFilter extends TokenFilter {
        shingleBuf.add(captureState());
        if (shingleBuf.size() > maxShingleSize)
        {
-          shingleBuf.remove(0);
+          shingleBuf.removeFirst();
        }
        addedToken = true;
      } else {
@ -294,7 +297,7 @@ public class ShingleFilter extends TokenFilter {
     * the end of the input stream and have to discard the least recent token.
     */
    if (! addedToken) {
-      shingleBuf.remove(0);
+      shingleBuf.removeFirst();
    }
    if (shingleBuf.isEmpty()) {
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
@ -115,6 +115,60 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
  };
  public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
    createToken("please divide", 0, 13),
    createToken("divide this", 7, 18),
    createToken("this sentence", 14, 27),
    createToken("sentence into", 19, 32),
    createToken("into shingles", 28, 39),
  };
  public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
    1, 1, 1, 1, 1
  };
  public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
    "shingle", "shingle", "shingle", "shingle", "shingle"
  };
  public static final Token[] BI_GRAM_TOKENS_WITH_HOLES_WITHOUT_UNIGRAMS = new Token[] {
    createToken("please divide", 0, 13),
    createToken("divide _", 7, 19),
    createToken("_ sentence", 19, 27),
    createToken("sentence _", 19, 33),
    createToken("_ shingles", 33, 39),
  };
  public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES_WITHOUT_UNIGRAMS = new int[] {
    1, 1, 1, 1, 1, 1
  };
  public static final Token[] TEST_SINGLE_TOKEN = new Token[] {
    createToken("please", 0, 6)
  };
  public static final Token[] SINGLE_TOKEN = new Token[] {
    createToken("please", 0, 6)
  };
  public static final int[] SINGLE_TOKEN_INCREMENTS = new int[] {
    1
  };
  public static final String[] SINGLE_TOKEN_TYPES = new String[] {
    "word"
  };
  public static final Token[] EMPTY_TOKEN_ARRAY = new Token[] {
  };
  public static final int[] EMPTY_TOKEN_INCREMENTS_ARRAY = new int[] {
  };
  public static final String[] EMPTY_TOKEN_TYPES_ARRAY = new String[] {
  };
  public static final Token[] TRI_GRAM_TOKENS = new Token[] {
    createToken("please", 0, 6),
    createToken("please divide", 0, 13),
@ -165,19 +219,60 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
   */
  public void testBiGramFilter() throws IOException {
    this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS,
-                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES);
+                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
                           true);
  }
  public void testBiGramFilterWithHoles() throws IOException {
    this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES,
-                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES);
+                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
                           true);
  }
  public void testBiGramFilterWithoutUnigrams() throws IOException {
    this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
                           BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
                           false);
  }
  public void testBiGramFilterWithHolesWithoutUnigrams() throws IOException {
    this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES_WITHOUT_UNIGRAMS,
                           BI_GRAM_POSITION_INCREMENTS_WITH_HOLES_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
                           false);
  }
  public void testBiGramFilterWithSingleToken() throws IOException {
    this.shingleFilterTest(2, TEST_SINGLE_TOKEN, SINGLE_TOKEN,
                           SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES,
                           true);
  }
  public void testBiGramFilterWithSingleTokenWithoutUnigrams() throws IOException {
    this.shingleFilterTest(2, TEST_SINGLE_TOKEN, EMPTY_TOKEN_ARRAY,
                           EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
                           false);
  }
  public void testBiGramFilterWithEmptyTokenStream() throws IOException {
    this.shingleFilterTest(2, EMPTY_TOKEN_ARRAY, EMPTY_TOKEN_ARRAY,
                           EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
                           true);
  }
  public void testBiGramFilterWithEmptyTokenStreamWithoutUnigrams() throws IOException {
    this.shingleFilterTest(2, EMPTY_TOKEN_ARRAY, EMPTY_TOKEN_ARRAY,
                           EMPTY_TOKEN_INCREMENTS_ARRAY, EMPTY_TOKEN_TYPES_ARRAY,
                           false);
  }
  public void testTriGramFilter() throws IOException {
    this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS,
-                           TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES);
+                           TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES,
                           true);
  }
  public void testReset() throws Exception {
    Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
    TokenStream filter = new ShingleFilter(wsTokenizer, 2);
@ -197,10 +292,13 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
  }
  protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
-                                   int[] positionIncrements, String[] types)
+                                   int[] positionIncrements, String[] types,
                                   boolean outputUnigrams)
    throws IOException {
-    TokenStream filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+    ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
    filter.setOutputUnigrams(outputUnigrams);
    TermAttribute termAtt = (TermAttribute) filter.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) filter.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) filter.addAttribute(PositionIncrementAttribute.class);
@ -208,6 +306,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
    int i = 0;
    while (filter.incrementToken()) {
      assertTrue("ShingleFilter outputted more tokens than expected", i < tokensToCompare.length);
      String termText = termAtt.term();
      String goldText = tokensToCompare[i].term();
      assertEquals("Wrong termText", goldText, termText);
@ -220,6 +319,8 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
      assertEquals("Wrong type for token \"" + termText + "\"", types[i], typeAtt.type());
      i++;
    }
    assertEquals("ShingleFilter outputted wrong # of tokens. (# output = " + i + "; # expected =" + tokensToCompare.length + ")",
                 tokensToCompare.length, i);
  }
  private static Token createToken(String term, int start, int offset)