LUCENE-2199: ShingleFilter skipped over tri-gram shingles if outputUnigram was set to false

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@897672 13f79535-47bb-0310-9956-ffa450edef68
2025-02-27 21:09:19 +00:00 · 2010-01-10 18:06:19 +00:00 · 2010-01-10 18:06:19 +00:00 · 673e368bf7
commit 673e368bf7
parent cfb822be6d
3 changed files with 111 additions and 4 deletions
--- a/contrib/CHANGES.txt
+++ b/contrib/CHANGES.txt
@ -18,6 +18,9 @@ Changes in runtime behavior
 Bug fixes
 * LUCENE-2199: ShingleFilter skipped over tri-gram shingles if outputUnigram
   was set to false. (Simon Willnauer)
 * LUCENE-2068: Fixed ReverseStringFilter which was not aware of supplementary
   characters. During reverse the filter created unpaired surrogates, which
   will be replaced by U+FFFD by the indexer, but not at query time. The filter
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
@ -182,7 +182,7 @@ public final class ShingleFilter extends TokenFilter {
          shingleBufferPosition++;
          return true;
        }
-      } else {
+      } else if (shingleBufferPosition % this.maxShingleSize == 0){
        shingleBufferPosition++;
      }
@ -197,7 +197,7 @@ public final class ShingleFilter extends TokenFilter {
          termBuffer = termAtt.resizeTermBuffer(termLength);
        buf.getChars(0, termLength, termBuffer, 0);
        termAtt.setTermLength(termLength);
-        if ((! outputUnigrams) && shingleBufferPosition == 1) {
+        if ((! outputUnigrams) && shingleBufferPosition % this.maxShingleSize == 1) {
          posIncrAtt.setPositionIncrement(1);
        } else {
          posIncrAtt.setPositionIncrement(0);
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
@ -200,6 +200,93 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
    "word", "shingle",
    "word"
  };
  public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
    createToken("please divide", 0, 13),
    createToken("please divide this", 0, 18),
    createToken("divide this", 7, 18),
    createToken("divide this sentence", 7, 27),
    createToken("this sentence", 14, 27),
    createToken("this sentence into", 14, 32),
    createToken("sentence into", 19, 32),
    createToken("sentence into shingles", 19, 39),
    createToken("into shingles", 28, 39),
  };
  public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
    1, 0, 1, 0, 1, 0, 1, 0, 1
  };
  public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
    "shingle", "shingle",
    "shingle", "shingle",
    "shingle", "shingle",
    "shingle", "shingle",
    "shingle",
  };
  public static final Token[] FOUR_GRAM_TOKENS = new Token[] {
    createToken("please", 0, 6),
    createToken("please divide", 0, 13),
    createToken("please divide this", 0, 18),
    createToken("please divide this sentence", 0, 27),
    createToken("divide", 7, 13),
    createToken("divide this", 7, 18),
    createToken("divide this sentence", 7, 27),
    createToken("divide this sentence into", 7, 32),
    createToken("this", 14, 18),
    createToken("this sentence", 14, 27),
    createToken("this sentence into", 14, 32),
    createToken("this sentence into shingles", 14, 39),
    createToken("sentence", 19, 27),
    createToken("sentence into", 19, 32),
    createToken("sentence into shingles", 19, 39),
    createToken("into", 28, 32),
    createToken("into shingles", 28, 39),
    createToken("shingles", 33, 39)
  };
  public static final int[] FOUR_GRAM_POSITION_INCREMENTS = new int[] {
    1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1
  };
  public static final String[] FOUR_GRAM_TYPES = new String[] {
    "word", "shingle", "shingle", "shingle",
    "word", "shingle", "shingle", "shingle",
    "word", "shingle", "shingle", "shingle",
    "word", "shingle", "shingle",
    "word", "shingle",
    "word"
  };
  public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
    createToken("please divide", 0, 13),
    createToken("please divide this", 0, 18),
    createToken("please divide this sentence", 0, 27),
    createToken("divide this", 7, 18),
    createToken("divide this sentence", 7, 27),
    createToken("divide this sentence into", 7, 32),
    createToken("this sentence", 14, 27),
    createToken("this sentence into", 14, 32),
    createToken("this sentence into shingles", 14, 39),
    createToken("sentence into", 19, 32),
    createToken("sentence into shingles", 19, 39),
    createToken("into shingles", 28, 39),
  };
  public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
    1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
  };
  public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
    "shingle", "shingle",
    "shingle", "shingle",
    "shingle", "shingle",
    "shingle", "shingle",
    "shingle", "shingle",
    "shingle", "shingle",
  };
  @Override
@ -272,8 +359,25 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
                           TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES,
                           true);
  }
-
+  
-
+  public void testTriGramFilterWithoutUnigrams() throws IOException {
    this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
                           TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, TRI_GRAM_TYPES_WITHOUT_UNIGRAMS,
                           false);
  }
  public void testFourGramFilter() throws IOException {
    this.shingleFilterTest(4, TEST_TOKEN, FOUR_GRAM_TOKENS,
        FOUR_GRAM_POSITION_INCREMENTS, FOUR_GRAM_TYPES,
                           true);
  }
  public void testFourGramFilterWithoutUnigrams() throws IOException {
    this.shingleFilterTest(4, TEST_TOKEN, FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS,
        FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS,
        FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS, false);
  }
  public void testReset() throws Exception {
    Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));