LUCENE-2400: ShingleFilter: don't output all-filler shingles/unigrams; also, convert from TermAttribute to CharTermAttribute

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940451 13f79535-47bb-0310-9956-ffa450edef68
2010-05-03 13:28:14 +00:00 · 2010-05-03 13:28:14 +00:00 · 7cd9a8da03
parent 5abbf3429c
commit 7cd9a8da03
3 changed files with 350 additions and 113 deletions
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@ -145,6 +145,10 @@ New features
   multiple languages.  The default configuration includes special support for
   Thai, Lao, Myanmar, and Khmer.  (Robert Muir, Uwe Schindler)

+ * LUCENE-2400: ShingleFilter was changed to don't output all-filler shingles and 
+   unigrams, and uses a more performant algorithm to build grams using a linked list
+   of AttributeSource.cloneAttributes() instances and the new copyTo() method.
+
 Build

 * LUCENE-2124: Moved the JDK-based collation support from contrib/collation 
--- a/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
+++ b/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
@ -18,14 +18,16 @@ package org.apache.lucene.analysis.shingle;
 */

 import java.io.IOException;
+import java.util.Iterator;
 import java.util.LinkedList;

 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;


 /**
@ -66,12 +68,12 @@ public final class ShingleFilter extends TokenFilter {
   */
  public static final String TOKEN_SEPARATOR = " ";

-
  /**
   * The sequence of input stream tokens (or filler tokens, if necessary)
   * that will be composed to form output shingles.
   */
-  private LinkedList<State> inputWindow = new LinkedList<State>();
+  private LinkedList<InputWindowToken> inputWindow
+    = new LinkedList<InputWindowToken>();
  
  /**
   * The number of input tokens in the next output token.  This is the "n" in
@ -80,9 +82,9 @@ public final class ShingleFilter extends TokenFilter {
  private CircularSequence gramSize;

  /**
-   * Shingle text is composed here.
+   * Shingle and unigram text is composed here.
   */
-  private StringBuilder shingleBuilder = new StringBuilder();
+  private StringBuilder gramBuilder = new StringBuilder();

  /**
   * The token type attribute value to use - default is "shingle"
@ -111,18 +113,31 @@ public final class ShingleFilter extends TokenFilter {
  private int minShingleSize;

  /**
-   * The remaining number of filler tokens inserted into the input stream
+   * The remaining number of filler tokens to be inserted into the input stream
   * from which shingles are composed, to handle position increments greater
   * than one.
   */
  private int numFillerTokensToInsert;

  /**
-   * The next input stream token.
+   * When the next input stream token has a position increment greater than
+   * one, it is stored in this field until sufficient filler tokens have been
+   * inserted to account for the position increment. 
   */
-  private State nextInputStreamToken;
+  private AttributeSource nextInputStreamToken;
+
+  /**
+   * Whether or not there is a next input stream token.
+   */
+  private boolean isNextInputStreamToken = false;
+
+  /**
+   * Whether at least one unigram or shingle has been output at the current 
+   * position.
+   */
+  private boolean isOutputHere = false;
  
-  private final TermAttribute termAtt;
+  private final CharTermAttribute termAtt;
  private final OffsetAttribute offsetAtt;
  private final PositionIncrementAttribute posIncrAtt;
  private final TypeAttribute typeAtt;
@ -140,7 +155,7 @@ public final class ShingleFilter extends TokenFilter {
    super(input);
    setMaxShingleSize(maxShingleSize);
    setMinShingleSize(minShingleSize);
-    this.termAtt = addAttribute(TermAttribute.class);
+    this.termAtt = addAttribute(CharTermAttribute.class);
    this.offsetAtt = addAttribute(OffsetAttribute.class);
    this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    this.typeAtt = addAttribute(TypeAttribute.class);
@ -241,23 +256,49 @@ public final class ShingleFilter extends TokenFilter {
    this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
  }

-  /* (non-Javadoc)
-   * @see org.apache.lucene.analysis.TokenStream#next()
-   */
  @Override
  public final boolean incrementToken() throws IOException {
-    boolean tokenAvailable = false; 
+    boolean tokenAvailable = false;
+    int builtGramSize = 0;
    if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
      shiftInputWindow();
+      gramBuilder.setLength(0);
+    } else {
+      builtGramSize = gramSize.getPreviousValue();
    }
-    if ( ! inputWindow.isEmpty()) {
-      restoreState(inputWindow.getFirst());
-      if (1 == gramSize.getValue()) {
-        posIncrAtt.setPositionIncrement(1);
-        gramSize.advance();
-        tokenAvailable = true;
-      } else if (inputWindow.size() >= gramSize.getValue()) {
-        getNextShingle();
+    if (inputWindow.size() >= gramSize.getValue()) {
+      boolean isAllFiller = true;
+      InputWindowToken nextToken = null;
+      Iterator<InputWindowToken> iter = inputWindow.iterator();
+      for (int gramNum = 1 ;
+           iter.hasNext() && builtGramSize < gramSize.getValue() ;
+           ++gramNum) {
+        nextToken = iter.next();
+        if (builtGramSize < gramNum) {
+          if (builtGramSize > 0) {
+            gramBuilder.append(tokenSeparator);
+          }
+          gramBuilder.append(nextToken.termAtt.buffer(), 0, 
+                             nextToken.termAtt.length());
+          ++builtGramSize;
+        }
+        if (isAllFiller && nextToken.isFiller) {
+          if (gramNum == gramSize.getValue()) {
+            gramSize.advance();
+          }
+        } else { 
+          isAllFiller = false;
+        }
+      }
+      if ( ! isAllFiller && builtGramSize == gramSize.getValue()) {
+        inputWindow.getFirst().attSource.copyTo(this);
+        posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1);
+        termAtt.setEmpty().append(gramBuilder);
+        if (gramSize.getValue() > 1) {
+          typeAtt.setType(tokenType);
+        }
+        offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
+        isOutputHere = true;
        gramSize.advance();
        tokenAvailable = true;
      }
@ -265,83 +306,69 @@ public final class ShingleFilter extends TokenFilter {
    return tokenAvailable;
  }

-  /**
-   * <p>Makes the next token a shingle of length {@link #gramSize}, 
-   * composed of tokens taken from {@link #inputWindow}.
-   * <p>Callers of this method must first insure that there are at least 
-   * <code>gramSize</code> tokens available in <code>inputWindow</code>.
-   */
-  private void getNextShingle() {
-    int startOffset = offsetAtt.startOffset();
-
-    int minTokNum = gramSize.getValue() - 1; // zero-based inputWindow position
-    if (gramSize.getValue() == minShingleSize) {
-      // Clear the shingle text buffer if this is the first shingle
-      // at the current position in the input stream.
-      shingleBuilder.setLength(0);
-      minTokNum = 0;
-    }
-    for (int tokNum = minTokNum ; tokNum < gramSize.getValue() ; ++tokNum) {
-      if (tokNum > 0) {
-        shingleBuilder.append(tokenSeparator);
-      }
-      restoreState(inputWindow.get(tokNum));
-      shingleBuilder.append(termAtt.termBuffer(), 0, termAtt.termLength());
-    }
-    char[] termBuffer = termAtt.termBuffer();
-    int termLength = shingleBuilder.length();
-    if (termBuffer.length < termLength) {
-      termBuffer = termAtt.resizeTermBuffer(termLength);
-    }
-    shingleBuilder.getChars(0, termLength, termBuffer, 0);
-    termAtt.setTermLength(termLength);
-    posIncrAtt.setPositionIncrement(gramSize.atMinValue() ? 1 : 0);
-    typeAtt.setType(tokenType);
-    offsetAtt.setOffset(startOffset, offsetAtt.endOffset());
-  }
-  
  /**
   * <p>Get the next token from the input stream.
   * <p>If the next token has <code>positionIncrement > 1</code>,
   * <code>positionIncrement - 1</code> {@link #FILLER_TOKEN}s are
   * inserted first.
-   * @return false for end of stream; true otherwise
+   * @param target Where to put the new token; if null, a new instance is created.
+   * @return On success, the populated token; null otherwise
   * @throws IOException if the input stream has a problem
   */
-  private boolean getNextToken() throws IOException {
-    boolean success = false;
+  private InputWindowToken getNextToken(InputWindowToken target) 
+    throws IOException {
+    InputWindowToken newTarget = target;
    if (numFillerTokensToInsert > 0) {
-      insertFillerToken();
-      success = true;
-    } else if (null != nextInputStreamToken) {
-      restoreState(nextInputStreamToken);
-      nextInputStreamToken = null;
-      success = true;
-    } else if (input.incrementToken()) {
-      if (posIncrAtt.getPositionIncrement() > 1) {
-        numFillerTokensToInsert = posIncrAtt.getPositionIncrement() - 1;
-        insertFillerToken();
+      if (null == target) {
+        newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
+      } else {
+        nextInputStreamToken.copyTo(target.attSource);
+      }
+      // A filler token occupies no space
+      newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(), 
+                                    newTarget.offsetAtt.startOffset());
+      newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+      newTarget.isFiller = true;
+      --numFillerTokensToInsert;
+    } else if (isNextInputStreamToken) {
+      if (null == target) {
+        newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
+      } else {
+        nextInputStreamToken.copyTo(target.attSource);
+      }
+      isNextInputStreamToken = false;
+      newTarget.isFiller = false;
+    } else if (input.incrementToken()) {
+      if (null == target) {
+        newTarget = new InputWindowToken(cloneAttributes());
+      } else {
+        this.copyTo(target.attSource);
+      }
+      if (posIncrAtt.getPositionIncrement() > 1) {
+        // Each output shingle must contain at least one input token, 
+        // so no more than (maxShingleSize - 1) filler tokens will be inserted.
+        numFillerTokensToInsert 
+          = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
+        // Save the current token as the next input stream token
+        if (null == nextInputStreamToken) {
+          nextInputStreamToken = cloneAttributes();
+        } else {
+          this.copyTo(nextInputStreamToken);
+        }
+        isNextInputStreamToken = true;
+        // A filler token occupies no space
+        newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
+        newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+        newTarget.isFiller = true;
+        --numFillerTokensToInsert;
+      } else {
+        newTarget.isFiller = false;
      }
-      success = true;
-    }
-    return success;
-	}
-
-  /**
-   * Inserts a {@link #FILLER_TOKEN} and decrements
-   * {@link #numFillerTokensToInsert}.
-   */
-  private void insertFillerToken() {
-    if (null == nextInputStreamToken) {
-      nextInputStreamToken = captureState();
    } else {
-      restoreState(nextInputStreamToken);
+      newTarget = null;
    }
-    --numFillerTokensToInsert;
-    // A filler token occupies no space
-    offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
-    termAtt.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
-  }
+    return newTarget;
+	}

  /**
   * <p>Fills {@link #inputWindow} with input stream tokens, if available, 
@ -351,16 +378,29 @@ public final class ShingleFilter extends TokenFilter {
   * @throws IOException if there's a problem getting the next token
   */
  private void shiftInputWindow() throws IOException {
+    InputWindowToken firstToken = null;
    if (inputWindow.size() > 0) {
-      inputWindow.removeFirst();
+      firstToken = inputWindow.removeFirst();
    }
-    while (getNextToken()) {
-      inputWindow.add(captureState());
-      if (inputWindow.size() == maxShingleSize) {
-        break;
+    while (inputWindow.size() < maxShingleSize) {
+      if (null != firstToken) {  // recycle the firstToken, if available
+        if (null != getNextToken(firstToken)) {
+          inputWindow.add(firstToken); // the firstToken becomes the last
+          firstToken = null;
+        } else {
+          break; // end of input stream
+        }
+      } else {
+        InputWindowToken nextToken = getNextToken(null);
+        if (null != nextToken) {
+          inputWindow.add(nextToken);
+        } else {
+          break; // end of input stream
+        }
      }
    }
    gramSize.reset();
+    isOutputHere = false;
  }

  @Override
@ -369,6 +409,7 @@ public final class ShingleFilter extends TokenFilter {
    gramSize.reset();
    inputWindow.clear();
    numFillerTokensToInsert = 0;
+    isOutputHere = false;
  }


@ -383,6 +424,7 @@ public final class ShingleFilter extends TokenFilter {
   */
  private class CircularSequence {
    private int value;
+    private int previousValue;
    private int minValue;
    
    public CircularSequence() {
@ -405,10 +447,9 @@ public final class ShingleFilter extends TokenFilter {
     * <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>.
     * <p>1 is included in the circular sequence only if 
     * {@link #outputUnigrams} = true.
-     * 
-     * @return the next member in the circular sequence
     */
-    public int advance() {
+    public void advance() {
+      previousValue = value;
      if (value == 1) {
        value = minShingleSize;
      } else if (value == maxShingleSize) {
@ -416,7 +457,6 @@ public final class ShingleFilter extends TokenFilter {
      } else {
        ++value;
      }
-      return value;
    }

    /**
@ -428,7 +468,7 @@ public final class ShingleFilter extends TokenFilter {
     * {@link #outputUnigrams} = true.
     */
    public void reset() {
-      value = minValue;
+      previousValue = value = minValue;
    }

    /**
@ -443,5 +483,25 @@ public final class ShingleFilter extends TokenFilter {
    public boolean atMinValue() {
      return value == minValue;
    }
+
+    /**
+     * @return the value this instance had before the last advance() call
+     */
+    public int getPreviousValue() {
+      return previousValue;
+    }
+  }
+    
+  private class InputWindowToken {
+    final AttributeSource attSource;
+    final CharTermAttribute termAtt;
+    final OffsetAttribute offsetAtt;
+    boolean isFiller = false;
+      
+    public InputWindowToken(AttributeSource attSource) {
+      this.attSource = attSource;
+      this.termAtt = attSource.getAttribute(CharTermAttribute.class);
+      this.offsetAtt = attSource.getAttribute(OffsetAttribute.class);
+    }
  }
 }
--- a/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
+++ b/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
@ -34,7 +34,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
    protected int index = 0;
    protected Token[] testToken;
    
-    private TermAttribute termAtt;
+    private CharTermAttribute termAtt;
    private OffsetAttribute offsetAtt;
    private PositionIncrementAttribute posIncrAtt;
    private TypeAttribute typeAtt;
@ -42,7 +42,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
    public TestTokenStream(Token[] testToken) {
      super();
      this.testToken = testToken;
-      this.termAtt = addAttribute(TermAttribute.class);
+      this.termAtt = addAttribute(CharTermAttribute.class);
      this.offsetAtt = addAttribute(OffsetAttribute.class);
      this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
      this.typeAtt = addAttribute(TypeAttribute.class);
@ -53,7 +53,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
      clearAttributes();
      if (index < testToken.length) {
        Token t = testToken[index++];
-        termAtt.setTermBuffer(t.termBuffer(), 0, t.termLength());
+        termAtt.copyBuffer(t.buffer(), 0, t.length());
        offsetAtt.setOffset(t.startOffset(), t.endOffset());
        posIncrAtt.setPositionIncrement(t.getPositionIncrement());
        typeAtt.setType(TypeAttributeImpl.DEFAULT_TYPE);
@ -103,17 +103,20 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
    createToken("please divide", 0, 13),
    createToken("divide", 7, 13),
    createToken("divide _", 7, 19),
-    createToken("_", 19, 19),
    createToken("_ sentence", 19, 27),
    createToken("sentence", 19, 27),
    createToken("sentence _", 19, 33),
-    createToken("_", 33, 33),
    createToken("_ shingles", 33, 39),
    createToken("shingles", 33, 39),
  };

  public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES = new int[] {
-    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+    1, 0, 1, 0, 1, 1, 0, 1, 1
+  };
+
+  private static final String[] BI_GRAM_TYPES_WITH_HOLES = {
+    "word", "shingle", 
+    "word", "shingle", "shingle", "word", "shingle", "shingle", "word"
  };

  public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
@ -642,18 +645,157 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
    "word"
  };
  
+  public static final Token[] TEST_TOKEN_POS_INCR_EQUAL_TO_N = new Token[] {
+    createToken("please", 0, 6),
+    createToken("divide", 7, 13),
+    createToken("this", 14, 18),
+    createToken("sentence", 29, 37, 3),
+    createToken("into", 38, 42),
+    createToken("shingles", 43, 49),
+  };
+
+  public static final Token[] TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N = new Token[] {
+    createToken("please", 0, 6),
+    createToken("please divide", 0, 13),
+    createToken("please divide this", 0, 18),
+    createToken("divide", 7, 13),
+    createToken("divide this", 7, 18),
+    createToken("divide this _", 7, 29),
+    createToken("this", 14, 18),
+    createToken("this _", 14, 29),
+    createToken("this _ _", 14, 29),
+    createToken("_ _ sentence", 29, 37),
+    createToken("_ sentence", 29, 37),
+    createToken("_ sentence into", 29, 42),
+    createToken("sentence", 29, 37),
+    createToken("sentence into", 29, 42),
+    createToken("sentence into shingles", 29, 49),
+    createToken("into", 38, 42),
+    createToken("into shingles", 38, 49),
+    createToken("shingles", 43, 49)
+  };
+  
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N = new int[] {
+    1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1
+  };
+  
+  public static final String[] TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N = new String[] {
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "shingle", "shingle", "shingle", "word", "shingle", "shingle",
+    "word", "shingle",
+    "word"
+  };
+  
+  public static final Token[] TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS = new Token[] {
+    createToken("please divide", 0, 13),
+    createToken("please divide this", 0, 18),
+    createToken("divide this", 7, 18),
+    createToken("divide this _", 7, 29),
+    createToken("this _", 14, 29),
+    createToken("this _ _", 14, 29),
+    createToken("_ _ sentence", 29, 37),
+    createToken("_ sentence", 29, 37),
+    createToken("_ sentence into", 29, 42),
+    createToken("sentence into", 29, 42),
+    createToken("sentence into shingles", 29, 49),
+    createToken("into shingles", 38, 49),
+  };
+
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS = new int[] {
+    1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1
+  };
+
+  public static final String[] TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS = new String[] {
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle",
+  };
+
+  public static final Token[] TEST_TOKEN_POS_INCR_GREATER_THAN_N = new Token[] {
+    createToken("please", 0, 6),
+    createToken("divide", 57, 63, 8),
+    createToken("this", 64, 68),
+    createToken("sentence", 69, 77),
+    createToken("into", 78, 82),
+    createToken("shingles", 83, 89),
+  };
+  
+  public static final Token[] TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N = new Token[] {
+    createToken("please", 0, 6),
+    createToken("please _", 0, 57),
+    createToken("please _ _", 0, 57),
+    createToken("_ _ divide", 57, 63),
+    createToken("_ divide", 57, 63),
+    createToken("_ divide this", 57, 68),
+    createToken("divide", 57, 63),
+    createToken("divide this", 57, 68),
+    createToken("divide this sentence", 57, 77),
+    createToken("this", 64, 68),
+    createToken("this sentence", 64, 77),
+    createToken("this sentence into", 64, 82),
+    createToken("sentence", 69, 77),
+    createToken("sentence into", 69, 82),
+    createToken("sentence into shingles", 69, 89),
+    createToken("into", 78, 82),
+    createToken("into shingles", 78, 89),
+    createToken("shingles", 83, 89)
+  };
+  
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N = new int[] {
+    1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
+  };
+  public static final String[] TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N = new String[] {
+    "word", "shingle", "shingle",
+    "shingle",
+    "shingle", "shingle", 
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle",
+    "word"
+  };
+  
+  public static final Token[] TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS = new Token[] {
+    createToken("please _", 0, 57),
+    createToken("please _ _", 0, 57),
+    createToken("_ _ divide", 57, 63),
+    createToken("_ divide", 57, 63),
+    createToken("_ divide this", 57, 68),
+    createToken("divide this", 57, 68),
+    createToken("divide this sentence", 57, 77),
+    createToken("this sentence", 64, 77),
+    createToken("this sentence into", 64, 82),
+    createToken("sentence into", 69, 82),
+    createToken("sentence into shingles", 69, 89),
+    createToken("into shingles", 78, 89),
+  };
+
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS = new int[] {
+    1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1
+  };
+
+  public static final String[] TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS = new String[] {
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle", "shingle", "shingle", "shingle",
+    "shingle",
+  };
+
  @Override
  protected void setUp() throws Exception {
    super.setUp();
    testTokenWithHoles = new Token[] {
      createToken("please", 0, 6),
      createToken("divide", 7, 13),
-      createToken("sentence", 19, 27),
-      createToken("shingles", 33, 39),
+      createToken("sentence", 19, 27, 2),
+      createToken("shingles", 33, 39, 2),
    };
-
-    testTokenWithHoles[2].setPositionIncrement(2);
-    testTokenWithHoles[3].setPositionIncrement(2);
  }

  /*
@ -667,7 +809,8 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {

  public void testBiGramFilterWithHoles() throws IOException {
    this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES,
-                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
+                           BI_GRAM_POSITION_INCREMENTS_WITH_HOLES, 
+                           BI_GRAM_TYPES_WITH_HOLES, 
                           true);
  }

@ -832,7 +975,31 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
                           TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR, 
                           TRI_GRAM_TYPES_NULL_SEPARATOR, true);
  }
+
+  public void testPositionIncrementEqualToN() throws IOException {
+    this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_EQUAL_TO_N, TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N,
+                           TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N, 
+                           TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N, true);
+  }
  
+  public void testPositionIncrementEqualToNWithoutUnigrams() throws IOException {
+    this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_EQUAL_TO_N, TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS,
+                           TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS, 
+                           TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS, false);
+  }
+  
+  
+  public void testPositionIncrementGreaterThanN() throws IOException {
+    this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_GREATER_THAN_N, TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N,
+                           TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N, 
+                           TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N, true);
+  }
+  
+  public void testPositionIncrementGreaterThanNWithoutUnigrams() throws IOException {
+    this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_GREATER_THAN_N, TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS,
+                           TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, 
+                           TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, false);
+  }
  
  public void testReset() throws Exception {
    Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("please divide this sentence"));
@ -896,18 +1063,24 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
    int endOffsets[] = new int[tokensToCompare.length];
    
    for (int i = 0; i < tokensToCompare.length; i++) {
-      text[i] = tokensToCompare[i].term();
+      text[i] = new String(tokensToCompare[i].buffer(),0, tokensToCompare[i].length());
      startOffsets[i] = tokensToCompare[i].startOffset();
      endOffsets[i] = tokensToCompare[i].endOffset();
    }
    
    assertTokenStreamContents(filter, text, startOffsets, endOffsets, types, positionIncrements);
  }
+  
+  private static Token createToken(String term, int start, int offset) {
+    return createToken(term, start, offset, 1);
+  }

-  private static Token createToken(String term, int start, int offset)
+  private static Token createToken
+    (String term, int start, int offset, int positionIncrement)
  {
    Token token = new Token(start, offset);
-    token.setTermBuffer(term);
+    token.copyBuffer(term.toCharArray(), 0, term.length());
+    token.setPositionIncrement(positionIncrement);
    return token;
  }
 }