LUCENE-400: Added ShingleFilter (token based ngram)

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@642612 13f79535-47bb-0310-9956-ffa450edef68
Grant Ingersoll 2008-03-29 21:11:33 +00:00
parent 09b3a56ae2
commit cc955c9748
5 changed files with 759 additions and 0 deletions

View File

@@ -139,6 +139,9 @@ New features
Index store similar to MemoryIndex but allows for multiple documents
in memory. (Karl Wettin via Grant Ingersoll)
12. LUCENE-400: Added a word-based n-gram filter (in contrib/analyzers) called ShingleFilter, and an Analyzer wrapper
that wraps another Analyzer's token stream with a ShingleFilter (Sebastian Kirsch, Steve Rowe via Grant Ingersoll)
Optimizations

View File

@@ -0,0 +1,96 @@
package org.apache.lucene.analysis.shingle;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
/**
* A ShingleAnalyzerWrapper wraps a ShingleFilter around another analyzer. A shingle is another term for a token-based
* n-gram.
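*
* <p>A minimal usage sketch (the field name and input text are illustrative):
* <pre>
* Analyzer analyzer = new ShingleAnalyzerWrapper(new StandardAnalyzer(), 2);
* TokenStream stream = analyzer.tokenStream("body", new StringReader("please divide this sentence"));
* </pre>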
*/
public class ShingleAnalyzerWrapper extends Analyzer {
protected Analyzer defaultAnalyzer;
protected int maxShingleSize = 2;
protected boolean outputUnigrams = true;
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
super();
this.defaultAnalyzer = defaultAnalyzer;
}
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize) {
this(defaultAnalyzer);
this.maxShingleSize = maxShingleSize;
}
public ShingleAnalyzerWrapper() {
super();
this.defaultAnalyzer = new StandardAnalyzer();
}
public ShingleAnalyzerWrapper(int nGramSize) {
this();
this.maxShingleSize = nGramSize;
}
/**
* Get the maximum shingle (n-gram) size.
* @return the max shingle size
*/
public int getMaxShingleSize() {
return maxShingleSize;
}
/**
* Set the maximum size of output shingles (default: 2)
*
* @param maxShingleSize max shingle size
*/
public void setMaxShingleSize(int maxShingleSize) {
this.maxShingleSize = maxShingleSize;
}
public boolean isOutputUnigrams() {
return outputUnigrams;
}
/**
* Shall the filter pass the original tokens (the "unigrams") to the output
* stream? (default: true)
*
* @param outputUnigrams Whether or not the filter shall pass the original
* tokens to the output stream
*/
public void setOutputUnigrams(boolean outputUnigrams) {
this.outputUnigrams = outputUnigrams;
}
public TokenStream tokenStream(String fieldName, Reader reader) {
ShingleFilter filter = new ShingleFilter(defaultAnalyzer.tokenStream(fieldName, reader));
filter.setMaxShingleSize(maxShingleSize);
filter.setOutputUnigrams(outputUnigrams);
return filter;
}
}

View File

@@ -0,0 +1,274 @@
package org.apache.lucene.analysis.shingle;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.LinkedList;
import java.util.Iterator;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
/**
* <p>A ShingleFilter constructs shingles (token n-grams) from a token stream:
* runs of adjacent tokens that are indexed as single tokens.
*
* <p>For example, the sentence "please divide this sentence into shingles"
* would produce the shingles "please divide", "divide this",
* "this sentence", "sentence into", and "into shingles" (and, by default,
* the original unigrams as well).
*
* <p>This filter handles position increments > 1 by inserting filler tokens
* (tokens with termtext "_"). It does not handle a position increment of 0.
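*
* <p>A minimal usage sketch (the analyzer and field name are illustrative):
* <pre>
* TokenStream shingles = new ShingleFilter(
*     new WhitespaceAnalyzer().tokenStream("body", new StringReader("please divide this sentence")), 2);
* </pre>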
*/
public class ShingleFilter extends TokenFilter {
/** Window of input tokens currently being combined into shingles */
private LinkedList shingleBuf = new LinkedList();
/** Tokens ready to be returned by next() */
private LinkedList outputBuf = new LinkedList();
/** Look-ahead buffer of input tokens, including inserted filler tokens */
private LinkedList tokenBuf = new LinkedList();
/** One StringBuffer per shingle size, reused across calls */
private StringBuffer[] shingles;
/** Token type assigned to shingle tokens (default: "shingle") */
private String tokenType = "shingle";
/**
* Filler token inserted when the position increment is greater than 1
*/
public static final String FILLER_TOKEN = "_";
/**
* default maximum shingle size is 2.
*/
public static final int DEFAULT_MAX_SHINGLE_SIZE = 2;
/**
* The string to use when joining adjacent tokens to form a shingle
*/
public static final String TOKEN_SEPARATOR = " ";
/**
* By default, we output unigrams (individual tokens) as well as shingles
* (token n-grams).
*/
private boolean outputUnigrams = true;
/**
* maximum shingle size (number of tokens)
*/
private int maxShingleSize;
/**
* Construct a ShingleFilter with the specified shingle size from the
* TokenStream <code>input</code>
*
* @param input input stream
* @param maxShingleSize maximum shingle size produced by the filter.
*/
public ShingleFilter(TokenStream input, int maxShingleSize) {
super(input);
setMaxShingleSize(maxShingleSize);
}
/**
* Construct a ShingleFilter with the default shingle size.
*
* @param input input stream
*/
public ShingleFilter(TokenStream input) {
this(input, DEFAULT_MAX_SHINGLE_SIZE);
}
/**
* Construct a ShingleFilter with the specified token type for shingle tokens.
*
* @param input input stream
* @param tokenType token type for shingle tokens
*/
public ShingleFilter(TokenStream input, String tokenType) {
this(input, DEFAULT_MAX_SHINGLE_SIZE);
setTokenType(tokenType);
}
/**
* Set the type of the shingle tokens produced by this filter.
* (default: "shingle")
*
* @param tokenType the type to assign to shingle tokens
*/
public void setTokenType(String tokenType) {
this.tokenType = tokenType;
}
/**
* Shall the output stream contain the input tokens (unigrams) as well as
* shingles? (default: true.)
*
* @param outputUnigrams Whether or not the output stream shall contain
* the input tokens (unigrams)
*/
public void setOutputUnigrams(boolean outputUnigrams) {
this.outputUnigrams = outputUnigrams;
}
/**
* Set the max shingle size (default: 2)
*
* @param maxShingleSize max size of output shingles
*/
public void setMaxShingleSize(int maxShingleSize) {
if (maxShingleSize < 2) {
throw new IllegalArgumentException("Max shingle size must be >= 2");
}
shingles = new StringBuffer[maxShingleSize];
for (int i = 0; i < shingles.length; i++) {
shingles[i] = new StringBuffer();
}
this.maxShingleSize = maxShingleSize;
}
/**
* Clear the StringBuffers that are used for storing the output shingles.
*/
private void clearShingles() {
for (int i = 0; i < shingles.length; i++) {
shingles[i].setLength(0);
}
}
/* (non-Javadoc)
* @see org.apache.lucene.analysis.TokenStream#next()
*/
public Token next() throws IOException {
if (outputBuf.isEmpty()) {
fillOutputBuf();
}
Token nextToken = null;
if (!outputBuf.isEmpty()) {
nextToken = (Token) outputBuf.remove(0);
}
return nextToken;
}
/**
* Get the next token from the input stream and push it on the token buffer.
* If we encounter a token with position increment > 1, we put filler tokens
* on the token buffer.
* <p/>
* Returns null when the end of the input stream is reached.
* @return the next token, or null if at end of input stream
* @throws IOException if the input stream has a problem
*/
private Token getNextToken() throws IOException {
if (tokenBuf.isEmpty()) {
Token lastToken = input.next();
if (lastToken != null) {
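// A position increment of n means n-1 positions were skipped; insert one
// filler token per skipped position so shingles can span the hole.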
for (int i = 1; i < lastToken.getPositionIncrement(); i++) {
tokenBuf.add(new Token(FILLER_TOKEN, lastToken.startOffset(),
lastToken.startOffset()));
}
tokenBuf.add(lastToken);
return getNextToken();
} else {
return null;
}
} else {
return (Token)tokenBuf.remove(0);
}
}
/**
* Fill the output buffer with new shingles.
*
* @throws IOException if there's a problem getting the next token
*/
private void fillOutputBuf() throws IOException {
boolean addedToken = false;
/*
* Try to fill the shingle buffer.
*/
do {
Token token = getNextToken();
if (token != null) {
shingleBuf.add(token);
if (shingleBuf.size() > maxShingleSize) {
shingleBuf.remove(0);
}
addedToken = true;
} else {
break;
}
} while (shingleBuf.size() < maxShingleSize);
/*
* If no new token could be added to the shingle buffer, we have reached
* the end of the input stream and have to discard the least recent token.
*/
if (! addedToken) {
if (shingleBuf.isEmpty()) {
return;
} else {
shingleBuf.remove(0);
}
}
clearShingles();
int[] endOffsets = new int[shingleBuf.size()]; // Java zero-initializes the array
int i = 0;
Token token = null;
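/*
* Append each buffered token to every shingle that may contain it:
* token i goes into shingles[i..maxShingleSize-1], so shingles[j] ends up
* holding the (j+1)-gram that starts at the buffer's first token.
*/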
for (Iterator it = shingleBuf.iterator(); it.hasNext(); ) {
token = (Token) it.next();
for (int j = i; j < shingles.length; j++) {
if (shingles[j].length() != 0) {
shingles[j].append(TOKEN_SEPARATOR);
}
shingles[j].append(token.termBuffer(), 0, token.termLength());
}
endOffsets[i] = token.endOffset();
i++;
}
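/*
* Emit the buffer's first token as a unigram; the shingles that start at
* the same position follow below with a position increment of 0.
*/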
if ((! shingleBuf.isEmpty()) && outputUnigrams) {
Token unigram = (Token) shingleBuf.getFirst();
unigram.setPositionIncrement(1);
outputBuf.add(unigram);
}
/*
* Push new tokens to the output buffer.
*/
for (int j = 1; j < shingleBuf.size(); j++) {
Token shingle = new Token(shingles[j].toString(),
((Token) shingleBuf.get(0)).startOffset(),
endOffsets[j],
tokenType);
if ((! outputUnigrams) && j == 1) {
shingle.setPositionIncrement(1);
} else {
shingle.setPositionIncrement(0);
}
outputBuf.add(shingle);
}
}
}

View File

@@ -0,0 +1,196 @@
package org.apache.lucene.analysis.shingle;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import junit.framework.TestCase;
/**
* A test class for ShingleAnalyzerWrapper as regards queries and scoring.
*/
public class ShingleAnalyzerWrapperTest extends TestCase {
public IndexSearcher searcher;
public static void main(String[] args) {
junit.textui.TestRunner.run(ShingleAnalyzerWrapperTest.class);
}
/**
* Set up a new index in RAM with three test phrases and the supplied Analyzer.
*
* @param analyzer the analyzer to use
* @return an indexSearcher on the test index.
* @throws Exception if an error occurs with index writer or searcher
*/
public IndexSearcher setUpSearcher(Analyzer analyzer) throws Exception {
Directory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, analyzer, true);
Document doc;
doc = new Document();
doc.add(new Field("content", "please divide this sentence into shingles",
Field.Store.YES, Field.Index.TOKENIZED));
writer.addDocument(doc);
doc = new Document();
doc.add(new Field("content", "just another test sentence",
Field.Store.YES, Field.Index.TOKENIZED));
writer.addDocument(doc);
doc = new Document();
doc.add(new Field("content", "a sentence which contains no test",
Field.Store.YES, Field.Index.TOKENIZED));
writer.addDocument(doc);
writer.close();
return new IndexSearcher(dir);
}
protected Hits queryParsingTest(Analyzer analyzer, String qs) throws Exception {
searcher = setUpSearcher(analyzer);
QueryParser qp = new QueryParser("content", analyzer);
Query q = qp.parse(qs);
return searcher.search(q);
}
protected void compareRanks(Hits hits, int[] ranks) throws Exception {
assertEquals(ranks.length, hits.length());
for (int i = 0; i < ranks.length; i++) {
assertEquals(ranks[i], hits.id(i));
}
}
/*
* Will not work on an index without unigrams, since QueryParser automatically
* tokenizes on whitespace.
*/
public void testShingleAnalyzerWrapperQueryParsing() throws Exception {
Hits hits = queryParsingTest(
new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2), "test sentence");
int[] ranks = new int[] { 1, 2, 0 };
compareRanks(hits, ranks);
}
/*
* This one fails with an exception.
*/
public void testShingleAnalyzerWrapperPhraseQueryParsingFails() throws Exception {
Hits hits = queryParsingTest(
new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2), "\"this sentence\"");
int[] ranks = new int[] { 0 };
compareRanks(hits, ranks);
}
/*
* This one works, actually.
*/
public void testShingleAnalyzerWrapperPhraseQueryParsing() throws Exception {
Hits hits = queryParsingTest(
new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2), "\"test sentence\"");
int[] ranks = new int[] { 1 };
compareRanks(hits, ranks);
}
/*
* Same as above: the query parser tokenizes on whitespace, so each required
* term is analyzed separately and no shingles are formed.
*/
public void testShingleAnalyzerWrapperRequiredQueryParsing() throws Exception {
Hits hits = queryParsingTest(
new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2), "+test +sentence");
int[] ranks = new int[] { 1, 2 };
compareRanks(hits, ranks);
}
/*
* This shows how to construct a phrase query containing shingles.
*/
public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
searcher = setUpSearcher(analyzer);
PhraseQuery q = new PhraseQuery();
TokenStream ts = analyzer.tokenStream("content",
new StringReader("this sentence"));
Token token;
int j = -1;
while ((token = ts.next()) != null) {
j += token.getPositionIncrement();
String termText = new String(token.termBuffer(), 0, token.termLength());
q.add(new Term("content", termText), j);
}
Hits hits = searcher.search(q);
int[] ranks = new int[] { 0 };
compareRanks(hits, ranks);
}
/*
* How to construct a boolean query with shingles. A query like this will
* implicitly score those documents higher that contain the words in the query
* in the right order and adjacent to each other.
*/
public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
searcher = setUpSearcher(analyzer);
BooleanQuery q = new BooleanQuery();
TokenStream ts = analyzer.tokenStream("content",
new StringReader("test sentence"));
Token token;
while ((token = ts.next()) != null) {
String termText = new String(token.termBuffer(), 0, token.termLength());
q.add(new TermQuery(new Term("content", termText)),
BooleanClause.Occur.SHOULD);
}
Hits hits = searcher.search(q);
int[] ranks = new int[] { 1, 2, 0 };
compareRanks(hits, ranks);
}
}

View File

@@ -0,0 +1,190 @@
package org.apache.lucene.analysis.shingle;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
public class ShingleFilterTest extends TestCase {
public class TestTokenStream extends TokenStream {
protected int index = 0;
protected Token[] testToken;
public TestTokenStream(Token[] testToken) {
super();
this.testToken = testToken;
}
public Token next() throws IOException {
if (index < testToken.length) {
return testToken[index++];
} else {
return null;
}
}
}
public static void main(String[] args) {
junit.textui.TestRunner.run(ShingleFilterTest.class);
}
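// Token fixture for the sentence "please divide this sentence into shingles";
// offsets are character positions in that sentence.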
public static final Token[] TEST_TOKEN = new Token[] {
new Token("please", 0, 6),
new Token("divide", 7, 13),
new Token("this", 14, 18),
new Token("sentence", 19, 27),
new Token("into", 28, 32),
new Token("shingles", 33, 39),
};
public static Token[] testTokenWithHoles;
public static final Token[] BI_GRAM_TOKENS = new Token[] {
new Token("please", 0, 6),
new Token("please divide", 0, 13),
new Token("divide", 7, 13),
new Token("divide this", 7, 18),
new Token("this", 14, 18),
new Token("this sentence", 14, 27),
new Token("sentence", 19, 27),
new Token("sentence into", 19, 32),
new Token("into", 28, 32),
new Token("into shingles", 28, 39),
new Token("shingles", 33, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS = new int[] {
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
};
public static final String[] BI_GRAM_TYPES = new String[] {
"word", "shingle", "word", "shingle", "word", "shingle", "word",
"shingle", "word", "shingle", "word"
};
public static final Token[] BI_GRAM_TOKENS_WITH_HOLES = new Token[] {
new Token("please", 0, 6),
new Token("please divide", 0, 13),
new Token("divide", 7, 13),
new Token("divide _", 7, 19),
new Token("_", 19, 19),
new Token("_ sentence", 19, 27),
new Token("sentence", 19, 27),
new Token("sentence _", 19, 33),
new Token("_", 33, 33),
new Token("_ shingles", 33, 39),
new Token("shingles", 33, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES = new int[] {
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
};
public static final Token[] TRI_GRAM_TOKENS = new Token[] {
new Token("please", 0, 6),
new Token("please divide", 0, 13),
new Token("please divide this", 0, 18),
new Token("divide", 7, 13),
new Token("divide this", 7, 18),
new Token("divide this sentence", 7, 27),
new Token("this", 14, 18),
new Token("this sentence", 14, 27),
new Token("this sentence into", 14, 32),
new Token("sentence", 19, 27),
new Token("sentence into", 19, 32),
new Token("sentence into shingles", 19, 39),
new Token("into", 28, 32),
new Token("into shingles", 28, 39),
new Token("shingles", 33, 39)
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS = new int[] {
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
};
public static final String[] TRI_GRAM_TYPES = new String[] {
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle",
"word"
};
protected void setUp() throws Exception {
super.setUp();
testTokenWithHoles = new Token[] {
new Token("please", 0, 6),
new Token("divide", 7, 13),
new Token("sentence", 19, 27),
new Token("shingles", 33, 39),
};
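// Create holes where "this" and "into" would be by bumping position increments.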
testTokenWithHoles[2].setPositionIncrement(2);
testTokenWithHoles[3].setPositionIncrement(2);
}
/*
* Class under test for void ShingleFilter(TokenStream, int)
*/
public void testBiGramFilter() throws IOException {
this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS,
BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES);
}
public void testBiGramFilterWithHoles() throws IOException {
this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES,
BI_GRAM_POSITION_INCREMENTS_WITH_HOLES, BI_GRAM_TYPES);
}
public void testTriGramFilter() throws IOException {
this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS,
TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES);
}
protected void shingleFilterTest(int n, Token[] testToken, Token[] tokens,
int[] positionIncrements, String[] types)
throws IOException {
TokenStream filter = new ShingleFilter(new TestTokenStream(testToken), n);
Token token;
int i = 0;
while ((token = filter.next()) != null) {
String termText = new String(token.termBuffer(), 0, token.termLength());
String goldText = new String(tokens[i].termBuffer(), 0, tokens[i].termLength());
assertEquals("Wrong termText", goldText, termText);
assertEquals("Wrong startOffset for token \"" + termText + "\"",
tokens[i].startOffset(), token.startOffset());
assertEquals("Wrong endOffset for token \"" + termText + "\"",
tokens[i].endOffset(), token.endOffset());
assertEquals("Wrong positionIncrement for token \"" + termText + "\"",
positionIncrements[i], token.getPositionIncrement());
assertEquals("Wrong type for token \"" + termText + "\"",
types[i], token.type());
i++;
}
}
}