mirror of https://github.com/apache/lucene.git
LUCENE-5165: add SuggestStopFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1513940 13f79535-47bb-0310-9956-ffa450edef68
commit c74dda1495
parent cec43e7d92
@@ -83,6 +83,13 @@ New features
  FacetsAggregator.createOrdinalValueResolver. This gives better options for
  resolving an ordinal's value by FacetAggregators. (Shai Erera)

* LUCENE-5165: Add SuggestStopFilter, to be used with analyzing
  suggesters, so that a stop word at the very end of the lookup query,
  and without any trailing token characters, will be preserved. This
  enables query "a" to suggest apple; see
  http://blog.mikemccandless.com/2013/08/suggeststopfilter-carefully-removes.html
  for details.

Bug Fixes

* LUCENE-5116: IndexWriter.addIndexes(IndexReader...) should drop empty (or all
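The wiring the changelog entry describes, reduced to a minimal sketch: keep the ordinary StopFilter in the index analyzer and swap SuggestStopFilter into the query analyzer. This mirrors the testSuggestStopFilter test added later in this commit; MockTokenizer and TEST_VERSION_CURRENT are Lucene test-framework helpers used here only for illustration, and the stopword "a" is the changelog's example.

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter;

final CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "a");

// Query-side analyzer: a trailing "a" with no separator typed yet
// survives (marked KEYWORD), so the suggester can still propose "apple".
Analyzer queryAnalyzer = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    MockTokenizer tokens = new MockTokenizer(reader);
    return new TokenStreamComponents(tokens,
                                     new SuggestStopFilter(tokens, stopWords));
  }
};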
@@ -34,6 +34,15 @@
    <path refid="base.classpath"/>
  </path>

  <target name="javadocs" depends="javadocs-queries,compile-core">
    <invoke-module-javadoc>
      <links>
        <link href="../analyzers-common"/>
      </links>
    </invoke-module-javadoc>
  </target>

  <target name="compile-core" depends="jar-misc, jar-analyzers-common, common.compile-core" />

</project>
@@ -0,0 +1,129 @@
package org.apache.lucene.search.suggest.analyzing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.AttributeSource.State;

/** Like {@link StopFilter} except it will not remove the
 *  last token if that token was not followed by some token
 *  separator.  For example, a query 'find the' would
 *  preserve the 'the' since it was not followed by a space or
 *  punctuation or something, and mark it KEYWORD so future
 *  stemmers won't touch it either, while a query like 'find
 *  the popsicle' would remove 'the' as a stopword.
 *
 *  <p>Normally you'd use the ordinary {@link StopFilter}
 *  in your indexAnalyzer and then this class in your
 *  queryAnalyzer, when using one of the analyzing suggesters. */

public final class SuggestStopFilter extends TokenFilter {

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final CharArraySet stopWords;

  private State endState;
  private boolean ended;

  /** Sole constructor. */
  public SuggestStopFilter(TokenStream input, CharArraySet stopWords) {
    super(input);
    this.stopWords = stopWords;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    ended = false;
    endState = null;
  }

  @Override
  public void end() throws IOException {
    if (!ended) {
      super.end();
    } else {
      // NOTE: we already called .end() from our .incrementToken() when
      // the stream was complete, so we do not call
      // super.end() here

      if (endState != null) {
        restoreState(endState);
      }
    }
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (ended) {
      return false;
    }

    if (!input.incrementToken()) {
      return false;
    }

    int skippedPositions = 0;
    while (true) {
      if (stopWords.contains(termAtt.buffer(), 0, termAtt.length())) {
        int posInc = posIncAtt.getPositionIncrement();
        int endOffset = offsetAtt.endOffset();
        // This token may be a stopword, if it's not the last token:
        State sav = captureState();
        if (input.incrementToken()) {
          // It was a stopword; skip it
          skippedPositions += posInc;
        } else {
          input.end();
          ended = true;
          int finalEndOffset = offsetAtt.endOffset();
          assert finalEndOffset >= endOffset;
          if (finalEndOffset > endOffset) {
            // OK there was a token separator after the
            // stopword, so it was a stopword
            return false;
          } else {
            // No token separator after final token that
            // looked like a stop-word; don't filter it:
            endState = captureState();
            restoreState(sav);
            posIncAtt.setPositionIncrement(skippedPositions + posIncAtt.getPositionIncrement());
            keywordAtt.setKeyword(true);
            return true;
          }
        }
      } else {
        // Not a stopword; return the current token:
        posIncAtt.setPositionIncrement(skippedPositions + posIncAtt.getPositionIncrement());
        return true;
      }
    }
  }
}
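To see the end-of-stream special case in isolation, the filter can also be consumed directly. A minimal sketch, reusing the same test-framework helpers (MockTokenizer, StopFilter.makeStopSet, TEST_VERSION_CURRENT) that this commit's own tests use; the expected output stated in the comments is inferred from the TestSuggestStopFilter expectations below.

import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to");

// "go to": no separator follows "to", so it is kept and marked KEYWORD.
// With input "go to " (trailing space) the final "to" would be dropped.
TokenStream ts = new SuggestStopFilter(
    new MockTokenizer(new StringReader("go to")), stopWords);
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
KeywordAttribute keywordAtt = ts.getAttribute(KeywordAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  // Prints "go keyword=false", then "to keyword=true":
  System.out.println(termAtt + " keyword=" + keywordAtt.isKeyword());
}
ts.end();
ts.close();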
@@ -25,11 +25,8 @@ import java.util.Locale;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.PrefixQuery;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.search.suggest.Lookup.LookupResult;
 import org.apache.lucene.search.suggest.TermFreqPayload;
 import org.apache.lucene.search.suggest.TermFreqPayloadArrayIterator;
@@ -294,16 +291,32 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase {
     suggester.close();
   }
 
-  public void testForkLastToken() throws Exception {
-    Analyzer a = new Analyzer() {
+  public void testSuggestStopFilter() throws Exception {
+    final CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "a");
+    Analyzer indexAnalyzer = new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
         MockTokenizer tokens = new MockTokenizer(reader);
-        // ForkLastTokenFilter is a bit evil:
-        tokens.setEnableChecks(false);
         return new TokenStreamComponents(tokens,
-                                         new StopKeywordFilter(TEST_VERSION_CURRENT,
-                                                               new ForkLastTokenFilter(tokens), StopKeywordFilter.makeStopSet(TEST_VERSION_CURRENT, "a")));
+                                         new StopFilter(TEST_VERSION_CURRENT, tokens, stopWords));
       }
     };
+
+    Analyzer queryAnalyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        MockTokenizer tokens = new MockTokenizer(reader);
+        return new TokenStreamComponents(tokens,
+                                         new SuggestStopFilter(tokens, stopWords));
+      }
+    };
+
+    File tempDir = _TestUtil.getTempDir("AnalyzingInfixSuggesterTest");
+
+    AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, indexAnalyzer, queryAnalyzer, 3) {
+      @Override
+      protected Directory getDirectory(File path) {
+        return newDirectory();
+      }
+    };
 
@@ -311,47 +324,6 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase {
       new TermFreqPayload("a bob for apples", 10, new BytesRef("foobaz")),
     };
 
-    File tempDir = _TestUtil.getTempDir("AnalyzingInfixSuggesterTest");
-
-    AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, a, a, 3) {
-      @Override
-      protected Query finishQuery(BooleanQuery in, boolean allTermsRequired) {
-        List<BooleanClause> clauses = in.clauses();
-        if (clauses.size() >= 2 && allTermsRequired) {
-          String t1 = getTerm(clauses.get(clauses.size()-2).getQuery());
-          String t2 = getTerm(clauses.get(clauses.size()-1).getQuery());
-          if (t1.equals(t2)) {
-            // The last 2 tokens came from
-            // ForkLastTokenFilter; we remove them and
-            // replace them with a MUST BooleanQuery that
-            // SHOULDs the two of them together:
-            BooleanQuery sub = new BooleanQuery();
-            BooleanClause other = clauses.get(clauses.size()-2);
-            sub.add(new BooleanClause(clauses.get(clauses.size()-2).getQuery(), BooleanClause.Occur.SHOULD));
-            sub.add(new BooleanClause(clauses.get(clauses.size()-1).getQuery(), BooleanClause.Occur.SHOULD));
-            clauses.subList(clauses.size()-2, clauses.size()).clear();
-            clauses.add(new BooleanClause(sub, BooleanClause.Occur.MUST));
-          }
-        }
-        return in;
-      }
-
-      private String getTerm(Query query) {
-        if (query instanceof TermQuery) {
-          return ((TermQuery) query).getTerm().text();
-        } else if (query instanceof PrefixQuery) {
-          return ((PrefixQuery) query).getPrefix().text();
-        } else {
-          return null;
-        }
-      }
-
-      @Override
-      protected Directory getDirectory(File path) {
-        return newDirectory();
-      }
-    };
-
     suggester.build(new TermFreqPayloadArrayIterator(keys));
     List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("a", random()), 10, true, true);
     assertEquals(1, results.size());
@@ -1,89 +0,0 @@
package org.apache.lucene.search.suggest.analyzing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource.State;

/** Repeats the last token, if the endOffset indicates that
 *  the token didn't have any characters after it (i.e. it
 *  is not "done").  This is useful in analyzing
 *  suggesters along with StopKeywordFilter: imagine the
 *  user has typed 'a', but your stop filter would normally
 *  remove that.  This token filter will repeat that last 'a'
 *  token, setting {@link KeywordAttribute}, so that the
 *  {@link StopKeywordFilter} won't remove it, and then
 *  suggestions starting with 'a' will be shown. */

final class ForkLastTokenFilter extends TokenFilter {

  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

  State lastToken;
  int maxEndOffset;
  boolean stop = false;

  public ForkLastTokenFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (stop) {
      return false;
    } else if (input.incrementToken()) {
      lastToken = captureState();
      maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
      return true;
    } else if (lastToken == null) {
      return false;
    } else {

      // TODO: this is iffy!!!  maybe somehow instead caller
      // could tell us endOffset up front?
      input.end();

      if (offsetAtt.endOffset() == maxEndOffset) {
        // Text did not see end of token char:
        restoreState(lastToken);
        keywordAtt.setKeyword(true);
        posIncAtt.setPositionIncrement(0);
        lastToken = null;
        stop = true;
        return true;
      } else {
        return false;
      }
    }
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    lastToken = null;
    maxEndOffset = -1;
    stop = false;
  }
}
@@ -1,131 +0,0 @@
package org.apache.lucene.search.suggest.analyzing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Arrays;
import java.util.List;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.util.Version;

/**
 * Removes stop words from a token stream; if the
 * {@link KeywordAttribute} is set, the word is not removed.
 *
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating StopFilter:
 * <ul>
 *   <li> As of 3.1, StopFilter correctly handles Unicode 4.0
 *        supplementary characters in stopwords, and position
 *        increments are preserved
 * </ul>
 */
final class StopKeywordFilter extends FilteringTokenFilter {

  private final CharArraySet stopWords;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);

  /**
   * Constructs a filter which removes words from the input TokenStream that are
   * named in the Set.
   *
   * @param matchVersion
   *          Lucene version to enable correct Unicode 4.0 behavior in the stop
   *          set if Version > 3.0.  See <a href="#version">above</a> for details.
   * @param in
   *          Input stream
   * @param stopWords
   *          A {@link CharArraySet} representing the stopwords.
   * @see #makeStopSet(Version, java.lang.String...)
   */
  public StopKeywordFilter(Version matchVersion, TokenStream in, CharArraySet stopWords) {
    super(matchVersion, in);
    this.stopWords = stopWords;
  }

  /**
   * Builds a Set from an array of stop words,
   * appropriate for passing into the StopFilter constructor.
   * This permits this stopWords construction to be cached once when
   * an Analyzer is constructed.
   *
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords An array of stopwords
   * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
   */
  public static CharArraySet makeStopSet(Version matchVersion, String... stopWords) {
    return makeStopSet(matchVersion, stopWords, false);
  }

  /**
   * Builds a Set from an array of stop words,
   * appropriate for passing into the StopFilter constructor.
   * This permits this stopWords construction to be cached once when
   * an Analyzer is constructed.
   *
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
   * @return A Set ({@link CharArraySet}) containing the words
   * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
   */
  public static CharArraySet makeStopSet(Version matchVersion, List<?> stopWords) {
    return makeStopSet(matchVersion, stopWords, false);
  }

  /**
   * Creates a stopword set from the given stopword array.
   *
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords An array of stopwords
   * @param ignoreCase If true, all words are lower cased first.
   * @return a Set containing the words
   */
  public static CharArraySet makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) {
    CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.length, ignoreCase);
    stopSet.addAll(Arrays.asList(stopWords));
    return stopSet;
  }

  /**
   * Creates a stopword set from the given stopword list.
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
   * @param ignoreCase if true, all words are lower cased first
   * @return A Set ({@link CharArraySet}) containing the words
   */
  public static CharArraySet makeStopSet(Version matchVersion, List<?> stopWords, boolean ignoreCase) {
    CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.size(), ignoreCase);
    stopSet.addAll(stopWords);
    return stopSet;
  }

  /**
   * Returns the next input Token whose term() is not a stop word.
   */
  @Override
  protected boolean accept() {
    return keywordAtt.isKeyword() || !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
  }
}
@@ -0,0 +1,140 @@
package org.apache.lucene.search.suggest.analyzing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;

public class TestSuggestStopFilter extends BaseTokenStreamTestCase {

  public void testEndNotStopWord() throws Exception {
    CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to");
    TokenStream stream = new MockTokenizer(new StringReader("go to"));
    TokenStream filter = new SuggestStopFilter(stream, stopWords);
    // No separator after the trailing "to", so it is kept and marked keyword.
    // Args: terms, startOffsets, endOffsets, types, posIncrements,
    // posLengths, finalOffset, keywordAtts, offsetsAreCorrect.
    assertTokenStreamContents(filter,
                              new String[] {"go", "to"},
                              new int[] {0, 3},
                              new int[] {2, 5},
                              null,
                              new int[] {1, 1},
                              null,
                              5,
                              new boolean[] {false, true},
                              true);
  }

  public void testEndIsStopWord() throws Exception {
    CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to");
    TokenStream stream = new MockTokenizer(new StringReader("go to "));
    TokenStream filter = new SuggestStopFilter(stream, stopWords);
    // The trailing space means "to" is a completed stopword: it is removed.
    assertTokenStreamContents(filter,
                              new String[] {"go"},
                              new int[] {0},
                              new int[] {2},
                              null,
                              new int[] {1},
                              null,
                              6,
                              new boolean[] {false},
                              true);
  }

  public void testMidStopWord() throws Exception {
    CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to");
    TokenStream stream = new MockTokenizer(new StringReader("go to school"));
    TokenStream filter = new SuggestStopFilter(stream, stopWords);
    // A mid-stream stopword is always removed; "school" gets posInc 2.
    assertTokenStreamContents(filter,
                              new String[] {"go", "school"},
                              new int[] {0, 6},
                              new int[] {2, 12},
                              null,
                              new int[] {1, 2},
                              null,
                              12,
                              new boolean[] {false, false},
                              true);
  }

  public void testMultipleStopWords() throws Exception {
    CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to", "the", "a");
    TokenStream stream = new MockTokenizer(new StringReader("go to a the school"));
    TokenStream filter = new SuggestStopFilter(stream, stopWords);
    assertTokenStreamContents(filter,
                              new String[] {"go", "school"},
                              new int[] {0, 12},
                              new int[] {2, 18},
                              null,
                              new int[] {1, 4},
                              null,
                              18,
                              new boolean[] {false, false},
                              true);
  }

  public void testMultipleStopWordsEnd() throws Exception {
    CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to", "the", "a");
    TokenStream stream = new MockTokenizer(new StringReader("go to a the"));
    TokenStream filter = new SuggestStopFilter(stream, stopWords);
    // The final "the" has no separator after it, so it survives as a keyword;
    // posInc 3 accounts for the dropped "to a".
    assertTokenStreamContents(filter,
                              new String[] {"go", "the"},
                              new int[] {0, 8},
                              new int[] {2, 11},
                              null,
                              new int[] {1, 3},
                              null,
                              11,
                              new boolean[] {false, true},
                              true);
  }

  public void testMultipleStopWordsEnd2() throws Exception {
    CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to", "the", "a");
    TokenStream stream = new MockTokenizer(new StringReader("go to a the "));
    TokenStream filter = new SuggestStopFilter(stream, stopWords);
    // Trailing space: even the final "the" is a completed stopword and is removed.
    assertTokenStreamContents(filter,
                              new String[] {"go"},
                              new int[] {0},
                              new int[] {2},
                              null,
                              new int[] {1},
                              null,
                              12,
                              new boolean[] {false},
                              true);
  }
}
@@ -111,7 +111,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
   //   arriving to pos Y have the same endOffset)
   // - offsets only move forwards (startOffset >=
   //   lastStartOffset)
-  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset,
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
+                                               int posLengths[], Integer finalOffset, boolean[] keywordAtts,
                                                boolean offsetsAreCorrect) throws IOException {
     assertNotNull(output);
     CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
@@ -145,6 +146,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
       posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
     }
+
+    KeywordAttribute keywordAtt = null;
+    if (keywordAtts != null) {
+      assertTrue("has no KeywordAttribute", ts.hasAttribute(KeywordAttribute.class));
+      keywordAtt = ts.getAttribute(KeywordAttribute.class);
+    }
 
     // Maps position to the start/end offset:
     final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
@@ -161,22 +168,31 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       if (typeAtt != null) typeAtt.setType("bogusType");
       if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
       if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
+      if (keywordAtt != null) keywordAtt.setKeyword((i&1) == 0);
 
       checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
       assertTrue("token "+i+" does not exist", ts.incrementToken());
       assertTrue("clearAttributes() was not called correctly in TokenStream chain", checkClearAtt.getAndResetClearCalled());
 
       assertEquals("term "+i, output[i], termAtt.toString());
-      if (startOffsets != null)
+      if (startOffsets != null) {
         assertEquals("startOffset "+i, startOffsets[i], offsetAtt.startOffset());
-      if (endOffsets != null)
+      }
+      if (endOffsets != null) {
         assertEquals("endOffset "+i, endOffsets[i], offsetAtt.endOffset());
-      if (types != null)
+      }
+      if (types != null) {
         assertEquals("type "+i, types[i], typeAtt.type());
-      if (posIncrements != null)
+      }
+      if (posIncrements != null) {
         assertEquals("posIncrement "+i, posIncrements[i], posIncrAtt.getPositionIncrement());
-      if (posLengths != null)
+      }
+      if (posLengths != null) {
         assertEquals("posLength "+i, posLengths[i], posLengthAtt.getPositionLength());
+      }
+      if (keywordAtts != null) {
+        assertEquals("keywordAtt " + i, keywordAtts[i], keywordAtt.isKeyword());
+      }
 
       // we can enforce some basic things about a few attributes even if the caller doesn't check:
       if (offsetAtt != null) {
@@ -239,7 +255,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
       }
     }
-    assertFalse("TokenStream has more tokens than expected (expected count=" + output.length + ")", ts.incrementToken());
+    if (ts.incrementToken()) {
+      fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token=" + termAtt.toString());
+    }
     ts.end();
     if (finalOffset != null) {
       assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
@@ -250,6 +268,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     ts.close();
   }
 
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean offsetsAreCorrect) throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, offsetsAreCorrect);
+  }
+
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
     assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, true);
   }