Use position increments to account for removed stop words

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150148 13f79535-47bb-0310-9956-ffa450edef68
Erik Hatcher 2003-11-28 02:03:14 +00:00
parent 40dd950e3f
commit fd5806ddf2
3 changed files with 117 additions and 10 deletions


@@ -7,6 +7,12 @@ $Id$
 
  1. Added catch of BooleanQuery$TooManyClauses in QueryParser to
     throw ParseException instead. (Erik Hatcher)
 
+ 2. Modified StopFilter to increment positions to account for
+    stop words removed. This prevents exact phrase queries from
+    matching erroneously (use slop factor to account for missing
+    stop words). StopFilter is used by StopAnalyzer, StandardAnalyzer
+    and some others. (Erik Hatcher)
+
 1.3 RC3
 
  1. Added minMergeDocs in IndexWriter. This can be raised to speed
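
The changelog entry above is the heart of the commit: when StopFilter drops a token, the gap is recorded on the next token it emits rather than silently closed. As an illustration (not part of this commit), a minimal sketch of that behavior, assuming the 1.3-era analysis API and the StopAnalyzer.ENGLISH_STOP_WORDS word list:

    import java.io.StringReader;
    import org.apache.lucene.analysis.*;

    public class StopFilterDemo {
      public static void main(String[] args) throws Exception {
        // "the" is a stop word; StopFilter drops it and records the hole
        // in the position increment of the next token it returns.
        TokenStream stream = new StopFilter(
            new LowerCaseTokenizer(new StringReader("the quick brown fox")),
            StopAnalyzer.ENGLISH_STOP_WORDS);
        for (Token t = stream.next(); t != null; t = stream.next())
          // prints: quick 2, brown 1, fox 1 -- "quick" carries the gap
          System.out.println(t.termText() + " " + t.getPositionIncrement());
      }
    }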


@@ -57,29 +57,33 @@ package org.apache.lucene.analysis;
 
 import java.io.IOException;
 import java.util.Hashtable;
 
-/** Removes stop words from a token stream. */
+/**
+ * Removes stop words from a token stream. Position increments
+ * on tokens emitted are adjusted to account for words
+ * removed. Exact phrase queries will not match across holes left
+ * by stop word removal, but sloppy phrase queries may match.
+ */
 
 public final class StopFilter extends TokenFilter {
 
   private Hashtable table;
 
   /** Constructs a filter which removes words from the input
-    TokenStream that are named in the array of words. */
+      TokenStream that are named in the array of words. */
   public StopFilter(TokenStream in, String[] stopWords) {
     super(in);
     table = makeStopTable(stopWords);
   }
 
   /** Constructs a filter which removes words from the input
-    TokenStream that are named in the Hashtable. */
+      TokenStream that are named in the Hashtable. */
   public StopFilter(TokenStream in, Hashtable stopTable) {
     super(in);
     table = stopTable;
   }
 
   /** Builds a Hashtable from an array of stop words, appropriate for passing
-    into the StopFilter constructor. This permits this table construction to
-    be cached once when an Analyzer is constructed. */
+      into the StopFilter constructor. This permits this table construction to
+      be cached once when an Analyzer is constructed. */
   public static final Hashtable makeStopTable(String[] stopWords) {
     Hashtable stopTable = new Hashtable(stopWords.length);
     for (int i = 0; i < stopWords.length; i++)
@@ -89,10 +93,18 @@ public final class StopFilter extends TokenFilter {
 
   /** Returns the next input Token whose termText() is not a stop word. */
   public final Token next() throws IOException {
+    int position = 1;
+
     // return the first non-stop word found
-    for (Token token = input.next(); token != null; token = input.next())
-      if (table.get(token.termText) == null)
-        return token;
+    for (Token token = input.next(); token != null; token = input.next()) {
+      if (table.get(token.termText) == null) {
+        token.setPositionIncrement(position);
+        position = 1;
+        return token;
+      }
+
+      position++;
+    }
 
     // reached EOS -- return null
     return null;
   }
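
The makeStopTable javadoc above recommends building the table once per Analyzer instead of on every tokenStream() call. A hedged sketch of that caching pattern (CachedStopAnalyzer is a hypothetical name, not part of this commit, and ENGLISH_STOP_WORDS is assumed from StopAnalyzer):

    import java.io.Reader;
    import java.util.Hashtable;
    import org.apache.lucene.analysis.*;

    public class CachedStopAnalyzer extends Analyzer {
      // built once at class-load time, shared by every tokenStream() call
      private static final Hashtable STOP_TABLE =
          StopFilter.makeStopTable(StopAnalyzer.ENGLISH_STOP_WORDS);

      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new StopFilter(new LowerCaseTokenizer(reader), STOP_TABLE);
      }
    }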


@@ -0,0 +1,89 @@
+package org.apache.lucene.analysis;
+
+import junit.framework.TestCase;
+
+import java.io.StringReader;
+import java.util.ArrayList;
+
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Hits;
+
+public class TestStopAnalyzer extends TestCase {
+  private StopAnalyzer stopAnalyzer = new StopAnalyzer();
+
+  public Token[] tokensFromAnalyzer(Analyzer analyzer, String text)
+      throws Exception {
+    TokenStream stream =
+        analyzer.tokenStream("contents", new StringReader(text));
+    ArrayList tokenList = new ArrayList();
+    while (true) {
+      Token token = stream.next();
+      if (token == null) break;
+
+      tokenList.add(token);
+    }
+
+    return (Token[]) tokenList.toArray(new Token[0]);
+  }
+
+  public void testNoHoles() throws Exception {
+    Token[] tokens = tokensFromAnalyzer(stopAnalyzer, "non-stop words");
+    assertEquals(3, tokens.length);
+
+    // ensure all words are in successive positions
+    assertEquals("non", 1, tokens[0].getPositionIncrement());
+    assertEquals("stop", 1, tokens[1].getPositionIncrement());
+    assertEquals("words", 1, tokens[2].getPositionIncrement());
+  }
+
+  public void testHoles() throws Exception {
+    Token[] tokens = tokensFromAnalyzer(stopAnalyzer,
+                                        "the stop words are here");
+    assertEquals(3, tokens.length);
+
+    // check for the holes noted by position gaps
+    assertEquals("stop", 2, tokens[0].getPositionIncrement());
+    assertEquals("words", 1, tokens[1].getPositionIncrement());
+    assertEquals("here", 2, tokens[2].getPositionIncrement());
+  }
+
+  public void testPhraseQuery() throws Exception {
+    RAMDirectory directory = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(directory, stopAnalyzer, true);
+    Document doc = new Document();
+    doc.add(Field.Text("field", "the stop words are here"));
+    writer.addDocument(doc);
+    writer.close();
+
+    IndexSearcher searcher = new IndexSearcher(directory);
+
+    // valid exact phrase query
+    PhraseQuery query = new PhraseQuery();
+    query.add(new Term("field", "stop"));
+    query.add(new Term("field", "words"));
+    Hits hits = searcher.search(query);
+    assertEquals(1, hits.length());
+
+    // incorrect attempt at exact phrase query over stop word hole
+    query = new PhraseQuery();
+    query.add(new Term("field", "words"));
+    query.add(new Term("field", "here"));
+    hits = searcher.search(query);
+    assertEquals(0, hits.length());
+
+    // add some slop, and match over the hole
+    query.setSlop(1);
+    hits = searcher.search(query);
+    assertEquals(1, hits.length());
+
+    searcher.close();
+  }
+}
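
The slop that testPhraseQuery sets programmatically can also be requested through the query syntax. A sketch, assuming the 1.3-era static QueryParser.parse and its "~n" phrase-slop suffix (not part of this commit):

    import org.apache.lucene.analysis.StopAnalyzer;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.Query;

    public class SloppyPhraseDemo {
      public static void main(String[] args) throws Exception {
        // "~1" asks for slop 1, enough to match across the one-position
        // hole left where "are" was removed from "the stop words are here"
        Query query =
            QueryParser.parse("\"words here\"~1", "field", new StopAnalyzer());
        System.out.println(query);
      }
    }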