diff --git a/CHANGES.txt b/CHANGES.txt
index 5fa26b3865a..fe0b9fb2e75 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,6 +7,12 @@ $Id$
 1. Added catch of BooleanQuery$TooManyClauses in QueryParser to throw
    ParseException instead. (Erik Hatcher)
 
+ 2. Modified StopFilter to increment positions to account for
+    stop words removed. This prevents exact phrase queries from
+    matching erroneously (use slop factor to account for missing
+    stop words). StopFilter is used by StopAnalyzer, StandardAnalyzer
+    and some others. (Erik Hatcher)
+
 1.3 RC3
 
 1. Added minMergeDocs in IndexWriter. This can be raised to speed
diff --git a/src/java/org/apache/lucene/analysis/StopFilter.java b/src/java/org/apache/lucene/analysis/StopFilter.java
index 15f2a73a974..1975ae70b9d 100644
--- a/src/java/org/apache/lucene/analysis/StopFilter.java
+++ b/src/java/org/apache/lucene/analysis/StopFilter.java
@@ -57,29 +57,33 @@ package org.apache.lucene.analysis;
 import java.io.IOException;
 import java.util.Hashtable;
 
-/** Removes stop words from a token stream. */
-
+/**
+ * Removes stop words from a token stream. Position increments
+ * on tokens emitted are adjusted to account for words
+ * removed. Exact phrase queries will not match across holes left
+ * by stop word removal, but sloppy phrase queries may match.
+ */
 public final class StopFilter extends TokenFilter {
 
   private Hashtable table;
 
   /** Constructs a filter which removes words from the input
-  TokenStream that are named in the array of words. */
+      TokenStream that are named in the array of words. */
   public StopFilter(TokenStream in, String[] stopWords) {
     super(in);
     table = makeStopTable(stopWords);
   }
 
   /** Constructs a filter which removes words from the input
-  TokenStream that are named in the Hashtable. */
+      TokenStream that are named in the Hashtable. */
   public StopFilter(TokenStream in, Hashtable stopTable) {
     super(in);
     table = stopTable;
   }
-  
+
   /** Builds a Hashtable from an array of stop words, appropriate for passing
-    into the StopFilter constructor. This permits this table construction to
-    be cached once when an Analyzer is constructed. */
+      into the StopFilter constructor. This permits this table construction to
+      be cached once when an Analyzer is constructed. */
   public static final Hashtable makeStopTable(String[] stopWords) {
     Hashtable stopTable = new Hashtable(stopWords.length);
     for (int i = 0; i < stopWords.length; i++)
@@ -89,10 +93,18 @@ public final class StopFilter extends TokenFilter {
   /** Returns the next input Token whose termText() is not a stop word.
    */
   public final Token next() throws IOException {
+    int position = 1;
+
     // return the first non-stop word found
-    for (Token token = input.next(); token != null; token = input.next())
-      if (table.get(token.termText) == null)
-        return token;
+    for (Token token = input.next(); token != null; token = input.next()) {
+      if (table.get(token.termText) == null) {
+        token.setPositionIncrement(position);
+        position = 1;
+        return token;
+      }
+
+      position++;
+    }
     // reached EOS -- return null
     return null;
   }
diff --git a/src/test/org/apache/lucene/analysis/TestStopAnalyzer.java b/src/test/org/apache/lucene/analysis/TestStopAnalyzer.java
new file mode 100644
index 00000000000..1508aa1f3ad
--- /dev/null
+++ b/src/test/org/apache/lucene/analysis/TestStopAnalyzer.java
@@ -0,0 +1,89 @@
+package org.apache.lucene.analysis;
+
+import junit.framework.TestCase;
+import java.io.StringReader;
+import java.util.ArrayList;
+
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Hits;
+
+public class TestStopAnalyzer extends TestCase {
+  private StopAnalyzer stopAnalyzer = new StopAnalyzer();
+
+  public Token[] tokensFromAnalyzer(Analyzer analyzer, String text)
+      throws Exception {
+    TokenStream stream =
+      analyzer.tokenStream("contents", new StringReader(text));
+    ArrayList tokenList = new ArrayList();
+    while (true) {
+      Token token = stream.next();
+      if (token == null) break;
+
+      tokenList.add(token);
+    }
+
+    return (Token[]) tokenList.toArray(new Token[0]);
+  }
+
+
+  public void testNoHoles() throws Exception {
+    Token[] tokens = tokensFromAnalyzer(stopAnalyzer,
+                                        "non-stop words");
+
+    assertEquals(3, tokens.length);
+
+    // ensure all words are in successive positions
+    assertEquals("non", 1, tokens[0].getPositionIncrement());
+    assertEquals("stop", 1, tokens[1].getPositionIncrement());
+    assertEquals("words", 1, tokens[2].getPositionIncrement());
+  }
+
+  public void testHoles() throws Exception {
+    Token[] tokens = tokensFromAnalyzer(stopAnalyzer,
+                                        "the stop words are here");
+
+    assertEquals(3, tokens.length);
+
+    // check for the holes noted by position gaps
+    assertEquals("stop", 2, tokens[0].getPositionIncrement());
+    assertEquals("words", 1, tokens[1].getPositionIncrement());
+    assertEquals("here", 2, tokens[2].getPositionIncrement());
+  }
+
+  public void testPhraseQuery() throws Exception {
+    RAMDirectory directory = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(directory, stopAnalyzer, true);
+    Document doc = new Document();
+    doc.add(Field.Text("field", "the stop words are here"));
+    writer.addDocument(doc);
+    writer.close();
+
+    IndexSearcher searcher = new IndexSearcher(directory);
+
+    // valid exact phrase query
+    PhraseQuery query = new PhraseQuery();
+    query.add(new Term("field","stop"));
+    query.add(new Term("field","words"));
+    Hits hits = searcher.search(query);
+    assertEquals(1, hits.length());
+
+    // incorrect attempt at exact phrase query over stop word hole
+    query = new PhraseQuery();
+    query.add(new Term("field", "words"));
+    query.add(new Term("field", "here"));
+    hits = searcher.search(query);
+    assertEquals(0, hits.length());
+
+    // add some slop, and match over the hole
+    query.setSlop(1);
+    hits = searcher.search(query);
+    assertEquals(1, hits.length());
+
+    searcher.close();
+  }
+}
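
For readers following the position-increment change: below is a minimal sketch (not part of this patch) of how the adjusted increments look from client code. It uses only the 1.3-era API already exercised by the patch and its test (Analyzer.tokenStream(), TokenStream.next(), Token.termText(), Token.getPositionIncrement()); the class name StopFilterPositionsDemo is hypothetical.

    import java.io.StringReader;
    import org.apache.lucene.analysis.*;

    // Hypothetical demo class, not part of this patch.
    public class StopFilterPositionsDemo {
      public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StopAnalyzer();
        TokenStream stream = analyzer.tokenStream(
            "contents", new StringReader("the stop words are here"));

        // Absolute position is the running sum of increments; a sum that
        // jumps by more than 1 marks a hole left by removed stop words.
        int position = 0;
        for (Token token = stream.next(); token != null; token = stream.next()) {
          position += token.getPositionIncrement();
          System.out.println(token.termText() + " @ " + position);
        }
        stream.close();
        // Prints: stop @ 2, words @ 3, here @ 5 (one line each),
        // matching the increments asserted in testHoles() above.
      }
    }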
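One design point worth spelling out, as a hedged extension of testPhraseQuery(): slop is a positional edit distance, so a phrase whose terms end up n positions further apart in the index than written in the query needs slop >= n. The test shows slop 1 bridging the single "are" hole between "words" (position 3) and "here" (position 5). The sketch below, a hypothetical extra test method that is not part of this patch, applies the same arithmetic to "stop" (position 2) and "here" (position 5): the indexed "words" plus the "are" hole put them two positions beyond the phrase's expectation, so slop 1 fails and slop 2 matches.

    // Hypothetical addition to TestStopAnalyzer, reusing the fixture
    // from testPhraseQuery(); not part of this patch.
    public void testSlopOverWiderHole() throws Exception {
      RAMDirectory directory = new RAMDirectory();
      IndexWriter writer = new IndexWriter(directory, stopAnalyzer, true);
      Document doc = new Document();
      doc.add(Field.Text("field", "the stop words are here"));
      writer.addDocument(doc);
      writer.close();

      IndexSearcher searcher = new IndexSearcher(directory);

      // "stop" is indexed at position 2 and "here" at position 5, i.e.
      // two positions further apart than the phrase query expects
      PhraseQuery query = new PhraseQuery();
      query.add(new Term("field", "stop"));
      query.add(new Term("field", "here"));

      query.setSlop(1);   // not enough to cover a displacement of two
      assertEquals(0, searcher.search(query).length());

      query.setSlop(2);   // spans the indexed "words" plus the "are" hole
      assertEquals(1, searcher.search(query).length());

      searcher.close();
    }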