Use position increments to account for removed stop words

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150148 13f79535-47bb-0310-9956-ffa450edef68
Erik Hatcher 2003-11-28 02:03:14 +00:00
parent 40dd950e3f
commit fd5806ddf2
3 changed files with 117 additions and 10 deletions


@@ -7,6 +7,12 @@ $Id$
 
  1. Added catch of BooleanQuery$TooManyClauses in QueryParser to
     throw ParseException instead. (Erik Hatcher)
 
+ 2. Modified StopFilter to increment positions to account for
+    stop words removed. This prevents exact phrase queries from
+    matching erroneously (use slop factor to account for missing
+    stop words). StopFilter is used by StopAnalyzer, StandardAnalyzer
+    and some others. (Erik Hatcher)
+
 1.3 RC3
 
  1. Added minMergeDocs in IndexWriter. This can be raised to speed
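
The changelog entry above is the heart of the commit: when StopFilter drops a token, the gap is recorded on the next token it emits rather than silently closed. As an illustration (not part of this commit), a minimal sketch of that behavior, assuming the 1.3-era analysis API and the StopAnalyzer.ENGLISH_STOP_WORDS word list:

    import java.io.StringReader;
    import org.apache.lucene.analysis.*;

    public class StopFilterDemo {
      public static void main(String[] args) throws Exception {
        // "the" is a stop word; StopFilter drops it and records the hole
        // in the position increment of the next token it returns.
        TokenStream stream = new StopFilter(
            new LowerCaseTokenizer(new StringReader("the quick brown fox")),
            StopAnalyzer.ENGLISH_STOP_WORDS);
        for (Token t = stream.next(); t != null; t = stream.next())
          // prints: quick 2, brown 1, fox 1 -- "quick" carries the gap
          System.out.println(t.termText() + " " + t.getPositionIncrement());
      }
    }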


@@ -57,29 +57,33 @@ package org.apache.lucene.analysis;
 
 import java.io.IOException;
 import java.util.Hashtable;
 
-/** Removes stop words from a token stream. */
+/**
+ * Removes stop words from a token stream. Position increments
+ * on tokens emitted are adjusted to account for words
+ * removed. Exact phrase queries will not match across holes left
+ * by stop word removal, but sloppy phrase queries may match.
+ */
 
 public final class StopFilter extends TokenFilter {
 
   private Hashtable table;
 
   /** Constructs a filter which removes words from the input
-    TokenStream that are named in the array of words. */
+      TokenStream that are named in the array of words. */
   public StopFilter(TokenStream in, String[] stopWords) {
     super(in);
     table = makeStopTable(stopWords);
   }
 
   /** Constructs a filter which removes words from the input
-    TokenStream that are named in the Hashtable. */
+      TokenStream that are named in the Hashtable. */
   public StopFilter(TokenStream in, Hashtable stopTable) {
     super(in);
     table = stopTable;
   }
 
   /** Builds a Hashtable from an array of stop words, appropriate for passing
-    into the StopFilter constructor. This permits this table construction to
-    be cached once when an Analyzer is constructed. */
+      into the StopFilter constructor. This permits this table construction to
+      be cached once when an Analyzer is constructed. */
   public static final Hashtable makeStopTable(String[] stopWords) {
     Hashtable stopTable = new Hashtable(stopWords.length);
     for (int i = 0; i < stopWords.length; i++)
@@ -89,10 +93,18 @@ public final class StopFilter extends TokenFilter {
 
   /** Returns the next input Token whose termText() is not a stop word. */
   public final Token next() throws IOException {
+    int position = 1;
+
     // return the first non-stop word found
-    for (Token token = input.next(); token != null; token = input.next())
-      if (table.get(token.termText) == null)
-        return token;
+    for (Token token = input.next(); token != null; token = input.next()) {
+      if (table.get(token.termText) == null) {
+        token.setPositionIncrement(position);
+        position = 1;
+        return token;
+      }
+
+      position++;
+    }
 
     // reached EOS -- return null
     return null;
   }
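
The makeStopTable javadoc above recommends building the table once per Analyzer instead of on every tokenStream() call. A hedged sketch of that caching pattern (CachedStopAnalyzer is a hypothetical name, not part of this commit, and ENGLISH_STOP_WORDS is assumed from StopAnalyzer):

    import java.io.Reader;
    import java.util.Hashtable;
    import org.apache.lucene.analysis.*;

    public class CachedStopAnalyzer extends Analyzer {
      // built once at class-load time, shared by every tokenStream() call
      private static final Hashtable STOP_TABLE =
          StopFilter.makeStopTable(StopAnalyzer.ENGLISH_STOP_WORDS);

      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new StopFilter(new LowerCaseTokenizer(reader), STOP_TABLE);
      }
    }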


@@ -0,0 +1,89 @@
+package org.apache.lucene.analysis;
+
+import junit.framework.TestCase;
+
+import java.io.StringReader;
+import java.util.ArrayList;
+
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Hits;
+
+public class TestStopAnalyzer extends TestCase {
+  private StopAnalyzer stopAnalyzer = new StopAnalyzer();
+
+  public Token[] tokensFromAnalyzer(Analyzer analyzer, String text)
+      throws Exception {
+    TokenStream stream =
+        analyzer.tokenStream("contents", new StringReader(text));
+    ArrayList tokenList = new ArrayList();
+    while (true) {
+      Token token = stream.next();
+      if (token == null) break;
+
+      tokenList.add(token);
+    }
+
+    return (Token[]) tokenList.toArray(new Token[0]);
+  }
+
+  public void testNoHoles() throws Exception {
+    Token[] tokens = tokensFromAnalyzer(stopAnalyzer, "non-stop words");
+    assertEquals(3, tokens.length);
+
+    // ensure all words are in successive positions
+    assertEquals("non", 1, tokens[0].getPositionIncrement());
+    assertEquals("stop", 1, tokens[1].getPositionIncrement());
+    assertEquals("words", 1, tokens[2].getPositionIncrement());
+  }
+
+  public void testHoles() throws Exception {
+    Token[] tokens = tokensFromAnalyzer(stopAnalyzer,
+                                        "the stop words are here");
+    assertEquals(3, tokens.length);
+
+    // check for the holes noted by position gaps
+    assertEquals("stop", 2, tokens[0].getPositionIncrement());
+    assertEquals("words", 1, tokens[1].getPositionIncrement());
+    assertEquals("here", 2, tokens[2].getPositionIncrement());
+  }
+
+  public void testPhraseQuery() throws Exception {
+    RAMDirectory directory = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(directory, stopAnalyzer, true);
+    Document doc = new Document();
+    doc.add(Field.Text("field", "the stop words are here"));
+    writer.addDocument(doc);
+    writer.close();
+
+    IndexSearcher searcher = new IndexSearcher(directory);
+
+    // valid exact phrase query
+    PhraseQuery query = new PhraseQuery();
+    query.add(new Term("field", "stop"));
+    query.add(new Term("field", "words"));
+    Hits hits = searcher.search(query);
+    assertEquals(1, hits.length());
+
+    // incorrect attempt at exact phrase query over stop word hole
+    query = new PhraseQuery();
+    query.add(new Term("field", "words"));
+    query.add(new Term("field", "here"));
+    hits = searcher.search(query);
+    assertEquals(0, hits.length());
+
+    // add some slop, and match over the hole
+    query.setSlop(1);
+    hits = searcher.search(query);
+    assertEquals(1, hits.length());
+
+    searcher.close();
+  }
+}
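
The slop that testPhraseQuery sets programmatically can also be requested through the query syntax. A sketch, assuming the 1.3-era static QueryParser.parse and its "~n" phrase-slop suffix (not part of this commit):

    import org.apache.lucene.analysis.StopAnalyzer;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.Query;

    public class SloppyPhraseDemo {
      public static void main(String[] args) throws Exception {
        // "~1" asks for slop 1, enough to match across the one-position
        // hole left where "are" was removed from "the stop words are here"
        Query query =
            QueryParser.parse("\"words here\"~1", "field", new StopAnalyzer());
        System.out.println(query);
      }
    }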