mirror of https://github.com/apache/lucene.git
Use position increments to account for removed stop words
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150148 13f79535-47bb-0310-9956-ffa450edef68
parent 40dd950e3f
commit fd5806ddf2
CHANGES.txt
@@ -7,6 +7,12 @@ $Id$
 1. Added catch of BooleanQuery$TooManyClauses in QueryParser to
    throw ParseException instead. (Erik Hatcher)
 
+2. Modified StopFilter to increment positions to account for
+   stop words removed. This prevents exact phrase queries from
+   matching erroneously (use slop factor to account for missing
+   stop words). StopFilter is used by StopAnalyzer, StandardAnalyzer
+   and some others. (Erik Hatcher)
+
 1.3 RC3
 
 1. Added minMergeDocs in IndexWriter. This can be raised to speed
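The arithmetic behind that CHANGES entry: each emitted token carries a position increment equal to 1 plus the number of stop words removed immediately before it, so the running sum of increments preserves the original word positions, holes included. Below is a minimal, self-contained sketch of that bookkeeping (not the Lucene API; the stop set and token list are made up for illustration):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class IncrementSketch {
  public static void main(String[] args) {
    Set<String> stops = new HashSet<String>(Arrays.asList("the", "are"));
    String[] tokens = {"the", "stop", "words", "are", "here"};

    int increment = 1; // 1 means "directly follows the previous emitted token"
    int position = 0;  // absolute position, recovered by summing increments
    for (String t : tokens) {
      if (stops.contains(t)) {
        increment++;   // leave a hole for the removed word
        continue;
      }
      position += increment;
      System.out.println(t + " @ " + position + " (increment " + increment + ")");
      increment = 1;
    }
  }
}

This prints stop @ 2 (increment 2), words @ 3 (increment 1), here @ 5 (increment 2): the same increments the new TestStopAnalyzer below asserts for "the stop words are here".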
org/apache/lucene/analysis/StopFilter.java
@@ -57,29 +57,33 @@ package org.apache.lucene.analysis;
 import java.io.IOException;
 import java.util.Hashtable;
 
-/** Removes stop words from a token stream. */
+/**
+ * Removes stop words from a token stream. Position increments
+ * on tokens emitted are adjusted to account for words
+ * removed. Exact phrase queries will not match across holes left
+ * by stop word removal, but sloppy phrase queries may match.
+ */
 public final class StopFilter extends TokenFilter {
 
   private Hashtable table;
 
   /** Constructs a filter which removes words from the input
-  TokenStream that are named in the array of words. */
+      TokenStream that are named in the array of words. */
   public StopFilter(TokenStream in, String[] stopWords) {
     super(in);
     table = makeStopTable(stopWords);
   }
 
   /** Constructs a filter which removes words from the input
-  TokenStream that are named in the Hashtable. */
+      TokenStream that are named in the Hashtable. */
   public StopFilter(TokenStream in, Hashtable stopTable) {
     super(in);
     table = stopTable;
   }
 
-
   /** Builds a Hashtable from an array of stop words, appropriate for passing
-  into the StopFilter constructor. This permits this table construction to
-  be cached once when an Analyzer is constructed. */
+     into the StopFilter constructor. This permits this table construction to
+     be cached once when an Analyzer is constructed. */
   public static final Hashtable makeStopTable(String[] stopWords) {
     Hashtable stopTable = new Hashtable(stopWords.length);
     for (int i = 0; i < stopWords.length; i++)
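The makeStopTable javadoc above is about caching: build the table once when an Analyzer is constructed, then reuse it for every tokenStream() call. A hedged sketch of that pattern against this era's API (MyStopAnalyzer and its word list are invented for illustration; LowerCaseTokenizer is the tokenizer StopAnalyzer itself wraps):

package org.apache.lucene.analysis;

import java.io.Reader;
import java.util.Hashtable;

public class MyStopAnalyzer extends Analyzer {
  private static final String[] STOP_WORDS = { "the", "are" };

  // built once per Analyzer instance, shared by all tokenStream() calls
  private Hashtable stopTable = StopFilter.makeStopTable(STOP_WORDS);

  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new StopFilter(new LowerCaseTokenizer(reader), stopTable);
  }
}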
@@ -89,10 +93,18 @@ public final class StopFilter extends TokenFilter {
 
   /** Returns the next input Token whose termText() is not a stop word. */
   public final Token next() throws IOException {
+    int position = 1;
+
     // return the first non-stop word found
-    for (Token token = input.next(); token != null; token = input.next())
-      if (table.get(token.termText) == null)
-        return token;
+    for (Token token = input.next(); token != null; token = input.next()) {
+      if (table.get(token.termText) == null) {
+        token.setPositionIncrement(position);
+        position = 1;
+        return token;
+      }
+
+      position++;
+    }
     // reached EOS -- return null
     return null;
   }
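Why the new class javadoc says exact phrase queries no longer match across holes while sloppy ones may: after this change, "words" and "here" from "the stop words are here" land at absolute positions 3 and 5, so a phrase with slop 0 fails there while slop 1 succeeds. A simplified two-term sketch of that check (real PhraseQuery matching is more general; phraseMatches is a stand-in, not a Lucene method):

public class SlopSketch {
  // A two-term phrase matches when the second term is within `slop`
  // positions of the slot immediately after the first term.
  static boolean phraseMatches(int firstPos, int secondPos, int slop) {
    return Math.abs(secondPos - (firstPos + 1)) <= slop;
  }

  public static void main(String[] args) {
    System.out.println(phraseMatches(2, 3, 0)); // "stop words": true, no hole
    System.out.println(phraseMatches(3, 5, 0)); // "words here": false, hole left by "are"
    System.out.println(phraseMatches(3, 5, 1)); // "words here" with slop 1: true
  }
}

The three calls mirror the three searches in testPhraseQuery in the new test below.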
org/apache/lucene/analysis/TestStopAnalyzer.java (new file)
@@ -0,0 +1,89 @@
+package org.apache.lucene.analysis;
+
+import junit.framework.TestCase;
+import java.io.StringReader;
+import java.util.ArrayList;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Hits;
+
+public class TestStopAnalyzer extends TestCase {
+  private StopAnalyzer stopAnalyzer = new StopAnalyzer();
+
+  public Token[] tokensFromAnalyzer(Analyzer analyzer, String text)
+      throws Exception {
+    TokenStream stream =
+        analyzer.tokenStream("contents", new StringReader(text));
+    ArrayList tokenList = new ArrayList();
+    while (true) {
+      Token token = stream.next();
+      if (token == null) break;
+
+      tokenList.add(token);
+    }
+
+    return (Token[]) tokenList.toArray(new Token[0]);
+  }
+
+
+  public void testNoHoles() throws Exception {
+    Token[] tokens = tokensFromAnalyzer(stopAnalyzer,
+                                        "non-stop words");
+
+    assertEquals(3, tokens.length);
+
+    // ensure all words are in successive positions
+    assertEquals("non", 1, tokens[0].getPositionIncrement());
+    assertEquals("stop", 1, tokens[1].getPositionIncrement());
+    assertEquals("words", 1, tokens[2].getPositionIncrement());
+  }
+
+  public void testHoles() throws Exception {
+    Token[] tokens = tokensFromAnalyzer(stopAnalyzer,
+                                        "the stop words are here");
+
+    assertEquals(3, tokens.length);
+
+    // check for the holes noted by position gaps
+    assertEquals("stop", 2, tokens[0].getPositionIncrement());
+    assertEquals("words", 1, tokens[1].getPositionIncrement());
+    assertEquals("here", 2, tokens[2].getPositionIncrement());
+  }
+
+  public void testPhraseQuery() throws Exception {
+    RAMDirectory directory = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(directory, stopAnalyzer, true);
+    Document doc = new Document();
+    doc.add(Field.Text("field", "the stop words are here"));
+    writer.addDocument(doc);
+    writer.close();
+
+    IndexSearcher searcher = new IndexSearcher(directory);
+
+    // valid exact phrase query
+    PhraseQuery query = new PhraseQuery();
+    query.add(new Term("field", "stop"));
+    query.add(new Term("field", "words"));
+    Hits hits = searcher.search(query);
+    assertEquals(1, hits.length());
+
+    // incorrect attempt at exact phrase query over stop word hole
+    query = new PhraseQuery();
+    query.add(new Term("field", "words"));
+    query.add(new Term("field", "here"));
+    hits = searcher.search(query);
+    assertEquals(0, hits.length());
+
+    // add some slop, and match over the hole
+    query.setSlop(1);
+    hits = searcher.search(query);
+    assertEquals(1, hits.length());
+
+    searcher.close();
+  }
+}