LUCENE-1095: option added to StopFilter and QueryParser to consider position increments.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@607591 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Doron Cohen 2007-12-30 21:19:17 +00:00
parent b367e863e6
commit f4639c0ab0
8 changed files with 364 additions and 31 deletions

View File

@ -266,6 +266,14 @@ New features
11. LUCENE-1019: CustomScoreQuery enhanced to support multiple
ValueSource queries. (Kyle Maxwell via Doron Cohen)
12. LUCENE-1095: Added an option to StopFilter to increase
positionIncrement of the token succeeding a stopped token.
Disabled by default. Similar option added to QueryParser
to consider token positions when creating PhraseQuery
and MultiPhraseQuery. Disabled by default (so by default
the query parser ignores position increments).
(Doron Cohen)
Optimizations

View File

@ -27,7 +27,10 @@ import java.util.Set;
public final class StopFilter extends TokenFilter {
private static boolean ENABLE_POSITION_INCREMENTS_DEFAULT = false;
private final CharArraySet stopWords;
private boolean enablePositionIncrements = ENABLE_POSITION_INCREMENTS_DEFAULT;
/**
* Construct a token stream filtering the given input.
@ -111,11 +114,58 @@ public final class StopFilter extends TokenFilter {
*/
public final Token next(Token result) throws IOException {
// return the first non-stop word found
int skippedPositions = 0;
while((result = input.next(result)) != null) {
if (!stopWords.contains(result.termBuffer(), 0, result.termLength))
if (!stopWords.contains(result.termBuffer(), 0, result.termLength)) {
if (enablePositionIncrements) {
result.setPositionIncrement(result.getPositionIncrement() + skippedPositions);
}
return result;
}
skippedPositions += result.getPositionIncrement();
}
// reached EOS -- return null
return null;
}
/**
* @see #setEnablePositionIncrementsDefault(boolean).
*/
public static boolean getEnablePositionIncrementsDefault() {
return ENABLE_POSITION_INCREMENTS_DEFAULT;
}
/**
* Set the default position increments behavior of every StopFilter created from now on.
* <p>
* Note: behavior of a single StopFilter instance can be modified
* with {@link #setEnablePositionIncrements(boolean)}.
* This static method allows control over behavior of classes using StopFilters internally,
* for example {@link org.apache.lucene.analysis.standard.StandardAnalyzer StandardAnalyzer}.
* <p>
* Default : false.
* @see #setEnablePositionIncrements(boolean).
*/
public static void setEnablePositionIncrementsDefault(boolean defaultValue) {
ENABLE_POSITION_INCREMENTS_DEFAULT = defaultValue;
}
/**
* @see #setEnablePositionIncrements(boolean).
*/
public boolean getEnablePositionIncrements() {
return enablePositionIncrements;
}
/**
* Set to <code>true</code> to make <b>this</b> StopFilter enable position increments to result tokens.
* <p>
* When set, when a token is stopped (omitted), the position increment of
* the following token is incremented.
* <p>
* Default: see {@link #setEnablePositionIncrementsDefault(boolean)}.
*/
public void setEnablePositionIncrements(boolean enable) {
this.enablePositionIncrements = enable;
}
}

View File

@ -100,6 +100,7 @@ public class QueryParser implements QueryParserConstants {
boolean lowercaseExpandedTerms = true;
boolean useOldRangeQuery= false;
boolean allowLeadingWildcard = false;
boolean enablePositionIncrements = false;
Analyzer analyzer;
String field;
@ -234,12 +235,33 @@ public class QueryParser implements QueryParserConstants {
}
/**
* @see #setAllowLeadingWildcard
* @see #setAllowLeadingWildcard(boolean)
*/
public boolean getAllowLeadingWildcard() {
return allowLeadingWildcard;
}
/**
* Set to <code>true</code> to enable position increments in result query.
* <p>
* When set, result phrase and multi-phrase queries will
* be aware of position increments.
* Useful when e.g. a StopFilter increases the position increment of
* the token that follows an omitted token.
* <p>
* Default: false.
*/
public void setEnablePositionIncrements(boolean enable) {
this.enablePositionIncrements = enable;
}
/**
* @see #setEnablePositionIncrements(boolean)
*/
public boolean getEnablePositionIncrements() {
return enablePositionIncrements;
}
/**
* Sets the boolean operator of the QueryParser.
* In default mode (<code>OR_OPERATOR</code>) terms without any modifiers
@ -478,27 +500,42 @@ public class QueryParser implements QueryParserConstants {
MultiPhraseQuery mpq = new MultiPhraseQuery();
mpq.setSlop(phraseSlop);
List multiTerms = new ArrayList();
int position = -1;
for (int i = 0; i < v.size(); i++) {
t = (org.apache.lucene.analysis.Token) v.elementAt(i);
if (t.getPositionIncrement() == 1 && multiTerms.size() > 0) {
mpq.add((Term[])multiTerms.toArray(new Term[0]));
if (t.getPositionIncrement() > 0 && multiTerms.size() > 0) {
if (enablePositionIncrements) {
mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
} else {
mpq.add((Term[])multiTerms.toArray(new Term[0]));
}
multiTerms.clear();
}
position += t.getPositionIncrement();
multiTerms.add(new Term(field, t.termText()));
}
mpq.add((Term[])multiTerms.toArray(new Term[0]));
if (enablePositionIncrements) {
mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
} else {
mpq.add((Term[])multiTerms.toArray(new Term[0]));
}
return mpq;
}
}
else {
PhraseQuery q = new PhraseQuery();
q.setSlop(phraseSlop);
PhraseQuery pq = new PhraseQuery();
pq.setSlop(phraseSlop);
int position = -1;
for (int i = 0; i < v.size(); i++) {
q.add(new Term(field, ((org.apache.lucene.analysis.Token)
v.elementAt(i)).termText()));
t = (org.apache.lucene.analysis.Token) v.elementAt(i);
if (enablePositionIncrements) {
position += t.getPositionIncrement();
pq.add(new Term(field, t.termText()),position);
} else {
pq.add(new Term(field, t.termText()));
}
}
return q;
return pq;
}
}
}
@ -1262,12 +1299,6 @@ public class QueryParser implements QueryParserConstants {
finally { jj_save(0, xla); }
}
final private boolean jj_3R_3() {
if (jj_scan_token(STAR)) return true;
if (jj_scan_token(COLON)) return true;
return false;
}
final private boolean jj_3R_2() {
if (jj_scan_token(TERM)) return true;
if (jj_scan_token(COLON)) return true;
@ -1284,6 +1315,12 @@ public class QueryParser implements QueryParserConstants {
return false;
}
final private boolean jj_3R_3() {
if (jj_scan_token(STAR)) return true;
if (jj_scan_token(COLON)) return true;
return false;
}
public QueryParserTokenManager token_source;
public Token token, jj_nt;
private int jj_ntk;

View File

@ -124,6 +124,7 @@ public class QueryParser {
boolean lowercaseExpandedTerms = true;
boolean useOldRangeQuery= false;
boolean allowLeadingWildcard = false;
boolean enablePositionIncrements = false;
Analyzer analyzer;
String field;
@ -258,12 +259,33 @@ public class QueryParser {
}
/**
* @see #setAllowLeadingWildcard
* @see #setAllowLeadingWildcard(boolean)
*/
public boolean getAllowLeadingWildcard() {
return allowLeadingWildcard;
}
/**
* Set to <code>true</code> to enable position increments in result query.
* <p>
* When set, result phrase and multi-phrase queries will
* be aware of position increments.
* Useful when e.g. a StopFilter increases the position increment of
* the token that follows an omitted token.
* <p>
* Default: false.
*/
public void setEnablePositionIncrements(boolean enable) {
this.enablePositionIncrements = enable;
}
/**
* @see #setEnablePositionIncrements(boolean)
*/
public boolean getEnablePositionIncrements() {
return enablePositionIncrements;
}
/**
* Sets the boolean operator of the QueryParser.
* In default mode (<code>OR_OPERATOR</code>) terms without any modifiers
@ -502,27 +524,42 @@ public class QueryParser {
MultiPhraseQuery mpq = new MultiPhraseQuery();
mpq.setSlop(phraseSlop);
List multiTerms = new ArrayList();
int position = -1;
for (int i = 0; i < v.size(); i++) {
t = (org.apache.lucene.analysis.Token) v.elementAt(i);
if (t.getPositionIncrement() == 1 && multiTerms.size() > 0) {
mpq.add((Term[])multiTerms.toArray(new Term[0]));
if (t.getPositionIncrement() > 0 && multiTerms.size() > 0) {
if (enablePositionIncrements) {
mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
} else {
mpq.add((Term[])multiTerms.toArray(new Term[0]));
}
multiTerms.clear();
}
position += t.getPositionIncrement();
multiTerms.add(new Term(field, t.termText()));
}
mpq.add((Term[])multiTerms.toArray(new Term[0]));
if (enablePositionIncrements) {
mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
} else {
mpq.add((Term[])multiTerms.toArray(new Term[0]));
}
return mpq;
}
}
else {
PhraseQuery q = new PhraseQuery();
q.setSlop(phraseSlop);
PhraseQuery pq = new PhraseQuery();
pq.setSlop(phraseSlop);
int position = -1;
for (int i = 0; i < v.size(); i++) {
q.add(new Term(field, ((org.apache.lucene.analysis.Token)
v.elementAt(i)).termText()));
t = (org.apache.lucene.analysis.Token) v.elementAt(i);
if (enablePositionIncrements) {
position += t.getPositionIncrement();
pq.add(new Term(field, t.termText()),position);
} else {
pq.add(new Term(field, t.termText()));
}
}
return q;
return pq;
}
}
}

View File

@ -64,7 +64,33 @@ public class TestStopAnalyzer extends LuceneTestCase {
while ((token = stream.next()) != null) {
String text = token.termText();
assertFalse(stopWordsSet.contains(text));
assertEquals(1,token.getPositionIncrement()); // by default stop tokenizer does not apply increments.
}
}
public void testStopListPositions() throws IOException {
boolean defaultEnable = StopFilter.getEnablePositionIncrementsDefault();
StopFilter.setEnablePositionIncrementsDefault(true);
try {
Set stopWordsSet = new HashSet();
stopWordsSet.add("good");
stopWordsSet.add("test");
stopWordsSet.add("analyzer");
StopAnalyzer newStop = new StopAnalyzer((String[])stopWordsSet.toArray(new String[3]));
StringReader reader = new StringReader("This is a good test of the english stop analyzer with positions");
int expectedIncr[] = { 1, 1, 1, 3, 1, 1, 1, 2, 1};
TokenStream stream = newStop.tokenStream("test", reader);
assertNotNull(stream);
Token token = null;
int i = 0;
while ((token = stream.next()) != null) {
String text = token.termText();
assertFalse(stopWordsSet.contains(text));
assertEquals(expectedIncr[i++],token.getPositionIncrement());
}
} finally {
StopFilter.setEnablePositionIncrementsDefault(defaultEnable);
}
}
}

View File

@ -16,10 +16,12 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Set;
/**
@ -27,6 +29,8 @@ import java.util.Set;
*/
public class TestStopFilter extends LuceneTestCase {
private final static boolean VERBOSE = false;
// other StopFilter functionality is already tested by TestStopAnalyzer
public void testExactCase() throws IOException {
@ -56,4 +60,69 @@ public class TestStopFilter extends LuceneTestCase {
assertEquals(null, stream.next());
}
/**
* Test Position increments applied by StopFilter with and without enabling this option.
*/
public void testStopPositons() throws IOException {
StringBuffer sb = new StringBuffer();
ArrayList a = new ArrayList();
for (int i=0; i<20; i++) {
String w = English.intToEnglish(i).trim();
sb.append(w).append(" ");
if (i%3 != 0) a.add(w);
}
log(sb.toString());
String stopWords[] = (String[]) a.toArray(new String[0]);
for (int i=0; i<a.size(); i++) log("Stop: "+stopWords[i]);
Set stopSet = StopFilter.makeStopSet(stopWords);
// with increments
StringReader reader = new StringReader(sb.toString());
StopFilter stpf = new StopFilter(new WhitespaceTokenizer(reader), stopSet);
doTestStopPositons(stpf,true);
// without increments
reader = new StringReader(sb.toString());
stpf = new StopFilter(new WhitespaceTokenizer(reader), stopSet);
doTestStopPositons(stpf,false);
// with increments, concatenating two stop filters
ArrayList a0 = new ArrayList();
ArrayList a1 = new ArrayList();
for (int i=0; i<a.size(); i++) {
if (i%2==0) {
a0.add(a.get(i));
} else {
a1.add(a.get(i));
}
}
String stopWords0[] = (String[]) a0.toArray(new String[0]);
for (int i=0; i<a0.size(); i++) log("Stop0: "+stopWords0[i]);
String stopWords1[] = (String[]) a1.toArray(new String[0]);
for (int i=0; i<a1.size(); i++) log("Stop1: "+stopWords1[i]);
Set stopSet0 = StopFilter.makeStopSet(stopWords0);
Set stopSet1 = StopFilter.makeStopSet(stopWords1);
reader = new StringReader(sb.toString());
StopFilter stpf0 = new StopFilter(new WhitespaceTokenizer(reader), stopSet0); // first part of the set
stpf0.setEnablePositionIncrements(true);
StopFilter stpf01 = new StopFilter(stpf0, stopSet1); // two stop filters concatenated!
doTestStopPositons(stpf01,true);
}
private void doTestStopPositons(StopFilter stpf, boolean enableIcrements) throws IOException {
log("---> test with enable-increments-"+(enableIcrements?"enabled":"disabled"));
stpf.setEnablePositionIncrements(enableIcrements);
for (int i=0; i<20; i+=3) {
Token t = stpf.next();
log("Token "+i+": "+t);
String w = English.intToEnglish(i).trim();
assertEquals("expecting token "+i+" to be "+w,w,t.termText());
assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,t.getPositionIncrement());
}
assertNull(stpf.next());
}
// print debug info depending on VERBOSE
private static void log(String s) {
if (VERBOSE) {
System.out.println(s);
}
}
}

View File

@ -838,19 +838,42 @@ public class TestQueryParser extends LuceneTestCase {
public void testStopwords() throws Exception {
QueryParser qp = new QueryParser("a", new StopAnalyzer(new String[]{"the", "foo"}));
Query result = qp.parse("a:the OR a:foo");
assertTrue("result is null and it shouldn't be", result != null);
assertNotNull("result is null and it shouldn't be", result);
assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);
assertTrue(((BooleanQuery) result).clauses().size() + " does not equal: " + 0, ((BooleanQuery) result).clauses().size() == 0);
result = qp.parse("a:woo OR a:the");
assertTrue("result is null and it shouldn't be", result != null);
assertNotNull("result is null and it shouldn't be", result);
assertTrue("result is not a TermQuery", result instanceof TermQuery);
result = qp.parse("(fieldX:xxxxx OR fieldy:xxxxxxxx)^2 AND (fieldx:the OR fieldy:foo)");
assertTrue("result is null and it shouldn't be", result != null);
assertNotNull("result is null and it shouldn't be", result);
assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);
System.out.println("Result: " + result);
assertTrue(((BooleanQuery) result).clauses().size() + " does not equal: " + 2, ((BooleanQuery) result).clauses().size() == 2);
}
public void testPositionIncrement() throws Exception {
boolean dflt = StopFilter.getEnablePositionIncrementsDefault();
StopFilter.setEnablePositionIncrementsDefault(true);
try {
QueryParser qp = new QueryParser("a", new StopAnalyzer(new String[]{"the", "in", "are", "this"}));
qp.setEnablePositionIncrements(true);
String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\"";
// 0 2 5 7 8
int expectedPositions[] = {1,3,4,6,9};
PhraseQuery pq = (PhraseQuery) qp.parse(qtxt);
//System.out.println("Query text: "+qtxt);
//System.out.println("Result: "+pq);
Term t[] = pq.getTerms();
int pos[] = pq.getPositions();
for (int i = 0; i < t.length; i++) {
//System.out.println(i+". "+t[i]+" pos: "+pos[i]);
assertEquals("term "+i+" = "+t[i]+" has wrong term-position!",expectedPositions[i],pos[i]);
}
} finally {
StopFilter.setEnablePositionIncrementsDefault(dflt);
}
}
public void testMatchAllDocs() throws Exception {
QueryParser qp = new QueryParser("field", new WhitespaceAnalyzer());
assertEquals(new MatchAllDocsQuery(), qp.parse("*:*"));

View File

@ -19,11 +19,14 @@ package org.apache.lucene.search;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
@ -80,6 +83,20 @@ public class TestPositionIncrement extends LuceneTestCase {
hits = searcher.search(q);
assertEquals(0, hits.length());
// same as previous, just specify positions explicitely.
q = new PhraseQuery();
q.add(new Term("field", "1"),0);
q.add(new Term("field", "2"),1);
hits = searcher.search(q);
assertEquals(0, hits.length());
// specifying correct positions should find the phrase.
q = new PhraseQuery();
q.add(new Term("field", "1"),0);
q.add(new Term("field", "2"),2);
hits = searcher.search(q);
assertEquals(1, hits.length());
q = new PhraseQuery();
q.add(new Term("field", "2"));
q.add(new Term("field", "3"));
@ -92,6 +109,28 @@ public class TestPositionIncrement extends LuceneTestCase {
hits = searcher.search(q);
assertEquals(0, hits.length());
// phrase query would find it when correct positions are specified.
q = new PhraseQuery();
q.add(new Term("field", "3"),0);
q.add(new Term("field", "4"),0);
hits = searcher.search(q);
assertEquals(1, hits.length());
// phrase query should fail for non existing searched term
// even if there exist another searched terms in the same searched position.
q = new PhraseQuery();
q.add(new Term("field", "3"),0);
q.add(new Term("field", "9"),0);
hits = searcher.search(q);
assertEquals(0, hits.length());
// multi-phrase query should succed for non existing searched term
// because there exist another searched terms in the same searched position.
MultiPhraseQuery mq = new MultiPhraseQuery();
mq.add(new Term[]{new Term("field", "3"),new Term("field", "9")},0);
hits = searcher.search(mq);
assertEquals(1, hits.length());
q = new PhraseQuery();
q.add(new Term("field", "2"));
q.add(new Term("field", "4"));
@ -115,6 +154,50 @@ public class TestPositionIncrement extends LuceneTestCase {
q.add(new Term("field", "5"));
hits = searcher.search(q);
assertEquals(0, hits.length());
// analyzer to introduce stopwords and increment gaps
Analyzer stpa = new Analyzer() {
final WhitespaceAnalyzer a = new WhitespaceAnalyzer();
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream ts = a.tokenStream(fieldName,reader);
return new StopFilter(ts,new String[]{"stop"});
}
};
// should not find "1 2" because there is a gap of 1 in the index
QueryParser qp = new QueryParser("field",stpa);
q = (PhraseQuery) qp.parse("\"1 2\"");
hits = searcher.search(q);
assertEquals(0, hits.length());
// omitted stop word cannot help because stop filter swallows the increments.
q = (PhraseQuery) qp.parse("\"1 stop 2\"");
hits = searcher.search(q);
assertEquals(0, hits.length());
// query parser alone won't help, because stop filter swallows the increments.
qp.setEnablePositionIncrements(true);
q = (PhraseQuery) qp.parse("\"1 stop 2\"");
hits = searcher.search(q);
assertEquals(0, hits.length());
boolean dflt = StopFilter.getEnablePositionIncrementsDefault();
try {
// stop filter alone won't help, because query parser swallows the increments.
qp.setEnablePositionIncrements(false);
StopFilter.setEnablePositionIncrementsDefault(true);
q = (PhraseQuery) qp.parse("\"1 stop 2\"");
hits = searcher.search(q);
assertEquals(0, hits.length());
// when both qp qnd stopFilter propagate increments, we should find the doc.
qp.setEnablePositionIncrements(true);
q = (PhraseQuery) qp.parse("\"1 stop 2\"");
hits = searcher.search(q);
assertEquals(1, hits.length());
} finally {
StopFilter.setEnablePositionIncrementsDefault(dflt);
}
}
/**