SOLR-1165: Use Lucene Filters in conjunction with IndexSearcher.search methods

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@777969 13f79535-47bb-0310-9956-ffa450edef68
Yonik Seeley 2009-05-23 17:26:54 +00:00
parent 5a39044cc0
commit ea43e4c997
3 changed files with 170 additions and 77 deletions

CHANGES.txt

@@ -264,6 +264,9 @@ Optimizations
     is ordered for easier impelemntation of skipTo, and is faster
     in the general case. (yonik)
 
+12. SOLR-1165: Use Lucene Filters and pass them down to the Lucene
+    search methods to filter earlier and improve performance. (yonik)
+
 Bug Fixes
 ----------------------
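For context on what "pass them down" means: before this change, Solr applied its cached DocSet inside each HitCollector.collect() call, after Lucene had already scored the document; now the set is handed to Lucene as a Filter so non-matching documents are skipped before scoring. A minimal sketch of such a DocSet-to-Filter bridge against the Lucene 2.x API (the class name BitSetFilter and the java.util.BitSet backing are illustrative; Solr's actual bridge is DocSet.getTopFilter(), visible in the diff below):

import java.io.IOException;
import java.util.BitSet;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.DocIdBitSet;

// Wraps a precomputed set of matching doc ids (e.g. a cached filter
// result) so IndexSearcher.search(query, filter, collector) can skip
// non-matching documents during scoring instead of after collection.
public class BitSetFilter extends Filter {
  private final BitSet docs;  // doc ids that pass the filter

  public BitSetFilter(BitSet docs) {
    this.docs = docs;
  }

  public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    return new DocIdBitSet(docs);  // adapts BitSet to Lucene's DocIdSet
  }
}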

src/java/org/apache/solr/search/SolrIndexSearcher.java

@@ -885,9 +885,11 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
       // do it the normal way...
       cmd.setSupersetMaxDoc(supersetMaxDoc);
       if ((cmd.getFlags() & GET_DOCSET)!=0) {
+        // this currently conflates returning the docset for the base query vs
+        // the base query and all filters.
         DocSet qDocSet = getDocListAndSetNC(qr,cmd);
         // cache the docSet matching the query w/o filtering
-        if (filterCache!=null && !qr.isPartialResults()) filterCache.put(cmd.getQuery(),qDocSet);
+        if (qDocSet!=null && filterCache!=null && !qr.isPartialResults()) filterCache.put(cmd.getQuery(),qDocSet);
       } else {
         getDocListNC(qr,cmd);
         //Parameters: cmd.getQuery(),theFilt,cmd.getSort(),0,supersetMaxDoc,cmd.getFlags(),cmd.getTimeAllowed(),responseHeader);
@@ -922,15 +924,15 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
     Query query = QueryUtils.makeQueryable(cmd.getQuery());
+    final Filter luceneFilter = filter==null ? null : filter.getTopFilter();
 
     // handle zero case...
     if (lastDocRequested<=0) {
-      final DocSet filt = filter;
       final float[] topscore = new float[] { Float.NEGATIVE_INFINITY };
       final int[] numHits = new int[1];
 
       HitCollector hc = new HitCollector() {
         public void collect(int doc, float score) {
-          if (filt!=null && !filt.exists(doc)) return;
           numHits[0]++;
           if (score > topscore[0]) topscore[0]=score;
         }
@@ -939,7 +941,7 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
         hc = new TimeLimitedCollector( hc, timeAllowed );
       }
       try {
-        super.search(query, hc );
+        super.search(query, luceneFilter, hc);
       }
       catch( TimeLimitedCollector.TimeExceededException x ) {
         log.warn( "Query: " + query + "; " + x.getMessage() );
@@ -955,15 +957,12 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
       // can't use TopDocs if there is a sort since it
       // will do automatic score normalization.
       // NOTE: this changed late in Lucene 1.9
 
-      final DocSet filt = filter;
       final int[] numHits = new int[1];
       final FieldSortedHitQueue hq = new FieldSortedHitQueue(reader, cmd.getSort().getSort(), len);
 
       HitCollector hc = new HitCollector() {
         private FieldDoc reusableFD;
         public void collect(int doc, float score) {
-          if (filt!=null && !filt.exists(doc)) return;
           numHits[0]++;
           if (reusableFD == null)
             reusableFD = new FieldDoc(doc, score);
@@ -978,7 +977,7 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
         hc = new TimeLimitedCollector( hc, timeAllowed );
       }
       try {
-        super.search(query, hc );
+        super.search(query, luceneFilter, hc );
       }
       catch( TimeLimitedCollector.TimeExceededException x ) {
         log.warn( "Query: " + query + "; " + x.getMessage() );
@@ -1002,14 +1001,11 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
       // No Sort specified (sort by score descending)
       // This case could be done with TopDocs, but would currently require
       // getting a BitSet filter from a DocSet which may be inefficient.
 
-      final DocSet filt = filter;
       final ScorePriorityQueue hq = new ScorePriorityQueue(lastDocRequested);
       final int[] numHits = new int[1];
       HitCollector hc = new HitCollector() {
         private ScoreDoc reusableSD;
         public void collect(int doc, float score) {
-          if (filt!=null && !filt.exists(doc)) return;
           // TODO: if docs are always delivered in order, we could use "score>minScore"
           // instead of "score>=minScore" and avoid tiebreaking scores
           // in the priority queue.
@@ -1033,7 +1029,7 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
         hc = new TimeLimitedCollector( hc, timeAllowed );
       }
       try {
-        super.search(query, hc );
+        super.search(query, luceneFilter, hc );
       }
       catch( TimeLimitedCollector.TimeExceededException x ) {
         log.warn( "Query: " + query + "; " + x.getMessage() );
@@ -1110,10 +1106,10 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
   }
 
   // the DocSet returned is for the query only, without any filtering... that way it may
   // be cached if desired.
   private DocSet getDocListAndSetNC(QueryResult qr,QueryCommand cmd) throws IOException {
-    ///////////////////// NEW
+
     int len = cmd.getSupersetMaxDoc();
     DocSet filter = cmd.getFilter()!=null ? cmd.getFilter() : getDocSet(cmd.getFilterList());
     int last = len;
@@ -1124,22 +1120,13 @@
     float maxScore;
     int[] ids;
     float[] scores;
 
-    final DocSetHitCollector setHC = new DocSetHitCollector(maxDoc()>>6, maxDoc());
-    final HitCollector collector = ( cmd.getTimeAllowed() > 0 ) ? new TimeLimitedCollector( setHC, cmd.getTimeAllowed() ) : setHC;
+    final DocSetHitCollector collector = new DocSetHitCollector(maxDoc()>>6, maxDoc());
     Query query = QueryUtils.makeQueryable(cmd.getQuery());
+    final long timeAllowed = cmd.getTimeAllowed();
+    final Filter luceneFilter = filter==null ? null : filter.getTopFilter();
 
-    // TODO: perhaps unify getDocListAndSetNC and getDocListNC without imposing a significant performance hit
-    // Comment: gathering the set before the filter is applied allows one to cache
-    // the resulting DocSet under the query.  The drawback is that it requires an
-    // extra intersection with the filter at the end.  This will be a net win
-    // for expensive queries.
-    // Q: what if the final intersection results in a small set from two large
-    // sets... it won't be a HashDocSet or other small set.  One way around
-    // this would be to collect the resulting set as we go (the filter is
-    // checked anyway).
 
     // handle zero case...
     if (lastDocRequested<=0) {
@@ -1147,22 +1134,26 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
       final float[] topscore = new float[] { Float.NEGATIVE_INFINITY };
       final int[] numHits = new int[1];
 
-      try {
-        super.search(query, new HitCollector() {
-          public void collect(int doc, float score) {
-            collector.collect(doc,score);
-            if (filt!=null && !filt.exists(doc)) return;
-            numHits[0]++;
-            if (score > topscore[0]) topscore[0]=score;
-          }
-        }
-        );
+      HitCollector hc = new HitCollector() {
+        public void collect(int doc, float score) {
+          collector.collect(doc, score);
+          numHits[0]++;
+          if (score > topscore[0]) topscore[0]=score;
+        }
+      };
+
+      if( timeAllowed > 0 ) {
+        hc = new TimeLimitedCollector( hc, timeAllowed );
+      }
+
+      try {
+        super.search(query, luceneFilter, hc);
       }
       catch( TimeLimitedCollector.TimeExceededException x ) {
         log.warn( "Query: " + query + "; " + x.getMessage() );
         qr.setPartialResults(true);
       }
 
       nDocsReturned=0;
       ids = new int[nDocsReturned];
       scores = new float[nDocsReturned];
@@ -1173,16 +1164,13 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
       // will do automatic score normalization.
       // NOTE: this changed late in Lucene 1.9
 
-      final DocSet filt = filter;
       final int[] numHits = new int[1];
       final FieldSortedHitQueue hq = new FieldSortedHitQueue(reader, cmd.getSort().getSort(), len);
 
-      try {
-        super.search(query, new HitCollector() {
+      HitCollector hc = new HitCollector() {
         private FieldDoc reusableFD;
         public void collect(int doc, float score) {
-          collector.collect(doc,score);
-          if (filt!=null && !filt.exists(doc)) return;
+          collector.collect(doc, score);
           numHits[0]++;
           if (reusableFD == null)
             reusableFD = new FieldDoc(doc, score);
@@ -1192,14 +1180,20 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
           }
           reusableFD = (FieldDoc) hq.insertWithOverflow(reusableFD);
         }
-        }
-        );
+      };
+
+      if( timeAllowed > 0 ) {
+        hc = new TimeLimitedCollector( hc, timeAllowed );
+      }
+
+      try {
+        super.search(query, luceneFilter, hc);
       }
       catch( TimeLimitedCollector.TimeExceededException x ) {
         log.warn( "Query: " + query + "; " + x.getMessage() );
         qr.setPartialResults(true);
       }
 
       totalHits = numHits[0];
       maxScore = totalHits>0 ? hq.getMaxScore() : 0.0f;
@@ -1218,33 +1212,37 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
       // This case could be done with TopDocs, but would currently require
       // getting a BitSet filter from a DocSet which may be inefficient.
 
-      final DocSet filt = filter;
       final ScorePriorityQueue hq = new ScorePriorityQueue(lastDocRequested);
       final int[] numHits = new int[1];
 
-      try {
-        super.search(query, new HitCollector() {
+      HitCollector hc = new HitCollector() {
         private ScoreDoc reusableSD;
         public void collect(int doc, float score) {
-          collector.collect(doc,score);
-          if (filt!=null && !filt.exists(doc)) return;
+          collector.collect(doc, score);
           // if docs are always delivered in order, we could use "score>minScore"
           // but might BooleanScorer14 might still be used and deliver docs out-of-order?
           int nhits = numHits[0]++;
           if (reusableSD == null) {
             reusableSD = new ScoreDoc(doc, score);
           } else if (nhits < lastDocRequested || score >= reusableSD.score) {
             // reusableSD holds the last "rejected" entry, so, if
             // this new score is not better than that, there's no
             // need to try inserting it
             reusableSD.doc = doc;
             reusableSD.score = score;
           } else {
             return;
           }
           reusableSD = (ScoreDoc) hq.insertWithOverflow(reusableSD);
         }
-        }
-        );
+      };
+
+      if( timeAllowed > 0 ) {
+        hc = new TimeLimitedCollector( hc, timeAllowed );
+      }
+
+      try {
+        super.search(query, luceneFilter, hc);
       }
       catch( TimeLimitedCollector.TimeExceededException x ) {
         log.warn( "Query: " + query + "; " + x.getMessage() );
@@ -1269,11 +1267,14 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
       if (sliceLen < 0) sliceLen=0;
       qr.setDocList(new DocSlice(0,sliceLen,ids,scores,totalHits,maxScore));
 
-      DocSet qDocSet = setHC.getDocSet();
-      qr.setDocSet(filter==null ? qDocSet : qDocSet.intersection(filter));
-      return qDocSet;
+      // TODO: if we collect results before the filter, we just need to intersect with
+      // that filter to generate the DocSet for qr.setDocSet()
+      qr.setDocSet(collector.getDocSet());
+
+      // TODO: currently we don't generate the DocSet for the base query.
+      // But the QueryDocSet == CompleteDocSet if filter==null.
+      return filter==null ? qr.getDocSet() : null;
     }
 
   /**
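The DocSetHitCollector above gathers every collected document into a set while the ranked page is built, which is why qr.setDocSet() no longer needs a final intersection; with the filter pushed into search(), that set now covers the query and all filters combined, hence the null return (and the skipped filterCache.put) when a filter is present. A rough sketch of the gathering idea, assuming Lucene's OpenBitSet (Solr's real DocSetHitCollector also switches between small and large set representations):

import org.apache.lucene.search.HitCollector;
import org.apache.lucene.util.OpenBitSet;

// Records every collected doc id in a bit set while delegating the
// ranking work to an inner collector. With the filter applied inside
// search(), the resulting set is "query AND all filters".
class DocSetGatheringCollector extends HitCollector {
  final OpenBitSet bits;
  final HitCollector inner;

  DocSetGatheringCollector(int maxDoc, HitCollector inner) {
    this.bits = new OpenBitSet(maxDoc);
    this.inner = inner;
  }

  public void collect(int doc, float score) {
    bits.fastSet(doc);          // remember the match
    inner.collect(doc, score);  // continue normal ranking
  }
}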
@@ -1701,7 +1702,6 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
     private int supersetMaxDoc;
     private int flags;
     private long timeAllowed = -1;
-    private boolean needDocSet;
 
     public Query getQuery() { return query; }
     public QueryCommand setQuery(Query query) {

src/test/org/apache/solr/search/TestSearchPerf.java

@@ -18,10 +18,7 @@
 package org.apache.solr.search;
 
 import org.apache.lucene.index.Term;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.*;
 import org.apache.solr.util.AbstractSolrTestCase;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.request.SolrQueryResponse;
@@ -31,7 +28,7 @@ import org.apache.solr.update.AddUpdateCommand;
 import org.apache.solr.common.params.UpdateParams;
 import org.apache.solr.common.SolrInputDocument;
 
-import java.util.Random;
+import java.util.*;
 import java.io.IOException;
 
 /**
@@ -71,16 +68,50 @@ public class TestSearchPerf extends AbstractSolrTestCase {
   }
 
-  void createIndex2(int nDocs) throws IOException {
+  // Skip encoding for updating the index
+  void createIndex2(int nDocs, String... fields) throws IOException {
+    Set<String> fieldSet = new HashSet<String>(Arrays.asList(fields));
+
     SolrQueryRequest req = lrf.makeRequest();
     SolrQueryResponse rsp = new SolrQueryResponse();
     UpdateRequestProcessorChain processorChain = req.getCore().getUpdateProcessingChain(null);
     UpdateRequestProcessor processor = processorChain.createProcessor(req, rsp);
 
+    boolean foomany_s = fieldSet.contains("foomany_s");
+    boolean foo1_s = fieldSet.contains("foo1_s");
+    boolean foo2_s = fieldSet.contains("foo2_s");
+    boolean foo4_s = fieldSet.contains("foo4_s");
+    boolean foo8_s = fieldSet.contains("foo8_s");
+    boolean t10_100_ws = fieldSet.contains("t10_100_ws");
+
     for (int i=0; i<nDocs; i++) {
       SolrInputDocument doc = new SolrInputDocument();
       doc.addField("id",Float.toString(i));
-      doc.addField("foomany_s",t(r.nextInt(nDocs*10)));
+      if (foomany_s) {
+        doc.addField("foomany_s",t(r.nextInt(nDocs*10)));
+      }
+      if (foo1_s) {
+        doc.addField("foo1_s",t(0));
+      }
+      if (foo2_s) {
+        doc.addField("foo2_s",r.nextInt(2));
+      }
+      if (foo4_s) {
+        doc.addField("foo4_s",r.nextInt(4));
+      }
+      if (foo8_s) {
+        doc.addField("foo8_s",r.nextInt(8));
+      }
+      if (t10_100_ws) {
+        StringBuilder sb = new StringBuilder(9*100);
+        for (int j=0; j<100; j++) {
+          sb.append(' ');
+          sb.append(t(r.nextInt(10)));
+        }
+        doc.addField("t10_100_ws", sb.toString());
+      }
+
       AddUpdateCommand cmd = new AddUpdateCommand();
       cmd.solrDoc = doc;
       processor.processAdd(cmd);
@@ -113,6 +144,34 @@ public class TestSearchPerf extends AbstractSolrTestCase {
     System.out.println("ret="+ret+ " time="+(end-start)+" throughput="+iter*1000/(end-start+1));
 
     req.close();
+    assertTrue(ret>0);  // make sure we did some work
+    return ret;
+  }
+
+  int doListGen(int iter, Query q, List<Query> filt, boolean cacheQuery, boolean cacheFilt) throws Exception {
+    SolrQueryRequest req = lrf.makeRequest();
+    SolrIndexSearcher searcher = req.getSearcher();
+
+    long start = System.currentTimeMillis();
+
+    // These aren't public in SolrIndexSearcher
+    int NO_CHECK_QCACHE       = 0x80000000;
+    int GET_DOCSET            = 0x40000000;
+    int NO_CHECK_FILTERCACHE  = 0x20000000;
+    int GET_SCORES            = 0x01;
+
+    int ret = 0;
+    for (int i=0; i<iter; i++) {
+      DocList l = searcher.getDocList(q, filt, (Sort)null, 0, 10, (cacheQuery?0:NO_CHECK_QCACHE)|(cacheFilt?0:NO_CHECK_FILTERCACHE) );
+      ret += l.matches();
+    }
+
+    long end = System.currentTimeMillis();
+    System.out.println("ret="+ret+ " time="+(end-start)+" throughput="+iter*1000/(end-start+1));
+
+    req.close();
+    assertTrue(ret>0);  // make sure we did some work
     return ret;
   }
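Note: the NO_CHECK_* constants above duplicate flag values that are not public in SolrIndexSearcher, letting the test bypass the query cache and filter cache selectively; GET_DOCSET and GET_SCORES are copied alongside them but go unused in doListGen.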
@@ -132,7 +191,7 @@
 
   /** test range query performance */
   public void XtestRangePerformance() throws Exception {
-    int indexSize=199999;
+    int indexSize=1999;
     float fractionCovered=1.0f;
 
     String l=t(0);
@@ -146,7 +205,7 @@
     Query frange = parser2.parse();
     req.close();
 
-    createIndex2(indexSize);
+    createIndex2(indexSize,"foomany_s");
 
     doSetGen(1, range);
     doSetGen(1, frange); // load field cache
@@ -155,4 +214,35 @@
     doSetGen(10000, frange);
   }
 
+  /** test range query performance */
+  public void testFilteringPerformance() throws Exception {
+    int indexSize=19999;
+    float fractionCovered=.1f;
+
+    String l=t(0);
+    String u=t((int)(indexSize*10*fractionCovered));
+
+    SolrQueryRequest req = lrf.makeRequest();
+    QParser parser = QParser.getParser("foomany_s:[" + l + " TO " + u + "]", null, req);
+    Query rangeQ = parser.parse();
+    List<Query> filters = new ArrayList<Query>();
+    filters.add(rangeQ);
+    req.close();
+
+    parser = QParser.getParser("{!dismax qf=t10_100_ws pf=t10_100_ws ps=20}"+ t(0) + ' ' + t(1) + ' ' + t(2), null, req);
+    Query q= parser.parse();
+
+    // SolrIndexSearcher searcher = req.getSearcher();
+    // DocSet range = searcher.getDocSet(rangeQ, null);
+
+    createIndex2(indexSize, "foomany_s", "t10_100_ws");
+
+    // doListGen(100, q, filters, false, true);
+    doListGen(500, q, filters, false, true);
+
+    req.close();
+  }
 }
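End to end, testFilteringPerformance drives the same path a live request with fq parameters takes: each fq becomes an entry in the List<Query> handed to SolrIndexSearcher.getDocList(), the combined DocSet is resolved (from the filter cache when cacheFilt is true) and, after this commit, converted into a Lucene Filter via getTopFilter(). A hedged sketch of that call shape (class and variable names illustrative; flags of 0 means both caches are consulted):

import java.io.IOException;
import java.util.*;
import org.apache.lucene.search.Query;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;

class FilteredListExample {
  // Sketch: fq-style filtering after SOLR-1165.
  static DocList firstPage(SolrIndexSearcher searcher, Query mainQuery,
                           Query rangeQ) throws IOException {
    List<Query> filterQueries = new ArrayList<Query>();
    filterQueries.add(rangeQ);  // plays the role of an fq=... parameter

    // filterQueries -> cached DocSet -> DocSet.getTopFilter() -> Filter,
    // which rides along into IndexSearcher.search(query, filter, collector).
    return searcher.getDocList(
        mainQuery,      // the user's q
        filterQueries,  // filter list, now applied during scoring
        null,           // no Sort: default score-descending order
        0, 10,          // offset, rows
        0);             // flags: consult query and filter caches
  }
}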