From ea43e4c99718e06cd30b257a77ddb083648fc27e Mon Sep 17 00:00:00 2001
From: Yonik Seeley
Date: Sat, 23 May 2009 17:26:54 +0000
Subject: [PATCH] SOLR-1165: Use Lucene Filters in conjunction with IndexSearcher.search methods

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@777969 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                   |   3 +
 .../apache/solr/search/SolrIndexSearcher.java | 136 +++++++++---------
 .../apache/solr/search/TestSearchPerf.java    | 108 ++++++++++++--
 3 files changed, 170 insertions(+), 77 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index def4841c259..d77804b3637 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -264,6 +264,9 @@ Optimizations
     is ordered for easier impelemntation of skipTo, and is faster in the
     general case. (yonik)
 
+12. SOLR-1165: Use Lucene Filters and pass them down to the Lucene
+    search methods to filter earlier and improve performance. (yonik)
+
 Bug Fixes
 ----------------------
 
diff --git a/src/java/org/apache/solr/search/SolrIndexSearcher.java b/src/java/org/apache/solr/search/SolrIndexSearcher.java
index 787556fa90c..a4d3ce9ca6c 100644
--- a/src/java/org/apache/solr/search/SolrIndexSearcher.java
+++ b/src/java/org/apache/solr/search/SolrIndexSearcher.java
@@ -885,9 +885,11 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
       // do it the normal way...
       cmd.setSupersetMaxDoc(supersetMaxDoc);
       if ((cmd.getFlags() & GET_DOCSET)!=0) {
+        // this currently conflates returning the docset for the base query vs
+        // the base query and all filters.
         DocSet qDocSet = getDocListAndSetNC(qr,cmd);
         // cache the docSet matching the query w/o filtering
-        if (filterCache!=null && !qr.isPartialResults()) filterCache.put(cmd.getQuery(),qDocSet);
+        if (qDocSet!=null && filterCache!=null && !qr.isPartialResults()) filterCache.put(cmd.getQuery(),qDocSet);
       } else {
         getDocListNC(qr,cmd);
         //Parameters: cmd.getQuery(),theFilt,cmd.getSort(),0,supersetMaxDoc,cmd.getFlags(),cmd.getTimeAllowed(),responseHeader);
@@ -922,15 +924,15 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
 
     Query query = QueryUtils.makeQueryable(cmd.getQuery());
 
+    final Filter luceneFilter = filter==null ? null : filter.getTopFilter();
+
     // handle zero case...
     if (lastDocRequested<=0) {
-      final DocSet filt = filter;
       final float[] topscore = new float[] { Float.NEGATIVE_INFINITY };
       final int[] numHits = new int[1];
 
       HitCollector hc = new HitCollector() {
         public void collect(int doc, float score) {
-          if (filt!=null && !filt.exists(doc)) return;
           numHits[0]++;
           if (score > topscore[0]) topscore[0]=score;
         }
@@ -939,7 +941,7 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
         hc = new TimeLimitedCollector( hc, timeAllowed );
       }
       try {
-        super.search(query, hc );
+        super.search(query, luceneFilter, hc);
       }
       catch( TimeLimitedCollector.TimeExceededException x ) {
         log.warn( "Query: " + query + "; " + x.getMessage() );
@@ -955,15 +957,12 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
       // can't use TopDocs if there is a sort since it
       // will do automatic score normalization.
       // NOTE: this changed late in Lucene 1.9
-
-      final DocSet filt = filter;
       final int[] numHits = new int[1];
       final FieldSortedHitQueue hq = new FieldSortedHitQueue(reader, cmd.getSort().getSort(), len);
 
       HitCollector hc = new HitCollector() {
         private FieldDoc reusableFD;
         public void collect(int doc, float score) {
-          if (filt!=null && !filt.exists(doc)) return;
           numHits[0]++;
           if (reusableFD == null)
             reusableFD = new FieldDoc(doc, score);
@@ -978,7 +977,7 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
         hc = new TimeLimitedCollector( hc, timeAllowed );
       }
       try {
-        super.search(query, hc );
+        super.search(query, luceneFilter, hc );
       }
       catch( TimeLimitedCollector.TimeExceededException x ) {
         log.warn( "Query: " + query + "; " + x.getMessage() );
@@ -1002,14 +1001,11 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
       // No Sort specified (sort by score descending)
       // This case could be done with TopDocs, but would currently require
       // getting a BitSet filter from a DocSet which may be inefficient.
-
-      final DocSet filt = filter;
       final ScorePriorityQueue hq = new ScorePriorityQueue(lastDocRequested);
       final int[] numHits = new int[1];
       HitCollector hc = new HitCollector() {
         private ScoreDoc reusableSD;
         public void collect(int doc, float score) {
-          if (filt!=null && !filt.exists(doc)) return;
           // TODO: if docs are always delivered in order, we could use "score>minScore"
           // instead of "score>=minScore" and avoid tiebreaking scores
           // in the priority queue.
@@ -1033,7 +1029,7 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
         hc = new TimeLimitedCollector( hc, timeAllowed );
       }
       try {
-        super.search(query, hc );
+        super.search(query, luceneFilter, hc );
       }
       catch( TimeLimitedCollector.TimeExceededException x ) {
         log.warn( "Query: " + query + "; " + x.getMessage() );
@@ -1110,10 +1106,10 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
 
   }
 
-
   // the DocSet returned is for the query only, without any filtering... that way it may
   // be cached if desired.
   private DocSet getDocListAndSetNC(QueryResult qr,QueryCommand cmd) throws IOException {
+///////////////////// NEW
     int len = cmd.getSupersetMaxDoc();
     DocSet filter = cmd.getFilter()!=null ? cmd.getFilter() : getDocSet(cmd.getFilterList());
     int last = len;
@@ -1124,22 +1120,13 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
     float maxScore;
     int[] ids;
     float[] scores;
-    final DocSetHitCollector setHC = new DocSetHitCollector(maxDoc()>>6, maxDoc());
-    final HitCollector collector = ( cmd.getTimeAllowed() > 0 ) ? new TimeLimitedCollector( setHC, cmd.getTimeAllowed() ) : setHC;
+
+    final DocSetHitCollector collector = new DocSetHitCollector(maxDoc()>>6, maxDoc());
 
     Query query = QueryUtils.makeQueryable(cmd.getQuery());
+    final long timeAllowed = cmd.getTimeAllowed();
 
-    // TODO: perhaps unify getDocListAndSetNC and getDocListNC without imposing a significant performance hit
-
-    // Comment: gathering the set before the filter is applied allows one to cache
-    // the resulting DocSet under the query.  The drawback is that it requires an
-    // extra intersection with the filter at the end.  This will be a net win
-    // for expensive queries.
-
-    // Q: what if the final intersection results in a small set from two large
-    //    sets... it won't be a HashDocSet or other small set.  One way around
-    //    this would be to collect the resulting set as we go (the filter is
-    //    checked anyway).
+    final Filter luceneFilter = filter==null ? null : filter.getTopFilter();
 
     // handle zero case...
     if (lastDocRequested<=0) {
@@ -1147,22 +1134,26 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
 
       final float[] topscore = new float[] { Float.NEGATIVE_INFINITY };
       final int[] numHits = new int[1];
 
-      try {
-        super.search(query, new HitCollector() {
+      HitCollector hc = new HitCollector() {
         public void collect(int doc, float score) {
-          collector.collect(doc,score);
-          if (filt!=null && !filt.exists(doc)) return;
+          collector.collect(doc, score);
           numHits[0]++;
           if (score > topscore[0]) topscore[0]=score;
         }
-        }
-        );
+      };
+
+      if( timeAllowed > 0 ) {
+        hc = new TimeLimitedCollector( hc, timeAllowed );
+      }
+      try {
+        super.search(query, luceneFilter, hc);
       }
       catch( TimeLimitedCollector.TimeExceededException x ) {
         log.warn( "Query: " + query + "; " + x.getMessage() );
         qr.setPartialResults(true);
       }
+
       nDocsReturned=0;
       ids = new int[nDocsReturned];
       scores = new float[nDocsReturned];
@@ -1173,16 +1164,13 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
       // will do automatic score normalization.
       // NOTE: this changed late in Lucene 1.9
 
-      final DocSet filt = filter;
       final int[] numHits = new int[1];
       final FieldSortedHitQueue hq = new FieldSortedHitQueue(reader, cmd.getSort().getSort(), len);
 
-      try {
-        super.search(query, new HitCollector() {
+      HitCollector hc = new HitCollector() {
         private FieldDoc reusableFD;
         public void collect(int doc, float score) {
-          collector.collect(doc,score);
-          if (filt!=null && !filt.exists(doc)) return;
+          collector.collect(doc, score);
           numHits[0]++;
           if (reusableFD == null)
             reusableFD = new FieldDoc(doc, score);
@@ -1192,13 +1180,19 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
           }
           reusableFD = (FieldDoc) hq.insertWithOverflow(reusableFD);
         }
-        }
-        );
+      };
+
+      if( timeAllowed > 0 ) {
+        hc = new TimeLimitedCollector( hc, timeAllowed );
+      }
+      try {
+        super.search(query, luceneFilter, hc);
       }
       catch( TimeLimitedCollector.TimeExceededException x ) {
         log.warn( "Query: " + query + "; " + x.getMessage() );
         qr.setPartialResults(true);
       }
+
       totalHits = numHits[0];
       maxScore = totalHits>0 ? hq.getMaxScore() : 0.0f;
 
@@ -1218,33 +1212,37 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
       // This case could be done with TopDocs, but would currently require
       // getting a BitSet filter from a DocSet which may be inefficient.
 
-      final DocSet filt = filter;
       final ScorePriorityQueue hq = new ScorePriorityQueue(lastDocRequested);
       final int[] numHits = new int[1];
-      try {
-        super.search(query, new HitCollector() {
+
+      HitCollector hc = new HitCollector() {
         private ScoreDoc reusableSD;
         public void collect(int doc, float score) {
-          collector.collect(doc,score);
-          if (filt!=null && !filt.exists(doc)) return;
-          // if docs are always delivered in order, we could use "score>minScore"
-          // but might BooleanScorer14 might still be used and deliver docs out-of-order?
-          int nhits = numHits[0]++;
-          if (reusableSD == null) {
-            reusableSD = new ScoreDoc(doc, score);
-          } else if (nhits < lastDocRequested || score >= reusableSD.score) {
-            // reusableSD holds the last "rejected" entry, so, if
-            // this new score is not better than that, there's no
-            // need to try inserting it
-            reusableSD.doc = doc;
-            reusableSD.score = score;
-          } else {
-            return;
-          }
-          reusableSD = (ScoreDoc) hq.insertWithOverflow(reusableSD);
+          collector.collect(doc, score);
+
+          // if docs are always delivered in order, we could use "score>minScore"
+          // but might BooleanScorer14 might still be used and deliver docs out-of-order?
+          int nhits = numHits[0]++;
+          if (reusableSD == null) {
+            reusableSD = new ScoreDoc(doc, score);
+          } else if (nhits < lastDocRequested || score >= reusableSD.score) {
+            // reusableSD holds the last "rejected" entry, so, if
+            // this new score is not better than that, there's no
+            // need to try inserting it
+            reusableSD.doc = doc;
+            reusableSD.score = score;
+          } else {
+            return;
           }
-        }
-        );
+          reusableSD = (ScoreDoc) hq.insertWithOverflow(reusableSD);
+        }
+      };
+
+      if( timeAllowed > 0 ) {
+        hc = new TimeLimitedCollector( hc, timeAllowed );
+      }
+      try {
+        super.search(query, luceneFilter, hc);
       }
       catch( TimeLimitedCollector.TimeExceededException x ) {
         log.warn( "Query: " + query + "; " + x.getMessage() );
@@ -1267,13 +1265,16 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
 
 
     int sliceLen = Math.min(lastDocRequested,nDocsReturned);
     if (sliceLen < 0) sliceLen=0;
-
-    qr.setDocList(new DocSlice(0,sliceLen,ids,scores,totalHits,maxScore));
-    DocSet qDocSet = setHC.getDocSet();
-    qr.setDocSet(filter==null ? qDocSet : qDocSet.intersection(filter));
-    return qDocSet;
-  }
+    qr.setDocList(new DocSlice(0,sliceLen,ids,scores,totalHits,maxScore));
+    // TODO: if we collect results before the filter, we just need to intersect with
+    // that filter to generate the DocSet for qr.setDocSet()
+    qr.setDocSet(collector.getDocSet());
+
+    // TODO: currently we don't generate the DocSet for the base query.
+    // But the QueryDocSet == CompleteDocSet if filter==null.
+    return filter==null ? qr.getDocSet() : null;
+  }
 
 
   /**
@@ -1701,7 +1702,6 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
     private int supersetMaxDoc;
     private int flags;
     private long timeAllowed = -1;
-    private boolean needDocSet;
 
     public Query getQuery() { return query; }
     public QueryCommand setQuery(Query query) {
diff --git a/src/test/org/apache/solr/search/TestSearchPerf.java b/src/test/org/apache/solr/search/TestSearchPerf.java
index 2b049daf3a5..42ca3d3ce96 100755
--- a/src/test/org/apache/solr/search/TestSearchPerf.java
+++ b/src/test/org/apache/solr/search/TestSearchPerf.java
@@ -18,10 +18,7 @@ package org.apache.solr.search;
 
 import org.apache.lucene.index.Term;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.*;
 import org.apache.solr.util.AbstractSolrTestCase;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.request.SolrQueryResponse;
 
@@ -31,7 +28,7 @@ import org.apache.solr.update.AddUpdateCommand;
 import org.apache.solr.common.params.UpdateParams;
 import org.apache.solr.common.SolrInputDocument;
 
-import java.util.Random;
+import java.util.*;
 import java.io.IOException;
 
 /**
@@ -71,16 +68,50 @@ public class TestSearchPerf extends AbstractSolrTestCase {
 
   }
 
-  void createIndex2(int nDocs) throws IOException {
+  // Skip encoding for updating the index
+  void createIndex2(int nDocs, String... fields) throws IOException {
+    Set fieldSet = new HashSet(Arrays.asList(fields));
+
     SolrQueryRequest req = lrf.makeRequest();
     SolrQueryResponse rsp = new SolrQueryResponse();
     UpdateRequestProcessorChain processorChain = req.getCore().getUpdateProcessingChain(null);
     UpdateRequestProcessor processor = processorChain.createProcessor(req, rsp);
 
+    boolean foomany_s = fieldSet.contains("foomany_s");
+    boolean foo1_s = fieldSet.contains("foo1_s");
+    boolean foo2_s = fieldSet.contains("foo2_s");
+    boolean foo4_s = fieldSet.contains("foo4_s");
+    boolean foo8_s = fieldSet.contains("foo8_s");
+    boolean t10_100_ws = fieldSet.contains("t10_100_ws");
+
+    for (int i=0; i filters = new ArrayList();
+    filters.add(rangeQ);
+    req.close();
+
+    parser = QParser.getParser("{!dismax qf=t10_100_ws pf=t10_100_ws ps=20}"+ t(0) + ' ' + t(1) + ' ' + t(2), null, req);
+    Query q= parser.parse();
+
+    // SolrIndexSearcher searcher = req.getSearcher();
+    // DocSet range = searcher.getDocSet(rangeQ, null);
+
+    createIndex2(indexSize, "foomany_s", "t10_100_ws");
+
+    // doListGen(100, q, filters, false, true);
+    doListGen(500, q, filters, false, true);
+
+    req.close();
+  }
+
+}
\ No newline at end of file
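
The change applied in every branch above follows one pattern: instead of handing
each hit to the HitCollector and rejecting non-filter docs one exists() call at
a time, the Solr DocSet is wrapped once as a Lucene Filter and passed down to
IndexSearcher.search, so rejected docs are skipped before they are scored or
collected. A minimal sketch of the before/after shapes, assuming the Lucene
2.4-era APIs this patch is written against (HitCollector and
search(Query, Filter, HitCollector)); the countHits* method names are
illustrative, not from the patch:

    import java.io.IOException;
    import org.apache.lucene.search.Filter;
    import org.apache.lucene.search.HitCollector;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.solr.search.DocSet;

    class FilterPushdownSketch {
      // Before: post-filter inside the collector, one DocSet.exists() probe per hit.
      int countHitsOld(IndexSearcher searcher, Query query, final DocSet docSet)
          throws IOException {
        final int[] numHits = new int[1];
        searcher.search(query, new HitCollector() {
          public void collect(int doc, float score) {
            if (docSet != null && !docSet.exists(doc)) return; // reject late
            numHits[0]++;
          }
        });
        return numHits[0];
      }

      // After: expose the DocSet as a Filter once; filtered-out docs never
      // reach the collector and are not scored at all.
      int countHitsNew(IndexSearcher searcher, Query query, DocSet docSet)
          throws IOException {
        final int[] numHits = new int[1];
        Filter luceneFilter = docSet == null ? null : docSet.getTopFilter();
        searcher.search(query, luceneFilter, new HitCollector() {
          public void collect(int doc, float score) {
            numHits[0]++; // every collected doc already matches the filter
          }
        });
        return numHits[0];
      }
    }

The zero-rows, sorted, and unsorted branches in getDocListNC and
getDocListAndSetNC all reduce to this shape; they differ only in what the
collector accumulates (a hit count, a FieldSortedHitQueue, or a
ScorePriorityQueue).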
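What getTopFilter() hands to Lucene can be pictured as a DocIdSet-backed
Filter. The class below is a hypothetical illustration only, not the
implementation behind this patch (the real top-level filter also has to
account for Solr's reader structure); it leans on OpenBitSet extending
DocIdSet, which holds as of Lucene 2.4:

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.DocIdSet;
    import org.apache.lucene.search.Filter;
    import org.apache.lucene.util.OpenBitSet;

    // Hypothetical sketch: a Filter backed by a precomputed bit set of docids.
    class BitsFilter extends Filter {
      private final OpenBitSet bits; // one bit per matching internal docid

      BitsFilter(OpenBitSet bits) {
        this.bits = bits;
      }

      // OpenBitSet extends DocIdSet (Lucene 2.4+), so the bits can be handed
      // back directly; the searcher then iterates only over set bits.
      public DocIdSet getDocIdSet(IndexReader reader) {
        return bits;
      }
    }

Driving the match from the filter's iterator is what makes filtering "earlier":
the per-rejected-document cost drops from a scored hit plus an exists() probe
to a skip on the bit set.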