use FilterCache for faceting single-term non-bool fields

git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@448695 13f79535-47bb-0310-9956-ffa450edef68
2006-09-21 21:37:36 +00:00 · 2006-09-21 21:37:36 +00:00 · 8c79297a75
parent 6646560c9d
commit 8c79297a75
4 changed files with 146 additions and 49 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -83,6 +83,10 @@ Optimizations
 5. Optimized getDocSet() for term queries resulting in a 36% speedup of facet.field
    queries where DocSets aren't cached (for example, if the number of terms in the field
    is larger than the filter cache.) (yonik)
+ 6. Optimized facet.field faceting by as much as 500 times when the field has
+    a single token per document (not multiValued & not tokenized) by using the
+    Lucene FieldCache entry for that field to tally term counts.  The first request
+    utilizing the FieldCache will take longer than subsequent ones.

 Bug Fixes
 1. Fixed delete-by-id for field types who's indexed form is different
--- a/src/java/org/apache/solr/request/SimpleFacets.java
+++ b/src/java/org/apache/solr/request/SimpleFacets.java
@ -20,23 +20,20 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermEnum;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.queryParser.ParseException;
-import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.search.*;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.core.SolrException;
 import org.apache.solr.request.SolrParams;
-import org.apache.solr.request.SolrQueryRequest;
-import org.apache.solr.request.SolrQueryResponse;
-import org.apache.solr.request.DefaultSolrParams;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.BoolField;
 import org.apache.solr.search.*;
 import org.apache.solr.util.NamedList;
 import org.apache.solr.util.BoundedTreeSet;

 import java.io.IOException;
 import java.util.*;
-import java.util.logging.Level;

 /**
 * A class that generates simple Facet information for a request.
@ -53,6 +50,7 @@ public class SimpleFacets {
  /** Searcher to use for all calculations */
  protected SolrIndexSearcher searcher;

+
  public SimpleFacets(SolrIndexSearcher searcher,
                      DocSet docs,
                      SolrParams params) {
@ -117,30 +115,44 @@ public class SimpleFacets {
    return res;
  }

+
+  public NamedList getTermCounts(String field) throws IOException {
+    int limit = params.getFieldInt(field, params.FACET_LIMIT, 100);
+    boolean zeros = params.getFieldBool(field, params.FACET_ZEROS, true);
+    boolean missing = params.getFieldBool(field, params.FACET_MISSING, false);
+
+    NamedList counts;
+    SchemaField sf = searcher.getSchema().getField(field);
+    FieldType ft = sf.getType();
+    if (sf.multiValued() || ft.isTokenized() || ft instanceof BoolField) {
+      // Always use filters for booleans... we know the number of values is very small.
+      counts = getFacetTermEnumCounts(searcher,docs,field,limit,zeros,missing);
+    } else {
+      // TODO: future logic could use filters instead of the fieldcache if
+      // the number of terms in the field is small enough.
+      counts = getFieldCacheCounts(searcher, docs, field, limit, zeros, missing);
+    }
+
+    return counts;
+  }
+
+
  /**
   * Returns a list of value constraints and the associated facet counts 
   * for each facet field specified in the params.
   *
   * @see SolrParams#FACET_FIELD
-   * @see #getFacetFieldMissingCount
+   * @see #getFieldMissingCount
   * @see #getFacetTermEnumCounts
   */
  public NamedList getFacetFieldCounts()
-    throws IOException {
+          throws IOException {

    NamedList res = new NamedList();
    String[] facetFs = params.getParams(SolrParams.FACET_FIELD);
-    if (null != facetFs && 0 != facetFs.length) {
-      
+    if (null != facetFs) {
      for (String f : facetFs) {
-
-        NamedList counts = getFacetTermEnumCounts(f);
-        
-        if (params.getFieldBool(f, params.FACET_MISSING, false))
-          counts.add(null, getFacetFieldMissingCount(f));
-        
-        res.add(f, counts);
-        
+        res.add(f, getTermCounts(f));
      }
    }
    return res;
@ -152,7 +164,7 @@ public class SimpleFacets {
   *
   * @see SolrParams#FACET_MISSING
   */
-  public int getFacetFieldMissingCount(String fieldName)
+  public static int getFieldMissingCount(SolrIndexSearcher searcher, DocSet docs, String fieldName)
    throws IOException {

    DocSet hasVal = searcher.getDocSet
@ -160,58 +172,122 @@ public class SimpleFacets {
    return docs.andNotSize(hasVal);
  }

+  /**
+   * Use the Lucene FieldCache to get counts for each unique field value in <code>docs</code>.
+   * The field must have at most one indexed token per document.
+   */
+  public static NamedList getFieldCacheCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int limit, boolean zeros, boolean missing) throws IOException {
+    // TODO: If the number of terms is high compared to docs.size(), and zeros==false,
+    //  we should use an alternate strategy to avoid
+    //  1) creating another huge int[] for the counts
+    //  2) looping over that huge int[] looking for the rare non-zeros.
+    //
+    // Yet another variation: if docs.size() is small and termvectors are stored,
+    // then use them instead of the FieldCache.
+    //
+
+    FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName);
+    int[] count = new int[si.lookup.length];
+    DocIterator iter = docs.iterator();
+    while (iter.hasNext()) {
+      count[si.order[iter.nextDoc()]]++;
+    }
+
+    FieldType ft = searcher.getSchema().getFieldType(fieldName);
+    NamedList res = new NamedList();
+
+    // IDEA: we could also maintain a count of "other"... everything that fell outside
+    // of the top 'N'
+
+    BoundedTreeSet<CountPair<String,Integer>> queue=null;
+
+    if (limit>=0) {
+      // TODO: compare performance of BoundedTreeSet compare against priority queue?
+      queue = new BoundedTreeSet<CountPair<String,Integer>>(limit);
+    }
+
+    int min=-1;  // the smallest value in the top 'N' values
+    for (int i=1; i<count.length; i++) {
+      int c = count[i];
+      if (c==0 && !zeros) continue;
+      if (limit<0) {
+        res.add(ft.indexedToReadable(si.lookup[i]), c);
+      } else if (c>min) {
+        // NOTE: we use c>min rather than c>=min as an optimization because we are going in
+        // index order, so we already know that the keys are ordered.  This can be very
+        // important if a lot of the counts are repeated (like zero counts would be).
+        queue.add(new CountPair<String,Integer>(ft.indexedToReadable(si.lookup[i]), c));
+        if (queue.size()>=limit) min=queue.last().val;
+      }
+    }
+
+    if (limit>=0) {
+      for (CountPair<String,Integer> p : queue) {
+        res.add(p.key, p.val);
+      }
+    }
+
+
+    if (missing) res.add(null, count[0]);
+    return res;
+  }
+
  /**
   * Returns a list of terms in the specified field along with the 
   * corrisponding count of documents in the set that match that constraint.
+   * This method uses the FilterCache to get the intersection count between <code>docs</code>
+   * and the DocSet for each term in the filter.
   *
   * @see SolrParams#FACET_LIMIT
   * @see SolrParams#FACET_ZEROS
+   * @see SolrParams#FACET_MISSING
   */
-  public NamedList getFacetTermEnumCounts(String fieldName) 
+  public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int limit, boolean zeros, boolean missing)
    throws IOException {

    /* :TODO: potential optimization...
-     * cache the Terms with the highest docFreq and try them first
-     * don't enum if we get our max from them
-     */
+    * cache the Terms with the highest docFreq and try them first
+    * don't enum if we get our max from them
+    */

    IndexSchema schema = searcher.getSchema();
    IndexReader r = searcher.getReader();
-    FieldType ft = schema.getFieldType(fieldName);
+    FieldType ft = schema.getFieldType(field);

    Set<CountPair<String,Integer>> counts
      = new HashSet<CountPair<String,Integer>>();

-    int limit = params.getFieldInt(fieldName, params.FACET_LIMIT, 100);
    if (0 <= limit) {
      counts = new BoundedTreeSet<CountPair<String,Integer>>(limit);
    }

-    boolean zeros = params.getFieldBool(fieldName, params.FACET_ZEROS, true);
-      
-    TermEnum te = r.terms(new Term(fieldName,""));
+    TermEnum te = r.terms(new Term(field,""));
    do {
      Term t = te.term();

-      if (null == t || ! t.field().equals(fieldName)) 
+      if (null == t || ! t.field().equals(field))
        break;

      if (0 < te.docFreq()) { /* all docs may be deleted */
        int count = searcher.numDocs(new TermQuery(t),
                                     docs);

-        /* :TODO: is indexedToReadable correct? */ 
        if (zeros || 0 < count)
          counts.add(new CountPair<String,Integer>
-                     (ft.indexedToReadable(t.text()), count));
+                     (t.text(), count));

      }
    } while (te.next());

    NamedList res = new NamedList();
    for (CountPair<String,Integer> p : counts) {
-      res.add(p.key, p.val);
+      res.add(ft.indexedToReadable(p.key), p.val);
    }
+
+    if (missing) {
+      res.add(null, getFieldMissingCount(searcher,docs,field));
+    }
+
    return res;
  }

--- a/src/java/org/apache/solr/request/SolrQueryRequest.java
+++ b/src/java/org/apache/solr/request/SolrQueryRequest.java
@ -20,19 +20,36 @@ import org.apache.solr.search.SolrIndexSearcher;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.core.SolrCore;

+import java.util.Map;
+
 /**
- * Container for a request to execute a query.
+ * <p>Container for a request to execute a query.</p>
+ * <p><code>SolrQueryRequest</code> is not thread safe.</p>
 * 
 * @author yonik
 * @version $Id$
 */
 public interface SolrQueryRequest {
+
+  /** returns the current request parameters */
  public SolrParams getParams();

+  /** Change the parameters for this request.  This does not affect
+   *  the original parameters returned by getOriginalParams()
+   */
  public void setParams(SolrParams params);

+  /** Returns the original request parameters.  As this
+   * does not normally include configured defaults
+   * it's more suitable for logging.
+   */
  public SolrParams getOriginalParams();

+  /**
+   * Generic information associated with this request that may be both read and updated.
+   */
+  public Map<Object,Object> getContext();
+
  /**
   * This method should be called when all uses of this request are
   * finished, so that resources can be freed.
--- a/src/java/org/apache/solr/schema/FieldType.java
+++ b/src/java/org/apache/solr/schema/FieldType.java
@ -57,7 +57,7 @@ public abstract class FieldType extends FieldProperties {
  int properties;

  /** Returns true if fields of this type should be tokenized */
-  protected boolean isTokenized() {
+  public boolean isTokenized() {
    return (properties & TOKENIZED) != 0;
  }