From 8c79297a754cca22833baf9c22dc6669ae29b92f Mon Sep 17 00:00:00 2001
From: Yonik Seeley <yonik@apache.org>
Date: Thu, 21 Sep 2006 21:37:36 +0000
Subject: [PATCH] use FilterCache for faceting single-term non-bool fields

git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@448695 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                   |   4 +
 .../org/apache/solr/request/SimpleFacets.java | 170 +++++++++++++-----
 .../apache/solr/request/SolrQueryRequest.java |  19 +-
 .../org/apache/solr/schema/FieldType.java     |   2 +-
 4 files changed, 146 insertions(+), 49 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 200bb2ed4bd..4190e233bd4 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -83,6 +83,10 @@ Optimizations
  5. Optimized getDocSet() for term queries resulting in a 36% speedup of facet.field
     queries where DocSets aren't cached (for example, if the number of terms in the field
     is larger than the filter cache.) (yonik)
+ 6. Optimized facet.field faceting by as much as 500 times when the field has
+    a single token per document (not multiValued & not tokenized) by using the
+    Lucene FieldCache entry for that field to tally term counts.  The first request
+    utilizing the FieldCache will take longer than subsequent ones.
 
 Bug Fixes
  1. Fixed delete-by-id for field types who's indexed form is different
diff --git a/src/java/org/apache/solr/request/SimpleFacets.java b/src/java/org/apache/solr/request/SimpleFacets.java
index c85091db600..40a0d0604fc 100644
--- a/src/java/org/apache/solr/request/SimpleFacets.java
+++ b/src/java/org/apache/solr/request/SimpleFacets.java
@@ -20,23 +20,20 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermEnum;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.queryParser.ParseException;
-import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.search.*;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.core.SolrException;
 import org.apache.solr.request.SolrParams;
-import org.apache.solr.request.SolrQueryRequest;
-import org.apache.solr.request.SolrQueryResponse;
-import org.apache.solr.request.DefaultSolrParams;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.BoolField;
 import org.apache.solr.search.*;
 import org.apache.solr.util.NamedList;
 import org.apache.solr.util.BoundedTreeSet;
 
 import java.io.IOException;
 import java.util.*;
-import java.util.logging.Level;
 
 /**
  * A class that generates simple Facet information for a request.
@@ -52,15 +49,16 @@ public class SimpleFacets {
   protected SolrParams params;
   /** Searcher to use for all calculations */
   protected SolrIndexSearcher searcher;
-  
-  public SimpleFacets(SolrIndexSearcher searcher, 
-                      DocSet docs, 
+
+
+  public SimpleFacets(SolrIndexSearcher searcher,
+                      DocSet docs,
                       SolrParams params) {
     this.searcher = searcher;
     this.docs = docs;
     this.params = params;
   }
-  
+
   /**
    * Looks at various Params to determing if any simple Facet Constraint count
    * computations are desired.
@@ -73,7 +71,7 @@ public class SimpleFacets {
   public NamedList getFacetCounts() {
 
     // if someone called this method, benefit of the doubt: assume true
-    if (!params.getBool(params.FACET,true)) 
+    if (!params.getBool(params.FACET,true))
       return null;
 
     NamedList res = new NamedList();
@@ -82,7 +80,7 @@ public class SimpleFacets {
       res.add("facet_queries", getFacetQueryCounts());
 
       res.add("facet_fields", getFacetFieldCounts());
-      
+
     } catch (Exception e) {
       SolrException.logOnce(SolrCore.log, "Exception during facet counts", e);
       res.add("exception", SolrException.toStr(e));
@@ -97,7 +95,7 @@ public class SimpleFacets {
    * @see SolrParams#FACET_QUERY
    */
   public NamedList getFacetQueryCounts() throws IOException,ParseException {
-    
+
     NamedList res = new NamedList();
 
     /* Ignore SolrParams.DF - could have init param facet.query assuming
@@ -106,7 +104,7 @@ public class SimpleFacets {
      * explicit.
      */
     SolrQueryParser qp = new SolrQueryParser(searcher.getSchema(),null);
-    
+
     String[] facetQs = params.getParams(SolrParams.FACET_QUERY);
     if (null != facetQs && 0 != facetQs.length) {
       for (String q : facetQs) {
@@ -117,30 +115,44 @@ public class SimpleFacets {
     return res;
   }
 
+
+  public NamedList getTermCounts(String field) throws IOException {
+    int limit = params.getFieldInt(field, params.FACET_LIMIT, 100);
+    boolean zeros = params.getFieldBool(field, params.FACET_ZEROS, true);
+    boolean missing = params.getFieldBool(field, params.FACET_MISSING, false);
+
+    NamedList counts;
+    SchemaField sf = searcher.getSchema().getField(field);
+    FieldType ft = sf.getType();
+    if (sf.multiValued() || ft.isTokenized() || ft instanceof BoolField) {
+      // Always use filters for booleans... we know the number of values is very small.
+      counts = getFacetTermEnumCounts(searcher,docs,field,limit,zeros,missing);
+    } else {
+      // TODO: future logic could use filters instead of the fieldcache if
+      // the number of terms in the field is small enough.
+      counts = getFieldCacheCounts(searcher, docs, field, limit, zeros, missing);
+    }
+
+    return counts;
+  }
+
+
   /**
    * Returns a list of value constraints and the associated facet counts 
    * for each facet field specified in the params.
    *
    * @see SolrParams#FACET_FIELD
-   * @see #getFacetFieldMissingCount
+   * @see #getFieldMissingCount
    * @see #getFacetTermEnumCounts
    */
-  public NamedList getFacetFieldCounts() 
-    throws IOException {
-    
+  public NamedList getFacetFieldCounts()
+          throws IOException {
+
     NamedList res = new NamedList();
     String[] facetFs = params.getParams(SolrParams.FACET_FIELD);
-    if (null != facetFs && 0 != facetFs.length) {
-      
+    if (null != facetFs) {
       for (String f : facetFs) {
-
-        NamedList counts = getFacetTermEnumCounts(f);
-        
-        if (params.getFieldBool(f, params.FACET_MISSING, false))
-          counts.add(null, getFacetFieldMissingCount(f));
-        
-        res.add(f, counts);
-        
+        res.add(f, getTermCounts(f));
       }
     }
     return res;
@@ -152,7 +164,7 @@ public class SimpleFacets {
    *
    * @see SolrParams#FACET_MISSING
    */
-  public int getFacetFieldMissingCount(String fieldName)
+  public static int getFieldMissingCount(SolrIndexSearcher searcher, DocSet docs, String fieldName)
     throws IOException {
 
     DocSet hasVal = searcher.getDocSet
@@ -160,58 +172,122 @@ public class SimpleFacets {
     return docs.andNotSize(hasVal);
   }
 
+  /**
+   * Use the Lucene FieldCache to get counts for each unique field value in <code>docs</code>.
+   * The field must have at most one indexed token per document.
+   */
+  public static NamedList getFieldCacheCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int limit, boolean zeros, boolean missing) throws IOException {
+    // TODO: If the number of terms is high compared to docs.size(), and zeros==false,
+    //  we should use an alternate strategy to avoid
+    //  1) creating another huge int[] for the counts
+    //  2) looping over that huge int[] looking for the rare non-zeros.
+    //
+    // Yet another variation: if docs.size() is small and termvectors are stored,
+    // then use them instead of the FieldCache.
+    //
+
+    FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName);
+    int[] count = new int[si.lookup.length];
+    DocIterator iter = docs.iterator();
+    while (iter.hasNext()) {
+      count[si.order[iter.nextDoc()]]++;
+    }
+
+    FieldType ft = searcher.getSchema().getFieldType(fieldName);
+    NamedList res = new NamedList();
+
+    // IDEA: we could also maintain a count of "other"... everything that fell outside
+    // of the top 'N'
+
+    BoundedTreeSet<CountPair<String,Integer>> queue=null;
+
+    if (limit>=0) {
+      // TODO: compare performance of BoundedTreeSet compare against priority queue?
+      queue = new BoundedTreeSet<CountPair<String,Integer>>(limit);
+    }
+
+    int min=-1;  // the smallest value in the top 'N' values
+    for (int i=1; i<count.length; i++) {
+      int c = count[i];
+      if (c==0 && !zeros) continue;
+      if (limit<0) {
+        res.add(ft.indexedToReadable(si.lookup[i]), c);
+      } else if (c>min) {
+        // NOTE: we use c>min rather than c>=min as an optimization because we are going in
+        // index order, so we already know that the keys are ordered.  This can be very
+        // important if a lot of the counts are repeated (like zero counts would be).
+        queue.add(new CountPair<String,Integer>(ft.indexedToReadable(si.lookup[i]), c));
+        if (queue.size()>=limit) min=queue.last().val;
+      }
+    }
+
+    if (limit>=0) {
+      for (CountPair<String,Integer> p : queue) {
+        res.add(p.key, p.val);
+      }
+    }
+
+
+    if (missing) res.add(null, count[0]);
+    return res;
+  }
+
   /**
    * Returns a list of terms in the specified field along with the 
    * corrisponding count of documents in the set that match that constraint.
+   * This method uses the FilterCache to get the intersection count between <code>docs</code>
+   * and the DocSet for each term in the filter.
    *
    * @see SolrParams#FACET_LIMIT
    * @see SolrParams#FACET_ZEROS
+   * @see SolrParams#FACET_MISSING
    */
-  public NamedList getFacetTermEnumCounts(String fieldName) 
+  public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int limit, boolean zeros, boolean missing)
     throws IOException {
-    
+
     /* :TODO: potential optimization...
-     * cache the Terms with the highest docFreq and try them first
-     * don't enum if we get our max from them
-     */
-     
+    * cache the Terms with the highest docFreq and try them first
+    * don't enum if we get our max from them
+    */
+
     IndexSchema schema = searcher.getSchema();
     IndexReader r = searcher.getReader();
-    FieldType ft = schema.getFieldType(fieldName);
+    FieldType ft = schema.getFieldType(field);
 
-    Set<CountPair<String,Integer>> counts 
+    Set<CountPair<String,Integer>> counts
       = new HashSet<CountPair<String,Integer>>();
 
-    int limit = params.getFieldInt(fieldName, params.FACET_LIMIT, 100);
     if (0 <= limit) {
       counts = new BoundedTreeSet<CountPair<String,Integer>>(limit);
     }
 
-    boolean zeros = params.getFieldBool(fieldName, params.FACET_ZEROS, true);
-      
-    TermEnum te = r.terms(new Term(fieldName,""));
+    TermEnum te = r.terms(new Term(field,""));
     do {
       Term t = te.term();
 
-      if (null == t || ! t.field().equals(fieldName)) 
+      if (null == t || ! t.field().equals(field))
         break;
 
       if (0 < te.docFreq()) { /* all docs may be deleted */
         int count = searcher.numDocs(new TermQuery(t),
                                      docs);
 
-        /* :TODO: is indexedToReadable correct? */ 
-        if (zeros || 0 < count) 
+        if (zeros || 0 < count)
           counts.add(new CountPair<String,Integer>
-                     (ft.indexedToReadable(t.text()), count));
+                     (t.text(), count));
 
       }
     } while (te.next());
 
     NamedList res = new NamedList();
     for (CountPair<String,Integer> p : counts) {
-      res.add(p.key, p.val);
+      res.add(ft.indexedToReadable(p.key), p.val);
     }
+
+    if (missing) {
+      res.add(null, getFieldMissingCount(searcher,docs,field));
+    }
+
     return res;
   }
 
@@ -220,7 +296,7 @@ public class SimpleFacets {
    * <b>higher</b> vals come before lower vals.
    * In case of tie vals, then <b>lower</b> keys come before higher keys.
    */
-  public static class CountPair<K extends Comparable<? super K>, V extends Comparable<? super V>> 
+  public static class CountPair<K extends Comparable<? super K>, V extends Comparable<? super V>>
     implements Comparable<CountPair<K,V>> {
 
     public CountPair(K k, V v) {
@@ -232,7 +308,7 @@ public class SimpleFacets {
       return key.hashCode() ^ val.hashCode();
     }
     public boolean equals(Object o) {
-      return (o instanceof CountPair) 
+      return (o instanceof CountPair)
         && (0 == this.compareTo((CountPair<K,V>) o));
     }
     public int compareTo(CountPair<K,V> o) {
diff --git a/src/java/org/apache/solr/request/SolrQueryRequest.java b/src/java/org/apache/solr/request/SolrQueryRequest.java
index 1712aaed9d4..050b01eb561 100644
--- a/src/java/org/apache/solr/request/SolrQueryRequest.java
+++ b/src/java/org/apache/solr/request/SolrQueryRequest.java
@@ -20,19 +20,36 @@ import org.apache.solr.search.SolrIndexSearcher;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.core.SolrCore;
 
+import java.util.Map;
+
 /**
- * Container for a request to execute a query.
+ * <p>Container for a request to execute a query.</p>
+ * <p><code>SolrQueryRequest</code> is not thread safe.</p>
  * 
  * @author yonik
  * @version $Id$
  */
 public interface SolrQueryRequest {
+
+  /** returns the current request parameters */
   public SolrParams getParams();
 
+  /** Change the parameters for this request.  This does not affect
+   *  the original parameters returned by getOriginalParams()
+   */
   public void setParams(SolrParams params);
 
+  /** Returns the original request parameters.  As this
+   * does not normally include configured defaults
+   * it's more suitable for logging.
+   */
   public SolrParams getOriginalParams();
 
+  /**
+   * Generic information associated with this request that may be both read and updated.
+   */
+  public Map<Object,Object> getContext();
+
   /**
    * This method should be called when all uses of this request are
    * finished, so that resources can be freed.
diff --git a/src/java/org/apache/solr/schema/FieldType.java b/src/java/org/apache/solr/schema/FieldType.java
index 9996803f0ad..0e47ad354d7 100644
--- a/src/java/org/apache/solr/schema/FieldType.java
+++ b/src/java/org/apache/solr/schema/FieldType.java
@@ -57,7 +57,7 @@ public abstract class FieldType extends FieldProperties {
   int properties;
 
   /** Returns true if fields of this type should be tokenized */
-  protected boolean isTokenized() {
+  public boolean isTokenized() {
     return (properties & TOKENIZED) != 0;
   }