use FieldCache for faceting single-term non-bool fields

git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@448695 13f79535-47bb-0310-9956-ffa450edef68
2006-09-21 21:37:36 +00:00 · 2006-09-21 21:37:36 +00:00 · 8c79297a75
parent 6646560c9d
commit 8c79297a75
4 changed files with 146 additions and 49 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -83,6 +83,10 @@ Optimizations
 5. Optimized getDocSet() for term queries resulting in a 36% speedup of facet.field
    queries where DocSets aren't cached (for example, if the number of terms in the field
    is larger than the filter cache.) (yonik)
 6. Optimized facet.field faceting by as much as 500 times when the field has
    a single token per document (not multiValued & not tokenized) by using the
    Lucene FieldCache entry for that field to tally term counts.  The first request
    utilizing the FieldCache will take longer than subsequent ones.
 Bug Fixes
 1. Fixed delete-by-id for field types whose indexed form is different
--- a/src/java/org/apache/solr/request/SimpleFacets.java
+++ b/src/java/org/apache/solr/request/SimpleFacets.java
@ -20,23 +20,20 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermEnum;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.search.*;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.core.SolrException;
 import org.apache.solr.request.SolrParams;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.request.SolrQueryResponse;
 import org.apache.solr.request.DefaultSolrParams;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.schema.FieldType;
 import org.apache.solr.schema.SchemaField;
 import org.apache.solr.schema.BoolField;
 import org.apache.solr.search.*;
 import org.apache.solr.util.NamedList;
 import org.apache.solr.util.BoundedTreeSet;
 import java.io.IOException;
 import java.util.*;
 import java.util.logging.Level;
 /**
 * A class that generates simple Facet information for a request.
@ -53,6 +50,7 @@ public class SimpleFacets {
  /** Searcher to use for all calculations */
  protected SolrIndexSearcher searcher;
  public SimpleFacets(SolrIndexSearcher searcher,
                      DocSet docs,
                      SolrParams params) {
@ -117,30 +115,44 @@ public class SimpleFacets {
    return res;
  }
  /**
   * Computes the facet term counts for a single field, choosing the counting
   * strategy from the field's schema definition.
   *
   * <p>The limit, zeros and missing settings are read per-field from the request
   * params (defaults: 100, true, false). Multi-valued, tokenized and boolean
   * fields are counted with {@link #getFacetTermEnumCounts}; every other field
   * is counted with {@link #getFieldCacheCounts}.</p>
   *
   * @param field the name of the field to facet on
   * @return the term counts produced by the selected strategy
   * @throws IOException propagated from the selected counting method
   */
  public NamedList getTermCounts(String field) throws IOException {
    final int maxTerms = params.getFieldInt(field, params.FACET_LIMIT, 100);
    final boolean includeZeros = params.getFieldBool(field, params.FACET_ZEROS, true);
    final boolean includeMissing = params.getFieldBool(field, params.FACET_MISSING, false);

    final SchemaField schemaField = searcher.getSchema().getField(field);
    final FieldType fieldType = schemaField.getType();

    // The FieldCache approach needs at most one indexed token per document.
    // Booleans are excluded as well: we know the number of values is very small,
    // so filters are always used for them.
    final boolean fieldCacheEligible = !schemaField.multiValued()
        && !fieldType.isTokenized()
        && !(fieldType instanceof BoolField);

    if (fieldCacheEligible) {
      // TODO: future logic could use filters instead of the fieldcache if
      // the number of terms in the field is small enough.
      return getFieldCacheCounts(searcher, docs, field, maxTerms, includeZeros, includeMissing);
    }
    return getFacetTermEnumCounts(searcher, docs, field, maxTerms, includeZeros, includeMissing);
  }
  /**
   * Returns a list of value constraints and the associated facet counts 
   * for each facet field specified in the params.
   *
   * @see SolrParams#FACET_FIELD
-   * @see #getFacetFieldMissingCount
+   * @see #getFieldMissingCount
   * @see #getFacetTermEnumCounts
   */
  public NamedList getFacetFieldCounts()
-    throws IOException {
+          throws IOException {
    NamedList res = new NamedList();
    String[] facetFs = params.getParams(SolrParams.FACET_FIELD);
-    if (null != facetFs && 0 != facetFs.length) {
+    if (null != facetFs) {
      for (String f : facetFs) {
-
+        res.add(f, getTermCounts(f));
        NamedList counts = getFacetTermEnumCounts(f);
        if (params.getFieldBool(f, params.FACET_MISSING, false))
          counts.add(null, getFacetFieldMissingCount(f));
        res.add(f, counts);
      }
    }
    return res;
@ -152,7 +164,7 @@ public class SimpleFacets {
   *
   * @see SolrParams#FACET_MISSING
   */
-  public int getFacetFieldMissingCount(String fieldName)
+  public static int getFieldMissingCount(SolrIndexSearcher searcher, DocSet docs, String fieldName)
    throws IOException {
    DocSet hasVal = searcher.getDocSet
@ -160,58 +172,122 @@ public class SimpleFacets {
    return docs.andNotSize(hasVal);
  }
  /**
   * Use the Lucene FieldCache to get counts for each unique field value in <code>docs</code>.
   * The field must have at most one indexed token per document.
   *
   * @param searcher  supplies the index reader and the schema for <code>fieldName</code>
   * @param docs      the set of documents to tally
   * @param fieldName the field whose values are counted
   * @param limit     maximum number of values to return; a negative value returns every
   *                  value, in index order. A limit of zero returns no values.
   * @param zeros     if false, values whose count is zero are omitted
   * @param missing   if true, a final entry with a <code>null</code> name is appended,
   *                  holding the tally accumulated in slot 0 of the string index
   * @return the readable field values paired with their counts
   * @throws IOException propagated from the underlying index access
   */
  public static NamedList getFieldCacheCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int limit, boolean zeros, boolean missing) throws IOException {
    // TODO: If the number of terms is high compared to docs.size(), and zeros==false,
    //  we should use an alternate strategy to avoid
    //  1) creating another huge int[] for the counts
    //  2) looping over that huge int[] looking for the rare non-zeros.
    //
    // Yet another variation: if docs.size() is small and termvectors are stored,
    // then use them instead of the FieldCache.
    //
    FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName);

    // One counter per entry in the lookup table; each doc increments the counter
    // selected by its ordinal.
    int[] count = new int[si.lookup.length];
    DocIterator iter = docs.iterator();
    while (iter.hasNext()) {
      count[si.order[iter.nextDoc()]]++;
    }

    FieldType ft = searcher.getSchema().getFieldType(fieldName);
    NamedList res = new NamedList();

    // IDEA: we could also maintain a count of "other"... everything that fell outside
    // of the top 'N'
    BoundedTreeSet<CountPair<String,Integer>> queue=null;
    if (limit>=0) {
      // TODO: compare performance of BoundedTreeSet against a priority queue?
      queue = new BoundedTreeSet<CountPair<String,Integer>>(limit);
    }

    int min=-1;  // the smallest value in the top 'N' values
    // Slot 0 is skipped here; it is only reported as the "missing" count below.
    for (int i=1; i<count.length; i++) {
      int c = count[i];
      if (c==0 && !zeros) continue;
      if (limit<0) {
        res.add(ft.indexedToReadable(si.lookup[i]), c);
      } else if (c>min) {
        // NOTE: we use c>min rather than c>=min as an optimization because we are going in
        // index order, so we already know that the keys are ordered.  This can be very
        // important if a lot of the counts are repeated (like zero counts would be).
        queue.add(new CountPair<String,Integer>(ft.indexedToReadable(si.lookup[i]), c));
        // With limit==0 the size test is true even when the set holds nothing, and
        // last() on an empty sorted set throws NoSuchElementException, so check first.
        if (!queue.isEmpty() && queue.size()>=limit) min=queue.last().val;
      }
    }

    if (limit>=0) {
      for (CountPair<String,Integer> p : queue) {
        res.add(p.key, p.val);
      }
    }

    if (missing) res.add(null, count[0]);
    return res;
  }
  /**
   * Returns a list of terms in the specified field along with the 
   * corresponding count of documents in the set that match that constraint.
   * This method uses the FilterCache to get the intersection count between <code>docs</code>
   * and the DocSet for each term in the filter.
   *
   * @see SolrParams#FACET_LIMIT
   * @see SolrParams#FACET_ZEROS
   * @see SolrParams#FACET_MISSING
   */
-  public NamedList getFacetTermEnumCounts(String fieldName) 
+  public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int limit, boolean zeros, boolean missing)
    throws IOException {
    /* :TODO: potential optimization...
-     * cache the Terms with the highest docFreq and try them first
+    * cache the Terms with the highest docFreq and try them first
-     * don't enum if we get our max from them
+    * don't enum if we get our max from them
-     */
+    */
    IndexSchema schema = searcher.getSchema();
    IndexReader r = searcher.getReader();
-    FieldType ft = schema.getFieldType(fieldName);
+    FieldType ft = schema.getFieldType(field);
    Set<CountPair<String,Integer>> counts
      = new HashSet<CountPair<String,Integer>>();
    int limit = params.getFieldInt(fieldName, params.FACET_LIMIT, 100);
    if (0 <= limit) {
      counts = new BoundedTreeSet<CountPair<String,Integer>>(limit);
    }
-    boolean zeros = params.getFieldBool(fieldName, params.FACET_ZEROS, true);
+    TermEnum te = r.terms(new Term(field,""));
    TermEnum te = r.terms(new Term(fieldName,""));
    do {
      Term t = te.term();
-      if (null == t || ! t.field().equals(fieldName)) 
+      if (null == t || ! t.field().equals(field))
        break;
      if (0 < te.docFreq()) { /* all docs may be deleted */
        int count = searcher.numDocs(new TermQuery(t),
                                     docs);
        /* :TODO: is indexedToReadable correct? */ 
        if (zeros || 0 < count)
          counts.add(new CountPair<String,Integer>
-                     (ft.indexedToReadable(t.text()), count));
+                     (t.text(), count));
      }
    } while (te.next());
    NamedList res = new NamedList();
    for (CountPair<String,Integer> p : counts) {
-      res.add(p.key, p.val);
+      res.add(ft.indexedToReadable(p.key), p.val);
    }
    if (missing) {
      res.add(null, getFieldMissingCount(searcher,docs,field));
    }
    return res;
  }
--- a/src/java/org/apache/solr/request/SolrQueryRequest.java
+++ b/src/java/org/apache/solr/request/SolrQueryRequest.java
@ -20,19 +20,36 @@ import org.apache.solr.search.SolrIndexSearcher;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.core.SolrCore;
 import java.util.Map;
 /**
- * Container for a request to execute a query.
+ * <p>Container for a request to execute a query.</p>
 * <p><code>SolrQueryRequest</code> is not thread safe.</p>
 * 
 * @author yonik
 * @version $Id$
 */
 public interface SolrQueryRequest {
  /** returns the current request parameters */
  public SolrParams getParams();
  /** Change the parameters for this request.  This does not affect
   *  the original parameters returned by getOriginalParams()
   */
  public void setParams(SolrParams params);
  /** Returns the original request parameters.  As this
   * does not normally include configured defaults
   * it's more suitable for logging.
   */
  public SolrParams getOriginalParams();
  /**
   * Generic information associated with this request that may be both read and updated.
   */
  public Map<Object,Object> getContext();
  /**
   * This method should be called when all uses of this request are
   * finished, so that resources can be freed.
--- a/src/java/org/apache/solr/schema/FieldType.java
+++ b/src/java/org/apache/solr/schema/FieldType.java
@ -57,7 +57,7 @@ public abstract class FieldType extends FieldProperties {
  int properties;
  /** Returns true if fields of this type should be tokenized */
-  protected boolean isTokenized() {
+  public boolean isTokenized() {
    return (properties & TOKENIZED) != 0;
  }