From 8c79297a754cca22833baf9c22dc6669ae29b92f Mon Sep 17 00:00:00 2001 From: Yonik Seeley Date: Thu, 21 Sep 2006 21:37:36 +0000 Subject: [PATCH] use FieldCache for faceting single-term non-bool fields git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@448695 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 4 + .../org/apache/solr/request/SimpleFacets.java | 170 +++++++++++++----- .../apache/solr/request/SolrQueryRequest.java | 19 +- .../org/apache/solr/schema/FieldType.java | 2 +- 4 files changed, 146 insertions(+), 49 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 200bb2ed4bd..4190e233bd4 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -83,6 +83,10 @@ Optimizations 5. Optimized getDocSet() for term queries resulting in a 36% speedup of facet.field queries where DocSets aren't cached (for example, if the number of terms in the field is larger than the filter cache.) (yonik) + 6. Optimized facet.field faceting by as much as 500 times when the field has + a single token per document (not multiValued & not tokenized) by using the + Lucene FieldCache entry for that field to tally term counts. The first request + utilizing the FieldCache will take longer than subsequent ones. Bug Fixes 1. 
Fixed delete-by-id for field types who's indexed form is different diff --git a/src/java/org/apache/solr/request/SimpleFacets.java b/src/java/org/apache/solr/request/SimpleFacets.java index c85091db600..40a0d0604fc 100644 --- a/src/java/org/apache/solr/request/SimpleFacets.java +++ b/src/java/org/apache/solr/request/SimpleFacets.java @@ -20,23 +20,20 @@ import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryParser.ParseException; -import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.*; import org.apache.solr.core.SolrCore; import org.apache.solr.core.SolrException; import org.apache.solr.request.SolrParams; -import org.apache.solr.request.SolrQueryRequest; -import org.apache.solr.request.SolrQueryResponse; -import org.apache.solr.request.DefaultSolrParams; import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.FieldType; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.schema.BoolField; import org.apache.solr.search.*; import org.apache.solr.util.NamedList; import org.apache.solr.util.BoundedTreeSet; import java.io.IOException; import java.util.*; -import java.util.logging.Level; /** * A class that generates simple Facet information for a request. @@ -52,15 +49,16 @@ public class SimpleFacets { protected SolrParams params; /** Searcher to use for all calculations */ protected SolrIndexSearcher searcher; - - public SimpleFacets(SolrIndexSearcher searcher, - DocSet docs, + + + public SimpleFacets(SolrIndexSearcher searcher, + DocSet docs, SolrParams params) { this.searcher = searcher; this.docs = docs; this.params = params; } - + /** * Looks at various Params to determing if any simple Facet Constraint count * computations are desired. 
@@ -73,7 +71,7 @@ public class SimpleFacets { public NamedList getFacetCounts() { // if someone called this method, benefit of the doubt: assume true - if (!params.getBool(params.FACET,true)) + if (!params.getBool(params.FACET,true)) return null; NamedList res = new NamedList(); @@ -82,7 +80,7 @@ public class SimpleFacets { res.add("facet_queries", getFacetQueryCounts()); res.add("facet_fields", getFacetFieldCounts()); - + } catch (Exception e) { SolrException.logOnce(SolrCore.log, "Exception during facet counts", e); res.add("exception", SolrException.toStr(e)); @@ -97,7 +95,7 @@ public class SimpleFacets { * @see SolrParams#FACET_QUERY */ public NamedList getFacetQueryCounts() throws IOException,ParseException { - + NamedList res = new NamedList(); /* Ignore SolrParams.DF - could have init param facet.query assuming @@ -106,7 +104,7 @@ public class SimpleFacets { * explicit. */ SolrQueryParser qp = new SolrQueryParser(searcher.getSchema(),null); - + String[] facetQs = params.getParams(SolrParams.FACET_QUERY); if (null != facetQs && 0 != facetQs.length) { for (String q : facetQs) { @@ -117,30 +115,44 @@ public class SimpleFacets { return res; } + + public NamedList getTermCounts(String field) throws IOException { + int limit = params.getFieldInt(field, params.FACET_LIMIT, 100); + boolean zeros = params.getFieldBool(field, params.FACET_ZEROS, true); + boolean missing = params.getFieldBool(field, params.FACET_MISSING, false); + + NamedList counts; + SchemaField sf = searcher.getSchema().getField(field); + FieldType ft = sf.getType(); + if (sf.multiValued() || ft.isTokenized() || ft instanceof BoolField) { + // Always use filters for booleans... we know the number of values is very small. + counts = getFacetTermEnumCounts(searcher,docs,field,limit,zeros,missing); + } else { + // TODO: future logic could use filters instead of the fieldcache if + // the number of terms in the field is small enough. 
+ counts = getFieldCacheCounts(searcher, docs, field, limit, zeros, missing); + } + + return counts; + } + + /** * Returns a list of value constraints and the associated facet counts * for each facet field specified in the params. * * @see SolrParams#FACET_FIELD - * @see #getFacetFieldMissingCount + * @see #getFieldMissingCount * @see #getFacetTermEnumCounts */ - public NamedList getFacetFieldCounts() - throws IOException { - + public NamedList getFacetFieldCounts() + throws IOException { + NamedList res = new NamedList(); String[] facetFs = params.getParams(SolrParams.FACET_FIELD); - if (null != facetFs && 0 != facetFs.length) { - + if (null != facetFs) { for (String f : facetFs) { - - NamedList counts = getFacetTermEnumCounts(f); - - if (params.getFieldBool(f, params.FACET_MISSING, false)) - counts.add(null, getFacetFieldMissingCount(f)); - - res.add(f, counts); - + res.add(f, getTermCounts(f)); } } return res; @@ -152,7 +164,7 @@ public class SimpleFacets { * * @see SolrParams#FACET_MISSING */ - public int getFacetFieldMissingCount(String fieldName) + public static int getFieldMissingCount(SolrIndexSearcher searcher, DocSet docs, String fieldName) throws IOException { DocSet hasVal = searcher.getDocSet @@ -160,58 +172,122 @@ public class SimpleFacets { return docs.andNotSize(hasVal); } + /** + * Use the Lucene FieldCache to get counts for each unique field value in docs. + * The field must have at most one indexed token per document. + */ + public static NamedList getFieldCacheCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int limit, boolean zeros, boolean missing) throws IOException { + // TODO: If the number of terms is high compared to docs.size(), and zeros==false, + // we should use an alternate strategy to avoid + // 1) creating another huge int[] for the counts + // 2) looping over that huge int[] looking for the rare non-zeros. 
+ // + // Yet another variation: if docs.size() is small and termvectors are stored, + // then use them instead of the FieldCache. + // + + FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName); + int[] count = new int[si.lookup.length]; + DocIterator iter = docs.iterator(); + while (iter.hasNext()) { + count[si.order[iter.nextDoc()]]++; + } + + FieldType ft = searcher.getSchema().getFieldType(fieldName); + NamedList res = new NamedList(); + + // IDEA: we could also maintain a count of "other"... everything that fell outside + // of the top 'N' + + BoundedTreeSet> queue=null; + + if (limit>=0) { + // TODO: compare performance of BoundedTreeSet compare against priority queue? + queue = new BoundedTreeSet>(limit); + } + + int min=-1; // the smallest value in the top 'N' values + for (int i=1; imin) { + // NOTE: we use c>min rather than c>=min as an optimization because we are going in + // index order, so we already know that the keys are ordered. This can be very + // important if a lot of the counts are repeated (like zero counts would be). + queue.add(new CountPair(ft.indexedToReadable(si.lookup[i]), c)); + if (queue.size()>=limit) min=queue.last().val; + } + } + + if (limit>=0) { + for (CountPair p : queue) { + res.add(p.key, p.val); + } + } + + + if (missing) res.add(null, count[0]); + return res; + } + /** * Returns a list of terms in the specified field along with the * corrisponding count of documents in the set that match that constraint. + * This method uses the FilterCache to get the intersection count between docs + * and the DocSet for each term in the filter. * * @see SolrParams#FACET_LIMIT * @see SolrParams#FACET_ZEROS + * @see SolrParams#FACET_MISSING */ - public NamedList getFacetTermEnumCounts(String fieldName) + public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int limit, boolean zeros, boolean missing) throws IOException { - + /* :TODO: potential optimization... 
- * cache the Terms with the highest docFreq and try them first - * don't enum if we get our max from them - */ - + * cache the Terms with the highest docFreq and try them first + * don't enum if we get our max from them + */ + IndexSchema schema = searcher.getSchema(); IndexReader r = searcher.getReader(); - FieldType ft = schema.getFieldType(fieldName); + FieldType ft = schema.getFieldType(field); - Set> counts + Set> counts = new HashSet>(); - int limit = params.getFieldInt(fieldName, params.FACET_LIMIT, 100); if (0 <= limit) { counts = new BoundedTreeSet>(limit); } - boolean zeros = params.getFieldBool(fieldName, params.FACET_ZEROS, true); - - TermEnum te = r.terms(new Term(fieldName,"")); + TermEnum te = r.terms(new Term(field,"")); do { Term t = te.term(); - if (null == t || ! t.field().equals(fieldName)) + if (null == t || ! t.field().equals(field)) break; if (0 < te.docFreq()) { /* all docs may be deleted */ int count = searcher.numDocs(new TermQuery(t), docs); - /* :TODO: is indexedToReadable correct? */ - if (zeros || 0 < count) + if (zeros || 0 < count) counts.add(new CountPair - (ft.indexedToReadable(t.text()), count)); + (t.text(), count)); } } while (te.next()); NamedList res = new NamedList(); for (CountPair p : counts) { - res.add(p.key, p.val); + res.add(ft.indexedToReadable(p.key), p.val); } + + if (missing) { + res.add(null, getFieldMissingCount(searcher,docs,field)); + } + return res; } @@ -220,7 +296,7 @@ public class SimpleFacets { * higher vals come before lower vals. * In case of tie vals, then lower keys come before higher keys. 
*/ - public static class CountPair, V extends Comparable> + public static class CountPair, V extends Comparable> implements Comparable> { public CountPair(K k, V v) { @@ -232,7 +308,7 @@ public class SimpleFacets { return key.hashCode() ^ val.hashCode(); } public boolean equals(Object o) { - return (o instanceof CountPair) + return (o instanceof CountPair) && (0 == this.compareTo((CountPair) o)); } public int compareTo(CountPair o) { diff --git a/src/java/org/apache/solr/request/SolrQueryRequest.java b/src/java/org/apache/solr/request/SolrQueryRequest.java index 1712aaed9d4..050b01eb561 100644 --- a/src/java/org/apache/solr/request/SolrQueryRequest.java +++ b/src/java/org/apache/solr/request/SolrQueryRequest.java @@ -20,19 +20,36 @@ import org.apache.solr.search.SolrIndexSearcher; import org.apache.solr.schema.IndexSchema; import org.apache.solr.core.SolrCore; +import java.util.Map; + /** - * Container for a request to execute a query. + *

Container for a request to execute a query.

+ *

SolrQueryRequest is not thread safe.

* * @author yonik * @version $Id$ */ public interface SolrQueryRequest { + + /** returns the current request parameters */ public SolrParams getParams(); + /** Change the parameters for this request. This does not affect + * the original parameters returned by getOriginalParams() + */ public void setParams(SolrParams params); + /** Returns the original request parameters. As this + * does not normally include configured defaults + * it's more suitable for logging. + */ public SolrParams getOriginalParams(); + /** + * Generic information associated with this request that may be both read and updated. + */ + public Map getContext(); + /** * This method should be called when all uses of this request are * finished, so that resources can be freed. diff --git a/src/java/org/apache/solr/schema/FieldType.java b/src/java/org/apache/solr/schema/FieldType.java index 9996803f0ad..0e47ad354d7 100644 --- a/src/java/org/apache/solr/schema/FieldType.java +++ b/src/java/org/apache/solr/schema/FieldType.java @@ -57,7 +57,7 @@ public abstract class FieldType extends FieldProperties { int properties; /** Returns true if fields of this type should be tokenized */ - protected boolean isTokenized() { + public boolean isTokenized() { return (properties & TOKENIZED) != 0; }