mirror of https://github.com/apache/lucene.git
use FieldCache for faceting single-term non-bool fields
git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@448695 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
6646560c9d
commit
8c79297a75
|
@ -83,6 +83,10 @@ Optimizations
|
|||
5. Optimized getDocSet() for term queries resulting in a 36% speedup of facet.field
|
||||
queries where DocSets aren't cached (for example, if the number of terms in the field
|
||||
is larger than the filter cache.) (yonik)
|
||||
6. Optimized facet.field faceting by as much as 500 times when the field has
|
||||
a single token per document (not multiValued & not tokenized) by using the
|
||||
Lucene FieldCache entry for that field to tally term counts. The first request
|
||||
utilizing the FieldCache will take longer than subsequent ones.
|
||||
|
||||
Bug Fixes
|
||||
1. Fixed delete-by-id for field types whose indexed form is different
|
||||
|
|
|
@ -20,23 +20,20 @@ import org.apache.lucene.index.Term;
|
|||
import org.apache.lucene.index.TermEnum;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.queryParser.ParseException;
|
||||
import org.apache.lucene.queryParser.QueryParser;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.core.SolrException;
|
||||
import org.apache.solr.request.SolrParams;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.request.SolrQueryResponse;
|
||||
import org.apache.solr.request.DefaultSolrParams;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
import org.apache.solr.schema.SchemaField;
|
||||
import org.apache.solr.schema.BoolField;
|
||||
import org.apache.solr.search.*;
|
||||
import org.apache.solr.util.NamedList;
|
||||
import org.apache.solr.util.BoundedTreeSet;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
import java.util.logging.Level;
|
||||
|
||||
/**
|
||||
* A class that generates simple Facet information for a request.
|
||||
|
@ -53,6 +50,7 @@ public class SimpleFacets {
|
|||
/** Searcher to use for all calculations */
|
||||
protected SolrIndexSearcher searcher;
|
||||
|
||||
|
||||
public SimpleFacets(SolrIndexSearcher searcher,
|
||||
DocSet docs,
|
||||
SolrParams params) {
|
||||
|
@ -117,30 +115,44 @@ public class SimpleFacets {
|
|||
return res;
|
||||
}
|
||||
|
||||
|
||||
public NamedList getTermCounts(String field) throws IOException {
|
||||
int limit = params.getFieldInt(field, params.FACET_LIMIT, 100);
|
||||
boolean zeros = params.getFieldBool(field, params.FACET_ZEROS, true);
|
||||
boolean missing = params.getFieldBool(field, params.FACET_MISSING, false);
|
||||
|
||||
NamedList counts;
|
||||
SchemaField sf = searcher.getSchema().getField(field);
|
||||
FieldType ft = sf.getType();
|
||||
if (sf.multiValued() || ft.isTokenized() || ft instanceof BoolField) {
|
||||
// Always use filters for booleans... we know the number of values is very small.
|
||||
counts = getFacetTermEnumCounts(searcher,docs,field,limit,zeros,missing);
|
||||
} else {
|
||||
// TODO: future logic could use filters instead of the fieldcache if
|
||||
// the number of terms in the field is small enough.
|
||||
counts = getFieldCacheCounts(searcher, docs, field, limit, zeros, missing);
|
||||
}
|
||||
|
||||
return counts;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns a list of value constraints and the associated facet counts
|
||||
* for each facet field specified in the params.
|
||||
*
|
||||
* @see SolrParams#FACET_FIELD
|
||||
* @see #getFacetFieldMissingCount
|
||||
* @see #getFieldMissingCount
|
||||
* @see #getFacetTermEnumCounts
|
||||
*/
|
||||
public NamedList getFacetFieldCounts()
|
||||
throws IOException {
|
||||
throws IOException {
|
||||
|
||||
NamedList res = new NamedList();
|
||||
String[] facetFs = params.getParams(SolrParams.FACET_FIELD);
|
||||
if (null != facetFs && 0 != facetFs.length) {
|
||||
|
||||
if (null != facetFs) {
|
||||
for (String f : facetFs) {
|
||||
|
||||
NamedList counts = getFacetTermEnumCounts(f);
|
||||
|
||||
if (params.getFieldBool(f, params.FACET_MISSING, false))
|
||||
counts.add(null, getFacetFieldMissingCount(f));
|
||||
|
||||
res.add(f, counts);
|
||||
|
||||
res.add(f, getTermCounts(f));
|
||||
}
|
||||
}
|
||||
return res;
|
||||
|
@ -152,7 +164,7 @@ public class SimpleFacets {
|
|||
*
|
||||
* @see SolrParams#FACET_MISSING
|
||||
*/
|
||||
public int getFacetFieldMissingCount(String fieldName)
|
||||
public static int getFieldMissingCount(SolrIndexSearcher searcher, DocSet docs, String fieldName)
|
||||
throws IOException {
|
||||
|
||||
DocSet hasVal = searcher.getDocSet
|
||||
|
@ -160,58 +172,122 @@ public class SimpleFacets {
|
|||
return docs.andNotSize(hasVal);
|
||||
}
|
||||
|
||||
/**
|
||||
* Use the Lucene FieldCache to get counts for each unique field value in <code>docs</code>.
|
||||
* The field must have at most one indexed token per document.
|
||||
*/
|
||||
public static NamedList getFieldCacheCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int limit, boolean zeros, boolean missing) throws IOException {
|
||||
// TODO: If the number of terms is high compared to docs.size(), and zeros==false,
|
||||
// we should use an alternate strategy to avoid
|
||||
// 1) creating another huge int[] for the counts
|
||||
// 2) looping over that huge int[] looking for the rare non-zeros.
|
||||
//
|
||||
// Yet another variation: if docs.size() is small and termvectors are stored,
|
||||
// then use them instead of the FieldCache.
|
||||
//
|
||||
|
||||
FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName);
|
||||
int[] count = new int[si.lookup.length];
|
||||
DocIterator iter = docs.iterator();
|
||||
while (iter.hasNext()) {
|
||||
count[si.order[iter.nextDoc()]]++;
|
||||
}
|
||||
|
||||
FieldType ft = searcher.getSchema().getFieldType(fieldName);
|
||||
NamedList res = new NamedList();
|
||||
|
||||
// IDEA: we could also maintain a count of "other"... everything that fell outside
|
||||
// of the top 'N'
|
||||
|
||||
BoundedTreeSet<CountPair<String,Integer>> queue=null;
|
||||
|
||||
if (limit>=0) {
|
||||
// TODO: compare performance of BoundedTreeSet against a priority queue?
|
||||
queue = new BoundedTreeSet<CountPair<String,Integer>>(limit);
|
||||
}
|
||||
|
||||
int min=-1; // the smallest value in the top 'N' values
|
||||
for (int i=1; i<count.length; i++) {
|
||||
int c = count[i];
|
||||
if (c==0 && !zeros) continue;
|
||||
if (limit<0) {
|
||||
res.add(ft.indexedToReadable(si.lookup[i]), c);
|
||||
} else if (c>min) {
|
||||
// NOTE: we use c>min rather than c>=min as an optimization because we are going in
|
||||
// index order, so we already know that the keys are ordered. This can be very
|
||||
// important if a lot of the counts are repeated (like zero counts would be).
|
||||
queue.add(new CountPair<String,Integer>(ft.indexedToReadable(si.lookup[i]), c));
|
||||
if (queue.size()>=limit) min=queue.last().val;
|
||||
}
|
||||
}
|
||||
|
||||
if (limit>=0) {
|
||||
for (CountPair<String,Integer> p : queue) {
|
||||
res.add(p.key, p.val);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (missing) res.add(null, count[0]);
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a list of terms in the specified field along with the
|
||||
* corresponding count of documents in the set that match that constraint.
|
||||
* This method uses the FilterCache to get the intersection count between <code>docs</code>
|
||||
and the DocSet for each term in the field.
|
||||
*
|
||||
* @see SolrParams#FACET_LIMIT
|
||||
* @see SolrParams#FACET_ZEROS
|
||||
* @see SolrParams#FACET_MISSING
|
||||
*/
|
||||
public NamedList getFacetTermEnumCounts(String fieldName)
|
||||
public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int limit, boolean zeros, boolean missing)
|
||||
throws IOException {
|
||||
|
||||
/* :TODO: potential optimization...
|
||||
* cache the Terms with the highest docFreq and try them first
|
||||
* don't enum if we get our max from them
|
||||
*/
|
||||
* cache the Terms with the highest docFreq and try them first
|
||||
* don't enum if we get our max from them
|
||||
*/
|
||||
|
||||
IndexSchema schema = searcher.getSchema();
|
||||
IndexReader r = searcher.getReader();
|
||||
FieldType ft = schema.getFieldType(fieldName);
|
||||
FieldType ft = schema.getFieldType(field);
|
||||
|
||||
Set<CountPair<String,Integer>> counts
|
||||
= new HashSet<CountPair<String,Integer>>();
|
||||
|
||||
int limit = params.getFieldInt(fieldName, params.FACET_LIMIT, 100);
|
||||
if (0 <= limit) {
|
||||
counts = new BoundedTreeSet<CountPair<String,Integer>>(limit);
|
||||
}
|
||||
|
||||
boolean zeros = params.getFieldBool(fieldName, params.FACET_ZEROS, true);
|
||||
|
||||
TermEnum te = r.terms(new Term(fieldName,""));
|
||||
TermEnum te = r.terms(new Term(field,""));
|
||||
do {
|
||||
Term t = te.term();
|
||||
|
||||
if (null == t || ! t.field().equals(fieldName))
|
||||
if (null == t || ! t.field().equals(field))
|
||||
break;
|
||||
|
||||
if (0 < te.docFreq()) { /* all docs may be deleted */
|
||||
int count = searcher.numDocs(new TermQuery(t),
|
||||
docs);
|
||||
|
||||
/* :TODO: is indexedToReadable correct? */
|
||||
if (zeros || 0 < count)
|
||||
counts.add(new CountPair<String,Integer>
|
||||
(ft.indexedToReadable(t.text()), count));
|
||||
(t.text(), count));
|
||||
|
||||
}
|
||||
} while (te.next());
|
||||
|
||||
NamedList res = new NamedList();
|
||||
for (CountPair<String,Integer> p : counts) {
|
||||
res.add(p.key, p.val);
|
||||
res.add(ft.indexedToReadable(p.key), p.val);
|
||||
}
|
||||
|
||||
if (missing) {
|
||||
res.add(null, getFieldMissingCount(searcher,docs,field));
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
|
|
|
@ -20,19 +20,36 @@ import org.apache.solr.search.SolrIndexSearcher;
|
|||
import org.apache.solr.schema.IndexSchema;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Container for a request to execute a query.
|
||||
* <p>Container for a request to execute a query.</p>
|
||||
* <p><code>SolrQueryRequest</code> is not thread safe.</p>
|
||||
*
|
||||
* @author yonik
|
||||
* @version $Id$
|
||||
*/
|
||||
public interface SolrQueryRequest {
|
||||
|
||||
/** returns the current request parameters */
|
||||
public SolrParams getParams();
|
||||
|
||||
/** Change the parameters for this request. This does not affect
|
||||
* the original parameters returned by getOriginalParams()
|
||||
*/
|
||||
public void setParams(SolrParams params);
|
||||
|
||||
/** Returns the original request parameters. As this
|
||||
* does not normally include configured defaults
|
||||
* it's more suitable for logging.
|
||||
*/
|
||||
public SolrParams getOriginalParams();
|
||||
|
||||
/**
|
||||
* Generic information associated with this request that may be both read and updated.
|
||||
*/
|
||||
public Map<Object,Object> getContext();
|
||||
|
||||
/**
|
||||
* This method should be called when all uses of this request are
|
||||
* finished, so that resources can be freed.
|
||||
|
|
|
@ -57,7 +57,7 @@ public abstract class FieldType extends FieldProperties {
|
|||
int properties;
|
||||
|
||||
/** Returns true if fields of this type should be tokenized */
|
||||
protected boolean isTokenized() {
|
||||
public boolean isTokenized() {
|
||||
return (properties & TOKENIZED) != 0;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue