mirror of https://github.com/apache/lucene.git
use FieldCache for faceting single-term non-bool fields
git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@448695 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
6646560c9d
commit
8c79297a75
|
@ -83,6 +83,10 @@ Optimizations
|
||||||
5. Optimized getDocSet() for term queries resulting in a 36% speedup of facet.field
|
5. Optimized getDocSet() for term queries resulting in a 36% speedup of facet.field
|
||||||
queries where DocSets aren't cached (for example, if the number of terms in the field
|
queries where DocSets aren't cached (for example, if the number of terms in the field
|
||||||
is larger than the filter cache.) (yonik)
|
is larger than the filter cache.) (yonik)
|
||||||
|
6. Optimized facet.field faceting by as much as 500 times when the field has
|
||||||
|
a single token per document (not multiValued & not tokenized) by using the
|
||||||
|
Lucene FieldCache entry for that field to tally term counts. The first request
|
||||||
|
utilizing the FieldCache will take longer than subsequent ones.
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
1. Fixed delete-by-id for field types whose indexed form is different
|
1. Fixed delete-by-id for field types whose indexed form is different
|
||||||
|
|
|
@ -20,23 +20,20 @@ import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.index.TermEnum;
|
import org.apache.lucene.index.TermEnum;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.queryParser.ParseException;
|
import org.apache.lucene.queryParser.ParseException;
|
||||||
import org.apache.lucene.queryParser.QueryParser;
|
|
||||||
import org.apache.lucene.search.*;
|
import org.apache.lucene.search.*;
|
||||||
import org.apache.solr.core.SolrCore;
|
import org.apache.solr.core.SolrCore;
|
||||||
import org.apache.solr.core.SolrException;
|
import org.apache.solr.core.SolrException;
|
||||||
import org.apache.solr.request.SolrParams;
|
import org.apache.solr.request.SolrParams;
|
||||||
import org.apache.solr.request.SolrQueryRequest;
|
|
||||||
import org.apache.solr.request.SolrQueryResponse;
|
|
||||||
import org.apache.solr.request.DefaultSolrParams;
|
|
||||||
import org.apache.solr.schema.IndexSchema;
|
import org.apache.solr.schema.IndexSchema;
|
||||||
import org.apache.solr.schema.FieldType;
|
import org.apache.solr.schema.FieldType;
|
||||||
|
import org.apache.solr.schema.SchemaField;
|
||||||
|
import org.apache.solr.schema.BoolField;
|
||||||
import org.apache.solr.search.*;
|
import org.apache.solr.search.*;
|
||||||
import org.apache.solr.util.NamedList;
|
import org.apache.solr.util.NamedList;
|
||||||
import org.apache.solr.util.BoundedTreeSet;
|
import org.apache.solr.util.BoundedTreeSet;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.logging.Level;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A class that generates simple Facet information for a request.
|
* A class that generates simple Facet information for a request.
|
||||||
|
@ -53,6 +50,7 @@ public class SimpleFacets {
|
||||||
/** Searcher to use for all calculations */
|
/** Searcher to use for all calculations */
|
||||||
protected SolrIndexSearcher searcher;
|
protected SolrIndexSearcher searcher;
|
||||||
|
|
||||||
|
|
||||||
public SimpleFacets(SolrIndexSearcher searcher,
|
public SimpleFacets(SolrIndexSearcher searcher,
|
||||||
DocSet docs,
|
DocSet docs,
|
||||||
SolrParams params) {
|
SolrParams params) {
|
||||||
|
@ -117,30 +115,44 @@ public class SimpleFacets {
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public NamedList getTermCounts(String field) throws IOException {
|
||||||
|
int limit = params.getFieldInt(field, params.FACET_LIMIT, 100);
|
||||||
|
boolean zeros = params.getFieldBool(field, params.FACET_ZEROS, true);
|
||||||
|
boolean missing = params.getFieldBool(field, params.FACET_MISSING, false);
|
||||||
|
|
||||||
|
NamedList counts;
|
||||||
|
SchemaField sf = searcher.getSchema().getField(field);
|
||||||
|
FieldType ft = sf.getType();
|
||||||
|
if (sf.multiValued() || ft.isTokenized() || ft instanceof BoolField) {
|
||||||
|
// Always use filters for booleans... we know the number of values is very small.
|
||||||
|
counts = getFacetTermEnumCounts(searcher,docs,field,limit,zeros,missing);
|
||||||
|
} else {
|
||||||
|
// TODO: future logic could use filters instead of the fieldcache if
|
||||||
|
// the number of terms in the field is small enough.
|
||||||
|
counts = getFieldCacheCounts(searcher, docs, field, limit, zeros, missing);
|
||||||
|
}
|
||||||
|
|
||||||
|
return counts;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a list of value constraints and the associated facet counts
|
* Returns a list of value constraints and the associated facet counts
|
||||||
* for each facet field specified in the params.
|
* for each facet field specified in the params.
|
||||||
*
|
*
|
||||||
* @see SolrParams#FACET_FIELD
|
* @see SolrParams#FACET_FIELD
|
||||||
* @see #getFacetFieldMissingCount
|
* @see #getFieldMissingCount
|
||||||
* @see #getFacetTermEnumCounts
|
* @see #getFacetTermEnumCounts
|
||||||
*/
|
*/
|
||||||
public NamedList getFacetFieldCounts()
|
public NamedList getFacetFieldCounts()
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
NamedList res = new NamedList();
|
NamedList res = new NamedList();
|
||||||
String[] facetFs = params.getParams(SolrParams.FACET_FIELD);
|
String[] facetFs = params.getParams(SolrParams.FACET_FIELD);
|
||||||
if (null != facetFs && 0 != facetFs.length) {
|
if (null != facetFs) {
|
||||||
|
|
||||||
for (String f : facetFs) {
|
for (String f : facetFs) {
|
||||||
|
res.add(f, getTermCounts(f));
|
||||||
NamedList counts = getFacetTermEnumCounts(f);
|
|
||||||
|
|
||||||
if (params.getFieldBool(f, params.FACET_MISSING, false))
|
|
||||||
counts.add(null, getFacetFieldMissingCount(f));
|
|
||||||
|
|
||||||
res.add(f, counts);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
|
@ -152,7 +164,7 @@ public class SimpleFacets {
|
||||||
*
|
*
|
||||||
* @see SolrParams#FACET_MISSING
|
* @see SolrParams#FACET_MISSING
|
||||||
*/
|
*/
|
||||||
public int getFacetFieldMissingCount(String fieldName)
|
public static int getFieldMissingCount(SolrIndexSearcher searcher, DocSet docs, String fieldName)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
DocSet hasVal = searcher.getDocSet
|
DocSet hasVal = searcher.getDocSet
|
||||||
|
@ -160,58 +172,122 @@ public class SimpleFacets {
|
||||||
return docs.andNotSize(hasVal);
|
return docs.andNotSize(hasVal);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Use the Lucene FieldCache to get counts for each unique field value in <code>docs</code>.
|
||||||
|
* The field must have at most one indexed token per document.
|
||||||
|
*/
|
||||||
|
public static NamedList getFieldCacheCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int limit, boolean zeros, boolean missing) throws IOException {
|
||||||
|
// TODO: If the number of terms is high compared to docs.size(), and zeros==false,
|
||||||
|
// we should use an alternate strategy to avoid
|
||||||
|
// 1) creating another huge int[] for the counts
|
||||||
|
// 2) looping over that huge int[] looking for the rare non-zeros.
|
||||||
|
//
|
||||||
|
// Yet another variation: if docs.size() is small and termvectors are stored,
|
||||||
|
// then use them instead of the FieldCache.
|
||||||
|
//
|
||||||
|
|
||||||
|
FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName);
|
||||||
|
int[] count = new int[si.lookup.length];
|
||||||
|
DocIterator iter = docs.iterator();
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
count[si.order[iter.nextDoc()]]++;
|
||||||
|
}
|
||||||
|
|
||||||
|
FieldType ft = searcher.getSchema().getFieldType(fieldName);
|
||||||
|
NamedList res = new NamedList();
|
||||||
|
|
||||||
|
// IDEA: we could also maintain a count of "other"... everything that fell outside
|
||||||
|
// of the top 'N'
|
||||||
|
|
||||||
|
BoundedTreeSet<CountPair<String,Integer>> queue=null;
|
||||||
|
|
||||||
|
if (limit>=0) {
|
||||||
|
// TODO: compare performance of BoundedTreeSet against a priority queue?
|
||||||
|
queue = new BoundedTreeSet<CountPair<String,Integer>>(limit);
|
||||||
|
}
|
||||||
|
|
||||||
|
int min=-1; // the smallest value in the top 'N' values
|
||||||
|
for (int i=1; i<count.length; i++) {
|
||||||
|
int c = count[i];
|
||||||
|
if (c==0 && !zeros) continue;
|
||||||
|
if (limit<0) {
|
||||||
|
res.add(ft.indexedToReadable(si.lookup[i]), c);
|
||||||
|
} else if (c>min) {
|
||||||
|
// NOTE: we use c>min rather than c>=min as an optimization because we are going in
|
||||||
|
// index order, so we already know that the keys are ordered. This can be very
|
||||||
|
// important if a lot of the counts are repeated (like zero counts would be).
|
||||||
|
queue.add(new CountPair<String,Integer>(ft.indexedToReadable(si.lookup[i]), c));
|
||||||
|
if (queue.size()>=limit) min=queue.last().val;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (limit>=0) {
|
||||||
|
for (CountPair<String,Integer> p : queue) {
|
||||||
|
res.add(p.key, p.val);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (missing) res.add(null, count[0]);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a list of terms in the specified field along with the
|
* Returns a list of terms in the specified field along with the
|
||||||
* corresponding count of documents in the set that match that constraint.
|
* corresponding count of documents in the set that match that constraint.
|
||||||
|
* This method uses the FilterCache to get the intersection count between <code>docs</code>
|
||||||
|
* and the DocSet for each term in the filter.
|
||||||
*
|
*
|
||||||
* @see SolrParams#FACET_LIMIT
|
* @see SolrParams#FACET_LIMIT
|
||||||
* @see SolrParams#FACET_ZEROS
|
* @see SolrParams#FACET_ZEROS
|
||||||
|
* @see SolrParams#FACET_MISSING
|
||||||
*/
|
*/
|
||||||
public NamedList getFacetTermEnumCounts(String fieldName)
|
public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int limit, boolean zeros, boolean missing)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
/* :TODO: potential optimization...
|
/* :TODO: potential optimization...
|
||||||
* cache the Terms with the highest docFreq and try them first
|
* cache the Terms with the highest docFreq and try them first
|
||||||
* don't enum if we get our max from them
|
* don't enum if we get our max from them
|
||||||
*/
|
*/
|
||||||
|
|
||||||
IndexSchema schema = searcher.getSchema();
|
IndexSchema schema = searcher.getSchema();
|
||||||
IndexReader r = searcher.getReader();
|
IndexReader r = searcher.getReader();
|
||||||
FieldType ft = schema.getFieldType(fieldName);
|
FieldType ft = schema.getFieldType(field);
|
||||||
|
|
||||||
Set<CountPair<String,Integer>> counts
|
Set<CountPair<String,Integer>> counts
|
||||||
= new HashSet<CountPair<String,Integer>>();
|
= new HashSet<CountPair<String,Integer>>();
|
||||||
|
|
||||||
int limit = params.getFieldInt(fieldName, params.FACET_LIMIT, 100);
|
|
||||||
if (0 <= limit) {
|
if (0 <= limit) {
|
||||||
counts = new BoundedTreeSet<CountPair<String,Integer>>(limit);
|
counts = new BoundedTreeSet<CountPair<String,Integer>>(limit);
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean zeros = params.getFieldBool(fieldName, params.FACET_ZEROS, true);
|
TermEnum te = r.terms(new Term(field,""));
|
||||||
|
|
||||||
TermEnum te = r.terms(new Term(fieldName,""));
|
|
||||||
do {
|
do {
|
||||||
Term t = te.term();
|
Term t = te.term();
|
||||||
|
|
||||||
if (null == t || ! t.field().equals(fieldName))
|
if (null == t || ! t.field().equals(field))
|
||||||
break;
|
break;
|
||||||
|
|
||||||
if (0 < te.docFreq()) { /* all docs may be deleted */
|
if (0 < te.docFreq()) { /* all docs may be deleted */
|
||||||
int count = searcher.numDocs(new TermQuery(t),
|
int count = searcher.numDocs(new TermQuery(t),
|
||||||
docs);
|
docs);
|
||||||
|
|
||||||
/* :TODO: is indexedToReadable correct? */
|
|
||||||
if (zeros || 0 < count)
|
if (zeros || 0 < count)
|
||||||
counts.add(new CountPair<String,Integer>
|
counts.add(new CountPair<String,Integer>
|
||||||
(ft.indexedToReadable(t.text()), count));
|
(t.text(), count));
|
||||||
|
|
||||||
}
|
}
|
||||||
} while (te.next());
|
} while (te.next());
|
||||||
|
|
||||||
NamedList res = new NamedList();
|
NamedList res = new NamedList();
|
||||||
for (CountPair<String,Integer> p : counts) {
|
for (CountPair<String,Integer> p : counts) {
|
||||||
res.add(p.key, p.val);
|
res.add(ft.indexedToReadable(p.key), p.val);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (missing) {
|
||||||
|
res.add(null, getFieldMissingCount(searcher,docs,field));
|
||||||
|
}
|
||||||
|
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -20,19 +20,36 @@ import org.apache.solr.search.SolrIndexSearcher;
|
||||||
import org.apache.solr.schema.IndexSchema;
|
import org.apache.solr.schema.IndexSchema;
|
||||||
import org.apache.solr.core.SolrCore;
|
import org.apache.solr.core.SolrCore;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Container for a request to execute a query.
|
* <p>Container for a request to execute a query.</p>
|
||||||
|
* <p><code>SolrQueryRequest</code> is not thread safe.</p>
|
||||||
*
|
*
|
||||||
* @author yonik
|
* @author yonik
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
*/
|
*/
|
||||||
public interface SolrQueryRequest {
|
public interface SolrQueryRequest {
|
||||||
|
|
||||||
|
/** returns the current request parameters */
|
||||||
public SolrParams getParams();
|
public SolrParams getParams();
|
||||||
|
|
||||||
|
/** Change the parameters for this request. This does not affect
|
||||||
|
* the original parameters returned by getOriginalParams()
|
||||||
|
*/
|
||||||
public void setParams(SolrParams params);
|
public void setParams(SolrParams params);
|
||||||
|
|
||||||
|
/** Returns the original request parameters. As this
|
||||||
|
* does not normally include configured defaults
|
||||||
|
* it's more suitable for logging.
|
||||||
|
*/
|
||||||
public SolrParams getOriginalParams();
|
public SolrParams getOriginalParams();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generic information associated with this request that may be both read and updated.
|
||||||
|
*/
|
||||||
|
public Map<Object,Object> getContext();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This method should be called when all uses of this request are
|
* This method should be called when all uses of this request are
|
||||||
* finished, so that resources can be freed.
|
* finished, so that resources can be freed.
|
||||||
|
|
|
@ -57,7 +57,7 @@ public abstract class FieldType extends FieldProperties {
|
||||||
int properties;
|
int properties;
|
||||||
|
|
||||||
/** Returns true if fields of this type should be tokenized */
|
/** Returns true if fields of this type should be tokenized */
|
||||||
protected boolean isTokenized() {
|
public boolean isTokenized() {
|
||||||
return (properties & TOKENIZED) != 0;
|
return (properties & TOKENIZED) != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue