facet.mincount,facet.offset,facet.sort params: SOLR-106

git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@496811 13f79535-47bb-0310-9956-ffa450edef68
Yonik Seeley 2007-01-16 18:26:14 +00:00
parent ebfafc6d63
commit b9e804e032
5 changed files with 328 additions and 95 deletions

CHANGES.txt

@@ -37,17 +37,24 @@ Detailed Change List
New Features
1. SOLR-82: Default field values can be specified in the schema.xml.
(Ryan McKinley via hossman)
2. SOLR-89: Two new TokenFilters with corresponding Factories...
* TrimFilter - Trims leading and trailing whitespace from Tokens
* PatternReplaceFilter - applies a Pattern to each token in the
stream, replacing match occurrences with a specified replacement.
(hossman)
3. SOLR-91: allow configuration of a limit of the number of searchers
that can be warming in the background. This can be used to avoid
out-of-memory errors, or contention caused by more and more searchers
warming in the background. An error is thrown if the limit specified
by maxWarmingSearchers in solrconfig.xml is exceeded. (yonik)
4. SOLR-106: New faceting parameters that allow specification of a
minimum count for returned facets (facet.mincount), paging through facets
(facet.offset, facet.limit), and explicit sorting (facet.sort).
facet.zeros is now deprecated. (yonik)
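As a quick illustration (not part of this change set; the field name "cat" and the /select path are assumptions), a request exercising the new parameters could be assembled like this:

import java.net.URLEncoder;

public class FacetParamsExample {
    public static void main(String[] args) throws Exception {
        // The parameters below are the ones introduced or affected by SOLR-106;
        // the query and field name are placeholders.
        String[][] params = {
            {"q", "id:[* TO *]"},
            {"facet", "true"},
            {"facet.field", "cat"},    // hypothetical field name
            {"facet.mincount", "1"},   // exclude zero counts (replaces facet.zeros=false)
            {"facet.offset", "10"},    // skip the first 10 constraints (paging)
            {"facet.limit", "10"},     // return at most 10 constraints
            {"facet.sort", "true"}     // sort by count; "false" keeps natural index order
        };
        StringBuilder url = new StringBuilder("/select");
        char sep = '?';
        for (String[] p : params) {
            url.append(sep).append(p[0]).append('=').append(URLEncoder.encode(p[1], "UTF-8"));
            sep = '&';
        }
        System.out.println(url);
    }
}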
Changes in runtime behavior
1. Highlighting using DisMax will only pick up terms from the main
user query, not boost or filter queries (klaas).
@@ -58,9 +65,11 @@ Optimizations
Bug Fixes
1. SOLR-87: Parsing of synonym files did not correctly handle escaped
whitespace such as \r\n\t\b\f. (yonik)
2. SOLR-92: DOMUtils.getText (used when parsing config files) did not
work properly with many DOM implementations when dealing with
"Attributes". (Ryan McKinley via hossman)
3. SOLR-9,SOLR-99: Tighten up sort specification error checking, throw
exceptions for missing sort specifications or a sort on a non-indexed
field. (Ryan McKinley via yonik)

SimpleFacets.java

@@ -20,10 +20,12 @@ package org.apache.solr.request;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.*;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrException;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.request.SolrParams;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.FieldType;
@@ -118,20 +120,29 @@ public class SimpleFacets {
public NamedList getTermCounts(String field) throws IOException {
int offset = params.getFieldInt(field, params.FACET_OFFSET, 0);
int limit = params.getFieldInt(field, params.FACET_LIMIT, 100);
boolean zeros = params.getFieldBool(field, params.FACET_ZEROS, true);
Integer mincount = params.getFieldInt(field, params.FACET_MINCOUNT);
if (mincount==null) {
Boolean zeros = params.getFieldBool(field, params.FACET_ZEROS);
// mincount = (zeros!=null && zeros) ? 0 : 1;
mincount = (zeros!=null && !zeros) ? 1 : 0;
// current default is to include zeros.
}
boolean missing = params.getFieldBool(field, params.FACET_MISSING, false);
// default to sorting if there is a limit.
boolean sort = params.getFieldBool(field, params.FACET_SORT, limit>0);
NamedList counts;
SchemaField sf = searcher.getSchema().getField(field);
FieldType ft = sf.getType();
if (sf.multiValued() || ft.isTokenized() || ft instanceof BoolField) {
// Always use filters for booleans... we know the number of values is very small.
counts = getFacetTermEnumCounts(searcher,docs,field,limit,zeros,missing);
counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort);
} else {
// TODO: future logic could use filters instead of the fieldcache if
// the number of terms in the field is small enough.
counts = getFieldCacheCounts(searcher, docs, field, limit, zeros, missing);
counts = getFieldCacheCounts(searcher, docs, field, offset,limit, mincount, missing, sort);
}
return counts;
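The facet.mincount defaulting above, including the facet.zeros back-compatibility mapping, can be modeled in isolation; a minimal sketch, not the committed code:

public class MincountDefaultSketch {
    // Mirrors the rule in getTermCounts: an explicit facet.mincount wins;
    // otherwise the deprecated facet.zeros maps false -> 1 and anything else -> 0,
    // so the current default still includes zero counts.
    static int resolveMincount(Integer mincount, Boolean zeros) {
        if (mincount != null) return mincount;
        return (zeros != null && !zeros) ? 1 : 0;
    }

    public static void main(String[] args) {
        System.out.println(resolveMincount(null, null));   // 0: zero counts still included by default
        System.out.println(resolveMincount(null, false));  // 1: facet.zeros=false now implies mincount=1
        System.out.println(resolveMincount(2, true));      // 2: explicit facet.mincount takes precedence
    }
}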
@@ -177,7 +188,7 @@ public class SimpleFacets {
* Use the Lucene FieldCache to get counts for each unique field value in <code>docs</code>.
* The field must have at most one indexed token per document.
*/
public static NamedList getFieldCacheCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int limit, boolean zeros, boolean missing) throws IOException {
public static NamedList getFieldCacheCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, boolean sort) throws IOException {
// TODO: If the number of terms is high compared to docs.size(), and zeros==false,
// we should use an alternate strategy to avoid
// 1) creating another huge int[] for the counts
@@ -188,7 +199,7 @@ public class SimpleFacets {
//
FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName);
int[] count = new int[si.lookup.length];
final int[] count = new int[si.lookup.length];
DocIterator iter = docs.iterator();
while (iter.hasNext()) {
count[si.order[iter.nextDoc()]]++;
@@ -200,42 +211,51 @@ public class SimpleFacets {
// IDEA: we could also maintain a count of "other"... everything that fell outside
// of the top 'N'
BoundedTreeSet<CountPair<String,Integer>> queue=null;
int off=offset;
int lim=limit>=0 ? limit : Integer.MAX_VALUE;
if (limit>=0) {
if (sort) {
// TODO: compare performance of BoundedTreeSet against a priority queue?
queue = new BoundedTreeSet<CountPair<String,Integer>>(limit);
}
int min=-1; // the smallest value in the top 'N' values
for (int i=1; i<count.length; i++) {
int c = count[i];
if (c==0 && !zeros) continue;
if (limit<0) {
res.add(ft.indexedToReadable(si.lookup[i]), c);
} else if (c>min) {
// NOTE: we use c>min rather than c>=min as an optimization because we are going in
// index order, so we already know that the keys are ordered. This can be very
// important if a lot of the counts are repeated (like zero counts would be).
queue.add(new CountPair<String,Integer>(ft.indexedToReadable(si.lookup[i]), c));
if (queue.size()>=limit) min=queue.last().val;
int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
final BoundedTreeSet<CountPair<String,Integer>> queue = new BoundedTreeSet<CountPair<String,Integer>>(maxsize);
int min=mincount-1; // the smallest value in the top 'N' values
for (int i=1; i<count.length; i++) {
int c = count[i];
if (c>min) {
// NOTE: we use c>min rather than c>=min as an optimization because we are going in
// index order, so we already know that the keys are ordered. This can be very
// important if a lot of the counts are repeated (like zero counts would be).
queue.add(new CountPair<String,Integer>(ft.indexedToReadable(si.lookup[i]), c));
if (queue.size()>=maxsize) min=queue.last().val;
}
}
}
if (limit>=0) {
for (CountPair<String,Integer> p : queue) {
if (--off>=0) continue;
if (--lim<0) break;
res.add(p.key, p.val);
}
} else if (mincount<=0) {
// This is an optimization... if mincount<=0 and we aren't sorting then
// we know exactly where to start and end in the fieldcache.
for (int i=offset+1; i<offset+1+limit; i++) {
res.add(ft.indexedToReadable(si.lookup[i]),count[i]);
}
} else {
for (int i=1; i<count.length; i++) {
int c = count[i];
if (c<mincount || --off>=0) continue;
if (--lim<0) break;
res.add(ft.indexedToReadable(si.lookup[i]), c);
}
}
if (missing) res.add(null, count[0]);
return res;
}
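For readers piecing the new sorted path together from the interleaved old and new lines above, here is a self-contained sketch of the bounded top-N selection with offset, limit and mincount (BoundedTreeSet is approximated with a TreeSet trimmed to offset+limit entries; this is an illustration, not the committed code):

import java.util.*;

public class TopNFacetSketch {
    /** Top terms by count (ties by term), skipping 'offset' entries, returning at most 'limit',
     *  and dropping counts below 'mincount'. */
    static List<Map.Entry<String,Integer>> topCounts(String[] terms, int[] counts,
                                                     int offset, int limit, int mincount) {
        int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
        TreeSet<Map.Entry<String,Integer>> queue = new TreeSet<>((a, b) ->
            a.getValue().equals(b.getValue()) ? a.getKey().compareTo(b.getKey())
                                              : Integer.compare(b.getValue(), a.getValue()));
        int min = mincount - 1;  // smallest count currently worth inserting
        for (int i = 0; i < terms.length; i++) {
            int c = counts[i];
            if (c > min) {       // '>' (not '>=') is safe because terms arrive in index order
                queue.add(new AbstractMap.SimpleEntry<>(terms[i], c));
                if (queue.size() > maxsize) queue.pollLast();           // keep the set bounded
                if (queue.size() >= maxsize) min = queue.last().getValue();
            }
        }
        List<Map.Entry<String,Integer>> res = new ArrayList<>();
        int off = offset, lim = limit >= 0 ? limit : Integer.MAX_VALUE;
        for (Map.Entry<String,Integer> e : queue) {
            if (--off >= 0) continue;   // burn through the offset first
            if (--lim < 0) break;       // then emit at most 'limit' entries
            res.add(e);
        }
        return res;
    }

    public static void main(String[] args) {
        // Same data shape as the test below: G=5, E=3, C=2, A/B/D/F=1
        String[] terms = {"A", "B", "C", "D", "E", "F", "G"};
        int[] counts   = { 1,   1,   2,   1,   3,   1,   5 };
        System.out.println(topCounts(terms, counts, 0, 3, 1));  // [G=5, E=3, C=2]
    }
}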
/**
* Returns a list of terms in the specified field along with the
* corrisponding count of documents in the set that match that constraint.
* corresponding count of documents in the set that match that constraint.
* This method uses the FilterCache to get the intersection count between <code>docs</code>
* and the DocSet for each term in the filter.
*
@@ -243,7 +263,7 @@ public class SimpleFacets {
* @see SolrParams#FACET_ZEROS
* @see SolrParams#FACET_MISSING
*/
public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int limit, boolean zeros, boolean missing)
public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, boolean sort)
throws IOException {
/* :TODO: potential optimization...
@@ -255,13 +275,13 @@ public class SimpleFacets {
IndexReader r = searcher.getReader();
FieldType ft = schema.getFieldType(field);
Set<CountPair<String,Integer>> counts
= new HashSet<CountPair<String,Integer>>();
if (0 <= limit) {
counts = new BoundedTreeSet<CountPair<String,Integer>>(limit);
}
final int maxsize = limit>=0 ? offset+limit : Integer.MAX_VALUE-1;
final BoundedTreeSet<CountPair<String,Integer>> queue = sort ? new BoundedTreeSet<CountPair<String,Integer>>(maxsize) : null;
final NamedList res = new NamedList();
int min=mincount-1; // the smallest value in the top 'N' values
int off=offset;
int lim=limit>=0 ? limit : Integer.MAX_VALUE;
TermEnum te = r.terms(new Term(field,""));
do {
Term t = te.term();
@@ -269,26 +289,37 @@ public class SimpleFacets {
if (null == t || ! t.field().equals(field))
break;
if (0 < te.docFreq()) { /* all docs may be deleted */
int count = searcher.numDocs(new TermQuery(t),
docs);
int df = te.docFreq();
if (zeros || 0 < count)
counts.add(new CountPair<String,Integer>
(t.text(), count));
if (df>0) { /* check df since all docs may be deleted */
int c = searcher.numDocs(new TermQuery(t), docs);
if (sort) {
if (c>min) {
queue.add(new CountPair<String,Integer>(t.text(), c));
if (queue.size()>=maxsize) min=queue.last().val;
}
} else {
if (c >= mincount && --off<0) {
if (--lim<0) break;
res.add(ft.indexedToReadable(t.text()), c);
}
}
}
} while (te.next());
NamedList res = new NamedList();
for (CountPair<String,Integer> p : counts) {
res.add(ft.indexedToReadable(p.key), p.val);
if (sort) {
for (CountPair<String,Integer> p : queue) {
if (--off>=0) continue;
if (--lim<0) break;
res.add(ft.indexedToReadable(p.key), p.val);
}
}
if (missing) {
res.add(null, getFieldMissingCount(searcher,docs,field));
}
return res;
}
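The unsorted branch above streams terms in index order and applies mincount, offset and limit inline; a simplified model of just that bookkeeping, over an already term-ordered map rather than a Lucene TermEnum:

import java.util.*;

public class UnsortedFacetPagingSketch {
    /** Emit term counts in term order, counting only terms with count >= mincount
     *  toward the offset, and stopping after 'limit' entries. */
    static LinkedHashMap<String,Integer> page(SortedMap<String,Integer> termCounts,
                                              int offset, int limit, int mincount) {
        LinkedHashMap<String,Integer> res = new LinkedHashMap<>();
        int off = offset;
        int lim = limit >= 0 ? limit : Integer.MAX_VALUE;
        for (Map.Entry<String,Integer> e : termCounts.entrySet()) {
            int c = e.getValue();
            if (c >= mincount && --off < 0) {  // offset is consumed only by qualifying terms
                if (--lim < 0) break;
                res.put(e.getKey(), c);
            }
        }
        return res;
    }

    public static void main(String[] args) {
        SortedMap<String,Integer> counts = new TreeMap<>();
        counts.put("A", 1); counts.put("B", 1); counts.put("D", 1);
        counts.put("E", 3); counts.put("F", 1); counts.put("G", 5);
        // Matches the "test facet unsorted paging" expectation: offset=3, limit=2 -> E=3, F=1
        System.out.println(page(counts, 3, 2, 1));
    }
}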

SolrParams.java

@@ -83,17 +83,32 @@ public abstract class SolrParams {
* Facet Constraint Counts (multi-value)
*/
public static final String FACET_FIELD = "facet.field";
/**
* The offset into the list of facets.
Can be overridden on a per field basis.
*/
public static final String FACET_OFFSET = "facet.offset";
/**
Numeric option indicating the maximum number of facet field counts
to be included in the response for each field, in descending order of count.
Can be overridden on a per field basis.
*/
public static final String FACET_LIMIT = "facet.limit";
/**
* Numeric option indicating the minimum number of hits before a facet should
be included in the response. Can be overridden on a per field basis.
*/
public static final String FACET_MINCOUNT = "facet.mincount";
/**
* Boolean option indicating whether facet field counts of "0" should
be included in the response. Can be overridden on a per field basis.
*/
public static final String FACET_ZEROS = "facet.zeros";
/**
* Boolean option indicating whether the response should include a
* facet field count for all records which have no value for the
@@ -101,6 +116,11 @@ public abstract class SolrParams {
*/
public static final String FACET_MISSING = "facet.missing";
/**
* Boolean option: true causes facets to be sorted
* by the count, false results in natural index order.
*/
public static final String FACET_SORT = "facet.sort";
/** returns the String value of a param, or null if not set */
public abstract String get(String param);
@@ -166,6 +186,13 @@ public abstract class SolrParams {
String val = get(param);
return val==null ? def : Integer.parseInt(val);
}
/** Returns the Integer value of the field param,
or the value for param, or null if neither is set. */
public Integer getFieldInt(String field, String param) {
String val = getFieldParam(field, param);
return val==null ? null : Integer.parseInt(val);
}
/** Returns the int value of the field param,
or the value for param, or def if neither is set. */
@@ -174,6 +201,7 @@ public abstract class SolrParams {
return val==null ? def : Integer.parseInt(val);
}
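The new getFieldInt overload resolves a per-field parameter before falling back to the global one. A toy sketch of that resolution order, using the f.<field>.<param> prefix convention seen in the tests (e.g. f.trait_s.facet.missing); an illustration, not the SolrParams implementation:

import java.util.*;

public class FieldParamLookupSketch {
    /** Look up "f.<field>.<param>" first, then fall back to the plain "<param>"; null if neither is set. */
    static Integer getFieldInt(Map<String,String> params, String field, String param) {
        String val = params.get("f." + field + "." + param);
        if (val == null) val = params.get(param);
        return val == null ? null : Integer.valueOf(val);
    }

    public static void main(String[] args) {
        Map<String,String> p = new HashMap<>();
        p.put("facet.limit", "100");
        p.put("f.cat.facet.limit", "5");  // hypothetical per-field override for field "cat"
        System.out.println(getFieldInt(p, "cat",   "facet.limit"));   // 5    (per-field value wins)
        System.out.println(getFieldInt(p, "price", "facet.limit"));   // 100  (global value)
        System.out.println(getFieldInt(p, "cat",   "facet.offset"));  // null (neither is set)
    }
}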
/** Returns the Float value of the param, or null if not set */
public Float getFloat(String param) {
String val = get(param);

BasicFunctionalityTest.java

@@ -240,7 +240,7 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
);
}
/** @see TestRemoveDuplicatesTokenFilter */
/** @see org.apache.solr.analysis.TestRemoveDuplicatesTokenFilter */
public void testRemoveDuplicatesTokenFilter() {
Query q = QueryParsing.parseQuery("TV", "dedup",
h.getCore().getSchema());
@@ -508,76 +508,240 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
,"//lst[@name='trait_s']/int[@name='Chauvinist'][.='1']"
,"//lst[@name='trait_s']/int[not(@name)][.='1']"
);
assertQ("check counts with facet.mincount=1&facet.missing=true using fq",
req("q", "id:[42 TO 47]"
,"facet", "true"
,"facet.mincount", "1"
,"f.trait_s.facet.missing", "true"
,"fq", "id:[42 TO 45]"
,"facet.field", "trait_s"
)
,"*[count(//doc)=4]"
,"*[count(//lst[@name='trait_s']/int)=4]"
,"//lst[@name='trait_s']/int[@name='Tool'][.='2']"
,"//lst[@name='trait_s']/int[@name='Obnoxious'][.='1']"
,"//lst[@name='trait_s']/int[@name='Chauvinist'][.='1']"
,"//lst[@name='trait_s']/int[not(@name)][.='1']"
);
assertQ("check counts with facet.mincount=2&facet.missing=true using fq",
req("q", "id:[42 TO 47]"
,"facet", "true"
,"facet.mincount", "2"
,"f.trait_s.facet.missing", "true"
,"fq", "id:[42 TO 45]"
,"facet.field", "trait_s"
)
,"*[count(//doc)=4]"
,"*[count(//lst[@name='trait_s']/int)=2]"
,"//lst[@name='trait_s']/int[@name='Tool'][.='2']"
,"//lst[@name='trait_s']/int[not(@name)][.='1']"
);
assertQ("check sorted paging",
req("q", "id:[42 TO 47]"
,"facet", "true"
,"fq", "id:[42 TO 45]"
,"facet.field", "trait_s"
,"facet.mincount","0"
,"facet.offset","0"
,"facet.limit","4"
)
,"*[count(//lst[@name='trait_s']/int)=4]"
,"//lst[@name='trait_s']/int[@name='Tool'][.='2']"
,"//lst[@name='trait_s']/int[@name='Obnoxious'][.='1']"
,"//lst[@name='trait_s']/int[@name='Chauvinist'][.='1']"
,"//lst[@name='trait_s']/int[@name='Pig'][.='0']"
);
assertQ("check sorted paging",
req("q", "id:[42 TO 47]"
,"facet", "true"
,"fq", "id:[42 TO 45]"
,"facet.field", "trait_s"
,"facet.mincount","0"
,"facet.offset","0"
,"facet.limit","3"
,"sort","true"
)
,"*[count(//lst[@name='trait_s']/int)=3]"
,"//lst[@name='trait_s']/int[@name='Tool'][.='2']"
,"//lst[@name='trait_s']/int[@name='Obnoxious'][.='1']"
,"//lst[@name='trait_s']/int[@name='Chauvinist'][.='1']"
);
}
public void testSimpleFacetCountsWithLimits() {
assertU(adoc("id", "1", "t_s", "A"));
assertU(adoc("id", "2", "t_s", "B"));
assertU(adoc("id", "3", "t_s", "C"));
assertU(adoc("id", "4", "t_s", "C"));
assertU(adoc("id", "5", "t_s", "D"));
assertU(adoc("id", "6", "t_s", "E"));
assertU(adoc("id", "7", "t_s", "E"));
assertU(adoc("id", "8", "t_s", "E"));
assertU(adoc("id", "9", "t_s", "F"));
assertU(adoc("id", "10", "t_s", "G"));
assertU(adoc("id", "11", "t_s", "G"));
assertU(adoc("id", "12", "t_s", "G"));
assertU(adoc("id", "13", "t_s", "G"));
assertU(adoc("id", "14", "t_s", "G"));
public void testFacetMultiValued() {
doSimpleFacetCountsWithLimits("t_s");
}
public void testFacetSingleValued() {
doSimpleFacetCountsWithLimits("t_s1");
}
public void doSimpleFacetCountsWithLimits(String f) {
String pre = "//lst[@name='"+f+"']";
String notc = "id:[* TO *] -"+f+":C";
assertU(adoc("id", "1", f, "A"));
assertU(adoc("id", "2", f, "B"));
assertU(adoc("id", "3", f, "C"));
assertU(adoc("id", "4", f, "C"));
assertU(adoc("id", "5", f, "D"));
assertU(adoc("id", "6", f, "E"));
assertU(adoc("id", "7", f, "E"));
assertU(adoc("id", "8", f, "E"));
assertU(adoc("id", "9", f, "F"));
assertU(adoc("id", "10", f, "G"));
assertU(adoc("id", "11", f, "G"));
assertU(adoc("id", "12", f, "G"));
assertU(adoc("id", "13", f, "G"));
assertU(adoc("id", "14", f, "G"));
assertU(commit());
assertQ("check counts for unlimited facet",
req("q", "id:[* TO *]"
,"facet", "true"
,"facet.field", "t_s"
,"facet.field", f
)
,"*[count(//lst[@name='facet_fields']/lst[@name='t_s']/int)=7]"
,"//lst[@name='t_s']/int[@name='G'][.='5']"
,"//lst[@name='t_s']/int[@name='E'][.='3']"
,"//lst[@name='t_s']/int[@name='C'][.='2']"
,"//lst[@name='t_s']/int[@name='A'][.='1']"
,"//lst[@name='t_s']/int[@name='B'][.='1']"
,"//lst[@name='t_s']/int[@name='D'][.='1']"
,"//lst[@name='t_s']/int[@name='F'][.='1']"
,"*[count(//lst[@name='facet_fields']/lst/int)=7]"
,pre+"/int[@name='G'][.='5']"
,pre+"/int[@name='E'][.='3']"
,pre+"/int[@name='C'][.='2']"
,pre+"/int[@name='A'][.='1']"
,pre+"/int[@name='B'][.='1']"
,pre+"/int[@name='D'][.='1']"
,pre+"/int[@name='F'][.='1']"
);
assertQ("check counts for facet with generous limit",
req("q", "id:[* TO *]"
,"facet", "true"
,"facet.limit", "100"
,"facet.field", "t_s"
,"facet.field", f
)
,"*[count(//lst[@name='facet_fields']/lst[@name='t_s']/int)=7]"
,"//lst[@name='t_s']/int[1][@name='G'][.='5']"
,"//lst[@name='t_s']/int[2][@name='E'][.='3']"
,"//lst[@name='t_s']/int[3][@name='C'][.='2']"
,"//lst[@name='t_s']/int[@name='A'][.='1']"
,"//lst[@name='t_s']/int[@name='B'][.='1']"
,"//lst[@name='t_s']/int[@name='D'][.='1']"
,"//lst[@name='t_s']/int[@name='F'][.='1']"
,"*[count(//lst[@name='facet_fields']/lst/int)=7]"
,pre+"/int[1][@name='G'][.='5']"
,pre+"/int[2][@name='E'][.='3']"
,pre+"/int[3][@name='C'][.='2']"
,pre+"/int[@name='A'][.='1']"
,pre+"/int[@name='B'][.='1']"
,pre+"/int[@name='D'][.='1']"
,pre+"/int[@name='F'][.='1']"
);
assertQ("check counts for limited facet",
req("q", "id:[* TO *]"
,"facet", "true"
,"facet.limit", "2"
,"facet.field", "t_s"
,"facet.field", f
)
,"*[count(//lst[@name='facet_fields']/lst[@name='t_s']/int)=2]"
,"//lst[@name='t_s']/int[1][@name='G'][.='5']"
,"//lst[@name='t_s']/int[2][@name='E'][.='3']"
);
}
,"*[count(//lst[@name='facet_fields']/lst/int)=2]"
,pre+"/int[1][@name='G'][.='5']"
,pre+"/int[2][@name='E'][.='3']"
);
assertQ("check offset",
req("q", "id:[* TO *]"
,"facet", "true"
,"facet.offset", "1"
,"facet.limit", "1"
,"facet.field", f
)
,"*[count(//lst[@name='facet_fields']/lst/int)=1]"
,pre+"/int[1][@name='E'][.='3']"
);
assertQ("test sorted facet paging with zero (don't count in limit)",
req("q", "id:[* TO *]"
,"fq",notc
,"facet", "true"
,"facet.field", f
,"facet.mincount","1"
,"facet.offset","0"
,"facet.limit","6"
)
,"*[count(//lst[@name='facet_fields']/lst/int)=6]"
,pre+"/int[1][@name='G'][.='5']"
,pre+"/int[2][@name='E'][.='3']"
,pre+"/int[3][@name='A'][.='1']"
,pre+"/int[4][@name='B'][.='1']"
,pre+"/int[5][@name='D'][.='1']"
,pre+"/int[6][@name='F'][.='1']"
);
assertQ("test sorted facet paging with zero (test offset correctness)",
req("q", "id:[* TO *]"
,"fq",notc
,"facet", "true"
,"facet.field", f
,"facet.mincount","1"
,"facet.offset","3"
,"facet.limit","2"
,"facet.sort","true"
)
,"*[count(//lst[@name='facet_fields']/lst/int)=2]"
,pre+"/int[1][@name='B'][.='1']"
,pre+"/int[2][@name='D'][.='1']"
);
assertQ("test facet unsorted paging",
req("q", "id:[* TO *]"
,"fq",notc
,"facet", "true"
,"facet.field", f
,"facet.mincount","1"
,"facet.offset","0"
,"facet.limit","6"
,"facet.sort","false"
)
,"*[count(//lst[@name='facet_fields']/lst/int)=6]"
,pre+"/int[1][@name='A'][.='1']"
,pre+"/int[2][@name='B'][.='1']"
,pre+"/int[3][@name='D'][.='1']"
,pre+"/int[4][@name='E'][.='3']"
,pre+"/int[5][@name='F'][.='1']"
,pre+"/int[6][@name='G'][.='5']"
);
assertQ("test facet unsorted paging",
req("q", "id:[* TO *]"
,"fq",notc
,"facet", "true"
,"facet.field", f
,"facet.mincount","1"
,"facet.offset","3"
,"facet.limit","2"
,"facet.sort","false"
)
,"*[count(//lst[@name='facet_fields']/lst/int)=2]"
,pre+"/int[1][@name='E'][.='3']"
,pre+"/int[2][@name='F'][.='1']"
);
assertQ("test facet unsorted paging, mincount=2",
req("q", "id:[* TO *]"
,"fq",notc
,"facet", "true"
,"facet.field", f
,"facet.mincount","2"
,"facet.offset","1"
,"facet.limit","2"
,"facet.sort","false"
)
,"*[count(//lst[@name='facet_fields']/lst/int)=1]"
,pre+"/int[1][@name='G'][.='5']"
);
}
private String mkstr(int len) {
StringBuilder sb = new StringBuilder(len);

schema.xml

@@ -386,6 +386,7 @@
-->
<dynamicField name="*_i" type="sint" indexed="true" stored="true"/>
<dynamicField name="*_s" type="string" indexed="true" stored="true"/>
<dynamicField name="*_s1" type="string" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="*_l" type="slong" indexed="true" stored="true"/>
<dynamicField name="*_t" type="text" indexed="true" stored="true"/>
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>