facet.prefix: SOLR-117

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@499834 13f79535-47bb-0310-9956-ffa450edef68
Yonik Seeley 2007-01-25 16:32:01 +00:00
parent f027d99c6f
commit 8cf4a87acb
4 changed files with 376 additions and 56 deletions

View File: CHANGES.txt

@ -54,6 +54,9 @@ New Features
    starting with the smallest positive set, subtracting all negative
    sets, then intersecting with all other positive sets. (yonik)

+6. SOLR-117: Limit field faceting to constraints with a prefix specified
+   by facet.prefix or f.<field>.facet.prefix. (yonik)

 Changes in runtime behavior

 1. Highlighting using DisMax will only pick up terms from the main
    user query, not boost or filter queries (klaas).

View File: SimpleFacets.java

@ -20,12 +20,10 @@ package org.apache.solr.request;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.*;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrException;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.request.SolrParams;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.FieldType;
@ -36,7 +34,8 @@ import org.apache.solr.util.NamedList;
import org.apache.solr.util.BoundedTreeSet;
import java.io.IOException;
-import java.util.*;
+import java.util.Arrays;
+import java.util.Comparator;
/**
* A class that generates simple Facet information for a request.
@ -132,17 +131,18 @@ public class SimpleFacets {
     boolean missing = params.getFieldBool(field, params.FACET_MISSING, false);
     // default to sorting if there is a limit.
     boolean sort = params.getFieldBool(field, params.FACET_SORT, limit>0);
+    String prefix = params.getFieldParam(field, params.FACET_PREFIX);

     NamedList counts;
     SchemaField sf = searcher.getSchema().getField(field);
     FieldType ft = sf.getType();
     if (sf.multiValued() || ft.isTokenized() || ft instanceof BoolField) {
       // Always use filters for booleans... we know the number of values is very small.
-      counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount, missing, sort);
+      counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount, missing, sort, prefix);
     } else {
       // TODO: future logic could use filters instead of the fieldcache if
       // the number of terms in the field is small enough.
-      counts = getFieldCacheCounts(searcher, docs, field, offset, limit, mincount, missing, sort);
+      counts = getFieldCacheCounts(searcher, docs, field, offset, limit, mincount, missing, sort, prefix);
     }

     return counts;
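As a usage sketch for the new parameter (hypothetical host, core, and field names — not part of this patch), the prefix combines with the existing facet params like so:

    http://localhost:8983/solr/select?q=*:*&facet=true&facet.field=cat&facet.mincount=1&facet.prefix=elec

The per-field form f.cat.facet.prefix=elec would override the plain facet.prefix for the cat field only, via the params.getFieldParam lookup above.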
@ -184,11 +184,21 @@ public class SimpleFacets {
return docs.andNotSize(hasVal);
}
+  // first element of the fieldcache is null, so we need this comparator.
+  private static final Comparator nullStrComparator = new Comparator() {
+    public int compare(Object o1, Object o2) {
+      if (o1==null) return (o2==null) ? 0 : -1;
+      else if (o2==null) return 1;
+      return ((String)o1).compareTo((String)o2);
+    }
+  };
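Why the null-tolerant comparator is needed: FieldCache.StringIndex reserves lookup[0] as null for documents with no value, so Arrays.binarySearch with the natural ordering would hit a NullPointerException when it probes that slot. A minimal standalone sketch (hypothetical class and data, written in the 1.5-era style of the surrounding code) of the same start/end computation used a few lines below:

    import java.util.Arrays;
    import java.util.Comparator;

    public class PrefixRangeDemo {
      // same ordering as nullStrComparator: null sorts before every string
      static final Comparator NULL_FIRST = new Comparator() {
        public int compare(Object o1, Object o2) {
          if (o1==null) return (o2==null) ? 0 : -1;
          else if (o2==null) return 1;
          return ((String)o1).compareTo((String)o2);
        }
      };

      public static void main(String[] args) {
        // index 0 is null, mirroring FieldCache.StringIndex.lookup
        String[] terms = {null, "AAA", "B", "BB", "BBB", "CC", "CCC"};
        String prefix = "B";

        int start = Arrays.binarySearch(terms, prefix, NULL_FIRST);
        if (start < 0) start = -start - 1;  // no exact match: use the insertion point

        // "\uffff" can't occur in a legal term, so this search never finds an
        // exact match and always yields the insertion point past the last "B*" term.
        int end = Arrays.binarySearch(terms, prefix + "\uffff\uffff\uffff\uffff", NULL_FIRST);
        end = -end - 1;

        for (int i = start; i < end; i++) {
          System.out.println(terms[i]);  // prints B, BB, BBB
        }
      }
    }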
/**
* Use the Lucene FieldCache to get counts for each unique field value in <code>docs</code>.
* The field must have at most one indexed token per document.
*/
-  public static NamedList getFieldCacheCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, boolean sort) throws IOException {
+  public static NamedList getFieldCacheCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, boolean sort, String prefix) throws IOException {
// TODO: If the number of terms is high compared to docs.size(), and zeros==false,
// we should use an alternate strategy to avoid
// 1) creating another huge int[] for the counts
@ -198,58 +208,97 @@ public class SimpleFacets {
 // then use them instead of the FieldCache.
 //
-    FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName);
-    final int[] count = new int[si.lookup.length];
-    DocIterator iter = docs.iterator();
-    while (iter.hasNext()) {
-      count[si.order[iter.nextDoc()]]++;
-    }
+    // TODO: this function is too big and could use some refactoring, but
+    // we also need a facet cache, and refactoring of SimpleFacets instead of
+    // trying to pass all the various params around.

     FieldType ft = searcher.getSchema().getFieldType(fieldName);
     NamedList res = new NamedList();

-    // IDEA: we could also maintain a count of "other"... everything that fell outside
-    // of the top 'N'
+    FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName);
+    final String[] terms = si.lookup;
+    final int[] termNum = si.order;

-    int off=offset;
-    int lim=limit>=0 ? limit : Integer.MAX_VALUE;
+    if (prefix!=null && prefix.length()==0) prefix=null;

-    if (sort) {
-      // TODO: compare performance of BoundedTreeSet compare against priority queue?
-      int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
-      final BoundedTreeSet<CountPair<String,Integer>> queue = new BoundedTreeSet<CountPair<String,Integer>>(maxsize);
-      int min=mincount-1; // the smallest value in the top 'N' values
-      for (int i=1; i<count.length; i++) {
-        int c = count[i];
-        if (c>min) {
-          // NOTE: we use c>min rather than c>=min as an optimization because we are going in
-          // index order, so we already know that the keys are ordered. This can be very
-          // important if a lot of the counts are repeated (like zero counts would be).
-          queue.add(new CountPair<String,Integer>(ft.indexedToReadable(si.lookup[i]), c));
-          if (queue.size()>=maxsize) min=queue.last().val;
-        }
-      }
-      for (CountPair<String,Integer> p : queue) {
-        if (--off>=0) continue;
-        if (--lim<0) break;
-        res.add(p.key, p.val);
-      }
-    } else if (mincount<=0) {
-      // This is an optimization... if mincount<=0 and we aren't sorting then
-      // we know exactly where to start and end in the fieldcache.
-      for (int i=offset+1; i<offset+1+limit; i++) {
-        res.add(ft.indexedToReadable(si.lookup[i]),count[i]);
-      }
-    } else {
-      for (int i=1; i<count.length; i++) {
-        int c = count[i];
-        if (c<mincount || --off>=0) continue;
-        if (--lim<0) break;
-        res.add(ft.indexedToReadable(si.lookup[i]), c);
-      }
-    }
+    int startTermIndex, endTermIndex;
+    if (prefix!=null) {
+      startTermIndex = Arrays.binarySearch(terms,prefix,nullStrComparator);
+      if (startTermIndex<0) startTermIndex=-startTermIndex-1;
+      // find the end term.  \uffff isn't a legal unicode char, but only compareTo
+      // is used, so it should be fine, and is guaranteed to be bigger than legal chars.
+      endTermIndex = Arrays.binarySearch(terms,prefix+"\uffff\uffff\uffff\uffff",nullStrComparator);
+      endTermIndex = -endTermIndex-1;
+    } else {
+      startTermIndex=1;
+      endTermIndex=terms.length;
+    }
+
+    final int nTerms=endTermIndex-startTermIndex;
+
+    if (nTerms>0) {
+      // count collection array only needs to be as big as the number of terms we are
+      // going to collect counts for.
+      final int[] counts = new int[nTerms];
+
+      DocIterator iter = docs.iterator();
+      while (iter.hasNext()) {
+        int term = termNum[iter.nextDoc()];
+        int arrIdx = term-startTermIndex;
+        if (arrIdx>=0 && arrIdx<nTerms) counts[arrIdx]++;
+      }
+
+      // IDEA: we could also maintain a count of "other"... everything that fell outside
+      // of the top 'N'
+
+      int off=offset;
+      int lim=limit>=0 ? limit : Integer.MAX_VALUE;
+
+      if (sort) {
+        int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
+        maxsize = Math.min(maxsize, nTerms);
+        final BoundedTreeSet<CountPair<String,Integer>> queue = new BoundedTreeSet<CountPair<String,Integer>>(maxsize);
+        int min=mincount-1;  // the smallest value in the top 'N' values
+        for (int i=0; i<nTerms; i++) {
+          int c = counts[i];
+          if (c>min) {
+            // NOTE: we use c>min rather than c>=min as an optimization because we are going in
+            // index order, so we already know that the keys are ordered.  This can be very
+            // important if a lot of the counts are repeated (like zero counts would be).
+            queue.add(new CountPair<String,Integer>(terms[startTermIndex+i], c));
+            if (queue.size()>=maxsize) min=queue.last().val;
+          }
+        }
+        // now select the right page from the results
+        for (CountPair<String,Integer> p : queue) {
+          if (--off>=0) continue;
+          if (--lim<0) break;
+          res.add(ft.indexedToReadable(p.key), p.val);
+        }
+      } else {
+        // add results in index order
+        int i=0;
+        if (mincount<=0) {
+          // if mincount<=0, then we won't discard any terms and we know exactly
+          // where to start.
+          i=off;
+          off=0;
+        }
+        for (; i<nTerms; i++) {
+          int c = counts[i];
+          if (c<mincount || --off>=0) continue;
+          if (--lim<0) break;
+          res.add(ft.indexedToReadable(terms[startTermIndex+i]), c);
+        }
+      }
+    }

-    if (missing) res.add(null, count[0]);
+    if (missing) {
+      res.add(null, getFieldMissingCount(searcher,docs,fieldName));
+    }

     return res;
   }
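To see the new counting path end to end, here is a self-contained simplification (hypothetical class name and data mirroring the test documents added below; no Solr/Lucene types). Each document is reduced to the ordinal of its single term, and only ordinals inside the [startTermIndex, endTermIndex) prefix window are counted:

    public class FieldCacheCountsSketch {
      public static void main(String[] args) {
        // lookup[ord] = term; ord 0 is reserved for "no value", as in StringIndex
        String[] lookup = {null, "AAA", "B", "BB", "BBB", "CC", "CCC"};
        // order[doc] = ordinal of that doc's term (12 docs, as in the new tests)
        int[] order = {1, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6};

        int startTermIndex = 2, endTermIndex = 5;  // window for prefix "B"
        int nTerms = endTermIndex - startTermIndex;

        // the counts array only spans the prefix window, as in the patch
        int[] counts = new int[nTerms];
        for (int doc = 0; doc < order.length; doc++) {
          int arrIdx = order[doc] - startTermIndex;
          if (arrIdx >= 0 && arrIdx < nTerms) counts[arrIdx]++;
        }

        for (int i = 0; i < nTerms; i++) {
          // prints B => 1, BB => 2, BBB => 3, matching the test expectations
          System.out.println(lookup[startTermIndex + i] + " => " + counts[i]);
        }
      }
    }

Keeping the counts array sized to the window, rather than one slot per unique term, is what makes a narrow facet.prefix cheap on high-cardinality fields.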
@ -263,7 +312,7 @@ public class SimpleFacets {
* @see SolrParams#FACET_ZEROS
* @see SolrParams#FACET_MISSING
*/
-  public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, boolean sort)
+  public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, boolean sort, String prefix)
throws IOException {
/* :TODO: potential optimization...
@ -282,13 +331,17 @@ public class SimpleFacets {
     int min=mincount-1;  // the smallest value in the top 'N' values
     int off=offset;
     int lim=limit>=0 ? limit : Integer.MAX_VALUE;

-    TermEnum te = r.terms(new Term(field,""));
+    String startTerm = prefix==null ? "" : ft.toInternal(prefix);
+    TermEnum te = r.terms(new Term(field,startTerm));
     do {
       Term t = te.term();
       if (null == t || ! t.field().equals(field))
         break;
+      if (prefix!=null && !t.text().startsWith(prefix)) break;
       int df = te.docFreq();
       if (df>0) { /* check df since all docs may be deleted */
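The same seek-then-break idiom, pulled out as a minimal helper against the Lucene 2.x TermEnum API (a sketch only, assuming an already-open IndexReader and a prefix already in indexed form; countPrefixTerms is not part of the patch, and it uses the Term/TermEnum/IndexReader imports already at the top of this file):

    // Count the terms in 'field' starting with 'prefix' that still have documents.
    // TermEnum is ordered by field then text, so we can seek straight to the
    // prefix and stop at the first term that no longer matches.
    static int countPrefixTerms(IndexReader r, String field, String prefix) throws IOException {
      int n = 0;
      TermEnum te = r.terms(new Term(field, prefix));
      try {
        do {
          Term t = te.term();
          if (t == null || !t.field().equals(field)) break;  // ran off the field
          if (!t.text().startsWith(prefix)) break;           // past the prefix range
          if (te.docFreq() > 0) n++;                         // skip all-deleted terms
        } while (te.next());
      } finally {
        te.close();
      }
      return n;
    }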

View File: SolrParams.java

@ -122,6 +122,11 @@ public abstract class SolrParams {
*/
public static final String FACET_SORT = "facet.sort";
+  /**
+   * Only return constraints of a facet field with the given prefix.
+   */
+  public static final String FACET_PREFIX = "facet.prefix";
/** returns the String value of a param, or null if not set */
public abstract String get(String param);

View File: BasicFunctionalityTest.java

@ -25,14 +25,12 @@ import org.apache.solr.search.*;
import org.apache.solr.request.*;
import org.apache.solr.util.*;
import org.apache.solr.schema.*;
import org.w3c.dom.Document;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import java.io.IOException;
import java.io.StringWriter;
import java.io.ByteArrayInputStream;
import java.io.UnsupportedEncodingException;
import java.util.Map;
import java.util.HashMap;
@ -575,14 +573,14 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
   public void testFacetMultiValued() {
-    doSimpleFacetCountsWithLimits("t_s");
+    doFacets("t_s");
   }

   public void testFacetSingleValued() {
-    doSimpleFacetCountsWithLimits("t_s1");
+    doFacets("t_s1");
   }

-  public void doSimpleFacetCountsWithLimits(String f) {
+  public void doFacets(String f) {
String pre = "//lst[@name='"+f+"']";
String notc = "id:[* TO *] -"+f+":C";
@ -742,7 +740,268 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
,pre+"/int[1][@name='G'][.='5']"
);
}
public void testFacetPrefixMultiValued() {
doFacetPrefix("t_s");
}
public void testFacetPrefixSingleValued() {
doFacetPrefix("t_s1");
}
public void doFacetPrefix(String f) {
String indent="on";
String pre = "//lst[@name='"+f+"']";
String notc = "id:[* TO *] -"+f+":C";
assertU(adoc("id", "1", f, "AAA"));
assertU(adoc("id", "2", f, "B"));
assertU(adoc("id", "3", f, "BB"));
assertU(adoc("id", "4", f, "BB"));
assertU(adoc("id", "5", f, "BBB"));
assertU(adoc("id", "6", f, "BBB"));
assertU(adoc("id", "7", f, "BBB"));
assertU(adoc("id", "8", f, "CC"));
assertU(adoc("id", "9", f, "CC"));
assertU(adoc("id", "10", f, "CCC"));
assertU(adoc("id", "11", f, "CCC"));
assertU(adoc("id", "12", f, "CCC"));
assertU(commit());
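For orientation, the facet_fields fragment that the first assertion below matches would look roughly like this (abridged; element order per facet.sort=true, counts descending):

    <lst name="facet_fields">
      <lst name="t_s">
        <int name="BBB">3</int>
        <int name="BB">2</int>
        <int name="B">1</int>
      </lst>
    </lst>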
assertQ("test facet.prefix middle, exact match first term",
req("q", "id:[* TO *]"
,"indent",indent
,"facet","true"
,"facet.field", f
,"facet.mincount","0"
,"facet.offset","0"
,"facet.limit","100"
,"facet.sort","true"
,"facet.prefix","B"
)
,"*[count(//lst[@name='facet_fields']/lst/int)=3]"
,pre+"/int[1][@name='BBB'][.='3']"
,pre+"/int[2][@name='BB'][.='2']"
,pre+"/int[3][@name='B'][.='1']"
);
assertQ("test facet.prefix middle, exact match first term, unsorted",
req("q", "id:[* TO *]"
,"indent",indent
,"facet","true"
,"facet.field", f
,"facet.mincount","0"
,"facet.offset","0"
,"facet.limit","100"
,"facet.sort","false"
,"facet.prefix","B"
)
,"*[count(//lst[@name='facet_fields']/lst/int)=3]"
,pre+"/int[1][@name='B'][.='1']"
,pre+"/int[2][@name='BB'][.='2']"
,pre+"/int[3][@name='BBB'][.='3']"
);
assertQ("test facet.prefix middle, paging",
req("q", "id:[* TO *]"
,"indent",indent
,"facet","true"
,"facet.field", f
,"facet.mincount","0"
,"facet.offset","1"
,"facet.limit","100"
,"facet.sort","true"
,"facet.prefix","B"
)
,"*[count(//lst[@name='facet_fields']/lst/int)=2]"
,pre+"/int[1][@name='BB'][.='2']"
,pre+"/int[2][@name='B'][.='1']"
);
assertQ("test facet.prefix middle, paging, limit 1",
req("q", "id:[* TO *]"
,"indent",indent
,"facet","true"
,"facet.field", f
,"facet.mincount","0"
,"facet.offset","1"
,"facet.limit","1"
,"facet.sort","true"
,"facet.prefix","B"
)
,"*[count(//lst[@name='facet_fields']/lst/int)=1]"
,pre+"/int[1][@name='BB'][.='2']"
);
assertQ("test facet.prefix end, not exact match",
req("q", "id:[* TO *]"
,"indent",indent
,"facet","true"
,"facet.field", f
,"facet.mincount","0"
,"facet.offset","0"
,"facet.limit","100"
,"facet.sort","true"
,"facet.prefix","C"
)
,"*[count(//lst[@name='facet_fields']/lst/int)=2]"
,pre+"/int[1][@name='CCC'][.='3']"
,pre+"/int[2][@name='CC'][.='2']"
);
assertQ("test facet.prefix end, exact match",
req("q", "id:[* TO *]"
,"indent",indent
,"facet","true"
,"facet.field", f
,"facet.mincount","0"
,"facet.offset","0"
,"facet.limit","100"
,"facet.sort","true"
,"facet.prefix","CC"
)
,"*[count(//lst[@name='facet_fields']/lst/int)=2]"
,pre+"/int[1][@name='CCC'][.='3']"
,pre+"/int[2][@name='CC'][.='2']"
);
assertQ("test facet.prefix past end",
req("q", "id:[* TO *]"
,"indent",indent
,"facet","true"
,"facet.field", f
,"facet.mincount","0"
,"facet.offset","0"
,"facet.limit","100"
,"facet.sort","true"
,"facet.prefix","X"
)
,"*[count(//lst[@name='facet_fields']/lst/int)=0]"
);
assertQ("test facet.prefix past end, offset and negative limit",
req("q", "id:[* TO *]"
,"indent",indent
,"facet","true"
,"facet.field", f
,"facet.mincount","0"
,"facet.offset","1"
,"facet.limit","-1"
,"facet.sort","true"
,"facet.prefix","X"
)
,"*[count(//lst[@name='facet_fields']/lst/int)=0]"
);
assertQ("test facet.prefix at start, exact match",
req("q", "id:[* TO *]"
,"indent",indent
,"facet","true"
,"facet.field", f
,"facet.mincount","0"
,"facet.offset","0"
,"facet.limit","100"
,"facet.sort","true"
,"facet.prefix","AAA"
)
,"*[count(//lst[@name='facet_fields']/lst/int)=1]"
,pre+"/int[1][@name='AAA'][.='1']"
);
assertQ("test facet.prefix at start, not exact match",
req("q", "id:[* TO *]"
,"indent",indent
,"facet","true"
,"facet.field", f
,"facet.mincount","0"
,"facet.offset","0"
,"facet.limit","100"
,"facet.sort","true"
,"facet.prefix","AA"
)
,"*[count(//lst[@name='facet_fields']/lst/int)=1]"
,pre+"/int[1][@name='AAA'][.='1']"
);
assertQ("test facet.prefix before start",
req("q", "id:[* TO *]"
,"indent",indent
,"facet","true"
,"facet.field", f
,"facet.mincount","0"
,"facet.offset","0"
,"facet.limit","100"
,"facet.sort","true"
,"facet.prefix","999"
)
,"*[count(//lst[@name='facet_fields']/lst/int)=0]"
);
assertQ("test facet.prefix before start, nonzero offset",
req("q", "id:[* TO *]"
,"indent",indent
,"facet","true"
,"facet.field", f
,"facet.mincount","0"
,"facet.offset","2"
,"facet.limit","100"
,"facet.sort","true"
,"facet.prefix","999"
)
,"*[count(//lst[@name='facet_fields']/lst/int)=0]"
);
}
private String mkstr(int len) {
StringBuilder sb = new StringBuilder(len);
for (int i = 0; i < len; i++) {