From 8cf4a87acbc9e08697c28761c23d35e19d9f61c8 Mon Sep 17 00:00:00 2001 From: Yonik Seeley Date: Thu, 25 Jan 2007 16:32:01 +0000 Subject: [PATCH] facet.prefix: SOLR-117 git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@499834 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 3 + .../org/apache/solr/request/SimpleFacets.java | 153 ++++++---- .../org/apache/solr/request/SolrParams.java | 5 + .../apache/solr/BasicFunctionalityTest.java | 271 +++++++++++++++++- 4 files changed, 376 insertions(+), 56 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index a13715398df..b6c3fcdb30d 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -54,6 +54,9 @@ New Features starting with the smallest positive set, subtracting all negative sets, then intersecting with all other positive sets. (yonik) + 6. SOLR-117: Limit a field faceting to constraints with a prefix specified + by facet.prefix or f..facet.prefix. (yonik) + Changes in runtime behavior 1. Highlighting using DisMax will only pick up terms from the main user query, not boost or filter queries (klaas). diff --git a/src/java/org/apache/solr/request/SimpleFacets.java b/src/java/org/apache/solr/request/SimpleFacets.java index 4944b8b4edd..8f3947e6351 100644 --- a/src/java/org/apache/solr/request/SimpleFacets.java +++ b/src/java/org/apache/solr/request/SimpleFacets.java @@ -20,12 +20,10 @@ package org.apache.solr.request; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.TermDocs; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.search.*; import org.apache.solr.core.SolrCore; import org.apache.solr.core.SolrException; -import org.apache.solr.core.SolrConfig; import org.apache.solr.request.SolrParams; import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.FieldType; @@ -36,7 +34,8 @@ import org.apache.solr.util.NamedList; import org.apache.solr.util.BoundedTreeSet; import java.io.IOException; -import java.util.*; +import java.util.Arrays; +import java.util.Comparator; /** * A class that generates simple Facet information for a request. @@ -132,17 +131,18 @@ public class SimpleFacets { boolean missing = params.getFieldBool(field, params.FACET_MISSING, false); // default to sorting if there is a limit. boolean sort = params.getFieldBool(field, params.FACET_SORT, limit>0); + String prefix = params.getFieldParam(field,params.FACET_PREFIX); NamedList counts; SchemaField sf = searcher.getSchema().getField(field); FieldType ft = sf.getType(); if (sf.multiValued() || ft.isTokenized() || ft instanceof BoolField) { // Always use filters for booleans... we know the number of values is very small. - counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort); + counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix); } else { // TODO: future logic could use filters instead of the fieldcache if // the number of terms in the field is small enough. - counts = getFieldCacheCounts(searcher, docs, field, offset,limit, mincount, missing, sort); + counts = getFieldCacheCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix); } return counts; @@ -184,11 +184,21 @@ public class SimpleFacets { return docs.andNotSize(hasVal); } + + // first element of the fieldcache is null, so we need this comparator. + private static final Comparator nullStrComparator = new Comparator() { + public int compare(Object o1, Object o2) { + if (o1==null) return (o2==null) ? 0 : -1; + else if (o2==null) return 1; + return ((String)o1).compareTo((String)o2); + } + }; + /** * Use the Lucene FieldCache to get counts for each unique field value in docs. * The field must have at most one indexed token per document. */ - public static NamedList getFieldCacheCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, boolean sort) throws IOException { + public static NamedList getFieldCacheCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, boolean sort, String prefix) throws IOException { // TODO: If the number of terms is high compared to docs.size(), and zeros==false, // we should use an alternate strategy to avoid // 1) creating another huge int[] for the counts @@ -198,58 +208,97 @@ public class SimpleFacets { // then use them instead of the FieldCache. // - FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName); - final int[] count = new int[si.lookup.length]; - DocIterator iter = docs.iterator(); - while (iter.hasNext()) { - count[si.order[iter.nextDoc()]]++; - } + // TODO: this function is too big and could use some refactoring, but + // we also need a facet cache, and refactoring of SimpleFacets instead of + // trying to pass all the various params around. FieldType ft = searcher.getSchema().getFieldType(fieldName); NamedList res = new NamedList(); - // IDEA: we could also maintain a count of "other"... everything that fell outside - // of the top 'N' + FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName); + final String[] terms = si.lookup; + final int[] termNum = si.order; - int off=offset; - int lim=limit>=0 ? limit : Integer.MAX_VALUE; + if (prefix!=null && prefix.length()==0) prefix=null; - if (sort) { - // TODO: compare performance of BoundedTreeSet compare against priority queue? - int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1; - final BoundedTreeSet> queue = new BoundedTreeSet>(maxsize); - int min=mincount-1; // the smallest value in the top 'N' values - for (int i=1; imin) { - // NOTE: we use c>min rather than c>=min as an optimization because we are going in - // index order, so we already know that the keys are ordered. This can be very - // important if a lot of the counts are repeated (like zero counts would be). - queue.add(new CountPair(ft.indexedToReadable(si.lookup[i]), c)); - if (queue.size()>=maxsize) min=queue.last().val; - } - } - for (CountPair p : queue) { - if (--off>=0) continue; - if (--lim<0) break; - res.add(p.key, p.val); - } - } else if (mincount<=0) { - // This is an optimization... if mincount<=0 and we aren't sorting then - // we know exactly where to start and end in the fieldcache. - for (int i=offset+1; i=0) continue; - if (--lim<0) break; - res.add(ft.indexedToReadable(si.lookup[i]), c); + startTermIndex=1; + endTermIndex=terms.length; + } + + final int nTerms=endTermIndex-startTermIndex; + + if (nTerms>0) { + + // count collection array only needs to be as big as the number of terms we are + // going to collect counts for. + final int[] counts = new int[nTerms]; + + DocIterator iter = docs.iterator(); + while (iter.hasNext()) { + int term = termNum[iter.nextDoc()]; + int arrIdx = term-startTermIndex; + if (arrIdx>=0 && arrIdx=0 ? limit : Integer.MAX_VALUE; + + if (sort) { + int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1; + maxsize = Math.min(maxsize, nTerms); + final BoundedTreeSet> queue = new BoundedTreeSet>(maxsize); + int min=mincount-1; // the smallest value in the top 'N' values + for (int i=0; imin) { + // NOTE: we use c>min rather than c>=min as an optimization because we are going in + // index order, so we already know that the keys are ordered. This can be very + // important if a lot of the counts are repeated (like zero counts would be). + queue.add(new CountPair(terms[startTermIndex+i], c)); + if (queue.size()>=maxsize) min=queue.last().val; + } + } + // now select the right page from the results + for (CountPair p : queue) { + if (--off>=0) continue; + if (--lim<0) break; + res.add(ft.indexedToReadable(p.key), p.val); + } + } else { + // add results in index order + int i=0; + if (mincount<=0) { + // if mincount<=0, then we won't discard any terms and we know exactly + // where to start. + i=off; + off=0; + } + + for (; i=0) continue; + if (--lim<0) break; + res.add(ft.indexedToReadable(terms[startTermIndex+i]), c); + } } } - if (missing) res.add(null, count[0]); + if (missing) { + res.add(null, getFieldMissingCount(searcher,docs,fieldName)); + } + return res; } @@ -263,7 +312,7 @@ public class SimpleFacets { * @see SolrParams#FACET_ZEROS * @see SolrParams#FACET_MISSING */ - public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, boolean sort) + public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, boolean sort, String prefix) throws IOException { /* :TODO: potential optimization... @@ -282,13 +331,17 @@ public class SimpleFacets { int min=mincount-1; // the smallest value in the top 'N' values int off=offset; int lim=limit>=0 ? limit : Integer.MAX_VALUE; - TermEnum te = r.terms(new Term(field,"")); + + String startTerm = prefix==null ? "" : ft.toInternal(prefix); + TermEnum te = r.terms(new Term(field,startTerm)); do { Term t = te.term(); if (null == t || ! t.field().equals(field)) break; + if (prefix!=null && !t.text().startsWith(prefix)) break; + int df = te.docFreq(); if (df>0) { /* check df since all docs may be deleted */ diff --git a/src/java/org/apache/solr/request/SolrParams.java b/src/java/org/apache/solr/request/SolrParams.java index 8a878902ebb..a7e020bf7fd 100644 --- a/src/java/org/apache/solr/request/SolrParams.java +++ b/src/java/org/apache/solr/request/SolrParams.java @@ -122,6 +122,11 @@ public abstract class SolrParams { */ public static final String FACET_SORT = "facet.sort"; + /** + * Only return constraints of a facet field with the given prefix. + */ + public static final String FACET_PREFIX = "facet.prefix"; + /** returns the String value of a param, or null if not set */ public abstract String get(String param); diff --git a/src/test/org/apache/solr/BasicFunctionalityTest.java b/src/test/org/apache/solr/BasicFunctionalityTest.java index 8f0c6484b91..eecd755158c 100644 --- a/src/test/org/apache/solr/BasicFunctionalityTest.java +++ b/src/test/org/apache/solr/BasicFunctionalityTest.java @@ -25,14 +25,12 @@ import org.apache.solr.search.*; import org.apache.solr.request.*; import org.apache.solr.util.*; import org.apache.solr.schema.*; -import org.w3c.dom.Document; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.DocumentBuilder; import java.io.IOException; import java.io.StringWriter; import java.io.ByteArrayInputStream; -import java.io.UnsupportedEncodingException; import java.util.Map; import java.util.HashMap; @@ -575,14 +573,14 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase { public void testFacetMultiValued() { - doSimpleFacetCountsWithLimits("t_s"); + doFacets("t_s"); } public void testFacetSingleValued() { - doSimpleFacetCountsWithLimits("t_s1"); + doFacets("t_s1"); } - public void doSimpleFacetCountsWithLimits(String f) { + public void doFacets(String f) { String pre = "//lst[@name='"+f+"']"; String notc = "id:[* TO *] -"+f+":C"; @@ -742,7 +740,268 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase { ,pre+"/int[1][@name='G'][.='5']" ); } - + + + + public void testFacetPrefixMultiValued() { + doFacetPrefix("t_s"); + } + + public void testFacetPrefixSingleValued() { + doFacetPrefix("t_s1"); + } + + public void doFacetPrefix(String f) { + String indent="on"; + String pre = "//lst[@name='"+f+"']"; + String notc = "id:[* TO *] -"+f+":C"; + + assertU(adoc("id", "1", f, "AAA")); + assertU(adoc("id", "2", f, "B")); + assertU(adoc("id", "3", f, "BB")); + assertU(adoc("id", "4", f, "BB")); + assertU(adoc("id", "5", f, "BBB")); + assertU(adoc("id", "6", f, "BBB")); + assertU(adoc("id", "7", f, "BBB")); + assertU(adoc("id", "8", f, "CC")); + assertU(adoc("id", "9", f, "CC")); + assertU(adoc("id", "10", f, "CCC")); + assertU(adoc("id", "11", f, "CCC")); + assertU(adoc("id", "12", f, "CCC")); + assertU(commit()); + + assertQ("test facet.prefix middle, exact match first term", + req("q", "id:[* TO *]" + ,"indent",indent + ,"facet","true" + ,"facet.field", f + ,"facet.mincount","0" + ,"facet.offset","0" + ,"facet.limit","100" + ,"facet.sort","true" + ,"facet.prefix","B" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=3]" + ,pre+"/int[1][@name='BBB'][.='3']" + ,pre+"/int[2][@name='BB'][.='2']" + ,pre+"/int[3][@name='B'][.='1']" + ); + + assertQ("test facet.prefix middle, exact match first term, unsorted", + req("q", "id:[* TO *]" + ,"indent",indent + ,"facet","true" + ,"facet.field", f + ,"facet.mincount","0" + ,"facet.offset","0" + ,"facet.limit","100" + ,"facet.sort","false" + ,"facet.prefix","B" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=3]" + ,pre+"/int[1][@name='B'][.='1']" + ,pre+"/int[2][@name='BB'][.='2']" + ,pre+"/int[3][@name='BBB'][.='3']" + ); + + + assertQ("test facet.prefix middle, exact match first term, unsorted", + req("q", "id:[* TO *]" + ,"indent",indent + ,"facet","true" + ,"facet.field", f + ,"facet.mincount","0" + ,"facet.offset","0" + ,"facet.limit","100" + ,"facet.sort","false" + ,"facet.prefix","B" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=3]" + ,pre+"/int[1][@name='B'][.='1']" + ,pre+"/int[2][@name='BB'][.='2']" + ,pre+"/int[3][@name='BBB'][.='3']" + ); + + + assertQ("test facet.prefix middle, paging", + req("q", "id:[* TO *]" + ,"indent",indent + ,"facet","true" + ,"facet.field", f + ,"facet.mincount","0" + ,"facet.offset","1" + ,"facet.limit","100" + ,"facet.sort","true" + ,"facet.prefix","B" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=2]" + ,pre+"/int[1][@name='BB'][.='2']" + ,pre+"/int[2][@name='B'][.='1']" + ); + + assertQ("test facet.prefix middle, paging", + req("q", "id:[* TO *]" + ,"indent",indent + ,"facet","true" + ,"facet.field", f + ,"facet.mincount","0" + ,"facet.offset","1" + ,"facet.limit","1" + ,"facet.sort","true" + ,"facet.prefix","B" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=1]" + ,pre+"/int[1][@name='BB'][.='2']" + ); + + assertQ("test facet.prefix middle, paging", + req("q", "id:[* TO *]" + ,"indent",indent + ,"facet","true" + ,"facet.field", f + ,"facet.mincount","0" + ,"facet.offset","1" + ,"facet.limit","1" + ,"facet.sort","true" + ,"facet.prefix","B" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=1]" + ,pre+"/int[1][@name='BB'][.='2']" + ); + + assertQ("test facet.prefix end, not exact match", + req("q", "id:[* TO *]" + ,"indent",indent + ,"facet","true" + ,"facet.field", f + ,"facet.mincount","0" + ,"facet.offset","0" + ,"facet.limit","100" + ,"facet.sort","true" + ,"facet.prefix","C" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=2]" + ,pre+"/int[1][@name='CCC'][.='3']" + ,pre+"/int[2][@name='CC'][.='2']" + ); + + assertQ("test facet.prefix end, exact match", + req("q", "id:[* TO *]" + ,"indent",indent + ,"facet","true" + ,"facet.field", f + ,"facet.mincount","0" + ,"facet.offset","0" + ,"facet.limit","100" + ,"facet.sort","true" + ,"facet.prefix","CC" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=2]" + ,pre+"/int[1][@name='CCC'][.='3']" + ,pre+"/int[2][@name='CC'][.='2']" + ); + + assertQ("test facet.prefix past end", + req("q", "id:[* TO *]" + ,"indent",indent + ,"facet","true" + ,"facet.field", f + ,"facet.mincount","0" + ,"facet.offset","0" + ,"facet.limit","100" + ,"facet.sort","true" + ,"facet.prefix","X" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=0]" + ); + + assertQ("test facet.prefix past end", + req("q", "id:[* TO *]" + ,"indent",indent + ,"facet","true" + ,"facet.field", f + ,"facet.mincount","0" + ,"facet.offset","1" + ,"facet.limit","-1" + ,"facet.sort","true" + ,"facet.prefix","X" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=0]" + ); + + assertQ("test facet.prefix at start, exact match", + req("q", "id:[* TO *]" + ,"indent",indent + ,"facet","true" + ,"facet.field", f + ,"facet.mincount","0" + ,"facet.offset","0" + ,"facet.limit","100" + ,"facet.sort","true" + ,"facet.prefix","AAA" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=1]" + ,pre+"/int[1][@name='AAA'][.='1']" + ); + assertQ("test facet.prefix at Start, not exact match", + req("q", "id:[* TO *]" + ,"indent",indent + ,"facet","true" + ,"facet.field", f + ,"facet.mincount","0" + ,"facet.offset","0" + ,"facet.limit","100" + ,"facet.sort","true" + ,"facet.prefix","AA" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=1]" + ,pre+"/int[1][@name='AAA'][.='1']" + ); + assertQ("test facet.prefix at Start, not exact match", + req("q", "id:[* TO *]" + ,"indent",indent + ,"facet","true" + ,"facet.field", f + ,"facet.mincount","0" + ,"facet.offset","0" + ,"facet.limit","100" + ,"facet.sort","true" + ,"facet.prefix","AA" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=1]" + ,pre+"/int[1][@name='AAA'][.='1']" + ); + assertQ("test facet.prefix before start", + req("q", "id:[* TO *]" + ,"indent",indent + ,"facet","true" + ,"facet.field", f + ,"facet.mincount","0" + ,"facet.offset","0" + ,"facet.limit","100" + ,"facet.sort","true" + ,"facet.prefix","999" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=0]" + ); + + assertQ("test facet.prefix before start", + req("q", "id:[* TO *]" + ,"indent",indent + ,"facet","true" + ,"facet.field", f + ,"facet.mincount","0" + ,"facet.offset","2" + ,"facet.limit","100" + ,"facet.sort","true" + ,"facet.prefix","999" + ) + ,"*[count(//lst[@name='facet_fields']/lst/int)=0]" + ); + + } + + private String mkstr(int len) { StringBuilder sb = new StringBuilder(len); for (int i = 0; i < len; i++) {