facet.prefix: SOLR-117

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@499834 13f79535-47bb-0310-9956-ffa450edef68
Yonik Seeley 2007-01-25 16:32:01 +00:00
parent f027d99c6f
commit 8cf4a87acb
4 changed files with 376 additions and 56 deletions

CHANGES.txt

@@ -54,6 +54,9 @@ New Features
     starting with the smallest positive set, subtracting all negative
     sets, then intersecting with all other positive sets. (yonik)
+ 6. SOLR-117: Limit a field faceting to constraints with a prefix specified
+    by facet.prefix or f.<field>.facet.prefix. (yonik)
+
 Changes in runtime behavior
  1. Highlighting using DisMax will only pick up terms from the main
     user query, not boost or filter queries (klaas).
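
For illustration (not part of the commit), a request exercising the new parameters against a hypothetical schema with cat and manu fields might look like:

    http://localhost:8983/solr/select?q=*:*&facet=true&facet.field=cat&facet.prefix=elec
    http://localhost:8983/solr/select?q=*:*&facet=true&facet.field=cat&facet.field=manu&f.manu.facet.prefix=sam

Only facet constraints whose terms start with the given prefix are returned; the f.<field>. form overrides the global facet.prefix for that one field.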

src/java/org/apache/solr/request/SimpleFacets.java

@@ -20,12 +20,10 @@ package org.apache.solr.request;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermEnum;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.TermDocs;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.search.*;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.core.SolrException;
-import org.apache.solr.core.SolrConfig;
 import org.apache.solr.request.SolrParams;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.schema.FieldType;
@@ -36,7 +34,8 @@ import org.apache.solr.util.NamedList;
 import org.apache.solr.util.BoundedTreeSet;
 
 import java.io.IOException;
-import java.util.*;
+import java.util.Arrays;
+import java.util.Comparator;
 
 /**
  * A class that generates simple Facet information for a request.
@@ -132,17 +131,18 @@ public class SimpleFacets {
     boolean missing = params.getFieldBool(field, params.FACET_MISSING, false);
     // default to sorting if there is a limit.
     boolean sort = params.getFieldBool(field, params.FACET_SORT, limit>0);
+    String prefix = params.getFieldParam(field,params.FACET_PREFIX);
 
     NamedList counts;
     SchemaField sf = searcher.getSchema().getField(field);
     FieldType ft = sf.getType();
     if (sf.multiValued() || ft.isTokenized() || ft instanceof BoolField) {
       // Always use filters for booleans... we know the number of values is very small.
-      counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort);
+      counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix);
     } else {
       // TODO: future logic could use filters instead of the fieldcache if
       // the number of terms in the field is small enough.
-      counts = getFieldCacheCounts(searcher, docs, field, offset,limit, mincount, missing, sort);
+      counts = getFieldCacheCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix);
     }
 
     return counts;
@@ -184,11 +184,21 @@ public class SimpleFacets {
     return docs.andNotSize(hasVal);
   }
 
+  // first element of the fieldcache is null, so we need this comparator.
+  private static final Comparator nullStrComparator = new Comparator() {
+    public int compare(Object o1, Object o2) {
+      if (o1==null) return (o2==null) ? 0 : -1;
+      else if (o2==null) return 1;
+      return ((String)o1).compareTo((String)o2);
+    }
+  };
+
   /**
    * Use the Lucene FieldCache to get counts for each unique field value in <code>docs</code>.
    * The field must have at most one indexed token per document.
    */
-  public static NamedList getFieldCacheCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, boolean sort) throws IOException {
+  public static NamedList getFieldCacheCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, boolean sort, String prefix) throws IOException {
     // TODO: If the number of terms is high compared to docs.size(), and zeros==false,
     //  we should use an alternate strategy to avoid
     //  1) creating another huge int[] for the counts
@@ -198,58 +208,97 @@ public class SimpleFacets {
     //   then use them instead of the FieldCache.
     //
-    FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName);
-    final int[] count = new int[si.lookup.length];
-    DocIterator iter = docs.iterator();
-    while (iter.hasNext()) {
-      count[si.order[iter.nextDoc()]]++;
-    }
+    // TODO: this function is too big and could use some refactoring, but
+    // we also need a facet cache, and refactoring of SimpleFacets instead of
+    // trying to pass all the various params around.
 
     FieldType ft = searcher.getSchema().getFieldType(fieldName);
     NamedList res = new NamedList();
 
-    // IDEA: we could also maintain a count of "other"... everything that fell outside
-    // of the top 'N'
-
-    int off=offset;
-    int lim=limit>=0 ? limit : Integer.MAX_VALUE;
-
-    if (sort) {
-      // TODO: compare performance of BoundedTreeSet compare against priority queue?
-      int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
-      final BoundedTreeSet<CountPair<String,Integer>> queue = new BoundedTreeSet<CountPair<String,Integer>>(maxsize);
-      int min=mincount-1;  // the smallest value in the top 'N' values
-      for (int i=1; i<count.length; i++) {
-        int c = count[i];
-        if (c>min) {
-          // NOTE: we use c>min rather than c>=min as an optimization because we are going in
-          // index order, so we already know that the keys are ordered.  This can be very
-          // important if a lot of the counts are repeated (like zero counts would be).
-          queue.add(new CountPair<String,Integer>(ft.indexedToReadable(si.lookup[i]), c));
-          if (queue.size()>=maxsize) min=queue.last().val;
-        }
-      }
-      for (CountPair<String,Integer> p : queue) {
-        if (--off>=0) continue;
-        if (--lim<0) break;
-        res.add(p.key, p.val);
-      }
-    } else if (mincount<=0) {
-      // This is an optimization... if mincount<=0 and we aren't sorting then
-      // we know exactly where to start and end in the fieldcache.
-      for (int i=offset+1; i<offset+1+limit; i++) {
-        res.add(ft.indexedToReadable(si.lookup[i]),count[i]);
-      }
-    } else {
-      for (int i=1; i<count.length; i++) {
-        int c = count[i];
-        if (c<mincount || --off>=0) continue;
-        if (--lim<0) break;
-        res.add(ft.indexedToReadable(si.lookup[i]), c);
-      }
-    }
-
-    if (missing) res.add(null, count[0]);
+    FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName);
+    final String[] terms = si.lookup;
+    final int[] termNum = si.order;
+
+    if (prefix!=null && prefix.length()==0) prefix=null;
+
+    int startTermIndex, endTermIndex;
+    if (prefix!=null) {
+      startTermIndex = Arrays.binarySearch(terms,prefix,nullStrComparator);
+      if (startTermIndex<0) startTermIndex=-startTermIndex-1;
+      // find the end term.  \uffff isn't a legal unicode char, but only compareTo
+      // is used, so it should be fine, and is guaranteed to be bigger than legal chars.
+      endTermIndex = Arrays.binarySearch(terms,prefix+"\uffff\uffff\uffff\uffff",nullStrComparator);
+      endTermIndex = -endTermIndex-1;
+    } else {
+      startTermIndex=1;
+      endTermIndex=terms.length;
+    }
+
+    final int nTerms=endTermIndex-startTermIndex;
+
+    if (nTerms>0) {
+      // count collection array only needs to be as big as the number of terms we are
+      // going to collect counts for.
+      final int[] counts = new int[nTerms];
+      DocIterator iter = docs.iterator();
+      while (iter.hasNext()) {
+        int term = termNum[iter.nextDoc()];
+        int arrIdx = term-startTermIndex;
+        if (arrIdx>=0 && arrIdx<nTerms) counts[arrIdx]++;
+      }
+
+      // IDEA: we could also maintain a count of "other"... everything that fell outside
+      // of the top 'N'
+
+      int off=offset;
+      int lim=limit>=0 ? limit : Integer.MAX_VALUE;
+
+      if (sort) {
+        int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
+        maxsize = Math.min(maxsize, nTerms);
+        final BoundedTreeSet<CountPair<String,Integer>> queue = new BoundedTreeSet<CountPair<String,Integer>>(maxsize);
+        int min=mincount-1;  // the smallest value in the top 'N' values
+        for (int i=0; i<nTerms; i++) {
+          int c = counts[i];
+          if (c>min) {
+            // NOTE: we use c>min rather than c>=min as an optimization because we are going in
+            // index order, so we already know that the keys are ordered.  This can be very
+            // important if a lot of the counts are repeated (like zero counts would be).
+            queue.add(new CountPair<String,Integer>(terms[startTermIndex+i], c));
+            if (queue.size()>=maxsize) min=queue.last().val;
+          }
+        }
+        // now select the right page from the results
+        for (CountPair<String,Integer> p : queue) {
+          if (--off>=0) continue;
+          if (--lim<0) break;
+          res.add(ft.indexedToReadable(p.key), p.val);
+        }
+      } else {
+        // add results in index order
+        int i=0;
+        if (mincount<=0) {
+          // if mincount<=0, then we won't discard any terms and we know exactly
+          // where to start.
+          i=off;
+          off=0;
+        }
+
+        for (; i<nTerms; i++) {
+          int c = counts[i];
+          if (c<mincount || --off>=0) continue;
+          if (--lim<0) break;
+          res.add(ft.indexedToReadable(terms[startTermIndex+i]), c);
+        }
+      }
+    }
+
+    if (missing) {
+      res.add(null, getFieldMissingCount(searcher,docs,fieldName));
+    }
 
     return res;
   }
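
The heart of the new FieldCache path is bracketing the sorted term array with two binary searches instead of scanning every term. A minimal standalone sketch of that bracketing, with a hypothetical term dictionary (an illustration, not Solr code):

import java.util.Arrays;
import java.util.Comparator;

public class PrefixRangeSketch {
  // Same idea as nullStrComparator above: entry 0 of StringIndex.lookup is
  // null, so the comparator must order null before every real term.
  static final Comparator<String> NULL_FIRST = (a, b) -> {
    if (a == null) return (b == null) ? 0 : -1;
    if (b == null) return 1;
    return a.compareTo(b);
  };

  public static void main(String[] args) {
    // Hypothetical sorted term dictionary with the null sentinel at index 0.
    String[] terms = { null, "AAA", "B", "BB", "BBB", "CC", "CCC" };
    String prefix = "B";

    // Insertion point of the prefix = index of the first term >= prefix
    // (an exact match is included, as the "exact match first term" tests expect).
    int start = Arrays.binarySearch(terms, prefix, NULL_FIRST);
    if (start < 0) start = -start - 1;

    // "\uffff" compares above every legal char, so prefix+"\uffff..." is an
    // upper bound for everything starting with the prefix; the search can
    // never hit exactly, so the insertion point is always what comes back.
    int end = Arrays.binarySearch(terms, prefix + "\uffff\uffff\uffff\uffff", NULL_FIRST);
    end = -end - 1;

    // Prints [B, BB, BBB]: counts only need to be collected for this slice.
    System.out.println(Arrays.asList(terms).subList(start, end));
  }
}

Because only terms[startTermIndex..endTermIndex) can match, the counts array shrinks from one slot per term in the field to one slot per term under the prefix.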
@@ -263,7 +312,7 @@ public class SimpleFacets {
    * @see SolrParams#FACET_ZEROS
    * @see SolrParams#FACET_MISSING
    */
-  public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, boolean sort)
+  public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, boolean sort, String prefix)
     throws IOException {
 
     /* :TODO: potential optimization...
@@ -282,13 +331,17 @@ public class SimpleFacets {
     int min=mincount-1;  // the smallest value in the top 'N' values
     int off=offset;
     int lim=limit>=0 ? limit : Integer.MAX_VALUE;
 
-    TermEnum te = r.terms(new Term(field,""));
+    String startTerm = prefix==null ? "" : ft.toInternal(prefix);
+    TermEnum te = r.terms(new Term(field,startTerm));
     do {
       Term t = te.term();
       if (null == t || ! t.field().equals(field))
         break;
+      if (prefix!=null && !t.text().startsWith(prefix)) break;
+
       int df = te.docFreq();
       if (df>0) { /* check df since all docs may be deleted */
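
The TermEnum path gets the same restriction by seeking straight to the first term >= the prefix and breaking out of the walk at the first term that no longer starts with it. A sketch of that walk, assuming the Lucene 2.x API this code compiles against and an already-open IndexReader r (illustrative; the method name is hypothetical, and TermEnum is long gone from modern Lucene):

  static void dumpPrefixTerms(IndexReader r, String field, String prefix) throws IOException {
    TermEnum te = r.terms(new Term(field, prefix)); // seek to first term >= prefix
    try {
      do {
        Term t = te.term();
        if (t == null || !t.field().equals(field)) break; // walked off this field
        if (!t.text().startsWith(prefix)) break;          // walked past the prefix range
        System.out.println(t.text() + " df=" + te.docFreq());
      } while (te.next());
    } finally {
      te.close();
    }
  }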

src/java/org/apache/solr/request/SolrParams.java

@@ -122,6 +122,11 @@ public abstract class SolrParams {
    */
   public static final String FACET_SORT = "facet.sort";
 
+  /**
+   * Only return constraints of a facet field with the given prefix.
+   */
+  public static final String FACET_PREFIX = "facet.prefix";
+
   /** returns the String value of a param, or null if not set */
   public abstract String get(String param);
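
As with the other per-field facet params, SimpleFacets resolves this one through getFieldParam, which checks the field-scoped name before falling back to the global one. With illustrative field names:

    facet.prefix=a            applies to every facet.field by default
    f.manu.facet.prefix=sam   overrides the default for the manu field only

so getFieldParam("manu", FACET_PREFIX) returns "sam" while every other field still gets "a".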

src/test/org/apache/solr/BasicFunctionalityTest.java

@@ -25,14 +25,12 @@ import org.apache.solr.search.*;
 import org.apache.solr.request.*;
 import org.apache.solr.util.*;
 import org.apache.solr.schema.*;
-import org.w3c.dom.Document;
 
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.DocumentBuilder;
 
 import java.io.IOException;
 import java.io.StringWriter;
 import java.io.ByteArrayInputStream;
-import java.io.UnsupportedEncodingException;
 import java.util.Map;
 import java.util.HashMap;
@@ -575,14 +573,14 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
 
   public void testFacetMultiValued() {
-    doSimpleFacetCountsWithLimits("t_s");
+    doFacets("t_s");
   }
 
   public void testFacetSingleValued() {
-    doSimpleFacetCountsWithLimits("t_s1");
+    doFacets("t_s1");
   }
 
-  public void doSimpleFacetCountsWithLimits(String f) {
+  public void doFacets(String f) {
     String pre = "//lst[@name='"+f+"']";
     String notc = "id:[* TO *] -"+f+":C";
@@ -742,7 +740,268 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
             ,pre+"/int[1][@name='G'][.='5']"
             );
   }
+
+  public void testFacetPrefixMultiValued() {
+    doFacetPrefix("t_s");
+  }
+
+  public void testFacetPrefixSingleValued() {
+    doFacetPrefix("t_s1");
+  }
+
+  public void doFacetPrefix(String f) {
+    String indent="on";
+    String pre = "//lst[@name='"+f+"']";
+    String notc = "id:[* TO *] -"+f+":C";
+
+    assertU(adoc("id", "1", f, "AAA"));
+    assertU(adoc("id", "2", f, "B"));
+    assertU(adoc("id", "3", f, "BB"));
+    assertU(adoc("id", "4", f, "BB"));
+    assertU(adoc("id", "5", f, "BBB"));
+    assertU(adoc("id", "6", f, "BBB"));
+    assertU(adoc("id", "7", f, "BBB"));
+    assertU(adoc("id", "8", f, "CC"));
+    assertU(adoc("id", "9", f, "CC"));
+    assertU(adoc("id", "10", f, "CCC"));
+    assertU(adoc("id", "11", f, "CCC"));
+    assertU(adoc("id", "12", f, "CCC"));
+    assertU(commit());
+
+    assertQ("test facet.prefix middle, exact match first term",
+            req("q", "id:[* TO *]"
+                ,"indent",indent
+                ,"facet","true"
+                ,"facet.field", f
+                ,"facet.mincount","0"
+                ,"facet.offset","0"
+                ,"facet.limit","100"
+                ,"facet.sort","true"
+                ,"facet.prefix","B"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=3]"
+            ,pre+"/int[1][@name='BBB'][.='3']"
+            ,pre+"/int[2][@name='BB'][.='2']"
+            ,pre+"/int[3][@name='B'][.='1']"
+    );
+
+    assertQ("test facet.prefix middle, exact match first term, unsorted",
+            req("q", "id:[* TO *]"
+                ,"indent",indent
+                ,"facet","true"
+                ,"facet.field", f
+                ,"facet.mincount","0"
+                ,"facet.offset","0"
+                ,"facet.limit","100"
+                ,"facet.sort","false"
+                ,"facet.prefix","B"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=3]"
+            ,pre+"/int[1][@name='B'][.='1']"
+            ,pre+"/int[2][@name='BB'][.='2']"
+            ,pre+"/int[3][@name='BBB'][.='3']"
+    );
+
+    assertQ("test facet.prefix middle, exact match first term, unsorted",
+            req("q", "id:[* TO *]"
+                ,"indent",indent
+                ,"facet","true"
+                ,"facet.field", f
+                ,"facet.mincount","0"
+                ,"facet.offset","0"
+                ,"facet.limit","100"
+                ,"facet.sort","false"
+                ,"facet.prefix","B"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=3]"
+            ,pre+"/int[1][@name='B'][.='1']"
+            ,pre+"/int[2][@name='BB'][.='2']"
+            ,pre+"/int[3][@name='BBB'][.='3']"
+    );
+
+    assertQ("test facet.prefix middle, paging",
+            req("q", "id:[* TO *]"
+                ,"indent",indent
+                ,"facet","true"
+                ,"facet.field", f
+                ,"facet.mincount","0"
+                ,"facet.offset","1"
+                ,"facet.limit","100"
+                ,"facet.sort","true"
+                ,"facet.prefix","B"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=2]"
+            ,pre+"/int[1][@name='BB'][.='2']"
+            ,pre+"/int[2][@name='B'][.='1']"
+    );
+
+    assertQ("test facet.prefix middle, paging",
+            req("q", "id:[* TO *]"
+                ,"indent",indent
+                ,"facet","true"
+                ,"facet.field", f
+                ,"facet.mincount","0"
+                ,"facet.offset","1"
+                ,"facet.limit","1"
+                ,"facet.sort","true"
+                ,"facet.prefix","B"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=1]"
+            ,pre+"/int[1][@name='BB'][.='2']"
+    );
+
+    assertQ("test facet.prefix middle, paging",
+            req("q", "id:[* TO *]"
+                ,"indent",indent
+                ,"facet","true"
+                ,"facet.field", f
+                ,"facet.mincount","0"
+                ,"facet.offset","1"
+                ,"facet.limit","1"
+                ,"facet.sort","true"
+                ,"facet.prefix","B"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=1]"
+            ,pre+"/int[1][@name='BB'][.='2']"
+    );
+
+    assertQ("test facet.prefix end, not exact match",
+            req("q", "id:[* TO *]"
+                ,"indent",indent
+                ,"facet","true"
+                ,"facet.field", f
+                ,"facet.mincount","0"
+                ,"facet.offset","0"
+                ,"facet.limit","100"
+                ,"facet.sort","true"
+                ,"facet.prefix","C"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=2]"
+            ,pre+"/int[1][@name='CCC'][.='3']"
+            ,pre+"/int[2][@name='CC'][.='2']"
+    );
+
+    assertQ("test facet.prefix end, exact match",
+            req("q", "id:[* TO *]"
+                ,"indent",indent
+                ,"facet","true"
+                ,"facet.field", f
+                ,"facet.mincount","0"
+                ,"facet.offset","0"
+                ,"facet.limit","100"
+                ,"facet.sort","true"
+                ,"facet.prefix","CC"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=2]"
+            ,pre+"/int[1][@name='CCC'][.='3']"
+            ,pre+"/int[2][@name='CC'][.='2']"
+    );
+
+    assertQ("test facet.prefix past end",
+            req("q", "id:[* TO *]"
+                ,"indent",indent
+                ,"facet","true"
+                ,"facet.field", f
+                ,"facet.mincount","0"
+                ,"facet.offset","0"
+                ,"facet.limit","100"
+                ,"facet.sort","true"
+                ,"facet.prefix","X"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=0]"
+    );
+
+    assertQ("test facet.prefix past end",
+            req("q", "id:[* TO *]"
+                ,"indent",indent
+                ,"facet","true"
+                ,"facet.field", f
+                ,"facet.mincount","0"
+                ,"facet.offset","1"
+                ,"facet.limit","-1"
+                ,"facet.sort","true"
+                ,"facet.prefix","X"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=0]"
+    );
+
+    assertQ("test facet.prefix at start, exact match",
+            req("q", "id:[* TO *]"
+                ,"indent",indent
+                ,"facet","true"
+                ,"facet.field", f
+                ,"facet.mincount","0"
+                ,"facet.offset","0"
+                ,"facet.limit","100"
+                ,"facet.sort","true"
+                ,"facet.prefix","AAA"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=1]"
+            ,pre+"/int[1][@name='AAA'][.='1']"
+    );
+
+    assertQ("test facet.prefix at Start, not exact match",
+            req("q", "id:[* TO *]"
+                ,"indent",indent
+                ,"facet","true"
+                ,"facet.field", f
+                ,"facet.mincount","0"
+                ,"facet.offset","0"
+                ,"facet.limit","100"
+                ,"facet.sort","true"
+                ,"facet.prefix","AA"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=1]"
+            ,pre+"/int[1][@name='AAA'][.='1']"
+    );
+
+    assertQ("test facet.prefix at Start, not exact match",
+            req("q", "id:[* TO *]"
+                ,"indent",indent
+                ,"facet","true"
+                ,"facet.field", f
+                ,"facet.mincount","0"
+                ,"facet.offset","0"
+                ,"facet.limit","100"
+                ,"facet.sort","true"
+                ,"facet.prefix","AA"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=1]"
+            ,pre+"/int[1][@name='AAA'][.='1']"
+    );
+
+    assertQ("test facet.prefix before start",
+            req("q", "id:[* TO *]"
+                ,"indent",indent
+                ,"facet","true"
+                ,"facet.field", f
+                ,"facet.mincount","0"
+                ,"facet.offset","0"
+                ,"facet.limit","100"
+                ,"facet.sort","true"
+                ,"facet.prefix","999"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=0]"
+    );
+
+    assertQ("test facet.prefix before start",
+            req("q", "id:[* TO *]"
+                ,"indent",indent
+                ,"facet","true"
+                ,"facet.field", f
+                ,"facet.mincount","0"
+                ,"facet.offset","2"
+                ,"facet.limit","100"
+                ,"facet.sort","true"
+                ,"facet.prefix","999"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=0]"
+    );
+  }
 
   private String mkstr(int len) {
     StringBuilder sb = new StringBuilder(len);
     for (int i = 0; i < len; i++) {