From 8cf4a87acbc9e08697c28761c23d35e19d9f61c8 Mon Sep 17 00:00:00 2001
From: Yonik Seeley <yonik@apache.org>
Date: Thu, 25 Jan 2007 16:32:01 +0000
Subject: [PATCH] facet.prefix: SOLR-117

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@499834 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                   |   3 +
 .../org/apache/solr/request/SimpleFacets.java | 153 ++++++----
 .../org/apache/solr/request/SolrParams.java   |   5 +
 .../apache/solr/BasicFunctionalityTest.java   | 271 +++++++++++++++++-
 4 files changed, 376 insertions(+), 56 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index a13715398df..b6c3fcdb30d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -54,6 +54,9 @@ New Features
     starting with the smallest positive set, subtracting all negative
     sets, then intersecting with all other positive sets.  (yonik)
 
+ 6. SOLR-117: Limit a field faceting to constraints with a prefix specified
+    by facet.prefix or f.<field>.facet.prefix. (yonik)
+
 Changes in runtime behavior
  1. Highlighting using DisMax will only pick up terms from the main 
     user query, not boost or filter queries (klaas).
diff --git a/src/java/org/apache/solr/request/SimpleFacets.java b/src/java/org/apache/solr/request/SimpleFacets.java
index 4944b8b4edd..8f3947e6351 100644
--- a/src/java/org/apache/solr/request/SimpleFacets.java
+++ b/src/java/org/apache/solr/request/SimpleFacets.java
@@ -20,12 +20,10 @@ package org.apache.solr.request;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermEnum;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.TermDocs;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.search.*;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.core.SolrException;
-import org.apache.solr.core.SolrConfig;
 import org.apache.solr.request.SolrParams;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.schema.FieldType;
@@ -36,7 +34,8 @@ import org.apache.solr.util.NamedList;
 import org.apache.solr.util.BoundedTreeSet;
 
 import java.io.IOException;
-import java.util.*;
+import java.util.Arrays;
+import java.util.Comparator;
 
 /**
  * A class that generates simple Facet information for a request.
@@ -132,17 +131,18 @@ public class SimpleFacets {
     boolean missing = params.getFieldBool(field, params.FACET_MISSING, false);
     // default to sorting if there is a limit.
     boolean sort = params.getFieldBool(field, params.FACET_SORT, limit>0);
+    String prefix = params.getFieldParam(field,params.FACET_PREFIX);
 
     NamedList counts;
     SchemaField sf = searcher.getSchema().getField(field);
     FieldType ft = sf.getType();
     if (sf.multiValued() || ft.isTokenized() || ft instanceof BoolField) {
       // Always use filters for booleans... we know the number of values is very small.
-      counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort);
+      counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix);
     } else {
       // TODO: future logic could use filters instead of the fieldcache if
       // the number of terms in the field is small enough.
-      counts = getFieldCacheCounts(searcher, docs, field, offset,limit, mincount, missing, sort);
+      counts = getFieldCacheCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix);
     }
 
     return counts;
@@ -184,11 +184,21 @@ public class SimpleFacets {
     return docs.andNotSize(hasVal);
   }
 
+
+  // first element of the fieldcache is null, so we need this comparator.
+  private static final Comparator nullStrComparator = new Comparator() {
+        public int compare(Object o1, Object o2) {
+          if (o1==null) return (o2==null) ? 0 : -1;
+          else if (o2==null) return 1;
+          return ((String)o1).compareTo((String)o2);
+        }
+      }; 
+
   /**
    * Use the Lucene FieldCache to get counts for each unique field value in <code>docs</code>.
    * The field must have at most one indexed token per document.
    */
-  public static NamedList getFieldCacheCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, boolean sort) throws IOException {
+  public static NamedList getFieldCacheCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, boolean sort, String prefix) throws IOException {
     // TODO: If the number of terms is high compared to docs.size(), and zeros==false,
     //  we should use an alternate strategy to avoid
     //  1) creating another huge int[] for the counts
@@ -198,58 +208,97 @@ public class SimpleFacets {
     // then use them instead of the FieldCache.
     //
 
-    FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName);
-    final int[] count = new int[si.lookup.length];
-    DocIterator iter = docs.iterator();
-    while (iter.hasNext()) {
-      count[si.order[iter.nextDoc()]]++;
-    }
+    // TODO: this function is too big and could use some refactoring, but
+    // we also need a facet cache, and refactoring of SimpleFacets instead of
+    // trying to pass all the various params around.
 
     FieldType ft = searcher.getSchema().getFieldType(fieldName);
     NamedList res = new NamedList();
 
-    // IDEA: we could also maintain a count of "other"... everything that fell outside
-    // of the top 'N'
+    FieldCache.StringIndex si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName);
+    final String[] terms = si.lookup;
+    final int[] termNum = si.order;
 
-    int off=offset;
-    int lim=limit>=0 ? limit : Integer.MAX_VALUE;
+    if (prefix!=null && prefix.length()==0) prefix=null;
 
-    if (sort) {
-      // TODO: compare performance of BoundedTreeSet compare against priority queue?
-      int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
-      final BoundedTreeSet<CountPair<String,Integer>> queue = new BoundedTreeSet<CountPair<String,Integer>>(maxsize);
-      int min=mincount-1;  // the smallest value in the top 'N' values
-      for (int i=1; i<count.length; i++) {
-        int c = count[i];
-        if (c>min) {
-          // NOTE: we use c>min rather than c>=min as an optimization because we are going in
-          // index order, so we already know that the keys are ordered.  This can be very
-          // important if a lot of the counts are repeated (like zero counts would be).
-          queue.add(new CountPair<String,Integer>(ft.indexedToReadable(si.lookup[i]), c));
-          if (queue.size()>=maxsize) min=queue.last().val;
-        }
-      }
-      for (CountPair<String,Integer> p : queue) {
-        if (--off>=0) continue;
-        if (--lim<0) break;
-        res.add(p.key, p.val);
-      }
-    } else if (mincount<=0) {
-      // This is an optimization... if mincount<=0 and we aren't sorting then
-      // we know exactly where to start and end in the fieldcache.
-      for (int i=offset+1; i<offset+1+limit; i++) {
-        res.add(ft.indexedToReadable(si.lookup[i]),count[i]);
-      }
+    int startTermIndex, endTermIndex;
+    if (prefix!=null) {
+      startTermIndex = Arrays.binarySearch(terms,prefix,nullStrComparator);
+      if (startTermIndex<0) startTermIndex=-startTermIndex-1;
+      // find the end term.  \uffff isn't a legal unicode char, but only compareTo
+      // is used, so it should be fine, and is guaranteed to be bigger than legal chars.
+      endTermIndex = Arrays.binarySearch(terms,prefix+"\uffff\uffff\uffff\uffff",nullStrComparator);
+      endTermIndex = -endTermIndex-1;
     } else {
-      for (int i=1; i<count.length; i++) {
-        int c = count[i];
-        if (c<mincount || --off>=0) continue;
-        if (--lim<0) break;
-        res.add(ft.indexedToReadable(si.lookup[i]), c);      
+      startTermIndex=1;
+      endTermIndex=terms.length;
+    }
+
+    final int nTerms=endTermIndex-startTermIndex;
+
+    if (nTerms>0) {
+
+      // count collection array only needs to be as big as the number of terms we are
+      // going to collect counts for.
+      final int[] counts = new int[nTerms];
+
+      DocIterator iter = docs.iterator();
+      while (iter.hasNext()) {
+        int term = termNum[iter.nextDoc()];
+        int arrIdx = term-startTermIndex;
+        if (arrIdx>=0 && arrIdx<nTerms) counts[arrIdx]++;
+      }
+
+      // IDEA: we could also maintain a count of "other"... everything that fell outside
+      // of the top 'N'
+
+      int off=offset;
+      int lim=limit>=0 ? limit : Integer.MAX_VALUE;
+
+      if (sort) {
+        int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
+        maxsize = Math.min(maxsize, nTerms);
+        final BoundedTreeSet<CountPair<String,Integer>> queue = new BoundedTreeSet<CountPair<String,Integer>>(maxsize);
+        int min=mincount-1;  // the smallest value in the top 'N' values
+        for (int i=0; i<nTerms; i++) {
+          int c = counts[i];
+          if (c>min) {
+            // NOTE: we use c>min rather than c>=min as an optimization because we are going in
+            // index order, so we already know that the keys are ordered.  This can be very
+            // important if a lot of the counts are repeated (like zero counts would be).
+            queue.add(new CountPair<String,Integer>(terms[startTermIndex+i], c));
+            if (queue.size()>=maxsize) min=queue.last().val;
+          }
+        }
+        // now select the right page from the results
+        for (CountPair<String,Integer> p : queue) {
+          if (--off>=0) continue;
+          if (--lim<0) break;
+          res.add(ft.indexedToReadable(p.key), p.val);
+        }
+      } else {
+        // add results in index order
+        int i=0;
+        if (mincount<=0) {
+          // if mincount<=0, then we won't discard any terms and we know exactly
+          // where to start.
+          i=off;
+          off=0;
+        }
+
+        for (; i<nTerms; i++) {          
+          int c = counts[i];
+          if (c<mincount || --off>=0) continue;
+          if (--lim<0) break;
+          res.add(ft.indexedToReadable(terms[startTermIndex+i]), c);
+        }
       }
     }
 
-    if (missing) res.add(null, count[0]);
+    if (missing) {
+      res.add(null, getFieldMissingCount(searcher,docs,fieldName));
+    }
+    
     return res;
   }
 
@@ -263,7 +312,7 @@ public class SimpleFacets {
    * @see SolrParams#FACET_ZEROS
    * @see SolrParams#FACET_MISSING
    */
-  public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, boolean sort)
+  public NamedList getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, boolean sort, String prefix)
     throws IOException {
 
     /* :TODO: potential optimization...
@@ -282,13 +331,17 @@ public class SimpleFacets {
     int min=mincount-1;  // the smallest value in the top 'N' values    
     int off=offset;
     int lim=limit>=0 ? limit : Integer.MAX_VALUE;
-    TermEnum te = r.terms(new Term(field,""));
+
+    String startTerm = prefix==null ? "" : ft.toInternal(prefix);
+    TermEnum te = r.terms(new Term(field,startTerm));
     do {
       Term t = te.term();
 
       if (null == t || ! t.field().equals(field))
         break;
 
+      if (prefix!=null && !t.text().startsWith(prefix)) break;
+
       int df = te.docFreq();
 
       if (df>0) { /* check df since all docs may be deleted */
diff --git a/src/java/org/apache/solr/request/SolrParams.java b/src/java/org/apache/solr/request/SolrParams.java
index 8a878902ebb..a7e020bf7fd 100644
--- a/src/java/org/apache/solr/request/SolrParams.java
+++ b/src/java/org/apache/solr/request/SolrParams.java
@@ -122,6 +122,11 @@ public abstract class SolrParams {
    */
   public static final String FACET_SORT = "facet.sort";
 
+  /**
+   * Only return constraints of a facet field with the given prefix.
+   */
+  public static final String FACET_PREFIX = "facet.prefix";
+
   /** returns the String value of a param, or null if not set */
   public abstract String get(String param);
 
diff --git a/src/test/org/apache/solr/BasicFunctionalityTest.java b/src/test/org/apache/solr/BasicFunctionalityTest.java
index 8f0c6484b91..eecd755158c 100644
--- a/src/test/org/apache/solr/BasicFunctionalityTest.java
+++ b/src/test/org/apache/solr/BasicFunctionalityTest.java
@@ -25,14 +25,12 @@ import org.apache.solr.search.*;
 import org.apache.solr.request.*;
 import org.apache.solr.util.*;
 import org.apache.solr.schema.*;
-import org.w3c.dom.Document;
 
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.DocumentBuilder;
 import java.io.IOException;
 import java.io.StringWriter;
 import java.io.ByteArrayInputStream;
-import java.io.UnsupportedEncodingException;
 import java.util.Map;
 import java.util.HashMap;
 
@@ -575,14 +573,14 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
  
 
   public void testFacetMultiValued() {
-    doSimpleFacetCountsWithLimits("t_s");
+    doFacets("t_s");
   }
 
   public void testFacetSingleValued() {
-    doSimpleFacetCountsWithLimits("t_s1");
+    doFacets("t_s1");
   }
 
-  public void doSimpleFacetCountsWithLimits(String f) {
+  public void doFacets(String f) {
     String pre = "//lst[@name='"+f+"']";
     String notc = "id:[* TO *] -"+f+":C";
 
@@ -742,7 +740,268 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
             ,pre+"/int[1][@name='G'][.='5']"
             );
   }
-  
+
+
+
+  public void testFacetPrefixMultiValued() {
+    doFacetPrefix("t_s");
+  }
+
+  public void testFacetPrefixSingleValued() {
+    doFacetPrefix("t_s1");
+  }
+
+  public void doFacetPrefix(String f) {
+    String indent="on";
+    String pre = "//lst[@name='"+f+"']";
+    String notc = "id:[* TO *] -"+f+":C";
+
+    assertU(adoc("id", "1",  f, "AAA"));
+    assertU(adoc("id", "2",  f, "B"));
+    assertU(adoc("id", "3",  f, "BB"));
+    assertU(adoc("id", "4",  f, "BB"));
+    assertU(adoc("id", "5",  f, "BBB"));
+    assertU(adoc("id", "6",  f, "BBB"));
+    assertU(adoc("id", "7",  f, "BBB"));
+    assertU(adoc("id", "8",  f, "CC"));
+    assertU(adoc("id", "9",  f, "CC"));
+    assertU(adoc("id", "10", f, "CCC"));
+    assertU(adoc("id", "11", f, "CCC"));
+    assertU(adoc("id", "12", f, "CCC"));
+    assertU(commit());
+
+    assertQ("test facet.prefix middle, exact match first term",
+            req("q", "id:[* TO *]"
+                    ,"indent",indent
+                    ,"facet","true"
+                    ,"facet.field", f
+                    ,"facet.mincount","0"
+                    ,"facet.offset","0"
+                    ,"facet.limit","100"
+                    ,"facet.sort","true"
+                    ,"facet.prefix","B"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=3]"
+            ,pre+"/int[1][@name='BBB'][.='3']"
+            ,pre+"/int[2][@name='BB'][.='2']"
+            ,pre+"/int[3][@name='B'][.='1']"
+    );
+
+    assertQ("test facet.prefix middle, exact match first term, unsorted",
+            req("q", "id:[* TO *]"
+                    ,"indent",indent
+                    ,"facet","true"
+                    ,"facet.field", f
+                    ,"facet.mincount","0"
+                    ,"facet.offset","0"
+                    ,"facet.limit","100"
+                    ,"facet.sort","false"
+                    ,"facet.prefix","B"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=3]"
+            ,pre+"/int[1][@name='B'][.='1']"
+            ,pre+"/int[2][@name='BB'][.='2']"
+            ,pre+"/int[3][@name='BBB'][.='3']"
+    );
+
+
+     assertQ("test facet.prefix middle, exact match first term, unsorted",
+            req("q", "id:[* TO *]"
+                    ,"indent",indent
+                    ,"facet","true"
+                    ,"facet.field", f
+                    ,"facet.mincount","0"
+                    ,"facet.offset","0"
+                    ,"facet.limit","100"
+                    ,"facet.sort","false"
+                    ,"facet.prefix","B"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=3]"
+            ,pre+"/int[1][@name='B'][.='1']"
+            ,pre+"/int[2][@name='BB'][.='2']"
+            ,pre+"/int[3][@name='BBB'][.='3']"
+    );
+
+
+    assertQ("test facet.prefix middle, paging",
+            req("q", "id:[* TO *]"
+                    ,"indent",indent
+                    ,"facet","true"
+                    ,"facet.field", f
+                    ,"facet.mincount","0"
+                    ,"facet.offset","1"
+                    ,"facet.limit","100"
+                    ,"facet.sort","true"
+                    ,"facet.prefix","B"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=2]"
+            ,pre+"/int[1][@name='BB'][.='2']"
+            ,pre+"/int[2][@name='B'][.='1']"
+    );
+
+    assertQ("test facet.prefix middle, paging",
+            req("q", "id:[* TO *]"
+                    ,"indent",indent
+                    ,"facet","true"
+                    ,"facet.field", f
+                    ,"facet.mincount","0"
+                    ,"facet.offset","1"
+                    ,"facet.limit","1"
+                    ,"facet.sort","true"
+                    ,"facet.prefix","B"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=1]"
+            ,pre+"/int[1][@name='BB'][.='2']"
+    );
+
+    assertQ("test facet.prefix middle, paging",
+            req("q", "id:[* TO *]"
+                    ,"indent",indent
+                    ,"facet","true"
+                    ,"facet.field", f
+                    ,"facet.mincount","0"
+                    ,"facet.offset","1"
+                    ,"facet.limit","1"
+                    ,"facet.sort","true"
+                    ,"facet.prefix","B"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=1]"
+            ,pre+"/int[1][@name='BB'][.='2']"
+    );
+
+    assertQ("test facet.prefix end, not exact match",
+            req("q", "id:[* TO *]"
+                    ,"indent",indent
+                    ,"facet","true"
+                    ,"facet.field", f
+                    ,"facet.mincount","0"
+                    ,"facet.offset","0"
+                    ,"facet.limit","100"
+                    ,"facet.sort","true"
+                    ,"facet.prefix","C"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=2]"
+            ,pre+"/int[1][@name='CCC'][.='3']"
+            ,pre+"/int[2][@name='CC'][.='2']"
+    );
+
+    assertQ("test facet.prefix end, exact match",
+            req("q", "id:[* TO *]"
+                    ,"indent",indent
+                    ,"facet","true"
+                    ,"facet.field", f
+                    ,"facet.mincount","0"
+                    ,"facet.offset","0"
+                    ,"facet.limit","100"
+                    ,"facet.sort","true"
+                    ,"facet.prefix","CC"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=2]"
+            ,pre+"/int[1][@name='CCC'][.='3']"
+            ,pre+"/int[2][@name='CC'][.='2']"
+    );
+
+    assertQ("test facet.prefix past end",
+            req("q", "id:[* TO *]"
+                    ,"indent",indent
+                    ,"facet","true"
+                    ,"facet.field", f
+                    ,"facet.mincount","0"
+                    ,"facet.offset","0"
+                    ,"facet.limit","100"
+                    ,"facet.sort","true"
+                    ,"facet.prefix","X"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=0]"
+    );
+
+    assertQ("test facet.prefix past end",
+            req("q", "id:[* TO *]"
+                    ,"indent",indent
+                    ,"facet","true"
+                    ,"facet.field", f
+                    ,"facet.mincount","0"
+                    ,"facet.offset","1"
+                    ,"facet.limit","-1"
+                    ,"facet.sort","true"
+                    ,"facet.prefix","X"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=0]"
+    );
+
+    assertQ("test facet.prefix at start, exact match",
+            req("q", "id:[* TO *]"
+                    ,"indent",indent
+                    ,"facet","true"
+                    ,"facet.field", f
+                    ,"facet.mincount","0"
+                    ,"facet.offset","0"
+                    ,"facet.limit","100"
+                    ,"facet.sort","true"
+                    ,"facet.prefix","AAA"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=1]"
+            ,pre+"/int[1][@name='AAA'][.='1']"
+    );
+    assertQ("test facet.prefix at Start, not exact match",
+            req("q", "id:[* TO *]"
+                    ,"indent",indent
+                    ,"facet","true"
+                    ,"facet.field", f
+                    ,"facet.mincount","0"
+                    ,"facet.offset","0"
+                    ,"facet.limit","100"
+                    ,"facet.sort","true"
+                    ,"facet.prefix","AA"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=1]"
+            ,pre+"/int[1][@name='AAA'][.='1']"
+    );
+    assertQ("test facet.prefix at Start, not exact match",
+            req("q", "id:[* TO *]"
+                    ,"indent",indent
+                    ,"facet","true"
+                    ,"facet.field", f
+                    ,"facet.mincount","0"
+                    ,"facet.offset","0"
+                    ,"facet.limit","100"
+                    ,"facet.sort","true"
+                    ,"facet.prefix","AA"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=1]"
+            ,pre+"/int[1][@name='AAA'][.='1']"
+    );    
+    assertQ("test facet.prefix before start",
+            req("q", "id:[* TO *]"
+                    ,"indent",indent
+                    ,"facet","true"
+                    ,"facet.field", f
+                    ,"facet.mincount","0"
+                    ,"facet.offset","0"
+                    ,"facet.limit","100"
+                    ,"facet.sort","true"
+                    ,"facet.prefix","999"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=0]"
+    );
+
+    assertQ("test facet.prefix before start",
+            req("q", "id:[* TO *]"
+                    ,"indent",indent
+                    ,"facet","true"
+                    ,"facet.field", f
+                    ,"facet.mincount","0"
+                    ,"facet.offset","2"
+                    ,"facet.limit","100"
+                    ,"facet.sort","true"
+                    ,"facet.prefix","999"
+            )
+            ,"*[count(//lst[@name='facet_fields']/lst/int)=0]"
+    );
+
+  }
+
+
   private String mkstr(int len) {
     StringBuilder sb = new StringBuilder(len);
     for (int i = 0; i < len; i++) {