SOLR-8559: FCS facet performance optimization

Significantly speeds up processing when terms are high cardinality and the matching docset is small. When facet minCount > 0 and the number of matching documents is small (or 0) this enhancement prevents considering terms which have a 0 count. Also includes change to move to the next non-zero term value when selecting a segment position. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1725638 13f79535-47bb-0310-9956-ffa450edef68
2016-01-19 22:41:49 +00:00 · 2016-01-19 22:41:49 +00:00 · 2d2582e4f4
parent cecb9f4e25
commit 2d2582e4f4
2 changed files with 29 additions and 6 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -165,6 +165,12 @@ Optimizations
  speedups were up to 2.5x for production of filters, and up to 10x for query evaluation with
  embedded term range queres that resulted in filter cache hits.  (yonik)

+* SOLR-8559: FCS facet performance optimization which significantly speeds up processing when terms 
+  are high cardinality and the matching docset is small. When facet minCount > 0 and the number of
+  matching documents is small (or 0) this enhancement prevents considering terms which have a 0
+  count. Also includes change to move to the next non-zero term value when selecting a segment
+  position. (Keith Laban, Steve Bower, Dennis Gove)
+
 Other Changes
 ----------------------

--- a/solr/core/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java
+++ b/solr/core/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java
@ -163,7 +163,7 @@ class PerSegmentSingleValuedFaceting {
        } else {
          seg.pos = seg.startTermIndex;
        }
-        if (seg.pos < seg.endTermIndex) {
+        if (seg.pos < seg.endTermIndex && (mincount < 1 || seg.hasAnyCount)) {  
          seg.tenum = seg.si.termsEnum();
          seg.tenum.seekExact(seg.pos);
          seg.tempBR = seg.tenum.term();
@ -201,14 +201,22 @@ class PerSegmentSingleValuedFaceting {
          count += seg.counts[seg.pos - seg.startTermIndex];
        }

-        // TODO: OPTIMIZATION...
        // if mincount>0 then seg.pos++ can skip ahead to the next non-zero entry.
-        seg.pos++;
+        do{
+          ++seg.pos;
+        }
+        while(  
+            (seg.pos < seg.endTermIndex)  //stop incrementing before we run off the end
+         && (seg.tenum.next() != null || true) //move term enum forward with position -- dont care about value 
+         && (mincount > 0) //only skip ahead if mincount > 0
+         && (seg.counts[seg.pos - seg.startTermIndex] == 0) //check zero count
+        );
+        
        if (seg.pos >= seg.endTermIndex) {
          queue.pop();
          seg = queue.top();
        } else {
-          seg.tempBR = seg.tenum.next();
+          seg.tempBR = seg.tenum.term();
          seg = queue.updateTop();
        }
      } while (seg != null && val.get().compareTo(seg.tempBR) == 0);
@ -249,6 +257,10 @@ class PerSegmentSingleValuedFaceting {
    int endTermIndex;
    int[] counts;
    
+    //whether this segment has any non-zero term counts
+    //used to ignore insignificant segments when mincount>0
+    boolean hasAnyCount = false; 
+
    int pos; // only used when merging
    TermsEnum tenum; // only used when merging

@ -285,7 +297,9 @@ class PerSegmentSingleValuedFaceting {
        // specialized version when collecting counts for all terms
        int doc;
        while ((doc = iter.nextDoc()) < DocIdSetIterator.NO_MORE_DOCS) {
-          counts[1+si.getOrd(doc)]++;
+          int t = 1+si.getOrd(doc);
+          hasAnyCount = hasAnyCount || t > 0; //counts[0] == missing counts
+          counts[t]++;
        }
      } else {
        // version that adjusts term numbers because we aren't collecting the full range
@ -293,7 +307,10 @@ class PerSegmentSingleValuedFaceting {
        while ((doc = iter.nextDoc()) < DocIdSetIterator.NO_MORE_DOCS) {
          int term = si.getOrd(doc);
          int arrIdx = term-startTermIndex;
-          if (arrIdx>=0 && arrIdx<nTerms) counts[arrIdx]++;
+          if (arrIdx>=0 && arrIdx<nTerms){
+            counts[arrIdx]++;
+            hasAnyCount = true;
+          }
        }
      }
    }