diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 191e0c18131..69f764bb4e4 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -165,6 +165,12 @@ Optimizations speedups were up to 2.5x for production of filters, and up to 10x for query evaluation with embedded term range queres that resulted in filter cache hits. (yonik) +* SOLR-8559: FCS facet performance optimization which significantly speeds up processing when terms + are high cardinality and the matching docset is small. When facet minCount > 0 and the number of + matching documents is small (or 0) this enhancement prevents considering terms which have a 0 + count. Also includes change to move to the next non-zero term value when selecting a segment + position. (Keith Laban, Steve Bower, Dennis Gove) + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java b/solr/core/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java index 54513ed5f1a..906fe9a66c1 100644 --- a/solr/core/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java +++ b/solr/core/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java @@ -163,7 +163,7 @@ class PerSegmentSingleValuedFaceting { } else { seg.pos = seg.startTermIndex; } - if (seg.pos < seg.endTermIndex) { + if (seg.pos < seg.endTermIndex && (mincount < 1 || seg.hasAnyCount)) { seg.tenum = seg.si.termsEnum(); seg.tenum.seekExact(seg.pos); seg.tempBR = seg.tenum.term(); @@ -201,14 +201,22 @@ class PerSegmentSingleValuedFaceting { count += seg.counts[seg.pos - seg.startTermIndex]; } - // TODO: OPTIMIZATION... // if mincount>0 then seg.pos++ can skip ahead to the next non-zero entry. - seg.pos++; + do{ + ++seg.pos; + } + while( + (seg.pos < seg.endTermIndex) //stop incrementing before we run off the end + && (seg.tenum.next() != null || true) //move term enum forward with position -- dont care about value + && (mincount > 0) //only skip ahead if mincount > 0 + && (seg.counts[seg.pos - seg.startTermIndex] == 0) //check zero count + ); + if (seg.pos >= seg.endTermIndex) { queue.pop(); seg = queue.top(); } else { - seg.tempBR = seg.tenum.next(); + seg.tempBR = seg.tenum.term(); seg = queue.updateTop(); } } while (seg != null && val.get().compareTo(seg.tempBR) == 0); @@ -248,6 +256,10 @@ class PerSegmentSingleValuedFaceting { int startTermIndex; int endTermIndex; int[] counts; + + //whether this segment has any non-zero term counts + //used to ignore insignificant segments when mincount>0 + boolean hasAnyCount = false; int pos; // only used when merging TermsEnum tenum; // only used when merging @@ -285,7 +297,9 @@ class PerSegmentSingleValuedFaceting { // specialized version when collecting counts for all terms int doc; while ((doc = iter.nextDoc()) < DocIdSetIterator.NO_MORE_DOCS) { - counts[1+si.getOrd(doc)]++; + int t = 1+si.getOrd(doc); + hasAnyCount = hasAnyCount || t > 0; //counts[0] == missing counts + counts[t]++; } } else { // version that adjusts term numbers because we aren't collecting the full range @@ -293,7 +307,10 @@ class PerSegmentSingleValuedFaceting { while ((doc = iter.nextDoc()) < DocIdSetIterator.NO_MORE_DOCS) { int term = si.getOrd(doc); int arrIdx = term-startTermIndex; - if (arrIdx>=0 && arrIdx=0 && arrIdx