mirror of https://github.com/apache/lucene.git
SOLR-8559: FCS facet performance optimization
Significantly speeds up processing when terms are high cardinality and the matching docset is small. When facet minCount > 0 and the number of matching documents is small (or 0) this enhancement prevents considering terms which have a 0 count. Also includes change to move to the next non-zero term value when selecting a segment position. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1725638 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
cecb9f4e25
commit
2d2582e4f4
|
@ -165,6 +165,12 @@ Optimizations
|
|||
speedups were up to 2.5x for production of filters, and up to 10x for query evaluation with
|
||||
embedded term range queres that resulted in filter cache hits. (yonik)
|
||||
|
||||
* SOLR-8559: FCS facet performance optimization which significantly speeds up processing when terms
|
||||
are high cardinality and the matching docset is small. When facet minCount > 0 and the number of
|
||||
matching documents is small (or 0) this enhancement prevents considering terms which have a 0
|
||||
count. Also includes change to move to the next non-zero term value when selecting a segment
|
||||
position. (Keith Laban, Steve Bower, Dennis Gove)
|
||||
|
||||
Other Changes
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -163,7 +163,7 @@ class PerSegmentSingleValuedFaceting {
|
|||
} else {
|
||||
seg.pos = seg.startTermIndex;
|
||||
}
|
||||
if (seg.pos < seg.endTermIndex) {
|
||||
if (seg.pos < seg.endTermIndex && (mincount < 1 || seg.hasAnyCount)) {
|
||||
seg.tenum = seg.si.termsEnum();
|
||||
seg.tenum.seekExact(seg.pos);
|
||||
seg.tempBR = seg.tenum.term();
|
||||
|
@ -201,14 +201,22 @@ class PerSegmentSingleValuedFaceting {
|
|||
count += seg.counts[seg.pos - seg.startTermIndex];
|
||||
}
|
||||
|
||||
// TODO: OPTIMIZATION...
|
||||
// if mincount>0 then seg.pos++ can skip ahead to the next non-zero entry.
|
||||
seg.pos++;
|
||||
do{
|
||||
++seg.pos;
|
||||
}
|
||||
while(
|
||||
(seg.pos < seg.endTermIndex) //stop incrementing before we run off the end
|
||||
&& (seg.tenum.next() != null || true) //move term enum forward with position -- dont care about value
|
||||
&& (mincount > 0) //only skip ahead if mincount > 0
|
||||
&& (seg.counts[seg.pos - seg.startTermIndex] == 0) //check zero count
|
||||
);
|
||||
|
||||
if (seg.pos >= seg.endTermIndex) {
|
||||
queue.pop();
|
||||
seg = queue.top();
|
||||
} else {
|
||||
seg.tempBR = seg.tenum.next();
|
||||
seg.tempBR = seg.tenum.term();
|
||||
seg = queue.updateTop();
|
||||
}
|
||||
} while (seg != null && val.get().compareTo(seg.tempBR) == 0);
|
||||
|
@ -248,6 +256,10 @@ class PerSegmentSingleValuedFaceting {
|
|||
int startTermIndex;
|
||||
int endTermIndex;
|
||||
int[] counts;
|
||||
|
||||
//whether this segment has any non-zero term counts
|
||||
//used to ignore insignificant segments when mincount>0
|
||||
boolean hasAnyCount = false;
|
||||
|
||||
int pos; // only used when merging
|
||||
TermsEnum tenum; // only used when merging
|
||||
|
@ -285,7 +297,9 @@ class PerSegmentSingleValuedFaceting {
|
|||
// specialized version when collecting counts for all terms
|
||||
int doc;
|
||||
while ((doc = iter.nextDoc()) < DocIdSetIterator.NO_MORE_DOCS) {
|
||||
counts[1+si.getOrd(doc)]++;
|
||||
int t = 1+si.getOrd(doc);
|
||||
hasAnyCount = hasAnyCount || t > 0; //counts[0] == missing counts
|
||||
counts[t]++;
|
||||
}
|
||||
} else {
|
||||
// version that adjusts term numbers because we aren't collecting the full range
|
||||
|
@ -293,7 +307,10 @@ class PerSegmentSingleValuedFaceting {
|
|||
while ((doc = iter.nextDoc()) < DocIdSetIterator.NO_MORE_DOCS) {
|
||||
int term = si.getOrd(doc);
|
||||
int arrIdx = term-startTermIndex;
|
||||
if (arrIdx>=0 && arrIdx<nTerms) counts[arrIdx]++;
|
||||
if (arrIdx>=0 && arrIdx<nTerms){
|
||||
counts[arrIdx]++;
|
||||
hasAnyCount = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue