Don't report terms as live if all its docs are filtered out

FilterableTermsEnum allows filtering stats by supplying per-segment
bits. Today, if all docs are filtered out, the term is still reported as
live, but it shouldn't be.

Relates to #6211
This commit is contained in:
Simon Willnauer 2014-05-18 22:20:01 +02:00
parent c593234b7c
commit 72da764261
2 changed files with 35 additions and 37 deletions

View File

@ -124,48 +124,50 @@ public class FilterableTermsEnum extends TermsEnum {
@Override
public boolean seekExact(BytesRef text) throws IOException {
boolean found = false;
currentDocFreq = NOT_FOUND;
currentTotalTermFreq = NOT_FOUND;
int docFreq = 0;
long totalTermFreq = 0;
for (Holder anEnum : enums) {
if (!anEnum.termsEnum.seekExact(text)) {
continue;
}
found = true;
if (anEnum.bits == null) {
docFreq += anEnum.termsEnum.docFreq();
if (docsEnumFlag == DocsEnum.FLAG_FREQS) {
long leafTotalTermFreq = anEnum.termsEnum.totalTermFreq();
if (totalTermFreq == -1 || leafTotalTermFreq == -1) {
totalTermFreq = -1;
continue;
}
totalTermFreq += leafTotalTermFreq;
}
} else {
DocsEnum docsEnum = anEnum.docsEnum = anEnum.termsEnum.docs(anEnum.bits, anEnum.docsEnum, docsEnumFlag);
// 2 choices for performing same heavy loop - one attempts to calculate totalTermFreq and other does not
if (docsEnumFlag == DocsEnum.FLAG_FREQS) {
for (int docId = docsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
docFreq++;
// docsEnum.freq() returns 1 if doc indexed with IndexOptions.DOCS_ONLY so no way of knowing if value
// is really 1 or unrecorded when filtering like this
totalTermFreq += docsEnum.freq();
if (anEnum.termsEnum.seekExact(text)) {
if (anEnum.bits == null) {
docFreq += anEnum.termsEnum.docFreq();
if (docsEnumFlag == DocsEnum.FLAG_FREQS) {
long leafTotalTermFreq = anEnum.termsEnum.totalTermFreq();
if (totalTermFreq == -1 || leafTotalTermFreq == -1) {
totalTermFreq = -1;
continue;
}
totalTermFreq += leafTotalTermFreq;
}
} else {
for (int docId = docsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
// docsEnum.freq() behaviour is undefined if docsEnumFlag==DocsEnum.FLAG_NONE so don't bother with call
docFreq++;
final DocsEnum docsEnum = anEnum.docsEnum = anEnum.termsEnum.docs(anEnum.bits, anEnum.docsEnum, docsEnumFlag);
// 2 choices for performing same heavy loop - one attempts to calculate totalTermFreq and other does not
if (docsEnumFlag == DocsEnum.FLAG_FREQS) {
for (int docId = docsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
docFreq++;
// docsEnum.freq() returns 1 if doc indexed with IndexOptions.DOCS_ONLY so no way of knowing if value
// is really 1 or unrecorded when filtering like this
totalTermFreq += docsEnum.freq();
}
} else {
for (int docId = docsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
// docsEnum.freq() behaviour is undefined if docsEnumFlag==DocsEnum.FLAG_NONE so don't bother with call
docFreq++;
}
}
}
}
}
if (docFreq > 0) {
currentDocFreq = docFreq;
currentTotalTermFreq = totalTermFreq;
current = text;
return true;
} else {
currentDocFreq = NOT_FOUND;
currentTotalTermFreq = NOT_FOUND;
current = null;
return false;
}
return found;
}
@Override

View File

@ -76,22 +76,18 @@ public class FreqTermsEnum extends FilterableTermsEnum implements Releasable {
boolean found = true;
if (needDocFreqs) {
currentDocFreq = termDocFreqs.get(currentTermOrd);
if (currentDocFreq == NOT_FOUND) {
found = false;
}
found = currentDocFreq != NOT_FOUND;
}
if (needTotalTermFreqs) {
currentTotalTermFreq = termsTotalFreqs.get(currentTermOrd);
if (currentTotalTermFreq == NOT_FOUND) {
found = false;
}
found = currentTotalTermFreq != NOT_FOUND;
}
current = found ? text : null;
return found;
}
//Cache miss - gather stats
boolean found = super.seekExact(text);
final boolean found = super.seekExact(text);
//Cache the result - found or not.
if (needDocFreqs) {