LUCENE-4485: CheckIndex's terms, terms/docs pairs counts don't include deleted docs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1399028 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-10-16 22:41:51 +00:00
parent a839ab7fb0
commit cd6c2fe0a2
3 changed files with 21 additions and 11 deletions

View File

@ -57,6 +57,9 @@ Bug Fixes
* LUCENE-4468: Fix rareish integer overflows in Block and Lucene40 postings
formats (Robert Muir)
* LUCENE-4485: When CheckIndex counts terms, terms/docs pairs and tokens,
these counts now all exclude deleted documents. (Mike McCandless)
Optimizations
* LUCENE-4443: BlockPostingsFormat no longer writes unnecessary offsets

View File

@ -233,9 +233,12 @@ public class CheckIndex {
TermIndexStatus() {
}
/** Total term count */
/** Number of terms with at least one live doc. */
public long termCount = 0L;
/** Number of terms with zero live docs. */
public long delTermCount = 0L;
/** Total frequency across all terms. */
public long totFreq = 0L;
@ -750,7 +753,7 @@ public class CheckIndex {
final TermsEnum termsEnum = terms.iterator(null);
boolean hasOrd = true;
final long termCountStart = status.termCount;
final long termCountStart = status.delTermCount + status.termCount;
BytesRef lastTerm = null;
@ -781,7 +784,6 @@ public class CheckIndex {
if (docFreq <= 0) {
throw new RuntimeException("docfreq: " + docFreq + " is out of bounds");
}
status.totFreq += docFreq;
sumDocFreq += docFreq;
docs = termsEnum.docs(liveDocs, docs);
@ -796,15 +798,13 @@ public class CheckIndex {
}
if (hasOrd) {
final long ordExpected = status.termCount - termCountStart;
final long ordExpected = status.delTermCount + status.termCount - termCountStart;
if (ord != ordExpected) {
throw new RuntimeException("ord mismatch: TermsEnum has ord=" + ord + " vs actual=" + ordExpected);
}
}
}
status.termCount++;
final DocsEnum docs2;
if (postings != null) {
docs2 = postings;
@ -820,6 +820,7 @@ public class CheckIndex {
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
status.totFreq++;
visitedDocs.set(doc);
int freq = -1;
if (hasFreqs) {
@ -883,6 +884,12 @@ public class CheckIndex {
}
}
if (docCount != 0) {
status.termCount++;
} else {
status.delTermCount++;
}
final long totalTermFreq2 = termsEnum.totalTermFreq();
final boolean hasTotalTermFreq = hasFreqs && totalTermFreq2 != -1;
@ -1063,11 +1070,11 @@ public class CheckIndex {
// check unique term count
long termCount = -1;
if (status.termCount-termCountStart > 0) {
if ((status.delTermCount+status.termCount)-termCountStart > 0) {
termCount = fields.terms(field).size();
if (termCount != -1 && termCount != status.termCount - termCountStart) {
throw new RuntimeException("termCount mismatch " + termCount + " vs " + (status.termCount - termCountStart));
if (termCount != -1 && termCount != status.delTermCount + status.termCount - termCountStart) {
throw new RuntimeException("termCount mismatch " + (status.delTermCount + termCount) + " vs " + (status.termCount - termCountStart));
}
}

View File

@ -75,8 +75,8 @@ public class TestCheckIndex extends LuceneTestCase {
assertNotNull(seg.termIndexStatus);
assertNull(seg.termIndexStatus.error);
assertEquals(19, seg.termIndexStatus.termCount);
assertEquals(19, seg.termIndexStatus.totFreq);
assertEquals(18, seg.termIndexStatus.termCount);
assertEquals(18, seg.termIndexStatus.totFreq);
assertEquals(18, seg.termIndexStatus.totPos);
assertNotNull(seg.storedFieldStatus);