LUCENE-3878: fix CheckIndex.testTermVectors to use checkFields too; this found a bug in Lucene40's term vectors reader

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1301939 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2012-03-17 13:42:54 +00:00
parent 52b55a13c1
commit cd05c6f0c3
2 changed files with 99 additions and 161 deletions

Lucene40TermVectorsReader.java

@@ -406,9 +406,14 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
     @Override
     public SeekStatus seekCeil(BytesRef text, boolean useCache)
       throws IOException {
-      if (nextTerm != 0 && text.compareTo(term) < 0) {
-        nextTerm = 0;
-        tvf.seek(tvfFP);
+      if (nextTerm != 0) {
+        final int cmp = text.compareTo(term);
+        if (cmp < 0) {
+          nextTerm = 0;
+          tvf.seek(tvfFP);
+        } else if (cmp == 0) {
+          return SeekStatus.FOUND;
+        }
       }
       while (next() != null) {
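
A note on the term vectors reader bug this hunk fixes (an illustration, not part of the commit): the old seekCeil only rewound when the target sorted before the current term, so seeking to the exact term the enum was already positioned on fell through to next() and scanned past the match. A minimal sketch of the old and new behavior, using a simplified stand-in for the reader's enum (TERMS, nextTerm, and term mirror its state; the tvf file seek is omitted):

    /** Toy model of the term vectors terms enum; not Lucene's classes. */
    public class SeekCeilSketch {
      enum SeekStatus { FOUND, NOT_FOUND, END }

      static final String[] TERMS = {"apple", "banana", "cherry"}; // sorted

      int nextTerm = 0;   // index of the next term to read
      String term = null; // term the enum is currently positioned on

      String next() {
        if (nextTerm >= TERMS.length) {
          return null;
        }
        term = TERMS[nextTerm++];
        return term;
      }

      // Old logic: rewinds only when the target sorts before the current
      // term, then always scans forward with next(), so an exact match on
      // the current term is skipped.
      SeekStatus seekCeilOld(String text) {
        if (nextTerm != 0 && text.compareTo(term) < 0) {
          nextTerm = 0; // the real code also re-seeks the tvf file here
        }
        return scan(text);
      }

      // Fixed logic: answers FOUND immediately when already positioned on
      // the target instead of scanning past it.
      SeekStatus seekCeilNew(String text) {
        if (nextTerm != 0) {
          final int cmp = text.compareTo(term);
          if (cmp < 0) {
            nextTerm = 0;
          } else if (cmp == 0) {
            return SeekStatus.FOUND;
          }
        }
        return scan(text);
      }

      private SeekStatus scan(String text) {
        while (next() != null) {
          final int cmp = text.compareTo(term);
          if (cmp == 0) {
            return SeekStatus.FOUND;
          } else if (cmp < 0) {
            return SeekStatus.NOT_FOUND; // positioned on the ceiling term
          }
        }
        return SeekStatus.END;
      }

      public static void main(String[] args) {
        SeekCeilSketch oldEnum = new SeekCeilSketch();
        oldEnum.next(); // positioned on "apple"
        System.out.println(oldEnum.seekCeilOld("apple")); // NOT_FOUND: scanned past "apple"

        SeekCeilSketch newEnum = new SeekCeilSketch();
        newEnum.next(); // positioned on "apple"
        System.out.println(newEnum.seekCeilNew("apple")); // FOUND
      }
    }

This is exactly the kind of seek that checkFields performs, which is how routing testTermVectors through it surfaced the bug.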

CheckIndex.java

@ -605,7 +605,7 @@ public class CheckIndex {
segInfoStat.storedFieldStatus = testStoredFields(info, reader, nf);
// Test Term Vectors
segInfoStat.termVectorStatus = testTermVectors(info, reader, nf);
segInfoStat.termVectorStatus = testTermVectors(fieldInfos, info, reader, nf);
segInfoStat.docValuesStatus = testDocValues(info, reader);
@@ -867,6 +867,13 @@ public class CheckIndex {
             if (hasPositions) {
               for(int j=0;j<freq;j++) {
                 final int pos = postings.nextPosition();
+                // NOTE: pos=-1 is allowed because of ancient bug
+                // (LUCENE-1542) whereby IndexWriter could
+                // write pos=-1 when first token's posInc is 0
+                // (separately: analyzers should not give
+                // posInc=0 to first token); also, term
+                // vectors are allowed to return pos=-1 if
+                // they indexed offset but not positions:
                 if (pos < -1) {
                   throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
                 }
@@ -938,7 +945,14 @@ public class CheckIndex {
         int lastPosition = -1;
         for(int posUpto=0;posUpto<freq;posUpto++) {
           final int pos = postings.nextPosition();
-          if (pos < 0) {
+          // NOTE: pos=-1 is allowed because of ancient bug
+          // (LUCENE-1542) whereby IndexWriter could
+          // write pos=-1 when first token's posInc is 0
+          // (separately: analyzers should not give
+          // posInc=0 to first token); also, term
+          // vectors are allowed to return pos=-1 if
+          // they indexed offset but not positions:
+          if (pos < -1) {
             throw new RuntimeException("position " + pos + " is out of bounds");
           }
           if (pos < lastPosition) {
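
Background on the pos=-1 allowance in the NOTE above (an illustration, not from the patch): a token's position is the running sum of its position increments starting from -1, so a first token with a position increment of 0 lands on position -1, which IndexWriter could historically persist (LUCENE-1542). A small sketch of that arithmetic:

    // Positions are accumulated from position increments, starting at -1.
    public class PositionIncrementSketch {
      public static void main(String[] args) {
        int[] posIncs = {0, 1, 1}; // first increment should be >= 1; broken analyzers emit 0
        int pos = -1;              // position before any token
        for (int inc : posIncs) {
          pos += inc;
          System.out.println("token position = " + pos); // prints -1, 0, 1
        }
      }
    }

Hence the check tolerates pos == -1 but still rejects anything below it.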
@@ -1181,6 +1195,8 @@ public class CheckIndex {
       // Scan stored fields for all documents
       final Bits liveDocs = reader.getLiveDocs();
       for (int j = 0; j < info.docCount; ++j) {
+        // Intentionally pull even deleted documents to
+        // make sure they too are not corrupt:
         Document doc = reader.document(j);
         if (liveDocs == null || liveDocs.get(j)) {
           status.docCount++;
@@ -1327,19 +1343,16 @@ public class CheckIndex {
   /**
    * Test term vectors for a segment.
    */
-  private Status.TermVectorStatus testTermVectors(SegmentInfo info, SegmentReader reader, NumberFormat format) {
+  private Status.TermVectorStatus testTermVectors(FieldInfos fieldInfos, SegmentInfo info, SegmentReader reader, NumberFormat format) {
     final Status.TermVectorStatus status = new Status.TermVectorStatus();
-    // TODO: in theory we could test that term vectors have
-    // same terms/pos/offsets as the postings, but it'd be
-    // very slow...
+    final Bits onlyDocIsDeleted = new FixedBitSet(1);

     try {
       if (infoStream != null) {
         infoStream.print("    test: term vectors........");
       }
-      // TODO: maybe we can factor out testTermIndex and reuse here?

       DocsEnum docs = null;
       DocsAndPositionsEnum postings = null;
@@ -1361,69 +1374,53 @@ public class CheckIndex {
       TermsEnum postingsTermsEnum = null;
       for (int j = 0; j < info.docCount; ++j) {
-        if (liveDocs == null || liveDocs.get(j)) {
-          status.docCount++;
-          Fields tfv = reader.getTermVectors(j);
-          if (tfv != null) {
-            int tfvComputedFieldCount = 0;
-            long tfvComputedTermCount = 0;
+        // Intentionally pull/visit (but don't count in
+        // stats) deleted documents to make sure they too
+        // are not corrupt:
+        Fields tfv = reader.getTermVectors(j);
-            FieldsEnum fieldsEnum = tfv.iterator();
-            String field = null;
-            String lastField = null;
-            while((field = fieldsEnum.next()) != null) {
+        // TODO: can we make a IS(FIR) that searches just
+        // this term vector... to pass for searcher?
+        if (tfv != null) {
+          // First run with no deletions:
+          checkFields(tfv, null, 1, fieldInfos, null);
+          // Again, with the one doc deleted:
+          checkFields(tfv, onlyDocIsDeleted, 1, fieldInfos, null);
+          // Only agg stats if the doc is live:
+          final boolean doStats = liveDocs == null || liveDocs.get(j);
+          if (doStats) {
+            status.docCount++;
+          }
+          FieldsEnum fieldsEnum = tfv.iterator();
+          String field = null;
+          while((field = fieldsEnum.next()) != null) {
+            if (doStats) {
               status.totVectors++;
-              tfvComputedFieldCount++;
+            }
-              if (lastField == null) {
-                lastField = field;
-              } else if (lastField.compareTo(field) > 0) {
-                throw new RuntimeException("vector fields are out of order: lastField=" + lastField + " field=" + field + " doc=" + j);
-              }
             // Make sure FieldInfo thinks this field is vector'd:
             final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
             if (!fieldInfo.storeTermVector) {
               throw new RuntimeException("docID=" + j + " has term vectors for field=" + field + " but FieldInfo has storeTermVector=false");
             }
+            if (crossCheckTermVectors) {
              Terms terms = tfv.terms(field);
              termsEnum = terms.iterator(termsEnum);
-            if (crossCheckTermVectors) {
-              Terms postingsTerms = postingsFields.terms(field);
-              if (postingsTerms == null) {
-                throw new RuntimeException("vector field=" + field + " does not exist in postings; doc=" + j);
-              }
-              postingsTermsEnum = postingsTerms.iterator(postingsTermsEnum);
-            } else {
-              postingsTermsEnum = null;
-            }
+              Terms postingsTerms = postingsFields.terms(field);
+              if (postingsTerms == null) {
+                throw new RuntimeException("vector field=" + field + " does not exist in postings; doc=" + j);
+              }
+              postingsTermsEnum = postingsTerms.iterator(postingsTermsEnum);
-            long tfvComputedTermCountForField = 0;
-            long tfvComputedSumTotalTermFreq = 0;
-            BytesRef lastTerm = null;
-            Comparator<BytesRef> termComp = terms.getComparator();
              BytesRef term = null;
              while ((term = termsEnum.next()) != null) {
-              tfvComputedTermCountForField++;
-              // make sure terms arrive in order according to
-              // the comp
-              if (lastTerm == null) {
-                lastTerm = BytesRef.deepCopyOf(term);
-              } else {
-                if (termComp.compare(lastTerm, term) >= 0) {
-                  throw new RuntimeException("vector terms out of order for doc " + j + ": lastTerm=" + lastTerm + " term=" + term);
-                }
-                lastTerm.copyBytes(term);
-              }
-              if (termsEnum.docFreq() != 1) {
-                throw new RuntimeException("vector docFreq for doc " + j + ", field " + field + ", term" + term + " != 1");
-              }
                long totalTermFreq = termsEnum.totalTermFreq();
                if (totalTermFreq != -1 && totalTermFreq <= 0) {
                  throw new RuntimeException("totalTermFreq: " + totalTermFreq + " is out of bounds");
                }
                final boolean hasPositions;
                final boolean hasOffsets;
                final boolean hasFreqs;
@@ -1455,7 +1452,7 @@ public class CheckIndex {
                 }
               } else {
                 hasOffsets = true;
-                // NOTE: may be a lie... but we accept -1 below
+                // NOTE: may be a lie... but we accept -1
                 hasPositions = true;
                 hasFreqs = true;
               }
@@ -1471,24 +1468,20 @@ public class CheckIndex {
             final DocsEnum postingsDocs2;
             final boolean postingsHasFreq;
-            if (crossCheckTermVectors) {
-              if (!postingsTermsEnum.seekExact(term, true)) {
-                throw new RuntimeException("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j);
-              }
-              postingsPostings = postingsTermsEnum.docsAndPositions(null, postingsPostings, true);
-              if (postingsPostings == null) {
-                // Term vectors were indexed w/ offsets but postings were not
-                postingsPostings = postingsTermsEnum.docsAndPositions(null, postingsPostings, false);
-                if (postingsPostings == null) {
-                  postingsDocs = postingsTermsEnum.docs(null, postingsDocs, true);
-                  if (postingsDocs == null) {
-                    postingsHasFreq = false;
-                    postingsDocs = postingsTermsEnum.docs(null, postingsDocs, false);
-                    if (postingsDocs == null) {
-                      throw new RuntimeException("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j);
-                    }
-                  } else {
-                    postingsHasFreq = true;
-                  }
-                } else {
-                  postingsHasFreq = true;
-                }
+            if (!postingsTermsEnum.seekExact(term, true)) {
+              throw new RuntimeException("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j);
+            }
+            postingsPostings = postingsTermsEnum.docsAndPositions(null, postingsPostings, true);
+            if (postingsPostings == null) {
+              // Term vectors were indexed w/ offsets but postings were not
+              postingsPostings = postingsTermsEnum.docsAndPositions(null, postingsPostings, false);
+              if (postingsPostings == null) {
+                postingsDocs = postingsTermsEnum.docs(null, postingsDocs, true);
+                if (postingsDocs == null) {
+                  postingsHasFreq = false;
+                  postingsDocs = postingsTermsEnum.docs(null, postingsDocs, false);
+                  if (postingsDocs == null) {
+                    throw new RuntimeException("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j);
+                  }
+                } else {
+                  postingsHasFreq = true;
+                }
@@ -1496,20 +1489,19 @@ public class CheckIndex {
               } else {
                 postingsHasFreq = true;
               }
-              if (postingsPostings != null) {
-                postingsDocs2 = postingsPostings;
-              } else {
-                postingsDocs2 = postingsDocs;
-              }
-
-              final int advanceDoc = postingsDocs2.advance(j);
-              if (advanceDoc != j) {
-                throw new RuntimeException("vector term=" + term + " field=" + field + ": doc=" + j + " was not found in postings (got: " + advanceDoc + ")");
-              }
-            } else {
-              postingsDocs2 = null;
-              postingsHasFreq = false;
-            }
+            } else {
+              postingsHasFreq = true;
+            }
+            if (postingsPostings != null) {
+              postingsDocs2 = postingsPostings;
+            } else {
+              postingsDocs2 = postingsDocs;
+            }
+
+            final int advanceDoc = postingsDocs2.advance(j);
+            if (advanceDoc != j) {
+              throw new RuntimeException("vector term=" + term + " field=" + field + ": doc=" + j + " was not found in postings (got: " + advanceDoc + ")");
+            }

             final int doc = docs2.nextDoc();
@@ -1520,36 +1512,14 @@ public class CheckIndex {
             if (hasFreqs) {
               final int tf = docs2.freq();
-              if (tf <= 0) {
-                throw new RuntimeException("vector freq " + tf + " is out of bounds");
-              }
-
-              if (totalTermFreq != -1 && totalTermFreq != tf) {
-                throw new RuntimeException("vector totalTermFreq " + totalTermFreq + " != tf " + tf);
-              }
-
-              if (crossCheckTermVectors && postingsHasFreq) {
-                if (postingsDocs2.freq() != tf) {
-                  throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": freq=" + tf + " differs from postings freq=" + postingsDocs2.freq());
-                }
-              }
-
-              tfvComputedSumTotalTermFreq += tf;
-
+              if (postingsHasFreq && postingsDocs2.freq() != tf) {
+                throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": freq=" + tf + " differs from postings freq=" + postingsDocs2.freq());
+              }
               if (hasPositions || hasOffsets) {
                 int lastPosition = -1;
                 //int lastStartOffset = -1;
                 for (int i = 0; i < tf; i++) {
                   int pos = postings.nextPosition();
-                  if (hasPositions) {
-                    if (pos != -1 && pos < 0) {
-                      throw new RuntimeException("vector position " + pos + " is out of bounds");
-                    }
-                    if (pos < lastPosition) {
-                      throw new RuntimeException("vector position " + pos + " < lastPos " + lastPosition);
-                    }
-                    lastPosition = pos;
-                  }
-                  if (crossCheckTermVectors && postingsPostings != null) {
+                  if (postingsPostings != null) {
                     int postingsPos = postingsPostings.nextPosition();
                     if (pos != -1 && postingsPos != -1 && pos != postingsPos) {
                       throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": pos=" + pos + " differs from postings pos=" + postingsPos);
@@ -1563,16 +1533,16 @@ public class CheckIndex {
                   final int endOffset = postings.endOffset();
                   // TODO: these are too anal...?
                   /*
-                    if (endOffset < startOffset) {
-                      throw new RuntimeException("vector startOffset=" + startOffset + " is > endOffset=" + endOffset);
-                    }
-                    if (startOffset < lastStartOffset) {
-                      throw new RuntimeException("vector startOffset=" + startOffset + " is < prior startOffset=" + lastStartOffset);
-                    }
-                    lastStartOffset = startOffset;
+                  if (endOffset < startOffset) {
+                    throw new RuntimeException("vector startOffset=" + startOffset + " is > endOffset=" + endOffset);
+                  }
+                  if (startOffset < lastStartOffset) {
+                    throw new RuntimeException("vector startOffset=" + startOffset + " is < prior startOffset=" + lastStartOffset);
+                  }
+                  lastStartOffset = startOffset;
                   */
-                  if (crossCheckTermVectors && postingsPostings != null) {
+                  if (postingsPostings != null) {
                     final int postingsStartOffset = postingsPostings.startOffset();
                     final int postingsEndOffset = postingsPostings.endOffset();
@@ -1587,48 +1557,11 @@ public class CheckIndex {
                 }
               }
             }
             if (docs2.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
               throw new RuntimeException("vector for doc " + j + " references multiple documents!");
             }
           }
-
-              long uniqueTermCount = terms.getUniqueTermCount();
-              if (uniqueTermCount != -1 && uniqueTermCount != tfvComputedTermCountForField) {
-                throw new RuntimeException("vector term count for doc " + j + ", field " + field + " = " + uniqueTermCount + " != recomputed term count=" + tfvComputedTermCountForField);
-              }
-
-              int docCount = terms.getDocCount();
-              if (docCount != -1 && docCount != 1) {
-                throw new RuntimeException("vector doc count for doc " + j + ", field " + field + " = " + docCount + " != 1");
-              }
-
-              long sumDocFreq = terms.getSumDocFreq();
-              if (sumDocFreq != -1 && sumDocFreq != tfvComputedTermCountForField) {
-                throw new RuntimeException("vector postings count for doc " + j + ", field " + field + " = " + sumDocFreq + " != recomputed postings count=" + tfvComputedTermCountForField);
-              }
-
-              long sumTotalTermFreq = terms.getSumTotalTermFreq();
-              if (sumTotalTermFreq != -1 && sumTotalTermFreq != tfvComputedSumTotalTermFreq) {
-                throw new RuntimeException("vector sumTotalTermFreq for doc " + j + ", field " + field + " = " + sumTotalTermFreq + " != recomputed sumTotalTermFreq=" + tfvComputedSumTotalTermFreq);
-              }
-
-              tfvComputedTermCount += tfvComputedTermCountForField;
-            }
-
-            int tfvUniqueFieldCount = tfv.getUniqueFieldCount();
-            if (tfvUniqueFieldCount != -1 && tfvUniqueFieldCount != tfvComputedFieldCount) {
-              throw new RuntimeException("vector field count for doc " + j + "=" + tfvUniqueFieldCount + " != recomputed uniqueFieldCount=" + tfvComputedFieldCount);
-            }
-
-            long tfvUniqueTermCount = tfv.getUniqueTermCount();
-            if (tfvUniqueTermCount != -1 && tfvUniqueTermCount != tfvComputedTermCount) {
-              throw new RuntimeException("vector term count for doc " + j + "=" + tfvUniqueTermCount + " != recomputed uniqueTermCount=" + tfvComputedTermCount);
-            }
-          }
-        }
-      }
         }
       }

       msg("OK [" + status.totVectors + " total vector count; avg " +
           format.format((((float) status.totVectors) / status.docCount)) + " term/freq vector fields per doc]");
     } catch (Throwable e) {
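
The idea behind the rewrite, sketched (an editorial illustration, not part of the commit): a document's term vectors form a one-document inverted index, so the same checkFields validator used for postings can be run over them twice, once with no deletions and once with the single document marked deleted, which is what new FixedBitSet(1) provides (no bits set, so get(0) returns false and doc 0 reads as deleted). A simplified stand-in for the pattern, with a hypothetical validator in place of CheckIndex.checkFields:

    /** Stand-ins, not Lucene's classes. */
    public class OneDocCheckSketch {
      interface Bits {
        boolean get(int index);
      }

      // Models new FixedBitSet(1): one slot, no bits set, doc 0 "deleted".
      static final Bits ONLY_DOC_IS_DELETED = new Bits() {
        @Override public boolean get(int index) { return false; }
      };

      // Hypothetical validator in place of CheckIndex.checkFields: the
      // structural checks run identically; only liveness-dependent paths
      // (stats, live-doc iteration) differ between the two runs.
      static void checkOneDocIndex(String[] termVector, Bits liveDocs) {
        final boolean live = liveDocs == null || liveDocs.get(0);
        for (String term : termVector) {
          // ... per-term checks would go here ...
          System.out.println(term + " (doc 0 is " + (live ? "live" : "deleted") + ")");
        }
      }

      public static void main(String[] args) {
        String[] termVector = {"apple", "banana"};
        checkOneDocIndex(termVector, null);                // first run: no deletions
        checkOneDocIndex(termVector, ONLY_DOC_IS_DELETED); // again, with the one doc deleted
      }
    }

Running checkFields both ways gives the deleted-document code paths coverage they would otherwise only get on a real index with deletions.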