LUCENE-10272: cross-check norms with postings in checkindex (#493)

Previously, CheckIndex iterated over a field's norms and validated each value it encountered, but if norms that should have been present were missing, nothing failed. Now it also computes the expected number of norms and verifies that it saw all of them.
This commit is contained in:
Robert Muir 2021-11-30 14:21:40 -05:00 committed by GitHub
parent 749b744c0c
commit 46a5a57724
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 30 additions and 6 deletions

View File

@@ -2115,6 +2115,8 @@ public final class CheckIndex implements Closeable {
if (fieldInfo.hasNorms() && isVectors == false) { if (fieldInfo.hasNorms() && isVectors == false) {
final NumericDocValues norms = normsProducer.getNorms(fieldInfo); final NumericDocValues norms = normsProducer.getNorms(fieldInfo);
// count of valid norm values found for the field
int actualCount = 0;
// Cross-check terms with norms // Cross-check terms with norms
for (int doc = norms.nextDoc(); for (int doc = norms.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS; doc != DocIdSetIterator.NO_MORE_DOCS;
@@ -2126,12 +2128,15 @@ public final class CheckIndex implements Closeable {
continue; continue;
} }
final long norm = norms.longValue(); final long norm = norms.longValue();
if (norm != 0 && visitedDocs.get(doc) == false) { if (norm != 0) {
throw new CheckIndexException( actualCount++;
"Document " if (visitedDocs.get(doc) == false) {
+ doc throw new CheckIndexException(
+ " doesn't have terms according to postings but has a norm value that is not zero: " "Document "
+ Long.toUnsignedString(norm)); + doc
+ " doesn't have terms according to postings but has a norm value that is not zero: "
+ Long.toUnsignedString(norm));
}
} else if (norm == 0 && visitedDocs.get(doc)) { } else if (norm == 0 && visitedDocs.get(doc)) {
throw new CheckIndexException( throw new CheckIndexException(
"Document " "Document "
@@ -2139,6 +2144,25 @@ public final class CheckIndex implements Closeable {
+ " has terms according to postings but its norm value is 0, which may only be used on documents that have no terms"); + " has terms according to postings but its norm value is 0, which may only be used on documents that have no terms");
} }
} }
int expectedCount = 0;
for (int doc = visitedDocs.nextSetBit(0);
doc != DocIdSetIterator.NO_MORE_DOCS;
doc =
doc + 1 >= visitedDocs.length()
? DocIdSetIterator.NO_MORE_DOCS
: visitedDocs.nextSetBit(doc + 1)) {
if (liveDocs != null && liveDocs.get(doc) == false) {
// Norms may only be out of sync with terms on deleted documents.
// This happens when a document fails indexing and in that case it
// should be immediately marked as deleted by the IndexWriter.
continue;
}
expectedCount++;
}
if (expectedCount != actualCount) {
throw new CheckIndexException(
"actual norm count: " + actualCount + " but expected: " + expectedCount);
}
} }
// Test seek to last term: // Test seek to last term: