From 46a5a57724519f349728a7aa613d0b1fe77a8c14 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 30 Nov 2021 14:21:40 -0500 Subject: [PATCH] LUCENE-10272: cross-check norms with postings in checkindex (#493) Previously, CheckIndex would iterate norms and validate each one. But if norms that should be there were missing, nothing would fail. Now it computes an expected count of norms and ensures it saw them all. --- .../org/apache/lucene/index/CheckIndex.java | 36 +++++++++++++++---- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 285bed28a6a..64bc28c7928 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -2115,6 +2115,8 @@ public final class CheckIndex implements Closeable { if (fieldInfo.hasNorms() && isVectors == false) { final NumericDocValues norms = normsProducer.getNorms(fieldInfo); + // count of valid norm values found for the field + int actualCount = 0; // Cross-check terms with norms for (int doc = norms.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; @@ -2126,12 +2128,15 @@ public final class CheckIndex implements Closeable { continue; } final long norm = norms.longValue(); - if (norm != 0 && visitedDocs.get(doc) == false) { - throw new CheckIndexException( - "Document " - + doc - + " doesn't have terms according to postings but has a norm value that is not zero: " - + Long.toUnsignedString(norm)); + if (norm != 0) { + actualCount++; + if (visitedDocs.get(doc) == false) { + throw new CheckIndexException( + "Document " + + doc + + " doesn't have terms according to postings but has a norm value that is not zero: " + + Long.toUnsignedString(norm)); + } } else if (norm == 0 && visitedDocs.get(doc)) { throw new CheckIndexException( "Document " @@ -2139,6 +2144,25 @@ public final class CheckIndex implements Closeable { + " has terms according to postings but its norm value is 0, which may only be used on documents that have no terms"); } } + int expectedCount = 0; + for (int doc = visitedDocs.nextSetBit(0); + doc != DocIdSetIterator.NO_MORE_DOCS; + doc = + doc + 1 >= visitedDocs.length() + ? DocIdSetIterator.NO_MORE_DOCS + : visitedDocs.nextSetBit(doc + 1)) { + if (liveDocs != null && liveDocs.get(doc) == false) { + // Norms may only be out of sync with terms on deleted documents. + // This happens when a document fails indexing and in that case it + // should be immediately marked as deleted by the IndexWriter. + continue; + } + expectedCount++; + } + if (expectedCount != actualCount) { + throw new CheckIndexException( + "actual norm count: " + actualCount + " but expected: " + expectedCount); + } } // Test seek to last term: