From e00c4cede26690a82cf553a22b53a47c675cc01d Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Wed, 2 May 2018 11:55:48 +0200 Subject: [PATCH] LUCENE-8279: CheckIndex now cross-checks terms with norms. --- lucene/CHANGES.txt | 2 ++ .../org/apache/lucene/index/CheckIndex.java | 34 ++++++++++++++++--- .../lucene/index/TestFilterLeafReader.java | 22 ++++++++++++ 3 files changed, 53 insertions(+), 5 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a5d08168073..ed6cc26e256 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -80,6 +80,8 @@ Improvements * LUCENE-8135: Boolean queries now implement the block-max WAND algorithm in order to speed up selection of top scored documents. (Adrien Grand) +* LUCENE-8279: CheckIndex now cross-checks terms with norms. (Adrien Grand) + Optimizations * LUCENE-8040: Optimize IndexSearcher.collectionStatistics, avoiding MultiFields/MultiTerms diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index ca27066c959..0f7871acd23 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -739,7 +739,7 @@ public final class CheckIndex implements Closeable { // Test Fieldinfos segInfoStat.fieldInfoStatus = testFieldInfos(reader, infoStream, failFast); - + // Test Field Norms segInfoStat.fieldNormStatus = testFieldNorms(reader, infoStream, failFast); @@ -1209,7 +1209,8 @@ public final class CheckIndex implements Closeable { * checks Fields api is consistent with itself. * searcher is optional, to verify with queries. Can be null. */ - private static Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, boolean doPrint, boolean isVectors, PrintStream infoStream, boolean verbose, boolean doSlowChecks) throws IOException { + private static Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, + NormsProducer normsProducer, boolean doPrint, boolean isVectors, PrintStream infoStream, boolean verbose, boolean doSlowChecks) throws IOException { // TODO: we should probably return our own stats thing...?! long startNS; if (doPrint) { @@ -1754,7 +1755,26 @@ public final class CheckIndex implements Closeable { if (visitedDocs.cardinality() != v) { throw new RuntimeException("docCount for field " + field + "=" + v + " != recomputed docCount=" + visitedDocs.cardinality()); } - + + if (fieldInfo.hasNorms() && isVectors == false) { + final NumericDocValues norms = normsProducer.getNorms(fieldInfo); + // Cross-check terms with norms + for (int doc = norms.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = norms.nextDoc()) { + if (liveDocs != null && liveDocs.get(doc) == false) { + // Norms may only be out of sync with terms on deleted documents. + // This happens when a document fails indexing and in that case it + // should be immediately marked as deleted by the IndexWriter. + continue; + } + final long norm = norms.longValue(); + if (norm != 0 && visitedDocs.get(doc) == false) { + throw new RuntimeException("Document " + doc + " doesn't have terms according to postings but has a norm value that is not zero: " + norm); + } else if (norm == 0 && visitedDocs.get(doc)) { + throw new RuntimeException("Document " + doc + " has terms according to postings but its norm value is 0, which may only be used on documents that have no terms"); + } + } + } + // Test seek to last term: if (lastTerm != null) { if (termsEnum.seekCeil(lastTerm.get()) != TermsEnum.SeekStatus.FOUND) { @@ -1937,7 +1957,11 @@ public final class CheckIndex implements Closeable { final Fields fields = reader.getPostingsReader().getMergeInstance(); final FieldInfos fieldInfos = reader.getFieldInfos(); - status = checkFields(fields, reader.getLiveDocs(), maxDoc, fieldInfos, true, false, infoStream, verbose, doSlowChecks); + NormsProducer normsProducer = reader.getNormsReader(); + if (normsProducer != null) { + normsProducer = normsProducer.getMergeInstance(); + } + status = checkFields(fields, reader.getLiveDocs(), maxDoc, fieldInfos, normsProducer, true, false, infoStream, verbose, doSlowChecks); } catch (Throwable e) { if (failFast) { throw IOUtils.rethrowAlways(e); @@ -2594,7 +2618,7 @@ public final class CheckIndex implements Closeable { if (tfv != null) { // First run with no deletions: - checkFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose, doSlowChecks); + checkFields(tfv, null, 1, fieldInfos, null, false, true, infoStream, verbose, doSlowChecks); // Only agg stats if the doc is live: final boolean doStats = liveDocs == null || liveDocs.get(j); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestFilterLeafReader.java b/lucene/core/src/test/org/apache/lucene/index/TestFilterLeafReader.java index de06f900083..f89ef9cd079 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestFilterLeafReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestFilterLeafReader.java @@ -29,6 +29,7 @@ import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.BaseDirectoryWrapper; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.LuceneTestCase; public class TestFilterLeafReader extends LuceneTestCase { @@ -96,6 +97,27 @@ public class TestFilterLeafReader extends LuceneTestCase { return terms==null ? null : new TestTerms(terms); } + @Override + public NumericDocValues getNormValues(String field) throws IOException { + NumericDocValues ndv = super.getNormValues(field); + if (ndv == null) { + return null; + } + FixedBitSet docsWithTerms = new FixedBitSet(maxDoc()); + TermsEnum termsEnum = terms(field).iterator(); + PostingsEnum postings = null; + while (termsEnum.next() != null) { + postings = termsEnum.postings(postings, PostingsEnum.NONE); + docsWithTerms.or(postings); + } + return new FilterNumericDocValues(ndv) { + @Override + public long longValue() throws IOException { + return docsWithTerms.get(docID()) ? super.longValue() : 0L; + } + }; + } + @Override public CacheHelper getCoreCacheHelper() { return null;