LUCENE-8279: CheckIndex now cross-checks terms with norms.

This commit is contained in:
Adrien Grand 2018-05-02 11:55:48 +02:00
parent af680af77f
commit e00c4cede2
3 changed files with 53 additions and 5 deletions

View File

@ -80,6 +80,8 @@ Improvements
* LUCENE-8135: Boolean queries now implement the block-max WAND algorithm in
order to speed up selection of top scored documents. (Adrien Grand)
* LUCENE-8279: CheckIndex now cross-checks terms with norms. (Adrien Grand)
Optimizations
* LUCENE-8040: Optimize IndexSearcher.collectionStatistics, avoiding MultiFields/MultiTerms

View File

@ -739,7 +739,7 @@ public final class CheckIndex implements Closeable {
// Test Fieldinfos // Test Fieldinfos
segInfoStat.fieldInfoStatus = testFieldInfos(reader, infoStream, failFast); segInfoStat.fieldInfoStatus = testFieldInfos(reader, infoStream, failFast);
// Test Field Norms // Test Field Norms
segInfoStat.fieldNormStatus = testFieldNorms(reader, infoStream, failFast); segInfoStat.fieldNormStatus = testFieldNorms(reader, infoStream, failFast);
@ -1209,7 +1209,8 @@ public final class CheckIndex implements Closeable {
* checks Fields api is consistent with itself. * checks Fields api is consistent with itself.
* searcher is optional, to verify with queries. Can be null. * searcher is optional, to verify with queries. Can be null.
*/ */
private static Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, boolean doPrint, boolean isVectors, PrintStream infoStream, boolean verbose, boolean doSlowChecks) throws IOException { private static Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos,
NormsProducer normsProducer, boolean doPrint, boolean isVectors, PrintStream infoStream, boolean verbose, boolean doSlowChecks) throws IOException {
// TODO: we should probably return our own stats thing...?! // TODO: we should probably return our own stats thing...?!
long startNS; long startNS;
if (doPrint) { if (doPrint) {
@ -1754,7 +1755,26 @@ public final class CheckIndex implements Closeable {
if (visitedDocs.cardinality() != v) { if (visitedDocs.cardinality() != v) {
throw new RuntimeException("docCount for field " + field + "=" + v + " != recomputed docCount=" + visitedDocs.cardinality()); throw new RuntimeException("docCount for field " + field + "=" + v + " != recomputed docCount=" + visitedDocs.cardinality());
} }
if (fieldInfo.hasNorms() && isVectors == false) {
final NumericDocValues norms = normsProducer.getNorms(fieldInfo);
// Cross-check terms with norms
for (int doc = norms.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = norms.nextDoc()) {
if (liveDocs != null && liveDocs.get(doc) == false) {
// Norms may only be out of sync with terms on deleted documents.
// This happens when a document fails indexing and in that case it
// should be immediately marked as deleted by the IndexWriter.
continue;
}
final long norm = norms.longValue();
if (norm != 0 && visitedDocs.get(doc) == false) {
throw new RuntimeException("Document " + doc + " doesn't have terms according to postings but has a norm value that is not zero: " + norm);
} else if (norm == 0 && visitedDocs.get(doc)) {
throw new RuntimeException("Document " + doc + " has terms according to postings but its norm value is 0, which may only be used on documents that have no terms");
}
}
}
// Test seek to last term: // Test seek to last term:
if (lastTerm != null) { if (lastTerm != null) {
if (termsEnum.seekCeil(lastTerm.get()) != TermsEnum.SeekStatus.FOUND) { if (termsEnum.seekCeil(lastTerm.get()) != TermsEnum.SeekStatus.FOUND) {
@ -1937,7 +1957,11 @@ public final class CheckIndex implements Closeable {
final Fields fields = reader.getPostingsReader().getMergeInstance(); final Fields fields = reader.getPostingsReader().getMergeInstance();
final FieldInfos fieldInfos = reader.getFieldInfos(); final FieldInfos fieldInfos = reader.getFieldInfos();
status = checkFields(fields, reader.getLiveDocs(), maxDoc, fieldInfos, true, false, infoStream, verbose, doSlowChecks); NormsProducer normsProducer = reader.getNormsReader();
if (normsProducer != null) {
normsProducer = normsProducer.getMergeInstance();
}
status = checkFields(fields, reader.getLiveDocs(), maxDoc, fieldInfos, normsProducer, true, false, infoStream, verbose, doSlowChecks);
} catch (Throwable e) { } catch (Throwable e) {
if (failFast) { if (failFast) {
throw IOUtils.rethrowAlways(e); throw IOUtils.rethrowAlways(e);
@ -2594,7 +2618,7 @@ public final class CheckIndex implements Closeable {
if (tfv != null) { if (tfv != null) {
// First run with no deletions: // First run with no deletions:
checkFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose, doSlowChecks); checkFields(tfv, null, 1, fieldInfos, null, false, true, infoStream, verbose, doSlowChecks);
// Only agg stats if the doc is live: // Only agg stats if the doc is live:
final boolean doStats = liveDocs == null || liveDocs.get(j); final boolean doStats = liveDocs == null || liveDocs.get(j);

View File

@ -29,6 +29,7 @@ import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.BaseDirectoryWrapper; import org.apache.lucene.store.BaseDirectoryWrapper;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
public class TestFilterLeafReader extends LuceneTestCase { public class TestFilterLeafReader extends LuceneTestCase {
@ -96,6 +97,27 @@ public class TestFilterLeafReader extends LuceneTestCase {
return terms==null ? null : new TestTerms(terms); return terms==null ? null : new TestTerms(terms);
} }
/**
 * Returns the norms of {@code field} from the wrapped reader, reporting
 * {@code 0} for every document that has no postings in the terms exposed by
 * this reader's {@code terms(field)}.
 *
 * @param field the field whose norms are requested
 * @return {@code null} when the wrapped reader has no norms for the field,
 *         otherwise a view over those norms whose value is forced to
 *         {@code 0} on documents without postings
 * @throws IOException if reading terms, postings or norms fails
 */
@Override
public NumericDocValues getNormValues(String field) throws IOException {
  final NumericDocValues ndv = super.getNormValues(field);
  if (ndv == null) {
    return null;
  }
  // Collect every document that appears in at least one postings list of the
  // terms this reader exposes for the field.
  final FixedBitSet docsWithTerms = new FixedBitSet(maxDoc());
  final Terms fieldTerms = terms(field);
  // terms(field) may be null; in that case no document has terms, so the bit
  // set stays empty and every norm below is reported as 0 instead of this
  // method failing with a NullPointerException.
  if (fieldTerms != null) {
    final TermsEnum termsEnum = fieldTerms.iterator();
    PostingsEnum postings = null;
    while (termsEnum.next() != null) {
      // Only doc IDs are needed, so request postings without extra data.
      postings = termsEnum.postings(postings, PostingsEnum.NONE);
      docsWithTerms.or(postings);
    }
  }
  return new FilterNumericDocValues(ndv) {
    @Override
    public long longValue() throws IOException {
      // Keep the real norm only for documents that have postings.
      return docsWithTerms.get(docID()) ? super.longValue() : 0L;
    }
  };
}
@Override @Override
public CacheHelper getCoreCacheHelper() { public CacheHelper getCoreCacheHelper() {
return null; return null;