mirror of https://github.com/apache/lucene.git
LUCENE-8279: CheckIndex now cross-checks terms with norms.
This commit is contained in:
parent
af680af77f
commit
e00c4cede2
|
@ -80,6 +80,8 @@ Improvements
|
||||||
* LUCENE-8135: Boolean queries now implement the block-max WAND algorithm in
|
* LUCENE-8135: Boolean queries now implement the block-max WAND algorithm in
|
||||||
order to speed up selection of top scored documents. (Adrien Grand)
|
order to speed up selection of top scored documents. (Adrien Grand)
|
||||||
|
|
||||||
|
* LUCENE-8279: CheckIndex now cross-checks terms with norms. (Adrien Grand)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-8040: Optimize IndexSearcher.collectionStatistics, avoiding MultiFields/MultiTerms
|
* LUCENE-8040: Optimize IndexSearcher.collectionStatistics, avoiding MultiFields/MultiTerms
|
||||||
|
|
|
@ -1209,7 +1209,8 @@ public final class CheckIndex implements Closeable {
|
||||||
* checks Fields api is consistent with itself.
|
* checks Fields api is consistent with itself.
|
||||||
* searcher is optional, to verify with queries. Can be null.
|
* searcher is optional, to verify with queries. Can be null.
|
||||||
*/
|
*/
|
||||||
private static Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, boolean doPrint, boolean isVectors, PrintStream infoStream, boolean verbose, boolean doSlowChecks) throws IOException {
|
private static Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos,
|
||||||
|
NormsProducer normsProducer, boolean doPrint, boolean isVectors, PrintStream infoStream, boolean verbose, boolean doSlowChecks) throws IOException {
|
||||||
// TODO: we should probably return our own stats thing...?!
|
// TODO: we should probably return our own stats thing...?!
|
||||||
long startNS;
|
long startNS;
|
||||||
if (doPrint) {
|
if (doPrint) {
|
||||||
|
@ -1755,6 +1756,25 @@ public final class CheckIndex implements Closeable {
|
||||||
throw new RuntimeException("docCount for field " + field + "=" + v + " != recomputed docCount=" + visitedDocs.cardinality());
|
throw new RuntimeException("docCount for field " + field + "=" + v + " != recomputed docCount=" + visitedDocs.cardinality());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (fieldInfo.hasNorms() && isVectors == false) {
|
||||||
|
final NumericDocValues norms = normsProducer.getNorms(fieldInfo);
|
||||||
|
// Cross-check terms with norms
|
||||||
|
for (int doc = norms.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = norms.nextDoc()) {
|
||||||
|
if (liveDocs != null && liveDocs.get(doc) == false) {
|
||||||
|
// Norms may only be out of sync with terms on deleted documents.
|
||||||
|
// This happens when a document fails indexing and in that case it
|
||||||
|
// should be immediately marked as deleted by the IndexWriter.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
final long norm = norms.longValue();
|
||||||
|
if (norm != 0 && visitedDocs.get(doc) == false) {
|
||||||
|
throw new RuntimeException("Document " + doc + " doesn't have terms according to postings but has a norm value that is not zero: " + norm);
|
||||||
|
} else if (norm == 0 && visitedDocs.get(doc)) {
|
||||||
|
throw new RuntimeException("Document " + doc + " has terms according to postings but its norm value is 0, which may only be used on documents that have no terms");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Test seek to last term:
|
// Test seek to last term:
|
||||||
if (lastTerm != null) {
|
if (lastTerm != null) {
|
||||||
if (termsEnum.seekCeil(lastTerm.get()) != TermsEnum.SeekStatus.FOUND) {
|
if (termsEnum.seekCeil(lastTerm.get()) != TermsEnum.SeekStatus.FOUND) {
|
||||||
|
@ -1937,7 +1957,11 @@ public final class CheckIndex implements Closeable {
|
||||||
|
|
||||||
final Fields fields = reader.getPostingsReader().getMergeInstance();
|
final Fields fields = reader.getPostingsReader().getMergeInstance();
|
||||||
final FieldInfos fieldInfos = reader.getFieldInfos();
|
final FieldInfos fieldInfos = reader.getFieldInfos();
|
||||||
status = checkFields(fields, reader.getLiveDocs(), maxDoc, fieldInfos, true, false, infoStream, verbose, doSlowChecks);
|
NormsProducer normsProducer = reader.getNormsReader();
|
||||||
|
if (normsProducer != null) {
|
||||||
|
normsProducer = normsProducer.getMergeInstance();
|
||||||
|
}
|
||||||
|
status = checkFields(fields, reader.getLiveDocs(), maxDoc, fieldInfos, normsProducer, true, false, infoStream, verbose, doSlowChecks);
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
if (failFast) {
|
if (failFast) {
|
||||||
throw IOUtils.rethrowAlways(e);
|
throw IOUtils.rethrowAlways(e);
|
||||||
|
@ -2594,7 +2618,7 @@ public final class CheckIndex implements Closeable {
|
||||||
|
|
||||||
if (tfv != null) {
|
if (tfv != null) {
|
||||||
// First run with no deletions:
|
// First run with no deletions:
|
||||||
checkFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose, doSlowChecks);
|
checkFields(tfv, null, 1, fieldInfos, null, false, true, infoStream, verbose, doSlowChecks);
|
||||||
|
|
||||||
// Only agg stats if the doc is live:
|
// Only agg stats if the doc is live:
|
||||||
final boolean doStats = liveDocs == null || liveDocs.get(j);
|
final boolean doStats = liveDocs == null || liveDocs.get(j);
|
||||||
|
|
|
@ -29,6 +29,7 @@ import org.apache.lucene.search.DocIdSetIterator;
|
||||||
import org.apache.lucene.store.BaseDirectoryWrapper;
|
import org.apache.lucene.store.BaseDirectoryWrapper;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.FixedBitSet;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
public class TestFilterLeafReader extends LuceneTestCase {
|
public class TestFilterLeafReader extends LuceneTestCase {
|
||||||
|
@ -96,6 +97,27 @@ public class TestFilterLeafReader extends LuceneTestCase {
|
||||||
return terms==null ? null : new TestTerms(terms);
|
return terms==null ? null : new TestTerms(terms);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public NumericDocValues getNormValues(String field) throws IOException {
|
||||||
|
NumericDocValues ndv = super.getNormValues(field);
|
||||||
|
if (ndv == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
FixedBitSet docsWithTerms = new FixedBitSet(maxDoc());
|
||||||
|
TermsEnum termsEnum = terms(field).iterator();
|
||||||
|
PostingsEnum postings = null;
|
||||||
|
while (termsEnum.next() != null) {
|
||||||
|
postings = termsEnum.postings(postings, PostingsEnum.NONE);
|
||||||
|
docsWithTerms.or(postings);
|
||||||
|
}
|
||||||
|
return new FilterNumericDocValues(ndv) {
|
||||||
|
@Override
|
||||||
|
public long longValue() throws IOException {
|
||||||
|
return docsWithTerms.get(docID()) ? super.longValue() : 0L;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public CacheHelper getCoreCacheHelper() {
|
public CacheHelper getCoreCacheHelper() {
|
||||||
return null;
|
return null;
|
||||||
|
|
Loading…
Reference in New Issue