mirror of https://github.com/apache/lucene.git
LUCENE-6320: speed up checkindex
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1663505 13f79535-47bb-0310-9956-ffa450edef68
commit 1510f69303
parent 99fd468624
@@ -131,6 +131,8 @@ Optimizations
* LUCENE-6318: Reduce RAM usage of FieldInfos when there are many fields.
  (Mike McCandless, Robert Muir)

* LUCENE-6320: Speed up CheckIndex. (Robert Muir)

API Changes

* LUCENE-6204, LUCENE-6208: Simplify CompoundFormat: remove files()
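The hunks that follow touch CheckIndex itself plus the BaseDocValuesFormatTestCase and TestUtil test-framework classes: every per-segment check method now takes a CodecReader instead of a LeafReader, so CheckIndex can pull the codec-level producers (postings, norms, stored fields, term vectors, doc values) and sweep their getMergeInstance() variants directly instead of going through the general LeafReader API. A minimal caller-side sketch of the new signatures (not part of the patch; it mirrors the TestUtil.checkReader hunk near the end, and the class and variable names are illustrative):

import java.io.ByteArrayOutputStream;
import java.io.PrintStream;

import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SlowCodecReaderWrapper;
import org.apache.lucene.util.IOUtils;

public class CheckLeavesExample {
  /** Runs a few of the CodecReader-based checks on every segment of an open reader. */
  public static void checkLeaves(DirectoryReader reader) throws Exception {
    ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
    PrintStream infoStream = new PrintStream(bos, false, IOUtils.UTF_8);
    for (LeafReaderContext context : reader.leaves()) {
      LeafReader leaf = context.reader();
      // SegmentReader already is a CodecReader; anything else gets the slow wrapper.
      CodecReader codecReader = leaf instanceof CodecReader
          ? (CodecReader) leaf
          : SlowCodecReaderWrapper.wrap(leaf);
      CheckIndex.testLiveDocs(codecReader, infoStream, true);    // failFast=true: throw on the first problem
      CheckIndex.testPostings(codecReader, infoStream, false, true);
      CheckIndex.testDocValues(codecReader, infoStream, true);
    }
    System.out.println(bos.toString(IOUtils.UTF_8));
  }
}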
@@ -31,7 +31,12 @@ import java.util.Locale;
import java.util.Map;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.index.CheckIndex.Status.DocValuesStatus;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.AlreadyClosedException;

@@ -174,7 +179,7 @@ public class CheckIndex implements Closeable {
/** Current deletions generation. */
public long deletionsGen;

/** True if we were able to open an LeafReader on this
/** True if we were able to open a CodecReader on this
* segment. */
public boolean openReaderPassed;

@@ -755,7 +760,7 @@ public class CheckIndex implements Closeable {
* Test live docs.
* @lucene.experimental
*/
public static Status.LiveDocStatus testLiveDocs(LeafReader reader, PrintStream infoStream, boolean failFast) throws IOException {
public static Status.LiveDocStatus testLiveDocs(CodecReader reader, PrintStream infoStream, boolean failFast) throws IOException {
long startNS = System.nanoTime();
final Status.LiveDocStatus status = new Status.LiveDocStatus();

@@ -812,7 +817,7 @@ public class CheckIndex implements Closeable {
* Test field infos.
* @lucene.experimental
*/
public static Status.FieldInfoStatus testFieldInfos(LeafReader reader, PrintStream infoStream, boolean failFast) throws IOException {
public static Status.FieldInfoStatus testFieldInfos(CodecReader reader, PrintStream infoStream, boolean failFast) throws IOException {
long startNS = System.nanoTime();
final Status.FieldInfoStatus status = new Status.FieldInfoStatus();

@@ -845,7 +850,7 @@ public class CheckIndex implements Closeable {
* Test field norms.
* @lucene.experimental
*/
public static Status.FieldNormStatus testFieldNorms(LeafReader reader, PrintStream infoStream, boolean failFast) throws IOException {
public static Status.FieldNormStatus testFieldNorms(CodecReader reader, PrintStream infoStream, boolean failFast) throws IOException {
long startNS = System.nanoTime();
final Status.FieldNormStatus status = new Status.FieldNormStatus();

@@ -854,14 +859,14 @@ public class CheckIndex implements Closeable {
if (infoStream != null) {
infoStream.print(" test: field norms.........");
}
NormsProducer normsReader = reader.getNormsReader();
if (normsReader != null) {
normsReader = normsReader.getMergeInstance();
}
for (FieldInfo info : reader.getFieldInfos()) {
if (info.hasNorms()) {
checkNorms(info, reader, infoStream);
checkNumericDocValues(info.name, reader.maxDoc(), normsReader.getNorms(info), new Bits.MatchAllBits(reader.maxDoc()));
++status.totFields;
} else {
if (reader.getNormValues(info.name) != null) {
throw new RuntimeException("field: " + info.name + " should omit norms but has them!");
}
}
}
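The norms hunk above shows the pattern the rest of the patch repeats for postings, stored fields, term vectors and doc values: fetch the codec-level producer from the CodecReader, switch to its merge instance, and decode every value in one forward sweep. getMergeInstance() may simply return the producer itself, but a codec is free to hand back a variant tuned for single-threaded, forward-only access, which is where the speedup is expected to come from. A condensed sketch of that sweep outside CheckIndex (the class and method names are illustrative, not from the patch):

import java.io.IOException;

import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.NumericDocValues;

class NormsSweepExample {
  /** Decodes every norm value of every field, the same sweep testFieldNorms now performs. */
  static void sweepNorms(CodecReader codecReader) throws IOException {
    NormsProducer norms = codecReader.getNormsReader();
    if (norms != null) {
      // The merge instance may skip per-call cloning and thread-safety overhead.
      norms = norms.getMergeInstance();
    }
    for (FieldInfo info : codecReader.getFieldInfos()) {
      if (info.hasNorms()) {
        NumericDocValues values = norms.getNorms(info);   // per-field norms straight from the codec
        for (int doc = 0; doc < codecReader.maxDoc(); doc++) {
          values.get(doc);                                 // decode each value, as checkNumericDocValues does
        }
      }
    }
  }
}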
@@ -897,7 +902,6 @@ public class CheckIndex implements Closeable {
int computedFieldCount = 0;

PostingsEnum docs = null;
PostingsEnum docsAndFreqs = null;
PostingsEnum postings = null;

String lastField = null;

@@ -1174,20 +1178,20 @@ public class CheckIndex implements Closeable {
// Re-count if there are deleted docs:
if (liveDocs != null) {
if (hasFreqs) {
final PostingsEnum docsNoDel = termsEnum.postings(null, docsAndFreqs);
docs = termsEnum.postings(null, docs);
docCount = 0;
totalTermFreq = 0;
while(docsNoDel.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
visitedDocs.set(docsNoDel.docID());
while(docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
visitedDocs.set(docs.docID());
docCount++;
totalTermFreq += docsNoDel.freq();
totalTermFreq += docs.freq();
}
} else {
final PostingsEnum docsNoDel = termsEnum.postings(null, docs, PostingsEnum.NONE);
docs = termsEnum.postings(null, docs, PostingsEnum.NONE);
docCount = 0;
totalTermFreq = -1;
while(docsNoDel.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
visitedDocs.set(docsNoDel.docID());
while(docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
visitedDocs.set(docs.docID());
docCount++;
}
}

@@ -1457,7 +1461,7 @@ public class CheckIndex implements Closeable {
* Test the term index.
* @lucene.experimental
*/
public static Status.TermIndexStatus testPostings(LeafReader reader, PrintStream infoStream) throws IOException {
public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream) throws IOException {
return testPostings(reader, infoStream, false, false);
}

@@ -1465,7 +1469,7 @@ public class CheckIndex implements Closeable {
* Test the term index.
* @lucene.experimental
*/
public static Status.TermIndexStatus testPostings(LeafReader reader, PrintStream infoStream, boolean verbose, boolean failFast) throws IOException {
public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream, boolean verbose, boolean failFast) throws IOException {

// TODO: we should go and verify term vectors match, if
// crossCheckTermVectors is on...

@@ -1479,7 +1483,7 @@ public class CheckIndex implements Closeable {
infoStream.print(" test: terms, freq, prox...");
}

final Fields fields = reader.fields();
final Fields fields = reader.getPostingsReader().getMergeInstance();
final FieldInfos fieldInfos = reader.getFieldInfos();
status = checkFields(fields, liveDocs, maxDoc, fieldInfos, true, false, infoStream, verbose);
if (liveDocs != null) {

@@ -1507,7 +1511,7 @@ public class CheckIndex implements Closeable {
* Test stored fields.
* @lucene.experimental
*/
public static Status.StoredFieldStatus testStoredFields(LeafReader reader, PrintStream infoStream, boolean failFast) throws IOException {
public static Status.StoredFieldStatus testStoredFields(CodecReader reader, PrintStream infoStream, boolean failFast) throws IOException {
long startNS = System.nanoTime();
final Status.StoredFieldStatus status = new Status.StoredFieldStatus();

@@ -1518,10 +1522,13 @@ public class CheckIndex implements Closeable {

// Scan stored fields for all documents
final Bits liveDocs = reader.getLiveDocs();
StoredFieldsReader storedFields = reader.getFieldsReader().getMergeInstance();
for (int j = 0; j < reader.maxDoc(); ++j) {
// Intentionally pull even deleted documents to
// make sure they too are not corrupt:
StoredDocument doc = reader.document(j);
DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
storedFields.visitDocument(j, visitor);
StoredDocument doc = visitor.getDocument();
if (liveDocs == null || liveDocs.get(j)) {
status.docCount++;
status.totFields += doc.getFields().size();
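Stored fields get the same treatment in the hunk above: instead of calling document(int) on the reader, the check drives a StoredFieldsReader merge instance with a DocumentStoredFieldVisitor, decoding every document, deleted or not, in a single forward pass. A small sketch of that access pattern on its own (assumes a CodecReader is available; class and variable names are illustrative):

import java.io.IOException;

import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.document.StoredDocument;
import org.apache.lucene.index.CodecReader;

class StoredFieldsSweepExample {
  /** Visits the stored fields of every document, live or deleted, exactly once. */
  static long countStoredFields(CodecReader codecReader) throws IOException {
    long totalFields = 0;
    StoredFieldsReader storedFields = codecReader.getFieldsReader().getMergeInstance();
    for (int docID = 0; docID < codecReader.maxDoc(); docID++) {
      DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
      storedFields.visitDocument(docID, visitor);   // decodes the document even if it is deleted
      StoredDocument doc = visitor.getDocument();
      totalFields += doc.getFields().size();
    }
    return totalFields;
  }
}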
@@ -1555,7 +1562,7 @@ public class CheckIndex implements Closeable {
* Test docvalues.
* @lucene.experimental
*/
public static Status.DocValuesStatus testDocValues(LeafReader reader,
public static Status.DocValuesStatus testDocValues(CodecReader reader,
PrintStream infoStream,
boolean failFast) throws IOException {
long startNS = System.nanoTime();

@@ -1564,18 +1571,14 @@ public class CheckIndex implements Closeable {
if (infoStream != null) {
infoStream.print(" test: docvalues...........");
}
DocValuesProducer dvReader = reader.getDocValuesReader();
if (dvReader != null) {
dvReader = dvReader.getMergeInstance();
}
for (FieldInfo fieldInfo : reader.getFieldInfos()) {
if (fieldInfo.getDocValuesType() != DocValuesType.NONE) {
status.totalValueFields++;
checkDocValues(fieldInfo, reader, infoStream, status);
} else {
if (reader.getBinaryDocValues(fieldInfo.name) != null ||
reader.getNumericDocValues(fieldInfo.name) != null ||
reader.getSortedDocValues(fieldInfo.name) != null ||
reader.getSortedSetDocValues(fieldInfo.name) != null ||
reader.getDocsWithField(fieldInfo.name) != null) {
throw new RuntimeException("field: " + fieldInfo.name + " has docvalues but should omit them!");
}
checkDocValues(fieldInfo, dvReader, reader.maxDoc(), infoStream, status);
}
}

@@ -1601,8 +1604,8 @@ public class CheckIndex implements Closeable {
return status;
}

private static void checkBinaryDocValues(String fieldName, LeafReader reader, BinaryDocValues dv, Bits docsWithField) {
for (int i = 0; i < reader.maxDoc(); i++) {
private static void checkBinaryDocValues(String fieldName, int maxDoc, BinaryDocValues dv, Bits docsWithField) {
for (int i = 0; i < maxDoc; i++) {
final BytesRef term = dv.get(i);
assert term.isValid();
if (docsWithField.get(i) == false && term.length > 0) {

@@ -1611,12 +1614,12 @@ public class CheckIndex implements Closeable {
}
}

private static void checkSortedDocValues(String fieldName, LeafReader reader, SortedDocValues dv, Bits docsWithField) {
checkBinaryDocValues(fieldName, reader, dv, docsWithField);
private static void checkSortedDocValues(String fieldName, int maxDoc, SortedDocValues dv, Bits docsWithField) {
checkBinaryDocValues(fieldName, maxDoc, dv, docsWithField);
final int maxOrd = dv.getValueCount()-1;
FixedBitSet seenOrds = new FixedBitSet(dv.getValueCount());
int maxOrd2 = -1;
for (int i = 0; i < reader.maxDoc(); i++) {
for (int i = 0; i < maxDoc; i++) {
int ord = dv.getOrd(i);
if (ord == -1) {
if (docsWithField.get(i)) {

@@ -1651,11 +1654,11 @@ public class CheckIndex implements Closeable {
}
}

private static void checkSortedSetDocValues(String fieldName, LeafReader reader, SortedSetDocValues dv, Bits docsWithField) {
private static void checkSortedSetDocValues(String fieldName, int maxDoc, SortedSetDocValues dv, Bits docsWithField) {
final long maxOrd = dv.getValueCount()-1;
LongBitSet seenOrds = new LongBitSet(dv.getValueCount());
long maxOrd2 = -1;
for (int i = 0; i < reader.maxDoc(); i++) {
for (int i = 0; i < maxDoc; i++) {
dv.setDocument(i);
long lastOrd = -1;
long ord;

@@ -1721,8 +1724,8 @@ public class CheckIndex implements Closeable {
}
}

private static void checkSortedNumericDocValues(String fieldName, LeafReader reader, SortedNumericDocValues ndv, Bits docsWithField) {
for (int i = 0; i < reader.maxDoc(); i++) {
private static void checkSortedNumericDocValues(String fieldName, int maxDoc, SortedNumericDocValues ndv, Bits docsWithField) {
for (int i = 0; i < maxDoc; i++) {
ndv.setDocument(i);
int count = ndv.count();
if (docsWithField.get(i)) {

@@ -1745,8 +1748,8 @@ public class CheckIndex implements Closeable {
}
}

private static void checkNumericDocValues(String fieldName, LeafReader reader, NumericDocValues ndv, Bits docsWithField) {
for (int i = 0; i < reader.maxDoc(); i++) {
private static void checkNumericDocValues(String fieldName, int maxDoc, NumericDocValues ndv, Bits docsWithField) {
for (int i = 0; i < maxDoc; i++) {
long value = ndv.get(i);
if (docsWithField.get(i) == false && value != 0) {
throw new RuntimeException("dv for field: " + fieldName + " is marked missing but has value=" + value + " for doc: " + i);

@@ -1754,80 +1757,44 @@ public class CheckIndex implements Closeable {
}
}

private static void checkDocValues(FieldInfo fi, LeafReader reader, PrintStream infoStream, DocValuesStatus status) throws Exception {
Bits docsWithField = reader.getDocsWithField(fi.name);
private static void checkDocValues(FieldInfo fi, DocValuesProducer dvReader, int maxDoc, PrintStream infoStream, DocValuesStatus status) throws Exception {
Bits docsWithField = dvReader.getDocsWithField(fi);
if (docsWithField == null) {
throw new RuntimeException(fi.name + " docsWithField does not exist");
} else if (docsWithField.length() != reader.maxDoc()) {
throw new RuntimeException(fi.name + " docsWithField has incorrect length: " + docsWithField.length() + ",expected: " + reader.maxDoc());
} else if (docsWithField.length() != maxDoc) {
throw new RuntimeException(fi.name + " docsWithField has incorrect length: " + docsWithField.length() + ",expected: " + maxDoc);
}
switch(fi.getDocValuesType()) {
case SORTED:
status.totalSortedFields++;
checkSortedDocValues(fi.name, reader, reader.getSortedDocValues(fi.name), docsWithField);
if (reader.getBinaryDocValues(fi.name) != null ||
reader.getNumericDocValues(fi.name) != null ||
reader.getSortedNumericDocValues(fi.name) != null ||
reader.getSortedSetDocValues(fi.name) != null) {
throw new RuntimeException(fi.name + " returns multiple docvalues types!");
}
checkSortedDocValues(fi.name, maxDoc, dvReader.getSorted(fi), docsWithField);
break;
case SORTED_NUMERIC:
status.totalSortedNumericFields++;
checkSortedNumericDocValues(fi.name, reader, reader.getSortedNumericDocValues(fi.name), docsWithField);
if (reader.getBinaryDocValues(fi.name) != null ||
reader.getNumericDocValues(fi.name) != null ||
reader.getSortedSetDocValues(fi.name) != null ||
reader.getSortedDocValues(fi.name) != null) {
throw new RuntimeException(fi.name + " returns multiple docvalues types!");
}
checkSortedNumericDocValues(fi.name, maxDoc, dvReader.getSortedNumeric(fi), docsWithField);
break;
case SORTED_SET:
status.totalSortedSetFields++;
checkSortedSetDocValues(fi.name, reader, reader.getSortedSetDocValues(fi.name), docsWithField);
if (reader.getBinaryDocValues(fi.name) != null ||
reader.getNumericDocValues(fi.name) != null ||
reader.getSortedNumericDocValues(fi.name) != null ||
reader.getSortedDocValues(fi.name) != null) {
throw new RuntimeException(fi.name + " returns multiple docvalues types!");
}
checkSortedSetDocValues(fi.name, maxDoc, dvReader.getSortedSet(fi), docsWithField);
break;
case BINARY:
status.totalBinaryFields++;
checkBinaryDocValues(fi.name, reader, reader.getBinaryDocValues(fi.name), docsWithField);
if (reader.getNumericDocValues(fi.name) != null ||
reader.getSortedDocValues(fi.name) != null ||
reader.getSortedNumericDocValues(fi.name) != null ||
reader.getSortedSetDocValues(fi.name) != null) {
throw new RuntimeException(fi.name + " returns multiple docvalues types!");
}
checkBinaryDocValues(fi.name, maxDoc, dvReader.getBinary(fi), docsWithField);
break;
case NUMERIC:
status.totalNumericFields++;
checkNumericDocValues(fi.name, reader, reader.getNumericDocValues(fi.name), docsWithField);
if (reader.getBinaryDocValues(fi.name) != null ||
reader.getSortedDocValues(fi.name) != null ||
reader.getSortedNumericDocValues(fi.name) != null ||
reader.getSortedSetDocValues(fi.name) != null) {
throw new RuntimeException(fi.name + " returns multiple docvalues types!");
}
checkNumericDocValues(fi.name, maxDoc, dvReader.getNumeric(fi), docsWithField);
break;
default:
throw new AssertionError();
}
}

private static void checkNorms(FieldInfo fi, LeafReader reader, PrintStream infoStream) throws IOException {
if (fi.hasNorms()) {
checkNumericDocValues(fi.name, reader, reader.getNormValues(fi.name), new Bits.MatchAllBits(reader.maxDoc()));
}
}

/**
* Test term vectors.
* @lucene.experimental
*/
public static Status.TermVectorStatus testTermVectors(LeafReader reader, PrintStream infoStream) throws IOException {
public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream) throws IOException {
return testTermVectors(reader, infoStream, false, false, false);
}
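The checkDocValues refactor above reads each field straight from the DocValuesProducer, and the cross-API guards (a field exposing doc values it should not have, or several docvalues types at once) move out to the new TestUtil.checkReaderSanity in the last hunk, since they exercise the LeafReader API rather than the codec data. Callers can pass failFast=true to have the first problem rethrown immediately, or inspect the returned status object, which records counters and any captured error; a brief sketch of the latter (codecReader and infoStream are assumed in scope, as in the earlier sketches):

// Sketch only: consume the returned status instead of relying on failFast.
CheckIndex.Status.DocValuesStatus dvStatus = CheckIndex.testDocValues(codecReader, infoStream, false);
if (dvStatus.error != null) {
  // with failFast=false the first failure is recorded here rather than thrown
  throw new RuntimeException("docvalues check failed", dvStatus.error);
}
System.out.println("checked " + dvStatus.totalValueFields + " docvalues fields");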
@@ -1835,7 +1802,7 @@ public class CheckIndex implements Closeable {
* Test term vectors.
* @lucene.experimental
*/
public static Status.TermVectorStatus testTermVectors(LeafReader reader, PrintStream infoStream, boolean verbose, boolean crossCheckTermVectors, boolean failFast) throws IOException {
public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream, boolean verbose, boolean crossCheckTermVectors, boolean failFast) throws IOException {
long startNS = System.nanoTime();
final Status.TermVectorStatus status = new Status.TermVectorStatus();
final FieldInfos fieldInfos = reader.getFieldInfos();

@@ -1858,139 +1825,143 @@ public class CheckIndex implements Closeable {
final Fields postingsFields;
// TODO: testTermsIndex
if (crossCheckTermVectors) {
postingsFields = reader.fields();
postingsFields = reader.getPostingsReader().getMergeInstance();
} else {
postingsFields = null;
}

TermsEnum termsEnum = null;
TermsEnum postingsTermsEnum = null;

TermVectorsReader vectorsReader = reader.getTermVectorsReader();

for (int j = 0; j < reader.maxDoc(); ++j) {
// Intentionally pull/visit (but don't count in
// stats) deleted documents to make sure they too
// are not corrupt:
Fields tfv = reader.getTermVectors(j);

// TODO: can we make a IS(FIR) that searches just
// this term vector... to pass for searcher?

if (tfv != null) {
// First run with no deletions:
checkFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose);

if (j == 0) {
// Also test with the 1 doc deleted; we only do this for first doc because this really is just looking for a [slightly] buggy
// TermVectors impl that fails to respect the incoming live docs:
checkFields(tfv, onlyDocIsDeleted, 1, fieldInfos, false, true, infoStream, verbose);
}

// Only agg stats if the doc is live:
final boolean doStats = liveDocs == null || liveDocs.get(j);

if (doStats) {
status.docCount++;
}

for(String field : tfv) {
if (vectorsReader != null) {
vectorsReader = vectorsReader.getMergeInstance();
for (int j = 0; j < reader.maxDoc(); ++j) {
// Intentionally pull/visit (but don't count in
// stats) deleted documents to make sure they too
// are not corrupt:
Fields tfv = vectorsReader.get(j);

// TODO: can we make a IS(FIR) that searches just
// this term vector... to pass for searcher?

if (tfv != null) {
// First run with no deletions:
checkFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose);

if (j == 0) {
// Also test with the 1 doc deleted; we only do this for first doc because this really is just looking for a [slightly] buggy
// TermVectors impl that fails to respect the incoming live docs:
checkFields(tfv, onlyDocIsDeleted, 1, fieldInfos, false, true, infoStream, verbose);
}

// Only agg stats if the doc is live:
final boolean doStats = liveDocs == null || liveDocs.get(j);

if (doStats) {
status.totVectors++;
status.docCount++;
}

// Make sure FieldInfo thinks this field is vector'd:
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
if (!fieldInfo.hasVectors()) {
throw new RuntimeException("docID=" + j + " has term vectors for field=" + field + " but FieldInfo has storeTermVector=false");
}

if (crossCheckTermVectors) {
Terms terms = tfv.terms(field);
termsEnum = terms.iterator(termsEnum);
final boolean postingsHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
final boolean postingsHasPayload = fieldInfo.hasPayloads();
final boolean vectorsHasPayload = terms.hasPayloads();

Terms postingsTerms = postingsFields.terms(field);
if (postingsTerms == null) {
throw new RuntimeException("vector field=" + field + " does not exist in postings; doc=" + j);

for(String field : tfv) {
if (doStats) {
status.totVectors++;
}
postingsTermsEnum = postingsTerms.iterator(postingsTermsEnum);

final boolean hasProx = terms.hasOffsets() || terms.hasPositions();
BytesRef term = null;
while ((term = termsEnum.next()) != null) {

if (hasProx) {
postings = termsEnum.postings(null, postings, PostingsEnum.ALL);
assert postings != null;
docs = null;
} else {
docs = termsEnum.postings(null, docs);
assert docs != null;
postings = null;
// Make sure FieldInfo thinks this field is vector'd:
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
if (!fieldInfo.hasVectors()) {
throw new RuntimeException("docID=" + j + " has term vectors for field=" + field + " but FieldInfo has storeTermVector=false");
}

if (crossCheckTermVectors) {
Terms terms = tfv.terms(field);
termsEnum = terms.iterator(termsEnum);
final boolean postingsHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
final boolean postingsHasPayload = fieldInfo.hasPayloads();
final boolean vectorsHasPayload = terms.hasPayloads();

Terms postingsTerms = postingsFields.terms(field);
if (postingsTerms == null) {
throw new RuntimeException("vector field=" + field + " does not exist in postings; doc=" + j);
}

final PostingsEnum docs2;
if (hasProx) {
assert postings != null;
docs2 = postings;
} else {
assert docs != null;
docs2 = docs;
}

final PostingsEnum postingsDocs2;
if (!postingsTermsEnum.seekExact(term)) {
throw new RuntimeException("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j);
}
postingsPostings = postingsTermsEnum.postings(null, postingsPostings, PostingsEnum.ALL);
if (postingsPostings == null) {
// Term vectors were indexed w/ pos but postings were not
postingsDocs = postingsTermsEnum.postings(null, postingsDocs);
if (postingsDocs == null) {
postingsTermsEnum = postingsTerms.iterator(postingsTermsEnum);

final boolean hasProx = terms.hasOffsets() || terms.hasPositions();
BytesRef term = null;
while ((term = termsEnum.next()) != null) {

if (hasProx) {
postings = termsEnum.postings(null, postings, PostingsEnum.ALL);
assert postings != null;
docs = null;
} else {
docs = termsEnum.postings(null, docs);
assert docs != null;
postings = null;
}

final PostingsEnum docs2;
if (hasProx) {
assert postings != null;
docs2 = postings;
} else {
assert docs != null;
docs2 = docs;
}

final PostingsEnum postingsDocs2;
if (!postingsTermsEnum.seekExact(term)) {
throw new RuntimeException("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j);
}
}

if (postingsPostings != null) {
postingsDocs2 = postingsPostings;
} else {
postingsDocs2 = postingsDocs;
}

final int advanceDoc = postingsDocs2.advance(j);
if (advanceDoc != j) {
throw new RuntimeException("vector term=" + term + " field=" + field + ": doc=" + j + " was not found in postings (got: " + advanceDoc + ")");
}

final int doc = docs2.nextDoc();

if (doc != 0) {
throw new RuntimeException("vector for doc " + j + " didn't return docID=0: got docID=" + doc);
}

if (postingsHasFreq) {
final int tf = docs2.freq();
if (postingsHasFreq && postingsDocs2.freq() != tf) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": freq=" + tf + " differs from postings freq=" + postingsDocs2.freq());
postingsPostings = postingsTermsEnum.postings(null, postingsPostings, PostingsEnum.ALL);
if (postingsPostings == null) {
// Term vectors were indexed w/ pos but postings were not
postingsDocs = postingsTermsEnum.postings(null, postingsDocs);
if (postingsDocs == null) {
throw new RuntimeException("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j);
}
}

if (hasProx) {
for (int i = 0; i < tf; i++) {
int pos = postings.nextPosition();
if (postingsPostings != null) {
int postingsPos = postingsPostings.nextPosition();
if (terms.hasPositions() && pos != postingsPos) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": pos=" + pos + " differs from postings pos=" + postingsPos);

if (postingsPostings != null) {
postingsDocs2 = postingsPostings;
} else {
postingsDocs2 = postingsDocs;
}

final int advanceDoc = postingsDocs2.advance(j);
if (advanceDoc != j) {
throw new RuntimeException("vector term=" + term + " field=" + field + ": doc=" + j + " was not found in postings (got: " + advanceDoc + ")");
}

final int doc = docs2.nextDoc();

if (doc != 0) {
throw new RuntimeException("vector for doc " + j + " didn't return docID=0: got docID=" + doc);
}

if (postingsHasFreq) {
final int tf = docs2.freq();
if (postingsHasFreq && postingsDocs2.freq() != tf) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": freq=" + tf + " differs from postings freq=" + postingsDocs2.freq());
}

if (hasProx) {
for (int i = 0; i < tf; i++) {
int pos = postings.nextPosition();
if (postingsPostings != null) {
int postingsPos = postingsPostings.nextPosition();
if (terms.hasPositions() && pos != postingsPos) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": pos=" + pos + " differs from postings pos=" + postingsPos);
}
}
}

// Call the methods to at least make
// sure they don't throw exc:
final int startOffset = postings.startOffset();
final int endOffset = postings.endOffset();
// TODO: these are too anal...?
/*

// Call the methods to at least make
// sure they don't throw exc:
final int startOffset = postings.startOffset();
final int endOffset = postings.endOffset();
// TODO: these are too anal...?
/*
if (endOffset < startOffset) {
throw new RuntimeException("vector startOffset=" + startOffset + " is > endOffset=" + endOffset);
}

@@ -1998,44 +1969,45 @@ public class CheckIndex implements Closeable {
throw new RuntimeException("vector startOffset=" + startOffset + " is < prior startOffset=" + lastStartOffset);
}
lastStartOffset = startOffset;
*/

if (postingsPostings != null) {
final int postingsStartOffset = postingsPostings.startOffset();

final int postingsEndOffset = postingsPostings.endOffset();
if (startOffset != -1 && postingsStartOffset != -1 && startOffset != postingsStartOffset) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": startOffset=" + startOffset + " differs from postings startOffset=" + postingsStartOffset);
}
if (endOffset != -1 && postingsEndOffset != -1 && endOffset != postingsEndOffset) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": endOffset=" + endOffset + " differs from postings endOffset=" + postingsEndOffset);
}
}

BytesRef payload = postings.getPayload();

if (payload != null) {
assert vectorsHasPayload;
}

if (postingsHasPayload && vectorsHasPayload) {
assert postingsPostings != null;
*/

if (payload == null) {
// we have payloads, but not at this position.
// postings has payloads too, it should not have one at this position
if (postingsPostings.getPayload() != null) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has no payload but postings does: " + postingsPostings.getPayload());
if (postingsPostings != null) {
final int postingsStartOffset = postingsPostings.startOffset();

final int postingsEndOffset = postingsPostings.endOffset();
if (startOffset != -1 && postingsStartOffset != -1 && startOffset != postingsStartOffset) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": startOffset=" + startOffset + " differs from postings startOffset=" + postingsStartOffset);
}
} else {
// we have payloads, and one at this position
// postings should also have one at this position, with the same bytes.
if (postingsPostings.getPayload() == null) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but postings does not.");
if (endOffset != -1 && postingsEndOffset != -1 && endOffset != postingsEndOffset) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": endOffset=" + endOffset + " differs from postings endOffset=" + postingsEndOffset);
}
BytesRef postingsPayload = postingsPostings.getPayload();
if (!payload.equals(postingsPayload)) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but differs from postings payload=" + postingsPayload);
}

BytesRef payload = postings.getPayload();

if (payload != null) {
assert vectorsHasPayload;
}

if (postingsHasPayload && vectorsHasPayload) {
assert postingsPostings != null;

if (payload == null) {
// we have payloads, but not at this position.
// postings has payloads too, it should not have one at this position
if (postingsPostings.getPayload() != null) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has no payload but postings does: " + postingsPostings.getPayload());
}
} else {
// we have payloads, and one at this position
// postings should also have one at this position, with the same bytes.
if (postingsPostings.getPayload() == null) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but postings does not.");
}
BytesRef postingsPayload = postingsPostings.getPayload();
if (!payload.equals(postingsPayload)) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but differs from postings payload=" + postingsPayload);
}
}
}
}
@@ -45,6 +45,7 @@ import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CheckIndex.Status.DocValuesStatus;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;

@@ -2674,9 +2675,12 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTestCase {
PrintStream infoStream = new PrintStream(bos, false, IOUtils.UTF_8);
startingGun.await();
for (LeafReaderContext leaf : r.leaves()) {
CheckIndex.testDocValues(leaf.reader(), infoStream, true);
DocValuesStatus status = CheckIndex.testDocValues((SegmentReader)leaf.reader(), infoStream, true);
if (status.error != null) {
throw status.error;
}
}
} catch (Exception e) {
} catch (Throwable e) {
throw new RuntimeException();
}
}

@@ -73,6 +73,7 @@ import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.FilterLeafReader;

@@ -301,14 +302,23 @@ public final class TestUtil {
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
PrintStream infoStream = new PrintStream(bos, false, IOUtils.UTF_8);

reader.checkIntegrity();
CheckIndex.testLiveDocs(reader, infoStream, true);
CheckIndex.testFieldInfos(reader, infoStream, true);
CheckIndex.testFieldNorms(reader, infoStream, true);
CheckIndex.testPostings(reader, infoStream, false, true);
CheckIndex.testStoredFields(reader, infoStream, true);
CheckIndex.testTermVectors(reader, infoStream, false, crossCheckTermVectors, true);
CheckIndex.testDocValues(reader, infoStream, true);
final CodecReader codecReader;
if (reader instanceof CodecReader) {
codecReader = (CodecReader) reader;
reader.checkIntegrity();
} else {
codecReader = SlowCodecReaderWrapper.wrap(reader);
}
CheckIndex.testLiveDocs(codecReader, infoStream, true);
CheckIndex.testFieldInfos(codecReader, infoStream, true);
CheckIndex.testFieldNorms(codecReader, infoStream, true);
CheckIndex.testPostings(codecReader, infoStream, false, true);
CheckIndex.testStoredFields(codecReader, infoStream, true);
CheckIndex.testTermVectors(codecReader, infoStream, false, crossCheckTermVectors, true);
CheckIndex.testDocValues(codecReader, infoStream, true);

// some checks really against the reader API
checkReaderSanity(reader);

if (LuceneTestCase.INFOSTREAM) {
System.out.println(bos.toString(IOUtils.UTF_8));

@@ -324,6 +334,76 @@ public final class TestUtil {
assert Accountables.toString(sr) != null;
}
}

// used by TestUtil.checkReader to check some things really unrelated to the index,
// just looking for bugs in indexreader implementations.
private static void checkReaderSanity(LeafReader reader) throws IOException {
for (FieldInfo info : reader.getFieldInfos()) {

// reader shouldn't return normValues if the field does not have them
if (!info.hasNorms()) {
if (reader.getNormValues(info.name) != null) {
throw new RuntimeException("field: " + info.name + " should omit norms but has them!");
}
}

// reader shouldn't return docValues if the field does not have them
// reader shouldn't return multiple docvalues types for the same field.
switch(info.getDocValuesType()) {
case NONE:
if (reader.getBinaryDocValues(info.name) != null ||
reader.getNumericDocValues(info.name) != null ||
reader.getSortedDocValues(info.name) != null ||
reader.getSortedSetDocValues(info.name) != null ||
reader.getDocsWithField(info.name) != null) {
throw new RuntimeException("field: " + info.name + " has docvalues but should omit them!");
}
break;
case SORTED:
if (reader.getBinaryDocValues(info.name) != null ||
reader.getNumericDocValues(info.name) != null ||
reader.getSortedNumericDocValues(info.name) != null ||
reader.getSortedSetDocValues(info.name) != null) {
throw new RuntimeException(info.name + " returns multiple docvalues types!");
}
break;
case SORTED_NUMERIC:
if (reader.getBinaryDocValues(info.name) != null ||
reader.getNumericDocValues(info.name) != null ||
reader.getSortedSetDocValues(info.name) != null ||
reader.getSortedDocValues(info.name) != null) {
throw new RuntimeException(info.name + " returns multiple docvalues types!");
}
break;
case SORTED_SET:
if (reader.getBinaryDocValues(info.name) != null ||
reader.getNumericDocValues(info.name) != null ||
reader.getSortedNumericDocValues(info.name) != null ||
reader.getSortedDocValues(info.name) != null) {
throw new RuntimeException(info.name + " returns multiple docvalues types!");
}
break;
case BINARY:
if (reader.getNumericDocValues(info.name) != null ||
reader.getSortedDocValues(info.name) != null ||
reader.getSortedNumericDocValues(info.name) != null ||
reader.getSortedSetDocValues(info.name) != null) {
throw new RuntimeException(info.name + " returns multiple docvalues types!");
}
break;
case NUMERIC:
if (reader.getBinaryDocValues(info.name) != null ||
reader.getSortedDocValues(info.name) != null ||
reader.getSortedNumericDocValues(info.name) != null ||
reader.getSortedSetDocValues(info.name) != null) {
throw new RuntimeException(info.name + " returns multiple docvalues types!");
}
break;
default:
throw new AssertionError();
}
}
}

/** start and end are BOTH inclusive */
public static int nextInt(Random r, int start, int end) {