LUCENE-4221: CheckIndex is overeager for term vector offsets bounds checks

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1361701 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-07-15 13:45:27 +00:00
parent 75b6bf69bd
commit 383d17e6a5
3 changed files with 59 additions and 27 deletions

View File

@ -48,6 +48,9 @@ Bug Fixes
leave temp files behind in /tmp on Windows. Fix Sort to not leave
temp files behind when /tmp is a separate volume. (Uwe Schindler, Robert Muir)
* LUCENE-4221: Fix overeager CheckIndex validation for term vector offsets.
(Robert Muir)
Build
* LUCENE-4094: Support overriding file.encoding on forked test JVMs

View File

@ -668,7 +668,7 @@ public class CheckIndex {
* checks Fields api is consistent with itself.
* searcher is optional, to verify with queries. Can be null.
*/
private Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, boolean doPrint) throws IOException {
private Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, boolean doPrint, boolean isVectors) throws IOException {
// TODO: we should probably return our own stats thing...?!
final Status.TermIndexStatus status = new Status.TermIndexStatus();
@ -863,6 +863,9 @@ public class CheckIndex {
if (hasOffsets) {
int startOffset = postings.startOffset();
int endOffset = postings.endOffset();
// NOTE: we cannot enforce any bounds whatsoever on vectors... they were a free-for-all before?
// but for offsets in the postings lists these checks are fine: they were always enforced by IndexWriter
if (!isVectors) {
if (startOffset < 0) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds");
}
@ -875,6 +878,7 @@ public class CheckIndex {
if (endOffset < startOffset) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " < startOffset " + startOffset);
}
}
lastOffset = startOffset;
}
}
@ -956,6 +960,9 @@ public class CheckIndex {
if (hasOffsets) {
int startOffset = postings.startOffset();
int endOffset = postings.endOffset();
// NOTE: we cannot enforce any bounds whatsoever on vectors... they were a free-for-all before?
// but for offsets in the postings lists these checks are fine: they were always enforced by IndexWriter
if (!isVectors) {
if (startOffset < 0) {
throw new RuntimeException("term " + term + ": doc " + docID + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds");
}
@ -968,6 +975,7 @@ public class CheckIndex {
if (endOffset < startOffset) {
throw new RuntimeException("term " + term + ": doc " + docID + ": pos " + pos + ": endOffset " + endOffset + " < startOffset " + startOffset);
}
}
lastOffset = startOffset;
}
}
@ -1193,12 +1201,12 @@ public class CheckIndex {
}
final Fields fields = reader.fields();
status = checkFields(fields, liveDocs, maxDoc, fieldInfos, true);
status = checkFields(fields, liveDocs, maxDoc, fieldInfos, true, false);
if (liveDocs != null) {
if (infoStream != null) {
infoStream.print(" test (ignoring deletes): terms, freq, prox...");
}
checkFields(fields, null, maxDoc, fieldInfos, true);
checkFields(fields, null, maxDoc, fieldInfos, true, false);
}
} catch (Throwable e) {
msg("ERROR: " + e);
@ -1415,10 +1423,10 @@ public class CheckIndex {
if (tfv != null) {
// First run with no deletions:
checkFields(tfv, null, 1, fieldInfos, false);
checkFields(tfv, null, 1, fieldInfos, false, true);
// Again, with the one doc deleted:
checkFields(tfv, onlyDocIsDeleted, 1, fieldInfos, false);
checkFields(tfv, onlyDocIsDeleted, 1, fieldInfos, false, true);
// Only agg stats if the doc is live:
final boolean doStats = liveDocs == null || liveDocs.get(j);

View File

@ -25,8 +25,11 @@ import java.util.ArrayList;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.util.Constants;
@ -94,6 +97,24 @@ public class TestCheckIndex extends LuceneTestCase {
dir.close();
}
// LUCENE-4221: we have to let these thru, for now
public void testBogusTermVectors() throws IOException {
Directory dir = newDirectory();
IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, null));
Document doc = new Document();
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
Field field = new Field("foo", "", ft);
field.setTokenStream(new CannedTokenStream(
new Token("bar", 5, 10), new Token("bar", 1, 4)
));
doc.add(field);
iw.addDocument(doc);
iw.close();
dir.close(); // checkindex
}
public void testLuceneConstantVersion() throws IOException {
// common-build.xml sets lucene.version
final String version = System.getProperty("lucene.version");