LUCENE-4221: CheckIndex is overeager for term vector offsets bounds checks

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1361701 13f79535-47bb-0310-9956-ffa450edef68
2025-02-06 10:08:58 +00:00 · 2012-07-15 13:45:27 +00:00 · 2012-07-15 13:45:27 +00:00 · 383d17e6a5
commit 383d17e6a5
parent 75b6bf69bd
3 changed files with 59 additions and 27 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -48,6 +48,9 @@ Bug Fixes
  leave temp files behind in /tmp on Windows. Fix Sort to not leave
  temp files behind when /tmp is a separate volume. (Uwe Schindler, Robert Muir)

+* LUCENE-4221: Fix overeager CheckIndex validation for term vector offsets.
+  (Robert Muir)
+
 Build

 * LUCENE-4094: Support overriding file.encoding on forked test JVMs
--- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
+++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
@ -668,7 +668,7 @@ public class CheckIndex {
   * checks Fields api is consistent with itself.
   * searcher is optional, to verify with queries. Can be null.
   */
-  private Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, boolean doPrint) throws IOException {
+  private Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, boolean doPrint, boolean isVectors) throws IOException {
    // TODO: we should probably return our own stats thing...?!
    
    final Status.TermIndexStatus status = new Status.TermIndexStatus();
@ -863,17 +863,21 @@ public class CheckIndex {
              if (hasOffsets) {
                int startOffset = postings.startOffset();
                int endOffset = postings.endOffset();
-                if (startOffset < 0) {
-                  throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds");
-                }
-                if (startOffset < lastOffset) {
-                  throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset);
-                }
-                if (endOffset < 0) {
-                  throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " is out of bounds");
-                }
-                if (endOffset < startOffset) {
-                  throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " < startOffset " + startOffset);
+                // NOTE: we cannot enforce any bounds whatsoever on vectors... they were a free-for-all before?
+                // but for offsets in the postings lists these checks are fine: they were always enforced by IndexWriter
+                if (!isVectors) {
+                  if (startOffset < 0) {
+                    throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds");
+                  }
+                  if (startOffset < lastOffset) {
+                    throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset);
+                  }
+                  if (endOffset < 0) {
+                    throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " is out of bounds");
+                  }
+                  if (endOffset < startOffset) {
+                    throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " < startOffset " + startOffset);
+                  }
                }
                lastOffset = startOffset;
              }
@ -956,17 +960,21 @@ public class CheckIndex {
                if (hasOffsets) {
                  int startOffset = postings.startOffset();
                  int endOffset = postings.endOffset();
-                  if (startOffset < 0) {
-                    throw new RuntimeException("term " + term + ": doc " + docID + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds");
-                  }
-                  if (startOffset < lastOffset) {
-                    throw new RuntimeException("term " + term + ": doc " + docID + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset);
-                  }
-                  if (endOffset < 0) {
-                    throw new RuntimeException("term " + term + ": doc " + docID + ": pos " + pos + ": endOffset " + endOffset + " is out of bounds");
-                  }
-                  if (endOffset < startOffset) {
-                    throw new RuntimeException("term " + term + ": doc " + docID + ": pos " + pos + ": endOffset " + endOffset + " < startOffset " + startOffset);
+                  // NOTE: we cannot enforce any bounds whatsoever on vectors... they were a free-for-all before?
+                  // but for offsets in the postings lists these checks are fine: they were always enforced by IndexWriter
+                  if (!isVectors) {
+                    if (startOffset < 0) {
+                      throw new RuntimeException("term " + term + ": doc " + docID + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds");
+                    }
+                    if (startOffset < lastOffset) {
+                      throw new RuntimeException("term " + term + ": doc " + docID + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset);
+                    }
+                    if (endOffset < 0) {
+                      throw new RuntimeException("term " + term + ": doc " + docID + ": pos " + pos + ": endOffset " + endOffset + " is out of bounds");
+                    }
+                    if (endOffset < startOffset) {
+                      throw new RuntimeException("term " + term + ": doc " + docID + ": pos " + pos + ": endOffset " + endOffset + " < startOffset " + startOffset);
+                    }
                  }
                  lastOffset = startOffset;
                }
@ -1193,12 +1201,12 @@ public class CheckIndex {
      }

      final Fields fields = reader.fields();
-      status = checkFields(fields, liveDocs, maxDoc, fieldInfos, true);
+      status = checkFields(fields, liveDocs, maxDoc, fieldInfos, true, false);
      if (liveDocs != null) {
        if (infoStream != null) {
          infoStream.print("    test (ignoring deletes): terms, freq, prox...");
        }
-        checkFields(fields, null, maxDoc, fieldInfos, true);
+        checkFields(fields, null, maxDoc, fieldInfos, true, false);
      }
    } catch (Throwable e) {
      msg("ERROR: " + e);
@ -1415,10 +1423,10 @@ public class CheckIndex {

        if (tfv != null) {
          // First run with no deletions:
-          checkFields(tfv, null, 1, fieldInfos, false);
+          checkFields(tfv, null, 1, fieldInfos, false, true);

          // Again, with the one doc deleted:
-          checkFields(tfv, onlyDocIsDeleted, 1, fieldInfos, false);
+          checkFields(tfv, onlyDocIsDeleted, 1, fieldInfos, false, true);

          // Only agg stats if the doc is live:
          final boolean doStats = liveDocs == null || liveDocs.get(j);
--- a/lucene/core/src/test/org/apache/lucene/index/TestCheckIndex.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestCheckIndex.java
@ -25,8 +25,11 @@ import java.util.ArrayList;

 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.util.Constants;
@ -93,6 +96,24 @@ public class TestCheckIndex extends LuceneTestCase {
    assertTrue(checker.checkIndex(onlySegments).clean == true);
    dir.close();
  }
+  
+  // LUCENE-4221: term vectors whose offsets go backwards must still be let through by CheckIndex, for now
+  public void testBogusTermVectors() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, null)); // null second argument; presumably no analyzer is needed because the field supplies its own token stream -- confirm
+    Document doc = new Document();
+    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+    ft.setStoreTermVectors(true);
+    ft.setStoreTermVectorOffsets(true);
+    Field field = new Field("foo", "", ft);
+    field.setTokenStream(new CannedTokenStream(
+        new Token("bar", 5, 10), new Token("bar", 1, 4) // same term twice; the second token's offsets (1, 4) precede the first's (5, 10)
+    ));
+    doc.add(field);
+    iw.addDocument(doc);
+    iw.close();
+    dir.close(); // checkindex: presumably the test directory runs CheckIndex on close, which must not fail on the vectors above -- confirm
+  }

  public void testLuceneConstantVersion() throws IOException {
    // common-build.xml sets lucene.version