LUCENE-4127: don't let bogus offsets corrupt the index

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1348685 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-06-10 23:50:21 +00:00
parent 386cc7859d
commit 4110fae40b
11 changed files with 127 additions and 51 deletions

View File: CHANGES.txt

@ -396,6 +396,8 @@ Changes in Runtime Behavior
* LUCENE-4127: IndexWriter will now throw IllegalArgumentException if
the first token of an indexed field has 0 positionIncrement
(previously it silently corrected it to 1, possibly masking bugs).
OffsetAttributeImpl will throw IllegalArgumentException if startOffset
is greater than endOffset, or if either offset is negative.
(Robert Muir, Mike McCandless)
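To make the stricter check concrete, here is a minimal sketch (not part of this commit; the demo class name is made up) of how the validated setOffset call behaves against this trunk API:

import org.apache.lucene.analysis.tokenattributes.OffsetAttributeImpl;

// Hypothetical demo of the validation described above (LUCENE-4127).
public class OffsetCheckDemo {
  public static void main(String[] args) {
    OffsetAttributeImpl att = new OffsetAttributeImpl();
    att.setOffset(0, 3);   // ok: 0 <= startOffset <= endOffset
    try {
      att.setOffset(4, 1); // endOffset < startOffset: rejected up front
    } catch (IllegalArgumentException expected) {
      System.out.println("rejected: " + expected.getMessage());
    }
  }
}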
API Changes

View File: PrefixAndSuffixAwareTokenFilter.java

@ -49,14 +49,14 @@ public class PrefixAndSuffixAwareTokenFilter extends TokenStream {
}
public Token updateInputToken(Token inputToken, Token lastPrefixToken) {
- inputToken.setStartOffset(lastPrefixToken.endOffset() + inputToken.startOffset());
- inputToken.setEndOffset(lastPrefixToken.endOffset() + inputToken.endOffset());
inputToken.setOffset(lastPrefixToken.endOffset() + inputToken.startOffset(),
lastPrefixToken.endOffset() + inputToken.endOffset());
return inputToken;
}
public Token updateSuffixToken(Token suffixToken, Token lastInputToken) {
- suffixToken.setStartOffset(lastInputToken.endOffset() + suffixToken.startOffset());
- suffixToken.setEndOffset(lastInputToken.endOffset() + suffixToken.endOffset());
suffixToken.setOffset(lastInputToken.endOffset() + suffixToken.startOffset(),
lastInputToken.endOffset() + suffixToken.endOffset());
return suffixToken;
}

View File: PrefixAwareTokenFilter.java

@ -153,8 +153,8 @@ public class PrefixAwareTokenFilter extends TokenStream {
* @return consumer token
*/
public Token updateSuffixToken(Token suffixToken, Token lastPrefixToken) {
- suffixToken.setStartOffset(lastPrefixToken.endOffset() + suffixToken.startOffset());
- suffixToken.setEndOffset(lastPrefixToken.endOffset() + suffixToken.endOffset());
suffixToken.setOffset(lastPrefixToken.endOffset() + suffixToken.startOffset(),
lastPrefixToken.endOffset() + suffixToken.endOffset());
return suffixToken;
}
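Because the separate Token offset setters are going away in favor of the validated setOffset(start, end) (see the Token.java changes below), any third-party filter that shifts offsets the old way needs the same one-call migration shown in these two filters. A minimal sketch, with a made-up class name, assuming the trunk Token API:

import org.apache.lucene.analysis.Token;

// Hypothetical helper: shift a token's offsets by a fixed base using the
// combined setter, so start and end are validated together.
public class OffsetShiftExample {
  public static Token shift(Token token, int base) {
    // Old style (now removed): token.setStartOffset(base + token.startOffset());
    //                          token.setEndOffset(base + token.endOffset());
    token.setOffset(base + token.startOffset(), base + token.endOffset());
    return token;
  }
}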

View File: TokenOffsetPayloadTokenFilter.java

@ -27,8 +27,8 @@ import org.apache.lucene.util.BytesRef;
/**
- * Adds the {@link org.apache.lucene.analysis.Token#setStartOffset(int)}
- * and {@link org.apache.lucene.analysis.Token#setEndOffset(int)}
* Adds the {@link OffsetAttribute#startOffset()}
* and {@link OffsetAttribute#endOffset()}
* First 4 bytes are the start
*
**/

View File: Token.java

@ -139,6 +139,7 @@ public class Token extends CharTermAttributeImpl
* @param start start offset in the source text
* @param end end offset in the source text */
public Token(int start, int end) {
checkOffsets(start, end);
startOffset = start;
endOffset = end;
}
@ -149,6 +150,7 @@ public class Token extends CharTermAttributeImpl
* @param end end offset in the source text
* @param typ the lexical type of this Token */
public Token(int start, int end, String typ) {
checkOffsets(start, end);
startOffset = start;
endOffset = end;
type = typ;
@ -162,6 +164,7 @@ public class Token extends CharTermAttributeImpl
* @param flags The bits to set for this token
*/
public Token(int start, int end, int flags) {
checkOffsets(start, end);
startOffset = start;
endOffset = end;
this.flags = flags;
@ -177,6 +180,7 @@ public class Token extends CharTermAttributeImpl
* @param end end offset
*/
public Token(String text, int start, int end) {
checkOffsets(start, end);
append(text);
startOffset = start;
endOffset = end;
@ -192,6 +196,7 @@ public class Token extends CharTermAttributeImpl
* @param typ token type
*/
public Token(String text, int start, int end, String typ) {
checkOffsets(start, end);
append(text);
startOffset = start;
endOffset = end;
@ -209,6 +214,7 @@ public class Token extends CharTermAttributeImpl
* @param flags token type bits
*/
public Token(String text, int start, int end, int flags) {
checkOffsets(start, end);
append(text);
startOffset = start;
endOffset = end;
@ -226,6 +232,7 @@ public class Token extends CharTermAttributeImpl
* @param end
*/
public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) {
checkOffsets(start, end);
copyBuffer(startTermBuffer, termBufferOffset, termBufferLength);
startOffset = start;
endOffset = end;
@ -295,12 +302,6 @@ public class Token extends CharTermAttributeImpl
return startOffset;
}
- /** Set the starting offset.
-   @see #startOffset() */
- public void setStartOffset(int offset) {
-   this.startOffset = offset;
- }
/** Returns this Token's ending offset, one greater than the position of the
last character corresponding to this token in the source text. The length
of the token in the source text is (endOffset - startOffset). */
@ -308,15 +309,10 @@ public class Token extends CharTermAttributeImpl
return endOffset;
}
- /** Set the ending offset.
-   @see #endOffset() */
- public void setEndOffset(int offset) {
-   this.endOffset = offset;
- }
/** Set the starting and ending offset.
@see #startOffset() and #endOffset()*/
public void setOffset(int startOffset, int endOffset) {
checkOffsets(startOffset, endOffset);
this.startOffset = startOffset;
this.endOffset = endOffset;
}
@ -449,11 +445,11 @@ public class Token extends CharTermAttributeImpl
/** Shorthand for calling {@link #clear},
* {@link #copyBuffer(char[], int, int)},
- * {@link #setStartOffset},
- * {@link #setEndOffset},
* {@link #setOffset},
* {@link #setType}
* @return this Token instance */
public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
checkOffsets(newStartOffset, newEndOffset);
clearNoTermBuffer();
copyBuffer(newTermBuffer, newTermOffset, newTermLength);
payload = null;
@ -466,11 +462,11 @@ public class Token extends CharTermAttributeImpl
/** Shorthand for calling {@link #clear},
* {@link #copyBuffer(char[], int, int)},
- * {@link #setStartOffset},
- * {@link #setEndOffset}
* {@link #setOffset},
* {@link #setType} on Token.DEFAULT_TYPE
* @return this Token instance */
public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
checkOffsets(newStartOffset, newEndOffset);
clearNoTermBuffer();
copyBuffer(newTermBuffer, newTermOffset, newTermLength);
startOffset = newStartOffset;
@ -481,11 +477,11 @@ public class Token extends CharTermAttributeImpl
/** Shorthand for calling {@link #clear},
* {@link #append(CharSequence)},
- * {@link #setStartOffset},
- * {@link #setEndOffset}
* {@link #setOffset},
* {@link #setType}
* @return this Token instance */
public Token reinit(String newTerm, int newStartOffset, int newEndOffset, String newType) {
checkOffsets(newStartOffset, newEndOffset);
clear();
append(newTerm);
startOffset = newStartOffset;
@ -496,11 +492,11 @@ public class Token extends CharTermAttributeImpl
/** Shorthand for calling {@link #clear},
* {@link #append(CharSequence, int, int)},
- * {@link #setStartOffset},
- * {@link #setEndOffset}
* {@link #setOffset},
* {@link #setType}
* @return this Token instance */
public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
checkOffsets(newStartOffset, newEndOffset);
clear();
append(newTerm, newTermOffset, newTermOffset + newTermLength);
startOffset = newStartOffset;
@ -511,11 +507,11 @@ public class Token extends CharTermAttributeImpl
/** Shorthand for calling {@link #clear},
* {@link #append(CharSequence)},
- * {@link #setStartOffset},
- * {@link #setEndOffset}
* {@link #setOffset},
* {@link #setType} on Token.DEFAULT_TYPE
* @return this Token instance */
public Token reinit(String newTerm, int newStartOffset, int newEndOffset) {
checkOffsets(newStartOffset, newEndOffset);
clear();
append(newTerm);
startOffset = newStartOffset;
@ -526,11 +522,11 @@ public class Token extends CharTermAttributeImpl
/** Shorthand for calling {@link #clear},
* {@link #append(CharSequence, int, int)},
- * {@link #setStartOffset},
- * {@link #setEndOffset}
* {@link #setOffset},
* {@link #setType} on Token.DEFAULT_TYPE
* @return this Token instance */
public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
checkOffsets(newStartOffset, newEndOffset);
clear();
append(newTerm, newTermOffset, newTermOffset + newTermLength);
startOffset = newStartOffset;
@ -614,6 +610,13 @@ public class Token extends CharTermAttributeImpl
reflector.reflect(FlagsAttribute.class, "flags", flags);
reflector.reflect(TypeAttribute.class, "type", type);
}
private void checkOffsets(int startOffset, int endOffset) {
if (startOffset < 0 || endOffset < startOffset) {
throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
+ "startOffset=" + startOffset + ",endOffset=" + endOffset);
}
}
/** Convenience factory that returns <code>Token</code> as implementation for the basic
* attributes and return the default impl (with &quot;Impl&quot; appended) for all other

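With checkOffsets wired into every constructor and reinit variant, a bogus offset pair now fails when the Token is created rather than surfacing later during indexing. A small illustrative snippet (not from the patch; the class name is hypothetical):

import org.apache.lucene.analysis.Token;

// Hypothetical demo: Token creation now rejects endOffset < startOffset.
public class TokenOffsetCheckExample {
  public static void main(String[] args) {
    Token ok = new Token("foo", 0, 3);    // valid: 0 <= start <= end
    System.out.println(ok.startOffset() + "-" + ok.endOffset());
    try {
      new Token("foo", 5, 2);             // endOffset < startOffset: throws
    } catch (IllegalArgumentException expected) {
      System.out.println("rejected: " + expected.getMessage());
    }
  }
}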
View File: OffsetAttributeImpl.java

@ -47,10 +47,10 @@ public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribut
// tokenizer should call clearAtts before re-using
// OffsetAtt
- // TODO: check that these are valid! IE, each should be
- // >= 0, and endOffset should be >= startOffset.
- // Problem is this could "break" existing
- // tokenizers/filters.
if (startOffset < 0 || endOffset < startOffset) {
throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
+ "startOffset=" + startOffset + ",endOffset=" + endOffset);
}
this.startOffset = startOffset;
this.endOffset = endOffset;

View File: DocInverterPerField.java

@ -22,6 +22,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.util.IOUtils;
/**
@ -80,6 +81,11 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
if (fieldType.omitNorms() && field.boost() != 1.0f) {
throw new UnsupportedOperationException("You cannot set an index-time boost: norms are omitted for field '" + field.name() + "'");
}
// only bother checking offsets if something will consume them.
// TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
final boolean checkOffsets = fieldType.indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
int lastStartOffset = 0;
if (i > 0) {
fieldState.position += docState.analyzer == null ? 0 : docState.analyzer.getPositionIncrementGap(fieldInfo.name);
@ -134,6 +140,20 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
if (posIncr == 0)
fieldState.numOverlap++;
if (checkOffsets) {
int startOffset = fieldState.offset + offsetAttribute.startOffset();
int endOffset = fieldState.offset + offsetAttribute.endOffset();
if (startOffset < 0 || endOffset < startOffset) {
throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
+ "startOffset=" + startOffset + ",endOffset=" + endOffset);
}
if (startOffset < lastStartOffset) {
throw new IllegalArgumentException("offsets must not go backwards startOffset="
+ startOffset + " is < lastStartOffset=" + lastStartOffset);
}
lastStartOffset = startOffset;
}
boolean success = false;
try {

View File: TestPostingsOffsets.java

@ -17,6 +17,7 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@ -39,6 +40,7 @@ import org.apache.lucene.search.FieldCache;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.English;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.lucene.util._TestUtil;
@ -382,6 +384,68 @@ public class TestPostingsOffsets extends LuceneTestCase {
riw.close();
dir.close();
}
// NOTE: the next two tests aren't that good as we need an EvilToken...
public void testNegativeOffsets() throws Exception {
try {
checkTokens(new Token[] {
makeToken("foo", 1, -1, -1)
});
fail();
} catch (IllegalArgumentException expected) {
//expected
}
}
public void testIllegalOffsets() throws Exception {
try {
checkTokens(new Token[] {
makeToken("foo", 1, 1, 0)
});
fail();
} catch (IllegalArgumentException expected) {
//expected
}
}
public void testBackwardsOffsets() throws Exception {
try {
checkTokens(new Token[] {
makeToken("foo", 1, 0, 3),
makeToken("foo", 1, 4, 7),
makeToken("foo", 0, 3, 6)
});
fail();
} catch (IllegalArgumentException expected) {
// expected
}
}
// TODO: more tests with other possibilities
private void checkTokens(Token[] tokens) throws IOException {
Directory dir = newDirectory();
RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);
boolean success = false;
try {
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
// store some term vectors for the checkindex cross-check
ft.setStoreTermVectors(true);
ft.setStoreTermVectorPositions(true);
ft.setStoreTermVectorOffsets(true);
Document doc = new Document();
doc.add(new Field("body", new CannedTokenStream(tokens), ft));
riw.addDocument(doc);
success = true;
} finally {
if (success) {
IOUtils.close(riw, dir);
} else {
IOUtils.closeWhileHandlingException(riw, dir);
}
}
}
private Token makeToken(String text, int posIncr, int startOffset, int endOffset) {
final Token t = new Token();

View File: BaseTokenStreamTestCase.java

@ -175,10 +175,6 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
if (offsetAtt != null) {
final int startOffset = offsetAtt.startOffset();
final int endOffset = offsetAtt.endOffset();
- assertTrue("startOffset must be >= 0", startOffset >= 0);
- assertTrue("endOffset must be >= 0", endOffset >= 0);
- assertTrue("endOffset must be >= startOffset, got startOffset=" + startOffset + ",endOffset=" + endOffset,
-   endOffset >= startOffset);
if (finalOffset != null) {
assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue());
assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + finalOffset.intValue(),

View File: ValidatingTokenFilter.java

@ -96,15 +96,6 @@ public final class ValidatingTokenFilter extends TokenFilter {
startOffset = offsetAtt.startOffset();
endOffset = offsetAtt.endOffset();
- if (startOffset < 0) {
-   throw new IllegalStateException(name + ": startOffset=" + startOffset + " is < 0");
- }
- if (endOffset < 0) {
-   throw new IllegalStateException(name + ": endOffset=" + endOffset + " is < 0");
- }
- if (endOffset < startOffset) {
-   throw new IllegalStateException(name + ": startOffset=" + startOffset + " is > endOffset=" + endOffset + " pos=" + pos + "; token=" + termAtt);
- }
if (offsetsAreCorrect && offsetAtt.startOffset() < lastStartOffset) {
throw new IllegalStateException(name + ": offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
}

View File: SpellingQueryConverter.java

@ -186,8 +186,8 @@ public class SpellingQueryConverter extends QueryConverter {
while (stream.incrementToken()) {
Token token = new Token();
token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
- token.setStartOffset(offset + offsetAtt.startOffset());
- token.setEndOffset(offset + offsetAtt.endOffset());
token.setOffset(offset + offsetAtt.startOffset(),
offset + offsetAtt.endOffset());
token.setFlags(flagsAttValue); //overwriting any flags already set...
token.setType(typeAtt.type());
token.setPayload(payloadAtt.getPayload());