LUCENE-5980: don't let document length overflow

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1629835 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2014-10-07 09:15:44 +00:00
parent f6cbe636bb
commit 6f480ae9dd
3 changed files with 51 additions and 2 deletions
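For context: the "document length" here is the per-field token counter in FieldInvertState (invertState.length in the hunks below), a Java int that feeds length normalization. A field producing more than Integer.MAX_VALUE tokens used to wrap that counter negative and silently corrupt the statistic; this commit turns the wrap into a hard error. A plain-Java sketch of the failure mode (the demo class name is ours, not Lucene's):

    // Sketch of the bug this commit guards against: an int token counter
    // wraps to a negative value on the 2^31-th increment.
    public class LengthOverflowDemo {
      public static void main(String[] args) {
        int length = Integer.MAX_VALUE; // counter after 2147483647 tokens
        length++;                       // one more token...
        System.out.println(length);     // -2147483648
        // A counter that only ever increments can go negative only by
        // wrapping, so the fix can detect overflow with a bare `< 0` test.
        System.out.println(length < 0); // true
      }
    }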

lucene/CHANGES.txt

@@ -171,6 +171,8 @@ Bug Fixes
   not have the regular "spinlock" of DirectoryReader.open. It now implements
   Closeable and you must close it to release the lock. (Mike McCandless, Robert Muir)
 
+* LUCENE-5980: Don't let document length overflow. (Robert Muir)
+
 Documentation
 
 * LUCENE-5392: Add/improve analysis package documentation to reflect

lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java

@@ -622,6 +622,10 @@ final class DefaultIndexingChain extends DocConsumer {
           invertState.lastStartOffset = startOffset;
         }
 
+        invertState.length++;
+        if (invertState.length < 0) {
+          throw new IllegalArgumentException("too many tokens in field '" + field.name() + "'");
+        }
         //System.out.println("  term=" + invertState.termAttribute);
 
         // If we hit an exception in here, we abort
@@ -633,8 +637,6 @@ final class DefaultIndexingChain extends DocConsumer {
         aborting = true;
         termsHashPerField.add();
         aborting = false;
-
-        invertState.length++;
       }
 
       // trigger streams to perform end-of-stream operations
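Two things change in DefaultIndexingChain: the overflow check is added, and the increment moves from the bottom of the per-token loop (after termsHashPerField.add()) to the top, so an overflowing field is rejected before the token reaches the terms hash. The committed guard leans on wraparound; the same invariant could be written with an explicit overflow check, as in this sketch (Java 8+ Math.addExact; incrementLength and fieldName are hypothetical names, not Lucene API):

    // NOT the committed fix: an equivalent guard stated with Math.addExact,
    // which throws ArithmeticException instead of wrapping on int overflow.
    static int incrementLength(int length, String fieldName) {
      try {
        return Math.addExact(length, 1);
      } catch (ArithmeticException e) {
        throw new IllegalArgumentException("too many tokens in field '" + fieldName + "'", e);
      }
    }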

lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java

@@ -35,6 +35,8 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -46,6 +48,7 @@ import org.apache.lucene.document.SortedSetDocValuesField;
 import org.apache.lucene.document.StoredField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.PhraseQuery;
@@ -2118,6 +2121,48 @@ public class TestIndexWriterExceptions extends LuceneTestCase {
     dir.close();
   }
 
+  // kind of slow, but omits positions, so just CPU
+  @Nightly
+  public void testTooManyTokens() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
+    Document doc = new Document();
+    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+    doc.add(new Field("foo", new TokenStream() {
+      CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+      PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+      long num = 0;
+
+      @Override
+      public boolean incrementToken() throws IOException {
+        if (num == Integer.MAX_VALUE + 1L) { // 1L: keep the sum in long arithmetic so it cannot wrap
+          return false;
+        }
+        clearAttributes();
+        if (num == 0) {
+          posIncAtt.setPositionIncrement(1);
+        } else {
+          posIncAtt.setPositionIncrement(0); // same position: only the length counter grows
+        }
+        termAtt.append("a");
+        num++;
+        if (VERBOSE && num % 1000000 == 0) {
+          System.out.println("indexed: " + num);
+        }
+        return true;
+      }
+    }, ft));
+    try {
+      iw.addDocument(doc);
+      fail("didn't hit exception");
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains("too many tokens"));
+    }
+    iw.close();
+    dir.close();
+  }
+
   public void testExceptionDuringRollback() throws Exception {
     // currently: fail in two different places
     final String messageToFailOn = random().nextBoolean() ?
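The test's TokenStream emits the same single-character term 2^31 times, all at position increment 0, so memory stays flat and (with DOCS_AND_FREQS, i.e. no positions indexed) the run is purely CPU-bound, hence @Nightly. Applications that really do feed enormous streams should cap fields well below the new hard limit, for example with LimitTokenCountAnalyzer. A sketch (the 10M budget is an arbitrary example; the no-arg StandardAnalyzer and IndexWriterConfig(Analyzer) constructors assume the trunk/5.0 API of this commit's era):

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.miscellaneous.LimitTokenCountAnalyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriterConfig;

    public class CappedTokensConfig {
      public static IndexWriterConfig newConfig() {
        // LimitTokenCountAnalyzer stops consuming a field's tokens once the
        // budget is spent, keeping the "too many tokens" error unreachable.
        Analyzer capped = new LimitTokenCountAnalyzer(new StandardAnalyzer(), 10_000_000);
        return new IndexWriterConfig(capped);
      }
    }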