LUCENE-5980: don't let document length overflow

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1629835 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2014-10-07 09:15:44 +00:00
parent f6cbe636bb
commit 6f480ae9dd
3 changed files with 51 additions and 2 deletions
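For context: the "document length" here is the per-field token counter in FieldInvertState (invertState.length in the hunks below), a Java int that feeds length normalization. A field producing more than Integer.MAX_VALUE tokens used to wrap that counter negative and silently corrupt the statistic; this commit turns the wrap into a hard error. A plain-Java sketch of the failure mode (the demo class name is ours, not Lucene's):

    // Sketch of the bug this commit guards against: an int token counter
    // wraps to a negative value on the 2^31-th increment.
    public class LengthOverflowDemo {
      public static void main(String[] args) {
        int length = Integer.MAX_VALUE; // counter after 2147483647 tokens
        length++;                       // one more token...
        System.out.println(length);     // -2147483648
        // A counter that only ever increments can go negative only by
        // wrapping, so the fix can detect overflow with a bare `< 0` test.
        System.out.println(length < 0); // true
      }
    }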

lucene/CHANGES.txt

@@ -171,6 +171,8 @@ Bug Fixes
   not have the regular "spinlock" of DirectoryReader.open. It now implements
   Closeable and you must close it to release the lock. (Mike McCandless, Robert Muir)
 
+* LUCENE-5980: Don't let document length overflow. (Robert Muir)
+
 Documentation
 
 * LUCENE-5392: Add/improve analysis package documentation to reflect

lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java

@@ -622,6 +622,10 @@ final class DefaultIndexingChain extends DocConsumer {
           invertState.lastStartOffset = startOffset;
         }
 
+        invertState.length++;
+        if (invertState.length < 0) {
+          throw new IllegalArgumentException("too many tokens in field '" + field.name() + "'");
+        }
         //System.out.println("  term=" + invertState.termAttribute);
 
         // If we hit an exception in here, we abort
@@ -633,8 +637,6 @@ final class DefaultIndexingChain extends DocConsumer {
         aborting = true;
         termsHashPerField.add();
         aborting = false;
-
-        invertState.length++;
       }
 
       // trigger streams to perform end-of-stream operations
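Two things change in DefaultIndexingChain: the overflow check is added, and the increment moves from the bottom of the per-token loop (after termsHashPerField.add()) to the top, so an overflowing field is rejected before the token reaches the terms hash. The committed guard leans on wraparound; the same invariant could be written with an explicit overflow check, as in this sketch (Java 8+ Math.addExact; incrementLength and fieldName are hypothetical names, not Lucene API):

    // NOT the committed fix: an equivalent guard stated with Math.addExact,
    // which throws ArithmeticException instead of wrapping on int overflow.
    static int incrementLength(int length, String fieldName) {
      try {
        return Math.addExact(length, 1);
      } catch (ArithmeticException e) {
        throw new IllegalArgumentException("too many tokens in field '" + fieldName + "'", e);
      }
    }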

lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java

@@ -35,6 +35,8 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -46,6 +48,7 @@ import org.apache.lucene.document.SortedSetDocValuesField;
 import org.apache.lucene.document.StoredField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.PhraseQuery;
@@ -2118,6 +2121,48 @@ public class TestIndexWriterExceptions extends LuceneTestCase {
     dir.close();
   }
 
+  // kind of slow, but omits positions, so just CPU
+  @Nightly
+  public void testTooManyTokens() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
+    Document doc = new Document();
+    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+    doc.add(new Field("foo", new TokenStream() {
+      CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+      PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+      long num = 0;
+
+      @Override
+      public boolean incrementToken() throws IOException {
+        if (num == Integer.MAX_VALUE + 1L) { // 1L: keep the sum in long arithmetic so it cannot wrap
+          return false;
+        }
+        clearAttributes();
+        if (num == 0) {
+          posIncAtt.setPositionIncrement(1);
+        } else {
+          posIncAtt.setPositionIncrement(0); // same position: only the length counter grows
+        }
+        termAtt.append("a");
+        num++;
+        if (VERBOSE && num % 1000000 == 0) {
+          System.out.println("indexed: " + num);
+        }
+        return true;
+      }
+    }, ft));
+    try {
+      iw.addDocument(doc);
+      fail("didn't hit exception");
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains("too many tokens"));
+    }
+    iw.close();
+    dir.close();
+  }
+
   public void testExceptionDuringRollback() throws Exception {
     // currently: fail in two different places
     final String messageToFailOn = random().nextBoolean() ?
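The test's TokenStream emits the same single-character term 2^31 times, all at position increment 0, so memory stays flat and (with DOCS_AND_FREQS, i.e. no positions indexed) the run is purely CPU-bound, hence @Nightly. Applications that really do feed enormous streams should cap fields well below the new hard limit, for example with LimitTokenCountAnalyzer. A sketch (the 10M budget is an arbitrary example; the no-arg StandardAnalyzer and IndexWriterConfig(Analyzer) constructors assume the trunk/5.0 API of this commit's era):

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.miscellaneous.LimitTokenCountAnalyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriterConfig;

    public class CappedTokensConfig {
      public static IndexWriterConfig newConfig() {
        // LimitTokenCountAnalyzer stops consuming a field's tokens once the
        // budget is spent, keeping the "too many tokens" error unreachable.
        Analyzer capped = new LimitTokenCountAnalyzer(new StandardAnalyzer(), 10_000_000);
        return new IndexWriterConfig(capped);
      }
    }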