From 27c4c6a3b1db0a938c0b568319cf8408065d68d0 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Sat, 18 Aug 2007 23:15:14 +0000 Subject: [PATCH] LUCENE-985: throw clearer exception when term is too long (> 16383 chars) git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@567338 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 5 ++++ .../apache/lucene/index/DocumentsWriter.java | 5 +++- .../org/apache/lucene/index/IndexWriter.java | 4 ++++ .../apache/lucene/index/TestIndexWriter.java | 23 +++++++++++++++++++ 4 files changed, 36 insertions(+), 1 deletion(-) diff --git a/CHANGES.txt b/CHANGES.txt index 131e43bcbf5..98f3270964a 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -76,6 +76,11 @@ Bug fixes close any streams they had opened if an exception is hit in the constructor. (Ning Li via Mike McCandless) +12. LUCENE-985: If an extremely long term is in a doc (> 16383 chars), + we now throw an IllegalArgumentException saying the term is too + long, instead of cryptic ArrayIndexOutOfBoundsException. (Karl + Wettin via Mike McCandless) + New features 1. LUCENE-906: Elision filter for French. diff --git a/src/java/org/apache/lucene/index/DocumentsWriter.java b/src/java/org/apache/lucene/index/DocumentsWriter.java index bf002099ca7..acf8a8a21c6 100644 --- a/src/java/org/apache/lucene/index/DocumentsWriter.java +++ b/src/java/org/apache/lucene/index/DocumentsWriter.java @@ -1452,8 +1452,11 @@ final class DocumentsWriter { p = postingsFreeList[--postingsFreeCount]; final int textLen1 = 1+tokenTextLen; - if (textLen1 + charPool.byteUpto > CHAR_BLOCK_SIZE) + if (textLen1 + charPool.byteUpto > CHAR_BLOCK_SIZE) { + if (textLen1 > CHAR_BLOCK_SIZE) + throw new IllegalArgumentException("term length " + tokenTextLen + " exceeds max term length " + (CHAR_BLOCK_SIZE-1)); charPool.nextBuffer(); + } final char[] text = charPool.buffer; final int textUpto = charPool.byteUpto; p.textStart = textUpto + charPool.byteOffset; diff --git a/src/java/org/apache/lucene/index/IndexWriter.java b/src/java/org/apache/lucene/index/IndexWriter.java index 7cb68faf7e3..60244bb78df 100644 --- a/src/java/org/apache/lucene/index/IndexWriter.java +++ b/src/java/org/apache/lucene/index/IndexWriter.java @@ -1091,6 +1091,10 @@ public class IndexWriter { * temporary space usage) then the maximum free disk space * required is the same as {@link #optimize}.
    *
+   * <p>Note that each term in the document can be no longer
+   * than 16383 characters, otherwise an
+   * IllegalArgumentException will be thrown.</p>
+   *
    * @throws CorruptIndexException if the index is corrupt
    * @throws IOException if there is a low-level IO error
    */
diff --git a/src/test/org/apache/lucene/index/TestIndexWriter.java b/src/test/org/apache/lucene/index/TestIndexWriter.java
index 2fb599bbcfc..39a8f585420 100644
--- a/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/src/test/org/apache/lucene/index/TestIndexWriter.java
@@ -25,6 +25,7 @@ import java.util.Random;
 import junit.framework.TestCase;
 
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
@@ -490,6 +491,28 @@ public class TestIndexWriter extends TestCase
     }
   }
 
+  /**
+   * Make sure we get a friendly exception for a wicked
+   * long term.
+   */
+  public void testWickedLongTerm() throws IOException {
+    RAMDirectory dir = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);
+
+    char[] chars = new char[16384];
+    Arrays.fill(chars, 'x');
+    Document doc = new Document();
+    String contents = "a b c " + new String(chars);
+    doc.add(new Field("content", contents, Field.Store.NO, Field.Index.TOKENIZED));
+    try {
+      writer.addDocument(doc);
+      fail("did not hit expected exception");
+    } catch (IllegalArgumentException e) {
+    }
+    writer.close();
+    dir.close();
+  }
+
   /**
    * Make sure optimize doesn't use any more than 1X
    * starting index size as its temporary free space
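
For callers, the visible effect of this change is that IndexWriter.addDocument() now throws an IllegalArgumentException, with a message naming the offending length, whenever an analyzer produces a term longer than 16383 characters (CHAR_BLOCK_SIZE - 1), instead of failing deep inside DocumentsWriter with a cryptic ArrayIndexOutOfBoundsException. A minimal sketch of a caller that tolerates such documents, assembled only from APIs that already appear in this patch (the class name, field name, and field contents are illustrative):

    import java.io.IOException;

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.RAMDirectory;

    public class AddDocumentExample {
      public static void main(String[] args) throws IOException {
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);

        Document doc = new Document();
        doc.add(new Field("content", "... possibly untrusted text ...",
                          Field.Store.NO, Field.Index.TOKENIZED));
        try {
          writer.addDocument(doc);
        } catch (IllegalArgumentException e) {
          // As of LUCENE-985, a term over 16383 chars is reported here
          // with a clear message rather than surfacing as an
          // ArrayIndexOutOfBoundsException from DocumentsWriter.
          System.err.println("document rejected: " + e.getMessage());
        }
        writer.close();
        dir.close();
      }
    }

If over-long terms are expected in normal data, an alternative is to drop them during analysis (for example with org.apache.lucene.analysis.LengthFilter) so that addDocument never sees a term above the limit.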