From 27c4c6a3b1db0a938c0b568319cf8408065d68d0 Mon Sep 17 00:00:00 2001
From: Michael McCandless
Date: Sat, 18 Aug 2007 23:15:14 +0000
Subject: [PATCH] LUCENE-985: throw clearer exception when term is too long (>
16383 chars)
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@567338 13f79535-47bb-0310-9956-ffa450edef68
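Applications that may index pathological tokens can also keep such
terms out of the token stream entirely rather than catching the new
exception. A minimal sketch using the existing LengthFilter (not part
of this change; the analyzer class name is made up):

    import java.io.Reader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.LengthFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    // Hypothetical wrapper: drops tokens longer than the 16383-char
    // maximum before they ever reach DocumentsWriter.
    class TermLengthGuardAnalyzer extends Analyzer {
      private final Analyzer delegate = new StandardAnalyzer();
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new LengthFilter(delegate.tokenStream(fieldName, reader), 1, 16383);
      }
    }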
---
 CHANGES.txt                                  |  5 +++++
 .../apache/lucene/index/DocumentsWriter.java |  5 ++++-
 .../org/apache/lucene/index/IndexWriter.java |  4 ++++
 .../apache/lucene/index/TestIndexWriter.java | 23 +++++++++++++++++++++++
 4 files changed, 36 insertions(+), 1 deletion(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 131e43bcbf5..98f3270964a 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -76,6 +76,11 @@ Bug fixes
close any streams they had opened if an exception is hit in the
constructor. (Ning Li via Mike McCandless)
+12. LUCENE-985: If an extremely long term is in a doc (> 16383 chars),
+ we now throw an IllegalArgumentException saying the term is too
+ long, instead of a cryptic ArrayIndexOutOfBoundsException. (Karl
+ Wettin via Mike McCandless)
+
New features
1. LUCENE-906: Elision filter for French.
diff --git a/src/java/org/apache/lucene/index/DocumentsWriter.java b/src/java/org/apache/lucene/index/DocumentsWriter.java
index bf002099ca7..acf8a8a21c6 100644
--- a/src/java/org/apache/lucene/index/DocumentsWriter.java
+++ b/src/java/org/apache/lucene/index/DocumentsWriter.java
@@ -1452,8 +1452,11 @@ final class DocumentsWriter {
p = postingsFreeList[--postingsFreeCount];
final int textLen1 = 1+tokenTextLen;
- if (textLen1 + charPool.byteUpto > CHAR_BLOCK_SIZE)
+ if (textLen1 + charPool.byteUpto > CHAR_BLOCK_SIZE) {
+ if (textLen1 > CHAR_BLOCK_SIZE)
+ throw new IllegalArgumentException("term length " + tokenTextLen + " exceeds max term length " + (CHAR_BLOCK_SIZE-1));
charPool.nextBuffer();
+ }
final char[] text = charPool.buffer;
final int textUpto = charPool.byteUpto;
p.textStart = textUpto + charPool.byteOffset;
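Commentary on the hunk above (not part of the patch): DocumentsWriter
interns each term into a fixed-size char buffer of CHAR_BLOCK_SIZE
(16384) chars, reserving one extra char per term as an end-of-term
marker (hence textLen1 = 1+tokenTextLen). A term longer than
CHAR_BLOCK_SIZE-1 = 16383 chars cannot fit even in a freshly allocated
buffer, so the old code eventually indexed past the end of the array;
the new inner check fails fast with a descriptive message instead. The
guard in isolation (class and method names are made up):

    // Standalone sketch of the new length check.
    class TermLengthCheck {
      static final int CHAR_BLOCK_SIZE = 16384;  // buffer size in DocumentsWriter

      static void checkTermLength(int tokenTextLen) {
        final int textLen1 = 1 + tokenTextLen;   // +1 for the end-of-term marker
        if (textLen1 > CHAR_BLOCK_SIZE)          // cannot fit in any single buffer
          throw new IllegalArgumentException("term length " + tokenTextLen
              + " exceeds max term length " + (CHAR_BLOCK_SIZE - 1));
      }
    }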
diff --git a/src/java/org/apache/lucene/index/IndexWriter.java b/src/java/org/apache/lucene/index/IndexWriter.java
index 7cb68faf7e3..60244bb78df 100644
--- a/src/java/org/apache/lucene/index/IndexWriter.java
+++ b/src/java/org/apache/lucene/index/IndexWriter.java
@@ -1091,6 +1091,10 @@ public class IndexWriter {
* temporary space usage) then the maximum free disk space
* required is the same as {@link #optimize}.
*
+ * Note that each term in the document can be no longer
+ * than 16383 characters; otherwise, an
+ * IllegalArgumentException will be thrown.
+ *
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*/
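From the caller's side, the documented limit can be handled per
document rather than aborting a whole indexing run. A sketch (the
helper class and method names are made up):

    import java.io.IOException;

    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexWriter;

    class SafeAdd {
      // Hypothetical helper: adds doc, skipping it when some term
      // exceeds the 16383-char maximum instead of failing the run.
      static boolean tryAdd(IndexWriter writer, Document doc) throws IOException {
        try {
          writer.addDocument(doc);
          return true;
        } catch (IllegalArgumentException e) {
          return false;  // term too long; caller may log and continue
        }
      }
    }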
diff --git a/src/test/org/apache/lucene/index/TestIndexWriter.java b/src/test/org/apache/lucene/index/TestIndexWriter.java
index 2fb599bbcfc..39a8f585420 100644
--- a/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/src/test/org/apache/lucene/index/TestIndexWriter.java
@@ -25,6 +25,7 @@ import java.util.Random;
import junit.framework.TestCase;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
@@ -490,6 +491,28 @@ public class TestIndexWriter extends TestCase
}
}
+ /**
+ * Make sure we get a friendly exception for a wicked
+ * long term.
+ */
+ public void testWickedLongTerm() throws IOException {
+ RAMDirectory dir = new RAMDirectory();
+ IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);
+
+ char[] chars = new char[16384];
+ Arrays.fill(chars, 'x');
+ Document doc = new Document();
+ String contents = "a b c " + new String(chars);
+ doc.add(new Field("content", contents, Field.Store.NO, Field.Index.TOKENIZED));
+ try {
+ writer.addDocument(doc);
+ fail("did not hit expected exception");
+ } catch (IllegalArgumentException e) { // expected: term exceeds max length
+ }
+ writer.close();
+ dir.close();
+ }
+
/**
* Make sure optimize doesn't use any more than 1X
* starting index size as its temporary free space