diff --git a/src/java/org/apache/lucene/index/DocumentsWriter.java b/src/java/org/apache/lucene/index/DocumentsWriter.java
index 383dfd08aff..5f4a1d9f539 100644
--- a/src/java/org/apache/lucene/index/DocumentsWriter.java
+++ b/src/java/org/apache/lucene/index/DocumentsWriter.java
@@ -354,7 +354,7 @@ final class DocumentsWriter {
       state.tvfLocal.reset();
       state.fdtLocal.reset();
     }
-      
+
     docStoreSegment = null;
     files = null;
   } finally {
@@ -518,6 +518,7 @@ final class DocumentsWriter {
     int numAllFieldData;
     FieldData[] fieldDataHash;          // Hash FieldData instances by field name
     int fieldDataHashMask;
+    int maxTermHit;                     // Set to > 0 if this doc has a too-large term
 
     boolean doFlushAfter;
 
@@ -608,6 +609,7 @@ final class DocumentsWriter {
       numStoredFields = 0;
      numFieldData = 0;
       numVectorFields = 0;
+      maxTermHit = 0;
 
       List docFields = doc.getFields();
       final int numDocFields = docFields.size();
@@ -1483,17 +1485,23 @@ final class DocumentsWriter {
           getPostings(postingsFreeList);
         }
 
-        // Pull next free Posting from free list
-        p = postingsFreeList[--postingsFreeCount];
-
         final int textLen1 = 1+tokenTextLen;
         if (textLen1 + charPool.byteUpto > CHAR_BLOCK_SIZE) {
-          if (textLen1 > CHAR_BLOCK_SIZE)
-            throw new IllegalArgumentException("term length " + tokenTextLen + " exceeds max term length " + (CHAR_BLOCK_SIZE-1));
+          if (textLen1 > CHAR_BLOCK_SIZE) {
+            maxTermHit = tokenTextLen;
+            // Just skip this term; we will throw an
+            // exception after processing all accepted
+            // terms in the doc
+            return;
+          }
           charPool.nextBuffer();
         }
         final char[] text = charPool.buffer;
         final int textUpto = charPool.byteUpto;
+
+        // Pull next free Posting from free list
+        p = postingsFreeList[--postingsFreeCount];
+
         p.textStart = textUpto + charPool.byteOffset;
         charPool.byteUpto += textLen1;
 
@@ -2181,26 +2189,28 @@ final class DocumentsWriter {
 
   /** Returns true if the caller (IndexWriter) should now
    *  flush. */
-  boolean addDocument(Document doc, Analyzer analyzer)
+  int addDocument(Document doc, Analyzer analyzer)
     throws CorruptIndexException, IOException {
     return updateDocument(doc, analyzer, null);
   }
 
-  boolean updateDocument(Term t, Document doc, Analyzer analyzer)
+  int updateDocument(Term t, Document doc, Analyzer analyzer)
     throws CorruptIndexException, IOException {
     return updateDocument(doc, analyzer, t);
   }
 
-  boolean updateDocument(Document doc, Analyzer analyzer, Term delTerm)
+  int updateDocument(Document doc, Analyzer analyzer, Term delTerm)
     throws CorruptIndexException, IOException {
 
     // This call is synchronized but fast
     final ThreadState state = getThreadState(doc, delTerm);
     boolean success = false;
+    int maxTermHit;
     try {
       // This call is not synchronized and does all the work
       state.processDocument(analyzer);
       // This call synchronized but fast
+      maxTermHit = state.maxTermHit;
       finishDocument(state);
       success = true;
     } finally {
@@ -2209,7 +2219,11 @@ final class DocumentsWriter {
         abort();
       }
     }
-    return state.doFlushAfter || timeToFlushDeletes();
+
+    int status = maxTermHit<<1;
+    if (state.doFlushAfter || timeToFlushDeletes())
+      status += 1;
+    return status;
   }
 
   synchronized int getNumBufferedDeleteTerms() {
diff --git a/src/java/org/apache/lucene/index/IndexWriter.java b/src/java/org/apache/lucene/index/IndexWriter.java
index b886b3a5f71..809df3c9f20 100644
--- a/src/java/org/apache/lucene/index/IndexWriter.java
+++ b/src/java/org/apache/lucene/index/IndexWriter.java
@@ -1426,10 +1426,10 @@ public class IndexWriter {
    */
   public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {
     ensureOpen();
-    boolean doFlush = false;
+    int status = 0;
     boolean success = false;
     try {
-      doFlush = docWriter.addDocument(doc, analyzer);
+      status = docWriter.addDocument(doc, analyzer);
       success = true;
     } finally {
       if (!success) {
@@ -1446,8 +1446,9 @@ public class IndexWriter {
         }
       }
     }
-    if (doFlush)
+    if ((status & 1) != 0)
       flush(true, false);
+    checkMaxTermLength(status);
   }
 
   /**
@@ -1511,10 +1512,10 @@
   public void updateDocument(Term term, Document doc, Analyzer analyzer)
       throws CorruptIndexException, IOException {
     ensureOpen();
-    boolean doFlush = false;
+    int status = 0;
     boolean success = false;
     try {
-      doFlush = docWriter.updateDocument(term, doc, analyzer);
+      status = docWriter.updateDocument(term, doc, analyzer);
       success = true;
     } finally {
       if (!success) {
@@ -1531,8 +1532,17 @@
         }
       }
     }
-    if (doFlush)
+    if ((status & 1) != 0)
       flush(true, false);
+    checkMaxTermLength(status);
+  }
+
+  /** Throws IllegalArgumentException if the return status
+   *  from DocumentsWriter.{add,update}Document indicates
+   *  that a too-long term was encountered */
+  final private void checkMaxTermLength(int status) {
+    if (status > 1)
+      throw new IllegalArgumentException("at least one term (length " + (status>>1) + ") exceeds max term length " + (DocumentsWriter.CHAR_BLOCK_SIZE-1) + "; these terms were skipped");
   }
 
   // for test purpose
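For reference, the int now returned by DocumentsWriter.addDocument/updateDocument packs two results into one value: bit 0 carries the "caller should flush now" flag that the old boolean return carried, and the remaining bits carry the length of the longest skipped term (0 if none). A minimal sketch of the decoding, mirroring the logic IndexWriter applies below (local names are illustrative):

    int status = docWriter.addDocument(doc, analyzer);  // packed result
    boolean doFlush = (status & 1) != 0;                // bit 0: flush request
    int maxTermHit = status >> 1;                       // remaining bits: longest skipped term length, 0 if none

This is also why checkMaxTermLength tests status > 1: any skipped term makes maxTermHit >= 1 and hence status >= 2, regardless of the flush bit.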
diff --git a/src/test/org/apache/lucene/index/TestIndexWriter.java b/src/test/org/apache/lucene/index/TestIndexWriter.java
index 8e709a1c4c0..0e56cd6c36f 100644
--- a/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/src/test/org/apache/lucene/index/TestIndexWriter.java
@@ -28,8 +28,6 @@ import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Hits;
 import org.apache.lucene.search.TermQuery;
@@ -221,12 +219,8 @@ public class TestIndexWriter extends LuceneTestCase
       methodName = "addIndexesNoOptimize(Directory[])";
     }
 
-    int cycleCount = 0;
-
     while(!done) {
 
-      cycleCount++;
-
       // Make a new dir that will enforce disk usage:
       MockRAMDirectory dir = new MockRAMDirectory(startDir);
       writer = new IndexWriter(dir, autoCommit, new WhitespaceAnalyzer(), false);
@@ -524,7 +518,7 @@
     String[] startFiles = dir.list();
     SegmentInfos infos = new SegmentInfos();
     infos.read(dir);
-    IndexFileDeleter d = new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null);
+    new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null);
     String[] endFiles = dir.list();
 
     Arrays.sort(startFiles);
@@ -543,17 +537,44 @@
 
     RAMDirectory dir = new RAMDirectory();
     IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);
-    char[] chars = new char[16384];
+    char[] chars = new char[16383];
     Arrays.fill(chars, 'x');
     Document doc = new Document();
-    String contents = "a b c " + new String(chars);
+    final String bigTerm = new String(chars);
+
+    // Max length term is 16383, so this contents produces
+    // a too-long term:
+    String contents = "abc xyz x" + bigTerm;
     doc.add(new Field("content", contents, Field.Store.NO, Field.Index.TOKENIZED));
     try {
       writer.addDocument(doc);
       fail("did not hit expected exception");
     } catch (IllegalArgumentException e) {
     }
+
+    // Make sure we can add another normal document
+    doc = new Document();
+    doc.add(new Field("content", "abc bbb ccc", Field.Store.NO, Field.Index.TOKENIZED));
+    writer.addDocument(doc);
     writer.close();
+
+    IndexReader reader = IndexReader.open(dir);
+    // Make sure all terms < max size were indexed
+    assertEquals(2, reader.docFreq(new Term("content", "abc")));
+    assertEquals(1, reader.docFreq(new Term("content", "bbb")));
+    reader.close();
+
+    // Make sure we can add a document with exactly the
+    // maximum length term, and search on that term:
+    doc = new Document();
+    doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.TOKENIZED));
+    writer = new IndexWriter(dir, new StandardAnalyzer());
+    writer.addDocument(doc);
+    writer.close();
+    reader = IndexReader.open(dir);
+    assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
+    reader.close();
+
     dir.close();
   }
 
@@ -1342,7 +1363,6 @@
   public void testDiverseDocs() throws IOException {
     RAMDirectory dir = new RAMDirectory();
     IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
-    long t0 = System.currentTimeMillis();
     writer.setRAMBufferSizeMB(0.5);
     Random rand = new Random(31415);
     for(int i=0;i<3;i++) {
@@ -1381,7 +1401,6 @@
     }
     writer.close();
 
-    long t1 = System.currentTimeMillis();
     IndexSearcher searcher = new IndexSearcher(dir);
     Hits hits = searcher.search(new TermQuery(new Term("field", "aaa")));
     assertEquals(300, hits.length());
@@ -1491,7 +1510,6 @@
       addDoc(writer);
     }
     writer.close();
-    IndexReader reader = IndexReader.open(dir);
     Term searchTerm = new Term("content", "aaa");
     IndexSearcher searcher = new IndexSearcher(dir);
     Hits hits = searcher.search(new TermQuery(searchTerm));
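For callers, the practical effect of the change is that an oversized term no longer aborts the rest of the document: the document's other terms are indexed, and the IllegalArgumentException arrives only after the doc has been processed (the test above exercises exactly this). A minimal caller-side sketch, assuming an application that prefers to log and continue; the class, field names, and contents are illustrative only:

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.RAMDirectory;
    import java.util.Arrays;

    public class AddDocLeniently {
      public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);

        // A term longer than 16383 chars (CHAR_BLOCK_SIZE-1) triggers the exception
        char[] big = new char[20000];
        Arrays.fill(big, 'x');
        Document doc = new Document();
        doc.add(new Field("content", "abc " + new String(big), Field.Store.NO, Field.Index.TOKENIZED));

        try {
          writer.addDocument(doc);
        } catch (IllegalArgumentException e) {
          // The doc was still added; only the too-long term(s) were skipped
          System.err.println("skipped oversized term(s): " + e.getMessage());
        }
        writer.close();
      }
    }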