LUCENE-1072: make sure IndexWriter is still usable after hitting a too-long term

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@600465 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2007-12-03 10:09:10 +00:00
parent 11a7bf835b
commit ddf136d928
3 changed files with 70 additions and 28 deletions
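
For orientation, here is the contract this commit establishes, sketched against the new test at the bottom of the diff (docWithHugeTerm and normalDoc are placeholder names, not part of the commit): a document containing a term longer than 16383 chars makes addDocument throw IllegalArgumentException, but the document's remaining terms are still indexed and the writer stays usable.

    // Sketch only; placeholder docs, mirroring the new test below.
    try {
        writer.addDocument(docWithHugeTerm);   // contains one term > 16383 chars
    } catch (IllegalArgumentException e) {
        // Only the oversized term was skipped; the doc's other terms were indexed.
    }
    writer.addDocument(normalDoc);             // the writer is still usable
    writer.close();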

DocumentsWriter.java

@@ -354,7 +354,7 @@ final class DocumentsWriter {
state.tvfLocal.reset();
state.fdtLocal.reset();
}
docStoreSegment = null;
files = null;
} finally {
@@ -518,6 +518,7 @@ final class DocumentsWriter {
int numAllFieldData;
FieldData[] fieldDataHash; // Hash FieldData instances by field name
int fieldDataHashMask;
+int maxTermHit; // Set to > 0 if this doc has a too-large term
boolean doFlushAfter;
@@ -608,6 +609,7 @@ final class DocumentsWriter {
numStoredFields = 0;
numFieldData = 0;
numVectorFields = 0;
+maxTermHit = 0;
List docFields = doc.getFields();
final int numDocFields = docFields.size();
@@ -1483,17 +1485,23 @@ final class DocumentsWriter {
getPostings(postingsFreeList);
}
-// Pull next free Posting from free list
-p = postingsFreeList[--postingsFreeCount];
final int textLen1 = 1+tokenTextLen;
if (textLen1 + charPool.byteUpto > CHAR_BLOCK_SIZE) {
-if (textLen1 > CHAR_BLOCK_SIZE)
-throw new IllegalArgumentException("term length " + tokenTextLen + " exceeds max term length " + (CHAR_BLOCK_SIZE-1));
+if (textLen1 > CHAR_BLOCK_SIZE) {
+maxTermHit = tokenTextLen;
+// Just skip this term; we will throw an
+// exception after processing all accepted
+// terms in the doc
+return;
+}
charPool.nextBuffer();
}
final char[] text = charPool.buffer;
final int textUpto = charPool.byteUpto;
+// Pull next free Posting from free list
+p = postingsFreeList[--postingsFreeCount];
p.textStart = textUpto + charPool.byteOffset;
charPool.byteUpto += textLen1;
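
A note on the arithmetic above: a term of tokenTextLen chars occupies textLen1 = 1 + tokenTextLen chars in the char pool (the extra char holds the end-of-term marker), so the longest indexable term is CHAR_BLOCK_SIZE - 1 chars; with CHAR_BLOCK_SIZE at 16384 that gives the 16383 limit the new test exercises. Note also that the two "Pull next free Posting" lines move below the check, presumably so the early return on a skipped term no longer pops (and leaks) an entry from the postings free list.

    // Illustration only (assumes CHAR_BLOCK_SIZE == 16384, consistent with the
    // 16383-char maximum in the exception message and the new test):
    final int CHAR_BLOCK_SIZE = 16384;
    int tokenTextLen = 16384;                     // one char over the limit
    int textLen1 = 1 + tokenTextLen;              // +1 for the end-of-term marker
    boolean skipped = textLen1 > CHAR_BLOCK_SIZE; // true -> term skipped, maxTermHit set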
@@ -2181,26 +2189,28 @@ final class DocumentsWriter {
/** Returns true if the caller (IndexWriter) should now
* flush. */
-boolean addDocument(Document doc, Analyzer analyzer)
+int addDocument(Document doc, Analyzer analyzer)
throws CorruptIndexException, IOException {
return updateDocument(doc, analyzer, null);
}
-boolean updateDocument(Term t, Document doc, Analyzer analyzer)
+int updateDocument(Term t, Document doc, Analyzer analyzer)
throws CorruptIndexException, IOException {
return updateDocument(doc, analyzer, t);
}
-boolean updateDocument(Document doc, Analyzer analyzer, Term delTerm)
+int updateDocument(Document doc, Analyzer analyzer, Term delTerm)
throws CorruptIndexException, IOException {
// This call is synchronized but fast
final ThreadState state = getThreadState(doc, delTerm);
boolean success = false;
+int maxTermHit;
try {
// This call is not synchronized and does all the work
state.processDocument(analyzer);
// This call is synchronized but fast
+maxTermHit = state.maxTermHit;
finishDocument(state);
success = true;
} finally {
@@ -2209,7 +2219,11 @@
abort();
}
}
-return state.doFlushAfter || timeToFlushDeletes();
+int status = maxTermHit<<1;
+if (state.doFlushAfter || timeToFlushDeletes())
+status += 1;
+return status;
}
synchronized int getNumBufferedDeleteTerms() {
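
The three methods above now return a small bit-packed int instead of a boolean: bit 0 carries the old "caller should flush" flag, and the remaining bits carry the length of a skipped too-long term (zero if none). A caller decodes it like this (mirroring the IndexWriter changes in the next file):

    int status = docWriter.addDocument(doc, analyzer);
    boolean doFlush = (status & 1) != 0;   // bit 0: time to flush
    int maxTermHit = status >> 1;          // upper bits: skipped term's length, 0 if none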

IndexWriter.java

@@ -1426,10 +1426,10 @@ public class IndexWriter {
*/
public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {
ensureOpen();
-boolean doFlush = false;
+int status = 0;
boolean success = false;
try {
-doFlush = docWriter.addDocument(doc, analyzer);
+status = docWriter.addDocument(doc, analyzer);
success = true;
} finally {
if (!success) {
@@ -1446,8 +1446,9 @@
}
}
}
-if (doFlush)
+if ((status & 1) != 0)
flush(true, false);
+checkMaxTermLength(status);
}
/**
@@ -1511,10 +1512,10 @@ public class IndexWriter {
public void updateDocument(Term term, Document doc, Analyzer analyzer)
throws CorruptIndexException, IOException {
ensureOpen();
-boolean doFlush = false;
+int status = 0;
boolean success = false;
try {
-doFlush = docWriter.updateDocument(term, doc, analyzer);
+status = docWriter.updateDocument(term, doc, analyzer);
success = true;
} finally {
if (!success) {
@@ -1531,8 +1532,17 @@
}
}
}
-if (doFlush)
+if ((status & 1) != 0)
flush(true, false);
+checkMaxTermLength(status);
}
+/** Throws IllegalArgumentException if the return status
+ * from DocumentsWriter.{add,update}Document indicates
+ * that a too-long term was encountered */
+final private void checkMaxTermLength(int status) {
+if (status > 1)
+throw new IllegalArgumentException("at least one term (length " + (status>>1) + ") exceeds max term length " + (DocumentsWriter.CHAR_BLOCK_SIZE-1) + "; these terms were skipped");
+}
// for test purpose
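
Note the ordering in both callers: the flush requested by bit 0 of status runs before checkMaxTermLength throws, so a pending flush is never lost to the exception; status > 1 simply means the skipped-term length in the upper bits is non-zero.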

TestIndexWriter.java

@@ -28,8 +28,6 @@ import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.TermQuery;
@@ -221,12 +219,8 @@ public class TestIndexWriter extends LuceneTestCase
methodName = "addIndexesNoOptimize(Directory[])";
}
-int cycleCount = 0;
while(!done) {
-cycleCount++;
// Make a new dir that will enforce disk usage:
MockRAMDirectory dir = new MockRAMDirectory(startDir);
writer = new IndexWriter(dir, autoCommit, new WhitespaceAnalyzer(), false);
@@ -524,7 +518,7 @@ public class TestIndexWriter extends LuceneTestCase
String[] startFiles = dir.list();
SegmentInfos infos = new SegmentInfos();
infos.read(dir);
-IndexFileDeleter d = new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null);
+new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null);
String[] endFiles = dir.list();
Arrays.sort(startFiles);
@@ -543,17 +537,44 @@ public class TestIndexWriter extends LuceneTestCase
RAMDirectory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);
-char[] chars = new char[16384];
+char[] chars = new char[16383];
Arrays.fill(chars, 'x');
Document doc = new Document();
String contents = "a b c " + new String(chars);
final String bigTerm = new String(chars);
// Max length term is 16383, so this contents produces
// a too-long term:
String contents = "abc xyz x" + bigTerm;
doc.add(new Field("content", contents, Field.Store.NO, Field.Index.TOKENIZED));
try {
writer.addDocument(doc);
fail("did not hit expected exception");
} catch (IllegalArgumentException e) {
}
+// Make sure we can add another normal document
+doc = new Document();
+doc.add(new Field("content", "abc bbb ccc", Field.Store.NO, Field.Index.TOKENIZED));
+writer.addDocument(doc);
writer.close();
+IndexReader reader = IndexReader.open(dir);
+// Make sure all terms < max size were indexed
+assertEquals(2, reader.docFreq(new Term("content", "abc")));
+assertEquals(1, reader.docFreq(new Term("content", "bbb")));
+reader.close();
+// Make sure we can add a document with exactly the
+// maximum length term, and search on that term:
+doc = new Document();
+doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.TOKENIZED));
+writer = new IndexWriter(dir, new StandardAnalyzer());
+writer.addDocument(doc);
+writer.close();
+reader = IndexReader.open(dir);
+assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
+reader.close();
dir.close();
}
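
Applications that want to avoid the exception altogether can drop oversized tokens during analysis instead; a hypothetical guard (illustrative only, not part of this commit) using org.apache.lucene.analysis.LengthFilter:

    // Drop tokens longer than the max term length before they reach IndexWriter.
    // Assumes the 2.3-era analysis API: Analyzer.tokenStream(String, java.io.Reader).
    Analyzer guarded = new Analyzer() {
        public TokenStream tokenStream(String fieldName, Reader reader) {
            return new LengthFilter(new WhitespaceTokenizer(reader), 1, 16383);
        }
    };
    IndexWriter safeWriter = new IndexWriter(dir, guarded, true);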
@@ -1342,7 +1363,6 @@ public class TestIndexWriter extends LuceneTestCase
public void testDiverseDocs() throws IOException {
RAMDirectory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
-long t0 = System.currentTimeMillis();
writer.setRAMBufferSizeMB(0.5);
Random rand = new Random(31415);
for(int i=0;i<3;i++) {
@@ -1381,7 +1401,6 @@ public class TestIndexWriter extends LuceneTestCase
}
writer.close();
-long t1 = System.currentTimeMillis();
IndexSearcher searcher = new IndexSearcher(dir);
Hits hits = searcher.search(new TermQuery(new Term("field", "aaa")));
assertEquals(300, hits.length());
@@ -1491,7 +1510,6 @@ public class TestIndexWriter extends LuceneTestCase
addDoc(writer);
}
writer.close();
-IndexReader reader = IndexReader.open(dir);
Term searchTerm = new Term("content", "aaa");
IndexSearcher searcher = new IndexSearcher(dir);
Hits hits = searcher.search(new TermQuery(searchTerm));