LUCENE-1072: make sure on hitting a too-long term that IndexWriter is still usable
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@600465 13f79535-47bb-0310-9956-ffa450edef68
commit ddf136d928 (parent 11a7bf835b)
@@ -354,7 +354,7 @@ final class DocumentsWriter {
         state.tvfLocal.reset();
         state.fdtLocal.reset();
       }

       docStoreSegment = null;
       files = null;

     } finally {
@@ -518,6 +518,7 @@ final class DocumentsWriter {
     int numAllFieldData;
     FieldData[] fieldDataHash;          // Hash FieldData instances by field name
     int fieldDataHashMask;
+    int maxTermHit;                     // Set to > 0 if this doc has a too-large term

     boolean doFlushAfter;

@@ -608,6 +609,7 @@ final class DocumentsWriter {
       numStoredFields = 0;
       numFieldData = 0;
       numVectorFields = 0;
+      maxTermHit = 0;

       List docFields = doc.getFields();
       final int numDocFields = docFields.size();
@@ -1483,17 +1485,23 @@ final class DocumentsWriter {
         getPostings(postingsFreeList);
       }

-      // Pull next free Posting from free list
-      p = postingsFreeList[--postingsFreeCount];
-
       final int textLen1 = 1+tokenTextLen;
       if (textLen1 + charPool.byteUpto > CHAR_BLOCK_SIZE) {
-        if (textLen1 > CHAR_BLOCK_SIZE)
-          throw new IllegalArgumentException("term length " + tokenTextLen + " exceeds max term length " + (CHAR_BLOCK_SIZE-1));
+        if (textLen1 > CHAR_BLOCK_SIZE) {
+          maxTermHit = tokenTextLen;
+          // Just skip this term; we will throw an
+          // exception after processing all accepted
+          // terms in the doc
+          return;
+        }
         charPool.nextBuffer();
       }
       final char[] text = charPool.buffer;
       final int textUpto = charPool.byteUpto;

+      // Pull next free Posting from free list
+      p = postingsFreeList[--postingsFreeCount];
+
       p.textStart = textUpto + charPool.byteOffset;
       charPool.byteUpto += textLen1;
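The hunk above is the core of the fix: instead of throwing IllegalArgumentException from deep inside the per-thread indexing state (which could leave DocumentsWriter, and therefore IndexWriter, unusable), the too-long term is skipped, its length is recorded in maxTermHit, and the exception is raised only after the whole document has been processed. A minimal sketch of that record-now-throw-later pattern follows; it collapses DocumentsWriter's bookkeeping and IndexWriter's later check into one class for brevity, and TermSink / MAX_TERM_LENGTH are illustrative names, not Lucene API.

import java.util.List;

class TermSink {
    static final int MAX_TERM_LENGTH = 16383;  // mirrors CHAR_BLOCK_SIZE - 1 in the real code
    private int maxTermHit;                    // > 0 once this document hit a too-long term

    // Called once per token: an over-long term is skipped rather than thrown on,
    // so the in-memory indexing state stays consistent.
    void addTerm(String term) {
        if (term.length() > MAX_TERM_LENGTH) {
            maxTermHit = Math.max(maxTermHit, term.length());
            return;                            // skip this term, keep processing the doc
        }
        // ... index the term normally ...
    }

    // Only after the whole document has been processed is the violation
    // surfaced, when it is safe to throw.
    void processDocument(List<String> tokens) {
        for (int i = 0; i < tokens.size(); i++)
            addTerm(tokens.get(i));
        if (maxTermHit > 0) {
            int hit = maxTermHit;
            maxTermHit = 0;                    // reset so the next document starts clean
            throw new IllegalArgumentException("term of length " + hit
                + " exceeds max term length " + MAX_TERM_LENGTH + "; it was skipped");
        }
    }
}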
@@ -2181,26 +2189,28 @@ final class DocumentsWriter {

   /** Returns true if the caller (IndexWriter) should now
    * flush. */
-  boolean addDocument(Document doc, Analyzer analyzer)
+  int addDocument(Document doc, Analyzer analyzer)
     throws CorruptIndexException, IOException {
     return updateDocument(doc, analyzer, null);
   }

-  boolean updateDocument(Term t, Document doc, Analyzer analyzer)
+  int updateDocument(Term t, Document doc, Analyzer analyzer)
     throws CorruptIndexException, IOException {
     return updateDocument(doc, analyzer, t);
   }

-  boolean updateDocument(Document doc, Analyzer analyzer, Term delTerm)
+  int updateDocument(Document doc, Analyzer analyzer, Term delTerm)
     throws CorruptIndexException, IOException {

     // This call is synchronized but fast
     final ThreadState state = getThreadState(doc, delTerm);
     boolean success = false;
+    int maxTermHit;
     try {
       // This call is not synchronized and does all the work
       state.processDocument(analyzer);
       // This call synchronized but fast
+      maxTermHit = state.maxTermHit;
       finishDocument(state);
       success = true;
     } finally {
@@ -2209,7 +2219,11 @@ final class DocumentsWriter {
         abort();
       }
     }
-    return state.doFlushAfter || timeToFlushDeletes();
+
+    int status = maxTermHit<<1;
+    if (state.doFlushAfter || timeToFlushDeletes())
+      status += 1;
+    return status;
   }

   synchronized int getNumBufferedDeleteTerms() {
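Because addDocument/updateDocument used to return only a boolean ("should the caller flush now?"), the additional fact that a too-long term was skipped is packed into the new int return value: bit 0 carries the flush flag, and the remaining bits carry the skipped term's length (0 if none). A hedged sketch of that encoding; the helper names are illustrative, not part of the patch.

final class DocStatus {
    // bit 0: caller should flush; bits 1..31: length of a skipped too-long term (0 = none)
    static int encode(int maxTermHit, boolean doFlush) {
        return (maxTermHit << 1) | (doFlush ? 1 : 0);
    }

    static boolean needsFlush(int status) {
        return (status & 1) != 0;              // same test IndexWriter performs below
    }

    static int skippedTermLength(int status) {
        return status >> 1;                    // > 0 means checkMaxTermLength will throw
    }

    public static void main(String[] args) {
        int status = encode(16384, true);
        System.out.println(needsFlush(status));         // true
        System.out.println(skippedTermLength(status));  // 16384
    }
}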
@@ -1426,10 +1426,10 @@ public class IndexWriter {
    */
   public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {
     ensureOpen();
-    boolean doFlush = false;
+    int status = 0;
     boolean success = false;
     try {
-      doFlush = docWriter.addDocument(doc, analyzer);
+      status = docWriter.addDocument(doc, analyzer);
       success = true;
     } finally {
       if (!success) {
@@ -1446,8 +1446,9 @@ public class IndexWriter {
         }
       }
     }
-    if (doFlush)
+    if ((status & 1) != 0)
       flush(true, false);
+    checkMaxTermLength(status);
   }

   /**
@@ -1511,10 +1512,10 @@ public class IndexWriter {
   public void updateDocument(Term term, Document doc, Analyzer analyzer)
     throws CorruptIndexException, IOException {
     ensureOpen();
-    boolean doFlush = false;
+    int status = 0;
     boolean success = false;
     try {
-      doFlush = docWriter.updateDocument(term, doc, analyzer);
+      status = docWriter.updateDocument(term, doc, analyzer);
       success = true;
     } finally {
       if (!success) {
@@ -1531,8 +1532,17 @@ public class IndexWriter {
         }
       }
     }
-    if (doFlush)
+    if ((status & 1) != 0)
       flush(true, false);
+    checkMaxTermLength(status);
   }

+  /** Throws IllegalArgumentException if the return status
+   * from DocumentsWriter.{add,update}Document indicates
+   * that a too-long term was encountered */
+  final private void checkMaxTermLength(int status) {
+    if (status > 1)
+      throw new IllegalArgumentException("at least one term (length " + (status>>1) + ") exceeds max term length " + (DocumentsWriter.CHAR_BLOCK_SIZE-1) + "; these terms were skipped");
+  }
+
   // for test purpose
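From the caller's point of view the contract is now: addDocument/updateDocument still throw IllegalArgumentException when a document contains a too-long term, but that term is simply skipped, the document's remaining terms are indexed, and the IndexWriter stays usable afterwards (the TestIndexWriter change below verifies exactly this). A hedged usage sketch against the Lucene API of this era; the field name, analyzer, and in-memory directory are illustrative choices, not prescribed by the patch.

import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;

public class AddDocsSkippingHugeTerms {
    public static void main(String[] args) throws IOException {
        IndexWriter writer = new IndexWriter(new RAMDirectory(), new StandardAnalyzer(), true);
        String[] bodies = { "normal text", hugeToken(), "more normal text" };
        for (int i = 0; i < bodies.length; i++) {
            Document doc = new Document();
            doc.add(new Field("content", bodies[i], Field.Store.NO, Field.Index.TOKENIZED));
            try {
                writer.addDocument(doc);
            } catch (IllegalArgumentException e) {
                // A too-long term was skipped; the rest of the document was still
                // indexed and the writer remains usable, so just keep going.
            }
        }
        writer.close();
    }

    private static String hugeToken() {
        char[] chars = new char[20000];     // longer than the 16383-char limit
        java.util.Arrays.fill(chars, 'x');
        return new String(chars);
    }
}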
@@ -28,8 +28,6 @@ import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Hits;
 import org.apache.lucene.search.TermQuery;
@@ -221,12 +219,8 @@ public class TestIndexWriter extends LuceneTestCase
       methodName = "addIndexesNoOptimize(Directory[])";
     }

-    int cycleCount = 0;
-
     while(!done) {

-      cycleCount++;
-
       // Make a new dir that will enforce disk usage:
       MockRAMDirectory dir = new MockRAMDirectory(startDir);
       writer = new IndexWriter(dir, autoCommit, new WhitespaceAnalyzer(), false);
@@ -524,7 +518,7 @@ public class TestIndexWriter extends LuceneTestCase
     String[] startFiles = dir.list();
     SegmentInfos infos = new SegmentInfos();
     infos.read(dir);
-    IndexFileDeleter d = new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null);
+    new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null);
     String[] endFiles = dir.list();

     Arrays.sort(startFiles);
@@ -543,17 +537,44 @@ public class TestIndexWriter extends LuceneTestCase
     RAMDirectory dir = new RAMDirectory();
     IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);

-    char[] chars = new char[16384];
+    char[] chars = new char[16383];
     Arrays.fill(chars, 'x');
     Document doc = new Document();
-    String contents = "a b c " + new String(chars);
+    final String bigTerm = new String(chars);
+
+    // Max length term is 16383, so this contents produces
+    // a too-long term:
+    String contents = "abc xyz x" + bigTerm;
     doc.add(new Field("content", contents, Field.Store.NO, Field.Index.TOKENIZED));
     try {
       writer.addDocument(doc);
       fail("did not hit expected exception");
     } catch (IllegalArgumentException e) {
     }
+
+    // Make sure we can add another normal document
+    doc = new Document();
+    doc.add(new Field("content", "abc bbb ccc", Field.Store.NO, Field.Index.TOKENIZED));
+    writer.addDocument(doc);
     writer.close();
+
+    IndexReader reader = IndexReader.open(dir);
+    // Make sure all terms < max size were indexed
+    assertEquals(2, reader.docFreq(new Term("content", "abc")));
+    assertEquals(1, reader.docFreq(new Term("content", "bbb")));
+    reader.close();
+
+    // Make sure we can add a document with exactly the
+    // maximum length term, and search on that term:
+    doc = new Document();
+    doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.TOKENIZED));
+    writer = new IndexWriter(dir, new StandardAnalyzer());
+    writer.addDocument(doc);
+    writer.close();
+    reader = IndexReader.open(dir);
+    assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
+    reader.close();
+
     dir.close();
   }
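Applications that prefer never to trigger the exception can screen their input before indexing. A small hedged sketch of such a client-side guard, assuming the 16383-character limit quoted in the test above and simple whitespace tokenization; the class and method names are hypothetical, not part of Lucene.

public final class TermLengthGuard {
    // Matches the "Max length term is 16383" limit exercised by the test above.
    public static final int MAX_TERM_LENGTH = 16383;

    // Truncate any whitespace-delimited token longer than the limit so that
    // IndexWriter.addDocument never has to skip a term.
    public static String truncateLongTokens(String text) {
        String[] tokens = text.split("\\s+");
        StringBuffer out = new StringBuffer(text.length());
        for (int i = 0; i < tokens.length; i++) {
            String token = tokens[i];
            if (token.length() > MAX_TERM_LENGTH)
                token = token.substring(0, MAX_TERM_LENGTH);
            if (out.length() > 0)
                out.append(' ');
            out.append(token);
        }
        return out.toString();
    }
}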
@@ -1342,7 +1363,6 @@ public class TestIndexWriter extends LuceneTestCase
   public void testDiverseDocs() throws IOException {
     RAMDirectory dir = new RAMDirectory();
     IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
-    long t0 = System.currentTimeMillis();
     writer.setRAMBufferSizeMB(0.5);
     Random rand = new Random(31415);
     for(int i=0;i<3;i++) {
@@ -1381,7 +1401,6 @@ public class TestIndexWriter extends LuceneTestCase
     }
     writer.close();

-    long t1 = System.currentTimeMillis();
     IndexSearcher searcher = new IndexSearcher(dir);
     Hits hits = searcher.search(new TermQuery(new Term("field", "aaa")));
     assertEquals(300, hits.length());
@@ -1491,7 +1510,6 @@ public class TestIndexWriter extends LuceneTestCase
       addDoc(writer);
     }
     writer.close();
-    IndexReader reader = IndexReader.open(dir);
     Term searchTerm = new Term("content", "aaa");
     IndexSearcher searcher = new IndexSearcher(dir);
     Hits hits = searcher.search(new TermQuery(searchTerm));