diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index e95fe68c498..fae37e475d9 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -284,6 +284,10 @@ Bug Fixes
   parts of the shape's boundary erroneously too. So center points aren't
   indexed any more; you should use another spatial field. (David Smiley)
 
+* LUCENE-4629: IndexWriter failed to delete documents from a block when the
+  Iterable passed to addDocuments() threw an exception; previously, documents
+  were only rolled back if the actual indexing process failed. (Simon Willnauer)
+
 Changes in Runtime Behavior
 
 * LUCENE-4586: Change default ResultMode of FacetRequest to PER_NODE_IN_TREE.
diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
index a0ce577e504..03152b79760 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
@@ -296,7 +296,9 @@ class DocumentsWriterPerThread {
       infoStream.message("DWPT", Thread.currentThread().getName() + " update delTerm=" + delTerm + " docID=" + docState.docID + " seg=" + segmentInfo.name);
     }
     int docCount = 0;
+    boolean allDocsIndexed = false;
     try {
+
       for(IndexDocument doc : docs) {
         docState.doc = doc;
         docState.docID = numDocsInRAM;
@@ -309,20 +311,7 @@ class DocumentsWriterPerThread {
         } finally {
           if (!success) {
             // An exc is being thrown...
-
             if (!aborting) {
-              // One of the documents hit a non-aborting
-              // exception (eg something happened during
-              // analysis). We now go and mark any docs
-              // from this batch that we had already indexed
-              // as deleted:
-              int docID = docState.docID;
-              final int endDocID = docID - docCount;
-              while (docID > endDocID) {
-                deleteDocID(docID);
-                docID--;
-              }
-
               // Incr here because finishDocument will not
               // be called (because an exc is being thrown):
               numDocsInRAM++;
@@ -343,6 +332,7 @@ class DocumentsWriterPerThread {
 
         finishDocument(null);
       }
+      allDocsIndexed = true;
 
       // Apply delTerm only after all indexing has
       // succeeded, but apply it only to docs prior to when
@@ -354,6 +344,16 @@ class DocumentsWriterPerThread {
       }
 
     } finally {
+      if (!allDocsIndexed && !aborting) {
+        // the iterator threw a non-aborting exception;
+        // go and mark all docs from this block as deleted
+        int docID = numDocsInRAM-1;
+        final int endDocID = docID - docCount;
+        while (docID > endDocID) {
+          deleteDocID(docID);
+          docID--;
+        }
+      }
       docState.clear();
     }
 
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
index fc99fb61dc9..caaab94da8c 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
@@ -24,8 +24,11 @@ import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Random;
+import java.util.Set;
 
 import org.apache.lucene.analysis.*;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -56,7 +59,9 @@ import org.apache.lucene.store.NoLockFactory;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.store.SimpleFSLockFactory;
 import org.apache.lucene.store.SingleInstanceLockFactory;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.ThreadInterruptedException;
 import org.apache.lucene.util._TestUtil;
@@ -1995,4 +2000,92 @@ public class TestIndexWriter extends LuceneTestCase {
     dir.close();
   }
+
+  public void testIterableThrowsException() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
+        TEST_VERSION_CURRENT, new MockAnalyzer(random())));
+    int iters = atLeast(100);
+    int docCount = 0;
+    int docId = 0;
+    Set<String> liveIds = new HashSet<String>();
+    for (int i = 0; i < iters; i++) {
+      List<Document> docs = new ArrayList<Document>();
+      FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+      FieldType idFt = new FieldType(TextField.TYPE_STORED);
+
+      int numDocs = atLeast(4);
+      for (int j = 0; j < numDocs; j++) {
+        Document doc = new Document();
+        doc.add(newField("id", "" + (docId++), idFt));
+        doc.add(newField("foo", _TestUtil.randomSimpleString(random()), ft));
+        docs.add(doc);
+      }
+      boolean success = false;
+      try {
+        w.addDocuments(new RandomFailingFieldIterable(docs, random()));
+        success = true;
+      } catch (RuntimeException e) {
+        assertEquals("boom", e.getMessage());
+      } finally {
+        if (success) {
+          docCount += docs.size();
+          for (Document indexDocument : docs) {
+            liveIds.add(indexDocument.get("id"));
+          }
+        }
+      }
+    }
+    DirectoryReader reader = w.getReader();
+    assertEquals(docCount, reader.numDocs());
+    List<AtomicReaderContext> leaves = reader.leaves();
+    for (AtomicReaderContext atomicReaderContext : leaves) {
+      AtomicReader ar = atomicReaderContext.reader();
+      Bits liveDocs = ar.getLiveDocs();
+      int maxDoc = ar.maxDoc();
+      for (int i = 0; i < maxDoc; i++) {
+        if (liveDocs == null || liveDocs.get(i)) {
+          assertTrue(liveIds.remove(ar.document(i).get("id")));
+        }
+      }
+    }
+    assertTrue(liveIds.isEmpty());
+    IOUtils.close(reader, w, dir);
+  }
+
+  private static class RandomFailingFieldIterable implements Iterable<IndexDocument> {
+    private final List<? extends IndexDocument> docList;
+    private final Random random;
+
+    public RandomFailingFieldIterable(List<? extends IndexDocument> docList, Random random) {
+      this.docList = docList;
+      this.random = random;
+    }
+
+    @Override
+    public Iterator<IndexDocument> iterator() {
+      final Iterator<? extends IndexDocument> docIter = docList.iterator();
+      return new Iterator<IndexDocument>() {
+
+        @Override
+        public boolean hasNext() {
+          return docIter.hasNext();
+        }
+
+        @Override
+        public IndexDocument next() {
+          if (random.nextInt(5) == 0) {
+            throw new RuntimeException("boom");
+          }
+          return docIter.next();
+        }
+
+        @Override
+        public void remove() {
+          throw new UnsupportedOperationException();
+        }
+      };
+    }
+  }
 }
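
For reference, a minimal, deterministic sketch of the contract this patch enforces: a document block passed to IndexWriter.addDocuments() is atomic, so if the supplied Iterable throws after yielding some of its documents, the already-consumed documents must not stay live in the index. Only addDocuments() and the rollback behavior come from the patch above; the class name BlockRollbackSketch, the Version/StandardAnalyzer choices, and the use of RAMDirectory are illustrative assumptions against the same 4.x-era API the patch targets.

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexDocument;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class BlockRollbackSketch {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    // Version and analyzer are illustrative choices, not taken from the patch.
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(
        Version.LUCENE_40, new StandardAnalyzer(Version.LUCENE_40)));

    final List<Document> block = Arrays.asList(doc("1"), doc("2"), doc("3"));
    try {
      // The Iterable yields two documents, then blows up before the third.
      w.addDocuments(new Iterable<IndexDocument>() {
        @Override
        public Iterator<IndexDocument> iterator() {
          final Iterator<Document> it = block.iterator();
          return new Iterator<IndexDocument>() {
            int served = 0;
            @Override public boolean hasNext() { return it.hasNext(); }
            @Override public IndexDocument next() {
              if (served++ == 2) {
                // fail after two documents were already consumed and indexed
                throw new RuntimeException("boom");
              }
              return it.next();
            }
            @Override public void remove() { throw new UnsupportedOperationException(); }
          };
        }
      });
    } catch (RuntimeException expected) {
      // the block is all-or-nothing: the two documents already pulled from
      // the iterator must be rolled back, not left live
    }

    DirectoryReader reader = DirectoryReader.open(w, true);
    // prints 0 with this fix; before it, the two consumed docs stayed live
    System.out.println("live docs: " + reader.numDocs());
    reader.close();
    w.close();
    dir.close();
  }

  private static Document doc(String id) {
    Document d = new Document();
    d.add(new TextField("id", id, Field.Store.YES));
    return d;
  }
}

Unlike the randomized test in the patch, this sketch fails at a fixed position in the block, which makes the before/after behavior directly observable in a single run.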