LUCENE-2010: drop segments that are 100% deleted docs in IW/IR commit

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1063323 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2011-01-25 15:40:51 +00:00
parent 913a9e4491
commit 5642eb07c4
12 changed files with 77 additions and 38 deletions

View File

@ -486,6 +486,9 @@ Changes in runtime behavior
* LUCENE-2829: Improve the performance of "primary key" lookup use
case (running a TermQuery that matches one document) on a
multi-segment index. (Robert Muir, Mike McCandless)
* LUCENE-2010: Segments with 100% deleted documents are now removed on
IndexReader or IndexWriter commit. (Uwe Schindler, Mike McCandless)
API Changes
@ -905,6 +908,9 @@ Optimizations
* LUCENE-2824: Optimize BufferedIndexInput to do less bounds checks.
(Robert Muir)
* LUCENE-2010: Segments with 100% deleted documents are now removed on
IndexReader or IndexWriter commit. (Uwe Schindler, Mike McCandless)
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation

View File

@ -270,9 +270,9 @@ class BufferedDeletes {
}
private synchronized long applyDeletes(IndexWriter.ReaderPool readerPool,
SegmentInfo info,
SegmentDeletes coalescedDeletes,
SegmentDeletes segmentDeletes) throws IOException {
SegmentInfo info,
SegmentDeletes coalescedDeletes,
SegmentDeletes segmentDeletes) throws IOException {
assert readerPool.infoIsLive(info);
assert coalescedDeletes == null || coalescedDeletes.docIDs.size() == 0;

View File

@ -710,6 +710,9 @@ class DirectoryReader extends IndexReader implements Cloneable {
for (int i = 0; i < subReaders.length; i++)
subReaders[i].commit();
// Remove segments in which every doc is deleted:
segmentInfos.pruneDeletedSegments();
// Sync all files we just wrote
directory.sync(segmentInfos.files(directory, false));
segmentInfos.commit(directory);

View File

@ -1163,7 +1163,14 @@ public abstract class IndexReader implements Cloneable,Closeable {
return n;
}
/** Undeletes all documents currently marked as deleted in this index.
/** Undeletes all documents currently marked as deleted in
* this index.
*
* <p>NOTE: this is only a best-effort process. For
* example, if all documents in a given segment were
* deleted, Lucene now drops that segment from the index,
* which means its documents will not be recovered by this
* method.
*
* @throws StaleReaderException if the index has changed
* since this reader was opened

View File

@ -3276,6 +3276,15 @@ public class IndexWriter implements Closeable {
}
}
private boolean keepFullyDeletedSegments;
/**
 * Test-only hook that switches on the {@code keepFullyDeletedSegments} flag.
 * While the flag is set, the commit path does not call
 * {@code pruneDeletedSegments()} on the segment infos it is about to sync.
 *
 * @lucene.internal
 */
void keepFullyDeletedSegments() {
  this.keepFullyDeletedSegments = true;
}
// called only from assert
private boolean filesExist(SegmentInfos toSync) throws IOException {
Collection<String> files = toSync.files(directory, false);
@ -3334,6 +3343,10 @@ public class IndexWriter implements Closeable {
readerPool.commit();
toSync = (SegmentInfos) segmentInfos.clone();
if (!keepFullyDeletedSegments) {
toSync.pruneDeletedSegments();
}
assert filesExist(toSync);
if (commitUserData != null)

View File

@ -308,6 +308,19 @@ public final class SegmentInfos extends Vector<SegmentInfo> {
}
}
/**
 * Removes every segment whose documents are all deleted, i.e. whose
 * deletion count equals its document count. Segments are examined in
 * index order.
 */
public void pruneDeletedSegments() {
  for (int pos = 0; pos < size(); ) {
    final SegmentInfo candidate = info(pos);
    final boolean hasLiveDocs = candidate.getDelCount() != candidate.docCount;
    if (hasLiveDocs) {
      pos++;
      continue;
    }
    // Removal shifts later entries down, so the same position is checked again.
    remove(pos);
  }
}
/**
* Returns a copy of this instance, also copying each
* SegmentInfo.

View File

@ -428,7 +428,7 @@ public class TestAddIndexes extends LuceneTestCase {
);
writer.addIndexes(aux, new MockDirectoryWrapper(random, new RAMDirectory(aux)));
assertEquals(1060, writer.maxDoc());
assertEquals(1020, writer.maxDoc());
assertEquals(1000, writer.getDocCount(0));
writer.close();
dir.close();
@ -480,7 +480,7 @@ public class TestAddIndexes extends LuceneTestCase {
);
writer.addIndexes(aux, aux2);
assertEquals(1060, writer.maxDoc());
assertEquals(1040, writer.maxDoc());
assertEquals(1000, writer.getDocCount(0));
writer.close();
dir.close();

View File

@ -360,7 +360,7 @@ public class TestIndexReader extends LuceneTestCase
// CREATE A NEW READER and re-test
reader = IndexReader.open(dir, false);
assertEquals("deleted docFreq", 100, reader.docFreq(searchTerm));
assertEquals("deleted docFreq", 0, reader.docFreq(searchTerm));
assertTermDocsCount("deleted termDocs", reader, searchTerm, 0);
reader.close();
reader2.close();
@ -697,7 +697,6 @@ public class TestIndexReader extends LuceneTestCase
// CREATE A NEW READER and re-test
reader = IndexReader.open(dir, false);
assertEquals("deleted docFreq", 100, reader.docFreq(searchTerm));
assertEquals("deleted docFreq", 100, reader.docFreq(searchTerm2));
assertTermDocsCount("deleted termDocs", reader, searchTerm, 0);
assertTermDocsCount("deleted termDocs", reader, searchTerm2, 100);
@ -838,7 +837,6 @@ public class TestIndexReader extends LuceneTestCase
writer.close();
IndexReader reader = IndexReader.open(dir, false);
reader.deleteDocument(0);
reader.deleteDocument(1);
reader.close();
reader = IndexReader.open(dir, false);
reader.undeleteAll();
@ -855,7 +853,6 @@ public class TestIndexReader extends LuceneTestCase
writer.close();
IndexReader reader = IndexReader.open(dir, false);
reader.deleteDocument(0);
reader.deleteDocument(1);
reader.close();
reader = IndexReader.open(dir, false);
reader.undeleteAll();
@ -1290,9 +1287,6 @@ public class TestIndexReader extends LuceneTestCase
// Open another reader to confirm that everything is deleted
reader2 = IndexReader.open(dir, false);
assertEquals("reopened 2", 100, reader2.docFreq(searchTerm1));
assertEquals("reopened 2", 100, reader2.docFreq(searchTerm2));
assertEquals("reopened 2", 100, reader2.docFreq(searchTerm3));
assertTermDocsCount("reopened 2", reader2, searchTerm1, 0);
assertTermDocsCount("reopened 2", reader2, searchTerm2, 0);
assertTermDocsCount("reopened 2", reader2, searchTerm3, 100);

View File

@ -1211,7 +1211,6 @@ public class TestIndexReaderReopen extends LuceneTestCase {
IndexReader r = IndexReader.open(dir, false);
assertEquals(0, r.numDocs());
assertEquals(4, r.maxDoc());
Collection<IndexCommit> commits = IndexReader.listCommits(dir);
for (final IndexCommit commit : commits) {

View File

@ -101,19 +101,12 @@ public class TestIndexWriter extends LuceneTestCase {
}
reader.close();
// test doc count before segments are merged/index is optimized
writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer()));
assertEquals(100, writer.maxDoc());
writer.close();
reader = IndexReader.open(dir, true);
assertEquals(100, reader.maxDoc());
assertEquals(60, reader.numDocs());
reader.close();
// optimize the index and check that the new doc count is correct
writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer()));
assertEquals(100, writer.maxDoc());
assertEquals(60, writer.numDocs());
writer.optimize();
assertEquals(60, writer.maxDoc());
@ -1431,7 +1424,6 @@ public class TestIndexWriter extends LuceneTestCase {
w.close();
IndexReader ir = IndexReader.open(dir, true);
assertEquals(1, ir.maxDoc());
assertEquals(0, ir.numDocs());
ir.close();

View File

@ -567,24 +567,25 @@ public class TestIndexWriterExceptions extends LuceneTestCase {
System.out.println("TEST: open reader");
}
IndexReader reader = IndexReader.open(dir, true);
int expected = 3+(1-i)*2;
assertEquals(expected, reader.docFreq(new Term("contents", "here")));
assertEquals(expected, reader.maxDoc());
int numDel = 0;
final Bits delDocs = MultiFields.getDeletedDocs(reader);
assertNotNull(delDocs);
for(int j=0;j<reader.maxDoc();j++) {
if (delDocs.get(j))
numDel++;
else {
reader.document(j);
reader.getTermFreqVectors(j);
if (i == 0) {
int expected = 5;
assertEquals(expected, reader.docFreq(new Term("contents", "here")));
assertEquals(expected, reader.maxDoc());
int numDel = 0;
final Bits delDocs = MultiFields.getDeletedDocs(reader);
assertNotNull(delDocs);
for(int j=0;j<reader.maxDoc();j++) {
if (delDocs.get(j))
numDel++;
else {
reader.document(j);
reader.getTermFreqVectors(j);
}
}
assertEquals(1, numDel);
}
reader.close();
assertEquals(1, numDel);
writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT,
analyzer).setMaxBufferedDocs(10));
doc = new Document();
@ -596,10 +597,10 @@ public class TestIndexWriterExceptions extends LuceneTestCase {
writer.close();
reader = IndexReader.open(dir, true);
expected = 19+(1-i)*2;
int expected = 19+(1-i)*2;
assertEquals(expected, reader.docFreq(new Term("contents", "here")));
assertEquals(expected, reader.maxDoc());
numDel = 0;
int numDel = 0;
assertNull(MultiFields.getDeletedDocs(reader));
for(int j=0;j<reader.maxDoc();j++) {
reader.document(j);

View File

@ -6,6 +6,7 @@ import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.Random;
import java.lang.reflect.Method;
import junit.framework.Assert;
@ -172,6 +173,16 @@ public class QueryUtils {
}
w.commit();
w.deleteDocuments( new MatchAllDocsQuery() );
try {
// Carefully invoke what is a package-private (test
// only, internal) method on IndexWriter:
Method m = IndexWriter.class.getDeclaredMethod("keepFullyDeletedSegments");
m.setAccessible(true);
m.invoke(w);
} catch (Exception e) {
// Should not happen?
throw new RuntimeException(e);
}
w.commit();
if (0 < numDeletedDocs)