mirror of https://github.com/apache/lucene.git
LUCENE-2010: drop segments that are 100% deleted docs in IW/IR commit
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1063323 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
913a9e4491
commit
5642eb07c4
|
@ -487,6 +487,9 @@ Changes in runtime behavior
|
|||
case (running a TermQuery that matches one document) on a
|
||||
multi-segment index. (Robert Muir, Mike McCandless)
|
||||
|
||||
* LUCENE-2010: Segments with 100% deleted documents are now removed on
|
||||
IndexReader or IndexWriter commit. (Uwe Schindler, Mike McCandless)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-2076: Rename FSDirectory.getFile -> getDirectory. (George
|
||||
|
@ -905,6 +908,9 @@ Optimizations
|
|||
* LUCENE-2824: Optimize BufferedIndexInput to do less bounds checks.
|
||||
(Robert Muir)
|
||||
|
||||
* LUCENE-2010: Segments with 100% deleted documents are now removed on
|
||||
IndexReader or IndexWriter commit. (Uwe Schindler, Mike McCandless)
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
||||
|
|
|
@ -710,6 +710,9 @@ class DirectoryReader extends IndexReader implements Cloneable {
|
|||
for (int i = 0; i < subReaders.length; i++)
|
||||
subReaders[i].commit();
|
||||
|
||||
// Remove segments whose docs are all deleted:
|
||||
segmentInfos.pruneDeletedSegments();
|
||||
|
||||
// Sync all files we just wrote
|
||||
directory.sync(segmentInfos.files(directory, false));
|
||||
segmentInfos.commit(directory);
|
||||
|
|
|
@ -1163,7 +1163,14 @@ public abstract class IndexReader implements Cloneable,Closeable {
|
|||
return n;
|
||||
}
|
||||
|
||||
/** Undeletes all documents currently marked as deleted in this index.
|
||||
/** Undeletes all documents currently marked as deleted in
|
||||
* this index.
|
||||
*
|
||||
* <p>NOTE: this is only a best-effort process. For
|
||||
* example, if all documents in a given segment were
|
||||
* deleted, Lucene now drops that segment from the index,
|
||||
* which means its documents will not be recovered by this
|
||||
* method.
|
||||
*
|
||||
* @throws StaleReaderException if the index has changed
|
||||
* since this reader was opened
|
||||
|
|
|
@ -3276,6 +3276,15 @@ public class IndexWriter implements Closeable {
|
|||
}
|
||||
}
|
||||
|
||||
private boolean keepFullyDeletedSegments;
|
||||
|
||||
/** Only for testing.
|
||||
*
|
||||
* @lucene.internal */
|
||||
void keepFullyDeletedSegments() {
|
||||
keepFullyDeletedSegments = true;
|
||||
}
|
||||
|
||||
// called only from assert
|
||||
private boolean filesExist(SegmentInfos toSync) throws IOException {
|
||||
Collection<String> files = toSync.files(directory, false);
|
||||
|
@ -3334,6 +3343,10 @@ public class IndexWriter implements Closeable {
|
|||
readerPool.commit();
|
||||
|
||||
toSync = (SegmentInfos) segmentInfos.clone();
|
||||
if (!keepFullyDeletedSegments) {
|
||||
toSync.pruneDeletedSegments();
|
||||
}
|
||||
|
||||
assert filesExist(toSync);
|
||||
|
||||
if (commitUserData != null)
|
||||
|
|
|
@ -308,6 +308,19 @@ public final class SegmentInfos extends Vector<SegmentInfo> {
|
|||
}
|
||||
}
|
||||
|
||||
/** Prunes any segment whose docs are all deleted. */
|
||||
public void pruneDeletedSegments() {
|
||||
int segIdx = 0;
|
||||
while(segIdx < size()) {
|
||||
final SegmentInfo info = info(segIdx);
|
||||
if (info.getDelCount() == info.docCount) {
|
||||
remove(segIdx);
|
||||
} else {
|
||||
segIdx++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a copy of this instance, also copying each
|
||||
* SegmentInfo.
|
||||
|
|
|
@ -428,7 +428,7 @@ public class TestAddIndexes extends LuceneTestCase {
|
|||
);
|
||||
|
||||
writer.addIndexes(aux, new MockDirectoryWrapper(random, new RAMDirectory(aux)));
|
||||
assertEquals(1060, writer.maxDoc());
|
||||
assertEquals(1020, writer.maxDoc());
|
||||
assertEquals(1000, writer.getDocCount(0));
|
||||
writer.close();
|
||||
dir.close();
|
||||
|
@ -480,7 +480,7 @@ public class TestAddIndexes extends LuceneTestCase {
|
|||
);
|
||||
|
||||
writer.addIndexes(aux, aux2);
|
||||
assertEquals(1060, writer.maxDoc());
|
||||
assertEquals(1040, writer.maxDoc());
|
||||
assertEquals(1000, writer.getDocCount(0));
|
||||
writer.close();
|
||||
dir.close();
|
||||
|
|
|
@ -360,7 +360,7 @@ public class TestIndexReader extends LuceneTestCase
|
|||
|
||||
// CREATE A NEW READER and re-test
|
||||
reader = IndexReader.open(dir, false);
|
||||
assertEquals("deleted docFreq", 100, reader.docFreq(searchTerm));
|
||||
assertEquals("deleted docFreq", 0, reader.docFreq(searchTerm));
|
||||
assertTermDocsCount("deleted termDocs", reader, searchTerm, 0);
|
||||
reader.close();
|
||||
reader2.close();
|
||||
|
@ -697,7 +697,6 @@ public class TestIndexReader extends LuceneTestCase
|
|||
|
||||
// CREATE A NEW READER and re-test
|
||||
reader = IndexReader.open(dir, false);
|
||||
assertEquals("deleted docFreq", 100, reader.docFreq(searchTerm));
|
||||
assertEquals("deleted docFreq", 100, reader.docFreq(searchTerm2));
|
||||
assertTermDocsCount("deleted termDocs", reader, searchTerm, 0);
|
||||
assertTermDocsCount("deleted termDocs", reader, searchTerm2, 100);
|
||||
|
@ -838,7 +837,6 @@ public class TestIndexReader extends LuceneTestCase
|
|||
writer.close();
|
||||
IndexReader reader = IndexReader.open(dir, false);
|
||||
reader.deleteDocument(0);
|
||||
reader.deleteDocument(1);
|
||||
reader.close();
|
||||
reader = IndexReader.open(dir, false);
|
||||
reader.undeleteAll();
|
||||
|
@ -855,7 +853,6 @@ public class TestIndexReader extends LuceneTestCase
|
|||
writer.close();
|
||||
IndexReader reader = IndexReader.open(dir, false);
|
||||
reader.deleteDocument(0);
|
||||
reader.deleteDocument(1);
|
||||
reader.close();
|
||||
reader = IndexReader.open(dir, false);
|
||||
reader.undeleteAll();
|
||||
|
@ -1290,9 +1287,6 @@ public class TestIndexReader extends LuceneTestCase
|
|||
|
||||
// Open another reader to confirm that everything is deleted
|
||||
reader2 = IndexReader.open(dir, false);
|
||||
assertEquals("reopened 2", 100, reader2.docFreq(searchTerm1));
|
||||
assertEquals("reopened 2", 100, reader2.docFreq(searchTerm2));
|
||||
assertEquals("reopened 2", 100, reader2.docFreq(searchTerm3));
|
||||
assertTermDocsCount("reopened 2", reader2, searchTerm1, 0);
|
||||
assertTermDocsCount("reopened 2", reader2, searchTerm2, 0);
|
||||
assertTermDocsCount("reopened 2", reader2, searchTerm3, 100);
|
||||
|
|
|
@ -1211,7 +1211,6 @@ public class TestIndexReaderReopen extends LuceneTestCase {
|
|||
|
||||
IndexReader r = IndexReader.open(dir, false);
|
||||
assertEquals(0, r.numDocs());
|
||||
assertEquals(4, r.maxDoc());
|
||||
|
||||
Collection<IndexCommit> commits = IndexReader.listCommits(dir);
|
||||
for (final IndexCommit commit : commits) {
|
||||
|
|
|
@ -101,19 +101,12 @@ public class TestIndexWriter extends LuceneTestCase {
|
|||
}
|
||||
reader.close();
|
||||
|
||||
// test doc count before segments are merged/index is optimized
|
||||
writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer()));
|
||||
assertEquals(100, writer.maxDoc());
|
||||
writer.close();
|
||||
|
||||
reader = IndexReader.open(dir, true);
|
||||
assertEquals(100, reader.maxDoc());
|
||||
assertEquals(60, reader.numDocs());
|
||||
reader.close();
|
||||
|
||||
// optimize the index and check that the new doc count is correct
|
||||
writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer()));
|
||||
assertEquals(100, writer.maxDoc());
|
||||
assertEquals(60, writer.numDocs());
|
||||
writer.optimize();
|
||||
assertEquals(60, writer.maxDoc());
|
||||
|
@ -1431,7 +1424,6 @@ public class TestIndexWriter extends LuceneTestCase {
|
|||
w.close();
|
||||
|
||||
IndexReader ir = IndexReader.open(dir, true);
|
||||
assertEquals(1, ir.maxDoc());
|
||||
assertEquals(0, ir.numDocs());
|
||||
ir.close();
|
||||
|
||||
|
|
|
@ -567,7 +567,8 @@ public class TestIndexWriterExceptions extends LuceneTestCase {
|
|||
System.out.println("TEST: open reader");
|
||||
}
|
||||
IndexReader reader = IndexReader.open(dir, true);
|
||||
int expected = 3+(1-i)*2;
|
||||
if (i == 0) {
|
||||
int expected = 5;
|
||||
assertEquals(expected, reader.docFreq(new Term("contents", "here")));
|
||||
assertEquals(expected, reader.maxDoc());
|
||||
int numDel = 0;
|
||||
|
@ -581,9 +582,9 @@ public class TestIndexWriterExceptions extends LuceneTestCase {
|
|||
reader.getTermFreqVectors(j);
|
||||
}
|
||||
}
|
||||
reader.close();
|
||||
|
||||
assertEquals(1, numDel);
|
||||
}
|
||||
reader.close();
|
||||
|
||||
writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT,
|
||||
analyzer).setMaxBufferedDocs(10));
|
||||
|
@ -596,10 +597,10 @@ public class TestIndexWriterExceptions extends LuceneTestCase {
|
|||
writer.close();
|
||||
|
||||
reader = IndexReader.open(dir, true);
|
||||
expected = 19+(1-i)*2;
|
||||
int expected = 19+(1-i)*2;
|
||||
assertEquals(expected, reader.docFreq(new Term("contents", "here")));
|
||||
assertEquals(expected, reader.maxDoc());
|
||||
numDel = 0;
|
||||
int numDel = 0;
|
||||
assertNull(MultiFields.getDeletedDocs(reader));
|
||||
for(int j=0;j<reader.maxDoc();j++) {
|
||||
reader.document(j);
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.io.IOException;
|
|||
import java.io.ObjectInputStream;
|
||||
import java.io.ObjectOutputStream;
|
||||
import java.util.Random;
|
||||
import java.lang.reflect.Method;
|
||||
|
||||
import junit.framework.Assert;
|
||||
|
||||
|
@ -172,6 +173,16 @@ public class QueryUtils {
|
|||
}
|
||||
w.commit();
|
||||
w.deleteDocuments( new MatchAllDocsQuery() );
|
||||
try {
|
||||
// Carefully invoke what is a package-private (test
|
||||
// only, internal) method on IndexWriter:
|
||||
Method m = IndexWriter.class.getDeclaredMethod("keepFullyDeletedSegments");
|
||||
m.setAccessible(true);
|
||||
m.invoke(w);
|
||||
} catch (Exception e) {
|
||||
// Should not happen?
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
w.commit();
|
||||
|
||||
if (0 < numDeletedDocs)
|
||||
|
|
Loading…
Reference in New Issue