mirror of https://github.com/apache/lucene.git
LUCENE-2010: drop segments that are 100% deleted docs in IW/IR commit
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1063323 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
913a9e4491
commit
5642eb07c4
|
@ -486,6 +486,9 @@ Changes in runtime behavior
|
||||||
* LUCENE-2829: Improve the performance of "primary key" lookup use
|
* LUCENE-2829: Improve the performance of "primary key" lookup use
|
||||||
case (running a TermQuery that matches one document) on a
|
case (running a TermQuery that matches one document) on a
|
||||||
multi-segment index. (Robert Muir, Mike McCandless)
|
multi-segment index. (Robert Muir, Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-2010: Segments with 100% deleted documents are now removed on
|
||||||
|
IndexReader or IndexWriter commit. (Uwe Schindler, Mike McCandless)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
|
@ -905,6 +908,9 @@ Optimizations
|
||||||
* LUCENE-2824: Optimize BufferedIndexInput to do less bounds checks.
|
* LUCENE-2824: Optimize BufferedIndexInput to do less bounds checks.
|
||||||
(Robert Muir)
|
(Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-2010: Segments with 100% deleted documents are now removed on
|
||||||
|
IndexReader or IndexWriter commit. (Uwe Schindler, Mike McCandless)
|
||||||
|
|
||||||
Build
|
Build
|
||||||
|
|
||||||
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
||||||
|
|
|
@ -270,9 +270,9 @@ class BufferedDeletes {
|
||||||
}
|
}
|
||||||
|
|
||||||
private synchronized long applyDeletes(IndexWriter.ReaderPool readerPool,
|
private synchronized long applyDeletes(IndexWriter.ReaderPool readerPool,
|
||||||
SegmentInfo info,
|
SegmentInfo info,
|
||||||
SegmentDeletes coalescedDeletes,
|
SegmentDeletes coalescedDeletes,
|
||||||
SegmentDeletes segmentDeletes) throws IOException {
|
SegmentDeletes segmentDeletes) throws IOException {
|
||||||
assert readerPool.infoIsLive(info);
|
assert readerPool.infoIsLive(info);
|
||||||
|
|
||||||
assert coalescedDeletes == null || coalescedDeletes.docIDs.size() == 0;
|
assert coalescedDeletes == null || coalescedDeletes.docIDs.size() == 0;
|
||||||
|
|
|
@ -710,6 +710,9 @@ class DirectoryReader extends IndexReader implements Cloneable {
|
||||||
for (int i = 0; i < subReaders.length; i++)
|
for (int i = 0; i < subReaders.length; i++)
|
||||||
subReaders[i].commit();
|
subReaders[i].commit();
|
||||||
|
|
||||||
|
// Remove segments in which every document is deleted:
|
||||||
|
segmentInfos.pruneDeletedSegments();
|
||||||
|
|
||||||
// Sync all files we just wrote
|
// Sync all files we just wrote
|
||||||
directory.sync(segmentInfos.files(directory, false));
|
directory.sync(segmentInfos.files(directory, false));
|
||||||
segmentInfos.commit(directory);
|
segmentInfos.commit(directory);
|
||||||
|
|
|
@ -1163,7 +1163,14 @@ public abstract class IndexReader implements Cloneable,Closeable {
|
||||||
return n;
|
return n;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Undeletes all documents currently marked as deleted in this index.
|
/** Undeletes all documents currently marked as deleted in
|
||||||
|
* this index.
|
||||||
|
*
|
||||||
|
* <p>NOTE: this is only a best-effort process. For
|
||||||
|
* example, if all documents in a given segment were
|
||||||
|
* deleted, Lucene now drops that segment from the index,
|
||||||
|
* which means its documents will not be recovered by this
|
||||||
|
* method.
|
||||||
*
|
*
|
||||||
* @throws StaleReaderException if the index has changed
|
* @throws StaleReaderException if the index has changed
|
||||||
* since this reader was opened
|
* since this reader was opened
|
||||||
|
|
|
@ -3276,6 +3276,15 @@ public class IndexWriter implements Closeable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean keepFullyDeletedSegments;
|
||||||
|
|
||||||
|
/** Only for testing.
|
||||||
|
*
|
||||||
|
* @lucene.internal */
|
||||||
|
void keepFullyDeletedSegments() {
|
||||||
|
keepFullyDeletedSegments = true;
|
||||||
|
}
|
||||||
|
|
||||||
// called only from assert
|
// called only from assert
|
||||||
private boolean filesExist(SegmentInfos toSync) throws IOException {
|
private boolean filesExist(SegmentInfos toSync) throws IOException {
|
||||||
Collection<String> files = toSync.files(directory, false);
|
Collection<String> files = toSync.files(directory, false);
|
||||||
|
@ -3334,6 +3343,10 @@ public class IndexWriter implements Closeable {
|
||||||
readerPool.commit();
|
readerPool.commit();
|
||||||
|
|
||||||
toSync = (SegmentInfos) segmentInfos.clone();
|
toSync = (SegmentInfos) segmentInfos.clone();
|
||||||
|
if (!keepFullyDeletedSegments) {
|
||||||
|
toSync.pruneDeletedSegments();
|
||||||
|
}
|
||||||
|
|
||||||
assert filesExist(toSync);
|
assert filesExist(toSync);
|
||||||
|
|
||||||
if (commitUserData != null)
|
if (commitUserData != null)
|
||||||
|
|
|
@ -308,6 +308,19 @@ public final class SegmentInfos extends Vector<SegmentInfo> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Prunes any segment whose docs are all deleted. */
|
||||||
|
public void pruneDeletedSegments() {
|
||||||
|
int segIdx = 0;
|
||||||
|
while(segIdx < size()) {
|
||||||
|
final SegmentInfo info = info(segIdx);
|
||||||
|
if (info.getDelCount() == info.docCount) {
|
||||||
|
remove(segIdx);
|
||||||
|
} else {
|
||||||
|
segIdx++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a copy of this instance, also copying each
|
* Returns a copy of this instance, also copying each
|
||||||
* SegmentInfo.
|
* SegmentInfo.
|
||||||
|
|
|
@ -428,7 +428,7 @@ public class TestAddIndexes extends LuceneTestCase {
|
||||||
);
|
);
|
||||||
|
|
||||||
writer.addIndexes(aux, new MockDirectoryWrapper(random, new RAMDirectory(aux)));
|
writer.addIndexes(aux, new MockDirectoryWrapper(random, new RAMDirectory(aux)));
|
||||||
assertEquals(1060, writer.maxDoc());
|
assertEquals(1020, writer.maxDoc());
|
||||||
assertEquals(1000, writer.getDocCount(0));
|
assertEquals(1000, writer.getDocCount(0));
|
||||||
writer.close();
|
writer.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
|
@ -480,7 +480,7 @@ public class TestAddIndexes extends LuceneTestCase {
|
||||||
);
|
);
|
||||||
|
|
||||||
writer.addIndexes(aux, aux2);
|
writer.addIndexes(aux, aux2);
|
||||||
assertEquals(1060, writer.maxDoc());
|
assertEquals(1040, writer.maxDoc());
|
||||||
assertEquals(1000, writer.getDocCount(0));
|
assertEquals(1000, writer.getDocCount(0));
|
||||||
writer.close();
|
writer.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
|
|
|
@ -360,7 +360,7 @@ public class TestIndexReader extends LuceneTestCase
|
||||||
|
|
||||||
// CREATE A NEW READER and re-test
|
// CREATE A NEW READER and re-test
|
||||||
reader = IndexReader.open(dir, false);
|
reader = IndexReader.open(dir, false);
|
||||||
assertEquals("deleted docFreq", 100, reader.docFreq(searchTerm));
|
assertEquals("deleted docFreq", 0, reader.docFreq(searchTerm));
|
||||||
assertTermDocsCount("deleted termDocs", reader, searchTerm, 0);
|
assertTermDocsCount("deleted termDocs", reader, searchTerm, 0);
|
||||||
reader.close();
|
reader.close();
|
||||||
reader2.close();
|
reader2.close();
|
||||||
|
@ -697,7 +697,6 @@ public class TestIndexReader extends LuceneTestCase
|
||||||
|
|
||||||
// CREATE A NEW READER and re-test
|
// CREATE A NEW READER and re-test
|
||||||
reader = IndexReader.open(dir, false);
|
reader = IndexReader.open(dir, false);
|
||||||
assertEquals("deleted docFreq", 100, reader.docFreq(searchTerm));
|
|
||||||
assertEquals("deleted docFreq", 100, reader.docFreq(searchTerm2));
|
assertEquals("deleted docFreq", 100, reader.docFreq(searchTerm2));
|
||||||
assertTermDocsCount("deleted termDocs", reader, searchTerm, 0);
|
assertTermDocsCount("deleted termDocs", reader, searchTerm, 0);
|
||||||
assertTermDocsCount("deleted termDocs", reader, searchTerm2, 100);
|
assertTermDocsCount("deleted termDocs", reader, searchTerm2, 100);
|
||||||
|
@ -838,7 +837,6 @@ public class TestIndexReader extends LuceneTestCase
|
||||||
writer.close();
|
writer.close();
|
||||||
IndexReader reader = IndexReader.open(dir, false);
|
IndexReader reader = IndexReader.open(dir, false);
|
||||||
reader.deleteDocument(0);
|
reader.deleteDocument(0);
|
||||||
reader.deleteDocument(1);
|
|
||||||
reader.close();
|
reader.close();
|
||||||
reader = IndexReader.open(dir, false);
|
reader = IndexReader.open(dir, false);
|
||||||
reader.undeleteAll();
|
reader.undeleteAll();
|
||||||
|
@ -855,7 +853,6 @@ public class TestIndexReader extends LuceneTestCase
|
||||||
writer.close();
|
writer.close();
|
||||||
IndexReader reader = IndexReader.open(dir, false);
|
IndexReader reader = IndexReader.open(dir, false);
|
||||||
reader.deleteDocument(0);
|
reader.deleteDocument(0);
|
||||||
reader.deleteDocument(1);
|
|
||||||
reader.close();
|
reader.close();
|
||||||
reader = IndexReader.open(dir, false);
|
reader = IndexReader.open(dir, false);
|
||||||
reader.undeleteAll();
|
reader.undeleteAll();
|
||||||
|
@ -1290,9 +1287,6 @@ public class TestIndexReader extends LuceneTestCase
|
||||||
|
|
||||||
// Open another reader to confirm that everything is deleted
|
// Open another reader to confirm that everything is deleted
|
||||||
reader2 = IndexReader.open(dir, false);
|
reader2 = IndexReader.open(dir, false);
|
||||||
assertEquals("reopened 2", 100, reader2.docFreq(searchTerm1));
|
|
||||||
assertEquals("reopened 2", 100, reader2.docFreq(searchTerm2));
|
|
||||||
assertEquals("reopened 2", 100, reader2.docFreq(searchTerm3));
|
|
||||||
assertTermDocsCount("reopened 2", reader2, searchTerm1, 0);
|
assertTermDocsCount("reopened 2", reader2, searchTerm1, 0);
|
||||||
assertTermDocsCount("reopened 2", reader2, searchTerm2, 0);
|
assertTermDocsCount("reopened 2", reader2, searchTerm2, 0);
|
||||||
assertTermDocsCount("reopened 2", reader2, searchTerm3, 100);
|
assertTermDocsCount("reopened 2", reader2, searchTerm3, 100);
|
||||||
|
|
|
@ -1211,7 +1211,6 @@ public class TestIndexReaderReopen extends LuceneTestCase {
|
||||||
|
|
||||||
IndexReader r = IndexReader.open(dir, false);
|
IndexReader r = IndexReader.open(dir, false);
|
||||||
assertEquals(0, r.numDocs());
|
assertEquals(0, r.numDocs());
|
||||||
assertEquals(4, r.maxDoc());
|
|
||||||
|
|
||||||
Collection<IndexCommit> commits = IndexReader.listCommits(dir);
|
Collection<IndexCommit> commits = IndexReader.listCommits(dir);
|
||||||
for (final IndexCommit commit : commits) {
|
for (final IndexCommit commit : commits) {
|
||||||
|
|
|
@ -101,19 +101,12 @@ public class TestIndexWriter extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
reader.close();
|
reader.close();
|
||||||
|
|
||||||
// test doc count before segments are merged/index is optimized
|
|
||||||
writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer()));
|
|
||||||
assertEquals(100, writer.maxDoc());
|
|
||||||
writer.close();
|
|
||||||
|
|
||||||
reader = IndexReader.open(dir, true);
|
reader = IndexReader.open(dir, true);
|
||||||
assertEquals(100, reader.maxDoc());
|
|
||||||
assertEquals(60, reader.numDocs());
|
assertEquals(60, reader.numDocs());
|
||||||
reader.close();
|
reader.close();
|
||||||
|
|
||||||
// optimize the index and check that the new doc count is correct
|
// optimize the index and check that the new doc count is correct
|
||||||
writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer()));
|
writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer()));
|
||||||
assertEquals(100, writer.maxDoc());
|
|
||||||
assertEquals(60, writer.numDocs());
|
assertEquals(60, writer.numDocs());
|
||||||
writer.optimize();
|
writer.optimize();
|
||||||
assertEquals(60, writer.maxDoc());
|
assertEquals(60, writer.maxDoc());
|
||||||
|
@ -1431,7 +1424,6 @@ public class TestIndexWriter extends LuceneTestCase {
|
||||||
w.close();
|
w.close();
|
||||||
|
|
||||||
IndexReader ir = IndexReader.open(dir, true);
|
IndexReader ir = IndexReader.open(dir, true);
|
||||||
assertEquals(1, ir.maxDoc());
|
|
||||||
assertEquals(0, ir.numDocs());
|
assertEquals(0, ir.numDocs());
|
||||||
ir.close();
|
ir.close();
|
||||||
|
|
||||||
|
|
|
@ -567,24 +567,25 @@ public class TestIndexWriterExceptions extends LuceneTestCase {
|
||||||
System.out.println("TEST: open reader");
|
System.out.println("TEST: open reader");
|
||||||
}
|
}
|
||||||
IndexReader reader = IndexReader.open(dir, true);
|
IndexReader reader = IndexReader.open(dir, true);
|
||||||
int expected = 3+(1-i)*2;
|
if (i == 0) {
|
||||||
assertEquals(expected, reader.docFreq(new Term("contents", "here")));
|
int expected = 5;
|
||||||
assertEquals(expected, reader.maxDoc());
|
assertEquals(expected, reader.docFreq(new Term("contents", "here")));
|
||||||
int numDel = 0;
|
assertEquals(expected, reader.maxDoc());
|
||||||
final Bits delDocs = MultiFields.getDeletedDocs(reader);
|
int numDel = 0;
|
||||||
assertNotNull(delDocs);
|
final Bits delDocs = MultiFields.getDeletedDocs(reader);
|
||||||
for(int j=0;j<reader.maxDoc();j++) {
|
assertNotNull(delDocs);
|
||||||
if (delDocs.get(j))
|
for(int j=0;j<reader.maxDoc();j++) {
|
||||||
numDel++;
|
if (delDocs.get(j))
|
||||||
else {
|
numDel++;
|
||||||
reader.document(j);
|
else {
|
||||||
reader.getTermFreqVectors(j);
|
reader.document(j);
|
||||||
|
reader.getTermFreqVectors(j);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
assertEquals(1, numDel);
|
||||||
}
|
}
|
||||||
reader.close();
|
reader.close();
|
||||||
|
|
||||||
assertEquals(1, numDel);
|
|
||||||
|
|
||||||
writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT,
|
writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT,
|
||||||
analyzer).setMaxBufferedDocs(10));
|
analyzer).setMaxBufferedDocs(10));
|
||||||
doc = new Document();
|
doc = new Document();
|
||||||
|
@ -596,10 +597,10 @@ public class TestIndexWriterExceptions extends LuceneTestCase {
|
||||||
writer.close();
|
writer.close();
|
||||||
|
|
||||||
reader = IndexReader.open(dir, true);
|
reader = IndexReader.open(dir, true);
|
||||||
expected = 19+(1-i)*2;
|
int expected = 19+(1-i)*2;
|
||||||
assertEquals(expected, reader.docFreq(new Term("contents", "here")));
|
assertEquals(expected, reader.docFreq(new Term("contents", "here")));
|
||||||
assertEquals(expected, reader.maxDoc());
|
assertEquals(expected, reader.maxDoc());
|
||||||
numDel = 0;
|
int numDel = 0;
|
||||||
assertNull(MultiFields.getDeletedDocs(reader));
|
assertNull(MultiFields.getDeletedDocs(reader));
|
||||||
for(int j=0;j<reader.maxDoc();j++) {
|
for(int j=0;j<reader.maxDoc();j++) {
|
||||||
reader.document(j);
|
reader.document(j);
|
||||||
|
|
|
@ -6,6 +6,7 @@ import java.io.IOException;
|
||||||
import java.io.ObjectInputStream;
|
import java.io.ObjectInputStream;
|
||||||
import java.io.ObjectOutputStream;
|
import java.io.ObjectOutputStream;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
|
import java.lang.reflect.Method;
|
||||||
|
|
||||||
import junit.framework.Assert;
|
import junit.framework.Assert;
|
||||||
|
|
||||||
|
@ -172,6 +173,16 @@ public class QueryUtils {
|
||||||
}
|
}
|
||||||
w.commit();
|
w.commit();
|
||||||
w.deleteDocuments( new MatchAllDocsQuery() );
|
w.deleteDocuments( new MatchAllDocsQuery() );
|
||||||
|
try {
|
||||||
|
// Carefully invoke what is a package-private (test
|
||||||
|
// only, internal) method on IndexWriter:
|
||||||
|
Method m = IndexWriter.class.getDeclaredMethod("keepFullyDeletedSegments");
|
||||||
|
m.setAccessible(true);
|
||||||
|
m.invoke(w);
|
||||||
|
} catch (Exception e) {
|
||||||
|
// Should not happen?
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
w.commit();
|
w.commit();
|
||||||
|
|
||||||
if (0 < numDeletedDocs)
|
if (0 < numDeletedDocs)
|
||||||
|
|
Loading…
Reference in New Issue