mirror of https://github.com/apache/lucene.git
LUCENE-325: add expungeDeletes methods to IndexWriter
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@620604 13f79535-47bb-0310-9956-ffa450edef68
parent 862c44215a
commit 1d4ab68796
@@ -49,6 +49,12 @@ API Changes
    see the changes. Deprecate IndexWriter.flush() in favor of
    IndexWriter.commit(). (Mike McCandless)

 5. LUCENE-325: Added IndexWriter.expungeDeletes methods, which
    consult the MergePolicy to find merges necessary to merge away all
    deletes from the index. This should be a somewhat lower cost
    operation than optimize. (John Wang via Mike McCandless)

Bug fixes

 1. LUCENE-1134: Fixed BooleanQuery.rewrite to only optimize a single
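A minimal usage sketch of the new API described in the entry above, for context only (not part of the patch): it assumes an existing Directory dir, a unique "id" field, and the usual org.apache.lucene.index / document / analysis imports; the field name and term value are illustrative.

    // Delete some documents, flush the buffered deletes, then merge the
    // deleted docs away without collapsing the index to a single segment.
    IndexWriter writer = new IndexWriter(dir, false, new StandardAnalyzer(),
                                         IndexWriter.MaxFieldLength.LIMITED);
    writer.deleteDocuments(new Term("id", "42"));  // illustrative deletion
    writer.commit();                               // expungeDeletes does not commit for you
    writer.expungeDeletes();                       // blocks until the merges finish
    writer.close();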
@@ -26,7 +26,6 @@ import org.apache.lucene.store.Lock;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.util.BitVector;
import org.apache.lucene.util.Parameter;
import org.apache.lucene.util.Constants;

import java.io.File;
@@ -2163,6 +2162,7 @@ public class IndexWriter {
        try {
          wait();
        } catch (InterruptedException ie) {
          Thread.currentThread().interrupt();
        }

      if (mergeExceptions.size() > 0) {
@@ -2205,6 +2205,87 @@ public class IndexWriter {
      return false;
  }

  /** Just like {@link #expungeDeletes()}, except you can
   * specify whether the call should block until the
   * operation completes. This is only meaningful with a
   * {@link MergeScheduler} that is able to run merges in
   * background threads. */
  public void expungeDeletes(boolean doWait)
    throws CorruptIndexException, IOException {
    ensureOpen();

    if (infoStream != null)
      message("expungeDeletes: index now " + segString());

    MergePolicy.MergeSpecification spec;

    synchronized(this) {
      spec = mergePolicy.findMergesToExpungeDeletes(segmentInfos, this);
      if (spec != null) {
        final int numMerges = spec.merges.size();
        for(int i=0;i<numMerges;i++)
          registerMerge((MergePolicy.OneMerge) spec.merges.get(i));
      }
    }

    mergeScheduler.merge(this);

    if (spec != null && doWait) {
      final int numMerges = spec.merges.size();
      synchronized(this) {
        boolean running = true;
        while(running) {

          running = false;
          for(int i=0;i<numMerges;i++) {
            final MergePolicy.OneMerge merge = (MergePolicy.OneMerge) spec.merges.get(i);
            if (pendingMerges.contains(merge) || runningMerges.contains(merge))
              running = true;
            Throwable t = merge.getException();
            if (t != null) {
              IOException ioe = new IOException("background merge hit exception: " + merge.segString(directory));
              ioe.initCause(t);
              throw ioe;
            }
          }

          if (running) {
            try {
              wait();
            } catch (InterruptedException ie) {
              Thread.currentThread().interrupt();
            }
          }
        }
      }
    }

    // NOTE: in the ConcurrentMergeScheduler case, when
    // doWait is false, we can return immediately while
    // background threads accomplish the merging
  }

  /** Expunges all deletes from the index. When an index
   * has many document deletions (or updates to existing
   * documents), it's best to either call optimize or
   * expungeDeletes to remove all unused data in the index
   * associated with the deleted documents. To see how
   * many deletions you have pending in your index, call
   * {@link IndexReader#maxDoc} - {@link IndexReader#numDocs}.
   * This saves disk space and memory usage while
   * searching. expungeDeletes should be somewhat faster
   * than optimize since it does not insist on reducing the
   * index to a single segment (though, this depends on the
   * {@link MergePolicy}; see {@link
   * MergePolicy#findMergesToExpungeDeletes}). Note that
   * this call does not first commit any buffered
   * documents, so you must do so yourself if necessary.
   * See also {@link #expungeDeletes(boolean)}. */
  public void expungeDeletes() throws CorruptIndexException, IOException {
    expungeDeletes(true);
  }

  /**
   * Expert: asks the mergePolicy whether any merges are
   * necessary now and if so, runs the requested merges and
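A hedged sketch of the pending-deletes check the javadoc above refers to, combined with the non-blocking variant (not part of the patch; the threshold, setup, and the default ConcurrentMergeScheduler are illustrative assumptions):

    // maxDoc() counts deleted documents, numDocs() does not, so the
    // difference is the number of deletes still occupying space.
    IndexReader reader = IndexReader.open(dir);
    final int pendingDeletes = reader.maxDoc() - reader.numDocs();
    reader.close();

    if (pendingDeletes > 1000) {                    // illustrative threshold
      IndexWriter writer = new IndexWriter(dir, false, new StandardAnalyzer(),
                                           IndexWriter.MaxFieldLength.LIMITED);
      writer.expungeDeletes(false);                 // returns immediately; merges run in background threads
      writer.close();                               // close() waits for any merges still running
    }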
@@ -245,6 +245,54 @@ public abstract class LogMergePolicy extends MergePolicy {
    return spec;
  }

  /**
   * Finds merges necessary to expunge all deletes from the
   * index. We simply merge adjacent segments that have
   * deletes, up to mergeFactor at a time.
   */
  public MergeSpecification findMergesToExpungeDeletes(SegmentInfos segmentInfos,
                                                       IndexWriter writer)
    throws CorruptIndexException, IOException
  {
    this.writer = writer;

    final int numSegments = segmentInfos.size();

    message("findMergesToExpungeDeletes: " + numSegments + " segments");

    MergeSpecification spec = new MergeSpecification();
    int firstSegmentWithDeletions = -1;
    for(int i=0;i<numSegments;i++) {
      final SegmentInfo info = segmentInfos.info(i);
      if (info.hasDeletions()) {
        message("  segment " + info.name + " has deletions");
        if (firstSegmentWithDeletions == -1)
          firstSegmentWithDeletions = i;
        else if (i - firstSegmentWithDeletions == mergeFactor) {
          // We've seen mergeFactor segments in a row with
          // deletions, so force a merge now:
          message("  add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
          spec.add(new OneMerge(segmentInfos.range(firstSegmentWithDeletions, i), useCompoundFile));
          firstSegmentWithDeletions = i;
        }
      } else if (firstSegmentWithDeletions != -1) {
        // End of a sequence of segments with deletions, so,
        // merge those past segments even if it's fewer than
        // mergeFactor segments
        message("  add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
        spec.add(new OneMerge(segmentInfos.range(firstSegmentWithDeletions, i), useCompoundFile));
        firstSegmentWithDeletions = -1;
      }
    }

    if (firstSegmentWithDeletions != -1) {
      message("  add merge " + firstSegmentWithDeletions + " to " + (numSegments-1) + " inclusive");
      spec.add(new OneMerge(segmentInfos.range(firstSegmentWithDeletions, numSegments), useCompoundFile));
    }

    return spec;
  }

  /** Checks if any merges are now necessary and returns a
   * {@link MergePolicy.MergeSpecification} if so. A merge
   * is necessary when there are more than {@link
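As a worked illustration of the grouping logic above (hypothetical segment layout, not part of the patch): with mergeFactor=3 and six segments _0 through _5, where _0, _1, _2, _3 and _5 have deletions and _4 does not, the loop selects three merges:

    _0 _1 _2   // mergeFactor dirty segments in a row force a merge
    _3         // the clean segment _4 ends the run, so _3 merges alone
    _5         // flushed after the loop because the run reaches the last segment

A single-segment merge simply rewrites that segment without its deleted documents, which is what the "2 singular merges" test further below exercises.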
@@ -50,6 +50,8 @@ import java.util.Set;
 *
 * <p>The default MergePolicy is {@link
 * LogByteSizeMergePolicy}.</p>
 * <p><b>NOTE:</b> This API is new and still experimental
 * (subject to change suddenly in the next release)</p>
 */

public abstract class MergePolicy {
@@ -209,7 +211,7 @@ public abstract class MergePolicy {
    throws CorruptIndexException, IOException;

  /**
   * Determine what set of merge operations are necessary in
   * Determine what set of merge operations is necessary in
   * order to optimize the index. The IndexWriter calls
   * this when its optimize() method is called. This call
   * is always synchronized on the IndexWriter instance so
@@ -229,6 +231,19 @@ public abstract class MergePolicy {
                                            Set segmentsToOptimize)
    throws CorruptIndexException, IOException;

  /**
   * Determine what set of merge operations is necessary in
   * order to expunge all deletes from the index.
   * @param segmentInfos the total set of segments in the index
   * @param writer IndexWriter instance
   */
  MergeSpecification findMergesToExpungeDeletes(SegmentInfos segmentInfos,
                                                IndexWriter writer)
    throws CorruptIndexException, IOException
  {
    throw new RuntimeException("not implemented");
  }

  /**
   * Release all resources for the policy.
   */
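Because the base-class default above throws RuntimeException, a MergePolicy subclass must either inherit LogMergePolicy's implementation or override this hook before IndexWriter.expungeDeletes can be used with it. A minimal hypothetical sketch (the class name and the skip-if-clean behaviour are illustrative, and the class is assumed to live in org.apache.lucene.index so it can see the package-level signature):

    public class SkipIfCleanMergePolicy extends LogDocMergePolicy {
      public MergeSpecification findMergesToExpungeDeletes(SegmentInfos segmentInfos,
                                                           IndexWriter writer)
        throws CorruptIndexException, IOException {
        // Only consult the parent policy when some segment actually has
        // deletions; returning null tells IndexWriter there is nothing to merge.
        for(int i=0;i<segmentInfos.size();i++)
          if (segmentInfos.info(i).hasDeletions())
            return super.findMergesToExpungeDeletes(segmentInfos, writer);
        return null;
      }
    }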
@@ -19,10 +19,13 @@ package org.apache.lucene.index;

import java.io.IOException;

/** Expert: {@link IndexWriter} uses an instance
/** <p>Expert: {@link IndexWriter} uses an instance
 * implementing this interface to execute the merges
 * selected by a {@link MergePolicy}. The default
 * MergeScheduler is {@link ConcurrentMergeScheduler}. */
 * MergeScheduler is {@link ConcurrentMergeScheduler}.</p>
 * <p><b>NOTE:</b> This API is new and still experimental
 * (subject to change suddenly in the next release)</p>
 */

public abstract class MergeScheduler {

@@ -2977,4 +2977,140 @@ public class TestIndexWriter extends LuceneTestCase
    reader.close();
    dir.close();
  }

  // LUCENE-325: test expungeDeletes, when 2 singular merges
  // are required
  public void testExpungeDeletes() throws IOException {
    Directory dir = new MockRAMDirectory();
    IndexWriter writer = new IndexWriter(dir,
                                         false, new StandardAnalyzer(),
                                         IndexWriter.MaxFieldLength.LIMITED);
    writer.setMaxBufferedDocs(2);
    writer.setRAMBufferSizeMB(IndexWriter.DISABLE_AUTO_FLUSH);

    Document document = new Document();

    document = new Document();
    Field storedField = new Field("stored", "stored", Field.Store.YES,
                                  Field.Index.NO);
    document.add(storedField);
    Field termVectorField = new Field("termVector", "termVector",
                                      Field.Store.NO, Field.Index.UN_TOKENIZED,
                                      Field.TermVector.WITH_POSITIONS_OFFSETS);
    document.add(termVectorField);
    for(int i=0;i<10;i++)
      writer.addDocument(document);
    writer.close();

    IndexReader ir = IndexReader.open(dir);
    assertEquals(10, ir.maxDoc());
    assertEquals(10, ir.numDocs());
    ir.deleteDocument(0);
    ir.deleteDocument(7);
    assertEquals(8, ir.numDocs());
    ir.close();

    writer = new IndexWriter(dir,
                             false, new StandardAnalyzer(),
                             IndexWriter.MaxFieldLength.LIMITED);
    writer.expungeDeletes();
    writer.close();
    ir = IndexReader.open(dir);
    assertEquals(8, ir.maxDoc());
    assertEquals(8, ir.numDocs());
    ir.close();
    dir.close();
  }

  // LUCENE-325: test expungeDeletes, when many adjacent merges are required
  public void testExpungeDeletes2() throws IOException {
    Directory dir = new MockRAMDirectory();
    IndexWriter writer = new IndexWriter(dir,
                                         false, new StandardAnalyzer(),
                                         IndexWriter.MaxFieldLength.LIMITED);
    writer.setMaxBufferedDocs(2);
    writer.setMergeFactor(50);
    writer.setRAMBufferSizeMB(IndexWriter.DISABLE_AUTO_FLUSH);

    Document document = new Document();

    document = new Document();
    Field storedField = new Field("stored", "stored", Field.Store.YES,
                                  Field.Index.NO);
    document.add(storedField);
    Field termVectorField = new Field("termVector", "termVector",
                                      Field.Store.NO, Field.Index.UN_TOKENIZED,
                                      Field.TermVector.WITH_POSITIONS_OFFSETS);
    document.add(termVectorField);
    for(int i=0;i<98;i++)
      writer.addDocument(document);
    writer.close();

    IndexReader ir = IndexReader.open(dir);
    assertEquals(98, ir.maxDoc());
    assertEquals(98, ir.numDocs());
    for(int i=0;i<98;i+=2)
      ir.deleteDocument(i);
    assertEquals(49, ir.numDocs());
    ir.close();

    writer = new IndexWriter(dir,
                             false, new StandardAnalyzer(),
                             IndexWriter.MaxFieldLength.LIMITED);
    writer.setMergeFactor(3);
    writer.expungeDeletes();
    writer.close();
    ir = IndexReader.open(dir);
    assertEquals(49, ir.maxDoc());
    assertEquals(49, ir.numDocs());
    ir.close();
    dir.close();
  }

  // LUCENE-325: test expungeDeletes without waiting, when
  // many adjacent merges are required
  public void testExpungeDeletes3() throws IOException {
    Directory dir = new MockRAMDirectory();
    IndexWriter writer = new IndexWriter(dir,
                                         false, new StandardAnalyzer(),
                                         IndexWriter.MaxFieldLength.LIMITED);
    writer.setMaxBufferedDocs(2);
    writer.setMergeFactor(50);
    writer.setRAMBufferSizeMB(IndexWriter.DISABLE_AUTO_FLUSH);

    Document document = new Document();

    document = new Document();
    Field storedField = new Field("stored", "stored", Field.Store.YES,
                                  Field.Index.NO);
    document.add(storedField);
    Field termVectorField = new Field("termVector", "termVector",
                                      Field.Store.NO, Field.Index.UN_TOKENIZED,
                                      Field.TermVector.WITH_POSITIONS_OFFSETS);
    document.add(termVectorField);
    for(int i=0;i<98;i++)
      writer.addDocument(document);
    writer.close();

    IndexReader ir = IndexReader.open(dir);
    assertEquals(98, ir.maxDoc());
    assertEquals(98, ir.numDocs());
    for(int i=0;i<98;i+=2)
      ir.deleteDocument(i);
    assertEquals(49, ir.numDocs());
    ir.close();

    writer = new IndexWriter(dir,
                             false, new StandardAnalyzer(),
                             IndexWriter.MaxFieldLength.LIMITED);
    // Force many merges to happen
    writer.setMergeFactor(3);
    writer.expungeDeletes(false);
    writer.close();
    ir = IndexReader.open(dir);
    assertEquals(49, ir.maxDoc());
    assertEquals(49, ir.numDocs());
    ir.close();
    dir.close();
  }
}