LUCENE-10078: Enable merge-on-refresh by default. (#921)

This adds implementations of `findFullFlushMerges` to `LogMergePolicy` and
`TieredMergePolicy` and enables merge-on-refresh by default with a timeout of
500ms.

The idea behind the 500ms default is that it is high enough to leave time to
run merges of small segments, yet low enough that data freshness does not
appear noticeably affected for users with high refresh rates (e.g. refreshing
every second).

In both cases, `findFullFlushMerges` delegates to `findMerges` and keeps only
the merges whose segments are all below the min/floor size.
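
For illustration, here is a minimal sketch (not part of this commit; the class name, analyzer and directory choices are assumptions) of how an application can rely on the new default or opt out of merge-on-refresh through `IndexWriterConfig`:

```java
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.ByteBuffersDirectory;

public class MergeOnRefreshConfigSketch {
  public static void main(String[] args) throws Exception {
    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    // The default is now 500ms (DEFAULT_MAX_FULL_FLUSH_MERGE_WAIT_MILLIS).
    // Set 0 to disable merge-on-refresh, or a higher value to give small-segment
    // merges more time to finish before commit/getReader returns.
    iwc.setMaxFullFlushMergeWaitMillis(1000);
    try (IndexWriter writer = new IndexWriter(new ByteBuffersDirectory(), iwc)) {
      // index documents here; NRT readers opened via DirectoryReader.open(writer)
      // will see the small flushed segments already merged when the timeout allows.
    }
  }
}
```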
Adrien Grand 2022-06-07 16:59:55 +02:00 committed by GitHub
parent 7e9d5ab768
commit b5795db0cf
15 changed files with 192 additions and 25 deletions

lucene/CHANGES.txt

@@ -76,6 +76,9 @@ New Features
Improvements
---------------------
* LUCENE-10078: Merge on full flush is now enabled by default with a timeout of
500ms. (Adrien Grand)
* LUCENE-10585: Facet module code cleanup (copy/paste scrubbing, simplification and some very minor
optimization tweaks). (Greg Miller)

FilterMergePolicy.java

@@ -124,4 +124,9 @@ public class FilterMergePolicy extends MergePolicy implements Unwrappable<MergePolicy> {
public MergePolicy unwrap() {
return in;
}
@Override
protected long maxFullFlushMergeSize() {
return in.maxFullFlushMergeSize();
}
}

IndexWriter.java

@@ -2372,6 +2372,10 @@ public class IndexWriter
* @lucene.experimental
*/
private synchronized MergePolicy.OneMerge getNextMerge() {
if (tragedy.get() != null) {
throw new IllegalStateException(
"this writer hit an unrecoverable error; cannot merge", tragedy.get());
}
if (pendingMerges.size() == 0) {
return null;
} else {
@@ -2388,6 +2392,10 @@ public class IndexWriter
* @lucene.experimental
*/
public synchronized boolean hasPendingMerges() {
if (tragedy.get() != null) {
throw new IllegalStateException(
"this writer hit an unrecoverable error; cannot merge", tragedy.get());
}
return pendingMerges.size() != 0;
}

IndexWriterConfig.java

@@ -105,7 +105,7 @@ public final class IndexWriterConfig extends LiveIndexWriterConfig {
* Default value for time to wait for merges on commit or getReader (when using a {@link
* MergePolicy} that implements {@link MergePolicy#findFullFlushMerges}).
*/
public static final long DEFAULT_MAX_FULL_FLUSH_MERGE_WAIT_MILLIS = 0;
public static final long DEFAULT_MAX_FULL_FLUSH_MERGE_WAIT_MILLIS = 500;
// indicates whether this config instance is already attached to a writer.
// not final so that it can be cloned properly.
@@ -457,9 +457,14 @@ public final class IndexWriterConfig extends LiveIndexWriterConfig {
* call, like natural segment merges. The default is <code>
* {@value IndexWriterConfig#DEFAULT_MAX_FULL_FLUSH_MERGE_WAIT_MILLIS}</code>.
*
* <p>Note: This settings has no effect unless {@link
* MergePolicy#findFullFlushMerges(MergeTrigger, SegmentInfos, MergePolicy.MergeContext)} has an
* implementation that actually returns merges which by default doesn't return any merges.
* <p>Note: Which segments get merged depends on the implementation of {@link
* MergePolicy#findFullFlushMerges(MergeTrigger, SegmentInfos, MergePolicy.MergeContext)}.
*
* <p>Note: Set to 0 to disable merging on full flush.
*
* <p>Note: If {@link SerialMergeScheduler} is used and a non-zero timeout is configured,
* full-flush merges will always wait for the merge to finish without honoring the configured
* timeout.
*/
public IndexWriterConfig setMaxFullFlushMergeWaitMillis(long maxFullFlushMergeWaitMillis) {
this.maxFullFlushMergeWaitMillis = maxFullFlushMergeWaitMillis;

LogByteSizeMergePolicy.java

@@ -96,9 +96,10 @@ public class LogByteSizeMergePolicy extends LogMergePolicy {
}
/**
* Sets the minimum size for the lowest level segments. Any segments below this size will be
* merged more aggressively in order to avoid having a long tail of small segments. Large values
* of this parameter increase the merging cost during indexing if you flush small segments.
* Sets the minimum size for the lowest level segments. Any segments below this size are
* candidates for full-flush merges and will be merged more aggressively in order to avoid
* having a long tail of small segments. Large values of this parameter increase the merging
* cost during indexing if you flush small segments.
*/
public void setMinMergeMB(double mb) {
minMergeSize = (long) (mb * 1024 * 1024);

LogDocMergePolicy.java

@@ -43,9 +43,10 @@ public class LogDocMergePolicy extends LogMergePolicy {
}
/**
* Sets the minimum size for the lowest level segments. Any segments below this size will be
* merged more aggressively in order to avoid having a long tail of small segments. Large values
* of this parameter increase the merging cost during indexing if you flush small segments.
* Sets the minimum size for the lowest level segments. Any segments below this size are
* candidates for full-flush merges and will be merged more aggressively in order to avoid
* having a long tail of small segments. Large values of this parameter increase the merging
* cost during indexing if you flush small segments.
*/
public void setMinMergeDocs(int minMergeDocs) {
minMergeSize = minMergeDocs;

LogMergePolicy.java

@@ -35,6 +35,9 @@ import java.util.Set;
* specifies how a segment's size is determined. {@link LogDocMergePolicy} is one subclass that
* measures size by document count in the segment. {@link LogByteSizeMergePolicy} is another
* subclass that measures size as the total byte size of the file(s) for the segment.
*
* <p><b>NOTE</b>: This policy returns natural merges whose size is below the {@link #minMergeSize
* minimum merge size} for {@link #findFullFlushMerges full-flush merges}.
*/
public abstract class LogMergePolicy extends MergePolicy {
@@ -64,7 +67,10 @@ public abstract class LogMergePolicy extends MergePolicy {
/** How many segments to merge at a time. */
protected int mergeFactor = DEFAULT_MERGE_FACTOR;
/** Any segments whose size is smaller than this value will be merged more aggressively. */
/**
* Any segments whose size is smaller than this value will be candidates for full-flush merges and
* merged more aggressively.
*/
protected long minMergeSize;
/** If the size of a segment exceeds this value then it will never be merged. */
@@ -178,6 +184,11 @@ public abstract class LogMergePolicy extends MergePolicy {
&& (numToMerge != 1 || !segmentIsOriginal || isMerged(infos, mergeInfo, mergeContext));
}
@Override
protected long maxFullFlushMergeSize() {
return minMergeSize;
}
/**
* Returns the merges necessary to merge the index, taking the max merge size or max merge docs
* into consideration. This method attempts to respect the {@code maxNumSegments} parameter,

MergePolicy.java

@@ -601,9 +601,9 @@
SegmentInfos segmentInfos, MergeContext mergeContext) throws IOException;
/**
* Identifies merges that we want to execute (synchronously) on commit. By default, this will do
* no merging on commit. If you implement this method in your {@code MergePolicy} you must also
* set a non-zero timeout using {@link IndexWriterConfig#setMaxFullFlushMergeWaitMillis}.
* Identifies merges that we want to execute (synchronously) on commit. By default, this will
* return {@link #findMerges natural merges} whose segments are all smaller than the {@link
* #maxFullFlushMergeSize() max segment size for full flushes}.
*
* <p>Any merges returned here will make {@link IndexWriter#commit()}, {@link
* IndexWriter#prepareCommit()} or {@link IndexWriter#getReader(boolean, boolean)} block until the
@@ -628,7 +628,28 @@ public abstract class MergePolicy {
public MergeSpecification findFullFlushMerges(
MergeTrigger mergeTrigger, SegmentInfos segmentInfos, MergeContext mergeContext)
throws IOException {
return null;
// This returns natural merges that contain segments below the minimum size
MergeSpecification mergeSpec = findMerges(mergeTrigger, segmentInfos, mergeContext);
if (mergeSpec == null) {
return null;
}
MergeSpecification newMergeSpec = null;
for (OneMerge oneMerge : mergeSpec.merges) {
boolean belowMaxFullFlushSize = true;
for (SegmentCommitInfo sci : oneMerge.segments) {
if (size(sci, mergeContext) >= maxFullFlushMergeSize()) {
belowMaxFullFlushSize = false;
break;
}
}
if (belowMaxFullFlushSize) {
if (newMergeSpec == null) {
newMergeSpec = new MergeSpecification();
}
newMergeSpec.add(oneMerge);
}
}
return newMergeSpec;
}
/**
@@ -671,6 +692,14 @@ public abstract class MergePolicy {
return (info.info.maxDoc() <= 0 ? byteSize : (long) (byteSize * (1.0 - delRatio)));
}
/**
* Return the maximum size of segments to be included in full-flush merges by the default
* implementation of {@link #findFullFlushMerges}.
*/
protected long maxFullFlushMergeSize() {
return 0L;
}
/** Asserts that the delCount for this SegmentCommitInfo is valid */
protected final boolean assertDelCount(int delCount, SegmentCommitInfo info) {
assert delCount >= 0 : "delCount must be positive: " + delCount;
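
Given the new `maxFullFlushMergeSize()` hook, here is a hedged sketch (hypothetical subclass, not part of this commit) of how a policy can widen the segment-size cutoff that the default `findFullFlushMerges` applies:

```java
import org.apache.lucene.index.TieredMergePolicy;

// Hypothetical example: let full-flush merges consider segments of up to 16 MB
// instead of TieredMergePolicy's floor segment size (2 MB by default).
public class WiderFullFlushMergePolicy extends TieredMergePolicy {
  @Override
  protected long maxFullFlushMergeSize() {
    return 16 * 1024 * 1024; // 16 MB
  }
}
```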

TieredMergePolicy.java

@@ -62,6 +62,10 @@ import java.util.Set;
*
* <p>findForcedDeletesMerges should never produce segments greater than maxSegmentSize.
*
* <p><b>NOTE</b>: This policy returns natural merges whose size is below the {@link
* #setFloorSegmentMB(double) floor segment size} for {@link #findFullFlushMerges full-flush
* merges}.
*
* @lucene.experimental
*/
@@ -168,9 +172,16 @@ public class TieredMergePolicy extends MergePolicy {
}
/**
* Segments smaller than this are "rounded up" to this size, ie treated as equal (floor) size for
* merge selection. This is to prevent frequent flushing of tiny segments from allowing a long
* tail in the index. Default is 2 MB.
* Segments smaller than this size are merged more aggressively:
*
* <ul>
* <li>They are candidates for full-flush merges, in order to reduce the number of segments in
* the index prior to opening a new point-in-time view of the index.
* <li>For background merges, smaller segments are "rounded up" to this size.
* </ul>
*
In both cases, this helps prevent frequent flushing of tiny segments from creating a long
tail of small segments in the index. Default is 2 MB.
*/
public TieredMergePolicy setFloorSegmentMB(double v) {
if (v <= 0.0) {
@@ -190,6 +201,11 @@ public class TieredMergePolicy extends MergePolicy {
return floorSegmentBytes / (1024 * 1024.);
}
@Override
protected long maxFullFlushMergeSize() {
return floorSegmentBytes;
}
/**
* When forceMergeDeletes is called, we only merge away a segment if its delete percentage is over
* this threshold. Default is 10%.

TestDirectoryReaderReopen.java

@@ -815,7 +815,8 @@ public class TestDirectoryReaderReopen extends LuceneTestCase {
/** test reopening backwards from a non-NRT reader (with document deletes) */
public void testNRTMdeletes() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
IndexWriterConfig iwc =
new IndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(NoMergePolicy.INSTANCE);
SnapshotDeletionPolicy snapshotter =
new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy());
iwc.setIndexDeletionPolicy(snapshotter);
@@ -865,7 +866,8 @@ public class TestDirectoryReaderReopen extends LuceneTestCase {
/** test reopening backwards from an NRT reader (with document deletes) */
public void testNRTMdeletes2() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
IndexWriterConfig iwc =
new IndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(NoMergePolicy.INSTANCE);
SnapshotDeletionPolicy snapshotter =
new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy());
iwc.setIndexDeletionPolicy(snapshotter);

TestLogMergePolicy.java

@@ -27,7 +27,7 @@ import org.apache.lucene.util.Version;
public class TestLogMergePolicy extends BaseMergePolicyTestCase {
@Override
public MergePolicy mergePolicy() {
public LogMergePolicy mergePolicy() {
return newLogMergePolicy(random());
}
@@ -187,4 +187,31 @@ public class TestLogMergePolicy extends BaseMergePolicyTestCase {
assertEquals(100, segmentInfos.info(0).info.maxDoc());
assertEquals(10, segmentInfos.info(1).info.maxDoc());
}
public void testFullFlushMerges() throws IOException {
AtomicLong segNameGenerator = new AtomicLong();
IOStats stats = new IOStats();
MergeContext mergeContext = new MockMergeContext(SegmentCommitInfo::getDelCount);
SegmentInfos segmentInfos = new SegmentInfos(Version.LATEST.major);
LogMergePolicy mp = mergePolicy();
for (int i = 0; i < mp.getMergeFactor(); ++i) {
segmentInfos.add(
makeSegmentCommitInfo(
"_" + segNameGenerator.getAndIncrement(),
1,
0,
Double.MIN_VALUE,
IndexWriter.SOURCE_FLUSH));
}
MergeSpecification spec =
mp.findFullFlushMerges(MergeTrigger.FULL_FLUSH, segmentInfos, mergeContext);
assertNotNull(spec);
for (OneMerge merge : spec.merges) {
segmentInfos =
applyMerge(segmentInfos, merge, "_" + segNameGenerator.getAndIncrement(), stats);
}
assertEquals(1, segmentInfos.size());
}
}

TestTieredMergePolicy.java

@@ -24,9 +24,11 @@ import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.MergePolicy.MergeContext;
import org.apache.lucene.index.MergePolicy.MergeSpecification;
import org.apache.lucene.index.MergePolicy.OneMerge;
import org.apache.lucene.store.Directory;
@@ -917,4 +919,31 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase {
int numDocs = TEST_NIGHTLY ? atLeast(10_000_000) : atLeast(1_000_000);
doTestSimulateUpdates(mergePolicy, numDocs, 2500);
}
public void testFullFlushMerges() throws IOException {
AtomicLong segNameGenerator = new AtomicLong();
IOStats stats = new IOStats();
MergeContext mergeContext = new MockMergeContext(SegmentCommitInfo::getDelCount);
SegmentInfos segmentInfos = new SegmentInfos(Version.LATEST.major);
TieredMergePolicy mp = new TieredMergePolicy();
for (int i = 0; i < 11; ++i) {
segmentInfos.add(
makeSegmentCommitInfo(
"_" + segNameGenerator.getAndIncrement(),
1,
0,
Double.MIN_VALUE,
IndexWriter.SOURCE_FLUSH));
}
MergeSpecification spec =
mp.findFullFlushMerges(MergeTrigger.FULL_FLUSH, segmentInfos, mergeContext);
assertNotNull(spec);
for (OneMerge merge : spec.merges) {
segmentInfos =
applyMerge(segmentInfos, merge, "_" + segNameGenerator.getAndIncrement(), stats);
}
assertEquals(2, segmentInfos.size());
}
}

TestTragicIndexWriterDeadlock.java

@@ -104,8 +104,17 @@ public class TestTragicIndexWriterDeadlock extends LuceneTestCase {
// LUCENE-7570
public void testDeadlockStalledMerges() throws Exception {
doTestDeadlockStalledMerges(false);
}
public void testDeadlockStalledFullFlushMerges() throws Exception {
doTestDeadlockStalledMerges(true);
}
private void doTestDeadlockStalledMerges(boolean mergeOnFlush) throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig();
IndexWriterConfig iwc =
new IndexWriterConfig().setMaxFullFlushMergeWaitMillis(mergeOnFlush ? 1000 : 0);
// so we merge every 2 segments:
LogMergePolicy mp = new LogDocMergePolicy();
@@ -163,7 +172,8 @@ public class TestTragicIndexWriterDeadlock extends LuceneTestCase {
w.addDocument(new Document());
// w writes third segment
w.addDocument(new Document());
w.commit();
IllegalStateException e = expectThrows(IllegalStateException.class, () -> w.commit());
assertTrue(e.getMessage(), e.getMessage().startsWith("this writer hit an unrecoverable error"));
// w writes fourth segment, and commit flushes and kicks off merge that stalls
w.close();
dir.close();

BaseMergePolicyTestCase.java

@@ -428,7 +428,10 @@ public abstract class BaseMergePolicyTestCase extends LuceneTestCase {
IndexWriter.SOURCE_FLUSH));
MergeSpecification merges =
mergePolicy.findMerges(MergeTrigger.SEGMENT_FLUSH, segmentInfos, mergeContext);
mergePolicy.findFullFlushMerges(MergeTrigger.SEGMENT_FLUSH, segmentInfos, mergeContext);
if (merges == null) {
merges = mergePolicy.findMerges(MergeTrigger.SEGMENT_FLUSH, segmentInfos, mergeContext);
}
while (merges != null) {
assertTrue(merges.merges.size() > 0);
assertMerge(mergePolicy, merges);
@@ -490,7 +493,10 @@ public abstract class BaseMergePolicyTestCase extends LuceneTestCase {
flushSize,
IndexWriter.SOURCE_FLUSH));
MergeSpecification merges =
mergePolicy.findMerges(MergeTrigger.SEGMENT_FLUSH, segmentInfos, mergeContext);
mergePolicy.findFullFlushMerges(MergeTrigger.SEGMENT_FLUSH, segmentInfos, mergeContext);
if (merges == null) {
merges = mergePolicy.findMerges(MergeTrigger.SEGMENT_FLUSH, segmentInfos, mergeContext);
}
while (merges != null) {
assertMerge(mergePolicy, merges);
for (OneMerge oneMerge : merges.merges) {

LuceneTestCase.java

@@ -1041,6 +1041,20 @@ public abstract class LuceneTestCase extends Assert {
if (rarely(r)) {
c.setIndexWriterEventListener(new MockIndexWriterEventListener());
}
switch (r.nextInt(3)) {
case 0:
// Disable merge on refresh
c.setMaxFullFlushMergeWaitMillis(0L);
break;
case 1:
// Very low timeout, merges will likely not be able to run in time
c.setMaxFullFlushMergeWaitMillis(1L);
break;
default:
// Very long timeout, merges will almost always be able to run in time
c.setMaxFullFlushMergeWaitMillis(1000L);
break;
}
c.setMaxFullFlushMergeWaitMillis(rarely() ? atLeast(r, 1000) : atLeast(r, 200));
return c;