LUCENE-10093: fix conflicting test assert to match how TieredMergePolicy (TMP) works; improve TMP javadocs (#375)

This commit is contained in:
Michael McCandless 2021-10-21 09:23:17 -04:00 committed by GitHub
parent 8b6c90eccd
commit e3151d6c7d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 152 additions and 78 deletions

View File

@ -447,6 +447,7 @@ public abstract class MergePolicy {
merges.add(merge); merges.add(merge);
} }
// TODO: deprecate me (dir is never used! and is sometimes difficult to provide!)
/** Returns a description of the merges in this specification. */ /** Returns a description of the merges in this specification. */
public String segString(Directory dir) { public String segString(Directory dir) {
StringBuilder b = new StringBuilder(); StringBuilder b = new StringBuilder();
@ -458,6 +459,17 @@ public abstract class MergePolicy {
return b.toString(); return b.toString();
} }
@Override
public String toString() {
  // Render each merge on its own line, numbered from 1, using the merge's own segString.
  StringBuilder sb = new StringBuilder("MergeSpec:");
  int ord = 0;
  for (OneMerge merge : merges) {
    ord++;
    sb.append("\n ").append(ord).append(": ").append(merge.segString());
  }
  return sb.toString();
}
/** Waits if necessary for at most the given time for all merges. */ /** Waits if necessary for at most the given time for all merges. */
boolean await(long timeout, TimeUnit unit) { boolean await(long timeout, TimeUnit unit) {
try { try {
@ -562,8 +574,7 @@ public abstract class MergePolicy {
* one thread at a time will call this method. * one thread at a time will call this method.
* *
* @param segmentInfos the total set of segments in the index * @param segmentInfos the total set of segments in the index
* @param maxSegmentCount requested maximum number of segments in the index (currently this is * @param maxSegmentCount requested maximum number of segments in the index
* always 1)
* @param segmentsToMerge contains the specific SegmentInfo instances that must be merged away. * @param segmentsToMerge contains the specific SegmentInfo instances that must be merged away.
* This may be a subset of all SegmentInfos. If the value is True for a given SegmentInfo, * This may be a subset of all SegmentInfos. If the value is True for a given SegmentInfo,
* that means this segment was an original segment present in the to-be-merged index; else, it * that means this segment was an original segment present in the to-be-merged index; else, it

View File

@ -51,16 +51,16 @@ import java.util.Set;
* <p><b>NOTE</b>: This policy always merges by byte size of the segments, always pro-rates by * <p><b>NOTE</b>: This policy always merges by byte size of the segments, always pro-rates by
* percent deletes * percent deletes
* *
* <p><b>NOTE</b> Starting with Lucene 7.5, there are several changes: * <p><b>NOTE</b> Starting with Lucene 7.5, if you call {@link IndexWriter#forceMerge(int)} with
* this (default) merge policy, if {@link #setMaxMergedSegmentMB} is in conflict with {@code
* maxNumSegments} passed to {@link IndexWriter#forceMerge} then {@code maxNumSegments} wins. For
* example, if your index has 50 1 GB segments, and you have {@link #setMaxMergedSegmentMB} at 1024
* (1 GB), and you call {@code forceMerge(10)}, the two settings are clearly in conflict. {@code
* TieredMergePolicy} will choose to break the {@link #setMaxMergedSegmentMB} constraint and try to
* merge down to at most ten segments, each up to 5 * 1.25 GB in size (since an extra 25% buffer
 * increase in the expected segment size is targeted).
* *
* <p>- findForcedMerges and findForcedDeletesMerges) respect the max segment size by default. * <p>findForcedDeletesMerges should never produce segments greater than maxSegmentSize.
*
* <p>- When findforcedmerges is called with maxSegmentCount other than 1, the resulting index is
* not guaranteed to have &lt;= maxSegmentCount segments. Rather it is on a "best effort" basis.
* Specifically the theoretical ideal segment size is calculated and a "fudge factor" of 25% is
* added as the new maxSegmentSize, which is respected.
*
* <p>- findForcedDeletesMerges will not produce segments greater than maxSegmentSize.
* *
* @lucene.experimental * @lucene.experimental
*/ */
@ -707,8 +707,8 @@ public class TieredMergePolicy extends MergePolicy {
final Set<SegmentCommitInfo> merging = mergeContext.getMergingSegments(); final Set<SegmentCommitInfo> merging = mergeContext.getMergingSegments();
// Trim the list down, remove if we're respecting max segment size and it's not original. // Trim the list down, remove if we're respecting max segment size and it's not original.
// Presumably it's been merged before and // Presumably it's been merged before and is close enough to the max segment size we
// is close enough to the max segment size we shouldn't add it in again. // shouldn't add it in again.
Iterator<SegmentSizeAndDocs> iter = sortedSizeAndDocs.iterator(); Iterator<SegmentSizeAndDocs> iter = sortedSizeAndDocs.iterator();
boolean forceMergeRunning = false; boolean forceMergeRunning = false;
while (iter.hasNext()) { while (iter.hasNext()) {
@ -729,17 +729,20 @@ public class TieredMergePolicy extends MergePolicy {
long maxMergeBytes = maxMergedSegmentBytes; long maxMergeBytes = maxMergedSegmentBytes;
// Set the maximum segment size based on how many segments have been specified. // Set the maximum segment size based on how many segments have been specified.
if (maxSegmentCount == 1) maxMergeBytes = Long.MAX_VALUE; if (maxSegmentCount == 1) {
else if (maxSegmentCount != Integer.MAX_VALUE) { maxMergeBytes = Long.MAX_VALUE;
// Fudge this up a bit so we have a better chance of not having to rewrite segments. If we use } else if (maxSegmentCount != Integer.MAX_VALUE) {
// the exact size,
// it's almost guaranteed that the segments won't fit perfectly and we'll be left with more
// segments than
// we want and have to re-merge in the code at the bottom of this method.
maxMergeBytes = maxMergeBytes =
Math.max( Math.max(
(long) (((double) totalMergeBytes / (double) maxSegmentCount)), (long) (((double) totalMergeBytes / (double) maxSegmentCount)),
maxMergedSegmentBytes); maxMergedSegmentBytes);
// Fudge this up a bit so we have a better chance of not having to do a second pass of merging
// to get
// down to the requested target segment count. If we use the exact size, it's almost
// guaranteed
// that the segments selected below won't fit perfectly and we'll be left with more segments
// than
// we want and have to re-merge in the code at the bottom of this method.
maxMergeBytes = (long) ((double) maxMergeBytes * 1.25); maxMergeBytes = (long) ((double) maxMergeBytes * 1.25);
} }
@ -748,8 +751,8 @@ public class TieredMergePolicy extends MergePolicy {
while (iter.hasNext()) { while (iter.hasNext()) {
SegmentSizeAndDocs segSizeDocs = iter.next(); SegmentSizeAndDocs segSizeDocs = iter.next();
Boolean isOriginal = segmentsToMerge.get(segSizeDocs.segInfo); Boolean isOriginal = segmentsToMerge.get(segSizeDocs.segInfo);
if (segSizeDocs.delCount if (segSizeDocs.delCount != 0) {
!= 0) { // This is forceMerge, all segments with deleted docs should be merged. // This is forceMerge; all segments with deleted docs should be merged.
if (isOriginal != null && isOriginal) { if (isOriginal != null && isOriginal) {
foundDeletes = true; foundDeletes = true;
} }
@ -770,8 +773,7 @@ public class TieredMergePolicy extends MergePolicy {
return null; return null;
} }
// We should never bail if there are segments that have deleted documents, all deleted docs // We only bail if there are no deletions
// should be purged.
if (foundDeletes == false) { if (foundDeletes == false) {
SegmentCommitInfo infoZero = sortedSizeAndDocs.get(0).segInfo; SegmentCommitInfo infoZero = sortedSizeAndDocs.get(0).segInfo;
if ((maxSegmentCount != Integer.MAX_VALUE if ((maxSegmentCount != Integer.MAX_VALUE
@ -794,6 +796,11 @@ public class TieredMergePolicy extends MergePolicy {
final int startingSegmentCount = sortedSizeAndDocs.size(); final int startingSegmentCount = sortedSizeAndDocs.size();
if (forceMergeRunning) { if (forceMergeRunning) {
// hmm this is a little dangerous -- if a user kicks off a forceMerge, it is taking forever,
// lots of
// new indexing/segments happened since, and they want to kick off another to ensure those
// newly
// indexed segments partake in the force merge, they (silently) won't due to this?
return null; return null;
} }

View File

@ -17,7 +17,6 @@
package org.apache.lucene.index; package org.apache.lucene.index;
import java.io.IOException; import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
@ -411,55 +410,132 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase {
dir.close(); dir.close();
} }
/**
 * Returns how many segments are in the index after applying all merges from the {@code spec} to
 * an index with {@code startingSegmentCount} segments.
 */
private int postMergesSegmentCount(int startingSegmentCount, MergeSpecification spec) {
  // Each merge consumes its input segments and produces exactly one merged segment,
  // so the net change per merge is (1 - number of inputs).
  int count = startingSegmentCount;
  for (OneMerge merge : spec.merges) {
    count += 1 - merge.segments.size();
  }
  return count;
}
/**
 * Verifies that no merge in the {@code specification} would produce a final merged segment too
 * much bigger than the configured {@code maxMergedSegmentSizeMB}.
 *
 * <p>When the requested {@code maxMergedSegmentCount} conflicts with the configured max merged
 * segment size, TMP favors the requested segment count and will produce too-large (&gt;
 * maxMergedSegmentMB) merged segments, so the effective per-segment limit here is the larger of
 * the two.
 *
 * @throws IOException if reading a segment's size fails
 */
private static void assertMaxMergedSize(
    MergeSpecification specification,
    double maxMergedSegmentSizeMB,
    double indexTotalSizeInMB,
    int maxMergedSegmentCount)
    throws IOException {
  double maxMBPerSegment = indexTotalSizeInMB / maxMergedSegmentCount;
  // We allow up to 50% "violation" of the configured maxMergedSegmentSizeMb (why? TMP itself is
  // only adding a 25% fudge factor):
  // TODO: drop this fudge factor back down to 25% -- TooMuchFudgeException!
  // The limit depends only on the method parameters, so compute it once outside the loop.
  long limitBytes =
      (long) (1024 * 1024 * Math.max(maxMBPerSegment, maxMergedSegmentSizeMB) * 1.5);
  for (OneMerge merge : specification.merges) {
    // compute total size of all segments being merged
    long mergeTotalSizeInBytes = 0;
    for (SegmentCommitInfo segment : merge.segments) {
      mergeTotalSizeInBytes += segment.sizeInBytes();
    }
    assertTrue(
        "mergeTotalSizeInBytes="
            + mergeTotalSizeInBytes
            + " limitBytes="
            + limitBytes
            + " maxMergedSegmentSizeMb="
            + maxMergedSegmentSizeMB,
        mergeTotalSizeInBytes < limitBytes);
  }
}
// LUCENE-8688 reports that force merges merged more segments than necessary to respect // maxSegmentCount as a result
// maxSegmentCount as a result // maxSegmentCount as a result
// of LUCENE-7976 so we ensure that it only does the minimum number of merges here. // of LUCENE-7976 so we ensure that it only does the minimum number of merges here.
public void testForcedMergesUseLeastNumberOfMerges() throws Exception { public void testForcedMergesUseLeastNumberOfMerges() throws Exception {
final TieredMergePolicy tmp = new TieredMergePolicy(); TieredMergePolicy tmp = new TieredMergePolicy();
final double oneSegmentSize = 1.0D; double oneSegmentSizeMB = 1.0D;
final double maxSegmentSize = 10 * oneSegmentSize; double maxMergedSegmentSizeMB = 10 * oneSegmentSizeMB;
tmp.setMaxMergedSegmentMB(maxSegmentSize); tmp.setMaxMergedSegmentMB(maxMergedSegmentSizeMB);
if (VERBOSE) {
SegmentInfos infos = new SegmentInfos(Version.LATEST.major); System.out.println(
for (int j = 0; j < 30; ++j) { String.format(Locale.ROOT, "TEST: maxMergedSegmentSizeMB=%.2f", maxMergedSegmentSizeMB));
infos.add(makeSegmentCommitInfo("_" + j, 1000, 0, oneSegmentSize, IndexWriter.SOURCE_MERGE));
} }
final int expectedCount = random().nextInt(10) + 3; // create simulated 30 segment index where each segment is 1 MB
final MergeSpecification specification = SegmentInfos infos = new SegmentInfos(Version.LATEST.major);
int segmentCount = 30;
for (int j = 0; j < segmentCount; j++) {
infos.add(
makeSegmentCommitInfo("_" + j, 1000, 0, oneSegmentSizeMB, IndexWriter.SOURCE_MERGE));
}
double indexTotalSizeMB = segmentCount * oneSegmentSizeMB;
int maxSegmentCountAfterForceMerge = random().nextInt(10) + 3;
if (VERBOSE) {
System.out.println("TEST: maxSegmentCountAfterForceMerge=" + maxSegmentCountAfterForceMerge);
}
MergeSpecification specification =
tmp.findForcedMerges( tmp.findForcedMerges(
infos, infos,
expectedCount, maxSegmentCountAfterForceMerge,
segmentsToMerge(infos), segmentsToMerge(infos),
new MockMergeContext(SegmentCommitInfo::getDelCount)); new MockMergeContext(SegmentCommitInfo::getDelCount));
assertMaxSize(specification, maxSegmentSize); if (VERBOSE) {
final int resultingCount = System.out.println("TEST: specification=" + specification);
infos.size() }
+ specification.merges.size() assertMaxMergedSize(
- specification.merges.stream().mapToInt(spec -> spec.segments.size()).sum(); specification, maxMergedSegmentSizeMB, indexTotalSizeMB, maxSegmentCountAfterForceMerge);
assertEquals(expectedCount, resultingCount);
SegmentInfos manySegmentsInfos = new SegmentInfos(Version.LATEST.major); // verify we achieved exactly the max segment count post-force-merge (which is a bit odd -- the
// API only ensures <= segments, not ==)
// TODO: change this to a <= equality like the last assert below?
assertEquals(
maxSegmentCountAfterForceMerge, postMergesSegmentCount(infos.size(), specification));
// now create many segments index, containing 0.1 MB sized segments
infos = new SegmentInfos(Version.LATEST.major);
final int manySegmentsCount = atLeast(100); final int manySegmentsCount = atLeast(100);
for (int j = 0; j < manySegmentsCount; ++j) { if (VERBOSE) {
manySegmentsInfos.add( System.out.println("TEST: manySegmentsCount=" + manySegmentsCount);
makeSegmentCommitInfo("_" + j, 1000, 0, 0.1D, IndexWriter.SOURCE_MERGE)); }
oneSegmentSizeMB = 0.1D;
for (int j = 0; j < manySegmentsCount; j++) {
infos.add(
makeSegmentCommitInfo("_" + j, 1000, 0, oneSegmentSizeMB, IndexWriter.SOURCE_MERGE));
} }
final MergeSpecification specificationManySegments = indexTotalSizeMB = manySegmentsCount * oneSegmentSizeMB;
specification =
tmp.findForcedMerges( tmp.findForcedMerges(
manySegmentsInfos, infos,
expectedCount, maxSegmentCountAfterForceMerge,
segmentsToMerge(manySegmentsInfos), segmentsToMerge(infos),
new MockMergeContext(SegmentCommitInfo::getDelCount)); new MockMergeContext(SegmentCommitInfo::getDelCount));
assertMaxSize(specificationManySegments, maxSegmentSize); if (VERBOSE) {
final int resultingCountManySegments = System.out.println("TEST: specification=" + specification);
manySegmentsInfos.size() }
+ specificationManySegments.merges.size() assertMaxMergedSize(
- specificationManySegments.merges.stream() specification, maxMergedSegmentSizeMB, indexTotalSizeMB, maxSegmentCountAfterForceMerge);
.mapToInt(spec -> spec.segments.size()) assertTrue(
.sum(); postMergesSegmentCount(infos.size(), specification) >= maxSegmentCountAfterForceMerge);
assertTrue(resultingCountManySegments >= expectedCount);
} }
// Make sure that TieredMergePolicy doesn't do the final merge while there are merges ongoing, but // Make sure that TieredMergePolicy doesn't do the final merge while there are merges ongoing, but
@ -493,26 +569,6 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase {
return segmentsToMerge; return segmentsToMerge;
} }
private static void assertMaxSize(MergeSpecification specification, double maxSegmentSizeMb) {
for (OneMerge merge : specification.merges) {
long size =
merge.segments.stream()
.mapToLong(
s -> {
try {
return s.sizeInBytes();
} catch (IOException e) {
throw new UncheckedIOException(e);
}
})
.sum();
long limit = (long) (1024 * 1024 * maxSegmentSizeMb * 1.5);
assertTrue(
"size=" + size + ",limit=" + limit + ",maxSegmentSizeMb=" + maxSegmentSizeMb,
size < limit);
}
}
// Having a segment with very few documents in it can happen because of the random nature of the // Having a segment with very few documents in it can happen because of the random nature of the
// docs added to the index. For instance, let's say it just happens that the last segment has 3 // docs added to the index. For instance, let's say it just happens that the last segment has 3
// docs in it. // docs in it.