mirror of https://github.com/apache/lucene.git
LUCENE-10093: fix conflicting test assert to match how TieredMergePolicy (TMP) works; improv TMP javadocs (#375)
This commit is contained in:
parent
8b6c90eccd
commit
e3151d6c7d
|
@ -447,6 +447,7 @@ public abstract class MergePolicy {
|
|||
merges.add(merge);
|
||||
}
|
||||
|
||||
// TODO: deprecate me (dir is never used! and is sometimes difficult to provide!)
|
||||
/** Returns a description of the merges in this specification. */
|
||||
public String segString(Directory dir) {
|
||||
StringBuilder b = new StringBuilder();
|
||||
|
@ -458,6 +459,17 @@ public abstract class MergePolicy {
|
|||
return b.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder b = new StringBuilder();
|
||||
b.append("MergeSpec:");
|
||||
final int count = merges.size();
|
||||
for (int i = 0; i < count; i++) {
|
||||
b.append("\n ").append(1 + i).append(": ").append(merges.get(i).segString());
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
/** Waits if necessary for at most the given time for all merges. */
|
||||
boolean await(long timeout, TimeUnit unit) {
|
||||
try {
|
||||
|
@ -562,8 +574,7 @@ public abstract class MergePolicy {
|
|||
* one thread at a time will call this method.
|
||||
*
|
||||
* @param segmentInfos the total set of segments in the index
|
||||
* @param maxSegmentCount requested maximum number of segments in the index (currently this is
|
||||
* always 1)
|
||||
* @param maxSegmentCount requested maximum number of segments in the index
|
||||
* @param segmentsToMerge contains the specific SegmentInfo instances that must be merged away.
|
||||
* This may be a subset of all SegmentInfos. If the value is True for a given SegmentInfo,
|
||||
* that means this segment was an original segment present in the to-be-merged index; else, it
|
||||
|
|
|
@ -51,16 +51,16 @@ import java.util.Set;
|
|||
* <p><b>NOTE</b>: This policy always merges by byte size of the segments, always pro-rates by
|
||||
* percent deletes
|
||||
*
|
||||
* <p><b>NOTE</b> Starting with Lucene 7.5, there are several changes:
|
||||
* <p><b>NOTE</b> Starting with Lucene 7.5, if you call {@link IndexWriter#forceMerge(int)} with
|
||||
* this (default) merge policy, if {@link #setMaxMergedSegmentMB} is in conflict with {@code
|
||||
* maxNumSegments} passed to {@link IndexWriter#forceMerge} then {@code maxNumSegments} wins. For
|
||||
* example, if your index has 50 1 GB segments, and you have {@link #setMaxMergedSegmentMB} at 1024
|
||||
* (1 GB), and you call {@code forceMerge(10)}, the two settings are clearly in conflict. {@code
|
||||
* TieredMergePolicy} will choose to break the {@link #setMaxMergedSegmentMB} constraint and try to
|
||||
* merge down to at most ten segments, each up to 5 * 1.25 GB in size (since an extra 25% buffer
|
||||
* increase in the expected segment size is targetted).
|
||||
*
|
||||
* <p>- findForcedMerges and findForcedDeletesMerges) respect the max segment size by default.
|
||||
*
|
||||
* <p>- When findforcedmerges is called with maxSegmentCount other than 1, the resulting index is
|
||||
* not guaranteed to have <= maxSegmentCount segments. Rather it is on a "best effort" basis.
|
||||
* Specifically the theoretical ideal segment size is calculated and a "fudge factor" of 25% is
|
||||
* added as the new maxSegmentSize, which is respected.
|
||||
*
|
||||
* <p>- findForcedDeletesMerges will not produce segments greater than maxSegmentSize.
|
||||
* <p>findForcedDeletesMerges should never produce segments greater than maxSegmentSize.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
@ -707,8 +707,8 @@ public class TieredMergePolicy extends MergePolicy {
|
|||
final Set<SegmentCommitInfo> merging = mergeContext.getMergingSegments();
|
||||
|
||||
// Trim the list down, remove if we're respecting max segment size and it's not original.
|
||||
// Presumably it's been merged before and
|
||||
// is close enough to the max segment size we shouldn't add it in again.
|
||||
// Presumably it's been merged before and is close enough to the max segment size we
|
||||
// shouldn't add it in again.
|
||||
Iterator<SegmentSizeAndDocs> iter = sortedSizeAndDocs.iterator();
|
||||
boolean forceMergeRunning = false;
|
||||
while (iter.hasNext()) {
|
||||
|
@ -729,17 +729,20 @@ public class TieredMergePolicy extends MergePolicy {
|
|||
long maxMergeBytes = maxMergedSegmentBytes;
|
||||
|
||||
// Set the maximum segment size based on how many segments have been specified.
|
||||
if (maxSegmentCount == 1) maxMergeBytes = Long.MAX_VALUE;
|
||||
else if (maxSegmentCount != Integer.MAX_VALUE) {
|
||||
// Fudge this up a bit so we have a better chance of not having to rewrite segments. If we use
|
||||
// the exact size,
|
||||
// it's almost guaranteed that the segments won't fit perfectly and we'll be left with more
|
||||
// segments than
|
||||
// we want and have to re-merge in the code at the bottom of this method.
|
||||
if (maxSegmentCount == 1) {
|
||||
maxMergeBytes = Long.MAX_VALUE;
|
||||
} else if (maxSegmentCount != Integer.MAX_VALUE) {
|
||||
maxMergeBytes =
|
||||
Math.max(
|
||||
(long) (((double) totalMergeBytes / (double) maxSegmentCount)),
|
||||
maxMergedSegmentBytes);
|
||||
// Fudge this up a bit so we have a better chance of not having to do a second pass of merging
|
||||
// to get
|
||||
// down to the requested target segment count. If we use the exact size, it's almost
|
||||
// guaranteed
|
||||
// that the segments selected below won't fit perfectly and we'll be left with more segments
|
||||
// than
|
||||
// we want and have to re-merge in the code at the bottom of this method.
|
||||
maxMergeBytes = (long) ((double) maxMergeBytes * 1.25);
|
||||
}
|
||||
|
||||
|
@ -748,8 +751,8 @@ public class TieredMergePolicy extends MergePolicy {
|
|||
while (iter.hasNext()) {
|
||||
SegmentSizeAndDocs segSizeDocs = iter.next();
|
||||
Boolean isOriginal = segmentsToMerge.get(segSizeDocs.segInfo);
|
||||
if (segSizeDocs.delCount
|
||||
!= 0) { // This is forceMerge, all segments with deleted docs should be merged.
|
||||
if (segSizeDocs.delCount != 0) {
|
||||
// This is forceMerge; all segments with deleted docs should be merged.
|
||||
if (isOriginal != null && isOriginal) {
|
||||
foundDeletes = true;
|
||||
}
|
||||
|
@ -770,8 +773,7 @@ public class TieredMergePolicy extends MergePolicy {
|
|||
return null;
|
||||
}
|
||||
|
||||
// We should never bail if there are segments that have deleted documents, all deleted docs
|
||||
// should be purged.
|
||||
// We only bail if there are no deletions
|
||||
if (foundDeletes == false) {
|
||||
SegmentCommitInfo infoZero = sortedSizeAndDocs.get(0).segInfo;
|
||||
if ((maxSegmentCount != Integer.MAX_VALUE
|
||||
|
@ -794,6 +796,11 @@ public class TieredMergePolicy extends MergePolicy {
|
|||
|
||||
final int startingSegmentCount = sortedSizeAndDocs.size();
|
||||
if (forceMergeRunning) {
|
||||
// hmm this is a little dangerous -- if a user kicks off a forceMerge, it is taking forever,
|
||||
// lots of
|
||||
// new indexing/segments happened since, and they want to kick off another to ensure those
|
||||
// newly
|
||||
// indexed segments partake in the force merge, they (silently) won't due to this?
|
||||
return null;
|
||||
}
|
||||
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
|
@ -411,55 +410,132 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase {
|
|||
dir.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns how many segments are in the index after applying all merges from the {@code spec} to
|
||||
* an index with {@code startingSegmentCount} segments
|
||||
*/
|
||||
private int postMergesSegmentCount(int startingSegmentCount, MergeSpecification spec) {
|
||||
int count = startingSegmentCount;
|
||||
// remove the segments that are merged away
|
||||
for (var merge : spec.merges) {
|
||||
count -= merge.segments.size();
|
||||
}
|
||||
|
||||
// add one for each newly merged segment
|
||||
count += spec.merges.size();
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
// verify that all merges in the spec do not create a final merged segment size too much bigger
|
||||
// than the configured maxMergedSegmentSizeMb
|
||||
private static void assertMaxMergedSize(
|
||||
MergeSpecification specification,
|
||||
double maxMergedSegmentSizeMB,
|
||||
double indexTotalSizeInMB,
|
||||
int maxMergedSegmentCount)
|
||||
throws IOException {
|
||||
|
||||
// When the two configs conflict, TMP favors the requested number of segments. I.e., it will
|
||||
// produce
|
||||
// too-large (> maxMergedSegmentMB) merged segments.
|
||||
double maxMBPerSegment = indexTotalSizeInMB / maxMergedSegmentCount;
|
||||
|
||||
for (OneMerge merge : specification.merges) {
|
||||
// compute total size of all segments being merged
|
||||
long mergeTotalSizeInBytes = 0;
|
||||
for (var segment : merge.segments) {
|
||||
mergeTotalSizeInBytes += segment.sizeInBytes();
|
||||
}
|
||||
|
||||
// we allow up to 50% "violation" of the configured maxMergedSegmentSizeMb (why? TMP itself is
|
||||
// on only adding 25% fudge factor):
|
||||
// TODO: drop this fudge factor back down to 25% -- TooMuchFudgeException!
|
||||
long limitBytes =
|
||||
(long) (1024 * 1024 * Math.max(maxMBPerSegment, maxMergedSegmentSizeMB) * 1.5);
|
||||
assertTrue(
|
||||
"mergeTotalSizeInBytes="
|
||||
+ mergeTotalSizeInBytes
|
||||
+ " limitBytes="
|
||||
+ limitBytes
|
||||
+ " maxMergedSegmentSizeMb="
|
||||
+ maxMergedSegmentSizeMB,
|
||||
mergeTotalSizeInBytes < limitBytes);
|
||||
}
|
||||
}
|
||||
|
||||
// LUCENE-8688 reports that force merges merged more segments that necessary to respect
|
||||
// maxSegmentCount as a result
|
||||
// of LUCENE-7976 so we ensure that it only does the minimum number of merges here.
|
||||
public void testForcedMergesUseLeastNumberOfMerges() throws Exception {
|
||||
final TieredMergePolicy tmp = new TieredMergePolicy();
|
||||
final double oneSegmentSize = 1.0D;
|
||||
final double maxSegmentSize = 10 * oneSegmentSize;
|
||||
tmp.setMaxMergedSegmentMB(maxSegmentSize);
|
||||
|
||||
SegmentInfos infos = new SegmentInfos(Version.LATEST.major);
|
||||
for (int j = 0; j < 30; ++j) {
|
||||
infos.add(makeSegmentCommitInfo("_" + j, 1000, 0, oneSegmentSize, IndexWriter.SOURCE_MERGE));
|
||||
TieredMergePolicy tmp = new TieredMergePolicy();
|
||||
double oneSegmentSizeMB = 1.0D;
|
||||
double maxMergedSegmentSizeMB = 10 * oneSegmentSizeMB;
|
||||
tmp.setMaxMergedSegmentMB(maxMergedSegmentSizeMB);
|
||||
if (VERBOSE) {
|
||||
System.out.println(
|
||||
String.format(Locale.ROOT, "TEST: maxMergedSegmentSizeMB=%.2f", maxMergedSegmentSizeMB));
|
||||
}
|
||||
|
||||
final int expectedCount = random().nextInt(10) + 3;
|
||||
final MergeSpecification specification =
|
||||
// create simulated 30 segment index where each segment is 1 MB
|
||||
SegmentInfos infos = new SegmentInfos(Version.LATEST.major);
|
||||
int segmentCount = 30;
|
||||
for (int j = 0; j < segmentCount; j++) {
|
||||
infos.add(
|
||||
makeSegmentCommitInfo("_" + j, 1000, 0, oneSegmentSizeMB, IndexWriter.SOURCE_MERGE));
|
||||
}
|
||||
|
||||
double indexTotalSizeMB = segmentCount * oneSegmentSizeMB;
|
||||
|
||||
int maxSegmentCountAfterForceMerge = random().nextInt(10) + 3;
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: maxSegmentCountAfterForceMerge=" + maxSegmentCountAfterForceMerge);
|
||||
}
|
||||
MergeSpecification specification =
|
||||
tmp.findForcedMerges(
|
||||
infos,
|
||||
expectedCount,
|
||||
maxSegmentCountAfterForceMerge,
|
||||
segmentsToMerge(infos),
|
||||
new MockMergeContext(SegmentCommitInfo::getDelCount));
|
||||
assertMaxSize(specification, maxSegmentSize);
|
||||
final int resultingCount =
|
||||
infos.size()
|
||||
+ specification.merges.size()
|
||||
- specification.merges.stream().mapToInt(spec -> spec.segments.size()).sum();
|
||||
assertEquals(expectedCount, resultingCount);
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: specification=" + specification);
|
||||
}
|
||||
assertMaxMergedSize(
|
||||
specification, maxMergedSegmentSizeMB, indexTotalSizeMB, maxSegmentCountAfterForceMerge);
|
||||
|
||||
SegmentInfos manySegmentsInfos = new SegmentInfos(Version.LATEST.major);
|
||||
// verify we achieved exactly the max segment count post-force-merge (which is a bit odd -- the
|
||||
// API only ensures <= segments, not ==)
|
||||
// TODO: change this to a <= equality like the last assert below?
|
||||
assertEquals(
|
||||
maxSegmentCountAfterForceMerge, postMergesSegmentCount(infos.size(), specification));
|
||||
|
||||
// now create many segments index, containing 0.1 MB sized segments
|
||||
infos = new SegmentInfos(Version.LATEST.major);
|
||||
final int manySegmentsCount = atLeast(100);
|
||||
for (int j = 0; j < manySegmentsCount; ++j) {
|
||||
manySegmentsInfos.add(
|
||||
makeSegmentCommitInfo("_" + j, 1000, 0, 0.1D, IndexWriter.SOURCE_MERGE));
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: manySegmentsCount=" + manySegmentsCount);
|
||||
}
|
||||
oneSegmentSizeMB = 0.1D;
|
||||
for (int j = 0; j < manySegmentsCount; j++) {
|
||||
infos.add(
|
||||
makeSegmentCommitInfo("_" + j, 1000, 0, oneSegmentSizeMB, IndexWriter.SOURCE_MERGE));
|
||||
}
|
||||
|
||||
final MergeSpecification specificationManySegments =
|
||||
indexTotalSizeMB = manySegmentsCount * oneSegmentSizeMB;
|
||||
|
||||
specification =
|
||||
tmp.findForcedMerges(
|
||||
manySegmentsInfos,
|
||||
expectedCount,
|
||||
segmentsToMerge(manySegmentsInfos),
|
||||
infos,
|
||||
maxSegmentCountAfterForceMerge,
|
||||
segmentsToMerge(infos),
|
||||
new MockMergeContext(SegmentCommitInfo::getDelCount));
|
||||
assertMaxSize(specificationManySegments, maxSegmentSize);
|
||||
final int resultingCountManySegments =
|
||||
manySegmentsInfos.size()
|
||||
+ specificationManySegments.merges.size()
|
||||
- specificationManySegments.merges.stream()
|
||||
.mapToInt(spec -> spec.segments.size())
|
||||
.sum();
|
||||
assertTrue(resultingCountManySegments >= expectedCount);
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: specification=" + specification);
|
||||
}
|
||||
assertMaxMergedSize(
|
||||
specification, maxMergedSegmentSizeMB, indexTotalSizeMB, maxSegmentCountAfterForceMerge);
|
||||
assertTrue(
|
||||
postMergesSegmentCount(infos.size(), specification) >= maxSegmentCountAfterForceMerge);
|
||||
}
|
||||
|
||||
// Make sure that TieredMergePolicy doesn't do the final merge while there are merges ongoing, but
|
||||
|
@ -493,26 +569,6 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase {
|
|||
return segmentsToMerge;
|
||||
}
|
||||
|
||||
private static void assertMaxSize(MergeSpecification specification, double maxSegmentSizeMb) {
|
||||
for (OneMerge merge : specification.merges) {
|
||||
long size =
|
||||
merge.segments.stream()
|
||||
.mapToLong(
|
||||
s -> {
|
||||
try {
|
||||
return s.sizeInBytes();
|
||||
} catch (IOException e) {
|
||||
throw new UncheckedIOException(e);
|
||||
}
|
||||
})
|
||||
.sum();
|
||||
long limit = (long) (1024 * 1024 * maxSegmentSizeMb * 1.5);
|
||||
assertTrue(
|
||||
"size=" + size + ",limit=" + limit + ",maxSegmentSizeMb=" + maxSegmentSizeMb,
|
||||
size < limit);
|
||||
}
|
||||
}
|
||||
|
||||
// Having a segment with very few documents in it can happen because of the random nature of the
|
||||
// docs added to the index. For instance, let's say it just happens that the last segment has 3
|
||||
// docs in it.
|
||||
|
|
Loading…
Reference in New Issue