mirror of https://github.com/apache/lucene.git
LUCENE-10093: fix conflicting test assert to match how TieredMergePolicy (TMP) works; improve TMP javadocs (#375)
This commit is contained in:
parent 8b6c90eccd
commit e3151d6c7d
@@ -447,6 +447,7 @@ public abstract class MergePolicy {
       merges.add(merge);
     }

+    // TODO: deprecate me (dir is never used! and is sometimes difficult to provide!)
     /** Returns a description of the merges in this specification. */
     public String segString(Directory dir) {
       StringBuilder b = new StringBuilder();
@@ -458,6 +459,17 @@ public abstract class MergePolicy {
       return b.toString();
     }

+    @Override
+    public String toString() {
+      StringBuilder b = new StringBuilder();
+      b.append("MergeSpec:");
+      final int count = merges.size();
+      for (int i = 0; i < count; i++) {
+        b.append("\n ").append(1 + i).append(": ").append(merges.get(i).segString());
+      }
+      return b.toString();
+    }
+
     /** Waits if necessary for at most the given time for all merges. */
     boolean await(long timeout, TimeUnit unit) {
       try {
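For reference, the new toString() emits one numbered line per pending merge, so a specification with two queued merges prints roughly the block below. The segment descriptions are illustrative; the real text is whatever each OneMerge.segString() returns:

MergeSpec:
 1: _a(9.0.0):c1000 _b(9.0.0):c1200
 2: _c(9.0.0):c500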
@@ -562,8 +574,7 @@ public abstract class MergePolicy {
    * one thread at a time will call this method.
    *
    * @param segmentInfos the total set of segments in the index
-   * @param maxSegmentCount requested maximum number of segments in the index (currently this is
-   *     always 1)
+   * @param maxSegmentCount requested maximum number of segments in the index
    * @param segmentsToMerge contains the specific SegmentInfo instances that must be merged away.
    *     This may be a subset of all SegmentInfos. If the value is True for a given SegmentInfo,
    *     that means this segment was an original segment present in the to-be-merged index; else, it
@@ -51,16 +51,16 @@ import java.util.Set;
  * <p><b>NOTE</b>: This policy always merges by byte size of the segments, always pro-rates by
  * percent deletes
  *
- * <p><b>NOTE</b> Starting with Lucene 7.5, there are several changes:
+ * <p><b>NOTE</b> Starting with Lucene 7.5, if you call {@link IndexWriter#forceMerge(int)} with
+ * this (default) merge policy, if {@link #setMaxMergedSegmentMB} is in conflict with {@code
+ * maxNumSegments} passed to {@link IndexWriter#forceMerge} then {@code maxNumSegments} wins. For
+ * example, if your index has 50 1 GB segments, and you have {@link #setMaxMergedSegmentMB} at 1024
+ * (1 GB), and you call {@code forceMerge(10)}, the two settings are clearly in conflict. {@code
+ * TieredMergePolicy} will choose to break the {@link #setMaxMergedSegmentMB} constraint and try to
+ * merge down to at most ten segments, each up to 5 * 1.25 GB in size (since an extra 25% buffer
+ * increase in the expected segment size is targetted).
  *
- * <p>- findForcedMerges and findForcedDeletesMerges) respect the max segment size by default.
- *
- * <p>- When findforcedmerges is called with maxSegmentCount other than 1, the resulting index is
- * not guaranteed to have <= maxSegmentCount segments. Rather it is on a "best effort" basis.
- * Specifically the theoretical ideal segment size is calculated and a "fudge factor" of 25% is
- * added as the new maxSegmentSize, which is respected.
- *
- * <p>- findForcedDeletesMerges will not produce segments greater than maxSegmentSize.
+ * <p>findForcedDeletesMerges should never produce segments greater than maxSegmentSize.
  *
  * @lucene.experimental
  */
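To make the new javadoc concrete: with 50 one-GB segments, forceMerge(10) implies an ideal segment size of 50 GB / 10 = 5 GB, and TMP's 25% buffer raises the effective cap to 6.25 GB, overriding the configured 1 GB maxMergedSegmentMB. A minimal sketch of that conflicting configuration (the index path is a placeholder; this is illustrative, not code from this commit):

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.TieredMergePolicy;
import org.apache.lucene.store.FSDirectory;

public class ForceMergeConflictDemo {
  public static void main(String[] args) throws Exception {
    TieredMergePolicy tmp = new TieredMergePolicy();
    tmp.setMaxMergedSegmentMB(1024.0); // 1 GB cap, honored by natural merges

    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    iwc.setMergePolicy(tmp);

    // "index" is a placeholder path to an existing index with many large segments
    try (IndexWriter writer = new IndexWriter(FSDirectory.open(Paths.get("index")), iwc)) {
      // maxNumSegments wins: merged segments may reach ~(totalBytes / 10) * 1.25,
      // far above the configured 1 GB maxMergedSegmentMB
      writer.forceMerge(10);
    }
  }
}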
@@ -707,8 +707,8 @@ public class TieredMergePolicy extends MergePolicy {
     final Set<SegmentCommitInfo> merging = mergeContext.getMergingSegments();

     // Trim the list down, remove if we're respecting max segment size and it's not original.
-    // Presumably it's been merged before and
-    // is close enough to the max segment size we shouldn't add it in again.
+    // Presumably it's been merged before and is close enough to the max segment size we
+    // shouldn't add it in again.
     Iterator<SegmentSizeAndDocs> iter = sortedSizeAndDocs.iterator();
     boolean forceMergeRunning = false;
     while (iter.hasNext()) {
@@ -729,17 +729,20 @@ public class TieredMergePolicy extends MergePolicy {
     long maxMergeBytes = maxMergedSegmentBytes;

     // Set the maximum segment size based on how many segments have been specified.
-    if (maxSegmentCount == 1) maxMergeBytes = Long.MAX_VALUE;
-    else if (maxSegmentCount != Integer.MAX_VALUE) {
-      // Fudge this up a bit so we have a better chance of not having to rewrite segments. If we use
-      // the exact size,
-      // it's almost guaranteed that the segments won't fit perfectly and we'll be left with more
-      // segments than
-      // we want and have to re-merge in the code at the bottom of this method.
+    if (maxSegmentCount == 1) {
+      maxMergeBytes = Long.MAX_VALUE;
+    } else if (maxSegmentCount != Integer.MAX_VALUE) {
       maxMergeBytes =
           Math.max(
               (long) (((double) totalMergeBytes / (double) maxSegmentCount)),
               maxMergedSegmentBytes);
+      // Fudge this up a bit so we have a better chance of not having to do a second pass of merging
+      // to get
+      // down to the requested target segment count. If we use the exact size, it's almost
+      // guaranteed
+      // that the segments selected below won't fit perfectly and we'll be left with more segments
+      // than
+      // we want and have to re-merge in the code at the bottom of this method.
       maxMergeBytes = (long) ((double) maxMergeBytes * 1.25);
     }

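A worked example of the sizing logic above, under assumed numbers (a standalone sketch, not Lucene code): thirty 1 MB segments force-merged down to 5 segments gives an ideal size of 6 MB; since that is below a hypothetical 10 MB maxMergedSegmentBytes the cap is used instead, and the 25% fudge factor lifts it to 12.5 MB:

public class MaxMergeBytesDemo {
  public static void main(String[] args) {
    long totalMergeBytes = 30L * 1024 * 1024; // thirty 1 MB segments
    long maxMergedSegmentBytes = 10L * 1024 * 1024; // configured 10 MB cap
    int maxSegmentCount = 5; // as if forceMerge(5) were requested

    long maxMergeBytes = maxMergedSegmentBytes;
    if (maxSegmentCount == 1) {
      maxMergeBytes = Long.MAX_VALUE; // a single segment may grow without bound
    } else if (maxSegmentCount != Integer.MAX_VALUE) {
      // ideal per-segment size if bytes spread evenly, floored at the configured cap
      maxMergeBytes =
          Math.max((long) ((double) totalMergeBytes / maxSegmentCount), maxMergedSegmentBytes);
      // 25% fudge factor: 10 MB -> 12.5 MB here
      maxMergeBytes = (long) (maxMergeBytes * 1.25);
    }
    System.out.println("maxMergeBytes=" + maxMergeBytes); // prints 13107200 (12.5 MB)
  }
}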
@@ -748,8 +751,8 @@ public class TieredMergePolicy extends MergePolicy {
     while (iter.hasNext()) {
       SegmentSizeAndDocs segSizeDocs = iter.next();
       Boolean isOriginal = segmentsToMerge.get(segSizeDocs.segInfo);
-      if (segSizeDocs.delCount
-          != 0) { // This is forceMerge, all segments with deleted docs should be merged.
+      if (segSizeDocs.delCount != 0) {
+        // This is forceMerge; all segments with deleted docs should be merged.
         if (isOriginal != null && isOriginal) {
           foundDeletes = true;
         }
@@ -770,8 +773,7 @@ public class TieredMergePolicy extends MergePolicy {
       return null;
     }

-    // We should never bail if there are segments that have deleted documents, all deleted docs
-    // should be purged.
+    // We only bail if there are no deletions
     if (foundDeletes == false) {
       SegmentCommitInfo infoZero = sortedSizeAndDocs.get(0).segInfo;
       if ((maxSegmentCount != Integer.MAX_VALUE
@@ -794,6 +796,11 @@ public class TieredMergePolicy extends MergePolicy {

     final int startingSegmentCount = sortedSizeAndDocs.size();
     if (forceMergeRunning) {
+      // hmm this is a little dangerous -- if a user kicks off a forceMerge, it is taking forever,
+      // lots of
+      // new indexing/segments happened since, and they want to kick off another to ensure those
+      // newly
+      // indexed segments partake in the force merge, they (silently) won't due to this?
       return null;
     }

@@ -17,7 +17,6 @@
 package org.apache.lucene.index;

 import java.io.IOException;
-import java.io.UncheckedIOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
@@ -411,55 +410,132 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase {
     dir.close();
   }

+  /**
+   * Returns how many segments are in the index after applying all merges from the {@code spec} to
+   * an index with {@code startingSegmentCount} segments
+   */
+  private int postMergesSegmentCount(int startingSegmentCount, MergeSpecification spec) {
+    int count = startingSegmentCount;
+    // remove the segments that are merged away
+    for (var merge : spec.merges) {
+      count -= merge.segments.size();
+    }
+
+    // add one for each newly merged segment
+    count += spec.merges.size();
+
+    return count;
+  }
+
+  // verify that all merges in the spec do not create a final merged segment size too much bigger
+  // than the configured maxMergedSegmentSizeMb
+  private static void assertMaxMergedSize(
+      MergeSpecification specification,
+      double maxMergedSegmentSizeMB,
+      double indexTotalSizeInMB,
+      int maxMergedSegmentCount)
+      throws IOException {
+
+    // When the two configs conflict, TMP favors the requested number of segments. I.e., it will
+    // produce
+    // too-large (> maxMergedSegmentMB) merged segments.
+    double maxMBPerSegment = indexTotalSizeInMB / maxMergedSegmentCount;
+
+    for (OneMerge merge : specification.merges) {
+      // compute total size of all segments being merged
+      long mergeTotalSizeInBytes = 0;
+      for (var segment : merge.segments) {
+        mergeTotalSizeInBytes += segment.sizeInBytes();
+      }
+
+      // we allow up to 50% "violation" of the configured maxMergedSegmentSizeMb (why? TMP itself is
+      // only adding 25% fudge factor):
+      // TODO: drop this fudge factor back down to 25% -- TooMuchFudgeException!
+      long limitBytes =
+          (long) (1024 * 1024 * Math.max(maxMBPerSegment, maxMergedSegmentSizeMB) * 1.5);
+      assertTrue(
+          "mergeTotalSizeInBytes="
+              + mergeTotalSizeInBytes
+              + " limitBytes="
+              + limitBytes
+              + " maxMergedSegmentSizeMb="
+              + maxMergedSegmentSizeMB,
+          mergeTotalSizeInBytes < limitBytes);
+    }
+  }
+
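The counting rule in the new postMergesSegmentCount helper above is simple bookkeeping: each merge removes its input segments and adds exactly one output segment. A standalone rendering of the same arithmetic (hypothetical, for illustration only):

import java.util.List;

public class PostMergeCountDemo {
  // each merge consumes mergeInputCounts.get(i) segments and produces one new segment
  static int postMergesSegmentCount(int startingSegmentCount, List<Integer> mergeInputCounts) {
    int count = startingSegmentCount;
    for (int inputs : mergeInputCounts) {
      count -= inputs; // segments merged away
    }
    count += mergeInputCounts.size(); // one new segment per merge
    return count;
  }

  public static void main(String[] args) {
    // 30 segments, two merges consuming 10 inputs each: 30 - 20 + 2 = 12
    System.out.println(postMergesSegmentCount(30, List.of(10, 10)));
  }
}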
   // LUCENE-8688 reports that force merges merged more segments that necessary to respect
   // maxSegmentCount as a result
   // of LUCENE-7976 so we ensure that it only does the minimum number of merges here.
   public void testForcedMergesUseLeastNumberOfMerges() throws Exception {
-    final TieredMergePolicy tmp = new TieredMergePolicy();
-    final double oneSegmentSize = 1.0D;
-    final double maxSegmentSize = 10 * oneSegmentSize;
-    tmp.setMaxMergedSegmentMB(maxSegmentSize);
+    TieredMergePolicy tmp = new TieredMergePolicy();
+    double oneSegmentSizeMB = 1.0D;
+    double maxMergedSegmentSizeMB = 10 * oneSegmentSizeMB;
+    tmp.setMaxMergedSegmentMB(maxMergedSegmentSizeMB);
+    if (VERBOSE) {
+      System.out.println(
+          String.format(Locale.ROOT, "TEST: maxMergedSegmentSizeMB=%.2f", maxMergedSegmentSizeMB));
+    }

-    SegmentInfos infos = new SegmentInfos(Version.LATEST.major);
-    for (int j = 0; j < 30; ++j) {
-      infos.add(makeSegmentCommitInfo("_" + j, 1000, 0, oneSegmentSize, IndexWriter.SOURCE_MERGE));
-    }
+    // create simulated 30 segment index where each segment is 1 MB
+    SegmentInfos infos = new SegmentInfos(Version.LATEST.major);
+    int segmentCount = 30;
+    for (int j = 0; j < segmentCount; j++) {
+      infos.add(
+          makeSegmentCommitInfo("_" + j, 1000, 0, oneSegmentSizeMB, IndexWriter.SOURCE_MERGE));
+    }

-    final int expectedCount = random().nextInt(10) + 3;
-    final MergeSpecification specification =
+    double indexTotalSizeMB = segmentCount * oneSegmentSizeMB;
+
+    int maxSegmentCountAfterForceMerge = random().nextInt(10) + 3;
+    if (VERBOSE) {
+      System.out.println("TEST: maxSegmentCountAfterForceMerge=" + maxSegmentCountAfterForceMerge);
+    }
+    MergeSpecification specification =
         tmp.findForcedMerges(
             infos,
-            expectedCount,
+            maxSegmentCountAfterForceMerge,
             segmentsToMerge(infos),
             new MockMergeContext(SegmentCommitInfo::getDelCount));
-    assertMaxSize(specification, maxSegmentSize);
-    final int resultingCount =
-        infos.size()
-            + specification.merges.size()
-            - specification.merges.stream().mapToInt(spec -> spec.segments.size()).sum();
-    assertEquals(expectedCount, resultingCount);
+    if (VERBOSE) {
+      System.out.println("TEST: specification=" + specification);
+    }
+    assertMaxMergedSize(
+        specification, maxMergedSegmentSizeMB, indexTotalSizeMB, maxSegmentCountAfterForceMerge);

-    SegmentInfos manySegmentsInfos = new SegmentInfos(Version.LATEST.major);
+    // verify we achieved exactly the max segment count post-force-merge (which is a bit odd -- the
+    // API only ensures <= segments, not ==)
+    // TODO: change this to a <= equality like the last assert below?
+    assertEquals(
+        maxSegmentCountAfterForceMerge, postMergesSegmentCount(infos.size(), specification));
+
+    // now create many segments index, containing 0.1 MB sized segments
+    infos = new SegmentInfos(Version.LATEST.major);
     final int manySegmentsCount = atLeast(100);
-    for (int j = 0; j < manySegmentsCount; ++j) {
-      manySegmentsInfos.add(
-          makeSegmentCommitInfo("_" + j, 1000, 0, 0.1D, IndexWriter.SOURCE_MERGE));
+    if (VERBOSE) {
+      System.out.println("TEST: manySegmentsCount=" + manySegmentsCount);
+    }
+    oneSegmentSizeMB = 0.1D;
+    for (int j = 0; j < manySegmentsCount; j++) {
+      infos.add(
+          makeSegmentCommitInfo("_" + j, 1000, 0, oneSegmentSizeMB, IndexWriter.SOURCE_MERGE));
     }

-    final MergeSpecification specificationManySegments =
+    indexTotalSizeMB = manySegmentsCount * oneSegmentSizeMB;
+
+    specification =
         tmp.findForcedMerges(
-            manySegmentsInfos,
-            expectedCount,
-            segmentsToMerge(manySegmentsInfos),
+            infos,
+            maxSegmentCountAfterForceMerge,
+            segmentsToMerge(infos),
             new MockMergeContext(SegmentCommitInfo::getDelCount));
-    assertMaxSize(specificationManySegments, maxSegmentSize);
-    final int resultingCountManySegments =
-        manySegmentsInfos.size()
-            + specificationManySegments.merges.size()
-            - specificationManySegments.merges.stream()
-                .mapToInt(spec -> spec.segments.size())
-                .sum();
-    assertTrue(resultingCountManySegments >= expectedCount);
+    if (VERBOSE) {
+      System.out.println("TEST: specification=" + specification);
+    }
+    assertMaxMergedSize(
+        specification, maxMergedSegmentSizeMB, indexTotalSizeMB, maxSegmentCountAfterForceMerge);
+    assertTrue(
+        postMergesSegmentCount(infos.size(), specification) >= maxSegmentCountAfterForceMerge);
   }

   // Make sure that TieredMergePolicy doesn't do the final merge while there are merges ongoing, but
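To run just this test locally, something along these lines should work with Lucene's Gradle build (the module path and test filter are assumed from the repo's standard layout):

./gradlew -p lucene/core test --tests "org.apache.lucene.index.TestTieredMergePolicy"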
@@ -493,26 +569,6 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase {
     return segmentsToMerge;
   }

-  private static void assertMaxSize(MergeSpecification specification, double maxSegmentSizeMb) {
-    for (OneMerge merge : specification.merges) {
-      long size =
-          merge.segments.stream()
-              .mapToLong(
-                  s -> {
-                    try {
-                      return s.sizeInBytes();
-                    } catch (IOException e) {
-                      throw new UncheckedIOException(e);
-                    }
-                  })
-              .sum();
-      long limit = (long) (1024 * 1024 * maxSegmentSizeMb * 1.5);
-      assertTrue(
-          "size=" + size + ",limit=" + limit + ",maxSegmentSizeMb=" + maxSegmentSizeMb,
-          size < limit);
-    }
-  }
-
   // Having a segment with very few documents in it can happen because of the random nature of the
   // docs added to the index. For instance, let's say it just happens that the last segment has 3
   // docs in it.