From e3151d6c7dea187ed99d349f1435b38b31aa6dd9 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Thu, 21 Oct 2021 09:23:17 -0400 Subject: [PATCH] LUCENE-10093: fix conflicting test assert to match how TieredMergePolicy (TMP) works; improv TMP javadocs (#375) --- .../org/apache/lucene/index/MergePolicy.java | 15 +- .../lucene/index/TieredMergePolicy.java | 51 +++--- .../lucene/index/TestTieredMergePolicy.java | 164 ++++++++++++------ 3 files changed, 152 insertions(+), 78 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java index 2d9717ad0d5..bead219dcfd 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java @@ -447,6 +447,7 @@ public abstract class MergePolicy { merges.add(merge); } + // TODO: deprecate me (dir is never used! and is sometimes difficult to provide!) /** Returns a description of the merges in this specification. */ public String segString(Directory dir) { StringBuilder b = new StringBuilder(); @@ -458,6 +459,17 @@ public abstract class MergePolicy { return b.toString(); } + @Override + public String toString() { + StringBuilder b = new StringBuilder(); + b.append("MergeSpec:"); + final int count = merges.size(); + for (int i = 0; i < count; i++) { + b.append("\n ").append(1 + i).append(": ").append(merges.get(i).segString()); + } + return b.toString(); + } + /** Waits if necessary for at most the given time for all merges. */ boolean await(long timeout, TimeUnit unit) { try { @@ -562,8 +574,7 @@ public abstract class MergePolicy { * one thread at a time will call this method. 
* * @param segmentInfos the total set of segments in the index - * @param maxSegmentCount requested maximum number of segments in the index (currently this is - * always 1) + * @param maxSegmentCount requested maximum number of segments in the index * @param segmentsToMerge contains the specific SegmentInfo instances that must be merged away. * This may be a subset of all SegmentInfos. If the value is True for a given SegmentInfo, * that means this segment was an original segment present in the to-be-merged index; else, it diff --git a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java index 76863ebabbe..a71a0631ff3 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java @@ -51,16 +51,16 @@ import java.util.Set; *

NOTE: This policy always merges by byte size of the segments, always pro-rates by * percent deletes * - *

NOTE Starting with Lucene 7.5, there are several changes: + *

NOTE Starting with Lucene 7.5, if you call {@link IndexWriter#forceMerge(int)} with + * this (default) merge policy, if {@link #setMaxMergedSegmentMB} is in conflict with {@code + * maxNumSegments} passed to {@link IndexWriter#forceMerge} then {@code maxNumSegments} wins. For + * example, if your index has 50 1 GB segments, and you have {@link #setMaxMergedSegmentMB} at 1024 + * (1 GB), and you call {@code forceMerge(10)}, the two settings are clearly in conflict. {@code + * TieredMergePolicy} will choose to break the {@link #setMaxMergedSegmentMB} constraint and try to + * merge down to at most ten segments, each up to 5 * 1.25 GB in size (since an extra 25% buffer + * increase in the expected segment size is targeted). * - *

- findForcedMerges and findForcedDeletesMerges) respect the max segment size by default. - * - *

- When findforcedmerges is called with maxSegmentCount other than 1, the resulting index is - * not guaranteed to have <= maxSegmentCount segments. Rather it is on a "best effort" basis. - * Specifically the theoretical ideal segment size is calculated and a "fudge factor" of 25% is - * added as the new maxSegmentSize, which is respected. - * - *

- findForcedDeletesMerges will not produce segments greater than maxSegmentSize. + *

findForcedDeletesMerges should never produce segments greater than maxSegmentSize. * * @lucene.experimental */ @@ -707,8 +707,8 @@ public class TieredMergePolicy extends MergePolicy { final Set merging = mergeContext.getMergingSegments(); // Trim the list down, remove if we're respecting max segment size and it's not original. - // Presumably it's been merged before and - // is close enough to the max segment size we shouldn't add it in again. + // Presumably it's been merged before and is close enough to the max segment size we + // shouldn't add it in again. Iterator iter = sortedSizeAndDocs.iterator(); boolean forceMergeRunning = false; while (iter.hasNext()) { @@ -729,17 +729,20 @@ public class TieredMergePolicy extends MergePolicy { long maxMergeBytes = maxMergedSegmentBytes; // Set the maximum segment size based on how many segments have been specified. - if (maxSegmentCount == 1) maxMergeBytes = Long.MAX_VALUE; - else if (maxSegmentCount != Integer.MAX_VALUE) { - // Fudge this up a bit so we have a better chance of not having to rewrite segments. If we use - // the exact size, - // it's almost guaranteed that the segments won't fit perfectly and we'll be left with more - // segments than - // we want and have to re-merge in the code at the bottom of this method. + if (maxSegmentCount == 1) { + maxMergeBytes = Long.MAX_VALUE; + } else if (maxSegmentCount != Integer.MAX_VALUE) { maxMergeBytes = Math.max( (long) (((double) totalMergeBytes / (double) maxSegmentCount)), maxMergedSegmentBytes); + // Fudge this up a bit so we have a better chance of not having to do a second pass of merging + // to get + // down to the requested target segment count. If we use the exact size, it's almost + // guaranteed + // that the segments selected below won't fit perfectly and we'll be left with more segments + // than + // we want and have to re-merge in the code at the bottom of this method. 
maxMergeBytes = (long) ((double) maxMergeBytes * 1.25); } @@ -748,8 +751,8 @@ public class TieredMergePolicy extends MergePolicy { while (iter.hasNext()) { SegmentSizeAndDocs segSizeDocs = iter.next(); Boolean isOriginal = segmentsToMerge.get(segSizeDocs.segInfo); - if (segSizeDocs.delCount - != 0) { // This is forceMerge, all segments with deleted docs should be merged. + if (segSizeDocs.delCount != 0) { + // This is forceMerge; all segments with deleted docs should be merged. if (isOriginal != null && isOriginal) { foundDeletes = true; } @@ -770,8 +773,7 @@ public class TieredMergePolicy extends MergePolicy { return null; } - // We should never bail if there are segments that have deleted documents, all deleted docs - // should be purged. + // We only bail if there are no deletions if (foundDeletes == false) { SegmentCommitInfo infoZero = sortedSizeAndDocs.get(0).segInfo; if ((maxSegmentCount != Integer.MAX_VALUE @@ -794,6 +796,11 @@ public class TieredMergePolicy extends MergePolicy { final int startingSegmentCount = sortedSizeAndDocs.size(); if (forceMergeRunning) { + // hmm this is a little dangerous -- if a user kicks off a forceMerge, it is taking forever, + // lots of + // new indexing/segments happened since, and they want to kick off another to ensure those + // newly + // indexed segments partake in the force merge, they (silently) won't due to this? 
return null; } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java index 863da7f1792..09c06b361e7 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java @@ -17,7 +17,6 @@ package org.apache.lucene.index; import java.io.IOException; -import java.io.UncheckedIOException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -411,55 +410,132 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase { dir.close(); } + /** + * Returns how many segments are in the index after applying all merges from the {@code spec} to + * an index with {@code startingSegmentCount} segments + */ + private int postMergesSegmentCount(int startingSegmentCount, MergeSpecification spec) { + int count = startingSegmentCount; + // remove the segments that are merged away + for (var merge : spec.merges) { + count -= merge.segments.size(); + } + + // add one for each newly merged segment + count += spec.merges.size(); + + return count; + } + + // verify that all merges in the spec do not create a final merged segment size too much bigger + // than the configured maxMergedSegmentSizeMb + private static void assertMaxMergedSize( + MergeSpecification specification, + double maxMergedSegmentSizeMB, + double indexTotalSizeInMB, + int maxMergedSegmentCount) + throws IOException { + + // When the two configs conflict, TMP favors the requested number of segments. I.e., it will + // produce + // too-large (> maxMergedSegmentMB) merged segments. 
+ double maxMBPerSegment = indexTotalSizeInMB / maxMergedSegmentCount; + + for (OneMerge merge : specification.merges) { + // compute total size of all segments being merged + long mergeTotalSizeInBytes = 0; + for (var segment : merge.segments) { + mergeTotalSizeInBytes += segment.sizeInBytes(); + } + + // we allow up to 50% "violation" of the configured maxMergedSegmentSizeMb (why? TMP itself is + // only adding a 25% fudge factor): + // TODO: drop this fudge factor back down to 25% -- TooMuchFudgeException! + long limitBytes = + (long) (1024 * 1024 * Math.max(maxMBPerSegment, maxMergedSegmentSizeMB) * 1.5); + assertTrue( + "mergeTotalSizeInBytes=" + + mergeTotalSizeInBytes + + " limitBytes=" + + limitBytes + + " maxMergedSegmentSizeMb=" + + maxMergedSegmentSizeMB, + mergeTotalSizeInBytes < limitBytes); + } + } + // LUCENE-8688 reports that force merges merged more segments that necessary to respect // maxSegmentCount as a result // of LUCENE-7976 so we ensure that it only does the minimum number of merges here. 
public void testForcedMergesUseLeastNumberOfMerges() throws Exception { - final TieredMergePolicy tmp = new TieredMergePolicy(); - final double oneSegmentSize = 1.0D; - final double maxSegmentSize = 10 * oneSegmentSize; - tmp.setMaxMergedSegmentMB(maxSegmentSize); - - SegmentInfos infos = new SegmentInfos(Version.LATEST.major); - for (int j = 0; j < 30; ++j) { - infos.add(makeSegmentCommitInfo("_" + j, 1000, 0, oneSegmentSize, IndexWriter.SOURCE_MERGE)); + TieredMergePolicy tmp = new TieredMergePolicy(); + double oneSegmentSizeMB = 1.0D; + double maxMergedSegmentSizeMB = 10 * oneSegmentSizeMB; + tmp.setMaxMergedSegmentMB(maxMergedSegmentSizeMB); + if (VERBOSE) { + System.out.println( + String.format(Locale.ROOT, "TEST: maxMergedSegmentSizeMB=%.2f", maxMergedSegmentSizeMB)); } - final int expectedCount = random().nextInt(10) + 3; - final MergeSpecification specification = + // create simulated 30 segment index where each segment is 1 MB + SegmentInfos infos = new SegmentInfos(Version.LATEST.major); + int segmentCount = 30; + for (int j = 0; j < segmentCount; j++) { + infos.add( + makeSegmentCommitInfo("_" + j, 1000, 0, oneSegmentSizeMB, IndexWriter.SOURCE_MERGE)); + } + + double indexTotalSizeMB = segmentCount * oneSegmentSizeMB; + + int maxSegmentCountAfterForceMerge = random().nextInt(10) + 3; + if (VERBOSE) { + System.out.println("TEST: maxSegmentCountAfterForceMerge=" + maxSegmentCountAfterForceMerge); + } + MergeSpecification specification = tmp.findForcedMerges( infos, - expectedCount, + maxSegmentCountAfterForceMerge, segmentsToMerge(infos), new MockMergeContext(SegmentCommitInfo::getDelCount)); - assertMaxSize(specification, maxSegmentSize); - final int resultingCount = - infos.size() - + specification.merges.size() - - specification.merges.stream().mapToInt(spec -> spec.segments.size()).sum(); - assertEquals(expectedCount, resultingCount); + if (VERBOSE) { + System.out.println("TEST: specification=" + specification); + } + assertMaxMergedSize( + 
specification, maxMergedSegmentSizeMB, indexTotalSizeMB, maxSegmentCountAfterForceMerge); - SegmentInfos manySegmentsInfos = new SegmentInfos(Version.LATEST.major); + // verify we achieved exactly the max segment count post-force-merge (which is a bit odd -- the + // API only ensures <= segments, not ==) + // TODO: change this to a <= equality like the last assert below? + assertEquals( + maxSegmentCountAfterForceMerge, postMergesSegmentCount(infos.size(), specification)); + + // now create many segments index, containing 0.1 MB sized segments + infos = new SegmentInfos(Version.LATEST.major); final int manySegmentsCount = atLeast(100); - for (int j = 0; j < manySegmentsCount; ++j) { - manySegmentsInfos.add( - makeSegmentCommitInfo("_" + j, 1000, 0, 0.1D, IndexWriter.SOURCE_MERGE)); + if (VERBOSE) { + System.out.println("TEST: manySegmentsCount=" + manySegmentsCount); + } + oneSegmentSizeMB = 0.1D; + for (int j = 0; j < manySegmentsCount; j++) { + infos.add( + makeSegmentCommitInfo("_" + j, 1000, 0, oneSegmentSizeMB, IndexWriter.SOURCE_MERGE)); } - final MergeSpecification specificationManySegments = + indexTotalSizeMB = manySegmentsCount * oneSegmentSizeMB; + + specification = tmp.findForcedMerges( - manySegmentsInfos, - expectedCount, - segmentsToMerge(manySegmentsInfos), + infos, + maxSegmentCountAfterForceMerge, + segmentsToMerge(infos), new MockMergeContext(SegmentCommitInfo::getDelCount)); - assertMaxSize(specificationManySegments, maxSegmentSize); - final int resultingCountManySegments = - manySegmentsInfos.size() - + specificationManySegments.merges.size() - - specificationManySegments.merges.stream() - .mapToInt(spec -> spec.segments.size()) - .sum(); - assertTrue(resultingCountManySegments >= expectedCount); + if (VERBOSE) { + System.out.println("TEST: specification=" + specification); + } + assertMaxMergedSize( + specification, maxMergedSegmentSizeMB, indexTotalSizeMB, maxSegmentCountAfterForceMerge); + assertTrue( + postMergesSegmentCount(infos.size(), 
specification) >= maxSegmentCountAfterForceMerge); } // Make sure that TieredMergePolicy doesn't do the final merge while there are merges ongoing, but @@ -493,26 +569,6 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase { return segmentsToMerge; } - private static void assertMaxSize(MergeSpecification specification, double maxSegmentSizeMb) { - for (OneMerge merge : specification.merges) { - long size = - merge.segments.stream() - .mapToLong( - s -> { - try { - return s.sizeInBytes(); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - }) - .sum(); - long limit = (long) (1024 * 1024 * maxSegmentSizeMb * 1.5); - assertTrue( - "size=" + size + ",limit=" + limit + ",maxSegmentSizeMb=" + maxSegmentSizeMb, - size < limit); - } - } - // Having a segment with very few documents in it can happen because of the random nature of the // docs added to the index. For instance, let's say it just happens that the last segment has 3 // docs in it.