From 67720b60ae664aab95e5482086e0b4f2a77be500 Mon Sep 17 00:00:00 2001 From: AmatyaAvadhanula Date: Tue, 16 Jan 2024 12:31:36 +0530 Subject: [PATCH] Skip compaction for intervals without data (#15676) * Skip compaction for intervals with a single tombstone and no data --- .../compact/NewestSegmentFirstIterator.java | 6 + .../compact/NewestSegmentFirstPolicyTest.java | 113 ++++++++++++++++++ 2 files changed, 119 insertions(+) diff --git a/server/src/main/java/org/apache/druid/server/coordinator/compact/NewestSegmentFirstIterator.java b/server/src/main/java/org/apache/druid/server/coordinator/compact/NewestSegmentFirstIterator.java index 05bae7d2386..c4ae771f808 100644 --- a/server/src/main/java/org/apache/druid/server/coordinator/compact/NewestSegmentFirstIterator.java +++ b/server/src/main/java/org/apache/druid/server/coordinator/compact/NewestSegmentFirstIterator.java @@ -339,6 +339,12 @@ public class NewestSegmentFirstIterator implements CompactionSegmentIterator while (compactibleSegmentIterator.hasNext()) { List segments = compactibleSegmentIterator.next(); + // Do not compact an interval which consists of a single tombstone + // If there are multiple tombstones in the interval, we may still want to compact them + if (segments.size() == 1 && segments.get(0).isTombstone()) { + continue; + } + final SegmentsToCompact candidates = SegmentsToCompact.from(segments); final Interval interval = candidates.getUmbrellaInterval(); diff --git a/server/src/test/java/org/apache/druid/server/coordinator/compact/NewestSegmentFirstPolicyTest.java b/server/src/test/java/org/apache/druid/server/coordinator/compact/NewestSegmentFirstPolicyTest.java index 553445d9f73..dda1cb1af13 100644 --- a/server/src/test/java/org/apache/druid/server/coordinator/compact/NewestSegmentFirstPolicyTest.java +++ b/server/src/test/java/org/apache/druid/server/coordinator/compact/NewestSegmentFirstPolicyTest.java @@ -57,6 +57,7 @@ import org.apache.druid.timeline.Partitions; import 
org.apache.druid.timeline.SegmentTimeline; import org.apache.druid.timeline.partition.NumberedShardSpec; import org.apache.druid.timeline.partition.ShardSpec; +import org.apache.druid.timeline.partition.TombstoneShardSpec; import org.apache.druid.utils.Streams; import org.joda.time.DateTimeZone; import org.joda.time.Interval; @@ -1764,6 +1765,118 @@ public class NewestSegmentFirstPolicyTest Assert.assertFalse(iterator.hasNext()); } + @Test + public void testSkipCompactionForIntervalsContainingSingleTombstone() + { + final DataSegment tombstone2023 = new DataSegment( + DATA_SOURCE, + Intervals.of("2023/2024"), + "0", + new HashMap<>(), + new ArrayList<>(), + new ArrayList<>(), + TombstoneShardSpec.INSTANCE, + 0, + 1); + final DataSegment dataSegment2023 = new DataSegment( + DATA_SOURCE, + Intervals.of("2023/2024"), + "0", + new HashMap<>(), + new ArrayList<>(), + new ArrayList<>(), + new NumberedShardSpec(1, 0), + 0, + 100); + final DataSegment tombstone2024 = new DataSegment( + DATA_SOURCE, + Intervals.of("2024/2025"), + "0", + new HashMap<>(), + new ArrayList<>(), + new ArrayList<>(), + TombstoneShardSpec.INSTANCE, + 0, + 1); + + CompactionSegmentIterator iterator = policy.reset( + ImmutableMap.of(DATA_SOURCE, + createCompactionConfig(10000, + new Period("P0D"), + new UserCompactionTaskGranularityConfig(Granularities.YEAR, null, null) + ) + ), + ImmutableMap.of( + DATA_SOURCE, + SegmentTimeline.forSegments(ImmutableSet.of(tombstone2023, dataSegment2023, tombstone2024)) + ), + Collections.emptyMap() + ); + + // Skips 2024/2025 since it has a single tombstone and no data. + // Return all segments in 2023/2024 since at least one of them has data despite there being a tombstone. 
+ Assert.assertEquals( + ImmutableList.of(tombstone2023, dataSegment2023), + iterator.next().getSegments() + ); + + final DataSegment tombstone2025Jan = new DataSegment( + DATA_SOURCE, + Intervals.of("2025-01-01/2025-02-01"), + "0", + new HashMap<>(), + new ArrayList<>(), + new ArrayList<>(), + TombstoneShardSpec.INSTANCE, + 0, + 1); + final DataSegment tombstone2025Feb = new DataSegment( + DATA_SOURCE, + Intervals.of("2025-02-01/2025-03-01"), + "0", + new HashMap<>(), + new ArrayList<>(), + new ArrayList<>(), + TombstoneShardSpec.INSTANCE, + 0, + 1); + final DataSegment tombstone2025Mar = new DataSegment( + DATA_SOURCE, + Intervals.of("2025-03-01/2025-04-01"), + "0", + new HashMap<>(), + new ArrayList<>(), + new ArrayList<>(), + TombstoneShardSpec.INSTANCE, + 0, + 1); + iterator = policy.reset( + ImmutableMap.of(DATA_SOURCE, + createCompactionConfig(10000, + new Period("P0D"), + new UserCompactionTaskGranularityConfig(Granularities.YEAR, null, null) + ) + ), + ImmutableMap.of( + DATA_SOURCE, + SegmentTimeline.forSegments(ImmutableSet.of( + tombstone2023, + dataSegment2023, + tombstone2024, + tombstone2025Jan, + tombstone2025Feb, + tombstone2025Mar + )) + ), + Collections.emptyMap() + ); + // Does not skip the tombstones in 2025 since there are multiple of them which could potentially be condensed to one + Assert.assertEquals( + ImmutableList.of(tombstone2025Jan, tombstone2025Feb, tombstone2025Mar), + iterator.next().getSegments() + ); + } + private static void assertCompactSegmentIntervals( CompactionSegmentIterator iterator, Period segmentPeriod,