From 18a4722d11b2d0e7dd39ba50269b1ef22f38d346 Mon Sep 17 00:00:00 2001 From: Adarsh Sanjeev Date: Tue, 14 May 2024 10:23:57 +0530 Subject: [PATCH] Resolve a bug where datasketches would not downsample sketches sufficiently (#16119) * Fix sketch memory issue * Rename function * Add unit test * Revert downsampling change --- .../ClusterByStatisticsCollectorImpl.java | 8 +++++++- .../ClusterByStatisticsCollectorImplTest.java | 20 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/statistics/ClusterByStatisticsCollectorImpl.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/statistics/ClusterByStatisticsCollectorImpl.java index f51cdccd9ed..aad5d3d5483 100644 --- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/statistics/ClusterByStatisticsCollectorImpl.java +++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/statistics/ClusterByStatisticsCollectorImpl.java @@ -212,6 +212,12 @@ public class ClusterByStatisticsCollectorImpl implements ClusterByStatisticsColl return count; } + @VisibleForTesting + long getTotalRetainedBytes() + { + return totalRetainedBytes; + } + @Override public boolean hasMultipleValues(final int keyPosition) { @@ -414,7 +420,7 @@ public class ClusterByStatisticsCollectorImpl implements ClusterByStatisticsColl void downSample() { long newTotalRetainedBytes = totalRetainedBytes; - final long targetTotalRetainedBytes = totalRetainedBytes / 2; + final long targetTotalRetainedBytes = Math.min(totalRetainedBytes / 2, maxRetainedBytes); final List> sortedHolders = new ArrayList<>(buckets.size()); final RowKeyReader trimmedRowReader = keyReader.trimmedKeyReader(clusterBy.getBucketByCount()); diff --git a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/statistics/ClusterByStatisticsCollectorImplTest.java b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/statistics/ClusterByStatisticsCollectorImplTest.java index b2fbf500ce4..baf630681b0 100644 --- a/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/statistics/ClusterByStatisticsCollectorImplTest.java +++ b/extensions-core/multi-stage-query/src/test/java/org/apache/druid/msq/statistics/ClusterByStatisticsCollectorImplTest.java @@ -451,6 +451,26 @@ public class ClusterByStatisticsCollectorImplTest extends InitializedNullHandlin ); } + @Test + public void testShouldDownsampleSingleBucket() + { + ClusterByStatisticsCollectorImpl clusterByStatisticsCollector = + (ClusterByStatisticsCollectorImpl) ClusterByStatisticsCollectorImpl.create( + CLUSTER_BY_XYZ_BUCKET_BY_X, + SIGNATURE, + 35000, + 500, + false, + false + ); + + clusterByStatisticsCollector.add(createKey(CLUSTER_BY_XYZ_BUCKET_BY_X, 2, 1, "value1"), 1); + clusterByStatisticsCollector.add(createKey(CLUSTER_BY_XYZ_BUCKET_BY_X, 2, 3, "value2"), 1); + clusterByStatisticsCollector.add(createKey(CLUSTER_BY_XYZ_BUCKET_BY_X, 1, 1, "Extremely long key string for unit test; Extremely long key string for unit test;"), 500); + + Assert.assertTrue(clusterByStatisticsCollector.getTotalRetainedBytes() <= 35000); + } + @Test public void testBucketDownsampledToSingleKeyFinishesCorrectly() {