Resolve a bug where datasketches would not downsample sketches sufficiently (#16119)

* Fix sketch memory issue

* Rename function

* Add unit test

* Revert downsampling change
This commit is contained in:
Adarsh Sanjeev 2024-05-14 10:23:57 +05:30 committed by GitHub
parent b8dd7478d0
commit 18a4722d11
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 27 additions and 1 deletions

View File

@ -212,6 +212,12 @@ public class ClusterByStatisticsCollectorImpl implements ClusterByStatisticsColl
return count; return count;
} }
/**
 * Returns the current value of {@code totalRetainedBytes}.
 *
 * Package-private and marked {@code @VisibleForTesting}: intended to be read by tests rather than by
 * production callers.
 */
@VisibleForTesting
long getTotalRetainedBytes()
{
  return this.totalRetainedBytes;
}
@Override @Override
public boolean hasMultipleValues(final int keyPosition) public boolean hasMultipleValues(final int keyPosition)
{ {
@ -414,7 +420,7 @@ public class ClusterByStatisticsCollectorImpl implements ClusterByStatisticsColl
void downSample() void downSample()
{ {
long newTotalRetainedBytes = totalRetainedBytes; long newTotalRetainedBytes = totalRetainedBytes;
final long targetTotalRetainedBytes = totalRetainedBytes / 2; final long targetTotalRetainedBytes = Math.min(totalRetainedBytes / 2, maxRetainedBytes);
final List<Pair<Long, BucketHolder>> sortedHolders = new ArrayList<>(buckets.size()); final List<Pair<Long, BucketHolder>> sortedHolders = new ArrayList<>(buckets.size());
final RowKeyReader trimmedRowReader = keyReader.trimmedKeyReader(clusterBy.getBucketByCount()); final RowKeyReader trimmedRowReader = keyReader.trimmedKeyReader(clusterBy.getBucketByCount());

View File

@ -451,6 +451,26 @@ public class ClusterByStatisticsCollectorImplTest extends InitializedNullHandlin
); );
} }
/**
 * Creates a collector with 35000 as its third {@code create} argument, adds two short keys and one
 * key carrying a long string value, and asserts that {@code getTotalRetainedBytes()} does not
 * exceed 35000 afterwards.
 */
@Test
public void testShouldDownsampleSingleBucket()
{
  // The same value is passed to create(...) and used as the upper bound in the final assertion.
  final int byteLimit = 35000;
  final String longKeyValue =
      "Extremely long key string for unit test; Extremely long key string for unit test;";

  final ClusterByStatisticsCollectorImpl collector =
      (ClusterByStatisticsCollectorImpl) ClusterByStatisticsCollectorImpl.create(
          CLUSTER_BY_XYZ_BUCKET_BY_X,
          SIGNATURE,
          byteLimit,
          500,
          false,
          false
      );

  // Two short keys whose first component is 2.
  collector.add(createKey(CLUSTER_BY_XYZ_BUCKET_BY_X, 2, 1, "value1"), 1);
  collector.add(createKey(CLUSTER_BY_XYZ_BUCKET_BY_X, 2, 3, "value2"), 1);
  // One key whose first component is 1, carrying the long string, added with a second argument of 500.
  collector.add(createKey(CLUSTER_BY_XYZ_BUCKET_BY_X, 1, 1, longKeyValue), 500);

  final long retainedBytes = collector.getTotalRetainedBytes();
  Assert.assertTrue(retainedBytes <= byteLimit);
}
@Test @Test
public void testBucketDownsampledToSingleKeyFinishesCorrectly() public void testBucketDownsampledToSingleKeyFinishesCorrectly()
{ {