From fcfb7b8ff63edf0c4f7925baeb664d3c17dc8919 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Mon, 6 Mar 2023 22:57:10 -0800 Subject: [PATCH] Add warning comments to Granularity.getIterable. (#13888) This function is notorious for causing memory exhaustion and excessive CPU usage; so much so that it was valuable to work around it in the SQL planner in #13206. Hopefully, a warning comment will encourage developers to stay away and come up with solutions that do not involve computing all possible buckets. --- .../java/util/common/granularity/Granularity.java | 13 +++++++++++++ .../apache/druid/sql/calcite/rel/DruidQuery.java | 2 ++ 2 files changed, 15 insertions(+) diff --git a/processing/src/main/java/org/apache/druid/java/util/common/granularity/Granularity.java b/processing/src/main/java/org/apache/druid/java/util/common/granularity/Granularity.java index ba48a998b9d..812538ae6e0 100644 --- a/processing/src/main/java/org/apache/druid/java/util/common/granularity/Granularity.java +++ b/processing/src/main/java/org/apache/druid/java/util/common/granularity/Granularity.java @@ -209,6 +209,19 @@ public abstract class Granularity implements Cacheable return vals; } + /** + * Return an iterable of granular buckets that overlap a particular interval. + * + * In cases where the number of granular buckets is very large, the Iterable returned by this method will take + * an excessive amount of time to compute, and materializing it into a collection will take an excessive amount + * of memory. For example, this happens in the extreme case of an input interval of + * {@link org.apache.druid.java.util.common.Intervals#ETERNITY} and any granularity other than + * {@link Granularities#ALL}, as well as cases like an input interval of ten years with {@link Granularities#SECOND}. + * + * To avoid issues stemming from large numbers of buckets, this method should be avoided, and code that uses + * this method should be rewritten to use some other approach. For example: rather than computing all possible + * buckets in a wide time range, only process buckets related to actual data points that appear. + */ public Iterable getIterable(final Interval input) { return new IntervalIterable(input); diff --git a/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java b/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java index 5019f6fbaf7..dee97065587 100644 --- a/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java +++ b/sql/src/main/java/org/apache/druid/sql/calcite/rel/DruidQuery.java @@ -871,6 +871,8 @@ public class DruidQuery *

* Necessary because some combinations are unsafe, mainly because they would lead to the creation of too many * time-granular buckets during query processing. + * + * @see Granularity#getIterable(Interval) the problematic method call we are trying to avoid */ private static boolean canUseQueryGranularity( final DataSource dataSource,