SQL: use a calendar interval for histograms over 1 month intervals (#52586) (#52715)

(cherry picked from commit 928b11a34ec92d90d082abdf4fa09f7ce1d7c0c4)
This commit is contained in:
Andrei Stefan 2020-02-25 01:41:51 +02:00 committed by GitHub
parent ba0401ecfd
commit ed6b10bc03
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 71 additions and 14 deletions

View File

@ -87,8 +87,8 @@ actually used will be `INTERVAL '2' DAY`. If the interval specified is less than
[IMPORTANT]
All intervals specified for a date/time HISTOGRAM will use a <<search-aggregations-bucket-datehistogram-aggregation,fixed interval>>
in their `date_histogram` aggregation definition, with the notable exception of `INTERVAL '1' YEAR` where a calendar interval is used.
The choice for a calendar interval was made for having a more intuitive result for YEAR groupings. Calendar intervals consider a one year
in their `date_histogram` aggregation definition, with the notable exceptions of `INTERVAL '1' YEAR` and `INTERVAL '1' MONTH` where a calendar interval is used.
The choice for a calendar interval was made for having a more intuitive result for YEAR and MONTH groupings. In the case of YEAR, for example, the calendar intervals consider a one year
bucket as the one starting on January 1st of that specific year, whereas a fixed interval one-year-bucket considers one year as a number
of milliseconds (for example, `31536000000ms` corresponding to 365 days, 24 hours per day, 60 minutes per hour etc.). With fixed intervals,
the day of February 5th, 2019 for example, belongs to a bucket that starts on December 20th, 2018 and {es} (and implicitly {es-sql}) would

View File

@ -503,6 +503,34 @@ SELECT HISTOGRAM(birth_date, INTERVAL 1 YEAR) AS h, COUNT(*) as c FROM test_emp
null |10
;
// Groups test_emp by a one-month HISTOGRAM over birth_date and by birth_date itself.
// In the expected rows every non-null `h` is midnight (Z) on the first day of the month
// that contains the matching birth_date, and the null birth_date rows (count 10) come
// back under a null `h`.
histogramOneMonth
schema::h:ts|c:l|birth_date:ts
SELECT HISTOGRAM(birth_date, INTERVAL 1 MONTH) AS h, COUNT(*) as c, birth_date FROM test_emp GROUP BY h, birth_date HAVING c >= 1 ORDER BY h ASC LIMIT 20;
h | c | birth_date
------------------------+---------------+------------------------
null |10 |null
1952-02-01T00:00:00.000Z|1 |1952-02-27T00:00:00.000Z
1952-04-01T00:00:00.000Z|1 |1952-04-19T00:00:00.000Z
1952-05-01T00:00:00.000Z|1 |1952-05-15T00:00:00.000Z
1952-06-01T00:00:00.000Z|1 |1952-06-13T00:00:00.000Z
1952-07-01T00:00:00.000Z|1 |1952-07-08T00:00:00.000Z
1952-08-01T00:00:00.000Z|1 |1952-08-06T00:00:00.000Z
1952-11-01T00:00:00.000Z|1 |1952-11-13T00:00:00.000Z
1952-12-01T00:00:00.000Z|1 |1952-12-24T00:00:00.000Z
1953-01-01T00:00:00.000Z|1 |1953-01-07T00:00:00.000Z
1953-01-01T00:00:00.000Z|1 |1953-01-23T00:00:00.000Z
1953-02-01T00:00:00.000Z|1 |1953-02-08T00:00:00.000Z
1953-04-01T00:00:00.000Z|1 |1953-04-03T00:00:00.000Z
1953-04-01T00:00:00.000Z|1 |1953-04-20T00:00:00.000Z
1953-04-01T00:00:00.000Z|1 |1953-04-21T00:00:00.000Z
1953-07-01T00:00:00.000Z|1 |1953-07-28T00:00:00.000Z
1953-09-01T00:00:00.000Z|1 |1953-09-02T00:00:00.000Z
1953-09-01T00:00:00.000Z|1 |1953-09-19T00:00:00.000Z
1953-09-01T00:00:00.000Z|1 |1953-09-29T00:00:00.000Z
1953-11-01T00:00:00.000Z|1 |1953-11-07T00:00:00.000Z
;
histogramDateTimeWithMonthOnTop
schema::h:i|c:l
SELECT HISTOGRAM(MONTH(birth_date), 2) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;

View File

@ -6,6 +6,7 @@
package org.elasticsearch.xpack.sql.expression.function.grouping;
import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval;
import org.elasticsearch.xpack.ql.expression.Expression;
import org.elasticsearch.xpack.ql.expression.Expressions.ParamOrdinal;
import org.elasticsearch.xpack.ql.expression.Literal;
@ -28,6 +29,8 @@ public class Histogram extends GroupingFunction {
private final Literal interval;
private final ZoneId zoneId;
public static String YEAR_INTERVAL = DateHistogramInterval.YEAR.toString();
public static String MONTH_INTERVAL = DateHistogramInterval.MONTH.toString();
public Histogram(Source source, Expression field, Expression interval, ZoneId zoneId) {
super(source, field, Collections.singletonList(interval));

View File

@ -5,10 +5,10 @@
*/
package org.elasticsearch.xpack.sql.expression.function.scalar.datetime;
import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval;
import org.elasticsearch.xpack.ql.expression.Expression;
import org.elasticsearch.xpack.ql.tree.NodeInfo.NodeCtor2;
import org.elasticsearch.xpack.ql.tree.Source;
import org.elasticsearch.xpack.sql.expression.function.grouping.Histogram;
import org.elasticsearch.xpack.sql.expression.function.scalar.datetime.DateTimeProcessor.DateTimeExtractor;
import java.time.ZoneId;
@ -18,8 +18,6 @@ import java.time.ZoneId;
*/
public class Year extends DateTimeHistogramFunction {
public static String YEAR_INTERVAL = DateHistogramInterval.YEAR.toString();
/**
 * Builds a {@code Year} function over {@code field}, forwarding the source, field and
 * zone id to the superclass constructor together with {@link DateTimeExtractor#YEAR}.
 */
public Year(Source source, Expression field, ZoneId zoneId) {
super(source, field, zoneId, DateTimeExtractor.YEAR);
}
@ -41,6 +39,6 @@ public class Year extends DateTimeHistogramFunction {
@Override
public String calendarInterval() {
return YEAR_INTERVAL;
return Histogram.YEAR_INTERVAL;
}
}

View File

@ -41,7 +41,6 @@ import org.elasticsearch.xpack.sql.expression.function.aggregate.CompoundNumeric
import org.elasticsearch.xpack.sql.expression.function.aggregate.TopHits;
import org.elasticsearch.xpack.sql.expression.function.grouping.Histogram;
import org.elasticsearch.xpack.sql.expression.function.scalar.datetime.DateTimeHistogramFunction;
import org.elasticsearch.xpack.sql.expression.function.scalar.datetime.Year;
import org.elasticsearch.xpack.sql.expression.literal.interval.IntervalYearMonth;
import org.elasticsearch.xpack.sql.expression.literal.interval.Intervals;
import org.elasticsearch.xpack.sql.plan.logical.Pivot;
@ -91,6 +90,8 @@ import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicReference;
import static org.elasticsearch.xpack.ql.util.CollectionUtils.combine;
import static org.elasticsearch.xpack.sql.expression.function.grouping.Histogram.MONTH_INTERVAL;
import static org.elasticsearch.xpack.sql.expression.function.grouping.Histogram.YEAR_INTERVAL;
import static org.elasticsearch.xpack.sql.planner.QueryTranslator.toAgg;
import static org.elasticsearch.xpack.sql.planner.QueryTranslator.toQuery;
import static org.elasticsearch.xpack.sql.type.SqlDataTypes.DATE;
@ -283,7 +284,6 @@ class QueryFolder extends RuleExecutor<PhysicalPlan> {
field = field.exactAttribute();
key = new GroupByValue(aggId, field.name());
}
// handle functions
else if (exp instanceof Function) {
// dates are handled differently because of date histograms
@ -322,13 +322,17 @@ class QueryFolder extends RuleExecutor<PhysicalPlan> {
// date histogram
if (isDateBased(h.dataType())) {
Object value = h.interval().value();
// interval of exactly 1 year
if (value instanceof IntervalYearMonth
&& ((IntervalYearMonth) value).interval().equals(Period.ofYears(1))) {
String calendarInterval = Year.YEAR_INTERVAL;
// When the histogram is `INTERVAL '1' YEAR`, the interval used in the ES date_histogram will be
// a calendar_interval with value "1y". All other intervals will be fixed_intervals expressed in ms.
// interval of exactly 1 year or 1 month
if (value instanceof IntervalYearMonth &&
(((IntervalYearMonth) value).interval().equals(Period.ofYears(1))
|| ((IntervalYearMonth) value).interval().equals(Period.ofMonths(1)))) {
Period yearMonth = ((IntervalYearMonth) value).interval();
String calendarInterval = yearMonth.equals(Period.ofYears(1)) ? YEAR_INTERVAL : MONTH_INTERVAL;
// When the histogram is `INTERVAL '1' YEAR` or `INTERVAL '1' MONTH`, the interval used in
// the ES date_histogram will be a calendar_interval with value "1y" or "1M" respectively.
// All other intervals will be fixed_intervals expressed in ms.
if (field instanceof FieldAttribute) {
key = new GroupByDateHistogram(aggId, QueryTranslator.nameOf(field), calendarInterval, h.zoneId());
} else if (field instanceof Function) {

View File

@ -1054,6 +1054,30 @@ public class QueryTranslatorTests extends ESTestCase {
+ "\"calendar_interval\":\"1y\",\"time_zone\":\"Z\"}}}]}}}"));
}
/**
 * A HISTOGRAM over {@code INTERVAL 1 MONTH} must be planned as a single-column
 * {@link EsQueryExec} whose date_histogram uses {@code calendar_interval} "1M".
 */
public void testGroupByOneMonthHistogramQueryTranslator() {
    PhysicalPlan plan = optimizeAndPlan("SELECT HISTOGRAM(date, INTERVAL 1 MONTH) AS h FROM test GROUP BY h");
    assertEquals(EsQueryExec.class, plan.getClass());

    // Exactly one output column: the histogram alias, typed as a datetime.
    EsQueryExec queryExec = (EsQueryExec) plan;
    assertEquals(1, queryExec.output().size());
    assertEquals("h", queryExec.output().get(0).qualifiedName());
    assertEquals(DATETIME, queryExec.output().get(0).dataType());

    // Whitespace is stripped so the comparison does not depend on pretty-printing.
    String compactAggs = queryExec.queryContainer().aggs().asAggBuilder().toString().replaceAll("\\s+", "");
    String expectedTail = "\"date_histogram\":{\"field\":\"date\",\"missing_bucket\":true,\"value_type\":\"date\",\"order\":\"asc\","
        + "\"calendar_interval\":\"1M\",\"time_zone\":\"Z\"}}}]}}}";
    assertThat(compactAggs, endsWith(expectedTail));
}
/**
 * A HISTOGRAM over a multi-month interval ({@code INTERVAL 5 MONTH}) must be planned as a
 * single-column {@link EsQueryExec} whose date_histogram uses a {@code fixed_interval}
 * expressed in milliseconds rather than a calendar interval.
 */
public void testGroupByMoreMonthsHistogramQueryTranslator() {
    PhysicalPlan plan = optimizeAndPlan("SELECT HISTOGRAM(date, INTERVAL 5 MONTH) AS h FROM test GROUP BY h");
    assertEquals(EsQueryExec.class, plan.getClass());

    // Exactly one output column: the histogram alias, typed as a datetime.
    EsQueryExec queryExec = (EsQueryExec) plan;
    assertEquals(1, queryExec.output().size());
    assertEquals("h", queryExec.output().get(0).qualifiedName());
    assertEquals(DATETIME, queryExec.output().get(0).dataType());

    // Whitespace is stripped so the comparison does not depend on pretty-printing.
    String compactAggs = queryExec.queryContainer().aggs().asAggBuilder().toString().replaceAll("\\s+", "");
    String expectedTail = "\"date_histogram\":{\"field\":\"date\",\"missing_bucket\":true,\"value_type\":\"date\",\"order\":\"asc\","
        + "\"fixed_interval\":\"12960000000ms\",\"time_zone\":\"Z\"}}}]}}}";
    assertThat(compactAggs, endsWith(expectedTail));
}
public void testGroupByYearAndScalarsQueryTranslator() {
PhysicalPlan p = optimizeAndPlan("SELECT YEAR(CAST(date + INTERVAL 5 months AS DATE)) FROM test GROUP BY 1");
assertEquals(EsQueryExec.class, p.getClass());