SQL: Fix bug regarding histograms usage in scripting (#36866)
Allow scripts to correctly reference grouping functions Fix bug in translation of date/time functions mixed with histograms. Enhance Verifier to prevent histograms being nested inside other functions inside GROUP BY (as it implies double grouping) Extend Histogram docs
This commit is contained in:
parent
4cd570593d
commit
ac032a0b9d
|
@ -36,6 +36,8 @@ The histogram function takes all matching values and divides them into buckets w
|
|||
bucket_key = Math.floor(value / interval) * interval
|
||||
----
|
||||
|
||||
NOTE:: The histogram in SQL does *NOT* return empty buckets for missing intervals as the traditional <<search-aggregations-bucket-histogram-aggregation, histogram>> and <<search-aggregations-bucket-datehistogram-aggregation, date histogram>>. Such behavior does not fit conceptually in SQL which treats all missing values as `NULL`; as such the histogram places all missing values in the `NULL` group.
|
||||
|
||||
`Histogram` can be applied on either numeric fields:
|
||||
|
||||
|
||||
|
@ -51,4 +53,26 @@ or date/time fields:
|
|||
include-tagged::{sql-specs}/docs.csv-spec[histogramDate]
|
||||
----
|
||||
|
||||
Expressions inside the histogram are also supported as long as the
|
||||
return type is numeric:
|
||||
|
||||
["source","sql",subs="attributes,callouts,macros"]
|
||||
----
|
||||
include-tagged::{sql-specs}/docs.csv-spec[histogramNumericExpression]
|
||||
----
|
||||
|
||||
Do note that histograms (and grouping functions in general) allow custom expressions but cannot have any functions applied to them in the `GROUP BY`. In other words, the following statement is *NOT* allowed:
|
||||
|
||||
["source","sql",subs="attributes,callouts,macros"]
|
||||
----
|
||||
include-tagged::{sql-specs}/docs.csv-spec[expressionOnHistogramNotAllowed]
|
||||
----
|
||||
|
||||
as it requires two groupings (one for histogram followed by a second for applying the function on top of the histogram groups).
|
||||
|
||||
Instead one can rewrite the query to move the expression on the histogram _inside_ of it:
|
||||
|
||||
["source","sql",subs="attributes,callouts,macros"]
|
||||
----
|
||||
include-tagged::{sql-specs}/docs.csv-spec[histogramDateExpression]
|
||||
----
|
||||
|
|
|
@ -262,9 +262,51 @@ SELECT HISTOGRAM(birth_date, INTERVAL 1 YEAR) AS h, COUNT(*) as c FROM test_emp
|
|||
null |10
|
||||
;
|
||||
|
||||
histogramDateWithDateFunction-Ignore
|
||||
SELECT YEAR(HISTOGRAM(birth_date, INTERVAL 1 YEAR)) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;
|
||||
|
||||
|
||||
histogramDateWithMonthOnTop
|
||||
schema::h:i|c:l
|
||||
SELECT HISTOGRAM(MONTH(birth_date), 2) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;
|
||||
|
||||
h | c
|
||||
---------------+---------------
|
||||
12 |7
|
||||
10 |17
|
||||
8 |16
|
||||
6 |16
|
||||
4 |18
|
||||
2 |10
|
||||
0 |6
|
||||
null |10
|
||||
;
|
||||
|
||||
histogramDateWithYearOnTop
|
||||
schema::h:i|c:l
|
||||
SELECT HISTOGRAM(YEAR(birth_date), 2) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;
|
||||
h | c
|
||||
---------------+---------------
|
||||
1964 |5
|
||||
1962 |13
|
||||
1960 |16
|
||||
1958 |16
|
||||
1956 |9
|
||||
1954 |12
|
||||
1952 |19
|
||||
null |10
|
||||
;
|
||||
|
||||
histogramNumericWithExpression
|
||||
schema::h:i|c:l
|
||||
SELECT HISTOGRAM(emp_no % 100, 10) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;
|
||||
|
||||
h | c
|
||||
---------------+---------------
|
||||
90 |10
|
||||
80 |10
|
||||
70 |10
|
||||
60 |10
|
||||
50 |10
|
||||
40 |10
|
||||
30 |10
|
||||
20 |10
|
||||
10 |10
|
||||
0 |10
|
||||
;
|
|
@ -725,6 +725,27 @@ SELECT HISTOGRAM(salary, 5000) AS h FROM emp GROUP BY h;
|
|||
// end::histogramNumeric
|
||||
;
|
||||
|
||||
histogramNumericExpression
|
||||
schema::h:i|c:l
|
||||
// tag::histogramNumericExpression
|
||||
SELECT HISTOGRAM(salary % 100, 10) AS h, COUNT(*) AS c FROM emp GROUP BY h;
|
||||
|
||||
h | c
|
||||
---------------+---------------
|
||||
0 |10
|
||||
10 |15
|
||||
20 |10
|
||||
30 |14
|
||||
40 |9
|
||||
50 |9
|
||||
60 |8
|
||||
70 |13
|
||||
80 |3
|
||||
90 |9
|
||||
|
||||
// end::histogramNumericExpression
|
||||
;
|
||||
|
||||
histogramDate
|
||||
schema::h:ts|c:l
|
||||
// tag::histogramDate
|
||||
|
@ -752,6 +773,30 @@ null |10
|
|||
// end::histogramDate
|
||||
;
|
||||
|
||||
expressionOnHistogramNotAllowed-Ignore
|
||||
// tag::expressionOnHistogramNotAllowed
|
||||
SELECT MONTH(HISTOGRAM(birth_date), 2)) AS h, COUNT(*) as c FROM emp GROUP BY h ORDER BY h DESC;
|
||||
// end::expressionOnHistogramNotAllowed
|
||||
|
||||
histogramDateExpression
|
||||
schema::h:i|c:l
|
||||
// tag::histogramDateExpression
|
||||
SELECT HISTOGRAM(MONTH(birth_date), 2) AS h, COUNT(*) as c FROM emp GROUP BY h ORDER BY h DESC;
|
||||
|
||||
h | c
|
||||
---------------+---------------
|
||||
12 |7
|
||||
10 |17
|
||||
8 |16
|
||||
6 |16
|
||||
4 |18
|
||||
2 |10
|
||||
0 |6
|
||||
null |10
|
||||
|
||||
// end::histogramDateExpression
|
||||
;
|
||||
|
||||
///////////////////////////////
|
||||
//
|
||||
// Date/Time
|
||||
|
|
|
@ -18,6 +18,8 @@ import org.elasticsearch.xpack.sql.expression.function.Function;
|
|||
import org.elasticsearch.xpack.sql.expression.function.FunctionAttribute;
|
||||
import org.elasticsearch.xpack.sql.expression.function.Functions;
|
||||
import org.elasticsearch.xpack.sql.expression.function.Score;
|
||||
import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunctionAttribute;
|
||||
import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;
|
||||
import org.elasticsearch.xpack.sql.expression.function.scalar.ScalarFunction;
|
||||
import org.elasticsearch.xpack.sql.expression.predicate.conditional.ConditionalFunction;
|
||||
import org.elasticsearch.xpack.sql.expression.predicate.operator.comparison.In;
|
||||
|
@ -224,6 +226,7 @@ public final class Verifier {
|
|||
validateConditional(p, localFailures);
|
||||
|
||||
checkFilterOnAggs(p, localFailures);
|
||||
checkFilterOnGrouping(p, localFailures);
|
||||
|
||||
if (!groupingFailures.contains(p)) {
|
||||
checkGroupBy(p, localFailures, resolvedFunctions, groupingFailures);
|
||||
|
@ -419,7 +422,7 @@ public final class Verifier {
|
|||
return true;
|
||||
}
|
||||
// skip aggs (allowed to refer to non-group columns)
|
||||
if (Functions.isAggregate(e)) {
|
||||
if (Functions.isAggregate(e) || Functions.isGrouping(e)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -448,6 +451,21 @@ public final class Verifier {
|
|||
}
|
||||
}));
|
||||
|
||||
a.groupings().forEach(e -> {
|
||||
if (Functions.isGrouping(e) == false) {
|
||||
e.collectFirstChildren(c -> {
|
||||
if (Functions.isGrouping(c)) {
|
||||
localFailures.add(fail(c,
|
||||
"Cannot combine [%s] grouping function inside GROUP BY, found [%s];"
|
||||
+ " consider moving the expression inside the histogram",
|
||||
Expressions.name(c), Expressions.name(e)));
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
if (!localFailures.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
@ -547,15 +565,26 @@ public final class Verifier {
|
|||
if (p instanceof Filter) {
|
||||
Filter filter = (Filter) p;
|
||||
if ((filter.child() instanceof Aggregate) == false) {
|
||||
filter.condition().forEachDown(f -> {
|
||||
if (Functions.isAggregate(f) || Functions.isGrouping(f)) {
|
||||
String type = Functions.isAggregate(f) ? "aggregate" : "grouping";
|
||||
localFailures.add(fail(f,
|
||||
"Cannot use WHERE filtering on %s function [%s], use HAVING instead", type, Expressions.name(f)));
|
||||
filter.condition().forEachDown(e -> {
|
||||
if (Functions.isAggregate(e) || e instanceof AggregateFunctionAttribute) {
|
||||
localFailures.add(
|
||||
fail(e, "Cannot use WHERE filtering on aggregate function [%s], use HAVING instead", Expressions.name(e)));
|
||||
}
|
||||
}, Expression.class);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}, Function.class);
|
||||
|
||||
private static void checkFilterOnGrouping(LogicalPlan p, Set<Failure> localFailures) {
|
||||
if (p instanceof Filter) {
|
||||
Filter filter = (Filter) p;
|
||||
filter.condition().forEachDown(e -> {
|
||||
if (Functions.isGrouping(e) || e instanceof GroupingFunctionAttribute) {
|
||||
localFailures
|
||||
.add(fail(e, "Cannot filter on grouping function [%s], use its argument instead", Expressions.name(e)));
|
||||
}
|
||||
}, Expression.class);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -346,6 +346,9 @@ public final class InternalSqlScriptUtils {
|
|||
}
|
||||
|
||||
public static ZonedDateTime asDateTime(Object dateTime) {
|
||||
if (dateTime == null) {
|
||||
return null;
|
||||
}
|
||||
if (dateTime instanceof JodaCompatibleZonedDateTime) {
|
||||
return ((JodaCompatibleZonedDateTime) dateTime).getZonedDateTime();
|
||||
}
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License;
|
||||
* you may not use this file except in compliance with the Elastic License.
|
||||
*/
|
||||
package org.elasticsearch.xpack.sql.expression.gen.script;
|
||||
|
||||
import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;
|
||||
|
||||
class Grouping extends Param<GroupingFunctionAttribute> {
|
||||
|
||||
Grouping(GroupingFunctionAttribute groupRef) {
|
||||
super(groupRef);
|
||||
}
|
||||
|
||||
String groupName() {
|
||||
return value().functionId();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String prefix() {
|
||||
return "g";
|
||||
}
|
||||
}
|
|
@ -85,25 +85,15 @@ public class Params {
|
|||
String s = a.aggProperty() != null ? a.aggProperty() : a.aggName();
|
||||
map.put(p.prefix() + aggs++, s);
|
||||
}
|
||||
if (p instanceof Grouping) {
|
||||
Grouping g = (Grouping) p;
|
||||
map.put(p.prefix() + aggs++, g.groupName());
|
||||
}
|
||||
}
|
||||
|
||||
return map;
|
||||
}
|
||||
|
||||
// return the agg refs
|
||||
List<String> asAggRefs() {
|
||||
List<String> refs = new ArrayList<>();
|
||||
|
||||
for (Param<?> p : params) {
|
||||
if (p instanceof Agg) {
|
||||
refs.add(((Agg) p).aggName());
|
||||
}
|
||||
}
|
||||
|
||||
return refs;
|
||||
}
|
||||
|
||||
|
||||
private static List<Param<?>> flatten(List<Param<?>> params) {
|
||||
List<Param<?>> flatten = emptyList();
|
||||
|
||||
|
@ -116,6 +106,9 @@ public class Params {
|
|||
else if (p instanceof Agg) {
|
||||
flatten.add(p);
|
||||
}
|
||||
else if (p instanceof Grouping) {
|
||||
flatten.add(p);
|
||||
}
|
||||
else if (p instanceof Var) {
|
||||
flatten.add(p);
|
||||
}
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
package org.elasticsearch.xpack.sql.expression.gen.script;
|
||||
|
||||
import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunctionAttribute;
|
||||
import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
@ -28,6 +29,11 @@ public class ParamsBuilder {
|
|||
return this;
|
||||
}
|
||||
|
||||
public ParamsBuilder grouping(GroupingFunctionAttribute grouping) {
|
||||
params.add(new Grouping(grouping));
|
||||
return this;
|
||||
}
|
||||
|
||||
public ParamsBuilder script(Params ps) {
|
||||
params.add(new Script(ps));
|
||||
return this;
|
||||
|
|
|
@ -44,10 +44,6 @@ public class ScriptTemplate {
|
|||
return params;
|
||||
}
|
||||
|
||||
public List<String> aggRefs() {
|
||||
return params.asAggRefs();
|
||||
}
|
||||
|
||||
public Map<String, String> aggPaths() {
|
||||
return params.asAggPaths();
|
||||
}
|
||||
|
|
|
@ -12,6 +12,7 @@ import org.elasticsearch.xpack.sql.expression.Expression;
|
|||
import org.elasticsearch.xpack.sql.expression.Expressions;
|
||||
import org.elasticsearch.xpack.sql.expression.FieldAttribute;
|
||||
import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunctionAttribute;
|
||||
import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;
|
||||
import org.elasticsearch.xpack.sql.expression.function.scalar.ScalarFunctionAttribute;
|
||||
import org.elasticsearch.xpack.sql.expression.literal.IntervalDayTime;
|
||||
import org.elasticsearch.xpack.sql.expression.literal.IntervalYearMonth;
|
||||
|
@ -37,6 +38,9 @@ public interface ScriptWeaver {
|
|||
if (attr instanceof AggregateFunctionAttribute) {
|
||||
return scriptWithAggregate((AggregateFunctionAttribute) attr);
|
||||
}
|
||||
if (attr instanceof GroupingFunctionAttribute) {
|
||||
return scriptWithGrouping((GroupingFunctionAttribute) attr);
|
||||
}
|
||||
if (attr instanceof FieldAttribute) {
|
||||
return scriptWithField((FieldAttribute) attr);
|
||||
}
|
||||
|
@ -83,6 +87,16 @@ public interface ScriptWeaver {
|
|||
dataType());
|
||||
}
|
||||
|
||||
default ScriptTemplate scriptWithGrouping(GroupingFunctionAttribute grouping) {
|
||||
String template = "{}";
|
||||
if (grouping.dataType() == DataType.DATE) {
|
||||
template = "{sql}.asDateTime({})";
|
||||
}
|
||||
return new ScriptTemplate(processScript(template),
|
||||
paramsBuilder().grouping(grouping).build(),
|
||||
dataType());
|
||||
}
|
||||
|
||||
default ScriptTemplate scriptWithField(FieldAttribute field) {
|
||||
return new ScriptTemplate(processScript("doc[{}].value"),
|
||||
paramsBuilder().variable(field.name()).build(),
|
||||
|
|
|
@ -281,7 +281,7 @@ class QueryFolder extends RuleExecutor<PhysicalPlan> {
|
|||
// found match for expression; if it's an attribute or scalar, end the processing chain with
|
||||
// the reference to the backing agg
|
||||
if (matchingGroup != null) {
|
||||
if (exp instanceof Attribute || exp instanceof ScalarFunction) {
|
||||
if (exp instanceof Attribute || exp instanceof ScalarFunction || exp instanceof GroupingFunction) {
|
||||
Processor action = null;
|
||||
ZoneId zi = DataType.DATE == exp.dataType() ? DateUtils.UTC : null;
|
||||
/*
|
||||
|
|
|
@ -277,7 +277,7 @@ final class QueryTranslator {
|
|||
if (h.dataType() == DataType.DATE) {
|
||||
long intervalAsMillis = Intervals.inMillis(h.interval());
|
||||
// TODO: set timezone
|
||||
if (field instanceof FieldAttribute || field instanceof DateTimeHistogramFunction) {
|
||||
if (field instanceof FieldAttribute) {
|
||||
key = new GroupByDateHistogram(aggId, nameOf(field), intervalAsMillis, h.zoneId());
|
||||
} else if (field instanceof Function) {
|
||||
key = new GroupByDateHistogram(aggId, ((Function) field).asScript(), intervalAsMillis, h.zoneId());
|
||||
|
@ -285,7 +285,7 @@ final class QueryTranslator {
|
|||
}
|
||||
// numeric histogram
|
||||
else {
|
||||
if (field instanceof FieldAttribute || field instanceof DateTimeHistogramFunction) {
|
||||
if (field instanceof FieldAttribute) {
|
||||
key = new GroupByNumericHistogram(aggId, nameOf(field), Foldables.doubleValueOf(h.interval()));
|
||||
} else if (field instanceof Function) {
|
||||
key = new GroupByNumericHistogram(aggId, ((Function) field).asScript(),
|
||||
|
|
|
@ -11,7 +11,6 @@ import org.elasticsearch.xpack.sql.expression.gen.script.ScriptTemplate;
|
|||
import org.elasticsearch.xpack.sql.expression.gen.script.Scripts;
|
||||
import org.elasticsearch.xpack.sql.util.Check;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
|
@ -32,14 +31,6 @@ public class AggFilter extends PipelineAgg {
|
|||
this.aggPaths = scriptTemplate.aggPaths();
|
||||
}
|
||||
|
||||
public Map<String, String> aggPaths() {
|
||||
return aggPaths;
|
||||
}
|
||||
|
||||
public Collection<String> aggRefs() {
|
||||
return scriptTemplate.aggRefs();
|
||||
}
|
||||
|
||||
public ScriptTemplate scriptTemplate() {
|
||||
return scriptTemplate;
|
||||
}
|
||||
|
|
|
@ -481,8 +481,27 @@ public class VerifierErrorMessagesTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testHistogramInFilter() {
|
||||
assertEquals("1:63: Cannot use WHERE filtering on grouping function [HISTOGRAM(date)], use HAVING instead",
|
||||
assertEquals("1:63: Cannot filter on grouping function [HISTOGRAM(date)], use its argument instead",
|
||||
error("SELECT HISTOGRAM(date, INTERVAL 1 MONTH) AS h FROM test WHERE "
|
||||
+ "HISTOGRAM(date, INTERVAL 1 MONTH) > CAST('2000-01-01' AS DATE) GROUP BY h"));
|
||||
}
|
||||
|
||||
// related https://github.com/elastic/elasticsearch/issues/36853
|
||||
public void testHistogramInHaving() {
|
||||
assertEquals("1:75: Cannot filter on grouping function [h], use its argument instead",
|
||||
error("SELECT HISTOGRAM(date, INTERVAL 1 MONTH) AS h FROM test GROUP BY h HAVING "
|
||||
+ "h > CAST('2000-01-01' AS DATE)"));
|
||||
}
|
||||
|
||||
public void testGroupByScalarOnTopOfGrouping() {
|
||||
assertEquals(
|
||||
"1:14: Cannot combine [HISTOGRAM(date)] grouping function inside GROUP BY, "
|
||||
+ "found [MONTH_OF_YEAR(HISTOGRAM(date) [Z])]; consider moving the expression inside the histogram",
|
||||
error("SELECT MONTH(HISTOGRAM(date, INTERVAL 1 MONTH)) AS h FROM test GROUP BY h"));
|
||||
}
|
||||
|
||||
public void testAggsInHistogram() {
|
||||
assertEquals("1:47: Cannot use an aggregate [MAX] for grouping",
|
||||
error("SELECT MAX(date) FROM test GROUP BY HISTOGRAM(MAX(int), 1)"));
|
||||
}
|
||||
}
|
|
@ -92,8 +92,9 @@ import static org.mockito.Mockito.mock;
|
|||
*/
|
||||
public class NodeSubclassTests<T extends B, B extends Node<B>> extends ESTestCase {
|
||||
|
||||
private static final List<Class<? extends Node<?>>> CLASSES_WITH_MIN_TWO_CHILDREN = Arrays.asList(
|
||||
IfNull.class, In.class, InPipe.class, Percentile.class, Percentiles.class, PercentileRanks.class);
|
||||
|
||||
private static final List<Class<?>> CLASSES_WITH_MIN_TWO_CHILDREN = Arrays.<Class<?>> asList(IfNull.class, In.class, InPipe.class,
|
||||
Percentile.class, Percentiles.class, PercentileRanks.class);
|
||||
|
||||
private final Class<T> subclass;
|
||||
|
||||
|
|
Loading…
Reference in New Issue