SQL: Fix bug regarding histograms usage in scripting (#36866)

Allow scripts to correctly reference grouping functions
Fix bug in translation of date/time functions mixed with histograms.
Enhance Verifier to prevent histograms being nested inside other
 functions inside GROUP BY (as it implies double grouping)
Extend Histogram docs
This commit is contained in:
Costin Leau 2018-12-20 23:11:56 +02:00 committed by GitHub
parent 4cd570593d
commit ac032a0b9d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 234 additions and 47 deletions

View File

@ -36,6 +36,8 @@ The histogram function takes all matching values and divides them into buckets w
bucket_key = Math.floor(value / interval) * interval bucket_key = Math.floor(value / interval) * interval
---- ----
NOTE:: The histogram in SQL does *NOT* return empty buckets for missing intervals as the traditional <<search-aggregations-bucket-histogram-aggregation, histogram>> and <<search-aggregations-bucket-datehistogram-aggregation, date histogram>>. Such behavior does not fit conceptually in SQL which treats all missing values as `NULL`; as such the histogram places all missing values in the `NULL` group.
`Histogram` can be applied on either numeric fields: `Histogram` can be applied on either numeric fields:
@ -51,4 +53,26 @@ or date/time fields:
include-tagged::{sql-specs}/docs.csv-spec[histogramDate] include-tagged::{sql-specs}/docs.csv-spec[histogramDate]
---- ----
Expressions inside the histogram are also supported as long as the
return type is numeric:
["source","sql",subs="attributes,callouts,macros"]
----
include-tagged::{sql-specs}/docs.csv-spec[histogramNumericExpression]
----
Do note that histograms (and grouping functions in general) allow custom expressions but cannot have any functions applied to them in the `GROUP BY`. In other words, the following statement is *NOT* allowed:
["source","sql",subs="attributes,callouts,macros"]
----
include-tagged::{sql-specs}/docs.csv-spec[expressionOnHistogramNotAllowed]
----
as it requires two groupings (one for histogram followed by a second for applying the function on top of the histogram groups).
Instead one can rewrite the query to move the expression on the histogram _inside_ of it:
["source","sql",subs="attributes,callouts,macros"]
----
include-tagged::{sql-specs}/docs.csv-spec[histogramDateExpression]
----

View File

@ -262,9 +262,51 @@ SELECT HISTOGRAM(birth_date, INTERVAL 1 YEAR) AS h, COUNT(*) as c FROM test_emp
null |10 null |10
; ;
histogramDateWithDateFunction-Ignore histogramDateWithMonthOnTop
SELECT YEAR(HISTOGRAM(birth_date, INTERVAL 1 YEAR)) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC; schema::h:i|c:l
SELECT HISTOGRAM(MONTH(birth_date), 2) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;
h | c
---------------+---------------
12 |7
10 |17
8 |16
6 |16
4 |18
2 |10
0 |6
null |10
;
histogramDateWithYearOnTop
schema::h:i|c:l
SELECT HISTOGRAM(YEAR(birth_date), 2) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;
h | c
---------------+---------------
1964 |5
1962 |13
1960 |16
1958 |16
1956 |9
1954 |12
1952 |19
null |10
;
histogramNumericWithExpression
schema::h:i|c:l
SELECT HISTOGRAM(emp_no % 100, 10) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;
h | c
---------------+---------------
90 |10
80 |10
70 |10
60 |10
50 |10
40 |10
30 |10
20 |10
10 |10
0 |10
; ;

View File

@ -725,6 +725,27 @@ SELECT HISTOGRAM(salary, 5000) AS h FROM emp GROUP BY h;
// end::histogramNumeric // end::histogramNumeric
; ;
histogramNumericExpression
schema::h:i|c:l
// tag::histogramNumericExpression
SELECT HISTOGRAM(salary % 100, 10) AS h, COUNT(*) AS c FROM emp GROUP BY h;
h | c
---------------+---------------
0 |10
10 |15
20 |10
30 |14
40 |9
50 |9
60 |8
70 |13
80 |3
90 |9
// end::histogramNumericExpression
;
histogramDate histogramDate
schema::h:ts|c:l schema::h:ts|c:l
// tag::histogramDate // tag::histogramDate
@ -752,6 +773,30 @@ null |10
// end::histogramDate // end::histogramDate
; ;
expressionOnHistogramNotAllowed-Ignore
// tag::expressionOnHistogramNotAllowed
SELECT MONTH(HISTOGRAM(birth_date), 2)) AS h, COUNT(*) as c FROM emp GROUP BY h ORDER BY h DESC;
// end::expressionOnHistogramNotAllowed
histogramDateExpression
schema::h:i|c:l
// tag::histogramDateExpression
SELECT HISTOGRAM(MONTH(birth_date), 2) AS h, COUNT(*) as c FROM emp GROUP BY h ORDER BY h DESC;
h | c
---------------+---------------
12 |7
10 |17
8 |16
6 |16
4 |18
2 |10
0 |6
null |10
// end::histogramDateExpression
;
/////////////////////////////// ///////////////////////////////
// //
// Date/Time // Date/Time

View File

@ -18,6 +18,8 @@ import org.elasticsearch.xpack.sql.expression.function.Function;
import org.elasticsearch.xpack.sql.expression.function.FunctionAttribute; import org.elasticsearch.xpack.sql.expression.function.FunctionAttribute;
import org.elasticsearch.xpack.sql.expression.function.Functions; import org.elasticsearch.xpack.sql.expression.function.Functions;
import org.elasticsearch.xpack.sql.expression.function.Score; import org.elasticsearch.xpack.sql.expression.function.Score;
import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunctionAttribute;
import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;
import org.elasticsearch.xpack.sql.expression.function.scalar.ScalarFunction; import org.elasticsearch.xpack.sql.expression.function.scalar.ScalarFunction;
import org.elasticsearch.xpack.sql.expression.predicate.conditional.ConditionalFunction; import org.elasticsearch.xpack.sql.expression.predicate.conditional.ConditionalFunction;
import org.elasticsearch.xpack.sql.expression.predicate.operator.comparison.In; import org.elasticsearch.xpack.sql.expression.predicate.operator.comparison.In;
@ -224,6 +226,7 @@ public final class Verifier {
validateConditional(p, localFailures); validateConditional(p, localFailures);
checkFilterOnAggs(p, localFailures); checkFilterOnAggs(p, localFailures);
checkFilterOnGrouping(p, localFailures);
if (!groupingFailures.contains(p)) { if (!groupingFailures.contains(p)) {
checkGroupBy(p, localFailures, resolvedFunctions, groupingFailures); checkGroupBy(p, localFailures, resolvedFunctions, groupingFailures);
@ -419,7 +422,7 @@ public final class Verifier {
return true; return true;
} }
// skip aggs (allowed to refer to non-group columns) // skip aggs (allowed to refer to non-group columns)
if (Functions.isAggregate(e)) { if (Functions.isAggregate(e) || Functions.isGrouping(e)) {
return true; return true;
} }
@ -448,6 +451,21 @@ public final class Verifier {
} }
})); }));
a.groupings().forEach(e -> {
if (Functions.isGrouping(e) == false) {
e.collectFirstChildren(c -> {
if (Functions.isGrouping(c)) {
localFailures.add(fail(c,
"Cannot combine [%s] grouping function inside GROUP BY, found [%s];"
+ " consider moving the expression inside the histogram",
Expressions.name(c), Expressions.name(e)));
return true;
}
return false;
});
}
});
if (!localFailures.isEmpty()) { if (!localFailures.isEmpty()) {
return false; return false;
} }
@ -547,15 +565,26 @@ public final class Verifier {
if (p instanceof Filter) { if (p instanceof Filter) {
Filter filter = (Filter) p; Filter filter = (Filter) p;
if ((filter.child() instanceof Aggregate) == false) { if ((filter.child() instanceof Aggregate) == false) {
filter.condition().forEachDown(f -> { filter.condition().forEachDown(e -> {
if (Functions.isAggregate(f) || Functions.isGrouping(f)) { if (Functions.isAggregate(e) || e instanceof AggregateFunctionAttribute) {
String type = Functions.isAggregate(f) ? "aggregate" : "grouping"; localFailures.add(
localFailures.add(fail(f, fail(e, "Cannot use WHERE filtering on aggregate function [%s], use HAVING instead", Expressions.name(e)));
"Cannot use WHERE filtering on %s function [%s], use HAVING instead", type, Expressions.name(f))); }
}, Expression.class);
}
}
} }
}, Function.class);
private static void checkFilterOnGrouping(LogicalPlan p, Set<Failure> localFailures) {
if (p instanceof Filter) {
Filter filter = (Filter) p;
filter.condition().forEachDown(e -> {
if (Functions.isGrouping(e) || e instanceof GroupingFunctionAttribute) {
localFailures
.add(fail(e, "Cannot filter on grouping function [%s], use its argument instead", Expressions.name(e)));
} }
}, Expression.class);
} }
} }

View File

@ -346,6 +346,9 @@ public final class InternalSqlScriptUtils {
} }
public static ZonedDateTime asDateTime(Object dateTime) { public static ZonedDateTime asDateTime(Object dateTime) {
if (dateTime == null) {
return null;
}
if (dateTime instanceof JodaCompatibleZonedDateTime) { if (dateTime instanceof JodaCompatibleZonedDateTime) {
return ((JodaCompatibleZonedDateTime) dateTime).getZonedDateTime(); return ((JodaCompatibleZonedDateTime) dateTime).getZonedDateTime();
} }

View File

@ -0,0 +1,24 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.sql.expression.gen.script;
import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;
class Grouping extends Param<GroupingFunctionAttribute> {
Grouping(GroupingFunctionAttribute groupRef) {
super(groupRef);
}
String groupName() {
return value().functionId();
}
@Override
public String prefix() {
return "g";
}
}

View File

@ -85,25 +85,15 @@ public class Params {
String s = a.aggProperty() != null ? a.aggProperty() : a.aggName(); String s = a.aggProperty() != null ? a.aggProperty() : a.aggName();
map.put(p.prefix() + aggs++, s); map.put(p.prefix() + aggs++, s);
} }
if (p instanceof Grouping) {
Grouping g = (Grouping) p;
map.put(p.prefix() + aggs++, g.groupName());
}
} }
return map; return map;
} }
// return the agg refs
List<String> asAggRefs() {
List<String> refs = new ArrayList<>();
for (Param<?> p : params) {
if (p instanceof Agg) {
refs.add(((Agg) p).aggName());
}
}
return refs;
}
private static List<Param<?>> flatten(List<Param<?>> params) { private static List<Param<?>> flatten(List<Param<?>> params) {
List<Param<?>> flatten = emptyList(); List<Param<?>> flatten = emptyList();
@ -116,6 +106,9 @@ public class Params {
else if (p instanceof Agg) { else if (p instanceof Agg) {
flatten.add(p); flatten.add(p);
} }
else if (p instanceof Grouping) {
flatten.add(p);
}
else if (p instanceof Var) { else if (p instanceof Var) {
flatten.add(p); flatten.add(p);
} }

View File

@ -6,6 +6,7 @@
package org.elasticsearch.xpack.sql.expression.gen.script; package org.elasticsearch.xpack.sql.expression.gen.script;
import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunctionAttribute; import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunctionAttribute;
import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -28,6 +29,11 @@ public class ParamsBuilder {
return this; return this;
} }
public ParamsBuilder grouping(GroupingFunctionAttribute grouping) {
params.add(new Grouping(grouping));
return this;
}
public ParamsBuilder script(Params ps) { public ParamsBuilder script(Params ps) {
params.add(new Script(ps)); params.add(new Script(ps));
return this; return this;

View File

@ -44,10 +44,6 @@ public class ScriptTemplate {
return params; return params;
} }
public List<String> aggRefs() {
return params.asAggRefs();
}
public Map<String, String> aggPaths() { public Map<String, String> aggPaths() {
return params.asAggPaths(); return params.asAggPaths();
} }

View File

@ -12,6 +12,7 @@ import org.elasticsearch.xpack.sql.expression.Expression;
import org.elasticsearch.xpack.sql.expression.Expressions; import org.elasticsearch.xpack.sql.expression.Expressions;
import org.elasticsearch.xpack.sql.expression.FieldAttribute; import org.elasticsearch.xpack.sql.expression.FieldAttribute;
import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunctionAttribute; import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunctionAttribute;
import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;
import org.elasticsearch.xpack.sql.expression.function.scalar.ScalarFunctionAttribute; import org.elasticsearch.xpack.sql.expression.function.scalar.ScalarFunctionAttribute;
import org.elasticsearch.xpack.sql.expression.literal.IntervalDayTime; import org.elasticsearch.xpack.sql.expression.literal.IntervalDayTime;
import org.elasticsearch.xpack.sql.expression.literal.IntervalYearMonth; import org.elasticsearch.xpack.sql.expression.literal.IntervalYearMonth;
@ -37,6 +38,9 @@ public interface ScriptWeaver {
if (attr instanceof AggregateFunctionAttribute) { if (attr instanceof AggregateFunctionAttribute) {
return scriptWithAggregate((AggregateFunctionAttribute) attr); return scriptWithAggregate((AggregateFunctionAttribute) attr);
} }
if (attr instanceof GroupingFunctionAttribute) {
return scriptWithGrouping((GroupingFunctionAttribute) attr);
}
if (attr instanceof FieldAttribute) { if (attr instanceof FieldAttribute) {
return scriptWithField((FieldAttribute) attr); return scriptWithField((FieldAttribute) attr);
} }
@ -83,6 +87,16 @@ public interface ScriptWeaver {
dataType()); dataType());
} }
default ScriptTemplate scriptWithGrouping(GroupingFunctionAttribute grouping) {
String template = "{}";
if (grouping.dataType() == DataType.DATE) {
template = "{sql}.asDateTime({})";
}
return new ScriptTemplate(processScript(template),
paramsBuilder().grouping(grouping).build(),
dataType());
}
default ScriptTemplate scriptWithField(FieldAttribute field) { default ScriptTemplate scriptWithField(FieldAttribute field) {
return new ScriptTemplate(processScript("doc[{}].value"), return new ScriptTemplate(processScript("doc[{}].value"),
paramsBuilder().variable(field.name()).build(), paramsBuilder().variable(field.name()).build(),

View File

@ -281,7 +281,7 @@ class QueryFolder extends RuleExecutor<PhysicalPlan> {
// found match for expression; if it's an attribute or scalar, end the processing chain with // found match for expression; if it's an attribute or scalar, end the processing chain with
// the reference to the backing agg // the reference to the backing agg
if (matchingGroup != null) { if (matchingGroup != null) {
if (exp instanceof Attribute || exp instanceof ScalarFunction) { if (exp instanceof Attribute || exp instanceof ScalarFunction || exp instanceof GroupingFunction) {
Processor action = null; Processor action = null;
ZoneId zi = DataType.DATE == exp.dataType() ? DateUtils.UTC : null; ZoneId zi = DataType.DATE == exp.dataType() ? DateUtils.UTC : null;
/* /*

View File

@ -277,7 +277,7 @@ final class QueryTranslator {
if (h.dataType() == DataType.DATE) { if (h.dataType() == DataType.DATE) {
long intervalAsMillis = Intervals.inMillis(h.interval()); long intervalAsMillis = Intervals.inMillis(h.interval());
// TODO: set timezone // TODO: set timezone
if (field instanceof FieldAttribute || field instanceof DateTimeHistogramFunction) { if (field instanceof FieldAttribute) {
key = new GroupByDateHistogram(aggId, nameOf(field), intervalAsMillis, h.zoneId()); key = new GroupByDateHistogram(aggId, nameOf(field), intervalAsMillis, h.zoneId());
} else if (field instanceof Function) { } else if (field instanceof Function) {
key = new GroupByDateHistogram(aggId, ((Function) field).asScript(), intervalAsMillis, h.zoneId()); key = new GroupByDateHistogram(aggId, ((Function) field).asScript(), intervalAsMillis, h.zoneId());
@ -285,7 +285,7 @@ final class QueryTranslator {
} }
// numeric histogram // numeric histogram
else { else {
if (field instanceof FieldAttribute || field instanceof DateTimeHistogramFunction) { if (field instanceof FieldAttribute) {
key = new GroupByNumericHistogram(aggId, nameOf(field), Foldables.doubleValueOf(h.interval())); key = new GroupByNumericHistogram(aggId, nameOf(field), Foldables.doubleValueOf(h.interval()));
} else if (field instanceof Function) { } else if (field instanceof Function) {
key = new GroupByNumericHistogram(aggId, ((Function) field).asScript(), key = new GroupByNumericHistogram(aggId, ((Function) field).asScript(),

View File

@ -11,7 +11,6 @@ import org.elasticsearch.xpack.sql.expression.gen.script.ScriptTemplate;
import org.elasticsearch.xpack.sql.expression.gen.script.Scripts; import org.elasticsearch.xpack.sql.expression.gen.script.Scripts;
import org.elasticsearch.xpack.sql.util.Check; import org.elasticsearch.xpack.sql.util.Check;
import java.util.Collection;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
@ -32,14 +31,6 @@ public class AggFilter extends PipelineAgg {
this.aggPaths = scriptTemplate.aggPaths(); this.aggPaths = scriptTemplate.aggPaths();
} }
public Map<String, String> aggPaths() {
return aggPaths;
}
public Collection<String> aggRefs() {
return scriptTemplate.aggRefs();
}
public ScriptTemplate scriptTemplate() { public ScriptTemplate scriptTemplate() {
return scriptTemplate; return scriptTemplate;
} }

View File

@ -481,8 +481,27 @@ public class VerifierErrorMessagesTests extends ESTestCase {
} }
public void testHistogramInFilter() { public void testHistogramInFilter() {
assertEquals("1:63: Cannot use WHERE filtering on grouping function [HISTOGRAM(date)], use HAVING instead", assertEquals("1:63: Cannot filter on grouping function [HISTOGRAM(date)], use its argument instead",
error("SELECT HISTOGRAM(date, INTERVAL 1 MONTH) AS h FROM test WHERE " error("SELECT HISTOGRAM(date, INTERVAL 1 MONTH) AS h FROM test WHERE "
+ "HISTOGRAM(date, INTERVAL 1 MONTH) > CAST('2000-01-01' AS DATE) GROUP BY h")); + "HISTOGRAM(date, INTERVAL 1 MONTH) > CAST('2000-01-01' AS DATE) GROUP BY h"));
} }
// related https://github.com/elastic/elasticsearch/issues/36853
public void testHistogramInHaving() {
assertEquals("1:75: Cannot filter on grouping function [h], use its argument instead",
error("SELECT HISTOGRAM(date, INTERVAL 1 MONTH) AS h FROM test GROUP BY h HAVING "
+ "h > CAST('2000-01-01' AS DATE)"));
}
public void testGroupByScalarOnTopOfGrouping() {
assertEquals(
"1:14: Cannot combine [HISTOGRAM(date)] grouping function inside GROUP BY, "
+ "found [MONTH_OF_YEAR(HISTOGRAM(date) [Z])]; consider moving the expression inside the histogram",
error("SELECT MONTH(HISTOGRAM(date, INTERVAL 1 MONTH)) AS h FROM test GROUP BY h"));
}
public void testAggsInHistogram() {
assertEquals("1:47: Cannot use an aggregate [MAX] for grouping",
error("SELECT MAX(date) FROM test GROUP BY HISTOGRAM(MAX(int), 1)"));
}
} }

View File

@ -92,8 +92,9 @@ import static org.mockito.Mockito.mock;
*/ */
public class NodeSubclassTests<T extends B, B extends Node<B>> extends ESTestCase { public class NodeSubclassTests<T extends B, B extends Node<B>> extends ESTestCase {
private static final List<Class<? extends Node<?>>> CLASSES_WITH_MIN_TWO_CHILDREN = Arrays.asList(
IfNull.class, In.class, InPipe.class, Percentile.class, Percentiles.class, PercentileRanks.class); private static final List<Class<?>> CLASSES_WITH_MIN_TWO_CHILDREN = Arrays.<Class<?>> asList(IfNull.class, In.class, InPipe.class,
Percentile.class, Percentiles.class, PercentileRanks.class);
private final Class<T> subclass; private final Class<T> subclass;