Add sample versions of standard deviation and variance funcs (#59093) (#59274)

* Add sample versions of standard deviation and variance functions (#59093)

* Add STDDEV_SAMP, VAR_SAMP

This commit adds the sampling variations of the standard deviation and
variance agg functions.

(cherry picked from commit 8b29817b49e386215f29cb5b3356d0183fd5d9de)

* Fix: workaround for lack of Map#of() in Java8

Replace Map#of() with a HashMap static init.
This commit is contained in:
Bogdan Pintea 2020-07-09 10:17:13 +02:00 committed by GitHub
parent 14ab35e323
commit acfff7b896
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 249 additions and 31 deletions

View File

@ -599,6 +599,35 @@ include-tagged::{sql-specs}/docs/docs.csv-spec[aggStddevPop]
include-tagged::{sql-specs}/docs/docs.csv-spec[aggStddevPopScalars]
--------------------------------------------------
[[sql-functions-aggs-stddev-samp]]
==== `STDDEV_SAMP`
.Synopsis:
[source, sql]
--------------------------------------------------
STDDEV_SAMP(field_name) <1>
--------------------------------------------------
*Input*:
<1> a numeric field
*Output*: `double` numeric value
*Description*:
Returns the https://en.wikipedia.org/wiki/Standard_deviation[sample standard deviation] of input values in the field `field_name`.
["source","sql",subs="attributes,macros"]
--------------------------------------------------
include-tagged::{sql-specs}/docs/docs.csv-spec[aggStddevSamp]
--------------------------------------------------
["source","sql",subs="attributes,macros"]
--------------------------------------------------
include-tagged::{sql-specs}/docs/docs.csv-spec[aggStddevSampScalars]
--------------------------------------------------
[[sql-functions-aggs-sum-squares]]
==== `SUM_OF_SQUARES`
@ -657,3 +686,33 @@ include-tagged::{sql-specs}/docs/docs.csv-spec[aggVarPop]
--------------------------------------------------
include-tagged::{sql-specs}/docs/docs.csv-spec[aggVarPopScalars]
--------------------------------------------------
[[sql-functions-aggs-var-samp]]
==== `VAR_SAMP`
.Synopsis:
[source, sql]
--------------------------------------------------
VAR_SAMP(field_name) <1>
--------------------------------------------------
*Input*:
<1> a numeric field
*Output*: `double` numeric value
*Description*:
Returns the https://en.wikipedia.org/wiki/Variance[sample variance] of input values in the field `field_name`.
["source","sql",subs="attributes,macros"]
--------------------------------------------------
include-tagged::{sql-specs}/docs/docs.csv-spec[aggVarSamp]
--------------------------------------------------
["source","sql",subs="attributes,macros"]
--------------------------------------------------
include-tagged::{sql-specs}/docs/docs.csv-spec[aggVarSampScalars]
--------------------------------------------------

View File

@ -42,8 +42,10 @@
** <<sql-functions-aggs-percentile-rank>>
** <<sql-functions-aggs-skewness>>
** <<sql-functions-aggs-stddev-pop>>
** <<sql-functions-aggs-stddev-samp>>
** <<sql-functions-aggs-sum-squares>>
** <<sql-functions-aggs-var-pop>>
** <<sql-functions-aggs-var-samp>>
* <<sql-functions-grouping>>
** <<sql-functions-grouping-histogram>>
* <<sql-functions-datetime-interval, Date-Time Operators>>

View File

@ -1160,33 +1160,37 @@ GROUP BY gender ORDER BY gender;
;
extendedStatsAggregateFunctionsWithScalars
schema::stddev_pop:d|sum_of_squares:d|var_pop:d|gender:s
schema::stddev_pop:d|stddev_samp:d|sum_of_squares:d|var_pop:d|var_samp:d|gender:s
SELECT STDDEV_POP(CASE WHEN (salary / 2) > 10000 THEN (salary + 12345) * 1.2 ELSE (salary - 12345) * 2.7 END) AS "stddev_pop",
STDDEV_SAMP(CASE WHEN (salary / 2) > 10000 THEN (salary + 12345) * 1.2 ELSE (salary - 12345) * 2.7 END) AS "stddev_samp",
SUM_OF_SQUARES(CASE WHEN (salary - 20) > 50000 THEN (salary * 1.2) - 1234 ELSE (salary - 20) * 0.93 END) AS "sum_of_squares",
VAR_POP(CASE WHEN (salary - 20) % 1000 > 200 THEN (salary * 1.2) - 1234 ELSE (salary - 20) * 0.93 END) AS "var_pop",
VAR_SAMP(CASE WHEN (salary - 20) % 1000 > 200 THEN (salary * 1.2) - 1234 ELSE (salary - 20) * 0.93 END) AS "var_samp",
gender FROM test_emp
GROUP BY gender ORDER BY gender;
stddev_pop | sum_of_squares | var_pop | gender
------------------+---------------------+--------------------+---------------
16752.73244172422 |3.06310583829007E10 |3.460331137445282E8 |null
17427.462400181845|1.148127725047658E11 |3.1723426960671306E8|F
15702.798665784752|1.5882243113919238E11|2.529402043805585E8 |M
stddev_pop | stddev_samp | sum_of_squares | var_pop | var_samp | gender
------------------+------------------+---------------------+--------------------+--------------------+---------------
16752.73244172422 |17658.930515747525|3.06310583829007E10 |3.460331137445282E8 |3.844812374939202E8 |null
17427.462400181845|17697.67172930331 |1.148127725047658E11 |3.1723426960671306E8|3.271478405319228E8 |F
15702.798665784752|15842.381843421828|1.5882243113919238E11|2.529402043805585E8 |2.5745699374449703E8|M
;
extendedStatsAggregateFunctionsWithScalarAndSameArg
schema::stddev_pop:d|sum_of_squares:d|var_pop:d|gender:s
schema::stddev_pop:d|stddev_samp:d|sum_of_squares:d|var_pop:d|var_samp:d|gender:s
SELECT STDDEV_POP(CASE WHEN (salary - 20) % 1000 > 200 THEN (salary * 1.2) - 1234 ELSE (salary - 20) * 0.93 END) AS "stddev_pop",
STDDEV_SAMP(CASE WHEN (salary - 20) % 1000 > 200 THEN (salary * 1.2) - 1234 ELSE (salary - 20) * 0.93 END) AS "stddev_samp",
SUM_OF_SQUARES(CASE WHEN (salary - 20) % 1000 > 200 THEN (salary * 1.2) - 1234 ELSE (salary - 20) * 0.93 END) AS "sum_of_squares",
VAR_POP(CASE WHEN (salary - 20) % 1000 > 200 THEN (salary * 1.2) - 1234 ELSE (salary - 20) * 0.93 END) AS "var_pop",
VAR_SAMP(CASE WHEN (salary - 20) % 1000 > 200 THEN (salary * 1.2) - 1234 ELSE (salary - 20) * 0.93 END) AS "var_samp",
gender FROM test_emp
GROUP BY gender ORDER BY gender;
stddev_pop | sum_of_squares | var_pop | gender
------------------+---------------------+--------------------+---------------
18601.965319409886|3.4461553130896095E10|3.460331137445282E8 |null
17811.071545718776|1.2151168881502939E11|3.1723426960671306E8|F
15904.093950318531|1.699198993070239E11 |2.529402043805585E8 |M
stddev_pop | stddev_samp | sum_of_squares | var_pop | var_samp | gender
------------------+------------------+---------------------+--------------------+--------------------+---------------
18601.965319409886|19608.193121598946|3.4461553130896095E10|3.460331137445282E8 |3.844812374939202E8 |null
17811.071545718776|18087.228658142263|1.2151168881502939E11|3.1723426960671306E8|3.271478405319228E8 |F
15904.093950318531|16045.466454562704|1.699198993070239E11 |2.529402043805585E8 |2.5745699374449703E8|M
;

View File

@ -19,14 +19,16 @@ MAX |AGGREGATE
MIN |AGGREGATE
SUM |AGGREGATE
KURTOSIS |AGGREGATE
MAD |AGGREGATE
PERCENTILE |AGGREGATE
PERCENTILE_RANK |AGGREGATE
SKEWNESS |AGGREGATE
STDDEV_POP |AGGREGATE
SUM_OF_SQUARES |AGGREGATE
VAR_POP |AGGREGATE
HISTOGRAM |GROUPING
MAD |AGGREGATE
PERCENTILE |AGGREGATE
PERCENTILE_RANK |AGGREGATE
SKEWNESS |AGGREGATE
STDDEV_POP |AGGREGATE
STDDEV_SAMP |AGGREGATE
SUM_OF_SQUARES |AGGREGATE
VAR_POP |AGGREGATE
VAR_SAMP |AGGREGATE
HISTOGRAM |GROUPING
CASE |CONDITIONAL
COALESCE |CONDITIONAL
GREATEST |CONDITIONAL

View File

@ -215,13 +215,15 @@ MAX |AGGREGATE
MIN |AGGREGATE
SUM |AGGREGATE
KURTOSIS |AGGREGATE
MAD |AGGREGATE
PERCENTILE |AGGREGATE
PERCENTILE_RANK |AGGREGATE
SKEWNESS |AGGREGATE
STDDEV_POP |AGGREGATE
SUM_OF_SQUARES |AGGREGATE
VAR_POP |AGGREGATE
MAD |AGGREGATE
PERCENTILE |AGGREGATE
PERCENTILE_RANK |AGGREGATE
SKEWNESS |AGGREGATE
STDDEV_POP |AGGREGATE
STDDEV_SAMP |AGGREGATE
SUM_OF_SQUARES |AGGREGATE
VAR_POP |AGGREGATE
VAR_SAMP |AGGREGATE
HISTOGRAM |GROUPING
CASE |CONDITIONAL
COALESCE |CONDITIONAL
@ -1549,10 +1551,9 @@ SELECT MIN(salary) AS min, MAX(salary) AS max, SKEWNESS(salary) AS s FROM emp;
aggStddevPop
// tag::aggStddevPop
SELECT MIN(salary) AS min, MAX(salary) AS max, STDDEV_POP(salary) AS stddev
FROM emp;
SELECT MIN(salary) AS min, MAX(salary) AS max, STDDEV_POP(salary) AS stddev FROM emp;
min | max | stddev
min | max | stddev
---------------+---------------+------------------
25324 |74999 |13765.125502787832
// end::aggStddevPop
@ -1569,6 +1570,27 @@ SELECT MIN(salary / 12.0) AS min, MAX(salary / 12.0) AS max, STDDEV_POP(salary /
// end::aggStddevPopScalars
;
aggStddevSamp
// tag::aggStddevSamp
SELECT MIN(salary) AS min, MAX(salary) AS max, STDDEV_SAMP(salary) AS stddev FROM emp;
min | max | stddev
---------------+---------------+------------------
25324 |74999 |13834.471662090747
// end::aggStddevSamp
;
aggStddevSampScalars
schema::min:d|max:d|stddev:d
// tag::aggStddevSampScalars
SELECT MIN(salary / 12.0) AS min, MAX(salary / 12.0) AS max, STDDEV_SAMP(salary / 12.0) AS stddev FROM emp;
min | max | stddev
------------------+-----------------+-----------------
2110.3333333333335|6249.916666666667|1152.872638507562
// end::aggStddevSampScalars
;
aggSumOfSquares
// tag::aggSumOfSquares
@ -1614,6 +1636,27 @@ SELECT MIN(salary / 24.0) AS min, MAX(salary / 24.0) AS max, VAR_POP(salary / 24
// end::aggVarPopScalars
;
aggVarSamp
// tag::aggVarSamp
SELECT MIN(salary) AS min, MAX(salary) AS max, VAR_SAMP(salary) AS varsamp FROM emp;
min | max | varsamp
---------------+---------------+----------------
25324 |74999 |1.913926061691E8
// end::aggVarSamp
;
aggVarSampScalars
schema::min:d|max:d|varsamp:d
// tag::aggVarSampScalars
SELECT MIN(salary / 24.0) AS min, MAX(salary / 24.0) AS max, VAR_SAMP(salary / 24.0) AS varsamp FROM emp;
min | max | varsamp
------------------+------------------+----------------
1055.1666666666667|3124.9583333333335|332278.830154847
// end::aggVarSampScalars
;
///////////////////////////////
//
// String

View File

@ -20,9 +20,11 @@ import org.elasticsearch.xpack.sql.expression.function.aggregate.Percentile;
import org.elasticsearch.xpack.sql.expression.function.aggregate.PercentileRank;
import org.elasticsearch.xpack.sql.expression.function.aggregate.Skewness;
import org.elasticsearch.xpack.sql.expression.function.aggregate.StddevPop;
import org.elasticsearch.xpack.sql.expression.function.aggregate.StddevSamp;
import org.elasticsearch.xpack.sql.expression.function.aggregate.Sum;
import org.elasticsearch.xpack.sql.expression.function.aggregate.SumOfSquares;
import org.elasticsearch.xpack.sql.expression.function.aggregate.VarPop;
import org.elasticsearch.xpack.sql.expression.function.aggregate.VarSamp;
import org.elasticsearch.xpack.sql.expression.function.grouping.Histogram;
import org.elasticsearch.xpack.sql.expression.function.scalar.Cast;
import org.elasticsearch.xpack.sql.expression.function.scalar.Database;
@ -143,8 +145,10 @@ public class SqlFunctionRegistry extends FunctionRegistry {
def(PercentileRank.class, PercentileRank::new, "PERCENTILE_RANK"),
def(Skewness.class, Skewness::new, "SKEWNESS"),
def(StddevPop.class, StddevPop::new, "STDDEV_POP"),
def(StddevSamp.class, StddevSamp::new, "STDDEV_SAMP"),
def(SumOfSquares.class, SumOfSquares::new, "SUM_OF_SQUARES"),
def(VarPop.class, VarPop::new, "VAR_POP")
def(VarPop.class, VarPop::new, "VAR_POP"),
def(VarSamp.class, VarSamp::new, "VAR_SAMP")
},
// histogram
new FunctionDefinition[] {

View File

@ -0,0 +1,37 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.sql.expression.function.aggregate;
import org.elasticsearch.xpack.ql.expression.Expression;
import org.elasticsearch.xpack.ql.tree.NodeInfo;
import org.elasticsearch.xpack.ql.tree.Source;
import java.util.List;
/**
 * Numeric aggregate backing the SQL {@code STDDEV_SAMP} function (sample standard deviation).
 * <p>
 * The function is marked as {@link ExtendedStatsEnclosed}; {@link #innerName()} supplies the
 * name of the value this function selects from the enclosing stats result.
 */
public class StddevSamp extends NumericAggregate implements ExtendedStatsEnclosed {

    /**
     * @param source location of the function call in the query text
     * @param field  the expression the aggregate is computed over
     */
    public StddevSamp(Source source, Expression field) {
        super(source, field);
    }

    /** Describes this node (constructor reference plus its single {@code field()} property). */
    @Override
    protected NodeInfo<? extends Expression> info() {
        return NodeInfo.create(this, StddevSamp::new, field());
    }

    /**
     * Rebuilds this aggregate around a replacement child expression.
     *
     * @param newChildren must hold exactly one expression
     * @return a new {@code StddevSamp} with the same source and the given child as its field
     * @throws IllegalArgumentException if {@code newChildren} does not hold exactly one element
     */
    @Override
    public Expression replaceChildren(List<Expression> newChildren) {
        final int childCount = newChildren.size();
        if (childCount == 1) {
            return new StddevSamp(source(), newChildren.get(0));
        }
        throw new IllegalArgumentException("expected [1] child but received [" + childCount + "]");
    }

    /** @return the inner metric name, {@code std_deviation_sampling} */
    @Override
    public String innerName() {
        return "std_deviation_sampling";
    }
}

View File

@ -0,0 +1,37 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.sql.expression.function.aggregate;
import org.elasticsearch.xpack.ql.expression.Expression;
import org.elasticsearch.xpack.ql.tree.NodeInfo;
import org.elasticsearch.xpack.ql.tree.Source;
import java.util.List;
/**
 * Numeric aggregate backing the SQL {@code VAR_SAMP} function (sample variance).
 * <p>
 * The function is marked as {@link ExtendedStatsEnclosed}; {@link #innerName()} supplies the
 * name of the value this function selects from the enclosing stats result.
 */
public class VarSamp extends NumericAggregate implements ExtendedStatsEnclosed {

    /**
     * @param source location of the function call in the query text
     * @param field  the expression the aggregate is computed over
     */
    public VarSamp(Source source, Expression field) {
        super(source, field);
    }

    /** Describes this node (constructor reference plus its single {@code field()} property). */
    @Override
    protected NodeInfo<? extends Expression> info() {
        return NodeInfo.create(this, VarSamp::new, field());
    }

    /**
     * Rebuilds this aggregate around a replacement child expression.
     *
     * @param newChildren must hold exactly one expression
     * @return a new {@code VarSamp} with the same source and the given child as its field
     * @throws IllegalArgumentException if {@code newChildren} does not hold exactly one element
     */
    @Override
    public Expression replaceChildren(List<Expression> newChildren) {
        final int childCount = newChildren.size();
        if (childCount == 1) {
            return new VarSamp(source(), newChildren.get(0));
        }
        throw new IllegalArgumentException("expected [1] child but received [" + childCount + "]");
    }

    /** @return the inner metric name, {@code variance_sampling} */
    @Override
    public String innerName() {
        return "variance_sampling";
    }
}

View File

@ -13,6 +13,7 @@ import org.elasticsearch.search.aggregations.metrics.AvgAggregationBuilder;
import org.elasticsearch.search.aggregations.metrics.CardinalityAggregationBuilder;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.xpack.ql.QlIllegalArgumentException;
import org.elasticsearch.xpack.ql.execution.search.FieldExtraction;
import org.elasticsearch.xpack.ql.expression.Alias;
import org.elasticsearch.xpack.ql.expression.Attribute;
import org.elasticsearch.xpack.ql.expression.Expression;
@ -68,6 +69,7 @@ import org.elasticsearch.xpack.sql.planner.QueryFolder.FoldAggregate.GroupingCon
import org.elasticsearch.xpack.sql.planner.QueryTranslator.QueryTranslation;
import org.elasticsearch.xpack.sql.querydsl.agg.AggFilter;
import org.elasticsearch.xpack.sql.querydsl.agg.GroupByDateHistogram;
import org.elasticsearch.xpack.sql.querydsl.container.MetricAggRef;
import org.elasticsearch.xpack.sql.stats.Metrics;
import org.elasticsearch.xpack.sql.types.SqlTypesTests;
import org.elasticsearch.xpack.sql.util.DateUtils;
@ -76,6 +78,7 @@ import org.junit.BeforeClass;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
@ -1896,6 +1899,33 @@ public class QueryTranslatorTests extends ESTestCase {
}
}
/**
 * Checks that each of STDDEV_POP, STDDEV_SAMP, VAR_POP and VAR_SAMP over the {@code int} field
 * plans to a single {@link EsQueryExec} with one DOUBLE output, that the extracted field is a
 * {@link MetricAggRef} pointing at the expected inner metric name, and that the generated
 * aggregation request ends with an {@code extended_stats} aggregation on {@code int}.
 */
public void testExtendedStatsAggsStddevAndVar() {
    // SQL function name -> expected MetricAggRef property.
    // Populated with plain put() calls: Map#of() is not available on Java 8, and this avoids
    // creating an anonymous HashMap subclass just to initialize the map.
    final Map<String, String> metricToAgg = new HashMap<>();
    metricToAgg.put("STDDEV_POP", "std_deviation");
    metricToAgg.put("STDDEV_SAMP", "std_deviation_sampling");
    metricToAgg.put("VAR_POP", "variance");
    metricToAgg.put("VAR_SAMP", "variance_sampling");

    for (Map.Entry<String, String> entry : metricToAgg.entrySet()) {
        final String funcName = entry.getKey();
        final String expectedProperty = entry.getValue();

        PhysicalPlan p = optimizeAndPlan("SELECT " + funcName + "(int) FROM test");
        assertEquals(EsQueryExec.class, p.getClass());
        EsQueryExec eqe = (EsQueryExec) p;
        assertEquals(1, eqe.output().size());
        assertEquals(funcName + "(int)", eqe.output().get(0).qualifiedName());
        assertEquals(DOUBLE, eqe.output().get(0).dataType());

        FieldExtraction fe = eqe.queryContainer().fields().get(0).v1();
        assertEquals(MetricAggRef.class, fe.getClass());
        // expected value first, so a failure message reports expected/actual the right way round
        assertEquals(expectedProperty, ((MetricAggRef) fe).property());

        // The aggregation name is generated, so read it back before matching the request tail.
        String aggName = eqe.queryContainer().aggs().asAggBuilder().getSubAggregations().iterator().next().getName();
        assertThat(eqe.queryContainer().aggs().asAggBuilder().toString().replaceAll("\\s+", ""),
            endsWith("\"aggregations\":{\"" + aggName + "\":{\"extended_stats\":{\"field\":\"int\",\"sigma\":2.0}}}}}"));
    }
}
public void testGlobalCountInImplicitGroupByForcesTrackHits() {
PhysicalPlan p = optimizeAndPlan("SELECT COUNT(*) FROM test");
assertEquals(EsQueryExec.class, p.getClass());