* Add sample versions of standard deviation and variance functions (#59093) * Add STDDEV_SAMP, VAR_SAMP This commit adds the sampling variations of the standard deviation and variance agg functions. (cherry picked from commit 8b29817b49e386215f29cb5b3356d0183fd5d9de) * Fix: workaround for lack of Map#of() in Java8 Replace Map#of() with a HashMap static init.
This commit is contained in:
parent
14ab35e323
commit
acfff7b896
|
@ -599,6 +599,35 @@ include-tagged::{sql-specs}/docs/docs.csv-spec[aggStddevPop]
|
|||
include-tagged::{sql-specs}/docs/docs.csv-spec[aggStddevPopScalars]
|
||||
--------------------------------------------------
|
||||
|
||||
[[sql-functions-aggs-stddev-samp]]
|
||||
==== `STDDEV_SAMP`
|
||||
|
||||
.Synopsis:
|
||||
[source, sql]
|
||||
--------------------------------------------------
|
||||
STDDEV_SAMP(field_name) <1>
|
||||
--------------------------------------------------
|
||||
|
||||
*Input*:
|
||||
|
||||
<1> a numeric field
|
||||
|
||||
*Output*: `double` numeric value
|
||||
|
||||
*Description*:
|
||||
|
||||
Returns the https://en.wikipedia.org/wiki/Standard_deviation[sample standard deviation] of input values in the field `field_name`.
|
||||
|
||||
["source","sql",subs="attributes,macros"]
|
||||
--------------------------------------------------
|
||||
include-tagged::{sql-specs}/docs/docs.csv-spec[aggStddevSamp]
|
||||
--------------------------------------------------
|
||||
|
||||
["source","sql",subs="attributes,macros"]
|
||||
--------------------------------------------------
|
||||
include-tagged::{sql-specs}/docs/docs.csv-spec[aggStddevSampScalars]
|
||||
--------------------------------------------------
|
||||
|
||||
[[sql-functions-aggs-sum-squares]]
|
||||
==== `SUM_OF_SQUARES`
|
||||
|
||||
|
@ -657,3 +686,33 @@ include-tagged::{sql-specs}/docs/docs.csv-spec[aggVarPop]
|
|||
--------------------------------------------------
|
||||
include-tagged::{sql-specs}/docs/docs.csv-spec[aggVarPopScalars]
|
||||
--------------------------------------------------
|
||||
|
||||
[[sql-functions-aggs-var-samp]]
|
||||
==== `VAR_SAMP`
|
||||
|
||||
.Synopsis:
|
||||
[source, sql]
|
||||
--------------------------------------------------
|
||||
VAR_SAMP(field_name) <1>
|
||||
--------------------------------------------------
|
||||
|
||||
*Input*:
|
||||
|
||||
<1> a numeric field
|
||||
|
||||
*Output*: `double` numeric value
|
||||
|
||||
*Description*:
|
||||
|
||||
Returns the https://en.wikipedia.org/wiki/Variance[sample variance] of input values in the field `field_name`.
|
||||
|
||||
["source","sql",subs="attributes,macros"]
|
||||
--------------------------------------------------
|
||||
include-tagged::{sql-specs}/docs/docs.csv-spec[aggVarSamp]
|
||||
--------------------------------------------------
|
||||
|
||||
|
||||
["source","sql",subs="attributes,macros"]
|
||||
--------------------------------------------------
|
||||
include-tagged::{sql-specs}/docs/docs.csv-spec[aggVarSampScalars]
|
||||
--------------------------------------------------
|
||||
|
|
|
@ -42,8 +42,10 @@
|
|||
** <<sql-functions-aggs-percentile-rank>>
|
||||
** <<sql-functions-aggs-skewness>>
|
||||
** <<sql-functions-aggs-stddev-pop>>
|
||||
** <<sql-functions-aggs-stddev-samp>>
|
||||
** <<sql-functions-aggs-sum-squares>>
|
||||
** <<sql-functions-aggs-var-pop>>
|
||||
** <<sql-functions-aggs-var-samp>>
|
||||
* <<sql-functions-grouping>>
|
||||
** <<sql-functions-grouping-histogram>>
|
||||
* <<sql-functions-datetime-interval, Date-Time Operators>>
|
||||
|
|
|
@ -1160,33 +1160,37 @@ GROUP BY gender ORDER BY gender;
|
|||
;
|
||||
|
||||
extendedStatsAggregateFunctionsWithScalars
|
||||
schema::stddev_pop:d|sum_of_squares:d|var_pop:d|gender:s
|
||||
schema::stddev_pop:d|stddev_samp:d|sum_of_squares:d|var_pop:d|var_samp:d|gender:s
|
||||
SELECT STDDEV_POP(CASE WHEN (salary / 2) > 10000 THEN (salary + 12345) * 1.2 ELSE (salary - 12345) * 2.7 END) AS "stddev_pop",
|
||||
STDDEV_SAMP(CASE WHEN (salary / 2) > 10000 THEN (salary + 12345) * 1.2 ELSE (salary - 12345) * 2.7 END) AS "stddev_samp",
|
||||
SUM_OF_SQUARES(CASE WHEN (salary - 20) > 50000 THEN (salary * 1.2) - 1234 ELSE (salary - 20) * 0.93 END) AS "sum_of_squares",
|
||||
VAR_POP(CASE WHEN (salary - 20) % 1000 > 200 THEN (salary * 1.2) - 1234 ELSE (salary - 20) * 0.93 END) AS "var_pop",
|
||||
VAR_SAMP(CASE WHEN (salary - 20) % 1000 > 200 THEN (salary * 1.2) - 1234 ELSE (salary - 20) * 0.93 END) AS "var_samp",
|
||||
gender FROM test_emp
|
||||
GROUP BY gender ORDER BY gender;
|
||||
|
||||
stddev_pop | sum_of_squares | var_pop | gender
|
||||
------------------+---------------------+--------------------+---------------
|
||||
16752.73244172422 |3.06310583829007E10 |3.460331137445282E8 |null
|
||||
17427.462400181845|1.148127725047658E11 |3.1723426960671306E8|F
|
||||
15702.798665784752|1.5882243113919238E11|2.529402043805585E8 |M
|
||||
stddev_pop | stddev_samp | sum_of_squares | var_pop | var_samp | gender
|
||||
------------------+------------------+---------------------+--------------------+--------------------+---------------
|
||||
16752.73244172422 |17658.930515747525|3.06310583829007E10 |3.460331137445282E8 |3.844812374939202E8 |null
|
||||
17427.462400181845|17697.67172930331 |1.148127725047658E11 |3.1723426960671306E8|3.271478405319228E8 |F
|
||||
15702.798665784752|15842.381843421828|1.5882243113919238E11|2.529402043805585E8 |2.5745699374449703E8|M
|
||||
;
|
||||
|
||||
extendedStatsAggregateFunctionsWithScalarAndSameArg
|
||||
schema::stddev_pop:d|sum_of_squares:d|var_pop:d|gender:s
|
||||
schema::stddev_pop:d|stddev_samp:d|sum_of_squares:d|var_pop:d|var_samp:d|gender:s
|
||||
SELECT STDDEV_POP(CASE WHEN (salary - 20) % 1000 > 200 THEN (salary * 1.2) - 1234 ELSE (salary - 20) * 0.93 END) AS "stddev_pop",
|
||||
STDDEV_SAMP(CASE WHEN (salary - 20) % 1000 > 200 THEN (salary * 1.2) - 1234 ELSE (salary - 20) * 0.93 END) AS "stddev_samp",
|
||||
SUM_OF_SQUARES(CASE WHEN (salary - 20) % 1000 > 200 THEN (salary * 1.2) - 1234 ELSE (salary - 20) * 0.93 END) AS "sum_of_squares",
|
||||
VAR_POP(CASE WHEN (salary - 20) % 1000 > 200 THEN (salary * 1.2) - 1234 ELSE (salary - 20) * 0.93 END) AS "var_pop",
|
||||
VAR_SAMP(CASE WHEN (salary - 20) % 1000 > 200 THEN (salary * 1.2) - 1234 ELSE (salary - 20) * 0.93 END) AS "var_samp",
|
||||
gender FROM test_emp
|
||||
GROUP BY gender ORDER BY gender;
|
||||
|
||||
stddev_pop | sum_of_squares | var_pop | gender
|
||||
------------------+---------------------+--------------------+---------------
|
||||
18601.965319409886|3.4461553130896095E10|3.460331137445282E8 |null
|
||||
17811.071545718776|1.2151168881502939E11|3.1723426960671306E8|F
|
||||
15904.093950318531|1.699198993070239E11 |2.529402043805585E8 |M
|
||||
stddev_pop | stddev_samp | sum_of_squares | var_pop | var_samp | gender
|
||||
------------------+------------------+---------------------+--------------------+--------------------+---------------
|
||||
18601.965319409886|19608.193121598946|3.4461553130896095E10|3.460331137445282E8 |3.844812374939202E8 |null
|
||||
17811.071545718776|18087.228658142263|1.2151168881502939E11|3.1723426960671306E8|3.271478405319228E8 |F
|
||||
15904.093950318531|16045.466454562704|1.699198993070239E11 |2.529402043805585E8 |2.5745699374449703E8|M
|
||||
;
|
||||
|
||||
|
||||
|
|
|
@ -19,14 +19,16 @@ MAX |AGGREGATE
|
|||
MIN |AGGREGATE
|
||||
SUM |AGGREGATE
|
||||
KURTOSIS |AGGREGATE
|
||||
MAD |AGGREGATE
|
||||
PERCENTILE |AGGREGATE
|
||||
PERCENTILE_RANK |AGGREGATE
|
||||
SKEWNESS |AGGREGATE
|
||||
STDDEV_POP |AGGREGATE
|
||||
SUM_OF_SQUARES |AGGREGATE
|
||||
VAR_POP |AGGREGATE
|
||||
HISTOGRAM |GROUPING
|
||||
MAD |AGGREGATE
|
||||
PERCENTILE |AGGREGATE
|
||||
PERCENTILE_RANK |AGGREGATE
|
||||
SKEWNESS |AGGREGATE
|
||||
STDDEV_POP |AGGREGATE
|
||||
STDDEV_SAMP |AGGREGATE
|
||||
SUM_OF_SQUARES |AGGREGATE
|
||||
VAR_POP |AGGREGATE
|
||||
VAR_SAMP |AGGREGATE
|
||||
HISTOGRAM |GROUPING
|
||||
CASE |CONDITIONAL
|
||||
COALESCE |CONDITIONAL
|
||||
GREATEST |CONDITIONAL
|
||||
|
|
|
@ -215,13 +215,15 @@ MAX |AGGREGATE
|
|||
MIN |AGGREGATE
|
||||
SUM |AGGREGATE
|
||||
KURTOSIS |AGGREGATE
|
||||
MAD |AGGREGATE
|
||||
PERCENTILE |AGGREGATE
|
||||
PERCENTILE_RANK |AGGREGATE
|
||||
SKEWNESS |AGGREGATE
|
||||
STDDEV_POP |AGGREGATE
|
||||
SUM_OF_SQUARES |AGGREGATE
|
||||
VAR_POP |AGGREGATE
|
||||
MAD |AGGREGATE
|
||||
PERCENTILE |AGGREGATE
|
||||
PERCENTILE_RANK |AGGREGATE
|
||||
SKEWNESS |AGGREGATE
|
||||
STDDEV_POP |AGGREGATE
|
||||
STDDEV_SAMP |AGGREGATE
|
||||
SUM_OF_SQUARES |AGGREGATE
|
||||
VAR_POP |AGGREGATE
|
||||
VAR_SAMP |AGGREGATE
|
||||
HISTOGRAM |GROUPING
|
||||
CASE |CONDITIONAL
|
||||
COALESCE |CONDITIONAL
|
||||
|
@ -1549,10 +1551,9 @@ SELECT MIN(salary) AS min, MAX(salary) AS max, SKEWNESS(salary) AS s FROM emp;
|
|||
|
||||
aggStddevPop
|
||||
// tag::aggStddevPop
|
||||
SELECT MIN(salary) AS min, MAX(salary) AS max, STDDEV_POP(salary) AS stddev
|
||||
FROM emp;
|
||||
SELECT MIN(salary) AS min, MAX(salary) AS max, STDDEV_POP(salary) AS stddev FROM emp;
|
||||
|
||||
min | max | stddev
|
||||
min | max | stddev
|
||||
---------------+---------------+------------------
|
||||
25324 |74999 |13765.125502787832
|
||||
// end::aggStddevPop
|
||||
|
@ -1569,6 +1570,27 @@ SELECT MIN(salary / 12.0) AS min, MAX(salary / 12.0) AS max, STDDEV_POP(salary /
|
|||
// end::aggStddevPopScalars
|
||||
;
|
||||
|
||||
aggStddevSamp
|
||||
// tag::aggStddevSamp
|
||||
SELECT MIN(salary) AS min, MAX(salary) AS max, STDDEV_SAMP(salary) AS stddev FROM emp;
|
||||
|
||||
min | max | stddev
|
||||
---------------+---------------+------------------
|
||||
25324 |74999 |13834.471662090747
|
||||
// end::aggStddevSamp
|
||||
;
|
||||
|
||||
aggStddevSampScalars
|
||||
schema::min:d|max:d|stddev:d
|
||||
// tag::aggStddevSampScalars
|
||||
SELECT MIN(salary / 12.0) AS min, MAX(salary / 12.0) AS max, STDDEV_SAMP(salary / 12.0) AS stddev FROM emp;
|
||||
|
||||
min | max | stddev
|
||||
------------------+-----------------+-----------------
|
||||
2110.3333333333335|6249.916666666667|1152.872638507562
|
||||
// end::aggStddevSampScalars
|
||||
;
|
||||
|
||||
|
||||
aggSumOfSquares
|
||||
// tag::aggSumOfSquares
|
||||
|
@ -1614,6 +1636,27 @@ SELECT MIN(salary / 24.0) AS min, MAX(salary / 24.0) AS max, VAR_POP(salary / 24
|
|||
// end::aggVarPopScalars
|
||||
;
|
||||
|
||||
aggVarSamp
|
||||
// tag::aggVarSamp
|
||||
SELECT MIN(salary) AS min, MAX(salary) AS max, VAR_SAMP(salary) AS varsamp FROM emp;
|
||||
|
||||
min | max | varsamp
|
||||
---------------+---------------+----------------
|
||||
25324 |74999 |1.913926061691E8
|
||||
// end::aggVarSamp
|
||||
;
|
||||
|
||||
aggVarSampScalars
|
||||
schema::min:d|max:d|varsamp:d
|
||||
// tag::aggVarSampScalars
|
||||
SELECT MIN(salary / 24.0) AS min, MAX(salary / 24.0) AS max, VAR_SAMP(salary / 24.0) AS varsamp FROM emp;
|
||||
|
||||
min | max | varsamp
|
||||
------------------+------------------+----------------
|
||||
1055.1666666666667|3124.9583333333335|332278.830154847
|
||||
// end::aggVarSampScalars
|
||||
;
|
||||
|
||||
///////////////////////////////
|
||||
//
|
||||
// String
|
||||
|
|
|
@ -20,9 +20,11 @@ import org.elasticsearch.xpack.sql.expression.function.aggregate.Percentile;
|
|||
import org.elasticsearch.xpack.sql.expression.function.aggregate.PercentileRank;
|
||||
import org.elasticsearch.xpack.sql.expression.function.aggregate.Skewness;
|
||||
import org.elasticsearch.xpack.sql.expression.function.aggregate.StddevPop;
|
||||
import org.elasticsearch.xpack.sql.expression.function.aggregate.StddevSamp;
|
||||
import org.elasticsearch.xpack.sql.expression.function.aggregate.Sum;
|
||||
import org.elasticsearch.xpack.sql.expression.function.aggregate.SumOfSquares;
|
||||
import org.elasticsearch.xpack.sql.expression.function.aggregate.VarPop;
|
||||
import org.elasticsearch.xpack.sql.expression.function.aggregate.VarSamp;
|
||||
import org.elasticsearch.xpack.sql.expression.function.grouping.Histogram;
|
||||
import org.elasticsearch.xpack.sql.expression.function.scalar.Cast;
|
||||
import org.elasticsearch.xpack.sql.expression.function.scalar.Database;
|
||||
|
@ -143,8 +145,10 @@ public class SqlFunctionRegistry extends FunctionRegistry {
|
|||
def(PercentileRank.class, PercentileRank::new, "PERCENTILE_RANK"),
|
||||
def(Skewness.class, Skewness::new, "SKEWNESS"),
|
||||
def(StddevPop.class, StddevPop::new, "STDDEV_POP"),
|
||||
def(StddevSamp.class, StddevSamp::new, "STDDEV_SAMP"),
|
||||
def(SumOfSquares.class, SumOfSquares::new, "SUM_OF_SQUARES"),
|
||||
def(VarPop.class, VarPop::new, "VAR_POP")
|
||||
def(VarPop.class, VarPop::new, "VAR_POP"),
|
||||
def(VarSamp.class, VarSamp::new, "VAR_SAMP")
|
||||
},
|
||||
// histogram
|
||||
new FunctionDefinition[] {
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License;
|
||||
* you may not use this file except in compliance with the Elastic License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.xpack.sql.expression.function.aggregate;
|
||||
|
||||
import org.elasticsearch.xpack.ql.expression.Expression;
|
||||
import org.elasticsearch.xpack.ql.tree.NodeInfo;
|
||||
import org.elasticsearch.xpack.ql.tree.Source;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class StddevSamp extends NumericAggregate implements ExtendedStatsEnclosed {
|
||||
public StddevSamp(Source source, Expression field) {
|
||||
super(source, field);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String innerName() {
|
||||
return "std_deviation_sampling";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Expression replaceChildren(List<Expression> newChildren) {
|
||||
if (newChildren.size() != 1) {
|
||||
throw new IllegalArgumentException("expected [1] child but received [" + newChildren.size() + "]");
|
||||
}
|
||||
return new StddevSamp(source(), newChildren.get(0));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected NodeInfo<? extends Expression> info() {
|
||||
return NodeInfo.create(this, StddevSamp::new, field());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License;
|
||||
* you may not use this file except in compliance with the Elastic License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.xpack.sql.expression.function.aggregate;
|
||||
|
||||
import org.elasticsearch.xpack.ql.expression.Expression;
|
||||
import org.elasticsearch.xpack.ql.tree.NodeInfo;
|
||||
import org.elasticsearch.xpack.ql.tree.Source;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class VarSamp extends NumericAggregate implements ExtendedStatsEnclosed {
|
||||
public VarSamp(Source source, Expression field) {
|
||||
super(source, field);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String innerName() {
|
||||
return "variance_sampling";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Expression replaceChildren(List<Expression> newChildren) {
|
||||
if (newChildren.size() != 1) {
|
||||
throw new IllegalArgumentException("expected [1] child but received [" + newChildren.size() + "]");
|
||||
}
|
||||
return new VarSamp(source(), newChildren.get(0));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected NodeInfo<? extends Expression> info() {
|
||||
return NodeInfo.create(this, VarSamp::new, field());
|
||||
}
|
||||
}
|
|
@ -13,6 +13,7 @@ import org.elasticsearch.search.aggregations.metrics.AvgAggregationBuilder;
|
|||
import org.elasticsearch.search.aggregations.metrics.CardinalityAggregationBuilder;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.xpack.ql.QlIllegalArgumentException;
|
||||
import org.elasticsearch.xpack.ql.execution.search.FieldExtraction;
|
||||
import org.elasticsearch.xpack.ql.expression.Alias;
|
||||
import org.elasticsearch.xpack.ql.expression.Attribute;
|
||||
import org.elasticsearch.xpack.ql.expression.Expression;
|
||||
|
@ -68,6 +69,7 @@ import org.elasticsearch.xpack.sql.planner.QueryFolder.FoldAggregate.GroupingCon
|
|||
import org.elasticsearch.xpack.sql.planner.QueryTranslator.QueryTranslation;
|
||||
import org.elasticsearch.xpack.sql.querydsl.agg.AggFilter;
|
||||
import org.elasticsearch.xpack.sql.querydsl.agg.GroupByDateHistogram;
|
||||
import org.elasticsearch.xpack.sql.querydsl.container.MetricAggRef;
|
||||
import org.elasticsearch.xpack.sql.stats.Metrics;
|
||||
import org.elasticsearch.xpack.sql.types.SqlTypesTests;
|
||||
import org.elasticsearch.xpack.sql.util.DateUtils;
|
||||
|
@ -76,6 +78,7 @@ import org.junit.BeforeClass;
|
|||
import java.time.ZoneId;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
@ -1896,6 +1899,33 @@ public class QueryTranslatorTests extends ESTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
public void testExtendedStatsAggsStddevAndVar() {
|
||||
final Map<String, String> metricToAgg = new HashMap<String, String>() {{
|
||||
put("STDDEV_POP", "std_deviation");
|
||||
put("STDDEV_SAMP", "std_deviation_sampling");
|
||||
put("VAR_POP", "variance");
|
||||
put("VAR_SAMP", "variance_sampling");
|
||||
}};
|
||||
for (String funcName: metricToAgg.keySet()) {
|
||||
PhysicalPlan p = optimizeAndPlan("SELECT " + funcName + "(int) FROM test");
|
||||
assertEquals(EsQueryExec.class, p.getClass());
|
||||
EsQueryExec eqe = (EsQueryExec) p;
|
||||
assertEquals(1, eqe.output().size());
|
||||
|
||||
assertEquals(funcName + "(int)", eqe.output().get(0).qualifiedName());
|
||||
assertEquals(DOUBLE, eqe.output().get(0).dataType());
|
||||
|
||||
FieldExtraction fe = eqe.queryContainer().fields().get(0).v1();
|
||||
assertEquals(MetricAggRef.class, fe.getClass());
|
||||
assertEquals(((MetricAggRef) fe).property(), metricToAgg.get(funcName));
|
||||
|
||||
String aggName = eqe.queryContainer().aggs().asAggBuilder().getSubAggregations().iterator().next().getName();
|
||||
assertThat(eqe.queryContainer().aggs().asAggBuilder().toString().replaceAll("\\s+", ""),
|
||||
endsWith("\"aggregations\":{\"" + aggName + "\":{\"extended_stats\":{\"field\":\"int\",\"sigma\":2.0}}}}}"));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void testGlobalCountInImplicitGroupByForcesTrackHits() {
|
||||
PhysicalPlan p = optimizeAndPlan("SELECT COUNT(*) FROM test");
|
||||
assertEquals(EsQueryExec.class, p.getClass());
|
||||
|
|
Loading…
Reference in New Issue