SQL: fix COUNT DISTINCT filtering (#37176)

* Use `_count` aggregation value only for not-DISTINCT COUNT function calls
* COUNT DISTINCT will use the _exact_ version of a field (the `keyword` sub-field for example), if there is one
This commit is contained in:
Andrei Stefan 2019-01-08 08:47:35 +02:00 committed by GitHub
parent b0665963e8
commit 3fad9d25f6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 59 additions and 3 deletions

View File

@ -110,6 +110,8 @@ aggCountWithAlias
SELECT gender g, COUNT(*) c FROM "test_emp" GROUP BY g ORDER BY gender;
countDistinct
SELECT COUNT(DISTINCT hire_date) AS count FROM test_emp;
countDistinctAndCountSimpleWithAlias
SELECT COUNT(*) cnt, COUNT(DISTINCT first_name) as names, gender FROM test_emp GROUP BY gender ORDER BY gender;
aggCountAliasAndWhereClauseMultiGroupBy
SELECT gender g, languages l, COUNT(*) c FROM "test_emp" WHERE emp_no < 10020 GROUP BY gender, languages ORDER BY gender, languages;
@ -121,6 +123,8 @@ aggCountWithAliasMultiGroupBy
SELECT gender g, languages l, COUNT(*) c FROM "test_emp" GROUP BY g, l ORDER BY gender, languages;
aggCountWithAliasMultiGroupByDifferentOrder
SELECT gender g, languages l, COUNT(*) c FROM "test_emp" GROUP BY g, l ORDER BY languages ASC, gender DESC;
aggCountDistinctWithAliasAndGroupBy
SELECT COUNT(*) cnt, COUNT(DISTINCT first_name) as names, gender FROM test_emp GROUP BY gender ORDER BY gender;
@ -161,12 +165,20 @@ aggCountStarAndHavingBetween
SELECT gender g, COUNT(*) c FROM "test_emp" GROUP BY g HAVING c BETWEEN 10 AND 70 ORDER BY gender ASC;
aggCountStarAndHavingBetweenWithLimit
SELECT gender g, COUNT(*) c FROM "test_emp" GROUP BY g HAVING c BETWEEN 10 AND 70 ORDER BY gender LIMIT 1;
aggCountDistinctAndHavingBetweenWithLimit
SELECT gender g, COUNT(DISTINCT first_name) c FROM "test_emp" GROUP BY g HAVING c BETWEEN 40 AND 50 ORDER BY gender LIMIT 1;
aggCountOnColumnAndHavingOnAliasAndFunction
SELECT gender g, COUNT(gender) c FROM "test_emp" GROUP BY g HAVING c > 10 AND COUNT(gender) < 70 ORDER BY gender;
aggCountOnColumnAndHavingOnAliasAndFunctionWildcard -> COUNT(*/1) vs COUNT(gender)
SELECT gender g, COUNT(gender) c FROM "test_emp" GROUP BY g HAVING c > 10 AND COUNT(*) < 70 ORDER BY gender;
aggCountOnColumnAndHavingOnAliasAndFunctionConstant
SELECT gender g, COUNT(gender) c FROM "test_emp" GROUP BY g HAVING c > 10 AND COUNT(1) < 70 ORDER BY gender;
aggDistinctCountWithAliasAndHaving
SELECT COUNT(*) c, COUNT(DISTINCT first_name) AS names, gender FROM test_emp GROUP BY gender HAVING names > 40 ORDER BY gender;
aggDistinctCountWithFunctionWildcardAndHaving
SELECT COUNT(*) c, COUNT(DISTINCT first_name) AS names, gender FROM test_emp GROUP BY gender HAVING names < 50 AND c < 50 ORDER BY gender;
aggDistinctCountWithFunctionWildcardAndFunctionConstantAndHaving
SELECT COUNT(*) c, COUNT(DISTINCT first_name) AS names, COUNT(123) AS c123, gender FROM test_emp GROUP BY gender HAVING names < 50 AND c < 50 AND c123 < 50 ORDER BY gender;
aggCountAndHavingMultiGroupBy
SELECT gender g, languages l, COUNT(*) c FROM "test_emp" GROUP BY g, l HAVING COUNT(*) > 10 ORDER BY gender, l;
@ -195,6 +207,8 @@ aggCountOnColumnAndHavingOnAliasAndFunctionWildcardMultiGroupBy -> COUNT(*/1) vs
SELECT gender g, languages l, COUNT(gender) c FROM "test_emp" GROUP BY g, l HAVING c > 10 AND COUNT(*) < 70 ORDER BY gender, languages;
aggCountOnColumnAndHavingOnAliasAndFunctionConstantMultiGroupBy
SELECT gender g, languages l, COUNT(gender) c FROM "test_emp" GROUP BY g, l HAVING c > 10 AND COUNT(1) < 70 ORDER BY gender, languages;
aggCountOnDistinctColumnAndHavingOnAliasAndFunctionConstantMultiGroupBy
SELECT gender g, languages l, COUNT(DISTINCT last_name) c FROM "test_emp" GROUP BY g, l HAVING c > 5 AND COUNT(1) < 70 ORDER BY gender, languages;
// MIN

View File

@ -61,6 +61,9 @@ public class Count extends AggregateFunction {
@Override
public AggregateFunctionAttribute toAttribute() {
return new AggregateFunctionAttribute(source(), name(), dataType(), id(), functionId(), "_count");
if (!distinct()) {
return new AggregateFunctionAttribute(source(), name(), dataType(), id(), functionId(), "_count");
}
return super.toAttribute();
}
}

View File

@ -429,7 +429,13 @@ final class QueryTranslator {
static String field(AggregateFunction af) {
Expression arg = af.field();
if (arg instanceof FieldAttribute) {
return ((FieldAttribute) arg).name();
FieldAttribute field = (FieldAttribute) arg;
// COUNT(DISTINCT) uses cardinality aggregation which works on exact values (not changed by analyzers or normalizers)
if (af instanceof Count && ((Count) af).distinct()) {
// use the `keyword` version of the field, if there is one
return field.isInexact() ? field.exactAttribute().name() : field.name();
}
return field.name();
}
if (arg instanceof Literal) {
return String.valueOf(((Literal) arg).value());

View File

@ -5,6 +5,8 @@
*/
package org.elasticsearch.xpack.sql.planner;
import org.elasticsearch.search.aggregations.AggregationBuilder;
import org.elasticsearch.search.aggregations.metrics.CardinalityAggregationBuilder;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.xpack.sql.SqlIllegalArgumentException;
import org.elasticsearch.xpack.sql.TestUtils;
@ -19,11 +21,14 @@ import org.elasticsearch.xpack.sql.expression.function.FunctionRegistry;
import org.elasticsearch.xpack.sql.expression.function.grouping.Histogram;
import org.elasticsearch.xpack.sql.expression.function.scalar.math.MathProcessor.MathOperation;
import org.elasticsearch.xpack.sql.expression.gen.script.ScriptTemplate;
import org.elasticsearch.xpack.sql.optimizer.Optimizer;
import org.elasticsearch.xpack.sql.parser.SqlParser;
import org.elasticsearch.xpack.sql.plan.logical.Aggregate;
import org.elasticsearch.xpack.sql.plan.logical.Filter;
import org.elasticsearch.xpack.sql.plan.logical.LogicalPlan;
import org.elasticsearch.xpack.sql.plan.logical.Project;
import org.elasticsearch.xpack.sql.plan.physical.EsQueryExec;
import org.elasticsearch.xpack.sql.plan.physical.PhysicalPlan;
import org.elasticsearch.xpack.sql.planner.QueryTranslator.QueryTranslation;
import org.elasticsearch.xpack.sql.querydsl.agg.AggFilter;
import org.elasticsearch.xpack.sql.querydsl.query.ExistsQuery;
@ -41,6 +46,7 @@ import org.elasticsearch.xpack.sql.util.DateUtils;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Map;
@ -55,6 +61,8 @@ public class QueryTranslatorTests extends ESTestCase {
private static SqlParser parser;
private static Analyzer analyzer;
private static Optimizer optimizer;
private static Planner planner;
@BeforeClass
public static void init() {
@ -64,6 +72,8 @@ public class QueryTranslatorTests extends ESTestCase {
EsIndex test = new EsIndex("test", mapping);
IndexResolution getIndexResult = IndexResolution.valid(test);
analyzer = new Analyzer(TestUtils.TEST_CFG, new FunctionRegistry(), getIndexResult, new Verifier(new Metrics()));
optimizer = new Optimizer();
planner = new Planner();
}
@AfterClass
@ -76,6 +86,10 @@ public class QueryTranslatorTests extends ESTestCase {
return analyzer.analyze(parser.createStatement(sql), true);
}
private PhysicalPlan optimizeAndPlan(String sql) {
return planner.plan(optimizer.optimize(plan(sql)), true);
}
public void testTermEqualityAnalyzer() {
LogicalPlan p = plan("SELECT some.string FROM test WHERE some.string = 'value'");
assertTrue(p instanceof Project);
@ -433,6 +447,7 @@ public class QueryTranslatorTests extends ESTestCase {
scriptTemplate.toString());
assertEquals("[{v=int}, {v=10}]", scriptTemplate.params().toString());
}
public void testGroupByDateHistogram() {
LogicalPlan p = plan("SELECT MAX(int) FROM test GROUP BY HISTOGRAM(int, 1000)");
assertTrue(p instanceof Aggregate);
@ -448,7 +463,6 @@ public class QueryTranslatorTests extends ESTestCase {
assertEquals(DataType.INTEGER, field.dataType());
}
public void testGroupByHistogram() {
LogicalPlan p = plan("SELECT MAX(int) FROM test GROUP BY HISTOGRAM(date, INTERVAL 2 YEARS)");
assertTrue(p instanceof Aggregate);
@ -463,4 +477,23 @@ public class QueryTranslatorTests extends ESTestCase {
assertEquals(FieldAttribute.class, field.getClass());
assertEquals(DataType.DATE, field.dataType());
}
public void testCountDistinctCardinalityFolder() {
PhysicalPlan p = optimizeAndPlan("SELECT COUNT(DISTINCT keyword) cnt FROM test GROUP BY bool HAVING cnt = 0");
assertEquals(EsQueryExec.class, p.getClass());
EsQueryExec ee = (EsQueryExec) p;
assertEquals(1, ee.output().size());
assertThat(ee.output().get(0).toString(), startsWith("cnt{a->"));
Collection<AggregationBuilder> subAggs = ee.queryContainer().aggs().asAggBuilder().getSubAggregations();
assertEquals(1, subAggs.size());
assertTrue(subAggs.toArray()[0] instanceof CardinalityAggregationBuilder);
CardinalityAggregationBuilder cardinalityAgg = (CardinalityAggregationBuilder) subAggs.toArray()[0];
assertEquals("keyword", cardinalityAgg.field());
assertThat(ee.queryContainer().aggs().asAggBuilder().toString().replaceAll("\\s+", ""),
endsWith("{\"buckets_path\":{\"a0\":\"" + cardinalityAgg.getName() +"\"},\"script\":{"
+ "\"source\":\"InternalSqlScriptUtils.nullSafeFilter(InternalSqlScriptUtils.eq(params.a0,params.v0))\","
+ "\"lang\":\"painless\",\"params\":{\"v0\":0}},\"gap_policy\":\"skip\"}}}}}"));
}
}