SQL: Ability to generate exact distinct count queries. (#3999)

This commit is contained in:
Gian Merlino 2017-03-03 10:10:36 -08:00 committed by Nishant Bangarwa
parent 3e8dbd59f8
commit 4a56d7d8a0
3 changed files with 119 additions and 4 deletions

View File

@ -106,15 +106,16 @@ The following SQL queries and features may be executed using approximate algorit
- `COUNT(DISTINCT col)` and `APPROX_COUNT_DISTINCT(col)` aggregations use
[HyperLogLog](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf), a fast approximate distinct counting
algorithm. If you need exact distinct counts, you can instead use
`SELECT COUNT(*) FROM (SELECT DISTINCT col FROM data_source)`, which will use a slower and more resource intensive exact
algorithm.
algorithm. If you need exact distinct counts, set "useApproximateCountDistinct" to "false", either through query
context or through broker configuration.
- TopN-style queries with a single grouping column, like
`SELECT col1, SUM(col2) FROM data_source GROUP BY col1 ORDER BY SUM(col2) DESC LIMIT 100`, by default will be executed
as [TopN queries](topnquery.html), which use an approximate algorithm. To disable this behavior, and use exact
algorithms for topN-style queries, set "useApproximateTopN" to "false", either through query context or through broker
configuration.
In both cases, the exact algorithms are generally slower and more resource intensive.
### Time functions
Druid's SQL language supports a number of time operations, including:

View File

@ -31,6 +31,7 @@ import io.druid.sql.calcite.rule.SelectRules;
import org.apache.calcite.interpreter.Bindables;
import org.apache.calcite.plan.RelOptRule;
import org.apache.calcite.plan.volcano.AbstractConverter;
import org.apache.calcite.rel.rules.AggregateExpandDistinctAggregatesRule;
import org.apache.calcite.rel.rules.AggregateJoinTransposeRule;
import org.apache.calcite.rel.rules.AggregateProjectMergeRule;
import org.apache.calcite.rel.rules.AggregateProjectPullUpConstantsRule;
@ -202,6 +203,12 @@ public class Rules
rules.addAll(VOLCANO_ABSTRACT_RULES);
rules.addAll(RELOPTUTIL_ABSTRACT_RULES);
if (!plannerConfig.isUseApproximateCountDistinct()) {
// We'll need this to expand COUNT DISTINCTs.
// Avoid AggregateExpandDistinctAggregatesRule.INSTANCE; it uses grouping sets and we don't support those.
rules.add(AggregateExpandDistinctAggregatesRule.JOIN);
}
if (plannerConfig.isUseFallback()) {
rules.add(DruidRelToBindableRule.instance());
}

View File

@ -122,6 +122,14 @@ public class CalciteQueryTest
return 0;
}
};
private static final PlannerConfig PLANNER_CONFIG_NO_HLL = new PlannerConfig()
{
@Override
public boolean isUseApproximateCountDistinct()
{
return false;
}
};
private static final PlannerConfig PLANNER_CONFIG_SELECT_PAGING = new PlannerConfig()
{
@Override
@ -721,6 +729,22 @@ public class CalciteQueryTest
}
}
@Test
public void testUnplannableExactCountDistinctQueries() throws Exception
{
// All of these queries are unplannable in exact COUNT DISTINCT mode.
final List<String> queries = ImmutableList.of(
"SELECT COUNT(distinct dim1), COUNT(distinct dim2) FROM druid.foo", // two COUNT DISTINCTs, same query
"SELECT dim1, COUNT(distinct dim1), COUNT(distinct dim2) FROM druid.foo GROUP BY dim1", // two COUNT DISTINCTs
"SELECT COUNT(distinct unique_dim1) FROM druid.foo" // COUNT DISTINCT on sketch cannot be exact
);
for (final String query : queries) {
assertQueryIsUnplannable(PLANNER_CONFIG_NO_HLL, query);
}
}
private void assertQueryIsUnplannable(final String sql)
{
assertQueryIsUnplannable(PLANNER_CONFIG_DEFAULT, sql);
@ -1869,6 +1893,84 @@ public class CalciteQueryTest
);
}
@Test
public void testExactCountDistinct() throws Exception
{
// When HLL is disabled, do exact count distinct through a nested query.
testQuery(
PLANNER_CONFIG_NO_HLL,
"SELECT COUNT(distinct dim2) FROM druid.foo",
ImmutableList.<Query>of(
GroupByQuery.builder()
.setDataSource(
new QueryDataSource(
GroupByQuery.builder()
.setDataSource(CalciteTests.DATASOURCE1)
.setInterval(QSS(Filtration.eternity()))
.setGranularity(Granularities.ALL)
.setDimensions(DIMS(new DefaultDimensionSpec("dim2", "d0")))
.setContext(QUERY_CONTEXT_DEFAULT)
.build()
)
)
.setInterval(QSS(Filtration.eternity()))
.setGranularity(Granularities.ALL)
.setAggregatorSpecs(AGGS(
new CountAggregatorFactory("a0")
))
.setContext(QUERY_CONTEXT_DEFAULT)
.build()
),
ImmutableList.of(
new Object[]{3L}
)
);
}
@Test
public void testExactCountDistinctWithGroupingAndOtherAggregators() throws Exception
{
// When HLL is disabled, do exact count distinct through a nested query.
testQuery(
PLANNER_CONFIG_NO_HLL,
"SELECT dim2, SUM(cnt), COUNT(distinct dim1) FROM druid.foo GROUP BY dim2",
ImmutableList.<Query>of(
GroupByQuery.builder()
.setDataSource(
new QueryDataSource(
GroupByQuery.builder()
.setDataSource(CalciteTests.DATASOURCE1)
.setInterval(QSS(Filtration.eternity()))
.setGranularity(Granularities.ALL)
.setDimensions(DIMS(
new DefaultDimensionSpec("dim2", "d0"),
new DefaultDimensionSpec("dim1", "d1")
))
.setAggregatorSpecs(AGGS(new LongSumAggregatorFactory("a0", "cnt")))
.setContext(QUERY_CONTEXT_DEFAULT)
.build()
)
)
.setInterval(QSS(Filtration.eternity()))
.setGranularity(Granularities.ALL)
.setDimensions(DIMS(new DefaultDimensionSpec("d0", "d0")))
.setAggregatorSpecs(AGGS(
new LongSumAggregatorFactory("a0", "a0"),
new CountAggregatorFactory("a1")
))
.setContext(QUERY_CONTEXT_DEFAULT)
.build()
),
ImmutableList.of(
new Object[]{"", 3L, 3L},
new Object[]{"a", 2L, 2L},
new Object[]{"abc", 1L, 1L}
)
);
}
@Test
public void testApproxCountDistinct() throws Exception
{
@ -2844,7 +2946,12 @@ public class CalciteQueryTest
.setGranularity(Granularities.ALL)
.setDimensions(
DIMS(
new ExtractionDimensionSpec("dim1", "d0", ValueType.FLOAT, new BucketExtractionFn(1.0, 0.0))
new ExtractionDimensionSpec(
"dim1",
"d0",
ValueType.FLOAT,
new BucketExtractionFn(1.0, 0.0)
)
)
)
.setAggregatorSpecs(AGGS(new CountAggregatorFactory("a0")))