mirror of https://github.com/apache/druid.git
SQL: Ability to generate exact distinct count queries. (#3999)
This commit is contained in:
parent
3e8dbd59f8
commit
4a56d7d8a0
|
@ -106,15 +106,16 @@ The following SQL queries and features may be executed using approximate algorit
|
||||||
|
|
||||||
- `COUNT(DISTINCT col)` and `APPROX_COUNT_DISTINCT(col)` aggregations use
|
- `COUNT(DISTINCT col)` and `APPROX_COUNT_DISTINCT(col)` aggregations use
|
||||||
[HyperLogLog](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf), a fast approximate distinct counting
|
[HyperLogLog](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf), a fast approximate distinct counting
|
||||||
algorithm. If you need exact distinct counts, you can instead use
|
algorithm. If you need exact distinct counts, set "useApproximateCountDistinct" to "false", either through query
|
||||||
`SELECT COUNT(*) FROM (SELECT DISTINCT col FROM data_source)`, which will use a slower and more resource intensive exact
|
context or through broker configuration.
|
||||||
algorithm.
|
|
||||||
- TopN-style queries with a single grouping column, like
|
- TopN-style queries with a single grouping column, like
|
||||||
`SELECT col1, SUM(col2) FROM data_source GROUP BY col1 ORDER BY SUM(col2) DESC LIMIT 100`, by default will be executed
|
`SELECT col1, SUM(col2) FROM data_source GROUP BY col1 ORDER BY SUM(col2) DESC LIMIT 100`, by default will be executed
|
||||||
as [TopN queries](topnquery.html), which use an approximate algorithm. To disable this behavior, and use exact
|
as [TopN queries](topnquery.html), which use an approximate algorithm. To disable this behavior, and use exact
|
||||||
algorithms for topN-style queries, set "useApproximateTopN" to "false", either through query context or through broker
|
algorithms for topN-style queries, set "useApproximateTopN" to "false", either through query context or through broker
|
||||||
configuration.
|
configuration.
|
||||||
|
|
||||||
|
In both cases, the exact algorithms are generally slower and more resource intensive.
|
||||||
|
|
||||||
### Time functions
|
### Time functions
|
||||||
|
|
||||||
Druid's SQL language supports a number of time operations, including:
|
Druid's SQL language supports a number of time operations, including:
|
||||||
|
|
|
@ -31,6 +31,7 @@ import io.druid.sql.calcite.rule.SelectRules;
|
||||||
import org.apache.calcite.interpreter.Bindables;
|
import org.apache.calcite.interpreter.Bindables;
|
||||||
import org.apache.calcite.plan.RelOptRule;
|
import org.apache.calcite.plan.RelOptRule;
|
||||||
import org.apache.calcite.plan.volcano.AbstractConverter;
|
import org.apache.calcite.plan.volcano.AbstractConverter;
|
||||||
|
import org.apache.calcite.rel.rules.AggregateExpandDistinctAggregatesRule;
|
||||||
import org.apache.calcite.rel.rules.AggregateJoinTransposeRule;
|
import org.apache.calcite.rel.rules.AggregateJoinTransposeRule;
|
||||||
import org.apache.calcite.rel.rules.AggregateProjectMergeRule;
|
import org.apache.calcite.rel.rules.AggregateProjectMergeRule;
|
||||||
import org.apache.calcite.rel.rules.AggregateProjectPullUpConstantsRule;
|
import org.apache.calcite.rel.rules.AggregateProjectPullUpConstantsRule;
|
||||||
|
@ -202,6 +203,12 @@ public class Rules
|
||||||
rules.addAll(VOLCANO_ABSTRACT_RULES);
|
rules.addAll(VOLCANO_ABSTRACT_RULES);
|
||||||
rules.addAll(RELOPTUTIL_ABSTRACT_RULES);
|
rules.addAll(RELOPTUTIL_ABSTRACT_RULES);
|
||||||
|
|
||||||
|
if (!plannerConfig.isUseApproximateCountDistinct()) {
|
||||||
|
// We'll need this to expand COUNT DISTINCTs.
|
||||||
|
// Avoid AggregateExpandDistinctAggregatesRule.INSTANCE; it uses grouping sets and we don't support those.
|
||||||
|
rules.add(AggregateExpandDistinctAggregatesRule.JOIN);
|
||||||
|
}
|
||||||
|
|
||||||
if (plannerConfig.isUseFallback()) {
|
if (plannerConfig.isUseFallback()) {
|
||||||
rules.add(DruidRelToBindableRule.instance());
|
rules.add(DruidRelToBindableRule.instance());
|
||||||
}
|
}
|
||||||
|
|
|
@ -122,6 +122,14 @@ public class CalciteQueryTest
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
private static final PlannerConfig PLANNER_CONFIG_NO_HLL = new PlannerConfig()
|
||||||
|
{
|
||||||
|
@Override
|
||||||
|
public boolean isUseApproximateCountDistinct()
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
private static final PlannerConfig PLANNER_CONFIG_SELECT_PAGING = new PlannerConfig()
|
private static final PlannerConfig PLANNER_CONFIG_SELECT_PAGING = new PlannerConfig()
|
||||||
{
|
{
|
||||||
@Override
|
@Override
|
||||||
|
@ -721,6 +729,22 @@ public class CalciteQueryTest
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testUnplannableExactCountDistinctQueries() throws Exception
|
||||||
|
{
|
||||||
|
// All of these queries are unplannable in exact COUNT DISTINCT mode.
|
||||||
|
|
||||||
|
final List<String> queries = ImmutableList.of(
|
||||||
|
"SELECT COUNT(distinct dim1), COUNT(distinct dim2) FROM druid.foo", // two COUNT DISTINCTs, same query
|
||||||
|
"SELECT dim1, COUNT(distinct dim1), COUNT(distinct dim2) FROM druid.foo GROUP BY dim1", // two COUNT DISTINCTs
|
||||||
|
"SELECT COUNT(distinct unique_dim1) FROM druid.foo" // COUNT DISTINCT on sketch cannot be exact
|
||||||
|
);
|
||||||
|
|
||||||
|
for (final String query : queries) {
|
||||||
|
assertQueryIsUnplannable(PLANNER_CONFIG_NO_HLL, query);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private void assertQueryIsUnplannable(final String sql)
|
private void assertQueryIsUnplannable(final String sql)
|
||||||
{
|
{
|
||||||
assertQueryIsUnplannable(PLANNER_CONFIG_DEFAULT, sql);
|
assertQueryIsUnplannable(PLANNER_CONFIG_DEFAULT, sql);
|
||||||
|
@ -1869,6 +1893,84 @@ public class CalciteQueryTest
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testExactCountDistinct() throws Exception
|
||||||
|
{
|
||||||
|
// When HLL is disabled, do exact count distinct through a nested query.
|
||||||
|
|
||||||
|
testQuery(
|
||||||
|
PLANNER_CONFIG_NO_HLL,
|
||||||
|
"SELECT COUNT(distinct dim2) FROM druid.foo",
|
||||||
|
ImmutableList.<Query>of(
|
||||||
|
GroupByQuery.builder()
|
||||||
|
.setDataSource(
|
||||||
|
new QueryDataSource(
|
||||||
|
GroupByQuery.builder()
|
||||||
|
.setDataSource(CalciteTests.DATASOURCE1)
|
||||||
|
.setInterval(QSS(Filtration.eternity()))
|
||||||
|
.setGranularity(Granularities.ALL)
|
||||||
|
.setDimensions(DIMS(new DefaultDimensionSpec("dim2", "d0")))
|
||||||
|
.setContext(QUERY_CONTEXT_DEFAULT)
|
||||||
|
.build()
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.setInterval(QSS(Filtration.eternity()))
|
||||||
|
.setGranularity(Granularities.ALL)
|
||||||
|
.setAggregatorSpecs(AGGS(
|
||||||
|
new CountAggregatorFactory("a0")
|
||||||
|
))
|
||||||
|
.setContext(QUERY_CONTEXT_DEFAULT)
|
||||||
|
.build()
|
||||||
|
),
|
||||||
|
ImmutableList.of(
|
||||||
|
new Object[]{3L}
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testExactCountDistinctWithGroupingAndOtherAggregators() throws Exception
|
||||||
|
{
|
||||||
|
// When HLL is disabled, do exact count distinct through a nested query.
|
||||||
|
|
||||||
|
testQuery(
|
||||||
|
PLANNER_CONFIG_NO_HLL,
|
||||||
|
"SELECT dim2, SUM(cnt), COUNT(distinct dim1) FROM druid.foo GROUP BY dim2",
|
||||||
|
ImmutableList.<Query>of(
|
||||||
|
GroupByQuery.builder()
|
||||||
|
.setDataSource(
|
||||||
|
new QueryDataSource(
|
||||||
|
GroupByQuery.builder()
|
||||||
|
.setDataSource(CalciteTests.DATASOURCE1)
|
||||||
|
.setInterval(QSS(Filtration.eternity()))
|
||||||
|
.setGranularity(Granularities.ALL)
|
||||||
|
.setDimensions(DIMS(
|
||||||
|
new DefaultDimensionSpec("dim2", "d0"),
|
||||||
|
new DefaultDimensionSpec("dim1", "d1")
|
||||||
|
))
|
||||||
|
.setAggregatorSpecs(AGGS(new LongSumAggregatorFactory("a0", "cnt")))
|
||||||
|
.setContext(QUERY_CONTEXT_DEFAULT)
|
||||||
|
.build()
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.setInterval(QSS(Filtration.eternity()))
|
||||||
|
.setGranularity(Granularities.ALL)
|
||||||
|
.setDimensions(DIMS(new DefaultDimensionSpec("d0", "d0")))
|
||||||
|
.setAggregatorSpecs(AGGS(
|
||||||
|
new LongSumAggregatorFactory("a0", "a0"),
|
||||||
|
new CountAggregatorFactory("a1")
|
||||||
|
))
|
||||||
|
.setContext(QUERY_CONTEXT_DEFAULT)
|
||||||
|
.build()
|
||||||
|
),
|
||||||
|
ImmutableList.of(
|
||||||
|
new Object[]{"", 3L, 3L},
|
||||||
|
new Object[]{"a", 2L, 2L},
|
||||||
|
new Object[]{"abc", 1L, 1L}
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testApproxCountDistinct() throws Exception
|
public void testApproxCountDistinct() throws Exception
|
||||||
{
|
{
|
||||||
|
@ -2844,7 +2946,12 @@ public class CalciteQueryTest
|
||||||
.setGranularity(Granularities.ALL)
|
.setGranularity(Granularities.ALL)
|
||||||
.setDimensions(
|
.setDimensions(
|
||||||
DIMS(
|
DIMS(
|
||||||
new ExtractionDimensionSpec("dim1", "d0", ValueType.FLOAT, new BucketExtractionFn(1.0, 0.0))
|
new ExtractionDimensionSpec(
|
||||||
|
"dim1",
|
||||||
|
"d0",
|
||||||
|
ValueType.FLOAT,
|
||||||
|
new BucketExtractionFn(1.0, 0.0)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
.setAggregatorSpecs(AGGS(new CountAggregatorFactory("a0")))
|
.setAggregatorSpecs(AGGS(new CountAggregatorFactory("a0")))
|
||||||
|
|
Loading…
Reference in New Issue