mirror of https://github.com/apache/druid.git
SQL: Ability to generate exact distinct count queries. (#3999)
This commit is contained in:
parent
3e8dbd59f8
commit
4a56d7d8a0
|
@ -106,15 +106,16 @@ The following SQL queries and features may be executed using approximate algorit
|
|||
|
||||
- `COUNT(DISTINCT col)` and `APPROX_COUNT_DISTINCT(col)` aggregations use
|
||||
[HyperLogLog](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf), a fast approximate distinct counting
|
||||
algorithm. If you need exact distinct counts, you can instead use
|
||||
`SELECT COUNT(*) FROM (SELECT DISTINCT col FROM data_source)`, which will use a slower and more resource intensive exact
|
||||
algorithm.
|
||||
algorithm. If you need exact distinct counts, set "useApproximateCountDistinct" to "false", either through query
|
||||
context or through broker configuration.
|
||||
- TopN-style queries with a single grouping column, like
|
||||
`SELECT col1, SUM(col2) FROM data_source GROUP BY col1 ORDER BY SUM(col2) DESC LIMIT 100`, by default will be executed
|
||||
as [TopN queries](topnquery.html), which use an approximate algorithm. To disable this behavior, and use exact
|
||||
algorithms for topN-style queries, set "useApproximateTopN" to "false", either through query context or through broker
|
||||
configuration.
|
||||
|
||||
In both cases, the exact algorithms are generally slower and more resource intensive.
|
||||
|
||||
### Time functions
|
||||
|
||||
Druid's SQL language supports a number of time operations, including:
|
||||
|
|
|
@ -31,6 +31,7 @@ import io.druid.sql.calcite.rule.SelectRules;
|
|||
import org.apache.calcite.interpreter.Bindables;
|
||||
import org.apache.calcite.plan.RelOptRule;
|
||||
import org.apache.calcite.plan.volcano.AbstractConverter;
|
||||
import org.apache.calcite.rel.rules.AggregateExpandDistinctAggregatesRule;
|
||||
import org.apache.calcite.rel.rules.AggregateJoinTransposeRule;
|
||||
import org.apache.calcite.rel.rules.AggregateProjectMergeRule;
|
||||
import org.apache.calcite.rel.rules.AggregateProjectPullUpConstantsRule;
|
||||
|
@ -202,6 +203,12 @@ public class Rules
|
|||
rules.addAll(VOLCANO_ABSTRACT_RULES);
|
||||
rules.addAll(RELOPTUTIL_ABSTRACT_RULES);
|
||||
|
||||
if (!plannerConfig.isUseApproximateCountDistinct()) {
|
||||
// We'll need this to expand COUNT DISTINCTs.
|
||||
// Avoid AggregateExpandDistinctAggregatesRule.INSTANCE; it uses grouping sets and we don't support those.
|
||||
rules.add(AggregateExpandDistinctAggregatesRule.JOIN);
|
||||
}
|
||||
|
||||
if (plannerConfig.isUseFallback()) {
|
||||
rules.add(DruidRelToBindableRule.instance());
|
||||
}
|
||||
|
|
|
@ -122,6 +122,14 @@ public class CalciteQueryTest
|
|||
return 0;
|
||||
}
|
||||
};
|
||||
private static final PlannerConfig PLANNER_CONFIG_NO_HLL = new PlannerConfig()
|
||||
{
|
||||
@Override
|
||||
public boolean isUseApproximateCountDistinct()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
};
|
||||
private static final PlannerConfig PLANNER_CONFIG_SELECT_PAGING = new PlannerConfig()
|
||||
{
|
||||
@Override
|
||||
|
@ -721,6 +729,22 @@ public class CalciteQueryTest
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnplannableExactCountDistinctQueries() throws Exception
|
||||
{
|
||||
// All of these queries are unplannable in exact COUNT DISTINCT mode.
|
||||
|
||||
final List<String> queries = ImmutableList.of(
|
||||
"SELECT COUNT(distinct dim1), COUNT(distinct dim2) FROM druid.foo", // two COUNT DISTINCTs, same query
|
||||
"SELECT dim1, COUNT(distinct dim1), COUNT(distinct dim2) FROM druid.foo GROUP BY dim1", // two COUNT DISTINCTs
|
||||
"SELECT COUNT(distinct unique_dim1) FROM druid.foo" // COUNT DISTINCT on sketch cannot be exact
|
||||
);
|
||||
|
||||
for (final String query : queries) {
|
||||
assertQueryIsUnplannable(PLANNER_CONFIG_NO_HLL, query);
|
||||
}
|
||||
}
|
||||
|
||||
private void assertQueryIsUnplannable(final String sql)
|
||||
{
|
||||
assertQueryIsUnplannable(PLANNER_CONFIG_DEFAULT, sql);
|
||||
|
@ -1869,6 +1893,84 @@ public class CalciteQueryTest
|
|||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExactCountDistinct() throws Exception
|
||||
{
|
||||
// When HLL is disabled, do exact count distinct through a nested query.
|
||||
|
||||
testQuery(
|
||||
PLANNER_CONFIG_NO_HLL,
|
||||
"SELECT COUNT(distinct dim2) FROM druid.foo",
|
||||
ImmutableList.<Query>of(
|
||||
GroupByQuery.builder()
|
||||
.setDataSource(
|
||||
new QueryDataSource(
|
||||
GroupByQuery.builder()
|
||||
.setDataSource(CalciteTests.DATASOURCE1)
|
||||
.setInterval(QSS(Filtration.eternity()))
|
||||
.setGranularity(Granularities.ALL)
|
||||
.setDimensions(DIMS(new DefaultDimensionSpec("dim2", "d0")))
|
||||
.setContext(QUERY_CONTEXT_DEFAULT)
|
||||
.build()
|
||||
)
|
||||
)
|
||||
.setInterval(QSS(Filtration.eternity()))
|
||||
.setGranularity(Granularities.ALL)
|
||||
.setAggregatorSpecs(AGGS(
|
||||
new CountAggregatorFactory("a0")
|
||||
))
|
||||
.setContext(QUERY_CONTEXT_DEFAULT)
|
||||
.build()
|
||||
),
|
||||
ImmutableList.of(
|
||||
new Object[]{3L}
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExactCountDistinctWithGroupingAndOtherAggregators() throws Exception
|
||||
{
|
||||
// When HLL is disabled, do exact count distinct through a nested query.
|
||||
|
||||
testQuery(
|
||||
PLANNER_CONFIG_NO_HLL,
|
||||
"SELECT dim2, SUM(cnt), COUNT(distinct dim1) FROM druid.foo GROUP BY dim2",
|
||||
ImmutableList.<Query>of(
|
||||
GroupByQuery.builder()
|
||||
.setDataSource(
|
||||
new QueryDataSource(
|
||||
GroupByQuery.builder()
|
||||
.setDataSource(CalciteTests.DATASOURCE1)
|
||||
.setInterval(QSS(Filtration.eternity()))
|
||||
.setGranularity(Granularities.ALL)
|
||||
.setDimensions(DIMS(
|
||||
new DefaultDimensionSpec("dim2", "d0"),
|
||||
new DefaultDimensionSpec("dim1", "d1")
|
||||
))
|
||||
.setAggregatorSpecs(AGGS(new LongSumAggregatorFactory("a0", "cnt")))
|
||||
.setContext(QUERY_CONTEXT_DEFAULT)
|
||||
.build()
|
||||
)
|
||||
)
|
||||
.setInterval(QSS(Filtration.eternity()))
|
||||
.setGranularity(Granularities.ALL)
|
||||
.setDimensions(DIMS(new DefaultDimensionSpec("d0", "d0")))
|
||||
.setAggregatorSpecs(AGGS(
|
||||
new LongSumAggregatorFactory("a0", "a0"),
|
||||
new CountAggregatorFactory("a1")
|
||||
))
|
||||
.setContext(QUERY_CONTEXT_DEFAULT)
|
||||
.build()
|
||||
),
|
||||
ImmutableList.of(
|
||||
new Object[]{"", 3L, 3L},
|
||||
new Object[]{"a", 2L, 2L},
|
||||
new Object[]{"abc", 1L, 1L}
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testApproxCountDistinct() throws Exception
|
||||
{
|
||||
|
@ -2844,7 +2946,12 @@ public class CalciteQueryTest
|
|||
.setGranularity(Granularities.ALL)
|
||||
.setDimensions(
|
||||
DIMS(
|
||||
new ExtractionDimensionSpec("dim1", "d0", ValueType.FLOAT, new BucketExtractionFn(1.0, 0.0))
|
||||
new ExtractionDimensionSpec(
|
||||
"dim1",
|
||||
"d0",
|
||||
ValueType.FLOAT,
|
||||
new BucketExtractionFn(1.0, 0.0)
|
||||
)
|
||||
)
|
||||
)
|
||||
.setAggregatorSpecs(AGGS(new CountAggregatorFactory("a0")))
|
||||
|
|
Loading…
Reference in New Issue