From af5a4cce3c33de112894aa4ec80144b98d6bbadf Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Fri, 3 Mar 2017 13:42:30 -0800 Subject: [PATCH] SQL: Clarify approximate distinct count behavior. (#4000) --- docs/content/querying/sql.md | 7 ++-- .../druid/sql/calcite/CalciteQueryTest.java | 32 +++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/docs/content/querying/sql.md b/docs/content/querying/sql.md index f8b87758dc1..93fe8ec037c 100644 --- a/docs/content/querying/sql.md +++ b/docs/content/querying/sql.md @@ -104,10 +104,11 @@ You can access table and column metadata through JDBC using `connection.getMetaD The following SQL queries and features may be executed using approximate algorithms: -- `COUNT(DISTINCT col)` and `APPROX_COUNT_DISTINCT(col)` aggregations use +- `COUNT(DISTINCT col)` and `APPROX_COUNT_DISTINCT(col)` aggregations by default use [HyperLogLog](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf), a fast approximate distinct counting -algorithm. If you need exact distinct counts, set "useApproximateCountDistinct" to "false", either through query -context or through broker configuration. +algorithm. To disable this behavior for `COUNT(DISTINCT col)`, and use exact distinct counts, set +"useApproximateCountDistinct" to "false", either through query context or through broker configuration. +`APPROX_COUNT_DISTINCT(col)` is always approximate, regardless of this setting. - TopN-style queries with a single grouping column, like `SELECT col1, SUM(col2) FROM data_source GROUP BY col1 ORDER BY SUM(col2) DESC LIMIT 100`, by default will be executed as [TopN queries](topnquery.html), which use an approximate algorithm. To disable this behavior, and use exact diff --git a/sql/src/test/java/io/druid/sql/calcite/CalciteQueryTest.java b/sql/src/test/java/io/druid/sql/calcite/CalciteQueryTest.java index 1c742427bc5..69a867d9014 100644 --- a/sql/src/test/java/io/druid/sql/calcite/CalciteQueryTest.java +++ b/sql/src/test/java/io/druid/sql/calcite/CalciteQueryTest.java @@ -1928,6 +1928,38 @@ public class CalciteQueryTest ); } + @Test + public void testApproxCountDistinctWhenHllDisabled() throws Exception + { + // When HLL is disabled, APPROX_COUNT_DISTINCT is still approximate. + + testQuery( + PLANNER_CONFIG_NO_HLL, + "SELECT APPROX_COUNT_DISTINCT(dim2) FROM druid.foo", + ImmutableList.of( + Druids.newTimeseriesQueryBuilder() + .dataSource(CalciteTests.DATASOURCE1) + .intervals(QSS(Filtration.eternity())) + .granularity(Granularities.ALL) + .aggregators( + AGGS( + new CardinalityAggregatorFactory( + "a0", + null, + DIMS(new DefaultDimensionSpec("dim2", null)), + false + ) + ) + ) + .context(TIMESERIES_CONTEXT_DEFAULT) + .build() + ), + ImmutableList.of( + new Object[]{3L} + ) + ); + } + @Test public void testExactCountDistinctWithGroupingAndOtherAggregators() throws Exception {