SQL: Clarify approximate distinct count behavior. (#4000)

2017-03-03 13:42:30 -08:00 · 2017-03-03 13:42:30 -08:00 · af5a4cce3c
parent 67d0ae3271
commit af5a4cce3c
2 changed files with 36 additions and 3 deletions
--- a/docs/content/querying/sql.md
+++ b/docs/content/querying/sql.md
@@ -104,10 +104,11 @@ You can access table and column metadata through JDBC using `connection.getMetaD
 The following SQL queries and features may be executed using approximate algorithms:
- `COUNT(DISTINCT col)` and `APPROX_COUNT_DISTINCT(col)` aggregations use
+- `COUNT(DISTINCT col)` and `APPROX_COUNT_DISTINCT(col)` aggregations by default use
 [HyperLogLog](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf), a fast approximate distinct counting
-algorithm. If you need exact distinct counts, set "useApproximateCountDistinct" to "false", either through query
+algorithm. To disable this behavior for `COUNT(DISTINCT col)` and use exact distinct counts, set
-context or through broker configuration.
+"useApproximateCountDistinct" to "false", either through query context or through broker configuration.
 `APPROX_COUNT_DISTINCT(col)` is always approximate, regardless of this setting.
 - TopN-style queries with a single grouping column, like
 `SELECT col1, SUM(col2) FROM data_source GROUP BY col1 ORDER BY SUM(col2) DESC LIMIT 100`, by default will be executed
 as [TopN queries](topnquery.html), which use an approximate algorithm. To disable this behavior, and use exact
--- a/sql/src/test/java/io/druid/sql/calcite/CalciteQueryTest.java
+++ b/sql/src/test/java/io/druid/sql/calcite/CalciteQueryTest.java
@@ -1928,6 +1928,38 @@ public class CalciteQueryTest
    );
  }
  /**
   * Checks how {@code APPROX_COUNT_DISTINCT} is planned when the query runs under
   * {@code PLANNER_CONFIG_NO_HLL}.
   *
   * The expected native query is a single timeseries query over {@code CalciteTests.DATASOURCE1}
   * with {@code Granularities.ALL} and one {@code CardinalityAggregatorFactory} named "a0" on
   * {@code dim2}; the expected result is a single row holding {@code 3L}.
   */
  @Test
  public void testApproxCountDistinctWhenHllDisabled() throws Exception
  {
    final String sql = "SELECT APPROX_COUNT_DISTINCT(dim2) FROM druid.foo";

    // When HLL is disabled, APPROX_COUNT_DISTINCT is still approximate, so the plan is
    // expected to keep the cardinality aggregator on dim2.
    final CardinalityAggregatorFactory dim2Cardinality = new CardinalityAggregatorFactory(
        "a0",
        null,
        DIMS(new DefaultDimensionSpec("dim2", null)),
        false
    );

    final Query expectedNativeQuery =
        Druids.newTimeseriesQueryBuilder()
              .dataSource(CalciteTests.DATASOURCE1)
              .intervals(QSS(Filtration.eternity()))
              .granularity(Granularities.ALL)
              .aggregators(AGGS(dim2Cardinality))
              .context(TIMESERIES_CONTEXT_DEFAULT)
              .build();

    testQuery(
        PLANNER_CONFIG_NO_HLL,
        sql,
        ImmutableList.<Query>of(expectedNativeQuery),
        ImmutableList.of(
            new Object[]{3L}
        )
    );
  }
  @Test
  public void testExactCountDistinctWithGroupingAndOtherAggregators() throws Exception
  {