SQL: Clarify approximate distinct count behavior. (#4000)

This commit is contained in:
Gian Merlino 2017-03-03 13:42:30 -08:00 committed by GitHub
parent 67d0ae3271
commit af5a4cce3c
2 changed files with 36 additions and 3 deletions

View File

@@ -104,10 +104,11 @@ You can access table and column metadata through JDBC using `connection.getMetaD
The following SQL queries and features may be executed using approximate algorithms: The following SQL queries and features may be executed using approximate algorithms:
- `COUNT(DISTINCT col)` and `APPROX_COUNT_DISTINCT(col)` aggregations use - `COUNT(DISTINCT col)` and `APPROX_COUNT_DISTINCT(col)` aggregations by default use
[HyperLogLog](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf), a fast approximate distinct counting [HyperLogLog](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf), a fast approximate distinct counting
algorithm. If you need exact distinct counts, set "useApproximateCountDistinct" to "false", either through query algorithm. To disable this behavior for `COUNT(DISTINCT col)`, and use exact distinct counts, set
context or through broker configuration. "useApproximateCountDistinct" to "false", either through query context or through broker configuration.
`APPROX_COUNT_DISTINCT(col)` is always approximate, regardless of this setting.
- TopN-style queries with a single grouping column, like - TopN-style queries with a single grouping column, like
`SELECT col1, SUM(col2) FROM data_source GROUP BY col1 ORDER BY SUM(col2) DESC LIMIT 100`, by default will be executed `SELECT col1, SUM(col2) FROM data_source GROUP BY col1 ORDER BY SUM(col2) DESC LIMIT 100`, by default will be executed
as [TopN queries](topnquery.html), which use an approximate algorithm. To disable this behavior, and use exact as [TopN queries](topnquery.html), which use an approximate algorithm. To disable this behavior, and use exact

View File

@@ -1928,6 +1928,38 @@ public class CalciteQueryTest
); );
} }
@Test
public void testApproxCountDistinctWhenHllDisabled() throws Exception
{
  // Even under the no-HLL planner config, APPROX_COUNT_DISTINCT must remain approximate:
  // the expected native query still carries a cardinality aggregator over "dim2".
  final CardinalityAggregatorFactory approxDistinctDim2 = new CardinalityAggregatorFactory(
      "a0",
      null,
      DIMS(new DefaultDimensionSpec("dim2", null)),
      false
  );

  // The single native query the SQL statement is expected to be planned into.
  final Query expectedNativeQuery =
      Druids.newTimeseriesQueryBuilder()
            .dataSource(CalciteTests.DATASOURCE1)
            .intervals(QSS(Filtration.eternity()))
            .granularity(Granularities.ALL)
            .aggregators(AGGS(approxDistinctDim2))
            .context(TIMESERIES_CONTEXT_DEFAULT)
            .build();

  // One result row holding the distinct count of dim2.
  testQuery(
      PLANNER_CONFIG_NO_HLL,
      "SELECT APPROX_COUNT_DISTINCT(dim2) FROM druid.foo",
      ImmutableList.<Query>of(expectedNativeQuery),
      ImmutableList.of(new Object[]{3L})
  );
}
@Test @Test
public void testExactCountDistinctWithGroupingAndOtherAggregators() throws Exception public void testExactCountDistinctWithGroupingAndOtherAggregators() throws Exception
{ {