From 8cb62cd32bed5c1c8140b414da1dd144a79219e5 Mon Sep 17 00:00:00 2001 From: YuCheng Hu Date: Thu, 29 Jul 2021 12:47:06 -0400 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=9F=A5=E8=AF=A2=E9=9C=80?= =?UTF-8?q?=E8=A6=81=E7=9A=84=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- querying/groupbyquery.md | 449 +++++++++++++ querying/query-execution.md | 118 ++++ querying/querying.md | 125 ++++ querying/sql.md | 1270 +++++++++++++++++++++++++++++++++++ querying/topnquery.md | 261 +++++++ 5 files changed, 2223 insertions(+) create mode 100644 querying/groupbyquery.md create mode 100644 querying/query-execution.md create mode 100644 querying/querying.md create mode 100644 querying/sql.md create mode 100644 querying/topnquery.md diff --git a/querying/groupbyquery.md b/querying/groupbyquery.md new file mode 100644 index 0000000..d130fa4 --- /dev/null +++ b/querying/groupbyquery.md @@ -0,0 +1,449 @@ +# GroupBy 查询 + +> Apache Druid supports two query languages: [Druid SQL](sql.md) and [native queries](querying.md). +> This document describes a query +> type in the native language. For information about when Druid SQL will use this query type, refer to the +> [SQL documentation](sql.md#query-types). + +These types of Apache Druid queries take a groupBy query object and return an array of JSON objects where each object represents a +grouping asked for by the query. + +> Note: If you are doing aggregations with time as your only grouping, or an ordered groupBy over a single dimension, +> consider [Timeseries](timeseriesquery.md) and [TopN](topnquery.md) queries as well as +> groupBy. Their performance may be better in some cases. See [Alternatives](#alternatives) below for more details. + +一个分组查询(groupBy query)对象的查询脚本如下示例: + +``` json +{ + "queryType": "groupBy", + "dataSource": "sample_datasource", + "granularity": "day", + "dimensions": ["country", "device"], + "limitSpec": { "type": "default", "limit": 5000, "columns": ["country", "data_transfer"] }, + "filter": { + "type": "and", + "fields": [ + { "type": "selector", "dimension": "carrier", "value": "AT&T" }, + { "type": "or", + "fields": [ + { "type": "selector", "dimension": "make", "value": "Apple" }, + { "type": "selector", "dimension": "make", "value": "Samsung" } + ] + } + ] + }, + "aggregations": [ + { "type": "longSum", "name": "total_usage", "fieldName": "user_count" }, + { "type": "doubleSum", "name": "data_transfer", "fieldName": "data_transfer" } + ], + "postAggregations": [ + { "type": "arithmetic", + "name": "avg_usage", + "fn": "/", + "fields": [ + { "type": "fieldAccess", "fieldName": "data_transfer" }, + { "type": "fieldAccess", "fieldName": "total_usage" } + ] + } + ], + "intervals": [ "2012-01-01T00:00:00.000/2012-01-03T00:00:00.000" ], + "having": { + "type": "greaterThan", + "aggregation": "total_usage", + "value": 100 + } +} +``` + +Following are main parts to a groupBy query: + +|property|description|required?| +|--------|-----------|---------| +|queryType|This String should always be "groupBy"; this is the first thing Druid looks at to figure out how to interpret the query|yes| +|dataSource|A String or Object defining the data source to query, very similar to a table in a relational database. See [DataSource](../querying/datasource.md) for more information.|yes| +|dimensions|A JSON list of dimensions to do the groupBy over; or see [DimensionSpec](../querying/dimensionspecs.md) for ways to extract dimensions. 
|yes| +|limitSpec|See [LimitSpec](../querying/limitspec.md).|no| +|having|See [Having](../querying/having.md).|no| +|granularity|Defines the granularity of the query. See [Granularities](../querying/granularities.md)|yes| +|filter|See [Filters](../querying/filters.md)|no| +|aggregations|See [Aggregations](../querying/aggregations.md)|no| +|postAggregations|See [Post Aggregations](../querying/post-aggregations.md)|no| +|intervals|A JSON Object representing ISO-8601 Intervals. This defines the time ranges to run the query over.|yes| +|subtotalsSpec| A JSON array of arrays to return additional result sets for groupings of subsets of top level `dimensions`. It is [described later](groupbyquery.md#more-on-subtotalsspec) in more detail.|no| +|context|An additional JSON Object which can be used to specify certain flags.|no| + +To pull it all together, the above query would return *n\*m* data points, up to a maximum of 5000 points, where n is the cardinality of the `country` dimension, m is the cardinality of the `device` dimension, each day between 2012-01-01 and 2012-01-03, from the `sample_datasource` table. Each data point contains the (long) sum of `total_usage` if the value of the data point is greater than 100, the (double) sum of `data_transfer` and the (double) result of `total_usage` divided by `data_transfer` for the filter set for a particular grouping of `country` and `device`. The output looks like this: + +```json +[ + { + "version" : "v1", + "timestamp" : "2012-01-01T00:00:00.000Z", + "event" : { + "country" : , + "device" : , + "total_usage" : , + "data_transfer" :, + "avg_usage" : + } + }, + { + "version" : "v1", + "timestamp" : "2012-01-01T00:00:12.000Z", + "event" : { + "dim1" : , + "dim2" : , + "sample_name1" : , + "sample_name2" :, + "avg_usage" : + } + }, +... +] +``` + +## Behavior on multi-value dimensions + +groupBy queries can group on multi-value dimensions. When grouping on a multi-value dimension, _all_ values +from matching rows will be used to generate one group per value. It's possible for a query to return more groups than +there are rows. For example, a groupBy on the dimension `tags` with filter `"t1" AND "t3"` would match only row1, and +generate a result with three groups: `t1`, `t2`, and `t3`. If you only need to include values that match +your filter, you can use a [filtered dimensionSpec](dimensionspecs.md#filtered-dimensionspecs). This can also +improve performance. + +See [Multi-value dimensions](multi-value-dimensions.md) for more details. + +## More on subtotalsSpec + +The subtotals feature allows computation of multiple sub-groupings in a single query. To use this feature, add a "subtotalsSpec" to your query as a list of subgroup dimension sets. It should contain the `outputName` from dimensions in your `dimensions` attribute, in the same order as they appear in the `dimensions` attribute (although, of course, you may skip some). + +For example, consider a groupBy query like this one: + +```json +{ +"type": "groupBy", + ... + ... +"dimensions": [ + { + "type" : "default", + "dimension" : "d1col", + "outputName": "D1" + }, + { + "type" : "extraction", + "dimension" : "d2col", + "outputName" : "D2", + "extractionFn" : extraction_func + }, + { + "type":"lookup", + "dimension":"d3col", + "outputName":"D3", + "name":"my_lookup" + } +], +... +... +"subtotalsSpec":[ ["D1", "D2", D3"], ["D1", "D3"], ["D3"]], +.. 
+ +} +``` + +The result of the subtotalsSpec would be equivalent to concatenating the result of three groupBy queries, with the "dimensions" field being `["D1", "D2", D3"]`, `["D1", "D3"]` and `["D3"]`, given the `DimensionSpec` shown above. +The response for the query above would look something like: + +```json +[ + { + "version" : "v1", + "timestamp" : "t1", + "event" : { "D1": "..", "D2": "..", "D3": ".." } + } + }, + { + "version" : "v1", + "timestamp" : "t2", + "event" : { "D1": "..", "D2": "..", "D3": ".." } + } + }, + ... + ... + + { + "version" : "v1", + "timestamp" : "t1", + "event" : { "D1": "..", "D2": null, "D3": ".." } + } + }, + { + "version" : "v1", + "timestamp" : "t2", + "event" : { "D1": "..", "D2": null, "D3": ".." } + } + }, + ... + ... + + { + "version" : "v1", + "timestamp" : "t1", + "event" : { "D1": null, "D2": null, "D3": ".." } + } + }, + { + "version" : "v1", + "timestamp" : "t2", + "event" : { "D1": null, "D2": null, "D3": ".." } + } + }, +... +] +``` + +> Notice that dimensions that are not included in an individual subtotalsSpec grouping are returned with a `null` value. This response format represents a behavior change as of Apache Druid 0.18.0. +> In release 0.17.0 and earlier, such dimensions were entirely excluded from the result. If you were relying on this old behavior to determine whether a particular dimension was not part of +> a subtotal grouping, you can now use [Grouping aggregator](aggregations.md#grouping-aggregator) instead. + + +## Implementation details + +### Strategies + +GroupBy queries can be executed using two different strategies. The default strategy for a cluster is determined by the +"druid.query.groupBy.defaultStrategy" runtime property on the Broker. This can be overridden using "groupByStrategy" in +the query context. If neither the context field nor the property is set, the "v2" strategy will be used. + +- "v2", the default, is designed to offer better performance and memory management. This strategy generates +per-segment results using a fully off-heap map. Data processes merge the per-segment results using a fully off-heap +concurrent facts map combined with an on-heap string dictionary. This may optionally involve spilling to disk. Data +processes return sorted results to the Broker, which merges result streams using an N-way merge. The broker materializes +the results if necessary (e.g. if the query sorts on columns other than its dimensions). Otherwise, it streams results +back as they are merged. + +- "v1", a legacy engine, generates per-segment results on data processes (Historical, realtime, MiddleManager) using a map which +is partially on-heap (dimension keys and the map itself) and partially off-heap (the aggregated values). Data processes then +merge the per-segment results using Druid's indexing mechanism. This merging is multi-threaded by default, but can +optionally be single-threaded. The Broker merges the final result set using Druid's indexing mechanism again. The broker +merging is always single-threaded. Because the Broker merges results using the indexing mechanism, it must materialize +the full result set before returning any results. On both the data processes and the Broker, the merging index is fully +on-heap by default, but it can optionally store aggregated values off-heap. 
+ +### Differences between v1 and v2 + +Query API and results are compatible between the two engines; however, there are some differences from a cluster +configuration perspective: + +- groupBy v1 controls resource usage using a row-based limit (maxResults) whereas groupBy v2 uses bytes-based limits. +In addition, groupBy v1 merges results on-heap, whereas groupBy v2 merges results off-heap. These factors mean that +memory tuning and resource limits behave differently between v1 and v2. In particular, due to this, some queries +that can complete successfully in one engine may exceed resource limits and fail with the other engine. See the +"Memory tuning and resource limits" section for more details. +- groupBy v1 imposes no limit on the number of concurrently running queries, whereas groupBy v2 controls memory usage +by using a finite-sized merge buffer pool. By default, the number of merge buffers is 1/4 the number of processing +threads. You can adjust this as necessary to balance concurrency and memory usage. +- groupBy v1 supports caching on either the Broker or Historical processes, whereas groupBy v2 only supports caching on +Historical processes. +- groupBy v2 supports both array-based aggregation and hash-based aggregation. The array-based aggregation is used only +when the grouping key is a single indexed string column. In array-based aggregation, the dictionary-encoded value is used +as the index, so the aggregated values in the array can be accessed directly without finding buckets based on hashing. + +### Memory tuning and resource limits + +When using groupBy v2, three parameters control resource usage and limits: + +- `druid.processing.buffer.sizeBytes`: size of the off-heap hash table used for aggregation, per query, in bytes. At +most `druid.processing.numMergeBuffers` of these will be created at once, which also serves as an upper limit on the +number of concurrently running groupBy queries. +- `druid.query.groupBy.maxMergingDictionarySize`: size of the on-heap dictionary used when grouping on strings, per query, +in bytes. Note that this is based on a rough estimate of the dictionary size, not the actual size. +- `druid.query.groupBy.maxOnDiskStorage`: amount of space on disk used for aggregation, per query, in bytes. By default, +this is 0, which means aggregation will not use disk. + +If `maxOnDiskStorage` is 0 (the default) then a query that exceeds either the on-heap dictionary limit, or the off-heap +aggregation table limit, will fail with a "Resource limit exceeded" error describing the limit that was exceeded. + +If `maxOnDiskStorage` is greater than 0, queries that exceed the in-memory limits will start using disk for aggregation. +In this case, when either the on-heap dictionary or off-heap hash table fills up, partially aggregated records will be +sorted and flushed to disk. Then, both in-memory structures will be cleared out for further aggregation. Queries that +then go on to exceed `maxOnDiskStorage` will fail with a "Resource limit exceeded" error indicating that they ran out of +disk space. + +With groupBy v2, cluster operators should make sure that the off-heap hash tables and on-heap merging dictionaries +will not exceed available memory for the maximum possible concurrent query load (given by +`druid.processing.numMergeBuffers`). See the [basic cluster tuning guide](../operations/basic-cluster-tuning.md) +for more details about direct memory usage, organized by Druid process type. + +Brokers do not need merge buffers for basic groupBy queries. 
Queries with subqueries (using a `query` dataSource) require one merge buffer if there is a single subquery, or two merge buffers if there is more than one layer of nested subqueries. Queries with [subtotals](groupbyquery.md#more-on-subtotalsspec) need one merge buffer. These can stack on top of each other: a groupBy query with multiple layers of nested subqueries, and that also uses subtotals, will need three merge buffers.

Historicals and ingestion tasks need one merge buffer for each groupBy query, unless [parallel combination](groupbyquery.md#parallel-combine) is enabled, in which case they need two merge buffers per query.

When using groupBy v1, all aggregation is done on-heap, and resource limits are set through the parameter `druid.query.groupBy.maxResults`. This is a cap on the maximum number of results in a result set. Queries that exceed this limit will fail with a "Resource limit exceeded" error indicating they exceeded their row limit. Cluster operators should make sure that the on-heap aggregations will not exceed available JVM heap space for the expected concurrent query load.

### Performance tuning for groupBy v2

#### Limit pushdown optimization

Druid pushes down the `limit` spec in groupBy queries to the segments on Historicals wherever possible, to prune unnecessary intermediate results early and minimize the amount of data transferred to Brokers. By default, this technique is applied only when all fields in the `orderBy` spec are a subset of the grouping keys. This is because `limitPushDown` doesn't guarantee exact results if the `orderBy` spec includes any fields that are not in the grouping keys. However, you can enable this technique even in such cases if you are willing to sacrifice some accuracy for faster query processing, as in topN queries. See `forceLimitPushDown` in [advanced groupBy v2 configurations](#groupby-v2-configurations).

#### Optimizing hash table

The groupBy v2 engine uses an open addressing hash table for aggregation. The hash table is initialized with a given number of initial buckets and grows gradually as the buffer fills up. On hash collisions, linear probing is used.

The default number of initial buckets is 1024 and the default max load factor of the hash table is 0.7. If you see too many collisions in the hash table, you can adjust these numbers. See `bufferGrouperInitialBuckets` and `bufferGrouperMaxLoadFactor` in [Advanced groupBy v2 configurations](#groupby-v2-configurations).

#### Parallel combine

Once a Historical finishes aggregation using the hash table, it sorts the aggregated results and merges them before sending them to the Broker for N-way merge aggregation. By default, Historicals use all their available processing threads (configured by `druid.processing.numThreads`) for aggregation, but use a single thread for sorting and merging aggregates; this is the HTTP thread that sends data to Brokers.

This is to prevent some heavy groupBy queries from blocking other queries. In Druid, the processing threads are shared between all submitted queries and they are _not interruptible_. This means that if a heavy query takes all available processing threads, all other queries might be blocked until the heavy query is finished. Because groupBy queries usually take longer than timeseries or topN queries, they should release processing threads as soon as possible.

However, you might care about the performance of some particularly heavy groupBy queries. Usually, the performance bottleneck of heavy groupBy queries is merging sorted aggregates. In such cases, you can use processing threads for merging as well. This is called _parallel combine_. To enable parallel combine, see `numParallelCombineThreads` in [Advanced groupBy v2 configurations](#groupby-v2-configurations). Note that parallel combine can be enabled only when data is actually spilled (see [Memory tuning and resource limits](#memory-tuning-and-resource-limits)).

Once parallel combine is enabled, the groupBy v2 engine can create a combining tree for merging sorted aggregates. Each intermediate node of the tree is a thread that merges aggregates from its child nodes. The leaf node threads read and merge aggregates from hash tables, including spilled ones. The leaf threads are usually slower than the intermediate ones because they need to read data from disk. As a result, fewer threads are used for intermediate nodes by default. You can change the degree of intermediate nodes. See `intermediateCombineDegree` in [Advanced groupBy v2 configurations](#groupby-v2-configurations).

Please note that each Historical needs two merge buffers to process a groupBy v2 query with parallel combine: one for computing intermediate aggregates from each segment and another for combining intermediate aggregates in parallel.


### Alternatives

There are some situations where other query types may be a better choice than groupBy.

- For queries with no "dimensions" (i.e. grouping by time only), the [Timeseries query](timeseriesquery.md) will generally be faster than groupBy. The major differences are that it is implemented in a fully streaming manner (taking advantage of the fact that segments are already sorted on time) and does not need to use a hash table for merging.

- For queries with a single "dimensions" element (i.e. grouping by one string dimension), the [TopN query](topnquery.md) will sometimes be faster than groupBy. This is especially true if you are ordering by a metric and find approximate results acceptable.

### Nested groupBys

Nested groupBys (dataSource of type "query") are performed differently for "v1" and "v2". The Broker first runs the inner groupBy query in the usual way. The "v1" strategy then materializes the inner query's results on-heap with Druid's indexing mechanism, and runs the outer query on these materialized results. The "v2" strategy runs the outer query on the inner query's result stream with an off-heap fact map and an on-heap string dictionary that can spill to disk. Both strategies perform the outer query on the Broker in a single-threaded fashion.

### Configurations

This section describes the configurations for groupBy queries. You can set the runtime properties in the `runtime.properties` file on Broker, Historical, and MiddleManager processes. You can set the query context parameters through the [query context](query-context.md).

#### Configurations for groupBy v2

Supported runtime properties:

|Property|Description|Default|
|--------|-----------|-------|
|`druid.query.groupBy.maxMergingDictionarySize`|Maximum amount of heap space (approximately) to use for the string dictionary during merging. When the dictionary exceeds this size, a spill to disk will be triggered.|100000000|
|`druid.query.groupBy.maxOnDiskStorage`|Maximum amount of disk space to use, per-query, for spilling result sets to disk when either the merging buffer or the dictionary fills up. Queries that exceed this limit will fail. 
Set to zero to disable disk spilling.|0 (disabled)| + +Supported query contexts: + +|Key|Description| +|---|-----------| +|`maxMergingDictionarySize`|Can be used to lower the value of `druid.query.groupBy.maxMergingDictionarySize` for this query.| +|`maxOnDiskStorage`|Can be used to lower the value of `druid.query.groupBy.maxOnDiskStorage` for this query.| + + +### Advanced configurations + +#### Common configurations for all groupBy strategies + +Supported runtime properties: + +|Property|Description|Default| +|--------|-----------|-------| +|`druid.query.groupBy.defaultStrategy`|Default groupBy query strategy.|v2| +|`druid.query.groupBy.singleThreaded`|Merge results using a single thread.|false| + +Supported query contexts: + +|Key|Description| +|---|-----------| +|`groupByStrategy`|Overrides the value of `druid.query.groupBy.defaultStrategy` for this query.| +|`groupByIsSingleThreaded`|Overrides the value of `druid.query.groupBy.singleThreaded` for this query.| + + +#### GroupBy v2 configurations + +Supported runtime properties: + +|Property|Description|Default| +|--------|-----------|-------| +|`druid.query.groupBy.bufferGrouperInitialBuckets`|Initial number of buckets in the off-heap hash table used for grouping results. Set to 0 to use a reasonable default (1024).|0| +|`druid.query.groupBy.bufferGrouperMaxLoadFactor`|Maximum load factor of the off-heap hash table used for grouping results. When the load factor exceeds this size, the table will be grown or spilled to disk. Set to 0 to use a reasonable default (0.7).|0| +|`druid.query.groupBy.forceHashAggregation`|Force to use hash-based aggregation.|false| +|`druid.query.groupBy.intermediateCombineDegree`|Number of intermediate nodes combined together in the combining tree. Higher degrees will need less threads which might be helpful to improve the query performance by reducing the overhead of too many threads if the server has sufficiently powerful cpu cores.|8| +|`druid.query.groupBy.numParallelCombineThreads`|Hint for the number of parallel combining threads. This should be larger than 1 to turn on the parallel combining feature. The actual number of threads used for parallel combining is min(`druid.query.groupBy.numParallelCombineThreads`, `druid.processing.numThreads`).|1 (disabled)| +|`druid.query.groupBy.applyLimitPushDownToSegment`|If Broker pushes limit down to queryable data server (historicals, peons) then limit results during segment scan. If typically there are a large number of segments taking part in a query on a data server, this setting may counterintuitively reduce performance if enabled.|false (disabled)| + +Supported query contexts: + +|Key|Description|Default| +|---|-----------|-------| +|`bufferGrouperInitialBuckets`|Overrides the value of `druid.query.groupBy.bufferGrouperInitialBuckets` for this query.|None| +|`bufferGrouperMaxLoadFactor`|Overrides the value of `druid.query.groupBy.bufferGrouperMaxLoadFactor` for this query.|None| +|`forceHashAggregation`|Overrides the value of `druid.query.groupBy.forceHashAggregation`|None| +|`intermediateCombineDegree`|Overrides the value of `druid.query.groupBy.intermediateCombineDegree`|None| +|`numParallelCombineThreads`|Overrides the value of `druid.query.groupBy.numParallelCombineThreads`|None| +|`sortByDimsFirst`|Sort the results first by dimension values and then by timestamp.|false| +|`forceLimitPushDown`|When all fields in the orderby are part of the grouping key, the Broker will push limit application down to the Historical processes. 
When the sorting order uses fields that are not in the grouping key, applying this optimization can result in approximate results with unknown accuracy, so this optimization is disabled by default in that case. Enabling this context flag turns on limit push down for limit/orderbys that contain non-grouping key columns.|false| +|`applyLimitPushDownToSegment`|If Broker pushes limit down to queryable nodes (historicals, peons) then limit results during segment scan. This context value can be used to override `druid.query.groupBy.applyLimitPushDownToSegment`.|true| + + +#### GroupBy v1 configurations + +Supported runtime properties: + +|Property|Description|Default| +|--------|-----------|-------| +|`druid.query.groupBy.maxIntermediateRows`|Maximum number of intermediate rows for the per-segment grouping engine. This is a tuning parameter that does not impose a hard limit; rather, it potentially shifts merging work from the per-segment engine to the overall merging index. Queries that exceed this limit will not fail.|50000| +|`druid.query.groupBy.maxResults`|Maximum number of results. Queries that exceed this limit will fail.|500000| + +Supported query contexts: + +|Key|Description|Default| +|---|-----------|-------| +|`maxIntermediateRows`|Can be used to lower the value of `druid.query.groupBy.maxIntermediateRows` for this query.|None| +|`maxResults`|Can be used to lower the value of `druid.query.groupBy.maxResults` for this query.|None| +|`useOffheap`|Set to true to store aggregations off-heap when merging results.|false| + +#### Array based result rows + +Internally Druid always uses an array based representation of groupBy result rows, but by default this is translated +into a map based result format at the Broker. To reduce the overhead of this translation, results may also be returned +from the Broker directly in the array based format if `resultAsArray` is set to `true` on the query context. + +Each row is positional, and has the following fields, in order: + +* Timestamp (optional; only if granularity != ALL) +* Dimensions (in order) +* Aggregators (in order) +* Post-aggregators (optional; in order, if present) + +This schema is not available on the response, so it must be computed from the issued query in order to properly read +the results. diff --git a/querying/query-execution.md b/querying/query-execution.md new file mode 100644 index 0000000..811a6ef --- /dev/null +++ b/querying/query-execution.md @@ -0,0 +1,118 @@ +--- +id: query-execution +title: "Query execution" +--- + + + +> This document describes how Druid executes [native queries](querying.md), but since [Druid SQL](sql.md) queries +> are translated to native queries, this document applies to the SQL runtime as well. Refer to the SQL +> [Query translation](sql.md#query-translation) page for information about how SQL queries are translated to native +> queries. + +Druid's approach to query execution varies depending on the kind of [datasource](datasource.md) you are querying. + +## Datasource type + +### `table` + +Queries that operate directly on [table datasources](datasource.md#table) are executed using a scatter-gather approach +led by the Broker process. The process looks like this: + +1. The Broker identifies which [segments](../design/segments.md) are relevant to the query based on the `"intervals"` +parameter. Segments are always partitioned by time, so any segment whose interval overlaps the query interval is +potentially relevant. + +2. 
The Broker may additionally further prune the segment list based on the `"filter"`, if the input data was partitioned +by range using the [`single_dim` partitionsSpec](../ingestion/native-batch.md#partitionsspec), and if the filter matches +the dimension used for partitioning. + +3. The Broker, having pruned the list of segments for the query, forwards the query to data servers (like Historicals +and tasks running on MiddleManagers) that are currently serving those segments. + +4. For all query types except [Scan](scan-query.md), data servers process each segment in parallel and generate partial +results for each segment. The specific processing that is done depends on the query type. These partial results may be +cached if [query caching](caching.md) is enabled. For Scan queries, segments are processed in order by a single thread, +and results are not cached. + +5. The Broker receives partial results from each data server, merges them into the final result set, and returns them +to the caller. For Timeseries and Scan queries, and for GroupBy queries where there is no sorting, the Broker is able to +do this in a streaming fashion. Otherwise, the Broker fully computes the result set before returning anything. + +### `lookup` + +Queries that operate directly on [lookup datasources](datasource.md#lookup) (without a join) are executed on the Broker +that received the query, using its local copy of the lookup. All registered lookup tables are preloaded in-memory on the +Broker. The query runs single-threaded. + +Execution of queries that use lookups as right-hand inputs to a join are executed in a way that depends on their +"base" (bottom-leftmost) datasource, as described in the [join](#join) section below. + +### `union` + +Queries that operate directly on [union datasources](datasource.md#union) are split up on the Broker into a separate +query for each table that is part of the union. Each of these queries runs separately, and the Broker merges their +results together. + +### `inline` + +Queries that operate directly on [inline datasources](datasource.md#inline) are executed on the Broker that received the +query. The query runs single-threaded. + +Execution of queries that use inline datasources as right-hand inputs to a join are executed in a way that depends on +their "base" (bottom-leftmost) datasource, as described in the [join](#join) section below. + +### `query` + +[Query datasources](datasource.md#query) are subqueries. Each subquery is executed as if it was its own query and +the results are brought back to the Broker. Then, the Broker continues on with the rest of the query as if the subquery +was replaced with an inline datasource. + +In most cases, subquery results are fully buffered in memory on the Broker before the rest of the query proceeds, +meaning subqueries execute sequentially. The total number of rows buffered across all subqueries of a given query +in this way cannot exceed the [`druid.server.http.maxSubqueryRows` property](../configuration/index.md). + +There is one exception: if the outer query and all subqueries are the [groupBy](groupbyquery.md) type, then subquery +results can be processed in a streaming fashion and the `druid.server.http.maxSubqueryRows` limit does not apply. + +### `join` + +[Join datasources](datasource.md#join) are handled using a broadcast hash-join approach. + +1. The Broker executes any subqueries that are inputs the join, as described in the [query](#query) section, and +replaces them with inline datasources. + +2. 
The Broker flattens a join tree, if present, into a "base" datasource (the bottom-leftmost one) and other leaf +datasources (the rest). + +3. Query execution proceeds using the same structure that the base datasource would use on its own. If the base +datasource is a [table](#table), segments are pruned based on `"intervals"` as usual, and the query is executed on the +cluster by forwarding it to all relevant data servers in parallel. If the base datasource is a [lookup](#lookup) or +[inline](#inline) datasource (including an inline datasource that was the result of inlining a subquery), the query is +executed on the Broker itself. The base query cannot be a union, because unions are not currently supported as inputs to +a join. + +4. Before beginning to process the base datasource, the server(s) that will execute the query first inspect all the +non-base leaf datasources to determine if a new hash table needs to be built for the upcoming hash join. Currently, +lookups do not require new hash tables to be built (because they are preloaded), but inline datasources do. + +5. Query execution proceeds again using the same structure that the base datasource would use on its own, with one +addition: while processing the base datasource, Druid servers will use the hash tables built from the other join inputs +to produce the join result row-by-row, and query engines will operate on the joined rows rather than the base rows. diff --git a/querying/querying.md b/querying/querying.md new file mode 100644 index 0000000..ba05285 --- /dev/null +++ b/querying/querying.md @@ -0,0 +1,125 @@ +# 原生查询 + +> Apache Druid supports two query languages: [Druid SQL](../querying/sql.md) and [native queries](../querying/querying.md). +> This document describes the +> native query language. For information about how Druid SQL chooses which native query types to use when +> it runs a SQL query, refer to the [SQL documentation](../querying/sql.md#query-types). + +Native queries in Druid are JSON objects and are typically issued to the Broker or Router processes. Queries can be +posted like this: + +```bash +curl -X POST ':/druid/v2/?pretty' -H 'Content-Type:application/json' -H 'Accept:application/json' -d @ +``` + +> Replace `:` with the appropriate address and port for your system. For example, if running the quickstart configuration, replace `:` with localhost:8888. + +You can also enter them directly in the Druid console's Query view. Simply pasting a native query into the console switches the editor into JSON mode. + +![Native query](../assets/native-queries-01.png "Native query") + +Druid's native query language is JSON over HTTP, although many members of the community have contributed different +[client libraries](https://druid.apache.org/libraries.html) in other languages to query Druid. + +The Content-Type/Accept Headers can also take 'application/x-jackson-smile'. + +```bash +curl -X POST ':/druid/v2/?pretty' -H 'Content-Type:application/json' -H 'Accept:application/x-jackson-smile' -d @ +``` + +> If the Accept header is not provided, it defaults to the value of 'Content-Type' header. + +Druid's native query is relatively low level, mapping closely to how computations are performed internally. Druid queries +are designed to be lightweight and complete very quickly. This means that for more complex analysis, or to build +more complex visualizations, multiple Druid queries may be required. 
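
For reference, a minimal native query body that could be saved to a JSON file and posted with the `curl` commands shown earlier might look like the following sketch. The datasource name and interval are placeholders; the [Timeseries](../querying/timeseriesquery.md) query type and `count` aggregator used here are documented elsewhere in these pages.

```json
{
  "queryType": "timeseries",
  "dataSource": "sample_datasource",
  "granularity": "day",
  "aggregations": [
    { "type": "count", "name": "rows" }
  ],
  "intervals": ["2012-01-01T00:00:00.000/2012-01-03T00:00:00.000"]
}
```
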
+ +Even though queries are typically made to Brokers or Routers, they can also be accepted by +[Historical](../design/historical.md) processes and by [Peons (task JVMs)](../design/peons.md)) that are running +stream ingestion tasks. This may be valuable if you want to query results for specific segments that are served by +specific processes. + +## Available queries + +Druid has numerous query types for various use cases. Queries are composed of various JSON properties and Druid has different types of queries for different use cases. The documentation for the various query types describe all the JSON properties that can be set. + +### Aggregation queries + +* [Timeseries](../querying/timeseriesquery.md) +* [TopN](../querying/topnquery.md) +* [GroupBy](../querying/groupbyquery.md) + +### Metadata queries + +* [TimeBoundary](../querying/timeboundaryquery.md) +* [SegmentMetadata](../querying/segmentmetadataquery.md) +* [DatasourceMetadata](../querying/datasourcemetadataquery.md) + +### Other queries + +* [Scan](../querying/scan-query.md) +* [Search](../querying/searchquery.md) + +## Which query type should I use? + +For aggregation queries, if more than one would satisfy your needs, we generally recommend using Timeseries or TopN +whenever possible, as they are specifically optimized for their use cases. If neither is a good fit, you should use +the GroupBy query, which is the most flexible. + +## Query cancellation + +Queries can be cancelled explicitly using their unique identifier. If the +query identifier is set at the time of query, or is otherwise known, the following +endpoint can be used on the Broker or Router to cancel the query. + +```sh +DELETE /druid/v2/{queryId} +``` + +For example, if the query ID is `abc123`, the query can be cancelled as follows: + +```sh +curl -X DELETE "http://host:port/druid/v2/abc123" +``` + +## Query errors + +### Authentication and authorization failures + +For [secured](../design/auth.md) Druid clusters, query requests respond with an HTTP 401 response code in case of an authentication failure. For authorization failures, an HTTP 403 response code is returned. + +### Query execution failures + +If a query fails, Druid returns a response with an HTTP response code and a JSON object with the following structure: + +```json +{ + "error" : "Query timeout", + "errorMessage" : "Timeout waiting for task.", + "errorClass" : "java.util.concurrent.TimeoutException", + "host" : "druid1.example.com:8083" +} +``` + +The fields in the response are: + +|field|description| +|-----|-----------| +|error|A well-defined error code (see below).| +|errorMessage|A free-form message with more information about the error. May be null.| +|errorClass|The class of the exception that caused this error. May be null.| +|host|The host on which this error occurred. May be null.| + +Possible Druid error codes for the `error` field include: + +|Error code|HTTP response code|description| +|----|-----------|-----------| +|`SQL parse failed`|400|Only for SQL queries. The SQL query failed to parse.| +|`Plan validation failed`|400|Only for SQL queries. The SQL query failed to validate.| +|`Resource limit exceeded`|400|The query exceeded a configured resource limit (e.g. groupBy maxResults).| +|`Query capacity exceeded`|429|The query failed to execute because of the lack of resources available at the time when the query was submitted. 
The resources could be any runtime resources such as [query scheduler lane capacity](../configuration/index.md#query-prioritization-and-laning), merge buffers, and so on. The error message should have more details about the failure.|
|`Unsupported operation`|501|The query attempted to perform an unsupported operation. This may occur when using undocumented features or when using an incompletely implemented extension.|
|`Query timeout`|504|The query timed out.|
|`Query interrupted`|500|The query was interrupted, possibly due to JVM shutdown.|
|`Query cancelled`|500|The query was cancelled through the query cancellation API.|
|`Truncated response context`|500|An intermediate response context for the query exceeded the built-in limit of 7KiB.<br/><br/>The response context is an internal data structure that Druid servers use to share out-of-band information when sending query results to each other. It is serialized in an HTTP header with a maximum length of 7KiB. This error occurs when an intermediate response context sent from a data server (like a Historical) to the Broker exceeds this limit.<br/><br/>
The response context is used for a variety of purposes, but the one most likely to generate a large context is sharing details about segments that move during a query. That means this error can potentially indicate that a very large number of segments moved in between the time a Broker issued a query and the time it was processed on Historicals. This should rarely, if ever, occur during normal operation.| +|`Unknown exception`|500|Some other exception occurred. Check errorMessage and errorClass for details, although keep in mind that the contents of those fields are free-form and may change from release to release.| \ No newline at end of file diff --git a/querying/sql.md b/querying/sql.md new file mode 100644 index 0000000..fd5903d --- /dev/null +++ b/querying/sql.md @@ -0,0 +1,1270 @@ +--- +id: sql +title: "SQL" +sidebar_label: "Druid SQL" +--- + + + + + + +> Apache Druid supports two query languages: Druid SQL and [native queries](querying.md). +> This document describes the SQL language. + +Druid SQL is a built-in SQL layer and an alternative to Druid's native JSON-based query language, and is powered by a +parser and planner based on [Apache Calcite](https://calcite.apache.org/). Druid SQL translates SQL into native Druid +queries on the query Broker (the first process you query), which are then passed down to data processes as native Druid +queries. Other than the (slight) overhead of [translating](#query-translation) SQL on the Broker, there isn't an +additional performance penalty versus native queries. + +## Query syntax + +Druid SQL supports SELECT queries with the following structure: + +``` +[ EXPLAIN PLAN FOR ] +[ WITH tableName [ ( column1, column2, ... ) ] AS ( query ) ] +SELECT [ ALL | DISTINCT ] { * | exprs } +FROM { | () | [ INNER | LEFT ] JOIN ON condition } +[ WHERE expr ] +[ GROUP BY [ exprs | GROUPING SETS ( (exprs), ... ) | ROLLUP (exprs) | CUBE (exprs) ] ] +[ HAVING expr ] +[ ORDER BY expr [ ASC | DESC ], expr [ ASC | DESC ], ... ] +[ LIMIT limit ] +[ OFFSET offset ] +[ UNION ALL ] +``` + +### FROM + +The FROM clause can refer to any of the following: + +- [Table datasources](datasource.md#table) from the `druid` schema. This is the default schema, so Druid table +datasources can be referenced as either `druid.dataSourceName` or simply `dataSourceName`. +- [Lookups](datasource.md#lookup) from the `lookup` schema, for example `lookup.countries`. Note that lookups can +also be queried using the [`LOOKUP` function](#string-functions). +- [Subqueries](datasource.md#query). +- [Joins](datasource.md#join) between anything in this list, except between native datasources (table, lookup, +query) and system tables. The join condition must be an equality between expressions from the left- and right-hand side +of the join. +- [Metadata tables](#metadata-tables) from the `INFORMATION_SCHEMA` or `sys` schemas. Unlike the other options for the +FROM clause, metadata tables are not considered datasources. They exist only in the SQL layer. + +For more information about table, lookup, query, and join datasources, refer to the [Datasources](datasource.md) +documentation. + +### WHERE + +The WHERE clause refers to columns in the FROM table, and will be translated to [native filters](filters.md). The +WHERE clause can also reference a subquery, like `WHERE col1 IN (SELECT foo FROM ...)`. Queries like this are executed +as a join on the subquery, described below in the [Query translation](#subqueries) section. + +### GROUP BY + +The GROUP BY clause refers to columns in the FROM table. 
Using GROUP BY, DISTINCT, or any aggregation functions will +trigger an aggregation query using one of Druid's [three native aggregation query types](#query-types). GROUP BY +can refer to an expression or a select clause ordinal position (like `GROUP BY 2` to group by the second selected +column). + +The GROUP BY clause can also refer to multiple grouping sets in three ways. The most flexible is GROUP BY GROUPING SETS, +for example `GROUP BY GROUPING SETS ( (country, city), () )`. This example is equivalent to a `GROUP BY country, city` +followed by `GROUP BY ()` (a grand total). With GROUPING SETS, the underlying data is only scanned one time, leading to +better efficiency. Second, GROUP BY ROLLUP computes a grouping set for each level of the grouping expressions. For +example `GROUP BY ROLLUP (country, city)` is equivalent to `GROUP BY GROUPING SETS ( (country, city), (country), () )` +and will produce grouped rows for each country / city pair, along with subtotals for each country, along with a grand +total. Finally, GROUP BY CUBE computes a grouping set for each combination of grouping expressions. For example, +`GROUP BY CUBE (country, city)` is equivalent to `GROUP BY GROUPING SETS ( (country, city), (country), (city), () )`. + +Grouping columns that do not apply to a particular row will contain `NULL`. For example, when computing +`GROUP BY GROUPING SETS ( (country, city), () )`, the grand total row corresponding to `()` will have `NULL` for the +"country" and "city" columns. Column may also be `NULL` if it was `NULL` in the data itself. To differentiate such rows, +you can use `GROUPING` aggregation. + +When using GROUP BY GROUPING SETS, GROUP BY ROLLUP, or GROUP BY CUBE, be aware that results may not be generated in the +order that you specify your grouping sets in the query. If you need results to be generated in a particular order, use +the ORDER BY clause. + +### HAVING + +The HAVING clause refers to columns that are present after execution of GROUP BY. It can be used to filter on either +grouping expressions or aggregated values. It can only be used together with GROUP BY. + +### ORDER BY + +The ORDER BY clause refers to columns that are present after execution of GROUP BY. It can be used to order the results +based on either grouping expressions or aggregated values. ORDER BY can refer to an expression or a select clause +ordinal position (like `ORDER BY 2` to order by the second selected column). For non-aggregation queries, ORDER BY +can only order by the `__time` column. For aggregation queries, ORDER BY can order by any column. + +### LIMIT + +The LIMIT clause limits the number of rows returned. In some situations Druid will push down this limit to data servers, +which boosts performance. Limits are always pushed down for queries that run with the native Scan or TopN query types. +With the native GroupBy query type, it is pushed down when ordering on a column that you are grouping by. If you notice +that adding a limit doesn't change performance very much, then it's possible that Druid wasn't able to push down the +limit for your query. + +### OFFSET + +The OFFSET clause skips a certain number of rows when returning results. + +If both LIMIT and OFFSET are provided, then OFFSET will be applied first, followed by LIMIT. For example, using +LIMIT 100 OFFSET 10 will return 100 rows, starting from row number 10. + +Together, LIMIT and OFFSET can be used to implement pagination. 
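
As a sketch, fetching the second page of 100 rows might look like the query below. The table and column names (`tbl`, `col1`) are the placeholder names used elsewhere on this page.

```sql
-- Skip the first 100 rows (page 1), then return the next 100 rows (page 2).
SELECT col1, COUNT(*) AS cnt
FROM tbl
GROUP BY col1
ORDER BY COUNT(*) DESC
LIMIT 100
OFFSET 100
```
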
However, note that if the underlying datasource is +modified between page fetches, then the different pages will not necessarily align with each other. + +There are two important factors that can affect the performance of queries that use OFFSET: + +- Skipped rows still need to be generated internally and then discarded, meaning that raising offsets to high values + can cause queries to use additional resources. +- OFFSET is only supported by the Scan and GroupBy [native query types](#query-types). Therefore, a query with OFFSET + will use one of those two types, even if it might otherwise have run as a Timeseries or TopN. Switching query engines + in this way can affect performance. + +### UNION ALL + +The "UNION ALL" operator fuses multiple queries together. Druid SQL supports the UNION ALL operator in two situations: +top-level and table-level. Queries that use UNION ALL in any other way will not be able to execute. + +#### Top-level + +UNION ALL can be used at the very top outer layer of a SQL query (not in a subquery, and not in the FROM clause). In +this case, the underlying queries will be run separately, back to back. Their results will be concatenated together +and appear one after the other. + +For example: + +``` +SELECT COUNT(*) FROM tbl WHERE my_column = 'value1' +UNION ALL +SELECT COUNT(*) FROM tbl WHERE my_column = 'value2' +``` + +With top-level UNION ALL, no further processing can be done after the UNION ALL. For example, the results of the +UNION ALL cannot have GROUP BY, ORDER BY, or any other operators applied to them. + +#### Table-level + +UNION ALL can be used to query multiple tables at the same time. In this case, it must appear in a subquery in the +FROM clause, and the lower-level subqueries that are inputs to the UNION ALL operator must be simple table SELECTs. +Features like expressions, column aliasing, JOIN, GROUP BY, ORDER BY, and so on cannot be used. The query will run +natively using a [union datasource](datasource.md#union). + +The same columns must be selected from each table in the same order, and those columns must either have the same types, +or types that can be implicitly cast to each other (such as different numeric types). For this reason, it is generally +more robust to write your queries to select specific columns. If you use `SELECT *`, you will need to modify your +queries if a new column is added to one of the tables but not to the others. + +For example: + +``` +SELECT col1, COUNT(*) +FROM ( + SELECT col1, col2, col3 FROM tbl1 + UNION ALL + SELECT col1, col2, col3 FROM tbl2 +) +GROUP BY col1 +``` + +With table-level UNION ALL, the rows from the unioned tables are not guaranteed to be processed in +any particular order. They may be processed in an interleaved fashion. If you need a particular result ordering, +use [ORDER BY](#order-by) on the outer query. + +### EXPLAIN PLAN + +Add "EXPLAIN PLAN FOR" to the beginning of any query to get information about how it will be translated. In this case, +the query will not actually be executed. Refer to the [Query translation](#query-translation) documentation for help +interpreting EXPLAIN PLAN output. + +### Identifiers and literals + +Identifiers like datasource and column names can optionally be quoted using double quotes. To escape a double quote +inside an identifier, use another double quote, like `"My ""very own"" identifier"`. All identifiers are case-sensitive +and no implicit case conversions are performed. + +Literal strings should be quoted with single quotes, like `'foo'`. 
Literal strings with Unicode escapes can be written +like `U&'fo\00F6'`, where character codes in hex are prefixed by a backslash. Literal numbers can be written in forms +like `100` (denoting an integer), `100.0` (denoting a floating point value), or `1.0e5` (scientific notation). Literal +timestamps can be written like `TIMESTAMP '2000-01-01 00:00:00'`. Literal intervals, used for time arithmetic, can be +written like `INTERVAL '1' HOUR`, `INTERVAL '1 02:03' DAY TO MINUTE`, `INTERVAL '1-2' YEAR TO MONTH`, and so on. + +### Dynamic parameters + +Druid SQL supports dynamic parameters using question mark (`?`) syntax, where parameters are bound to `?` placeholders +at execution time. To use dynamic parameters, replace any literal in the query with a `?` character and provide a +corresponding parameter value when you execute the query. Parameters are bound to the placeholders in the order in +which they are passed. Parameters are supported in both the [HTTP POST](#http-post) and [JDBC](#jdbc) APIs. + +In certain cases, using dynamic parameters in expressions can cause type inference issues which cause your query to fail, for example: + +```sql +SELECT * FROM druid.foo WHERE dim1 like CONCAT('%', ?, '%') +``` + +To solve this issue, explicitly provide the type of the dynamic parameter using the `CAST` keyword. Consider the fix for the preceding example: + +``` +SELECT * FROM druid.foo WHERE dim1 like CONCAT('%', CAST (? AS VARCHAR), '%') +``` + +## Data types + +### Standard types + +Druid natively supports five basic column types: "long" (64 bit signed int), "float" (32 bit float), "double" (64 bit +float) "string" (UTF-8 encoded strings and string arrays), and "complex" (catch-all for more exotic data types like +hyperUnique and approxHistogram columns). + +Timestamps (including the `__time` column) are treated by Druid as longs, with the value being the number of +milliseconds since 1970-01-01 00:00:00 UTC, not counting leap seconds. Therefore, timestamps in Druid do not carry any +timezone information, but only carry information about the exact moment in time they represent. See the +[Time functions](#time-functions) section for more information about timestamp handling. + +The following table describes how Druid maps SQL types onto native types at query runtime. Casts between two SQL types +that have the same Druid runtime type will have no effect, other than exceptions noted in the table. Casts between two +SQL types that have different Druid runtime types will generate a runtime cast in Druid. If a value cannot be properly +cast to another value, as in `CAST('foo' AS BIGINT)`, the runtime will substitute a default value. NULL values cast +to non-nullable types will also be substituted with a default value (for example, nulls cast to numbers will be +converted to zeroes). + +|SQL type|Druid runtime type|Default value|Notes| +|--------|------------------|-------------|-----| +|CHAR|STRING|`''`|| +|VARCHAR|STRING|`''`|Druid STRING columns are reported as VARCHAR. 
Can include [multi-value strings](#multi-value-strings) as well.| +|DECIMAL|DOUBLE|`0.0`|DECIMAL uses floating point, not fixed point math| +|FLOAT|FLOAT|`0.0`|Druid FLOAT columns are reported as FLOAT| +|REAL|DOUBLE|`0.0`|| +|DOUBLE|DOUBLE|`0.0`|Druid DOUBLE columns are reported as DOUBLE| +|BOOLEAN|LONG|`false`|| +|TINYINT|LONG|`0`|| +|SMALLINT|LONG|`0`|| +|INTEGER|LONG|`0`|| +|BIGINT|LONG|`0`|Druid LONG columns (except `__time`) are reported as BIGINT| +|TIMESTAMP|LONG|`0`, meaning 1970-01-01 00:00:00 UTC|Druid's `__time` column is reported as TIMESTAMP. Casts between string and timestamp types assume standard SQL formatting, e.g. `2000-01-02 03:04:05`, _not_ ISO8601 formatting. For handling other formats, use one of the [time functions](#time-functions)| +|DATE|LONG|`0`, meaning 1970-01-01|Casting TIMESTAMP to DATE rounds down the timestamp to the nearest day. Casts between string and date types assume standard SQL formatting, e.g. `2000-01-02`. For handling other formats, use one of the [time functions](#time-functions)| +|OTHER|COMPLEX|none|May represent various Druid column types such as hyperUnique, approxHistogram, etc| + +### Multi-value strings + +Druid's native type system allows strings to potentially have multiple values. These +[multi-value string dimensions](multi-value-dimensions.md) will be reported in SQL as `VARCHAR` typed, and can be +syntactically used like any other VARCHAR. Regular string functions that refer to multi-value string dimensions will be +applied to all values for each row individually. Multi-value string dimensions can also be treated as arrays via special +[multi-value string functions](#multi-value-string-functions), which can perform powerful array-aware operations. + +Grouping by a multi-value expression will observe the native Druid multi-value aggregation behavior, which is similar to +the `UNNEST` functionality available in some other SQL dialects. Refer to the documentation on +[multi-value string dimensions](multi-value-dimensions.md) for additional details. + +> Because multi-value dimensions are treated by the SQL planner as `VARCHAR`, there are some inconsistencies between how +> they are handled in Druid SQL and in native queries. For example, expressions involving multi-value dimensions may be +> incorrectly optimized by the Druid SQL planner: `multi_val_dim = 'a' AND multi_val_dim = 'b'` will be optimized to +> `false`, even though it is possible for a single row to have both "a" and "b" as values for `multi_val_dim`. The +> SQL behavior of multi-value dimensions will change in a future release to more closely align with their behavior +> in native queries. + +### NULL values + +The `druid.generic.useDefaultValueForNull` [runtime property](../configuration/index.md#sql-compatible-null-handling) +controls Druid's NULL handling mode. + +In the default value mode (`true`), Druid treats NULLs and empty strings interchangeably, rather than according to the SQL +standard. In this mode Druid SQL only has partial support for NULLs. For example, the expressions `col IS NULL` and +`col = ''` are equivalent, and both will evaluate to true if `col` contains an empty string. Similarly, the expression +`COALESCE(col1, col2)` will return `col2` if `col1` is an empty string. While the `COUNT(*)` aggregator counts all rows, +the `COUNT(expr)` aggregator will count the number of rows where expr is neither null nor the empty string. Numeric +columns in this mode are not nullable; any null or missing values will be treated as zeroes. 
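
As a sketch of how this plays out in default-value mode, the two filtered counts below return the same number, because `dim1 IS NULL` and `dim1 = ''` match the same rows, while `COUNT(dim1)` excludes rows where `dim1` is null or empty. The table and column names (`druid.foo`, `dim1`) are the placeholder names used elsewhere on this page, and the `FILTER (WHERE ...)` syntax is described under [Aggregation functions](#aggregation-functions) below.

```sql
-- In default-value mode, NULL and '' are interchangeable, so these two filtered counts are equal.
SELECT
  COUNT(*) FILTER (WHERE dim1 IS NULL) AS null_count,
  COUNT(*) FILTER (WHERE dim1 = '')    AS empty_count,
  COUNT(dim1)                          AS non_empty_count
FROM druid.foo
```

In SQL compatible mode, described next, the first two counts can differ.
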
+ +In SQL compatible mode (`false`), NULLs are treated more closely to the SQL standard. The property affects both storage +and querying, so for correct behavior, it should be set on all Druid service types to be available at both ingestion +time and query time. There is some overhead associated with the ability to handle NULLs; see +the [segment internals](../design/segments.md#sql-compatible-null-handling) documentation for more details. + +## Aggregation functions + +Aggregation functions can appear in the SELECT clause of any query. Any aggregator can be filtered using syntax +like `AGG(expr) FILTER(WHERE whereExpr)`. Filtered aggregators will only aggregate rows that match their filter. It's +possible for two aggregators in the same SQL query to have different filters. + +When no rows are selected, aggregate functions will return their initial value. This can occur when filtering results in +no matches while aggregating values across an entire table without a grouping, or, when using filtered aggregations +within a grouping. What this value is exactly varies per aggregator, but COUNT, and the various approximate count +distinct sketch functions, will always return 0. + +Only the COUNT and ARRAY_AGG aggregations can accept the DISTINCT keyword. + +> The order of aggregation operations across segments is not deterministic. This means that non-commutative aggregation +> functions can produce inconsistent results across the same query. +> +> Functions that operate on an input type of "float" or "double" may also see these differences in aggregation +> results across multiple query runs because of this. If precisely the same value is desired across multiple query runs, +> consider using the `ROUND` function to smooth out the inconsistencies between queries. + +|Function|Notes|Default| +|--------|-----|-------| +|`COUNT(*)`|Counts the number of rows.|`0`| +|`COUNT(DISTINCT expr)`|Counts distinct values of expr, which can be string, numeric, or hyperUnique. By default this is approximate, using a variant of [HyperLogLog](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf). To get exact counts set "useApproximateCountDistinct" to "false". If you do this, expr must be string or numeric, since exact counts are not possible using hyperUnique columns. See also `APPROX_COUNT_DISTINCT(expr)`. In exact mode, only one distinct count per query is permitted unless `useGroupingSetForExactDistinct` is set to true in query contexts or broker configurations.|`0`| +|`SUM(expr)`|Sums numbers.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `0`| +|`MIN(expr)`|Takes the minimum of numbers.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `9223372036854775807` (maximum LONG value)| +|`MAX(expr)`|Takes the maximum of numbers.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `-9223372036854775808` (minimum LONG value)| +|`AVG(expr)`|Averages numbers.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `0`| +|`APPROX_COUNT_DISTINCT(expr)`|Counts distinct values of expr, which can be a regular column or a hyperUnique column. This is always approximate, regardless of the value of "useApproximateCountDistinct". This uses Druid's built-in "cardinality" or "hyperUnique" aggregators. See also `COUNT(DISTINCT expr)`.|`0`| +|`APPROX_COUNT_DISTINCT_DS_HLL(expr, [lgK, tgtHllType])`|Counts distinct values of expr, which can be a regular column or an [HLL sketch](../development/extensions-core/datasketches-hll.md) column. 
The `lgK` and `tgtHllType` parameters are described in the HLL sketch documentation. This is always approximate, regardless of the value of "useApproximateCountDistinct". See also `COUNT(DISTINCT expr)`. The [DataSketches extension](../development/extensions-core/datasketches-extension.md) must be loaded to use this function.|`0`| +|`APPROX_COUNT_DISTINCT_DS_THETA(expr, [size])`|Counts distinct values of expr, which can be a regular column or a [Theta sketch](../development/extensions-core/datasketches-theta.md) column. The `size` parameter is described in the Theta sketch documentation. This is always approximate, regardless of the value of "useApproximateCountDistinct". See also `COUNT(DISTINCT expr)`. The [DataSketches extension](../development/extensions-core/datasketches-extension.md) must be loaded to use this function.|`0`| +|`DS_HLL(expr, [lgK, tgtHllType])`|Creates an [HLL sketch](../development/extensions-core/datasketches-hll.md) on the values of expr, which can be a regular column or a column containing HLL sketches. The `lgK` and `tgtHllType` parameters are described in the HLL sketch documentation. The [DataSketches extension](../development/extensions-core/datasketches-extension.md) must be loaded to use this function.|`'0'` (STRING)| +|`DS_THETA(expr, [size])`|Creates a [Theta sketch](../development/extensions-core/datasketches-theta.md) on the values of expr, which can be a regular column or a column containing Theta sketches. The `size` parameter is described in the Theta sketch documentation. The [DataSketches extension](../development/extensions-core/datasketches-extension.md) must be loaded to use this function.|`'0.0'` (STRING)| +|`APPROX_QUANTILE(expr, probability, [resolution])`|Computes approximate quantiles on numeric or [approxHistogram](../development/extensions-core/approximate-histograms.md#approximate-histogram-aggregator) exprs. The "probability" should be between 0 and 1 (exclusive). The "resolution" is the number of centroids to use for the computation. Higher resolutions will give more precise results but also have higher overhead. If not provided, the default resolution is 50. The [approximate histogram extension](../development/extensions-core/approximate-histograms.md) must be loaded to use this function.|`NaN`| +|`APPROX_QUANTILE_DS(expr, probability, [k])`|Computes approximate quantiles on numeric or [Quantiles sketch](../development/extensions-core/datasketches-quantiles.md) exprs. The "probability" should be between 0 and 1 (exclusive). The `k` parameter is described in the Quantiles sketch documentation. The [DataSketches extension](../development/extensions-core/datasketches-extension.md) must be loaded to use this function.|`NaN`| +|`APPROX_QUANTILE_FIXED_BUCKETS(expr, probability, numBuckets, lowerLimit, upperLimit, [outlierHandlingMode])`|Computes approximate quantiles on numeric or [fixed buckets histogram](../development/extensions-core/approximate-histograms.md#fixed-buckets-histogram) exprs. The "probability" should be between 0 and 1 (exclusive). The `numBuckets`, `lowerLimit`, `upperLimit`, and `outlierHandlingMode` parameters are described in the fixed buckets histogram documentation. The [approximate histogram extension](../development/extensions-core/approximate-histograms.md) must be loaded to use this function.|`0.0`| +|`DS_QUANTILES_SKETCH(expr, [k])`|Creates a [Quantiles sketch](../development/extensions-core/datasketches-quantiles.md) on the values of expr, which can be a regular column or a column containing quantiles sketches. 
The `k` parameter is described in the Quantiles sketch documentation. The [DataSketches extension](../development/extensions-core/datasketches-extension.md) must be loaded to use this function.|`'0'` (STRING)| +|`BLOOM_FILTER(expr, numEntries)`|Computes a bloom filter from values produced by `expr`, with `numEntries` maximum number of distinct values before false positive rate increases. See [bloom filter extension](../development/extensions-core/bloom-filter.md) documentation for additional details.|Empty base64 encoded bloom filter STRING| +|`TDIGEST_QUANTILE(expr, quantileFraction, [compression])`|Builds a T-Digest sketch on values produced by `expr` and returns the value for the quantile. Compression parameter (default value 100) determines the accuracy and size of the sketch. Higher compression means higher accuracy but more space to store sketches. See [t-digest extension](../development/extensions-contrib/tdigestsketch-quantiles.md) documentation for additional details.|`Double.NaN`| +|`TDIGEST_GENERATE_SKETCH(expr, [compression])`|Builds a T-Digest sketch on values produced by `expr`. Compression parameter (default value 100) determines the accuracy and size of the sketch Higher compression means higher accuracy but more space to store sketches. See [t-digest extension](../development/extensions-contrib/tdigestsketch-quantiles.md) documentation for additional details.|Empty base64 encoded T-Digest sketch STRING| +|`VAR_POP(expr)`|Computes variance population of `expr`. See [stats extension](../development/extensions-core/stats.md) documentation for additional details.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `0`| +|`VAR_SAMP(expr)`|Computes variance sample of `expr`. See [stats extension](../development/extensions-core/stats.md) documentation for additional details.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `0`| +|`VARIANCE(expr)`|Computes variance sample of `expr`. See [stats extension](../development/extensions-core/stats.md) documentation for additional details.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `0`| +|`STDDEV_POP(expr)`|Computes standard deviation population of `expr`. See [stats extension](../development/extensions-core/stats.md) documentation for additional details.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `0`| +|`STDDEV_SAMP(expr)`|Computes standard deviation sample of `expr`. See [stats extension](../development/extensions-core/stats.md) documentation for additional details.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `0`| +|`STDDEV(expr)`|Computes standard deviation sample of `expr`. See [stats extension](../development/extensions-core/stats.md) documentation for additional details.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `0`| +|`EARLIEST(expr)`|Returns the earliest value of `expr`, which must be numeric. If `expr` comes from a relation with a timestamp column (like a Druid datasource) then "earliest" is the value first encountered with the minimum overall timestamp of all values being aggregated. If `expr` does not come from a relation with a timestamp, then it is simply the first value encountered.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `0`| +|`EARLIEST(expr, maxBytesPerString)`|Like `EARLIEST(expr)`, but for strings. The `maxBytesPerString` parameter determines how much aggregation space to allocate per string. Strings longer than this limit will be truncated. 
This parameter should be set as low as possible, since high values will lead to wasted memory.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `''`| +|`LATEST(expr)`|Returns the latest value of `expr`, which must be numeric. If `expr` comes from a relation with a timestamp column (like a Druid datasource) then "latest" is the value last encountered with the maximum overall timestamp of all values being aggregated. If `expr` does not come from a relation with a timestamp, then it is simply the last value encountered.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `0`| +|`LATEST(expr, maxBytesPerString)`|Like `LATEST(expr)`, but for strings. The `maxBytesPerString` parameter determines how much aggregation space to allocate per string. Strings longer than this limit will be truncated. This parameter should be set as low as possible, since high values will lead to wasted memory.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `''`| +|`ANY_VALUE(expr)`|Returns any value of `expr` including null. `expr` must be numeric. This aggregator can simplify and optimize the performance by returning the first encountered value (including null)|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `0`| +|`ANY_VALUE(expr, maxBytesPerString)`|Like `ANY_VALUE(expr)`, but for strings. The `maxBytesPerString` parameter determines how much aggregation space to allocate per string. Strings longer than this limit will be truncated. This parameter should be set as low as possible, since high values will lead to wasted memory.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `''`| +|`GROUPING(expr, expr...)`|Returns a number to indicate which groupBy dimension is included in a row, when using `GROUPING SETS`. Refer to [additional documentation](aggregations.md#grouping-aggregator) on how to infer this number.|N/A| +|`ARRAY_AGG(expr, [size])`|Collects all values of `expr` into an ARRAY, including null values, with `size` in bytes limit on aggregation size (default of 1024 bytes). Use of `ORDER BY` within the `ARRAY_AGG` expression is not currently supported, and the ordering of results within the output array may vary depending on processing order.|`null`| +|`ARRAY_AGG(DISTINCT expr, [size])`|Collects all distinct values of `expr` into an ARRAY, including null values, with `size` in bytes limit on aggregation size (default of 1024 bytes) per aggregate. Use of `ORDER BY` within the `ARRAY_AGG` expression is not currently supported, and the ordering of results within the output array may vary depending on processing order.|`null`| +|`BIT_AND(expr)`|Performs a bitwise AND operation on all input values.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `0`| +|`BIT_OR(expr)`|Performs a bitwise OR operation on all input values.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `0`| +|`BIT_XOR(expr)`|Performs a bitwise XOR operation on all input values.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `0`| + +For advice on choosing approximate aggregation functions, check out our [approximate aggregations documentation](aggregations.md#approx). + +## Scalar functions + +### Numeric functions + +For mathematical operations, Druid SQL will use integer math if all operands involved in an expression are integers. +Otherwise, Druid will switch to floating point math. You can force this to happen by casting one of your operands +to FLOAT. 
At runtime, Druid will widen 32-bit floats to 64-bit for most expressions. + +|Function|Notes| +|--------|-----| +|`ABS(expr)`|Absolute value.| +|`CEIL(expr)`|Ceiling.| +|`EXP(expr)`|e to the power of expr.| +|`FLOOR(expr)`|Floor.| +|`LN(expr)`|Logarithm (base e).| +|`LOG10(expr)`|Logarithm (base 10).| +|`POWER(expr, power)`|expr to a power.| +|`SQRT(expr)`|Square root.| +|`TRUNCATE(expr[, digits])`|Truncate expr to a specific number of decimal digits. If digits is negative, then this truncates that many places to the left of the decimal point. Digits defaults to zero if not specified.| +|`TRUNC(expr[, digits])`|Synonym for `TRUNCATE`.| +|`ROUND(expr[, digits])`|`ROUND(x, y)` would return the value of the x rounded to the y decimal places. While x can be an integer or floating-point number, y must be an integer. The type of the return value is specified by that of x. y defaults to 0 if omitted. When y is negative, x is rounded on the left side of the y decimal points. If `expr` evaluates to either `NaN`, `expr` will be converted to 0. If `expr` is infinity, `expr` will be converted to the nearest finite double. | +|`x + y`|Addition.| +|`x - y`|Subtraction.| +|`x * y`|Multiplication.| +|`x / y`|Division.| +|`MOD(x, y)`|Modulo (remainder of x divided by y).| +|`SIN(expr)`|Trigonometric sine of an angle expr.| +|`COS(expr)`|Trigonometric cosine of an angle expr.| +|`TAN(expr)`|Trigonometric tangent of an angle expr.| +|`COT(expr)`|Trigonometric cotangent of an angle expr.| +|`ASIN(expr)`|Arc sine of expr.| +|`ACOS(expr)`|Arc cosine of expr.| +|`ATAN(expr)`|Arc tangent of expr.| +|`ATAN2(y, x)`|Angle theta from the conversion of rectangular coordinates (x, y) to polar * coordinates (r, theta).| +|`DEGREES(expr)`|Converts an angle measured in radians to an approximately equivalent angle measured in degrees| +|`RADIANS(expr)`|Converts an angle measured in degrees to an approximately equivalent angle measured in radians| +|`BITWISE_AND(expr1, expr2)`|Returns the result of `expr1 & expr2`. Double values will be implicitly cast to longs, use `BITWISE_CONVERT_DOUBLE_TO_LONG_BITS` to perform bitwise operations directly with doubles| +|`BITWISE_COMPLEMENT(expr)`|Returns the result of `~expr`. Double values will be implicitly cast to longs, use `BITWISE_CONVERT_DOUBLE_TO_LONG_BITS` to perform bitwise operations directly with doubles| +|`BITWISE_CONVERT_DOUBLE_TO_LONG_BITS(expr)`|Converts the bits of an IEEE 754 floating-point double value to a long. If the input is not a double, it is implicitly cast to a double prior to conversion| +|`BITWISE_CONVERT_LONG_BITS_TO_DOUBLE(expr)`|Converts a long to the IEEE 754 floating-point double specified by the bits stored in the long. If the input is not a long, it is implicitly cast to a long prior to conversion| +|`BITWISE_OR(expr1, expr2)`|Returns the result of `expr1 [PIPE] expr2`. Double values will be implicitly cast to longs, use `BITWISE_CONVERT_DOUBLE_TO_LONG_BITS` to perform bitwise operations directly with doubles| +|`BITWISE_SHIFT_LEFT(expr1, expr2)`|Returns the result of `expr1 << expr2`. Double values will be implicitly cast to longs, use `BITWISE_CONVERT_DOUBLE_TO_LONG_BITS` to perform bitwise operations directly with doubles| +|`BITWISE_SHIFT_RIGHT(expr1, expr2)`|Returns the result of `expr1 >> expr2`. Double values will be implicitly cast to longs, use `BITWISE_CONVERT_DOUBLE_TO_LONG_BITS` to perform bitwise operations directly with doubles| +|`BITWISE_XOR(expr1, expr2)`|Returns the result of `expr1 ^ expr2`. 
Double values will be implicitly cast to longs, use `BITWISE_CONVERT_DOUBLE_TO_LONG_BITS` to perform bitwise operations directly with doubles|
+
+### String functions
+
+String functions accept strings, and return a type appropriate to the function.
+
+|Function|Notes|
+|--------|-----|
+|`x \|\| y`|Concatenates strings x and y.|
+|`CONCAT(expr, expr...)`|Concatenates a list of expressions.|
+|`TEXTCAT(expr, expr)`|Two argument version of CONCAT.|
+|`STRING_FORMAT(pattern[, args...])`|Returns a string formatted in the manner of Java's [String.format](https://docs.oracle.com/javase/8/docs/api/java/lang/String.html#format-java.lang.String-java.lang.Object...-).|
+|`LENGTH(expr)`|Length of expr in UTF-16 code units.|
+|`CHAR_LENGTH(expr)`|Synonym for `LENGTH`.|
+|`CHARACTER_LENGTH(expr)`|Synonym for `LENGTH`.|
+|`STRLEN(expr)`|Synonym for `LENGTH`.|
+|`LOOKUP(expr, lookupName)`|Look up expr in a registered [query-time lookup table](lookups.md). Note that lookups can also be queried directly using the [`lookup` schema](#from).|
+|`LOWER(expr)`|Returns expr in all lowercase.|
+|`PARSE_LONG(string[, radix])`|Parses a string into a long (BIGINT) with the given radix, or 10 (decimal) if a radix is not provided.|
+|`POSITION(needle IN haystack [FROM fromIndex])`|Returns the index of needle within haystack, with indexes starting from 1. The search will begin at fromIndex, or 1 if fromIndex is not specified. If the needle is not found, returns 0.|
+|`REGEXP_EXTRACT(expr, pattern, [index])`|Apply regular expression `pattern` to `expr` and extract a capture group, or `NULL` if there is no match. If index is unspecified or zero, returns the first substring that matched the pattern. The pattern may match anywhere inside `expr`; if you want to match the entire string instead, use the `^` and `$` markers at the start and end of your pattern. Note: when `druid.generic.useDefaultValueForNull = true`, it is not possible to differentiate an empty-string match from a non-match (both will return `NULL`).|
+|`REGEXP_LIKE(expr, pattern)`|Returns whether `expr` matches regular expression `pattern`. The pattern may match anywhere inside `expr`; if you want to match the entire string instead, use the `^` and `$` markers at the start and end of your pattern. Similar to [`LIKE`](#comparison-operators), but uses regexps instead of LIKE patterns. Especially useful in WHERE clauses.|
+|`CONTAINS_STRING(<expr>, str)`|Returns true if `str` is a substring of `expr`.|
+|`ICONTAINS_STRING(<expr>, str)`|Returns true if `str` is a substring of `expr`. The match is case-insensitive.|
+|`REPLACE(expr, pattern, replacement)`|Replaces pattern with replacement in expr, and returns the result.|
+|`STRPOS(haystack, needle)`|Returns the index of needle within haystack, with indexes starting from 1. If the needle is not found, returns 0.|
+|`SUBSTRING(expr, index, [length])`|Returns a substring of expr starting at index, with a max length, both measured in UTF-16 code units.|
+|`RIGHT(expr, [length])`|Returns the rightmost length characters from expr.|
+|`LEFT(expr, [length])`|Returns the leftmost length characters from expr.|
+|`SUBSTR(expr, index, [length])`|Synonym for SUBSTRING.|
+|`TRIM([BOTH \| LEADING \| TRAILING] [<chars> FROM] expr)`|Returns expr with characters removed from the leading, trailing, or both ends of "expr" if they are in "chars". If "chars" is not provided, it defaults to " " (a space). If the directional argument is not provided, it defaults to "BOTH".|
+|`BTRIM(expr[, chars])`|Alternate form of `TRIM(BOTH <chars> FROM <expr>)`.|
+|`LTRIM(expr[, chars])`|Alternate form of `TRIM(LEADING <chars> FROM <expr>)`.|
+|`RTRIM(expr[, chars])`|Alternate form of `TRIM(TRAILING <chars> FROM <expr>)`.|
+|`UPPER(expr)`|Returns expr in all uppercase.|
+|`REVERSE(expr)`|Reverses expr.|
+|`REPEAT(expr, [N])`|Repeats expr N times.|
+|`LPAD(expr, length[, chars])`|Returns a string of `length` from `expr` left-padded with `chars`. If `length` is shorter than the length of `expr`, the result is `expr` which is truncated to `length`. The result will be null if either `expr` or `chars` is null. If `chars` is an empty string, no padding is added, however `expr` may be trimmed if necessary.|
+|`RPAD(expr, length[, chars])`|Returns a string of `length` from `expr` right-padded with `chars`. If `length` is shorter than the length of `expr`, the result is `expr` which is truncated to `length`. The result will be null if either `expr` or `chars` is null. If `chars` is an empty string, no padding is added, however `expr` may be trimmed if necessary.|
+
+
+### Time functions
+
+Time functions can be used with Druid's `__time` column, with any column storing millisecond timestamps through use
+of the `MILLIS_TO_TIMESTAMP` function, or with any column storing string timestamps through use of the `TIME_PARSE`
+function. By default, time operations use the UTC time zone. You can change the time zone by setting the connection
+context parameter "sqlTimeZone" to the name of another time zone, like "America/Los_Angeles", or to an offset like
+"-08:00". If you need to mix multiple time zones in the same query, or if you need to use a time zone other than
+the connection time zone, some functions also accept time zones as parameters. These parameters always take precedence
+over the connection time zone.
+
+Literal timestamps in the connection time zone can be written using `TIMESTAMP '2000-01-01 00:00:00'` syntax. The
+simplest way to write literal timestamps in other time zones is to use TIME_PARSE, like
+`TIME_PARSE('2000-02-01 00:00:00', NULL, 'America/Los_Angeles')`.
+
+|Function|Notes|
+|--------|-----|
+|`CURRENT_TIMESTAMP`|Current timestamp in the connection's time zone.|
+|`CURRENT_DATE`|Current date in the connection's time zone.|
+|`DATE_TRUNC(<unit>, <timestamp_expr>)`|Rounds down a timestamp, returning it as a new timestamp. Unit can be 'milliseconds', 'second', 'minute', 'hour', 'day', 'week', 'month', 'quarter', 'year', 'decade', 'century', or 'millennium'.|
+|`TIME_CEIL(<timestamp_expr>, <period>, [<origin>, [<timezone>]])`|Rounds up a timestamp, returning it as a new timestamp. Period can be any ISO8601 period, like P3M (quarters) or PT12H (half-days). The time zone, if provided, should be a time zone name like "America/Los_Angeles" or offset like "-08:00". This function is similar to `CEIL` but is more flexible.|
+|`TIME_FLOOR(<timestamp_expr>, <period>, [<origin>, [<timezone>]])`|Rounds down a timestamp, returning it as a new timestamp. Period can be any ISO8601 period, like P3M (quarters) or PT12H (half-days). The time zone, if provided, should be a time zone name like "America/Los_Angeles" or offset like "-08:00". This function is similar to `FLOOR` but is more flexible.|
+|`TIME_SHIFT(<timestamp_expr>, <period>, <step>, [<timezone>])`|Shifts a timestamp by a period (step times), returning it as a new timestamp. Period can be any ISO8601 period. Step may be negative. The time zone, if provided, should be a time zone name like "America/Los_Angeles" or offset like "-08:00".|
+|`TIME_EXTRACT(<timestamp_expr>, [<unit>, [<timezone>]])`|Extracts a time part from expr, returning it as a number. Unit can be EPOCH, SECOND, MINUTE, HOUR, DAY (day of month), DOW (day of week), DOY (day of year), WEEK (week of [week year](https://en.wikipedia.org/wiki/ISO_week_date)), MONTH (1 through 12), QUARTER (1 through 4), or YEAR. The time zone, if provided, should be a time zone name like "America/Los_Angeles" or offset like "-08:00". This function is similar to `EXTRACT` but is more flexible. Unit and time zone must be literals, and must be provided quoted, like `TIME_EXTRACT(__time, 'HOUR')` or `TIME_EXTRACT(__time, 'HOUR', 'America/Los_Angeles')`.|
+|`TIME_PARSE(<string_expr>, [<pattern>, [<timezone>]])`|Parses a string into a timestamp using a given [Joda DateTimeFormat pattern](http://www.joda.org/joda-time/apidocs/org/joda/time/format/DateTimeFormat.html), or ISO8601 (e.g. `2000-01-02T03:04:05Z`) if the pattern is not provided. The time zone, if provided, should be a time zone name like "America/Los_Angeles" or offset like "-08:00", and will be used as the time zone for strings that do not include a time zone offset. Pattern and time zone must be literals. Strings that cannot be parsed as timestamps will be returned as NULL.|
+|`TIME_FORMAT(<timestamp_expr>, [<pattern>, [<timezone>]])`|Formats a timestamp as a string with a given [Joda DateTimeFormat pattern](http://www.joda.org/joda-time/apidocs/org/joda/time/format/DateTimeFormat.html), or ISO8601 (e.g. `2000-01-02T03:04:05Z`) if the pattern is not provided. The time zone, if provided, should be a time zone name like "America/Los_Angeles" or offset like "-08:00". Pattern and time zone must be literals.|
+|`MILLIS_TO_TIMESTAMP(millis_expr)`|Converts a number of milliseconds since the epoch into a timestamp.|
+|`TIMESTAMP_TO_MILLIS(timestamp_expr)`|Converts a timestamp into a number of milliseconds since the epoch.|
+|`EXTRACT(<unit> FROM timestamp_expr)`|Extracts a time part from expr, returning it as a number. Unit can be EPOCH, MICROSECOND, MILLISECOND, SECOND, MINUTE, HOUR, DAY (day of month), DOW (day of week), ISODOW (ISO day of week), DOY (day of year), WEEK (week of year), MONTH, QUARTER, YEAR, ISOYEAR, DECADE, CENTURY or MILLENNIUM. Units must be provided unquoted, like `EXTRACT(HOUR FROM __time)`.|
+|`FLOOR(timestamp_expr TO <unit>)`|Rounds down a timestamp, returning it as a new timestamp. Unit can be SECOND, MINUTE, HOUR, DAY, WEEK, MONTH, QUARTER, or YEAR.|
+|`CEIL(timestamp_expr TO <unit>)`|Rounds up a timestamp, returning it as a new timestamp. Unit can be SECOND, MINUTE, HOUR, DAY, WEEK, MONTH, QUARTER, or YEAR.|
+|`TIMESTAMPADD(<unit>, <count>, <timestamp>)`|Equivalent to `timestamp + count * INTERVAL '1' UNIT`.|
+|`TIMESTAMPDIFF(<unit>, <timestamp1>, <timestamp2>)`|Returns the (signed) number of `unit` between `timestamp1` and `timestamp2`. Unit can be SECOND, MINUTE, HOUR, DAY, WEEK, MONTH, QUARTER, or YEAR.|
+|`timestamp_expr { + \| - } <interval_expr>`|Add or subtract an amount of time from a timestamp. interval_expr can include interval literals like `INTERVAL '2' HOUR`, and may include interval arithmetic as well. This operator treats days as uniformly 86400 seconds long, and does not take into account daylight savings time. To account for daylight savings time, use TIME_SHIFT instead.|
+
+
+### Reduction functions
+
+Reduction functions operate on zero or more expressions and return a single expression. If no expressions are passed as
+arguments, then the result is `NULL`. The expressions must all be convertible to a common data type, which will be the
+type of the result:
+* If all arguments are `NULL`, the result is `NULL`. Otherwise, `NULL` arguments are ignored.
+* If the arguments comprise a mix of numbers and strings, the arguments are interpreted as strings.
+* If all arguments are integer numbers, the arguments are interpreted as longs. +* If all arguments are numbers and at least one argument is a double, the arguments are interpreted as doubles. + +|Function|Notes| +|--------|-----| +|`GREATEST([expr1, ...])`|Evaluates zero or more expressions and returns the maximum value based on comparisons as described above.| +|`LEAST([expr1, ...])`|Evaluates zero or more expressions and returns the minimum value based on comparisons as described above.| + + +### IP address functions + +For the IPv4 address functions, the `address` argument can either be an IPv4 dotted-decimal string +(e.g., '192.168.0.1') or an IP address represented as an integer (e.g., 3232235521). The `subnet` +argument should be a string formatted as an IPv4 address subnet in CIDR notation (e.g., +'192.168.0.0/16'). + +|Function|Notes| +|---|---| +|`IPV4_MATCH(address, subnet)`|Returns true if the `address` belongs to the `subnet` literal, else false. If `address` is not a valid IPv4 address, then false is returned. This function is more efficient if `address` is an integer instead of a string.| +|`IPV4_PARSE(address)`|Parses `address` into an IPv4 address stored as an integer . If `address` is an integer that is a valid IPv4 address, then it is passed through. Returns null if `address` cannot be represented as an IPv4 address.| +|`IPV4_STRINGIFY(address)`|Converts `address` into an IPv4 address dotted-decimal string. If `address` is a string that is a valid IPv4 address, then it is passed through. Returns null if `address` cannot be represented as an IPv4 address.| + + +### Comparison operators + +|Function|Notes| +|--------|-----| +|`x = y`|Equals.| +|`x <> y`|Not-equals.| +|`x > y`|Greater than.| +|`x >= y`|Greater than or equal to.| +|`x < y`|Less than.| +|`x <= y`|Less than or equal to.| +|`x BETWEEN y AND z`|Equivalent to `x >= y AND x <= z`.| +|`x NOT BETWEEN y AND z`|Equivalent to `x < y OR x > z`.| +|`x LIKE pattern [ESCAPE esc]`|True if x matches a SQL LIKE pattern (with an optional escape).| +|`x NOT LIKE pattern [ESCAPE esc]`|True if x does not match a SQL LIKE pattern (with an optional escape).| +|`x IS NULL`|True if x is NULL or empty string.| +|`x IS NOT NULL`|True if x is neither NULL nor empty string.| +|`x IS TRUE`|True if x is true.| +|`x IS NOT TRUE`|True if x is not true.| +|`x IS FALSE`|True if x is false.| +|`x IS NOT FALSE`|True if x is not false.| +|`x IN (values)`|True if x is one of the listed values.| +|`x NOT IN (values)`|True if x is not one of the listed values.| +|`x IN (subquery)`|True if x is returned by the subquery. This will be translated into a join; see [Query translation](#query-translation) for details.| +|`x NOT IN (subquery)`|True if x is not returned by the subquery. This will be translated into a join; see [Query translation](#query-translation) for details.| +|`x AND y`|Boolean AND.| +|`x OR y`|Boolean OR.| +|`NOT x`|Boolean NOT.| + +### Sketch functions + +These functions operate on expressions or columns that return sketch objects. + +#### HLL sketch functions + +The following functions operate on [DataSketches HLL sketches](../development/extensions-core/datasketches-hll.md). +The [DataSketches extension](../development/extensions-core/datasketches-extension.md) must be loaded to use the following functions. + +|Function|Notes| +|--------|-----| +|`HLL_SKETCH_ESTIMATE(expr, [round])`|Returns the distinct count estimate from an HLL sketch. `expr` must return an HLL sketch. 
The optional `round` boolean parameter will round the estimate if set to `true`, with a default of `false`.| +|`HLL_SKETCH_ESTIMATE_WITH_ERROR_BOUNDS(expr, [numStdDev])`|Returns the distinct count estimate and error bounds from an HLL sketch. `expr` must return an HLL sketch. An optional `numStdDev` argument can be provided.| +|`HLL_SKETCH_UNION([lgK, tgtHllType], expr0, expr1, ...)`|Returns a union of HLL sketches, where each input expression must return an HLL sketch. The `lgK` and `tgtHllType` can be optionally specified as the first parameter; if provided, both optional parameters must be specified.| +|`HLL_SKETCH_TO_STRING(expr)`|Returns a human-readable string representation of an HLL sketch for debugging. `expr` must return an HLL sketch.| + +#### Theta sketch functions + +The following functions operate on [theta sketches](../development/extensions-core/datasketches-theta.md). +The [DataSketches extension](../development/extensions-core/datasketches-extension.md) must be loaded to use the following functions. + +|Function|Notes| +|--------|-----| +|`THETA_SKETCH_ESTIMATE(expr)`|Returns the distinct count estimate from a theta sketch. `expr` must return a theta sketch.| +|`THETA_SKETCH_ESTIMATE_WITH_ERROR_BOUNDS(expr, errorBoundsStdDev)`|Returns the distinct count estimate and error bounds from a theta sketch. `expr` must return a theta sketch.| +|`THETA_SKETCH_UNION([size], expr0, expr1, ...)`|Returns a union of theta sketches, where each input expression must return a theta sketch. The `size` can be optionally specified as the first parameter.| +|`THETA_SKETCH_INTERSECT([size], expr0, expr1, ...)`|Returns an intersection of theta sketches, where each input expression must return a theta sketch. The `size` can be optionally specified as the first parameter.| +|`THETA_SKETCH_NOT([size], expr0, expr1, ...)`|Returns a set difference of theta sketches, where each input expression must return a theta sketch. The `size` can be optionally specified as the first parameter.| + +#### Quantiles sketch functions + +The following functions operate on [quantiles sketches](../development/extensions-core/datasketches-quantiles.md). +The [DataSketches extension](../development/extensions-core/datasketches-extension.md) must be loaded to use the following functions. + +|Function|Notes| +|--------|-----| +|`DS_GET_QUANTILE(expr, fraction)`|Returns the quantile estimate corresponding to `fraction` from a quantiles sketch. `expr` must return a quantiles sketch.| +|`DS_GET_QUANTILES(expr, fraction0, fraction1, ...)`|Returns a string representing an array of quantile estimates corresponding to a list of fractions from a quantiles sketch. `expr` must return a quantiles sketch.| +|`DS_HISTOGRAM(expr, splitPoint0, splitPoint1, ...)`|Returns a string representing an approximation to the histogram given a list of split points that define the histogram bins from a quantiles sketch. `expr` must return a quantiles sketch.| +|`DS_CDF(expr, splitPoint0, splitPoint1, ...)`|Returns a string representing approximation to the Cumulative Distribution Function given a list of split points that define the edges of the bins from a quantiles sketch. `expr` must return a quantiles sketch.| +|`DS_RANK(expr, value)`|Returns an approximation to the rank of a given value that is the fraction of the distribution less than that value from a quantiles sketch. `expr` must return a quantiles sketch.| +|`DS_QUANTILE_SUMMARY(expr)`|Returns a string summary of a quantiles sketch, useful for debugging. 
`expr` must return a quantiles sketch.| + +### Other scalar functions + +|Function|Notes| +|--------|-----| +|`CAST(value AS TYPE)`|Cast value to another type. See [Data types](#data-types) for details about how Druid SQL handles CAST.| +|`CASE expr WHEN value1 THEN result1 \[ WHEN value2 THEN result2 ... \] \[ ELSE resultN \] END`|Simple CASE.| +|`CASE WHEN boolean_expr1 THEN result1 \[ WHEN boolean_expr2 THEN result2 ... \] \[ ELSE resultN \] END`|Searched CASE.| +|`NULLIF(value1, value2)`|Returns NULL if value1 and value2 match, else returns value1.| +|`COALESCE(value1, value2, ...)`|Returns the first value that is neither NULL nor empty string.| +|`NVL(expr,expr-for-null)`|Returns 'expr-for-null' if 'expr' is null (or empty string for string type).| +|`BLOOM_FILTER_TEST(, )`|Returns true if the value is contained in a Base64-serialized bloom filter. See the [Bloom filter extension](../development/extensions-core/bloom-filter.md) documentation for additional details.| + +## Multi-value string functions + +All 'array' references in the multi-value string function documentation can refer to multi-value string columns or +`ARRAY` literals. + +|Function|Notes| +|--------|-----| +| `ARRAY[expr1,expr ...]` | constructs a SQL ARRAY literal from the expression arguments, using the type of the first argument as the output array type | +| `MV_LENGTH(arr)` | returns length of array expression | +| `MV_OFFSET(arr,long)` | returns the array element at the 0 based index supplied, or null for an out of range index| +| `MV_ORDINAL(arr,long)` | returns the array element at the 1 based index supplied, or null for an out of range index | +| `MV_CONTAINS(arr,expr)` | returns 1 if the array contains the element specified by expr, or contains all elements specified by expr if expr is an array, else 0 | +| `MV_OVERLAP(arr1,arr2)` | returns 1 if arr1 and arr2 have any elements in common, else 0 | +| `MV_OFFSET_OF(arr,expr)` | returns the 0 based index of the first occurrence of expr in the array, or `-1` or `null` if `druid.generic.useDefaultValueForNull=false` if no matching elements exist in the array. | +| `MV_ORDINAL_OF(arr,expr)` | returns the 1 based index of the first occurrence of expr in the array, or `-1` or `null` if `druid.generic.useDefaultValueForNull=false` if no matching elements exist in the array. | +| `MV_PREPEND(expr,arr)` | adds expr to arr at the beginning, the resulting array type determined by the type of the array | +| `MV_APPEND(arr1,expr)` | appends expr to arr, the resulting array type determined by the type of the first array | +| `MV_CONCAT(arr1,arr2)` | concatenates 2 arrays, the resulting array type determined by the type of the first array | +| `MV_SLICE(arr,start,end)` | return the subarray of arr from the 0 based index start(inclusive) to end(exclusive), or `null`, if start is less than 0, greater than length of arr or less than end| +| `MV_TO_STRING(arr,str)` | joins all elements of arr by the delimiter specified by str | +| `STRING_TO_MV(str1,str2)` | splits str1 into an array on the delimiter specified by str2 | + +## Query translation + +Druid SQL translates SQL queries to [native queries](querying.md) before running them, and understanding how this +translation works is key to getting good performance. + +### Best practices + +Consider this (non-exhaustive) list of things to look out for when looking into the performance implications of +how your SQL queries are translated to native queries. + +1. 
If you wrote a filter on the primary time column `__time`, make sure it is being correctly translated to an +`"intervals"` filter, as described in the [Time filters](#time-filters) section below. If not, you may need to change +the way you write the filter. + +2. Try to avoid subqueries underneath joins: they affect both performance and scalability. This includes implicit +subqueries generated by conditions on mismatched types, and implicit subqueries generated by conditions that use +expressions to refer to the right-hand side. + +3. Currently, Druid does not support pushing down predicates (condition and filter) past a Join (i.e. into +Join's children). Druid only supports pushing predicates into the join if they originated from +above the join. Hence, the location of predicates and filters in your Druid SQL is very important. +Also, as a result of this, comma joins should be avoided. + +4. Read through the [Query execution](query-execution.md) page to understand how various types of native queries +will be executed. + +5. Be careful when interpreting EXPLAIN PLAN output, and use request logging if in doubt. Request logs will show the +exact native query that was run. See the [next section](#interpreting-explain-plan-output) for more details. + +6. If you encounter a query that could be planned better, feel free to +[raise an issue on GitHub](https://github.com/apache/druid/issues/new/choose). A reproducible test case is always +appreciated. + +### Interpreting EXPLAIN PLAN output + +The [EXPLAIN PLAN](#explain-plan) functionality can help you understand how a given SQL query will +be translated to native. For simple queries that do not involve subqueries or joins, the output of EXPLAIN PLAN +is easy to interpret. The native query that will run is embedded as JSON inside a "DruidQueryRel" line: + +``` +> EXPLAIN PLAN FOR SELECT COUNT(*) FROM wikipedia + +DruidQueryRel(query=[{"queryType":"timeseries","dataSource":"wikipedia","intervals":"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z","granularity":"all","aggregations":[{"type":"count","name":"a0"}]}], signature=[{a0:LONG}]) +``` + +For more complex queries that do involve subqueries or joins, EXPLAIN PLAN is somewhat more difficult to interpret. 
+For example, consider this query: + +``` +> EXPLAIN PLAN FOR +> SELECT +> channel, +> COUNT(*) +> FROM wikipedia +> WHERE channel IN (SELECT page FROM wikipedia GROUP BY page ORDER BY COUNT(*) DESC LIMIT 10) +> GROUP BY channel + +DruidJoinQueryRel(condition=[=($1, $3)], joinType=[inner], query=[{"queryType":"groupBy","dataSource":{"type":"table","name":"__join__"},"intervals":{"type":"intervals","intervals":["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"]},"granularity":"all","dimensions":["channel"],"aggregations":[{"type":"count","name":"a0"}]}], signature=[{d0:STRING, a0:LONG}]) + DruidQueryRel(query=[{"queryType":"scan","dataSource":{"type":"table","name":"wikipedia"},"intervals":{"type":"intervals","intervals":["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"]},"resultFormat":"compactedList","columns":["__time","channel","page"],"granularity":"all"}], signature=[{__time:LONG, channel:STRING, page:STRING}]) + DruidQueryRel(query=[{"queryType":"topN","dataSource":{"type":"table","name":"wikipedia"},"dimension":"page","metric":{"type":"numeric","metric":"a0"},"threshold":10,"intervals":{"type":"intervals","intervals":["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"]},"granularity":"all","aggregations":[{"type":"count","name":"a0"}]}], signature=[{d0:STRING}]) +``` + +Here, there is a join with two inputs. The way to read this is to consider each line of the EXPLAIN PLAN output as +something that might become a query, or might just become a simple datasource. The `query` field they all have is +called a "partial query" and represents what query would be run on the datasource represented by that line, if that +line ran by itself. In some cases — like the "scan" query in the second line of this example — the query does not +actually run, and it ends up being translated to a simple table datasource. See the [Join translation](#joins) section +for more details about how this works. + +We can see this for ourselves using Druid's [request logging](../configuration/index.md#request-logging) feature. After +enabling logging and running this query, we can see that it actually runs as the following native query. + +```json +{ + "queryType": "groupBy", + "dataSource": { + "type": "join", + "left": "wikipedia", + "right": { + "type": "query", + "query": { + "queryType": "topN", + "dataSource": "wikipedia", + "dimension": {"type": "default", "dimension": "page", "outputName": "d0"}, + "metric": {"type": "numeric", "metric": "a0"}, + "threshold": 10, + "intervals": "-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z", + "granularity": "all", + "aggregations": [ + { "type": "count", "name": "a0"} + ] + } + }, + "rightPrefix": "j0.", + "condition": "(\"page\" == \"j0.d0\")", + "joinType": "INNER" + }, + "intervals": "-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z", + "granularity": "all", + "dimensions": [ + {"type": "default", "dimension": "channel", "outputName": "d0"} + ], + "aggregations": [ + { "type": "count", "name": "a0"} + ] +} +``` + +### Query types + +Druid SQL uses four different native query types. + +- [Scan](scan-query.md) is used for queries that do not aggregate (no GROUP BY, no DISTINCT). + +- [Timeseries](timeseriesquery.md) is used for queries that GROUP BY `FLOOR(__time TO )` or `TIME_FLOOR(__time, +period)`, have no other grouping expressions, no HAVING or LIMIT clauses, no nesting, and either no ORDER BY, or an +ORDER BY that orders by same expression as present in GROUP BY. 
It also uses Timeseries for "grand total" queries that +have aggregation functions but no GROUP BY. This query type takes advantage of the fact that Druid segments are sorted +by time. + +- [TopN](topnquery.md) is used by default for queries that group by a single expression, do have ORDER BY and LIMIT +clauses, do not have HAVING clauses, and are not nested. However, the TopN query type will deliver approximate ranking +and results in some cases; if you want to avoid this, set "useApproximateTopN" to "false". TopN results are always +computed in memory. See the TopN documentation for more details. + +- [GroupBy](groupbyquery.md) is used for all other aggregations, including any nested aggregation queries. Druid's +GroupBy is a traditional aggregation engine: it delivers exact results and rankings and supports a wide variety of +features. GroupBy aggregates in memory if it can, but it may spill to disk if it doesn't have enough memory to complete +your query. Results are streamed back from data processes through the Broker if you ORDER BY the same expressions in your +GROUP BY clause, or if you don't have an ORDER BY at all. If your query has an ORDER BY referencing expressions that +don't appear in the GROUP BY clause (like aggregation functions) then the Broker will materialize a list of results in +memory, up to a max of your LIMIT, if any. See the GroupBy documentation for details about tuning performance and memory +use. + +### Time filters + +For all native query types, filters on the `__time` column will be translated into top-level query "intervals" whenever +possible, which allows Druid to use its global time index to quickly prune the set of data that must be scanned. +Consider this (non-exhaustive) list of time filters that will be recognized and translated to "intervals": + +- `__time >= TIMESTAMP '2000-01-01 00:00:00'` (comparison to absolute time) +- `__time >= CURRENT_TIMESTAMP - INTERVAL '8' HOUR` (comparison to relative time) +- `FLOOR(__time TO DAY) = TIMESTAMP '2000-01-01 00:00:00'` (specific day) + +Refer to the [Interpreting EXPLAIN PLAN output](#interpreting-explain-plan-output) section for details on confirming +that time filters are being translated as you expect. + +### Joins + +SQL join operators are translated to native join datasources as follows: + +1. Joins that the native layer can handle directly are translated literally, to a [join datasource](datasource.md#join) +whose `left`, `right`, and `condition` are faithful translations of the original SQL. This includes any SQL join where +the right-hand side is a lookup or subquery, and where the condition is an equality where one side is an expression based +on the left-hand table, the other side is a simple column reference to the right-hand table, and both sides of the +equality are the same data type. + +2. If a join cannot be handled directly by a native [join datasource](datasource.md#join) as written, Druid SQL +will insert subqueries to make it runnable. For example, `foo INNER JOIN bar ON foo.abc = LOWER(bar.def)` cannot be +directly translated, because there is an expression on the right-hand side instead of a simple column access. A subquery +will be inserted that effectively transforms this clause to +`foo INNER JOIN (SELECT LOWER(def) AS def FROM bar) t ON foo.abc = t.def`. + +3. Druid SQL does not currently reorder joins to optimize queries. 
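+
+As a concrete illustration of rule 1, a query along the following lines can be translated literally to a native join datasource, because the right-hand side is a lookup and the condition is a simple equality between columns of the same type. This is only a sketch: it assumes a lookup named `country_lookup` with the standard `k`/`v` columns and a `countryIsoCode` column on the `wikipedia` datasource.
+
+```sql
+-- Hypothetical lookup and column names, shown only to illustrate direct join translation.
+SELECT lookup_side.v AS country_name, COUNT(*) AS edits
+FROM wikipedia
+INNER JOIN lookup.country_lookup AS lookup_side
+  ON wikipedia.countryIsoCode = lookup_side.k
+GROUP BY lookup_side.v
+```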
+ +Refer to the [Interpreting EXPLAIN PLAN output](#interpreting-explain-plan-output) section for details on confirming +that joins are being translated as you expect. + +Refer to the [Query execution](query-execution.md#join) page for information about how joins are executed. + +### Subqueries + +Subqueries in SQL are generally translated to native query datasources. Refer to the +[Query execution](query-execution.md#query) page for information about how subqueries are executed. + +> Note: Subqueries in the WHERE clause, like `WHERE col1 IN (SELECT foo FROM ...)` are translated to inner joins. + +### Approximations + +Druid SQL will use approximate algorithms in some situations: + +- The `COUNT(DISTINCT col)` aggregation functions by default uses a variant of +[HyperLogLog](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf), a fast approximate distinct counting +algorithm. Druid SQL will switch to exact distinct counts if you set "useApproximateCountDistinct" to "false", either +through query context or through Broker configuration. + +- GROUP BY queries over a single column with ORDER BY and LIMIT may be executed using the TopN engine, which uses an +approximate algorithm. Druid SQL will switch to an exact grouping algorithm if you set "useApproximateTopN" to "false", +either through query context or through Broker configuration. + +- Aggregation functions that are labeled as using sketches or approximations, such as APPROX_COUNT_DISTINCT, are always +approximate, regardless of configuration. + +### Unsupported features + +Druid does not support all SQL features. In particular, the following features are not supported. + +- JOIN between native datasources (table, lookup, subquery) and [system tables](#metadata-tables). +- JOIN conditions that are not an equality between expressions from the left- and right-hand sides. +- JOIN conditions containing a constant value inside the condition. +- JOIN conditions on a column which contains a multi-value dimension. +- OVER clauses, and analytic functions such as `LAG` and `LEAD`. +- ORDER BY for a non-aggregating query, except for `ORDER BY __time` or `ORDER BY __time DESC`, which are supported. + This restriction only applies to non-aggregating queries; you can ORDER BY any column in an aggregating query. +- DDL and DML. +- Using Druid-specific functions like `TIME_PARSE` and `APPROX_QUANTILE_DS` on [system tables](#metadata-tables). + +Additionally, some Druid native query features are not supported by the SQL language. Some unsupported Druid features +include: + +- [Inline datasources](datasource.md#inline). +- [Spatial filters](../development/geo.md). +- [Query cancellation](querying.md#query-cancellation). +- [Multi-value dimensions](#multi-value-strings) are only partially implemented in Druid SQL. There are known +inconsistencies between their behavior in SQL queries and in native queries due to how they are currently treated by +the SQL planner. + +## Client APIs + + + +### HTTP POST + +You can make Druid SQL queries using HTTP via POST to the endpoint `/druid/v2/sql/`. The request should +be a JSON object with a "query" field, like `{"query" : "SELECT COUNT(*) FROM data_source WHERE foo = 'bar'"}`. + +##### Request + +|Property|Description|Default| +|--------|----|-----------| +|`query`|SQL query string.| none (required)| +|`resultFormat`|Format of query results. See [Responses](#responses) for details.|`"object"`| +|`header`|Whether or not to include a header. 
See [Responses] for details.|`false`| +|`context`|JSON object containing [connection context parameters](#connection-context).|`{}` (empty)| +|`parameters`|List of query parameters for parameterized queries. Each parameter in the list should be a JSON object like `{"type": "VARCHAR", "value": "foo"}`. The type should be a SQL type; see [Data types](#data-types) for a list of supported SQL types.|`[]` (empty)| + +You can use _curl_ to send SQL queries from the command-line: + +```bash +$ cat query.json +{"query":"SELECT COUNT(*) AS TheCount FROM data_source"} + +$ curl -XPOST -H'Content-Type: application/json' http://BROKER:8082/druid/v2/sql/ -d @query.json +[{"TheCount":24433}] +``` + +There are a variety of [connection context parameters](#connection-context) you can provide by adding a "context" map, +like: + +```json +{ + "query" : "SELECT COUNT(*) FROM data_source WHERE foo = 'bar' AND __time > TIMESTAMP '2000-01-01 00:00:00'", + "context" : { + "sqlTimeZone" : "America/Los_Angeles" + } +} +``` + +Parameterized SQL queries are also supported: + +```json +{ + "query" : "SELECT COUNT(*) FROM data_source WHERE foo = ? AND __time > ?", + "parameters": [ + { "type": "VARCHAR", "value": "bar"}, + { "type": "TIMESTAMP", "value": "2000-01-01 00:00:00" } + ] +} +``` + +Metadata is available over HTTP POST by querying [metadata tables](#metadata-tables). + +#### Responses + +Druid SQL's HTTP POST API supports a variety of result formats. You can specify these by adding a "resultFormat" +parameter, like: + +```json +{ + "query" : "SELECT COUNT(*) FROM data_source WHERE foo = 'bar' AND __time > TIMESTAMP '2000-01-01 00:00:00'", + "resultFormat" : "object" +} +``` + +The supported result formats are: + +|Format|Description|Content-Type| +|------|-----------|------------| +|`object`|The default, a JSON array of JSON objects. Each object's field names match the columns returned by the SQL query, and are provided in the same order as the SQL query.|application/json| +|`array`|JSON array of JSON arrays. Each inner array has elements matching the columns returned by the SQL query, in order.|application/json| +|`objectLines`|Like "object", but the JSON objects are separated by newlines instead of being wrapped in a JSON array. This can make it easier to parse the entire response set as a stream, if you do not have ready access to a streaming JSON parser. To make it possible to detect a truncated response, this format includes a trailer of one blank line.|text/plain| +|`arrayLines`|Like "array", but the JSON arrays are separated by newlines instead of being wrapped in a JSON array. This can make it easier to parse the entire response set as a stream, if you do not have ready access to a streaming JSON parser. To make it possible to detect a truncated response, this format includes a trailer of one blank line.|text/plain| +|`csv`|Comma-separated values, with one row per line. Individual field values may be escaped by being surrounded in double quotes. If double quotes appear in a field value, they will be escaped by replacing them with double-double-quotes like `""this""`. 
To make it possible to detect a truncated response, this format includes a trailer of one blank line.|text/csv| + +You can additionally request a header by setting "header" to true in your request, like: + +```json +{ + "query" : "SELECT COUNT(*) FROM data_source WHERE foo = 'bar' AND __time > TIMESTAMP '2000-01-01 00:00:00'", + "resultFormat" : "arrayLines", + "header" : true +} +``` + +In this case, the first result returned will be a header. For the `csv`, `array`, and `arrayLines` formats, the header +will be a list of column names. For the `object` and `objectLines` formats, the header will be an object where the +keys are column names, and the values are null. + +Errors that occur before the response body is sent will be reported in JSON, with an HTTP 500 status code, in the +same format as [native Druid query errors](../querying/querying.md#query-errors). If an error occurs while the response body is +being sent, at that point it is too late to change the HTTP status code or report a JSON error, so the response will +simply end midstream and an error will be logged by the Druid server that was handling your request. + +As a caller, it is important that you properly handle response truncation. This is easy for the "object" and "array" +formats, since truncated responses will be invalid JSON. For the line-oriented formats, you should check the +trailer they all include: one blank line at the end of the result set. If you detect a truncated response, either +through a JSON parsing error or through a missing trailing newline, you should assume the response was not fully +delivered due to an error. + +### JDBC + +You can make Druid SQL queries using the [Avatica JDBC driver](https://calcite.apache.org/avatica/downloads/). We recommend using Avatica JDBC driver version 1.17.0 or later. Note that as of the time of this writing, Avatica 1.17.0, the latest version, does not support passing connection string parameters from the URL to Druid, so you must pass them using a `Properties` object. Once you've downloaded the Avatica client jar, add it to your classpath and use the connect string `jdbc:avatica:remote:url=http://BROKER:8082/druid/v2/sql/avatica/`. + +Example code: + +```java +// Connect to /druid/v2/sql/avatica/ on your Broker. +String url = "jdbc:avatica:remote:url=http://localhost:8082/druid/v2/sql/avatica/"; + +// Set any connection context parameters you need here (see "Connection context" below). +// Or leave empty for default behavior. +Properties connectionProperties = new Properties(); + +try (Connection connection = DriverManager.getConnection(url, connectionProperties)) { + try ( + final Statement statement = connection.createStatement(); + final ResultSet resultSet = statement.executeQuery(query) + ) { + while (resultSet.next()) { + // process result set + } + } +} +``` + +It is also possible to use a protocol buffers JDBC connection with Druid, this offer reduced bloat and potential performance +improvements for larger result sets. To use it apply the following connection url instead, everything else remains the same +``` +String url = "jdbc:avatica:remote:url=http://localhost:8082/druid/v2/sql/avatica-protobuf/;serialization=protobuf"; +``` + +> The protobuf endpoint is also known to work with the official [Golang Avatica driver](https://github.com/apache/calcite-avatica-go) + +Table metadata is available over JDBC using `connection.getMetaData()` or by querying the +["INFORMATION_SCHEMA" tables](#metadata-tables). 
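+
+For example, the table listing that `connection.getMetaData()` exposes over JDBC can also be retrieved with an ordinary SQL statement; a minimal sketch is shown below, and the available columns are described in the [TABLES table](#tables-table) section.
+
+```sql
+-- Lists every table visible to the connection, including lookup and system tables.
+SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE
+FROM INFORMATION_SCHEMA.TABLES
+```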

#### Connection stickiness

Druid's JDBC server does not share connection state between Brokers. This means that if you're using JDBC and have
multiple Druid Brokers, you should either connect to a specific Broker, or use a load balancer with sticky sessions
enabled. The Druid Router process provides connection stickiness when balancing JDBC requests, and can be used to achieve
the necessary stickiness even with a normal non-sticky load balancer. Please see the
[Router](../design/router.md) documentation for more details.

Note that the non-JDBC [JSON over HTTP](#http-post) API is stateless and does not require stickiness.

### Dynamic Parameters

You can also use parameterized queries in JDBC code, as in this example:

```java
PreparedStatement statement = connection.prepareStatement("SELECT COUNT(*) AS cnt FROM druid.foo WHERE dim1 = ? OR dim1 = ?");
statement.setString(1, "abc");
statement.setString(2, "def");
final ResultSet resultSet = statement.executeQuery();
```

### Connection context

Druid SQL supports setting connection parameters on the client. The parameters in the table below affect SQL planning.
All other context parameters you provide will be attached to Druid queries and can affect how they run. See
[Query context](query-context.md) for details on the possible options.

```java
String url = "jdbc:avatica:remote:url=http://localhost:8082/druid/v2/sql/avatica/";

// Set any query context parameters you need here.
Properties connectionProperties = new Properties();
connectionProperties.setProperty("sqlTimeZone", "America/Los_Angeles");
connectionProperties.setProperty("useCache", "false");

try (Connection connection = DriverManager.getConnection(url, connectionProperties)) {
  // create and execute statements, process result sets, etc
}
```

Note that to specify a unique identifier for a SQL query, use `sqlQueryId` instead of `queryId`. Setting `queryId` for a SQL
request has no effect; all native queries underlying the SQL query will use an auto-generated `queryId`.

Connection context can be specified as JDBC connection properties or as a "context" object in the JSON API.

|Parameter|Description|Default value|
|---------|-----------|-------------|
|`sqlQueryId`|Unique identifier given to this SQL query. For the HTTP client, it will be returned in the `X-Druid-SQL-Query-Id` header.|auto-generated|
|`sqlTimeZone`|Sets the time zone for this connection, which will affect how time functions and timestamp literals behave. Should be a time zone name like "America/Los_Angeles" or offset like "-08:00".|druid.sql.planner.sqlTimeZone on the Broker (default: UTC)|
|`sqlStringifyArrays`|When set to true, result columns which return array values will be serialized into a JSON string in the response instead of as an array.|true, except for JDBC connections, where it is always false|
|`useApproximateCountDistinct`|Whether to use an approximate cardinality algorithm for `COUNT(DISTINCT foo)`.|druid.sql.planner.useApproximateCountDistinct on the Broker (default: true)|
|`useGroupingSetForExactDistinct`|Whether to use grouping sets to execute queries with multiple exact distinct aggregations.|druid.sql.planner.useGroupingSetForExactDistinct on the Broker (default: false)|
|`useApproximateTopN`|Whether to use approximate [TopN queries](topnquery.md) when a SQL query could be expressed as such. If false, exact [GroupBy queries](groupbyquery.md) will be used instead.|druid.sql.planner.useApproximateTopN on the Broker (default: true)|

## Metadata tables

Druid Brokers infer table and column metadata for each datasource from segments loaded in the cluster, and use this to
plan SQL queries. This metadata is cached on Broker startup and also updated periodically in the background through
[SegmentMetadata queries](segmentmetadataquery.md). Background metadata refreshing is triggered by
segments entering and exiting the cluster, and can also be throttled through configuration.

Druid exposes system information through special system tables. There are two such schemas available: INFORMATION_SCHEMA and sys.
The INFORMATION_SCHEMA schema provides details about table and column types. The "sys" schema provides information about Druid internals such as segments, tasks, and servers.

### INFORMATION SCHEMA

You can access table and column metadata through JDBC using `connection.getMetaData()`, or through the
INFORMATION_SCHEMA tables described below. For example, to retrieve metadata for the Druid
datasource "foo", use the query:

```sql
SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = 'druid' AND TABLE_NAME = 'foo'
```

> Note: INFORMATION_SCHEMA tables do not currently support Druid-specific functions like `TIME_PARSE` and
> `APPROX_QUANTILE_DS`. Only standard SQL functions can be used.

#### SCHEMATA table
`INFORMATION_SCHEMA.SCHEMATA` provides a list of all known schemas, which include `druid` for standard [Druid Table datasources](datasource.md#table), `lookup` for [Lookups](datasource.md#lookup), `sys` for the virtual [System metadata tables](#system-schema), and `INFORMATION_SCHEMA` for these virtual tables. Tables are allowed to have the same name across different schemas, so the schema may be included in an SQL statement to distinguish them, e.g. `lookup.table` vs `druid.table`.

|Column|Notes|
|------|-----|
|CATALOG_NAME|Always set as `druid`|
|SCHEMA_NAME|`druid`, `lookup`, `sys`, or `INFORMATION_SCHEMA`|
|SCHEMA_OWNER|Unused|
|DEFAULT_CHARACTER_SET_CATALOG|Unused|
|DEFAULT_CHARACTER_SET_SCHEMA|Unused|
|DEFAULT_CHARACTER_SET_NAME|Unused|
|SQL_PATH|Unused|

#### TABLES table
`INFORMATION_SCHEMA.TABLES` provides a list of all known tables and schemas.

|Column|Notes|
|------|-----|
|TABLE_CATALOG|Always set as `druid`|
|TABLE_SCHEMA|The 'schema' which the table falls under, see [SCHEMATA table for details](#schemata-table)|
|TABLE_NAME|Table name. For the `druid` schema, this is the `dataSource`.|
|TABLE_TYPE|"TABLE" or "SYSTEM_TABLE"|
|IS_JOINABLE|If a table is directly joinable when it appears on the right hand side of a `JOIN` statement, without performing a subquery, this value will be set to `YES`, otherwise `NO`. Lookups are always joinable because they are globally distributed among Druid query processing nodes, but Druid datasources are not, and will use a less efficient subquery join.|
|IS_BROADCAST|If a table is 'broadcast' and distributed among all Druid query processing nodes, this value will be set to `YES`, such as lookups and Druid datasources which have a 'broadcast' load rule, else `NO`.|

#### COLUMNS table
`INFORMATION_SCHEMA.COLUMNS` provides a list of all known columns across all tables and schemas.
+ +|Column|Notes| +|------|-----| +|TABLE_CATALOG|Always set as `druid`| +|TABLE_SCHEMA|The 'schema' which the table column falls under, see [SCHEMATA table for details](#schemata-table)| +|TABLE_NAME|The 'table' which the column belongs to, see [TABLES table for details](#tables-table)| +|COLUMN_NAME|The column name| +|ORDINAL_POSITION|The order in which the column is stored in a table| +|COLUMN_DEFAULT|Unused| +|IS_NULLABLE|| +|DATA_TYPE|| +|CHARACTER_MAXIMUM_LENGTH|Unused| +|CHARACTER_OCTET_LENGTH|Unused| +|NUMERIC_PRECISION|| +|NUMERIC_PRECISION_RADIX|| +|NUMERIC_SCALE|| +|DATETIME_PRECISION|| +|CHARACTER_SET_NAME|| +|COLLATION_NAME|| +|JDBC_TYPE|Type code from java.sql.Types (Druid extension)| + +### SYSTEM SCHEMA + +The "sys" schema provides visibility into Druid segments, servers and tasks. + +> Note: "sys" tables do not currently support Druid-specific functions like `TIME_PARSE` and +> `APPROX_QUANTILE_DS`. Only standard SQL functions can be used. + +#### SEGMENTS table + +Segments table provides details on all Druid segments, whether they are published yet or not. + +|Column|Type|Notes| +|------|-----|-----| +|segment_id|STRING|Unique segment identifier| +|datasource|STRING|Name of datasource| +|start|STRING|Interval start time (in ISO 8601 format)| +|end|STRING|Interval end time (in ISO 8601 format)| +|size|LONG|Size of segment in bytes| +|version|STRING|Version string (generally an ISO8601 timestamp corresponding to when the segment set was first started). Higher version means the more recently created segment. Version comparing is based on string comparison.| +|partition_num|LONG|Partition number (an integer, unique within a datasource+interval+version; may not necessarily be contiguous)| +|num_replicas|LONG|Number of replicas of this segment currently being served| +|num_rows|LONG|Number of rows in current segment, this value could be null if unknown to Broker at query time| +|is_published|LONG|Boolean is represented as long type where 1 = true, 0 = false. 1 represents this segment has been published to the metadata store with `used=1`. See the [Architecture page](../design/architecture.md#segment-lifecycle) for more details.| +|is_available|LONG|Boolean is represented as long type where 1 = true, 0 = false. 1 if this segment is currently being served by any process(Historical or realtime). See the [Architecture page](../design/architecture.md#segment-lifecycle) for more details.| +|is_realtime|LONG|Boolean is represented as long type where 1 = true, 0 = false. 1 if this segment is _only_ served by realtime tasks, and 0 if any historical process is serving this segment.| +|is_overshadowed|LONG|Boolean is represented as long type where 1 = true, 0 = false. 1 if this segment is published and is _fully_ overshadowed by some other published segments. Currently, is_overshadowed is always false for unpublished segments, although this may change in the future. You can filter for segments that "should be published" by filtering for `is_published = 1 AND is_overshadowed = 0`. Segments can briefly be both published and overshadowed if they were recently replaced, but have not been unpublished yet. 
See the [Architecture page](../design/architecture.md#segment-lifecycle) for more details.|
|shard_spec|STRING|JSON-serialized form of the segment `ShardSpec`|
|dimensions|STRING|JSON-serialized form of the segment dimensions|
|metrics|STRING|JSON-serialized form of the segment metrics|
|last_compaction_state|STRING|JSON-serialized form of the compaction task's config (the compaction task which created this segment). May be null if the segment was not created by a compaction task.|

For example, to retrieve all segments for datasource "wikipedia", use the query:

```sql
SELECT * FROM sys.segments WHERE datasource = 'wikipedia'
```

Another example, to retrieve the total_size, avg_size, avg_num_rows, and num_segments per datasource:

```sql
SELECT
    datasource,
    SUM("size") AS total_size,
    CASE WHEN SUM("size") = 0 THEN 0 ELSE SUM("size") / (COUNT(*) FILTER(WHERE "size" > 0)) END AS avg_size,
    CASE WHEN SUM(num_rows) = 0 THEN 0 ELSE SUM("num_rows") / (COUNT(*) FILTER(WHERE num_rows > 0)) END AS avg_num_rows,
    COUNT(*) AS num_segments
FROM sys.segments
GROUP BY 1
ORDER BY 2 DESC
```

If you want to retrieve segments that were compacted (by any compaction):

```sql
SELECT * FROM sys.segments WHERE last_compaction_state is not null
```

or, if you want to retrieve segments that were compacted only by a particular compaction spec (such as that of the auto compaction):

```sql
SELECT * FROM sys.segments WHERE last_compaction_state = 'CompactionState{partitionsSpec=DynamicPartitionsSpec{maxRowsPerSegment=5000000, maxTotalRows=9223372036854775807}, indexSpec={bitmap={type=roaring, compressRunOnSerialization=true}, dimensionCompression=lz4, metricCompression=lz4, longEncoding=longs, segmentLoader=null}}'
```

*Caveat:* Note that a segment can be served by more than one stream ingestion task or Historical process, in which case it has multiple replicas. These replicas are weakly consistent with each other while served by multiple ingestion tasks; once a segment is eventually served by a Historical, it is immutable. The Broker prefers to query a segment from a Historical over an ingestion task. But if a segment has multiple realtime replicas (for example, Kafka index tasks) and one task is slower than another, the sys.segments query results can vary for the duration of the tasks, because only one of the ingestion tasks is queried by the Broker and it is not guaranteed that the same task gets picked every time. The `num_rows` column of the segments table can have inconsistent values during this period. There is an open [issue](https://github.com/apache/druid/issues/5915) about this inconsistency with stream ingestion tasks.

#### SERVERS table

The servers table lists all discovered servers in the cluster.

|Column|Type|Notes|
|------|-----|-----|
|server|STRING|Server name in the form host:port|
|host|STRING|Hostname of the server|
|plaintext_port|LONG|Unsecured port of the server, or -1 if plaintext traffic is disabled|
|tls_port|LONG|TLS port of the server, or -1 if TLS is disabled|
|server_type|STRING|Type of Druid service. Possible values include: COORDINATOR, OVERLORD, BROKER, ROUTER, HISTORICAL, MIDDLE_MANAGER or PEON.|
|tier|STRING|Distribution tier, see [druid.server.tier](../configuration/index.md#historical-general-configuration). Only valid for HISTORICAL type; for other types it's null|
|current_size|LONG|Current size of segments in bytes on this server.
Only valid for HISTORICAL type, for other types it's 0| +|max_size|LONG|Max size in bytes this server recommends to assign to segments see [druid.server.maxSize](../configuration/index.md#historical-general-configuration). Only valid for HISTORICAL type, for other types it's 0| +|is_leader|LONG|1 if the server is currently the 'leader' (for services which have the concept of leadership), otherwise 0 if the server is not the leader, or the default long value (0 or null depending on `druid.generic.useDefaultValueForNull`) if the server type does not have the concept of leadership| + +To retrieve information about all servers, use the query: + +```sql +SELECT * FROM sys.servers; +``` + +#### SERVER_SEGMENTS table + +SERVER_SEGMENTS is used to join servers with segments table + +|Column|Type|Notes| +|------|-----|-----| +|server|STRING|Server name in format host:port (Primary key of [servers table](#servers-table))| +|segment_id|STRING|Segment identifier (Primary key of [segments table](#segments-table))| + +JOIN between "servers" and "segments" can be used to query the number of segments for a specific datasource, +grouped by server, example query: + +```sql +SELECT count(segments.segment_id) as num_segments from sys.segments as segments +INNER JOIN sys.server_segments as server_segments +ON segments.segment_id = server_segments.segment_id +INNER JOIN sys.servers as servers +ON servers.server = server_segments.server +WHERE segments.datasource = 'wikipedia' +GROUP BY servers.server; +``` + +#### TASKS table + +The tasks table provides information about active and recently-completed indexing tasks. For more information +check out the documentation for [ingestion tasks](../ingestion/tasks.md). + +|Column|Type|Notes| +|------|-----|-----| +|task_id|STRING|Unique task identifier| +|group_id|STRING|Task group ID for this task, the value depends on the task `type`. For example, for native index tasks, it's same as `task_id`, for sub tasks, this value is the parent task's ID| +|type|STRING|Task type, for example this value is "index" for indexing tasks. See [tasks-overview](../ingestion/tasks.md)| +|datasource|STRING|Datasource name being indexed| +|created_time|STRING|Timestamp in ISO8601 format corresponding to when the ingestion task was created. Note that this value is populated for completed and waiting tasks. For running and pending tasks this value is set to 1970-01-01T00:00:00Z| +|queue_insertion_time|STRING|Timestamp in ISO8601 format corresponding to when this task was added to the queue on the Overlord| +|status|STRING|Status of a task can be RUNNING, FAILED, SUCCESS| +|runner_status|STRING|Runner status of a completed task would be NONE, for in-progress tasks this can be RUNNING, WAITING, PENDING| +|duration|LONG|Time it took to finish the task in milliseconds, this value is present only for completed tasks| +|location|STRING|Server name where this task is running in the format host:port, this information is present only for RUNNING tasks| +|host|STRING|Hostname of the server where task is running| +|plaintext_port|LONG|Unsecured port of the server, or -1 if plaintext traffic is disabled| +|tls_port|LONG|TLS port of the server, or -1 if TLS is disabled| +|error_msg|STRING|Detailed error message in case of FAILED tasks| + +For example, to retrieve tasks information filtered by status, use the query + +```sql +SELECT * FROM sys.tasks WHERE status='FAILED'; +``` + +#### SUPERVISORS table + +The supervisors table provides information about supervisors. 
+ +|Column|Type|Notes| +|------|-----|-----| +|supervisor_id|STRING|Supervisor task identifier| +|state|STRING|Basic state of the supervisor. Available states: `UNHEALTHY_SUPERVISOR`, `UNHEALTHY_TASKS`, `PENDING`, `RUNNING`, `SUSPENDED`, `STOPPING`. Check [Kafka Docs](../development/extensions-core/kafka-ingestion.md#operations) for details.| +|detailed_state|STRING|Supervisor specific state. (See documentation of the specific supervisor for details, e.g. [Kafka](../development/extensions-core/kafka-ingestion.md) or [Kinesis](../development/extensions-core/kinesis-ingestion.md))| +|healthy|LONG|Boolean represented as long type where 1 = true, 0 = false. 1 indicates a healthy supervisor| +|type|STRING|Type of supervisor, e.g. `kafka`, `kinesis` or `materialized_view`| +|source|STRING|Source of the supervisor, e.g. Kafka topic or Kinesis stream| +|suspended|LONG|Boolean represented as long type where 1 = true, 0 = false. 1 indicates supervisor is in suspended state| +|spec|STRING|JSON-serialized supervisor spec| + +For example, to retrieve supervisor tasks information filtered by health status, use the query + +```sql +SELECT * FROM sys.supervisors WHERE healthy=0; +``` + +## Server configuration + +Druid SQL planning occurs on the Broker and is configured by +[Broker runtime properties](../configuration/index.md#sql). + +## Security + +Please see [Defining SQL permissions](../operations/security-user-auth.md#sql-permissions) in the +basic security documentation for information on permissions needed for making SQL queries. diff --git a/querying/topnquery.md b/querying/topnquery.md new file mode 100644 index 0000000..f802785 --- /dev/null +++ b/querying/topnquery.md @@ -0,0 +1,261 @@ +--- +id: topnquery +title: "TopN queries" +sidebar_label: "TopN" +--- + + + +> Apache Druid supports two query languages: [Druid SQL](sql.md) and [native queries](querying.md). +> This document describes a query +> type in the native language. For information about when Druid SQL will use this query type, refer to the +> [SQL documentation](sql.md#query-types). + +Apache Druid TopN queries return a sorted set of results for the values in a given dimension according to some criteria. Conceptually, they can be thought of as an approximate [GroupByQuery](../querying/groupbyquery.md) over a single dimension with an [Ordering](../querying/limitspec.md) spec. TopNs are much faster and resource efficient than GroupBys for this use case. These types of queries take a topN query object and return an array of JSON objects where each object represents a value asked for by the topN query. + +TopNs are approximate in that each data process will rank their top K results and only return those top K results to the Broker. K, by default in Druid, is `max(1000, threshold)`. In practice, this means that if you ask for the top 1000 items ordered, the correctness of the first ~900 items will be 100%, and the ordering of the results after that is not guaranteed. TopNs can be made more accurate by increasing the threshold. 
+ +A topN query object looks like: + +```json +{ + "queryType": "topN", + "dataSource": "sample_data", + "dimension": "sample_dim", + "threshold": 5, + "metric": "count", + "granularity": "all", + "filter": { + "type": "and", + "fields": [ + { + "type": "selector", + "dimension": "dim1", + "value": "some_value" + }, + { + "type": "selector", + "dimension": "dim2", + "value": "some_other_val" + } + ] + }, + "aggregations": [ + { + "type": "longSum", + "name": "count", + "fieldName": "count" + }, + { + "type": "doubleSum", + "name": "some_metric", + "fieldName": "some_metric" + } + ], + "postAggregations": [ + { + "type": "arithmetic", + "name": "average", + "fn": "/", + "fields": [ + { + "type": "fieldAccess", + "name": "some_metric", + "fieldName": "some_metric" + }, + { + "type": "fieldAccess", + "name": "count", + "fieldName": "count" + } + ] + } + ], + "intervals": [ + "2013-08-31T00:00:00.000/2013-09-03T00:00:00.000" + ] +} +``` + +There are 11 parts to a topN query. + +|property|description|required?| +|--------|-----------|---------| +|queryType|This String should always be "topN"; this is the first thing Druid looks at to figure out how to interpret the query|yes| +|dataSource|A String or Object defining the data source to query, very similar to a table in a relational database. See [DataSource](../querying/datasource.md) for more information.|yes| +|intervals|A JSON Object representing ISO-8601 Intervals. This defines the time ranges to run the query over.|yes| +|granularity|Defines the granularity to bucket query results. See [Granularities](../querying/granularities.md)|yes| +|filter|See [Filters](../querying/filters.md)|no| +|aggregations|See [Aggregations](../querying/aggregations.md)|for numeric metricSpec, aggregations or postAggregations should be specified. Otherwise no.| +|postAggregations|See [Post Aggregations](../querying/post-aggregations.md)|for numeric metricSpec, aggregations or postAggregations should be specified. Otherwise no.| +|dimension|A String or JSON object defining the dimension that you want the top taken for. For more info, see [DimensionSpecs](../querying/dimensionspecs.md)|yes| +|threshold|An integer defining the N in the topN (i.e. how many results you want in the top list)|yes| +|metric|A String or JSON object specifying the metric to sort by for the top list. For more info, see [TopNMetricSpec](../querying/topnmetricspec.md).|yes| +|context|See [Context](../querying/query-context.md)|no| + +Please note the context JSON object is also available for topN queries and should be used with the same caution as the timeseries case. +The format of the results would look like so: + +```json +[ + { + "timestamp": "2013-08-31T00:00:00.000Z", + "result": [ + { + "dim1": "dim1_val", + "count": 111, + "some_metrics": 10669, + "average": 96.11711711711712 + }, + { + "dim1": "another_dim1_val", + "count": 88, + "some_metrics": 28344, + "average": 322.09090909090907 + }, + { + "dim1": "dim1_val3", + "count": 70, + "some_metrics": 871, + "average": 12.442857142857143 + }, + { + "dim1": "dim1_val4", + "count": 62, + "some_metrics": 815, + "average": 13.14516129032258 + }, + { + "dim1": "dim1_val5", + "count": 60, + "some_metrics": 2787, + "average": 46.45 + } + ] + } +] +``` + +## Behavior on multi-value dimensions + +topN queries can group on multi-value dimensions. When grouping on a multi-value dimension, _all_ values +from matching rows will be used to generate one group per value. It's possible for a query to return more groups than +there are rows. 
For example, a topN on the dimension `tags` with filter `"t1" AND "t3"` would match only row1, and
generate a result with three groups: `t1`, `t2`, and `t3`. If you only need to include values that match
your filter, you can use a [filtered dimensionSpec](dimensionspecs.md#filtered-dimensionspecs). This can also
improve performance.

See [Multi-value dimensions](multi-value-dimensions.md) for more details.

## Aliasing

The current TopN algorithm is an approximate algorithm. The top 1000 local results from each segment are returned for merging to determine the global topN. As such, the topN algorithm is approximate in both rank and results. Approximate results *ONLY APPLY WHEN THERE ARE MORE THAN 1000 DIM VALUES*. A topN over a dimension with fewer than 1000 unique dimension values can be considered accurate in rank and accurate in aggregates.

The threshold can be modified from its default 1000 via the server parameter `druid.query.topN.minTopNThreshold`, which requires a restart of the servers to take effect, or via `minTopNThreshold` in the query context, which takes effect per query.

If you want the top 100 of a high-cardinality, uniformly distributed dimension ordered by some low-cardinality, uniformly distributed dimension, you may get back aggregates that are missing data.

To put it another way, the best use cases for topN are when you can have confidence that the overall results are uniformly in the top. For example, if a particular site ID is in the top 10 for some metric for every hour of every day, then it will probably be accurate in the topN over multiple days. But if a site is barely in the top 1000 for any given hour, but over the whole query granularity is in the top 500 (for example, a site which gets highly uniform traffic co-mingling in the dataset with sites with highly periodic data), then a top500 query may not have that particular site at the exact rank, and may not be accurate for that particular site's aggregates.

Before continuing in this section, please consider whether you really need exact results. Getting exact results is a very resource-intensive process. For the vast majority of "useful" data results, an approximate topN algorithm supplies plenty of accuracy.

Users wishing to get an *exact rank and exact aggregates* topN over a dimension with greater than 1000 unique values should issue a groupBy query and sort the results themselves. This is very computationally expensive for high-cardinality dimensions.

Users who can tolerate *approximate rank* topN over a dimension with greater than 1000 unique values, but require *exact aggregates*, can issue two queries: one to get the approximate topN dimension values, and a second topN with dimension selection filters that only use the topN results of the first, as shown in the two example queries below.
+ +### Example First query + +```json +{ + "aggregations": [ + { + "fieldName": "L_QUANTITY_longSum", + "name": "L_QUANTITY_", + "type": "longSum" + } + ], + "dataSource": "tpch_year", + "dimension":"l_orderkey", + "granularity": "all", + "intervals": [ + "1900-01-09T00:00:00.000Z/2992-01-10T00:00:00.000Z" + ], + "metric": "L_QUANTITY_", + "queryType": "topN", + "threshold": 2 +} +``` + +### Example second query + +```json +{ + "aggregations": [ + { + "fieldName": "L_TAX_doubleSum", + "name": "L_TAX_", + "type": "doubleSum" + }, + { + "fieldName": "L_DISCOUNT_doubleSum", + "name": "L_DISCOUNT_", + "type": "doubleSum" + }, + { + "fieldName": "L_EXTENDEDPRICE_doubleSum", + "name": "L_EXTENDEDPRICE_", + "type": "doubleSum" + }, + { + "fieldName": "L_QUANTITY_longSum", + "name": "L_QUANTITY_", + "type": "longSum" + }, + { + "name": "count", + "type": "count" + } + ], + "dataSource": "tpch_year", + "dimension":"l_orderkey", + "filter": { + "fields": [ + { + "dimension": "l_orderkey", + "type": "selector", + "value": "103136" + }, + { + "dimension": "l_orderkey", + "type": "selector", + "value": "1648672" + } + ], + "type": "or" + }, + "granularity": "all", + "intervals": [ + "1900-01-09T00:00:00.000Z/2992-01-10T00:00:00.000Z" + ], + "metric": "L_QUANTITY_", + "queryType": "topN", + "threshold": 2 +} +```
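
Both example queries above are ordinary native query objects and can be submitted to the Broker's native query endpoint
(`POST /druid/v2/` with a JSON body). The sketch below illustrates that workflow from Java; the host and port match the
JDBC examples in the SQL documentation, and the `topn-query.json` file name is a placeholder holding one of the query
objects above. It requires Java 11+ for `java.net.http.HttpClient`.

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Files;
import java.nio.file.Path;

public class TopNQueryExample {
  public static void main(String[] args) throws Exception {
    // Load one of the example topN query objects above from a local file (placeholder name).
    String queryJson = Files.readString(Path.of("topn-query.json"));

    HttpRequest request = HttpRequest.newBuilder()
        .uri(URI.create("http://localhost:8082/druid/v2/"))  // Broker's native query endpoint
        .header("Content-Type", "application/json")
        .POST(HttpRequest.BodyPublishers.ofString(queryJson))
        .build();

    HttpResponse<String> response = HttpClient.newHttpClient()
        .send(request, HttpResponse.BodyHandlers.ofString());

    // The response body is the JSON array of results in the format shown earlier in this document.
    System.out.println(response.body());
  }
}
```

For the two-query approach described above, you would run the first query this way, read the returned dimension values,
and substitute them into the `filter` of the second query before submitting it.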