mirror of https://github.com/apache/druid.git

Update the docs for EARLIEST_BY/LATEST_BY aggregators with the newly added numeric capabilities (#15670)

commit 7d65caf0c5 (parent fcd65c9801)
@@ -200,8 +200,8 @@ rollup-related metadata into the generated segments. Other applications can then
 queries](../querying/segmentmetadataquery.md) to retrieve rollup-related information.
 
 The following [aggregation functions](../querying/sql-aggregations.md) are supported for rollup at ingestion time:
-`COUNT` (but switch to `SUM` at query time), `SUM`, `MIN`, `MAX`, `EARLIEST` and `EARLIEST_BY` ([string only](known-issues.md#select-statement)),
-`LATEST` and `LATEST_BY` ([string only](known-issues.md#select-statement)), `APPROX_COUNT_DISTINCT`, `APPROX_COUNT_DISTINCT_BUILTIN`,
+`COUNT` (but switch to `SUM` at query time), `SUM`, `MIN`, `MAX`, `EARLIEST` and `EARLIEST_BY`,
+`LATEST` and `LATEST_BY`, `APPROX_COUNT_DISTINCT`, `APPROX_COUNT_DISTINCT_BUILTIN`,
 `APPROX_COUNT_DISTINCT_DS_HLL`, `APPROX_COUNT_DISTINCT_DS_THETA`, and `DS_QUANTILES_SKETCH` (but switch to
 `APPROX_QUANTILE_DS` at query time). Do not use `AVG`; instead, use `SUM` and `COUNT` at ingest time and compute the
 quotient at query time.
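For illustration, here is a minimal Druid SQL rollup-ingestion sketch that follows the guidance above. The datasource and column names (`visits_rollup`, `visits_raw`, `browser`, `session_length`) are hypothetical, and the numeric `LATEST_BY` at ingest time assumes the capability added by this change; rollup-related query context (such as `finalizeAggregations: false`) is omitted.

```sql
-- Rollup ingestion sketch: store COUNT and SUM so AVG can be derived at query time,
-- and keep the latest numeric session_length per hourly bucket via LATEST_BY.
REPLACE INTO "visits_rollup" OVERWRITE ALL
SELECT
  TIME_FLOOR(__time, 'PT1H') AS __time,
  "browser",
  COUNT(*) AS "cnt",                                 -- query later with SUM("cnt")
  SUM("session_length") AS "sum_session_length",     -- pair with "cnt" to compute the average at query time
  LATEST_BY("session_length", __time) AS "latest_session_length"
FROM "visits_raw"
GROUP BY 1, 2
PARTITIONED BY HOUR
```

At query time the average would then be computed as `SUM("sum_session_length") * 1.0 / SUM("cnt")`, rather than with `AVG`.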
@@ -42,11 +42,6 @@ an [UnknownError](./reference.md#error_UnknownError) with a message including "N
 - `GROUPING SETS` are not implemented. Queries using these features return a
   [QueryNotSupported](reference.md#error_QueryNotSupported) error.
 
-- The numeric varieties of the `EARLIEST` and `LATEST` aggregators do not work properly. Attempting to use the numeric
-  varieties of these aggregators lead to an error like
-  `java.lang.ClassCastException: class java.lang.Double cannot be cast to class org.apache.druid.collections.SerializablePair`.
-  The string varieties, however, do work properly.
-
 ## `INSERT` and `REPLACE` Statements
 
 - The `INSERT` and `REPLACE` statements with column lists, like `INSERT INTO tbl (a, b, c) SELECT ...`, is not implemented.
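As a sketch of what the removed known issue covered: queries on the numeric varieties previously failed with the `ClassCastException` quoted above. A query of the following shape (using the `druid.numfoo` datasource and numeric columns `l1`, `d1`, `f1` that appear in the tests further down) is the kind this change is expected to make work.

```sql
-- Numeric EARLIEST/LATEST, previously listed as a known issue per the lines removed above.
SELECT EARLIEST(l1), EARLIEST(d1), EARLIEST(f1)
FROM druid.numfoo
```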
@@ -177,10 +177,9 @@ Example:
 
 The first and last aggregators determine the metric values that respectively correspond to the earliest and latest values of a time column.
 
-Do not use first and last aggregators for the double, float, and long types in an ingestion spec. They are only supported for queries.
-The string-typed aggregators, `stringFirst` and `stringLast`, are supported for both ingestion and querying.
-
-Queries with first or last aggregators on a segment created with rollup return the rolled up value, not the first or last value from the raw ingested data.
+Queries with first or last aggregators on a segment created with rollup return the rolled up value, not the first or last value from the
+raw ingested data. The `timeColumn` will get ignored in such cases, and the aggregation will use the original value of the time column
+stored at the time the segment was created.
 
 #### Numeric first and last aggregators
 
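A hedged SQL-level illustration of the rollup caveat in the added lines above, reusing the hypothetical `visits_rollup` datasource from the earlier sketch (its `latest_session_length` column is assumed to have been created with a `LATEST`-family aggregator):

```sql
-- On a rolled-up column created with a LATEST-family aggregator, the stored value
-- already carries its own timestamp, so plain LATEST is preferred; a timestamp
-- expression passed to LATEST_BY would be ignored for such a column.
SELECT "browser", LATEST("latest_session_length") AS "latest_session_length"
FROM "visits_rollup"
GROUP BY "browser"
```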
@@ -87,9 +87,9 @@ In the aggregation functions supported by Druid, only `COUNT`, `ARRAY_AGG`, and
 |`STDDEV_SAMP(expr)`|Computes standard deviation sample of `expr`. See [stats extension](../development/extensions-core/stats.md) documentation for additional details.|`null` or `0` if `druid.generic.useDefaultValueForNull=true` (legacy mode)|
 |`STDDEV(expr)`|Computes standard deviation sample of `expr`. See [stats extension](../development/extensions-core/stats.md) documentation for additional details.|`null` or `0` if `druid.generic.useDefaultValueForNull=true` (legacy mode)|
 |`EARLIEST(expr, [maxBytesPerValue])`|Returns the earliest value of `expr`.<br />If `expr` comes from a relation with a timestamp column (like `__time` in a Druid datasource), the "earliest" is taken from the row with the overall earliest non-null value of the timestamp column.<br />If the earliest non-null value of the timestamp column appears in multiple rows, the `expr` may be taken from any of those rows. If `expr` does not come from a relation with a timestamp, then it is simply the first value encountered.<br /><br />If `expr` is a string or complex type `maxBytesPerValue` amount of space is allocated for the aggregation. Strings longer than this limit are truncated. The `maxBytesPerValue` parameter should be set as low as possible, since high values will lead to wasted memory.<br/>If `maxBytesPerValue`is omitted; it defaults to `1024`. |`null` or `0`/`''` if `druid.generic.useDefaultValueForNull=true` (legacy mode)|
-|`EARLIEST_BY(expr, timestampExpr, [maxBytesPerValue])`|Returns the earliest value of `expr`.<br />The earliest value of `expr` is taken from the row with the overall earliest non-null value of `timestampExpr`. <br />If the earliest non-null value of `timestampExpr` appears in multiple rows, the `expr` may be taken from any of those rows.<br /><br />If `expr` is a string or complex type `maxBytesPerValue` amount of space is allocated for the aggregation. Strings longer than this limit are truncated. The `maxBytesPerValue` parameter should be set as low as possible, since high values will lead to wasted memory.<br/>If `maxBytesPerValue`is omitted; it defaults to `1024`. |`null` or `0`/`''` if `druid.generic.useDefaultValueForNull=true` (legacy mode)|
+|`EARLIEST_BY(expr, timestampExpr, [maxBytesPerValue])`|Returns the earliest value of `expr`.<br />The earliest value of `expr` is taken from the row with the overall earliest non-null value of `timestampExpr`. <br />If the earliest non-null value of `timestampExpr` appears in multiple rows, the `expr` may be taken from any of those rows.<br /><br />If `expr` is a string or complex type `maxBytesPerValue` amount of space is allocated for the aggregation. Strings longer than this limit are truncated. The `maxBytesPerValue` parameter should be set as low as possible, since high values will lead to wasted memory.<br/>If `maxBytesPerValue`is omitted; it defaults to `1024`.<br /><br />Use `EARLIEST` instead of `EARLIEST_BY` on a table that has rollup enabled and was created with any variant of `EARLIEST`, `LATEST`, `EARLIEST_BY`, or `LATEST_BY`. In these cases, the intermediate type already stores the timestamp, and Druid ignores the value passed in `timestampExpr`. |`null` or `0`/`''` if `druid.generic.useDefaultValueForNull=true` (legacy mode)|
 |`LATEST(expr, [maxBytesPerValue])`|Returns the latest value of `expr`<br />The `expr` must come from a relation with a timestamp column (like `__time` in a Druid datasource) and the "latest" is taken from the row with the overall latest non-null value of the timestamp column.<br />If the latest non-null value of the timestamp column appears in multiple rows, the `expr` may be taken from any of those rows.<br /><br />If `expr` is a string or complex type `maxBytesPerValue` amount of space is allocated for the aggregation. Strings longer than this limit are truncated. The `maxBytesPerValue` parameter should be set as low as possible, since high values will lead to wasted memory.<br/>If `maxBytesPerValue`is omitted; it defaults to `1024`. |`null` or `0`/`''` if `druid.generic.useDefaultValueForNull=true` (legacy mode)|
-|`LATEST_BY(expr, timestampExpr, [maxBytesPerValue])`|Returns the latest value of `expr`.<br />The latest value of `expr` is taken from the row with the overall latest non-null value of `timestampExpr`.<br />If the overall latest non-null value of `timestampExpr` appears in multiple rows, the `expr` may be taken from any of those rows.<br /><br />If `expr` is a string or complex type `maxBytesPerValue` amount of space is allocated for the aggregation. Strings longer than this limit are truncated. The `maxBytesPerValue` parameter should be set as low as possible, since high values will lead to wasted memory.<br/>If `maxBytesPerValue`is omitted; it defaults to `1024`. |`null` or `0`/`''` if `druid.generic.useDefaultValueForNull=true` (legacy mode)|
+|`LATEST_BY(expr, timestampExpr, [maxBytesPerValue])`|Returns the latest value of `expr`.<br />The latest value of `expr` is taken from the row with the overall latest non-null value of `timestampExpr`.<br />If the overall latest non-null value of `timestampExpr` appears in multiple rows, the `expr` may be taken from any of those rows.<br /><br />If `expr` is a string or complex type `maxBytesPerValue` amount of space is allocated for the aggregation. Strings longer than this limit are truncated. The `maxBytesPerValue` parameter should be set as low as possible, since high values will lead to wasted memory.<br/>If `maxBytesPerValue`is omitted; it defaults to `1024`.<br /><br />Use `LATEST` instead of `LATEST_BY` on a table that has rollup enabled and was created with any variant of `EARLIEST`, `LATEST`, `EARLIEST_BY`, or `LATEST_BY`. In these cases, the intermediate type already stores the timestamp, and Druid ignores the value passed in `timestampExpr`. |`null` or `0`/`''` if `druid.generic.useDefaultValueForNull=true` (legacy mode)|
 |`ANY_VALUE(expr, [maxBytesPerValue, [aggregateMultipleValues]])`|Returns any value of `expr` including null. This aggregator can simplify and optimize the performance by returning the first encountered value (including `null`).<br /><br />If `expr` is a string or complex type `maxBytesPerValue` amount of space is allocated for the aggregation. Strings longer than this limit are truncated. The `maxBytesPerValue` parameter should be set as low as possible, since high values will lead to wasted memory.<br/>If `maxBytesPerValue` is omitted; it defaults to `1024`. `aggregateMultipleValues` is an optional boolean flag controls the behavior of aggregating a [multi-value dimension](./multi-value-dimensions.md). `aggregateMultipleValues` is set as true by default and returns the stringified array in case of a multi-value dimension. By setting it to false, function will return first value instead. |`null` or `0`/`''` if `druid.generic.useDefaultValueForNull=true` (legacy mode)|
 |`GROUPING(expr, expr...)`|Returns a number to indicate which groupBy dimension is included in a row, when using `GROUPING SETS`. Refer to [additional documentation](aggregations.md#grouping-aggregator) on how to infer this number.|N/A|
 |`ARRAY_AGG(expr, [size])`|Collects all values of `expr` into an ARRAY, including null values, with `size` in bytes limit on aggregation size (default of 1024 bytes). If the aggregated array grows larger than the maximum size in bytes, the query will fail. Use of `ORDER BY` within the `ARRAY_AGG` expression is not currently supported, and the ordering of results within the output array may vary depending on processing order.|`null`|
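A short usage sketch for the table rows above; the datasource `events` and its columns (`page`, `latency`, `referrer`, `server_time`) are hypothetical:

```sql
-- Numeric EARLIEST_BY/LATEST_BY need no maxBytesPerValue; for the string column a
-- small byte budget (128) is allocated and longer values are truncated.
SELECT
  "page",
  LATEST_BY("latency", MILLIS_TO_TIMESTAMP("server_time")) AS "last_latency",
  EARLIEST("referrer", 128) AS "first_referrer"
FROM "events"
GROUP BY "page"
```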
@@ -229,7 +229,7 @@ public class EarliestLatestAnySqlAggregator implements SqlAggregator
       );
     }
 
-    final String fieldName = getColumnName(plannerContext, virtualColumnRegistry, args.get(0), rexNodes.get(0));
+    final String fieldName = getColumnName(virtualColumnRegistry, args.get(0), rexNodes.get(0));
 
     if (!inputAccessor.getInputRowSignature().contains(ColumnHolder.TIME_COLUMN_NAME)
         && (aggregatorType == AggregatorType.LATEST || aggregatorType == AggregatorType.EARLIEST)) {

@@ -291,7 +291,6 @@ public class EarliestLatestAnySqlAggregator implements SqlAggregator
   }
 
   static String getColumnName(
-      PlannerContext plannerContext,
       VirtualColumnRegistry virtualColumnRegistry,
       DruidExpression arg,
       RexNode rexNode

@@ -360,7 +359,9 @@ public class EarliestLatestAnySqlAggregator implements SqlAggregator
     @Override
    public <R> R accept(SqlVisitor<R> visitor)
     {
-
+      // We overridde the "accept()" method, because the __time column's presence is determined when Calcite is converting
+      // the identifiers to the fully qualified column names with prefixes. This is where the validation exception can
+      // trigger
       try {
         return super.accept(visitor);
       }
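The `__time` check in the first hunk above corresponds to a SQL-level rule described in the docs: `EARLIEST`/`LATEST` need a timestamp column such as `__time` in the underlying relation, while `EARLIEST_BY`/`LATEST_BY` take an explicit timestamp expression. A hedged sketch of that distinction, using the `foo` datasource and columns that appear in the tests below (the derived relation and alias `t` are illustrative):

```sql
-- The derived relation here has no __time column, so plain LATEST would be rejected;
-- LATEST_BY with the explicit "t" timestamp column works instead.
SELECT LATEST_BY("m1", "t")
FROM (
  SELECT "dim1", MAX(__time) AS "t", ANY_VALUE("m1") AS "m1"
  FROM "foo"
  GROUP BY "dim1"
)
```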
@@ -100,7 +100,6 @@ public class EarliestLatestBySqlAggregator implements SqlAggregator
     }
 
     final String fieldName = EarliestLatestAnySqlAggregator.getColumnName(
-        plannerContext,
         virtualColumnRegistry,
         args.get(0),
         rexNodes.get(0)

@@ -113,7 +112,6 @@ public class EarliestLatestBySqlAggregator implements SqlAggregator
         aggregatorName,
         fieldName,
         EarliestLatestAnySqlAggregator.getColumnName(
-            plannerContext,
             virtualColumnRegistry,
             args.get(1),
             rexNodes.get(1)

@@ -140,7 +138,6 @@ public class EarliestLatestBySqlAggregator implements SqlAggregator
         aggregatorName,
         fieldName,
         EarliestLatestAnySqlAggregator.getColumnName(
-            plannerContext,
             virtualColumnRegistry,
             args.get(1),
             rexNodes.get(1)
@@ -644,8 +644,6 @@ public class CalciteQueryTest extends BaseCalciteQueryTest
   @Test
   public void testEarliestAggregators()
   {
-    msqIncompatible();
-
     testQuery(
         "SELECT "
         + "EARLIEST(cnt), EARLIEST(m1), EARLIEST(dim1, 10), EARLIEST(dim1, CAST(10 AS INTEGER)), "

@@ -1207,8 +1205,6 @@ public class CalciteQueryTest extends BaseCalciteQueryTest
   @Test
   public void testPrimitiveEarliestInSubquery()
   {
-    msqIncompatible();
-
     testQuery(
         "SELECT SUM(val1), SUM(val2), SUM(val3) FROM (SELECT dim2, EARLIEST(m1) AS val1, EARLIEST(cnt) AS val2, EARLIEST(m2) AS val3 FROM foo GROUP BY dim2)",
         ImmutableList.of(

@@ -1415,7 +1411,6 @@ public class CalciteQueryTest extends BaseCalciteQueryTest
   @Test
   public void testStringEarliestSingleStringDim()
   {
-    msqIncompatible();
     testQuery(
         "SELECT dim2, EARLIEST(dim1,10) AS val FROM foo GROUP BY dim2",
         ImmutableList.of(

@@ -1531,8 +1526,6 @@ public class CalciteQueryTest extends BaseCalciteQueryTest
   @Test
   public void testEarliestAggregatorsNumericNulls()
   {
-    msqIncompatible();
-
     testQuery(
         "SELECT EARLIEST(l1), EARLIEST(d1), EARLIEST(f1) FROM druid.numfoo",
         ImmutableList.of(

@@ -1590,8 +1583,6 @@ public class CalciteQueryTest extends BaseCalciteQueryTest
   @Test
   public void testFirstLatestAggregatorsSkipNulls()
   {
-    msqIncompatible();
-
     final DimFilter filter;
     if (useDefault) {
       filter = notNull("dim1");

@@ -1704,8 +1695,6 @@ public class CalciteQueryTest extends BaseCalciteQueryTest
   @Test
   public void testOrderByEarliestFloat()
   {
-    msqIncompatible();
-
     List<Object[]> expected;
     if (NullHandling.replaceWithDefault()) {
       expected = ImmutableList.of(

@@ -1751,8 +1740,6 @@ public class CalciteQueryTest extends BaseCalciteQueryTest
   @Test
   public void testOrderByEarliestDouble()
   {
-    msqIncompatible();
-
     List<Object[]> expected;
     if (NullHandling.replaceWithDefault()) {
       expected = ImmutableList.of(

@@ -1798,8 +1785,6 @@ public class CalciteQueryTest extends BaseCalciteQueryTest
   @Test
   public void testOrderByEarliestLong()
   {
-    msqIncompatible();
-
     List<Object[]> expected;
     if (NullHandling.replaceWithDefault()) {
       expected = ImmutableList.of(

@@ -9670,7 +9655,9 @@ public class CalciteQueryTest extends BaseCalciteQueryTest
   @Test
   public void testTimeseriesEmptyResultsAggregatorDefaultValuesNonVectorized()
   {
+    // Empty-dataset aggregation queries in MSQ return an empty row, rather than a single row as SQL requires.
+    msqIncompatible();
 
     cannotVectorize();
     skipVectorize();
     // timeseries with all granularity have a single group, so should return default results for given aggregators

@@ -9986,7 +9973,6 @@ public class CalciteQueryTest extends BaseCalciteQueryTest
   @Test
   public void testGroupByAggregatorDefaultValuesNonVectorized()
   {
-    msqIncompatible();
     cannotVectorize();
     skipVectorize();
     testQuery(