Adding new config for disabling group by on multiValue column (#12253)

As part of #12078 one of the followup's was to have a specific config which does not allow accidental unnesting of multi value columns if such columns become part of the grouping key.
Added a config groupByEnableMultiValueUnnesting which can be set in the query context.

The default value of groupByEnableMultiValueUnnesting is true, therefore it does not change the current engine behavior.
If groupByEnableMultiValueUnnesting is set to false, the query will fail if it encounters a multi-value column in the grouping key.
This commit is contained in:
Karan Kumar 2022-02-16 20:53:26 +05:30 committed by GitHub
parent 8fc0e5c95c
commit 5794331eb1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 129 additions and 3 deletions

View File

@ -436,6 +436,7 @@ Supported query contexts:
|`sortByDimsFirst`|Sort the results first by dimension values and then by timestamp.|false|
|`forceLimitPushDown`|When all fields in the orderby are part of the grouping key, the Broker will push limit application down to the Historical processes. When the sorting order uses fields that are not in the grouping key, applying this optimization can result in approximate results with unknown accuracy, so this optimization is disabled by default in that case. Enabling this context flag turns on limit push down for limit/orderbys that contain non-grouping key columns.|false|
|`applyLimitPushDownToSegment`|If Broker pushes limit down to queryable nodes (historicals, peons) then limit results during segment scan. This context value can be used to override `druid.query.groupBy.applyLimitPushDownToSegment`.|true|
|`groupByEnableMultiValueUnnesting`|Safety flag to enable/disable the implicit unnesting on multi value column's as part of the grouping key. 'true' indicates multi-value grouping keys are unnested. 'false' returns an error if a multi value column is found as part of the grouping key.|true|
#### GroupBy v1 configurations

View File

@ -375,3 +375,10 @@ This query returns the following result:
Note that, for groupBy queries, you could get similar result with a [having spec](having.md) but using a filtered
`dimensionSpec` is much more efficient because that gets applied at the lowest level in the query processing pipeline.
Having specs are applied at the outermost level of groupBy query processing.
## Disable GroupBy on multi-value columns
You can disable the implicit unnesting behavior for groupBy by setting groupByEnableMultiValueUnnesting: false in your
query context. In this mode, the groupBy engine will return an error instead of completing the query. This is a safety
feature for situations where you believe that all dimensions are singly-valued and want the engine to reject any
multi-valued dimensions that were inadvertently included.

View File

@ -34,6 +34,7 @@ public class GroupByQueryConfig
public static final String CTX_KEY_FORCE_PUSH_DOWN_NESTED_QUERY = "forcePushDownNestedQuery";
public static final String CTX_KEY_EXECUTING_NESTED_QUERY = "executingNestedQuery";
public static final String CTX_KEY_ARRAY_RESULT_ROWS = "resultAsArray";
public static final String CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING = "groupByEnableMultiValueUnnesting";
private static final String CTX_KEY_IS_SINGLE_THREADED = "groupByIsSingleThreaded";
private static final String CTX_KEY_MAX_INTERMEDIATE_ROWS = "maxIntermediateRows";
private static final String CTX_KEY_MAX_RESULTS = "maxResults";
@ -97,6 +98,9 @@ public class GroupByQueryConfig
@JsonProperty
private boolean vectorize = true;
@JsonProperty
private boolean enableMultiValueUnnesting = true;
public String getDefaultStrategy()
{
return defaultStrategy;
@ -192,6 +196,11 @@ public class GroupByQueryConfig
return forcePushDownNestedQuery;
}
public boolean isMultiValueUnnestingEnabled()
{
return enableMultiValueUnnesting;
}
public GroupByQueryConfig withOverrides(final GroupByQuery query)
{
final GroupByQueryConfig newConfig = new GroupByQueryConfig();
@ -244,6 +253,10 @@ public class GroupByQueryConfig
getNumParallelCombineThreads()
);
newConfig.vectorize = query.getContextBoolean(QueryContexts.VECTORIZE_KEY, isVectorize());
newConfig.enableMultiValueUnnesting = query.getContextBoolean(
CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING,
isMultiValueUnnestingEnabled()
);
return newConfig;
}
@ -266,6 +279,7 @@ public class GroupByQueryConfig
", numParallelCombineThreads=" + numParallelCombineThreads +
", vectorize=" + vectorize +
", forcePushDownNestedQuery=" + forcePushDownNestedQuery +
", enableMultiValueUnnesting=" + enableMultiValueUnnesting +
'}';
}
}

View File

@ -32,6 +32,7 @@ import org.apache.druid.data.input.Row;
import org.apache.druid.guice.annotations.Global;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.UOE;
import org.apache.druid.java.util.common.guava.BaseSequence;
import org.apache.druid.java.util.common.guava.FunctionalIterator;
import org.apache.druid.java.util.common.guava.Sequence;
@ -91,7 +92,13 @@ public class GroupByQueryEngine
"Null storage adapter found. Probably trying to issue a query against a segment being memory unmapped."
);
}
if (!query.getContextValue(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, true)) {
throw new UOE(
"GroupBy v1 does not support %s as false. Set %s to true or use groupBy v2",
GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING,
GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING
);
}
final List<Interval> intervals = query.getQuerySegmentSpec().getIntervals();
if (intervals.size() != 1) {
throw new IAE("Should only have one interval, got[%s]", intervals);

View File

@ -459,6 +459,8 @@ public class GroupByQueryEngineV2
@Nullable
protected CloseableGrouperIterator<KeyType, ResultRow> delegate = null;
protected final boolean allSingleValueDims;
protected final boolean allowMultiValueGrouping;
public GroupByEngineIterator(
final GroupByQuery query,
@ -480,6 +482,10 @@ public class GroupByQueryEngineV2
// Time is the same for every row in the cursor
this.timestamp = fudgeTimestamp != null ? fudgeTimestamp : cursor.getTime();
this.allSingleValueDims = allSingleValueDims;
this.allowMultiValueGrouping = query.getContextBoolean(
GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING,
true
);
}
private CloseableGrouperIterator<KeyType, ResultRow> initNewDelegate()
@ -593,6 +599,19 @@ public class GroupByQueryEngineV2
return indexedInts.size() == 1 ? indexedInts.get(0) : GroupByColumnSelectorStrategy.GROUP_BY_MISSING_VALUE;
}
protected void checkIfMultiValueGroupingIsAllowed(String dimName)
{
if (!allowMultiValueGrouping) {
throw new ISE(
"Encountered multi-value dimension %s that cannot be processed with %s set to false."
+ " Consider setting %s to true.",
dimName,
GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY,
GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY
);
}
}
}
private static class HashAggregateIterator extends GroupByEngineIterator<ByteBuffer>
@ -764,6 +783,9 @@ public class GroupByQueryEngineV2
);
if (doAggregate) {
// this check is done during the row aggregation as a dimension can become multi-value col if column
// capabilities is unknown.
checkIfMultiValueGroupingIsAllowed(dims[stackPointer].getName());
stack[stackPointer]++;
for (int i = stackPointer + 1; i < stack.length; i++) {
dims[i].getColumnSelectorStrategy().initGroupingKeyColumnValue(
@ -889,12 +911,17 @@ public class GroupByQueryEngineV2
}
while (!cursor.isDone()) {
int multiValuesSize = multiValues.size();
final int multiValuesSize = multiValues.size();
if (multiValuesSize == 0) {
if (!grouper.aggregate(GroupByColumnSelectorStrategy.GROUP_BY_MISSING_VALUE).isOk()) {
return;
}
} else {
if (multiValuesSize > 1) {
// this check is done during the row aggregation as a dimension can become multi-value col if column
// capabilities is unknown.
checkIfMultiValueGroupingIsAllowed(dim.getName());
}
for (; nextValIndex < multiValuesSize; nextValIndex++) {
if (!grouper.aggregate(multiValues.get(nextValIndex)).isOk()) {
return;

View File

@ -40,6 +40,7 @@ import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.UOE;
import org.apache.druid.java.util.common.concurrent.Execs;
import org.apache.druid.java.util.common.granularity.DurationGranularity;
import org.apache.druid.java.util.common.granularity.Granularities;
@ -1310,6 +1311,41 @@ public class GroupByQueryRunnerTest extends InitializedNullHandlingTest
TestHelper.assertExpectedObjects(expectedResults, results, "multi-value-dim");
}
@Test
public void testMultiValueDimensionNotAllowed()
{
if (config.getDefaultStrategy().equals(GroupByStrategySelector.STRATEGY_V1)) {
expectedException.expect(UOE.class);
expectedException.expectMessage(StringUtils.format(
"GroupBy v1 does not support %s as false",
GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING
));
} else if (!vectorize) {
expectedException.expect(RuntimeException.class);
expectedException.expectMessage(StringUtils.format(
"Encountered multi-value dimension %s that cannot be processed with %s set to false."
+ " Consider setting %s to true.",
"placementish",
GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY,
GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY
));
} else {
cannotVectorize();
}
GroupByQuery query = makeQueryBuilder()
.setDataSource(QueryRunnerTestHelper.DATA_SOURCE)
.setQuerySegmentSpec(QueryRunnerTestHelper.FIRST_TO_THIRD)
.setDimensions(new DefaultDimensionSpec("placementish", "alias"))
.setAggregatorSpecs(QueryRunnerTestHelper.ROWS_COUNT, new LongSumAggregatorFactory("idx", "index"))
.setGranularity(QueryRunnerTestHelper.ALL_GRAN)
.overrideContext(ImmutableMap.of(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, false))
.build();
GroupByQueryRunnerTestHelper.runQuery(factory, runner, query);
}
@Test
public void testMultiValueDimensionAsArray()
{

View File

@ -22,6 +22,7 @@ package org.apache.druid.sql.calcite;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.math.expr.ExpressionProcessing;
import org.apache.druid.query.Druids;
@ -32,6 +33,7 @@ import org.apache.druid.query.filter.InDimFilter;
import org.apache.druid.query.filter.LikeDimFilter;
import org.apache.druid.query.filter.SelectorDimFilter;
import org.apache.druid.query.groupby.GroupByQuery;
import org.apache.druid.query.groupby.GroupByQueryConfig;
import org.apache.druid.query.groupby.orderby.DefaultLimitSpec;
import org.apache.druid.query.groupby.orderby.OrderByColumnSpec;
import org.apache.druid.query.ordering.StringComparators;
@ -43,7 +45,9 @@ import org.apache.druid.sql.calcite.filtration.Filtration;
import org.apache.druid.sql.calcite.util.CalciteTests;
import org.junit.Test;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class CalciteMultiValueStringQueryTest extends BaseCalciteQueryTest
{
@ -53,7 +57,8 @@ public class CalciteMultiValueStringQueryTest extends BaseCalciteQueryTest
{
// Cannot vectorize due to usage of expressions.
cannotVectorize();
Map<String, Object> groupByOnMultiValueColumnEnabled = new HashMap<>(QUERY_CONTEXT_DEFAULT);
groupByOnMultiValueColumnEnabled.put(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, true);
List<Object[]> expected;
if (NullHandling.replaceWithDefault()) {
expected = ImmutableList.of(
@ -76,6 +81,7 @@ public class CalciteMultiValueStringQueryTest extends BaseCalciteQueryTest
}
testQuery(
"SELECT concat(dim3, 'foo'), SUM(cnt) FROM druid.numfoo GROUP BY 1 ORDER BY 2 DESC",
groupByOnMultiValueColumnEnabled,
ImmutableList.of(
GroupByQuery.builder()
.setDataSource(CalciteTests.DATASOURCE3)
@ -103,6 +109,30 @@ public class CalciteMultiValueStringQueryTest extends BaseCalciteQueryTest
);
}
@Test
public void testMultiValueStringGroupByDoesNotWork() throws Exception
{
// Cannot vectorize due to usage of expressions.
cannotVectorize();
Map<String, Object> groupByOnMultiValueColumnDisabled = new HashMap<>(QUERY_CONTEXT_DEFAULT);
groupByOnMultiValueColumnDisabled.put(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, false);
testQueryThrows(
"SELECT concat(dim3, 'foo'), SUM(cnt) FROM druid.numfoo GROUP BY 1 ORDER BY 2 DESC",
groupByOnMultiValueColumnDisabled,
ImmutableList.of(),
exception -> {
exception.expect(RuntimeException.class);
expectedException.expectMessage(StringUtils.format(
"Encountered multi-value dimension %s that cannot be processed with %s set to false."
+ " Consider setting %s to true.",
"v0",
GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY,
GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY
));
}
);
}
@Test
public void testMultiValueStringWorksLikeStringGroupByWithFilter() throws Exception
{

View File

@ -1565,6 +1565,8 @@ outputName
pushdown
row1
subtotalsSpec
unnested
unnesting
- ../docs/querying/having.md
HavingSpec
HavingSpecs
@ -1595,6 +1597,8 @@ row4
t3
t4
t5
groupByEnableMultiValueUnnesting
unnesting
- ../docs/querying/multitenancy.md
500ms
tenant_id