fix NestedDataColumnIndexerV4 to not report cardinality (#16507)

* fix NestedDataColumnIndexerV4 to not report cardinality
changes:
* fix issue similar to #16489 but for NestedDataColumnIndexerV4, which can report STRING type if it only processes a single type of value. This should be less common than the auto indexer problem
* fix some issues with sql benchmarks
This commit is contained in:
Clint Wylie 2024-06-11 20:58:12 -07:00 committed by GitHub
parent 3f5f5921e0
commit fee509df2e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 56 additions and 43 deletions

View File

@@ -360,13 +360,16 @@ public class SqlExpressionBenchmark
try {
SqlVectorizedExpressionSanityTest.sanityTestVectorizedSqlQueries(
engine,
plannerFactory,
QUERIES.get(Integer.parseInt(query))
);
log.info("non-vectorized and vectorized results match");
}
catch (Throwable ignored) {
// the show must go on
catch (Throwable ex) {
log.warn(ex, "non-vectorized and vectorized results do not match");
}
final String sql = QUERIES.get(Integer.parseInt(query));
try (final DruidPlanner planner = plannerFactory.createPlannerForTesting(engine, "EXPLAIN PLAN FOR " + sql, ImmutableMap.of("useNativeQueryExplain", true))) {
@@ -378,8 +381,8 @@ public class SqlExpressionBenchmark
.writeValueAsString(jsonMapper.readValue((String) planResult[0], List.class))
);
}
catch (JsonProcessingException ignored) {
catch (JsonProcessingException ex) {
log.warn(ex, "explain failed");
}
try (final DruidPlanner planner = plannerFactory.createPlannerForTesting(engine, sql, ImmutableMap.of())) {
@@ -393,8 +396,8 @@ public class SqlExpressionBenchmark
}
log.info("Total result row count:" + rowCounter);
}
catch (Throwable ignored) {
catch (Throwable ex) {
log.warn(ex, "failed to count rows");
}
}

View File

@@ -29,6 +29,7 @@ import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.java.util.common.guava.Sequence;
import org.apache.druid.java.util.common.io.Closer;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.math.expr.ExpressionProcessing;
import org.apache.druid.query.DruidProcessingConfig;
import org.apache.druid.query.QueryRunnerFactoryConglomerate;
@@ -91,6 +92,8 @@ public class SqlGroupByBenchmark
NestedDataModule.registerHandlersAndSerde();
}
private static final Logger log = new Logger(SqlGroupByBenchmark.class);
private static final DruidProcessingConfig PROCESSING_CONFIG = new DruidProcessingConfig()
{
@Override
@@ -349,12 +352,14 @@ public class SqlGroupByBenchmark
try {
SqlVectorizedExpressionSanityTest.sanityTestVectorizedSqlQueries(
engine,
plannerFactory,
sqlQuery(groupingDimension)
);
log.info("non-vectorized and vectorized results match");
}
catch (Throwable ignored) {
// the show must go on
catch (Throwable ex) {
log.warn(ex, "non-vectorized and vectorized results do not match");
}
}

View File

@@ -20,7 +20,6 @@
package org.apache.druid.benchmark.query;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonMappingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
@@ -301,7 +300,7 @@ public class SqlNestedDataBenchmark
private SqlEngine engine;
@Nullable
private PlannerFactory plannerFactory;
private Closer closer = Closer.create();
private final Closer closer = Closer.create();
@Setup(Level.Trial)
public void setup()
@@ -345,16 +344,19 @@ public class SqlNestedDataBenchmark
}
final QueryableIndex index;
if ("auto".equals(schema)) {
List<DimensionSchema> columnSchemas = schemaInfo.getDimensionsSpec()
.getDimensions()
.stream()
.map(x -> new AutoTypeColumnSchema(x.getName(), null))
.collect(Collectors.toList());
Iterable<DimensionSchema> columnSchemas = Iterables.concat(
schemaInfo.getDimensionsSpec()
.getDimensions()
.stream()
.map(x -> new AutoTypeColumnSchema(x.getName(), null))
.collect(Collectors.toList()),
Collections.singletonList(new AutoTypeColumnSchema("nested", null))
);
index = segmentGenerator.generate(
dataSegment,
schemaInfo,
DimensionsSpec.builder().setDimensions(columnSchemas).build(),
TransformSpec.NONE,
DimensionsSpec.builder().setDimensions(ImmutableList.copyOf(columnSchemas.iterator())).build(),
transformSpec,
IndexSpec.builder().withStringDictionaryEncoding(encodingStrategy).build(),
Granularities.NONE,
rowsPerSegment
@@ -368,7 +370,7 @@ public class SqlNestedDataBenchmark
dataSegment,
schemaInfo,
DimensionsSpec.builder().setDimensions(ImmutableList.copyOf(columnSchemas.iterator())).build(),
TransformSpec.NONE,
transformSpec,
IndexSpec.builder().withStringDictionaryEncoding(encodingStrategy).build(),
Granularities.NONE,
rowsPerSegment
@@ -405,12 +407,14 @@ public class SqlNestedDataBenchmark
try {
SqlVectorizedExpressionSanityTest.sanityTestVectorizedSqlQueries(
engine,
plannerFactory,
QUERIES.get(Integer.parseInt(query))
);
log.info("non-vectorized and vectorized results match");
}
catch (Throwable ex) {
log.warn(ex, "failed to sanity check");
log.warn(ex, "non-vectorized and vectorized results do not match");
}
final String sql = QUERIES.get(Integer.parseInt(query));
@@ -424,11 +428,8 @@ public class SqlNestedDataBenchmark
.writeValueAsString(jsonMapper.readValue((String) planResult[0], List.class))
);
}
catch (JsonMappingException e) {
throw new RuntimeException(e);
}
catch (JsonProcessingException e) {
throw new RuntimeException(e);
catch (JsonProcessingException ex) {
log.warn(ex, "explain failed");
}
try (final DruidPlanner planner = plannerFactory.createPlannerForTesting(engine, sql, ImmutableMap.of())) {

View File

@@ -154,7 +154,7 @@ public class NestedDataColumnIndexerV4 implements DimensionIndexer&lt;StructuredDat
@Override
public int getCardinality()
{
return globalDictionary.getCardinality();
return DimensionDictionarySelector.CARDINALITY_UNKNOWN;
}
@Override

View File

@@ -69,6 +69,7 @@ public class AutoTypeColumnIndexerTest extends InitializedNullHandlingTest
public void testKeySizeEstimation()
{
AutoTypeColumnIndexer indexer = new AutoTypeColumnIndexer("test", null);
Assert.assertEquals(DimensionDictionarySelector.CARDINALITY_UNKNOWN, indexer.getCardinality());
int baseCardinality = NullHandling.sqlCompatible() ? 0 : 2;
Assert.assertEquals(baseCardinality, indexer.globalDictionary.getCardinality());
@@ -134,6 +135,7 @@ public class AutoTypeColumnIndexerTest extends InitializedNullHandlingTest
Assert.assertEquals(48, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 8, indexer.globalDictionary.getCardinality());
}
Assert.assertEquals(DimensionDictionarySelector.CARDINALITY_UNKNOWN, indexer.getCardinality());
}
@Test

View File

@@ -68,71 +68,73 @@ public class NestedDataColumnIndexerV4Test extends InitializedNullHandlingTest
public void testKeySizeEstimation()
{
NestedDataColumnIndexerV4 indexer = new NestedDataColumnIndexerV4();
Assert.assertEquals(DimensionDictionarySelector.CARDINALITY_UNKNOWN, indexer.getCardinality());
int baseCardinality = NullHandling.sqlCompatible() ? 0 : 2;
Assert.assertEquals(baseCardinality, indexer.getCardinality());
Assert.assertEquals(baseCardinality, indexer.globalDictionary.getCardinality());
EncodedKeyComponent<StructuredData> key;
// new raw value, new field, new dictionary entry
key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableMap.of("x", "foo"), false);
Assert.assertEquals(228, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 1, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 1, indexer.globalDictionary.getCardinality());
// adding same value only adds estimated size of value itself
key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableMap.of("x", "foo"), false);
Assert.assertEquals(112, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 1, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 1, indexer.globalDictionary.getCardinality());
// new raw value, new field, new dictionary entry
key = indexer.processRowValsToUnsortedEncodedKeyComponent(10L, false);
Assert.assertEquals(94, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 2, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 2, indexer.globalDictionary.getCardinality());
// adding same value only adds estimated size of value itself
key = indexer.processRowValsToUnsortedEncodedKeyComponent(10L, false);
Assert.assertEquals(16, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 2, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 2, indexer.globalDictionary.getCardinality());
// new raw value, new dictionary entry
key = indexer.processRowValsToUnsortedEncodedKeyComponent(11L, false);
Assert.assertEquals(48, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 3, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 3, indexer.globalDictionary.getCardinality());
// new raw value, new fields
key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableList.of(1L, 2L, 10L), false);
Assert.assertEquals(276, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 5, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 5, indexer.globalDictionary.getCardinality());
// new raw value, re-use fields and dictionary
key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableList.of(1L, 2L, 10L), false);
Assert.assertEquals(56, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 5, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 5, indexer.globalDictionary.getCardinality());
// new raw value, new fields
key = indexer.processRowValsToUnsortedEncodedKeyComponent(
ImmutableMap.of("x", ImmutableList.of(1L, 2L, 10L)),
false
);
Assert.assertEquals(286, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 5, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 5, indexer.globalDictionary.getCardinality());
// new raw value
key = indexer.processRowValsToUnsortedEncodedKeyComponent(
ImmutableMap.of("x", ImmutableList.of(1L, 2L, 10L)),
false
);
Assert.assertEquals(118, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 5, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 5, indexer.globalDictionary.getCardinality());
key = indexer.processRowValsToUnsortedEncodedKeyComponent("", false);
if (NullHandling.replaceWithDefault()) {
Assert.assertEquals(0, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 6, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 6, indexer.globalDictionary.getCardinality());
} else {
Assert.assertEquals(104, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 6, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 6, indexer.globalDictionary.getCardinality());
}
key = indexer.processRowValsToUnsortedEncodedKeyComponent(0, false);
if (NullHandling.replaceWithDefault()) {
Assert.assertEquals(16, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 6, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 6, indexer.globalDictionary.getCardinality());
} else {
Assert.assertEquals(48, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 7, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 7, indexer.globalDictionary.getCardinality());
}
Assert.assertEquals(DimensionDictionarySelector.CARDINALITY_UNKNOWN, indexer.getCardinality());
}
@Test

View File

@@ -178,10 +178,10 @@ public class SqlVectorizedExpressionSanityTest extends InitializedNullHandlingTest
@Test
public void testQuery()
{
sanityTestVectorizedSqlQueries(PLANNER_FACTORY, query);
sanityTestVectorizedSqlQueries(ENGINE, PLANNER_FACTORY, query);
}
public static void sanityTestVectorizedSqlQueries(PlannerFactory plannerFactory, String query)
public static void sanityTestVectorizedSqlQueries(SqlEngine engine, PlannerFactory plannerFactory, String query)
{
final Map<String, Object> vector = ImmutableMap.of(
QueryContexts.VECTORIZE_KEY, "force",
@@ -193,8 +193,8 @@ public class SqlVectorizedExpressionSanityTest extends InitializedNullHandlingTest
);
try (
final DruidPlanner vectorPlanner = plannerFactory.createPlannerForTesting(ENGINE, query, vector);
final DruidPlanner nonVectorPlanner = plannerFactory.createPlannerForTesting(ENGINE, query, nonvector)
final DruidPlanner vectorPlanner = plannerFactory.createPlannerForTesting(engine, query, vector);
final DruidPlanner nonVectorPlanner = plannerFactory.createPlannerForTesting(engine, query, nonvector)
) {
final PlannerResult vectorPlan = vectorPlanner.plan();
final PlannerResult nonVectorPlan = nonVectorPlanner.plan();