From fee509df2ebade0d6a17f96ee0637a27acedb45d Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Tue, 11 Jun 2024 20:58:12 -0700 Subject: [PATCH] fix NestedDataColumnIndexerV4 to not report cardinality (#16507) * fix NestedDataColumnIndexerV4 to not report cardinality changes: * fix issue similar to #16489 but for NestedDataColumnIndexerV4, which can report STRING type if it only processes a single type of values. this should be less common than the auto indexer problem * fix some issues with sql benchmarks --- .../query/SqlExpressionBenchmark.java | 15 +++++---- .../benchmark/query/SqlGroupByBenchmark.java | 9 +++-- .../query/SqlNestedDataBenchmark.java | 33 ++++++++++--------- .../segment/NestedDataColumnIndexerV4.java | 2 +- .../segment/AutoTypeColumnIndexerTest.java | 2 ++ .../NestedDataColumnIndexerV4Test.java | 30 +++++++++-------- .../SqlVectorizedExpressionSanityTest.java | 8 ++--- 7 files changed, 56 insertions(+), 43 deletions(-) diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlExpressionBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlExpressionBenchmark.java index c04d4e31f95..d4c2e0906b4 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlExpressionBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlExpressionBenchmark.java @@ -360,13 +360,16 @@ public class SqlExpressionBenchmark try { SqlVectorizedExpressionSanityTest.sanityTestVectorizedSqlQueries( + engine, plannerFactory, QUERIES.get(Integer.parseInt(query)) ); + log.info("non-vectorized and vectorized results match"); } - catch (Throwable ignored) { - // the show must go on + catch (Throwable ex) { + log.warn(ex, "non-vectorized and vectorized results do not match"); } + final String sql = QUERIES.get(Integer.parseInt(query)); try (final DruidPlanner planner = plannerFactory.createPlannerForTesting(engine, "EXPLAIN PLAN FOR " + sql, ImmutableMap.of("useNativeQueryExplain", true))) { @@ -378,8 +381,8 @@ public class SqlExpressionBenchmark .writeValueAsString(jsonMapper.readValue((String) planResult[0], List.class)) ); } - catch (JsonProcessingException ignored) { - + catch (JsonProcessingException ex) { + log.warn(ex, "explain failed"); } try (final DruidPlanner planner = plannerFactory.createPlannerForTesting(engine, sql, ImmutableMap.of())) { @@ -393,8 +396,8 @@ public class SqlExpressionBenchmark } log.info("Total result row count:" + rowCounter); } - catch (Throwable ignored) { - + catch (Throwable ex) { + log.warn(ex, "failed to count rows"); } } diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java index 52745e62fb3..80b6647a0ee 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlGroupByBenchmark.java @@ -29,6 +29,7 @@ import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.guava.Sequence; import org.apache.druid.java.util.common.io.Closer; +import org.apache.druid.java.util.common.logger.Logger; import org.apache.druid.math.expr.ExpressionProcessing; import org.apache.druid.query.DruidProcessingConfig; import org.apache.druid.query.QueryRunnerFactoryConglomerate; @@ -91,6 +92,8 @@ public class SqlGroupByBenchmark NestedDataModule.registerHandlersAndSerde(); } + private static final Logger log = new Logger(SqlGroupByBenchmark.class); + private static final DruidProcessingConfig PROCESSING_CONFIG = new DruidProcessingConfig() { @Override @@ -349,12 +352,14 @@ public class SqlGroupByBenchmark try { SqlVectorizedExpressionSanityTest.sanityTestVectorizedSqlQueries( + engine, plannerFactory, sqlQuery(groupingDimension) ); + log.info("non-vectorized and vectorized results match"); } - catch (Throwable ignored) { - // the show must go on + catch (Throwable ex) { + log.warn(ex, "non-vectorized and vectorized results do not match"); } } diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlNestedDataBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlNestedDataBenchmark.java index 0be1f4d52e6..c329e9da30e 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlNestedDataBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/query/SqlNestedDataBenchmark.java @@ -20,7 +20,6 @@ package org.apache.druid.benchmark.query; import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; @@ -301,7 +300,7 @@ public class SqlNestedDataBenchmark private SqlEngine engine; @Nullable private PlannerFactory plannerFactory; - private Closer closer = Closer.create(); + private final Closer closer = Closer.create(); @Setup(Level.Trial) public void setup() @@ -345,16 +344,19 @@ public class SqlNestedDataBenchmark } final QueryableIndex index; if ("auto".equals(schema)) { - List columnSchemas = schemaInfo.getDimensionsSpec() - .getDimensions() - .stream() - .map(x -> new AutoTypeColumnSchema(x.getName(), null)) - .collect(Collectors.toList()); + Iterable columnSchemas = Iterables.concat( + schemaInfo.getDimensionsSpec() + .getDimensions() + .stream() + .map(x -> new AutoTypeColumnSchema(x.getName(), null)) + .collect(Collectors.toList()), + Collections.singletonList(new AutoTypeColumnSchema("nested", null)) + ); index = segmentGenerator.generate( dataSegment, schemaInfo, - DimensionsSpec.builder().setDimensions(columnSchemas).build(), - TransformSpec.NONE, + DimensionsSpec.builder().setDimensions(ImmutableList.copyOf(columnSchemas.iterator())).build(), + transformSpec, IndexSpec.builder().withStringDictionaryEncoding(encodingStrategy).build(), Granularities.NONE, rowsPerSegment @@ -368,7 +370,7 @@ public class SqlNestedDataBenchmark dataSegment, schemaInfo, DimensionsSpec.builder().setDimensions(ImmutableList.copyOf(columnSchemas.iterator())).build(), - TransformSpec.NONE, + transformSpec, IndexSpec.builder().withStringDictionaryEncoding(encodingStrategy).build(), Granularities.NONE, rowsPerSegment @@ -405,12 +407,14 @@ public class SqlNestedDataBenchmark try { SqlVectorizedExpressionSanityTest.sanityTestVectorizedSqlQueries( + engine, plannerFactory, QUERIES.get(Integer.parseInt(query)) ); + log.info("non-vectorized and vectorized results match"); } catch (Throwable ex) { - log.warn(ex, "failed to sanity check"); + log.warn(ex, "non-vectorized and vectorized results do not match"); } final String sql = QUERIES.get(Integer.parseInt(query)); @@ -424,11 +428,8 @@ public class SqlNestedDataBenchmark .writeValueAsString(jsonMapper.readValue((String) planResult[0], List.class)) ); } - catch (JsonMappingException e) { - throw new RuntimeException(e); - } - catch (JsonProcessingException e) { - throw new RuntimeException(e); + catch (JsonProcessingException ex) { + log.warn(ex, "explain failed"); } try (final DruidPlanner planner = plannerFactory.createPlannerForTesting(engine, sql, ImmutableMap.of())) { diff --git a/processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexerV4.java b/processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexerV4.java index ecaf89e8483..f3c92806e4f 100644 --- a/processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexerV4.java +++ b/processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexerV4.java @@ -154,7 +154,7 @@ public class NestedDataColumnIndexerV4 implements DimensionIndexer key; // new raw value, new field, new dictionary entry key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableMap.of("x", "foo"), false); Assert.assertEquals(228, key.getEffectiveSizeBytes()); - Assert.assertEquals(baseCardinality + 1, indexer.getCardinality()); + Assert.assertEquals(baseCardinality + 1, indexer.globalDictionary.getCardinality()); // adding same value only adds estimated size of value itself key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableMap.of("x", "foo"), false); Assert.assertEquals(112, key.getEffectiveSizeBytes()); - Assert.assertEquals(baseCardinality + 1, indexer.getCardinality()); + Assert.assertEquals(baseCardinality + 1, indexer.globalDictionary.getCardinality()); // new raw value, new field, new dictionary entry key = indexer.processRowValsToUnsortedEncodedKeyComponent(10L, false); Assert.assertEquals(94, key.getEffectiveSizeBytes()); - Assert.assertEquals(baseCardinality + 2, indexer.getCardinality()); + Assert.assertEquals(baseCardinality + 2, indexer.globalDictionary.getCardinality()); // adding same value only adds estimated size of value itself key = indexer.processRowValsToUnsortedEncodedKeyComponent(10L, false); Assert.assertEquals(16, key.getEffectiveSizeBytes()); - Assert.assertEquals(baseCardinality + 2, indexer.getCardinality()); + Assert.assertEquals(baseCardinality + 2, indexer.globalDictionary.getCardinality()); // new raw value, new dictionary entry key = indexer.processRowValsToUnsortedEncodedKeyComponent(11L, false); Assert.assertEquals(48, key.getEffectiveSizeBytes()); - Assert.assertEquals(baseCardinality + 3, indexer.getCardinality()); + Assert.assertEquals(baseCardinality + 3, indexer.globalDictionary.getCardinality()); // new raw value, new fields key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableList.of(1L, 2L, 10L), false); Assert.assertEquals(276, key.getEffectiveSizeBytes()); - Assert.assertEquals(baseCardinality + 5, indexer.getCardinality()); + Assert.assertEquals(baseCardinality + 5, indexer.globalDictionary.getCardinality()); // new raw value, re-use fields and dictionary key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableList.of(1L, 2L, 10L), false); Assert.assertEquals(56, key.getEffectiveSizeBytes()); - Assert.assertEquals(baseCardinality + 5, indexer.getCardinality()); + Assert.assertEquals(baseCardinality + 5, indexer.globalDictionary.getCardinality()); // new raw value, new fields key = indexer.processRowValsToUnsortedEncodedKeyComponent( ImmutableMap.of("x", ImmutableList.of(1L, 2L, 10L)), false ); Assert.assertEquals(286, key.getEffectiveSizeBytes()); - Assert.assertEquals(baseCardinality + 5, indexer.getCardinality()); + Assert.assertEquals(baseCardinality + 5, indexer.globalDictionary.getCardinality()); // new raw value key = indexer.processRowValsToUnsortedEncodedKeyComponent( ImmutableMap.of("x", ImmutableList.of(1L, 2L, 10L)), false ); Assert.assertEquals(118, key.getEffectiveSizeBytes()); - Assert.assertEquals(baseCardinality + 5, indexer.getCardinality()); + Assert.assertEquals(baseCardinality + 5, indexer.globalDictionary.getCardinality()); key = indexer.processRowValsToUnsortedEncodedKeyComponent("", false); if (NullHandling.replaceWithDefault()) { Assert.assertEquals(0, key.getEffectiveSizeBytes()); - Assert.assertEquals(baseCardinality + 6, indexer.getCardinality()); + Assert.assertEquals(baseCardinality + 6, indexer.globalDictionary.getCardinality()); } else { Assert.assertEquals(104, key.getEffectiveSizeBytes()); - Assert.assertEquals(baseCardinality + 6, indexer.getCardinality()); + Assert.assertEquals(baseCardinality + 6, indexer.globalDictionary.getCardinality()); } key = indexer.processRowValsToUnsortedEncodedKeyComponent(0, false); if (NullHandling.replaceWithDefault()) { Assert.assertEquals(16, key.getEffectiveSizeBytes()); - Assert.assertEquals(baseCardinality + 6, indexer.getCardinality()); + Assert.assertEquals(baseCardinality + 6, indexer.globalDictionary.getCardinality()); } else { Assert.assertEquals(48, key.getEffectiveSizeBytes()); - Assert.assertEquals(baseCardinality + 7, indexer.getCardinality()); + Assert.assertEquals(baseCardinality + 7, indexer.globalDictionary.getCardinality()); } + Assert.assertEquals(DimensionDictionarySelector.CARDINALITY_UNKNOWN, indexer.getCardinality()); } @Test diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/SqlVectorizedExpressionSanityTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/SqlVectorizedExpressionSanityTest.java index 6c87c321a42..304400bc3d8 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/SqlVectorizedExpressionSanityTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/SqlVectorizedExpressionSanityTest.java @@ -178,10 +178,10 @@ public class SqlVectorizedExpressionSanityTest extends InitializedNullHandlingTe @Test public void testQuery() { - sanityTestVectorizedSqlQueries(PLANNER_FACTORY, query); + sanityTestVectorizedSqlQueries(ENGINE, PLANNER_FACTORY, query); } - public static void sanityTestVectorizedSqlQueries(PlannerFactory plannerFactory, String query) + public static void sanityTestVectorizedSqlQueries(SqlEngine engine, PlannerFactory plannerFactory, String query) { final Map vector = ImmutableMap.of( QueryContexts.VECTORIZE_KEY, "force", @@ -193,8 +193,8 @@ public class SqlVectorizedExpressionSanityTest extends InitializedNullHandlingTe ); try ( - final DruidPlanner vectorPlanner = plannerFactory.createPlannerForTesting(ENGINE, query, vector); - final DruidPlanner nonVectorPlanner = plannerFactory.createPlannerForTesting(ENGINE, query, nonvector) + final DruidPlanner vectorPlanner = plannerFactory.createPlannerForTesting(engine, query, vector); + final DruidPlanner nonVectorPlanner = plannerFactory.createPlannerForTesting(engine, query, nonvector) ) { final PlannerResult vectorPlan = vectorPlanner.plan(); final PlannerResult nonVectorPlan = nonVectorPlanner.plan();