From c2e9ab810090efdc876c982a8c8676610532d956 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Thu, 19 Dec 2019 17:45:19 -0800 Subject: [PATCH] benchmark schema with numeric dimensions and null column values (#9036) * benchmark schema with null column values * oops * adjustments * rename again, different null percentage so rows more variety * more schema --- .../datagen/BenchmarkColumnSchema.java | 22 ++++++ .../BenchmarkColumnValueGenerator.java | 7 ++ .../datagen/BenchmarkSchemaInfo.java | 13 ++++ .../benchmark/datagen/BenchmarkSchemas.java | 69 +++++++++++++++++++ .../benchmark/datagen/SegmentGenerator.java | 33 +-------- .../benchmark/query/GroupByBenchmark.java | 38 +++++++++- 6 files changed, 150 insertions(+), 32 deletions(-) diff --git a/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/BenchmarkColumnSchema.java b/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/BenchmarkColumnSchema.java index 549140e62bd..8636941dd28 100644 --- a/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/BenchmarkColumnSchema.java +++ b/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/BenchmarkColumnSchema.java @@ -19,6 +19,12 @@ package org.apache.druid.benchmark.datagen; +import org.apache.druid.data.input.impl.DimensionSchema; +import org.apache.druid.data.input.impl.DoubleDimensionSchema; +import org.apache.druid.data.input.impl.FloatDimensionSchema; +import org.apache.druid.data.input.impl.LongDimensionSchema; +import org.apache.druid.data.input.impl.StringDimensionSchema; +import org.apache.druid.java.util.common.IAE; import org.apache.druid.segment.column.ValueType; import java.util.List; @@ -144,6 +150,22 @@ public class BenchmarkColumnSchema return new BenchmarkColumnValueGenerator(this, seed); } + public DimensionSchema getDimensionSchema() + { + switch (type) { + case LONG: + return new LongDimensionSchema(name); + case FLOAT: + return new FloatDimensionSchema(name); + case DOUBLE: + return new DoubleDimensionSchema(name); + case STRING: + return new StringDimensionSchema(name); + default: + throw new IAE("unable to make dimension schema for %s", type); + } + } + public String getName() { return name; diff --git a/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/BenchmarkColumnValueGenerator.java b/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/BenchmarkColumnValueGenerator.java index e94a9703598..5fe89d8a9fb 100644 --- a/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/BenchmarkColumnValueGenerator.java +++ b/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/BenchmarkColumnValueGenerator.java @@ -116,6 +116,13 @@ public class BenchmarkColumnValueGenerator ret = Long.parseLong(input.toString()); } break; + case DOUBLE: + if (input instanceof Number) { + ret = ((Number) input).doubleValue(); + } else { + ret = Double.parseDouble(input.toString()); + } + break; case FLOAT: if (input instanceof Number) { ret = ((Number) input).floatValue(); diff --git a/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/BenchmarkSchemaInfo.java b/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/BenchmarkSchemaInfo.java index 7a2720931e8..f8c1b60dd37 100644 --- a/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/BenchmarkSchemaInfo.java +++ b/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/BenchmarkSchemaInfo.java @@ -19,10 +19,13 @@ package org.apache.druid.benchmark.datagen; +import org.apache.druid.data.input.impl.DimensionSchema; +import org.apache.druid.data.input.impl.DimensionsSpec; import org.apache.druid.query.aggregation.AggregatorFactory; import org.joda.time.Interval; import java.util.List; +import java.util.stream.Collectors; public class BenchmarkSchemaInfo { @@ -49,6 +52,16 @@ public class BenchmarkSchemaInfo return columnSchemas; } + public DimensionsSpec getDimensionsSpec() + { + List specs = getColumnSchemas().stream() + .filter(x -> !x.isMetric()) + .map(BenchmarkColumnSchema::getDimensionSchema) + .collect(Collectors.toList()); + + return new DimensionsSpec(specs); + } + public List getAggs() { return aggs; diff --git a/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/BenchmarkSchemas.java b/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/BenchmarkSchemas.java index 69a9d901a1e..142843753fd 100644 --- a/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/BenchmarkSchemas.java +++ b/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/BenchmarkSchemas.java @@ -225,4 +225,73 @@ public class BenchmarkSchemas ); SCHEMA_MAP.put("rollo", rolloSchema); } + + static { // simple schema with null valued rows, no aggs on numeric columns + List nullsSchemaColumns = ImmutableList.of( + // string dims with nulls + BenchmarkColumnSchema.makeZipf("stringZipf", ValueType.STRING, false, 1, 0.8, 1, 101, 1.5), + BenchmarkColumnSchema.makeDiscreteUniform("stringUniform", ValueType.STRING, false, 1, 0.3, 1, 100000), + BenchmarkColumnSchema.makeSequential("stringSequential", ValueType.STRING, false, 1, 0.5, 0, 1000), + + // numeric dims with nulls + BenchmarkColumnSchema.makeSequential("longSequential", ValueType.LONG, false, 1, 0.45, 0, 10000), + BenchmarkColumnSchema.makeDiscreteUniform("longUniform", ValueType.LONG, false, 1, 0.25, 0, 500), + BenchmarkColumnSchema.makeZipf("doubleZipf", ValueType.DOUBLE, false, 1, 0.1, 0, 1000, 2.0), + BenchmarkColumnSchema.makeZipf("floatZipf", ValueType.FLOAT, false, 1, 0.1, 0, 1000, 2.0) + ); + + List simpleNullsSchemaIngestAggs = new ArrayList<>(); + simpleNullsSchemaIngestAggs.add(new CountAggregatorFactory("rows")); + + Interval nullsSchemaDataInterval = Intervals.of("2000-01-01/P1D"); + + BenchmarkSchemaInfo nullsSchema = new BenchmarkSchemaInfo( + nullsSchemaColumns, + simpleNullsSchemaIngestAggs, + nullsSchemaDataInterval, + false + ); + + SCHEMA_MAP.put("nulls", nullsSchema); + } + + static { // simple schema with null valued rows, no aggs on numeric columns + List nullsSchemaColumns = ImmutableList.of( + // string dims + BenchmarkColumnSchema.makeZipf("stringZipf", ValueType.STRING, false, 1, null, 1, 101, 1.5), + BenchmarkColumnSchema.makeDiscreteUniform("stringUniform", ValueType.STRING, false, 1, null, 1, 100000), + BenchmarkColumnSchema.makeSequential("stringSequential", ValueType.STRING, false, 1, null, 0, 1000), + + // numeric dims + BenchmarkColumnSchema.makeSequential("longSequential", ValueType.LONG, false, 1, null, 0, 10000), + BenchmarkColumnSchema.makeDiscreteUniform("longUniform", ValueType.LONG, false, 1, null, 0, 500), + BenchmarkColumnSchema.makeZipf("doubleZipf", ValueType.DOUBLE, false, 1, null, 0, 1000, 2.0), + BenchmarkColumnSchema.makeZipf("floatZipf", ValueType.FLOAT, false, 1, null, 0, 1000, 2.0), + + // string dims with nulls + BenchmarkColumnSchema.makeZipf("stringZipfWithNulls", ValueType.STRING, false, 1, 0.8, 1, 101, 1.5), + BenchmarkColumnSchema.makeDiscreteUniform("stringUniformWithNulls", ValueType.STRING, false, 1, 0.3, 1, 100000), + BenchmarkColumnSchema.makeSequential("stringSequentialWithNulls", ValueType.STRING, false, 1, 0.5, 0, 1000), + + // numeric dims with nulls + BenchmarkColumnSchema.makeSequential("longSequentialWithNulls", ValueType.LONG, false, 1, 0.45, 0, 10000), + BenchmarkColumnSchema.makeDiscreteUniform("longUniformWithNulls", ValueType.LONG, false, 1, 0.25, 0, 500), + BenchmarkColumnSchema.makeZipf("doubleZipfWithNulls", ValueType.DOUBLE, false, 1, 0.1, 0, 1000, 2.0), + BenchmarkColumnSchema.makeZipf("floatZipfWithNulls", ValueType.FLOAT, false, 1, 0.1, 0, 1000, 2.0) + ); + + List simpleNullsSchemaIngestAggs = new ArrayList<>(); + simpleNullsSchemaIngestAggs.add(new CountAggregatorFactory("rows")); + + Interval nullsSchemaDataInterval = Intervals.of("2000-01-01/P1D"); + + BenchmarkSchemaInfo nullsSchema = new BenchmarkSchemaInfo( + nullsSchemaColumns, + simpleNullsSchemaIngestAggs, + nullsSchemaDataInterval, + false + ); + + SCHEMA_MAP.put("nulls-and-non-nulls", nullsSchema); + } } diff --git a/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/SegmentGenerator.java b/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/SegmentGenerator.java index e1dea314249..95a42d5ffd8 100644 --- a/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/SegmentGenerator.java +++ b/benchmarks/src/main/java/org/apache/druid/benchmark/datagen/SegmentGenerator.java @@ -19,16 +19,9 @@ package org.apache.druid.benchmark.datagen; -import com.google.common.collect.ImmutableList; import com.google.common.hash.Hashing; import org.apache.druid.common.config.NullHandling; import org.apache.druid.data.input.InputRow; -import org.apache.druid.data.input.impl.DimensionSchema; -import org.apache.druid.data.input.impl.DimensionsSpec; -import org.apache.druid.data.input.impl.DoubleDimensionSchema; -import org.apache.druid.data.input.impl.FloatDimensionSchema; -import org.apache.druid.data.input.impl.LongDimensionSchema; -import org.apache.druid.data.input.impl.StringDimensionSchema; import org.apache.druid.java.util.common.FileUtils; import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.StringUtils; @@ -71,7 +64,7 @@ public class SegmentGenerator implements Closeable static { NullHandling.initializeForTests(); } - + private final File cacheDir; private final boolean cleanupCacheDir; @@ -145,30 +138,8 @@ public class SegmentGenerator implements Closeable numRows ); - final List dimensions = new ArrayList<>(); - for (BenchmarkColumnSchema columnSchema : schemaInfo.getColumnSchemas()) { - if (schemaInfo.getAggs().stream().noneMatch(agg -> agg.getName().equals(columnSchema.getName()))) { - switch (columnSchema.getType()) { - case STRING: - dimensions.add(new StringDimensionSchema(columnSchema.getName())); - break; - case LONG: - dimensions.add(new LongDimensionSchema(columnSchema.getName())); - break; - case DOUBLE: - dimensions.add(new DoubleDimensionSchema(columnSchema.getName())); - break; - case FLOAT: - dimensions.add(new FloatDimensionSchema(columnSchema.getName())); - break; - default: - throw new ISE("Unhandleable type[%s]", columnSchema.getType()); - } - } - } - final IncrementalIndexSchema indexSchema = new IncrementalIndexSchema.Builder() - .withDimensionsSpec(new DimensionsSpec(dimensions, ImmutableList.of(), ImmutableList.of())) + .withDimensionsSpec(schemaInfo.getDimensionsSpec()) .withMetrics(schemaInfo.getAggsArray()) .withRollup(schemaInfo.isWithRollup()) .withQueryGranularity(granularity) diff --git a/benchmarks/src/main/java/org/apache/druid/benchmark/query/GroupByBenchmark.java b/benchmarks/src/main/java/org/apache/druid/benchmark/query/GroupByBenchmark.java index 8aca844c8b2..8cff9e5f1f2 100644 --- a/benchmarks/src/main/java/org/apache/druid/benchmark/query/GroupByBenchmark.java +++ b/benchmarks/src/main/java/org/apache/druid/benchmark/query/GroupByBenchmark.java @@ -19,6 +19,7 @@ package org.apache.druid.benchmark.query; +import com.fasterxml.jackson.databind.InjectableValues; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.dataformat.smile.SmileFactory; import com.google.common.base.Supplier; @@ -41,6 +42,7 @@ import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.granularity.Granularity; import org.apache.druid.java.util.common.guava.Sequence; import org.apache.druid.java.util.common.logger.Logger; +import org.apache.druid.math.expr.ExprMacroTable; import org.apache.druid.offheap.OffheapBufferGenerator; import org.apache.druid.query.DruidProcessingConfig; import org.apache.druid.query.FinalizeResultsQueryRunner; @@ -58,6 +60,7 @@ import org.apache.druid.query.aggregation.LongSumAggregatorFactory; import org.apache.druid.query.aggregation.hyperloglog.HyperUniquesSerde; import org.apache.druid.query.context.ResponseContext; import org.apache.druid.query.dimension.DefaultDimensionSpec; +import org.apache.druid.query.expression.TestExprMacroTable; import org.apache.druid.query.filter.BoundDimFilter; import org.apache.druid.query.groupby.GroupByQuery; import org.apache.druid.query.groupby.GroupByQueryConfig; @@ -166,7 +169,11 @@ public class GroupByBenchmark static { JSON_MAPPER = new DefaultObjectMapper(); INDEX_IO = new IndexIO( - JSON_MAPPER, + JSON_MAPPER.setInjectableValues( + new InjectableValues.Std() + .addValue(ExprMacroTable.class.getName(), TestExprMacroTable.INSTANCE) + .addValue(ObjectMapper.class.getName(), JSON_MAPPER) + ), new ColumnConfig() { @Override @@ -391,6 +398,34 @@ public class GroupByBenchmark simpleFloatQueries.put("A", queryA); } SCHEMA_QUERY_MAP.put("simpleFloat", simpleFloatQueries); + + // simple one column schema, for testing performance difference between querying on numeric values as Strings and + // directly as longs + Map nullQueries = new LinkedHashMap<>(); + BenchmarkSchemaInfo nullSchema = BenchmarkSchemas.SCHEMA_MAP.get("nulls"); + + { // simple-null + QuerySegmentSpec intervalSpec = new MultipleIntervalSegmentSpec(Collections.singletonList(nullSchema.getDataInterval())); + List queryAggs = new ArrayList<>(); + queryAggs.add(new DoubleSumAggregatorFactory( + "doubleSum", + "doubleZipf" + )); + GroupByQuery queryA = GroupByQuery + .builder() + .setDataSource("blah") + .setQuerySegmentSpec(intervalSpec) + .setDimensions(new DefaultDimensionSpec("stringZipf", "stringZipf", ValueType.STRING)) + .setAggregatorSpecs( + queryAggs + ) + .setGranularity(Granularity.fromString(queryGranularity)) + .setContext(ImmutableMap.of("vectorize", vectorize)) + .build(); + + nullQueries.put("A", queryA); + } + SCHEMA_QUERY_MAP.put("nulls", nullQueries); } @Setup(Level.Trial) @@ -549,6 +584,7 @@ public class GroupByBenchmark return new IncrementalIndex.Builder() .setIndexSchema( new IncrementalIndexSchema.Builder() + .withDimensionsSpec(schemaInfo.getDimensionsSpec()) .withMetrics(schemaInfo.getAggsArray()) .withRollup(withRollup) .build()