benchmark schema with numeric dimensions and null column values (#9036)

* benchmark schema with null column values

* oops

* adjustments

* rename again, different null percentages so rows have more variety

* more schema
Clint Wylie 2019-12-19 17:45:19 -08:00 committed by Jihoon Son
parent 3c31493772
commit c2e9ab8100
6 changed files with 150 additions and 32 deletions

BenchmarkColumnSchema.java

@@ -19,6 +19,12 @@
 package org.apache.druid.benchmark.datagen;

+import org.apache.druid.data.input.impl.DimensionSchema;
+import org.apache.druid.data.input.impl.DoubleDimensionSchema;
+import org.apache.druid.data.input.impl.FloatDimensionSchema;
+import org.apache.druid.data.input.impl.LongDimensionSchema;
+import org.apache.druid.data.input.impl.StringDimensionSchema;
+import org.apache.druid.java.util.common.IAE;
 import org.apache.druid.segment.column.ValueType;

 import java.util.List;
@@ -144,6 +150,22 @@ public class BenchmarkColumnSchema
     return new BenchmarkColumnValueGenerator(this, seed);
   }

+  public DimensionSchema getDimensionSchema()
+  {
+    switch (type) {
+      case LONG:
+        return new LongDimensionSchema(name);
+      case FLOAT:
+        return new FloatDimensionSchema(name);
+      case DOUBLE:
+        return new DoubleDimensionSchema(name);
+      case STRING:
+        return new StringDimensionSchema(name);
+      default:
+        throw new IAE("unable to make dimension schema for %s", type);
+    }
+  }
+
   public String getName()
   {
     return name;
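For reference, a hedged usage sketch of the new method: build a generator-side column schema and derive its ingestion-side dimension schema. The argument annotations are informal labels inferred from the makeZipf calls in BenchmarkSchemas later in this diff, not the actual parameter names; treat them as assumptions.

    // assumes the makeZipf factory shown later in this diff
    BenchmarkColumnSchema column = BenchmarkColumnSchema.makeZipf(
        "doubleZipf",      // column name
        ValueType.DOUBLE,  // generated value type
        false,             // assumption: whether the column is a metric
        1,                 // assumption: rows per generated value
        0.1,               // null probability: ~10% of generated rows are null
        0,                 // value range start
        1000,              // value range end
        2.0                // zipf distribution exponent
    );
    DimensionSchema dim = column.getDimensionSchema(); // -> DoubleDimensionSchema("doubleZipf")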

BenchmarkColumnValueGenerator.java

@@ -116,6 +116,13 @@ public class BenchmarkColumnValueGenerator
           ret = Long.parseLong(input.toString());
         }
         break;
+      case DOUBLE:
+        if (input instanceof Number) {
+          ret = ((Number) input).doubleValue();
+        } else {
+          ret = Double.parseDouble(input.toString());
+        }
+        break;
      case FLOAT:
        if (input instanceof Number) {
          ret = ((Number) input).floatValue();
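The added DOUBLE branch mirrors the existing LONG and FLOAT coercions: Number inputs are converted directly, anything else is parsed from its string form. As a standalone illustration (a hypothetical helper, not part of this commit):

    // coerce an arbitrary generated value to double, as the DOUBLE case above does
    static double coerceToDouble(Object input)
    {
      if (input instanceof Number) {
        return ((Number) input).doubleValue();
      }
      return Double.parseDouble(input.toString());
    }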

BenchmarkSchemaInfo.java

@@ -19,10 +19,13 @@
 package org.apache.druid.benchmark.datagen;

+import org.apache.druid.data.input.impl.DimensionSchema;
+import org.apache.druid.data.input.impl.DimensionsSpec;
 import org.apache.druid.query.aggregation.AggregatorFactory;
 import org.joda.time.Interval;

 import java.util.List;
+import java.util.stream.Collectors;

 public class BenchmarkSchemaInfo
 {
@@ -49,6 +52,16 @@ public class BenchmarkSchemaInfo
     return columnSchemas;
   }

+  public DimensionsSpec getDimensionsSpec()
+  {
+    List<DimensionSchema> specs = getColumnSchemas().stream()
+                                                    .filter(x -> !x.isMetric())
+                                                    .map(BenchmarkColumnSchema::getDimensionSchema)
+                                                    .collect(Collectors.toList());
+    return new DimensionsSpec(specs);
+  }
+
   public List<AggregatorFactory> getAggs()
   {
     return aggs;
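The new getDimensionsSpec() filters out metric columns and maps the rest through getDimensionSchema(), so callers can build an ingestion spec in one call. A minimal sketch of the intended call site, mirroring the SegmentGenerator change later in this diff:

    IncrementalIndexSchema indexSchema = new IncrementalIndexSchema.Builder()
        .withDimensionsSpec(schemaInfo.getDimensionsSpec()) // dims derived from non-metric columns
        .withMetrics(schemaInfo.getAggsArray())
        .build();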

BenchmarkSchemas.java

@@ -225,4 +225,73 @@ public class BenchmarkSchemas
     );
     SCHEMA_MAP.put("rollo", rolloSchema);
   }
+
+  static { // simple schema with null valued rows, no aggs on numeric columns
+    List<BenchmarkColumnSchema> nullsSchemaColumns = ImmutableList.of(
+        // string dims with nulls
+        BenchmarkColumnSchema.makeZipf("stringZipf", ValueType.STRING, false, 1, 0.8, 1, 101, 1.5),
+        BenchmarkColumnSchema.makeDiscreteUniform("stringUniform", ValueType.STRING, false, 1, 0.3, 1, 100000),
+        BenchmarkColumnSchema.makeSequential("stringSequential", ValueType.STRING, false, 1, 0.5, 0, 1000),
+
+        // numeric dims with nulls
+        BenchmarkColumnSchema.makeSequential("longSequential", ValueType.LONG, false, 1, 0.45, 0, 10000),
+        BenchmarkColumnSchema.makeDiscreteUniform("longUniform", ValueType.LONG, false, 1, 0.25, 0, 500),
+        BenchmarkColumnSchema.makeZipf("doubleZipf", ValueType.DOUBLE, false, 1, 0.1, 0, 1000, 2.0),
+        BenchmarkColumnSchema.makeZipf("floatZipf", ValueType.FLOAT, false, 1, 0.1, 0, 1000, 2.0)
+    );
+
+    List<AggregatorFactory> simpleNullsSchemaIngestAggs = new ArrayList<>();
+    simpleNullsSchemaIngestAggs.add(new CountAggregatorFactory("rows"));
+
+    Interval nullsSchemaDataInterval = Intervals.of("2000-01-01/P1D");
+
+    BenchmarkSchemaInfo nullsSchema = new BenchmarkSchemaInfo(
+        nullsSchemaColumns,
+        simpleNullsSchemaIngestAggs,
+        nullsSchemaDataInterval,
+        false
+    );
+    SCHEMA_MAP.put("nulls", nullsSchema);
+  }
+
+  static { // schema with a mix of null valued and fully populated columns, no aggs on numeric columns
+    List<BenchmarkColumnSchema> nullsSchemaColumns = ImmutableList.of(
+        // string dims
+        BenchmarkColumnSchema.makeZipf("stringZipf", ValueType.STRING, false, 1, null, 1, 101, 1.5),
+        BenchmarkColumnSchema.makeDiscreteUniform("stringUniform", ValueType.STRING, false, 1, null, 1, 100000),
+        BenchmarkColumnSchema.makeSequential("stringSequential", ValueType.STRING, false, 1, null, 0, 1000),
+
+        // numeric dims
+        BenchmarkColumnSchema.makeSequential("longSequential", ValueType.LONG, false, 1, null, 0, 10000),
+        BenchmarkColumnSchema.makeDiscreteUniform("longUniform", ValueType.LONG, false, 1, null, 0, 500),
+        BenchmarkColumnSchema.makeZipf("doubleZipf", ValueType.DOUBLE, false, 1, null, 0, 1000, 2.0),
+        BenchmarkColumnSchema.makeZipf("floatZipf", ValueType.FLOAT, false, 1, null, 0, 1000, 2.0),
+
+        // string dims with nulls
+        BenchmarkColumnSchema.makeZipf("stringZipfWithNulls", ValueType.STRING, false, 1, 0.8, 1, 101, 1.5),
+        BenchmarkColumnSchema.makeDiscreteUniform("stringUniformWithNulls", ValueType.STRING, false, 1, 0.3, 1, 100000),
+        BenchmarkColumnSchema.makeSequential("stringSequentialWithNulls", ValueType.STRING, false, 1, 0.5, 0, 1000),
+
+        // numeric dims with nulls
+        BenchmarkColumnSchema.makeSequential("longSequentialWithNulls", ValueType.LONG, false, 1, 0.45, 0, 10000),
+        BenchmarkColumnSchema.makeDiscreteUniform("longUniformWithNulls", ValueType.LONG, false, 1, 0.25, 0, 500),
+        BenchmarkColumnSchema.makeZipf("doubleZipfWithNulls", ValueType.DOUBLE, false, 1, 0.1, 0, 1000, 2.0),
+        BenchmarkColumnSchema.makeZipf("floatZipfWithNulls", ValueType.FLOAT, false, 1, 0.1, 0, 1000, 2.0)
+    );
+
+    List<AggregatorFactory> simpleNullsSchemaIngestAggs = new ArrayList<>();
+    simpleNullsSchemaIngestAggs.add(new CountAggregatorFactory("rows"));
+
+    Interval nullsSchemaDataInterval = Intervals.of("2000-01-01/P1D");
+
+    BenchmarkSchemaInfo nullsSchema = new BenchmarkSchemaInfo(
+        nullsSchemaColumns,
+        simpleNullsSchemaIngestAggs,
+        nullsSchemaDataInterval,
+        false
+    );
+    SCHEMA_MAP.put("nulls-and-non-nulls", nullsSchema);
+  }
 }
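A hedged sketch of consuming the newly registered schemas: look one up by its key and list the dimension schemas it now produces (the printing loop is purely illustrative):

    BenchmarkSchemaInfo nullsSchema = BenchmarkSchemas.SCHEMA_MAP.get("nulls");
    for (DimensionSchema dim : nullsSchema.getDimensionsSpec().getDimensions()) {
      // e.g. "stringZipf (string)", "longSequential (long)", ...
      System.out.println(dim.getName() + " (" + dim.getTypeName() + ")");
    }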

SegmentGenerator.java

@@ -19,16 +19,9 @@
 package org.apache.druid.benchmark.datagen;

-import com.google.common.collect.ImmutableList;
 import com.google.common.hash.Hashing;
 import org.apache.druid.common.config.NullHandling;
 import org.apache.druid.data.input.InputRow;
-import org.apache.druid.data.input.impl.DimensionSchema;
-import org.apache.druid.data.input.impl.DimensionsSpec;
-import org.apache.druid.data.input.impl.DoubleDimensionSchema;
-import org.apache.druid.data.input.impl.FloatDimensionSchema;
-import org.apache.druid.data.input.impl.LongDimensionSchema;
-import org.apache.druid.data.input.impl.StringDimensionSchema;
 import org.apache.druid.java.util.common.FileUtils;
 import org.apache.druid.java.util.common.ISE;
 import org.apache.druid.java.util.common.StringUtils;
@@ -145,30 +138,8 @@ public class SegmentGenerator implements Closeable
         numRows
     );

-    final List<DimensionSchema> dimensions = new ArrayList<>();
-    for (BenchmarkColumnSchema columnSchema : schemaInfo.getColumnSchemas()) {
-      if (schemaInfo.getAggs().stream().noneMatch(agg -> agg.getName().equals(columnSchema.getName()))) {
-        switch (columnSchema.getType()) {
-          case STRING:
-            dimensions.add(new StringDimensionSchema(columnSchema.getName()));
-            break;
-          case LONG:
-            dimensions.add(new LongDimensionSchema(columnSchema.getName()));
-            break;
-          case DOUBLE:
-            dimensions.add(new DoubleDimensionSchema(columnSchema.getName()));
-            break;
-          case FLOAT:
-            dimensions.add(new FloatDimensionSchema(columnSchema.getName()));
-            break;
-          default:
-            throw new ISE("Unhandleable type[%s]", columnSchema.getType());
-        }
-      }
-    }
-
     final IncrementalIndexSchema indexSchema = new IncrementalIndexSchema.Builder()
-        .withDimensionsSpec(new DimensionsSpec(dimensions, ImmutableList.of(), ImmutableList.of()))
+        .withDimensionsSpec(schemaInfo.getDimensionsSpec())
         .withMetrics(schemaInfo.getAggsArray())
         .withRollup(schemaInfo.isWithRollup())
         .withQueryGranularity(granularity)

GroupByBenchmark.java

@@ -19,6 +19,7 @@
 package org.apache.druid.benchmark.query;

+import com.fasterxml.jackson.databind.InjectableValues;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.dataformat.smile.SmileFactory;
 import com.google.common.base.Supplier;
@@ -41,6 +42,7 @@ import org.apache.druid.java.util.common.granularity.Granularities;
 import org.apache.druid.java.util.common.granularity.Granularity;
 import org.apache.druid.java.util.common.guava.Sequence;
 import org.apache.druid.java.util.common.logger.Logger;
+import org.apache.druid.math.expr.ExprMacroTable;
 import org.apache.druid.offheap.OffheapBufferGenerator;
 import org.apache.druid.query.DruidProcessingConfig;
 import org.apache.druid.query.FinalizeResultsQueryRunner;
@@ -58,6 +60,7 @@ import org.apache.druid.query.aggregation.LongSumAggregatorFactory;
 import org.apache.druid.query.aggregation.hyperloglog.HyperUniquesSerde;
 import org.apache.druid.query.context.ResponseContext;
 import org.apache.druid.query.dimension.DefaultDimensionSpec;
+import org.apache.druid.query.expression.TestExprMacroTable;
 import org.apache.druid.query.filter.BoundDimFilter;
 import org.apache.druid.query.groupby.GroupByQuery;
 import org.apache.druid.query.groupby.GroupByQueryConfig;
@@ -166,7 +169,11 @@ public class GroupByBenchmark
   static {
     JSON_MAPPER = new DefaultObjectMapper();
     INDEX_IO = new IndexIO(
-        JSON_MAPPER,
+        JSON_MAPPER.setInjectableValues(
+            new InjectableValues.Std()
+                .addValue(ExprMacroTable.class.getName(), TestExprMacroTable.INSTANCE)
+                .addValue(ObjectMapper.class.getName(), JSON_MAPPER)
+        ),
         new ColumnConfig()
         {
           @Override
@@ -391,6 +398,34 @@ public class GroupByBenchmark
       simpleFloatQueries.put("A", queryA);
     }
     SCHEMA_QUERY_MAP.put("simpleFloat", simpleFloatQueries);

+    // queries against the "nulls" schema: group on a string dimension and
+    // aggregate a double dimension whose rows contain null values
+    Map<String, GroupByQuery> nullQueries = new LinkedHashMap<>();
+    BenchmarkSchemaInfo nullSchema = BenchmarkSchemas.SCHEMA_MAP.get("nulls");
+    { // simple-null
+      QuerySegmentSpec intervalSpec = new MultipleIntervalSegmentSpec(Collections.singletonList(nullSchema.getDataInterval()));
+      List<AggregatorFactory> queryAggs = new ArrayList<>();
+      queryAggs.add(new DoubleSumAggregatorFactory(
+          "doubleSum",
+          "doubleZipf"
+      ));
+
+      GroupByQuery queryA = GroupByQuery
+          .builder()
+          .setDataSource("blah")
+          .setQuerySegmentSpec(intervalSpec)
+          .setDimensions(new DefaultDimensionSpec("stringZipf", "stringZipf", ValueType.STRING))
+          .setAggregatorSpecs(
+              queryAggs
+          )
+          .setGranularity(Granularity.fromString(queryGranularity))
+          .setContext(ImmutableMap.of("vectorize", vectorize))
+          .build();
+
+      nullQueries.put("A", queryA);
+    }
+    SCHEMA_QUERY_MAP.put("nulls", nullQueries);
   }

   @Setup(Level.Trial)
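With both maps populated, benchmark setup can resolve a schema and its queries by name; a minimal sketch using the keys added above:

    BenchmarkSchemaInfo schemaInfo = BenchmarkSchemas.SCHEMA_MAP.get("nulls");
    GroupByQuery query = SCHEMA_QUERY_MAP.get("nulls").get("A"); // groups on stringZipf, sums doubleZipf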
@@ -549,6 +584,7 @@ public class GroupByBenchmark
     return new IncrementalIndex.Builder()
         .setIndexSchema(
             new IncrementalIndexSchema.Builder()
+                .withDimensionsSpec(schemaInfo.getDimensionsSpec())
                 .withMetrics(schemaInfo.getAggsArray())
                 .withRollup(withRollup)
                 .build()