mirror of https://github.com/apache/druid.git
benchmark schema with numeric dimensions and null column values (#9036)
* benchmark schema with null column values * oops * adjustments * rename again, different null percentage so rows more variety * more schema
This commit is contained in:
parent
3c31493772
commit
c2e9ab8100
|
@ -19,6 +19,12 @@
|
||||||
|
|
||||||
package org.apache.druid.benchmark.datagen;
|
package org.apache.druid.benchmark.datagen;
|
||||||
|
|
||||||
|
import org.apache.druid.data.input.impl.DimensionSchema;
|
||||||
|
import org.apache.druid.data.input.impl.DoubleDimensionSchema;
|
||||||
|
import org.apache.druid.data.input.impl.FloatDimensionSchema;
|
||||||
|
import org.apache.druid.data.input.impl.LongDimensionSchema;
|
||||||
|
import org.apache.druid.data.input.impl.StringDimensionSchema;
|
||||||
|
import org.apache.druid.java.util.common.IAE;
|
||||||
import org.apache.druid.segment.column.ValueType;
|
import org.apache.druid.segment.column.ValueType;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -144,6 +150,22 @@ public class BenchmarkColumnSchema
|
||||||
return new BenchmarkColumnValueGenerator(this, seed);
|
return new BenchmarkColumnValueGenerator(this, seed);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds the {@link DimensionSchema} that corresponds to this column's value type, using this
 * column's name as the dimension name. LONG, FLOAT, DOUBLE and STRING are handled.
 *
 * @return a new long, float, double or string dimension schema named after this column
 * @throws IAE if the column type is not one of the four handled types
 */
public DimensionSchema getDimensionSchema()
|
||||||
|
{
|
||||||
|
switch (type) {
|
||||||
|
case LONG:
|
||||||
|
return new LongDimensionSchema(name);
|
||||||
|
case FLOAT:
|
||||||
|
return new FloatDimensionSchema(name);
|
||||||
|
case DOUBLE:
|
||||||
|
return new DoubleDimensionSchema(name);
|
||||||
|
case STRING:
|
||||||
|
return new StringDimensionSchema(name);
|
||||||
|
// any other value type has no dimension schema mapping here, so fail with the offending type
default:
|
||||||
|
throw new IAE("unable to make dimension schema for %s", type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public String getName()
|
public String getName()
|
||||||
{
|
{
|
||||||
return name;
|
return name;
|
||||||
|
|
|
@ -116,6 +116,13 @@ public class BenchmarkColumnValueGenerator
|
||||||
ret = Long.parseLong(input.toString());
|
ret = Long.parseLong(input.toString());
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case DOUBLE:
|
||||||
|
if (input instanceof Number) {
|
||||||
|
ret = ((Number) input).doubleValue();
|
||||||
|
} else {
|
||||||
|
ret = Double.parseDouble(input.toString());
|
||||||
|
}
|
||||||
|
break;
|
||||||
case FLOAT:
|
case FLOAT:
|
||||||
if (input instanceof Number) {
|
if (input instanceof Number) {
|
||||||
ret = ((Number) input).floatValue();
|
ret = ((Number) input).floatValue();
|
||||||
|
|
|
@ -19,10 +19,13 @@
|
||||||
|
|
||||||
package org.apache.druid.benchmark.datagen;
|
package org.apache.druid.benchmark.datagen;
|
||||||
|
|
||||||
|
import org.apache.druid.data.input.impl.DimensionSchema;
|
||||||
|
import org.apache.druid.data.input.impl.DimensionsSpec;
|
||||||
import org.apache.druid.query.aggregation.AggregatorFactory;
|
import org.apache.druid.query.aggregation.AggregatorFactory;
|
||||||
import org.joda.time.Interval;
|
import org.joda.time.Interval;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
public class BenchmarkSchemaInfo
|
public class BenchmarkSchemaInfo
|
||||||
{
|
{
|
||||||
|
@ -49,6 +52,16 @@ public class BenchmarkSchemaInfo
|
||||||
return columnSchemas;
|
return columnSchemas;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds a {@link DimensionsSpec} from the column schemas of this benchmark schema: every column
 * for which {@code isMetric()} is false is converted with
 * {@link BenchmarkColumnSchema#getDimensionSchema()}; metric columns are left out.
 *
 * @return a dimensions spec containing one dimension schema per non-metric column
 */
public DimensionsSpec getDimensionsSpec()
|
||||||
|
{
|
||||||
|
List<DimensionSchema> specs = getColumnSchemas().stream()
|
||||||
|
// keep only the columns that are not flagged as metrics
.filter(x -> !x.isMetric())
|
||||||
|
// pick the dimension schema matching each column's value type
.map(BenchmarkColumnSchema::getDimensionSchema)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
return new DimensionsSpec(specs);
|
||||||
|
}
|
||||||
|
|
||||||
public List<AggregatorFactory> getAggs()
|
public List<AggregatorFactory> getAggs()
|
||||||
{
|
{
|
||||||
return aggs;
|
return aggs;
|
||||||
|
|
|
@ -225,4 +225,73 @@ public class BenchmarkSchemas
|
||||||
);
|
);
|
||||||
SCHEMA_MAP.put("rollo", rolloSchema);
|
SCHEMA_MAP.put("rollo", rolloSchema);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static { // simple schema with null valued rows, no aggs on numeric columns
|
||||||
|
List<BenchmarkColumnSchema> nullsSchemaColumns = ImmutableList.of(
|
||||||
|
// NOTE(review): the fifth argument of each factory call (0.8, 0.3, 0.5, ...) is presumably the
// fraction of generated values that are null — confirm against BenchmarkColumnSchema
// string dims with nulls
|
||||||
|
BenchmarkColumnSchema.makeZipf("stringZipf", ValueType.STRING, false, 1, 0.8, 1, 101, 1.5),
|
||||||
|
BenchmarkColumnSchema.makeDiscreteUniform("stringUniform", ValueType.STRING, false, 1, 0.3, 1, 100000),
|
||||||
|
BenchmarkColumnSchema.makeSequential("stringSequential", ValueType.STRING, false, 1, 0.5, 0, 1000),
|
||||||
|
|
||||||
|
// numeric dims with nulls
|
||||||
|
BenchmarkColumnSchema.makeSequential("longSequential", ValueType.LONG, false, 1, 0.45, 0, 10000),
|
||||||
|
BenchmarkColumnSchema.makeDiscreteUniform("longUniform", ValueType.LONG, false, 1, 0.25, 0, 500),
|
||||||
|
BenchmarkColumnSchema.makeZipf("doubleZipf", ValueType.DOUBLE, false, 1, 0.1, 0, 1000, 2.0),
|
||||||
|
BenchmarkColumnSchema.makeZipf("floatZipf", ValueType.FLOAT, false, 1, 0.1, 0, 1000, 2.0)
|
||||||
|
);
|
||||||
|
|
||||||
|
// the only ingest-time aggregator is a row count named "rows"
List<AggregatorFactory> simpleNullsSchemaIngestAggs = new ArrayList<>();
|
||||||
|
simpleNullsSchemaIngestAggs.add(new CountAggregatorFactory("rows"));
|
||||||
|
|
||||||
|
Interval nullsSchemaDataInterval = Intervals.of("2000-01-01/P1D");
|
||||||
|
|
||||||
|
BenchmarkSchemaInfo nullsSchema = new BenchmarkSchemaInfo(
|
||||||
|
nullsSchemaColumns,
|
||||||
|
simpleNullsSchemaIngestAggs,
|
||||||
|
nullsSchemaDataInterval,
|
||||||
|
// NOTE(review): presumably the withRollup flag — confirm against the BenchmarkSchemaInfo constructor
false
|
||||||
|
);
|
||||||
|
|
||||||
|
// registered under the key "nulls"
SCHEMA_MAP.put("nulls", nullsSchema);
|
||||||
|
}
|
||||||
|
|
||||||
|
static { // simple schema with both null-free and null valued columns, no aggs on numeric columns
|
||||||
|
List<BenchmarkColumnSchema> nullsSchemaColumns = ImmutableList.of(
|
||||||
|
// NOTE(review): the fifth argument of each factory call is presumably the fraction of generated
// values that are null, with a null argument meaning the column has no nulls — confirm against
// BenchmarkColumnSchema
// string dims
|
||||||
|
BenchmarkColumnSchema.makeZipf("stringZipf", ValueType.STRING, false, 1, null, 1, 101, 1.5),
|
||||||
|
BenchmarkColumnSchema.makeDiscreteUniform("stringUniform", ValueType.STRING, false, 1, null, 1, 100000),
|
||||||
|
BenchmarkColumnSchema.makeSequential("stringSequential", ValueType.STRING, false, 1, null, 0, 1000),
|
||||||
|
|
||||||
|
// numeric dims
|
||||||
|
BenchmarkColumnSchema.makeSequential("longSequential", ValueType.LONG, false, 1, null, 0, 10000),
|
||||||
|
BenchmarkColumnSchema.makeDiscreteUniform("longUniform", ValueType.LONG, false, 1, null, 0, 500),
|
||||||
|
BenchmarkColumnSchema.makeZipf("doubleZipf", ValueType.DOUBLE, false, 1, null, 0, 1000, 2.0),
|
||||||
|
BenchmarkColumnSchema.makeZipf("floatZipf", ValueType.FLOAT, false, 1, null, 0, 1000, 2.0),
|
||||||
|
|
||||||
|
// string dims with nulls
|
||||||
|
BenchmarkColumnSchema.makeZipf("stringZipfWithNulls", ValueType.STRING, false, 1, 0.8, 1, 101, 1.5),
|
||||||
|
BenchmarkColumnSchema.makeDiscreteUniform("stringUniformWithNulls", ValueType.STRING, false, 1, 0.3, 1, 100000),
|
||||||
|
BenchmarkColumnSchema.makeSequential("stringSequentialWithNulls", ValueType.STRING, false, 1, 0.5, 0, 1000),
|
||||||
|
|
||||||
|
// numeric dims with nulls
|
||||||
|
BenchmarkColumnSchema.makeSequential("longSequentialWithNulls", ValueType.LONG, false, 1, 0.45, 0, 10000),
|
||||||
|
BenchmarkColumnSchema.makeDiscreteUniform("longUniformWithNulls", ValueType.LONG, false, 1, 0.25, 0, 500),
|
||||||
|
BenchmarkColumnSchema.makeZipf("doubleZipfWithNulls", ValueType.DOUBLE, false, 1, 0.1, 0, 1000, 2.0),
|
||||||
|
BenchmarkColumnSchema.makeZipf("floatZipfWithNulls", ValueType.FLOAT, false, 1, 0.1, 0, 1000, 2.0)
|
||||||
|
);
|
||||||
|
|
||||||
|
// the only ingest-time aggregator is a row count named "rows"
List<AggregatorFactory> simpleNullsSchemaIngestAggs = new ArrayList<>();
|
||||||
|
simpleNullsSchemaIngestAggs.add(new CountAggregatorFactory("rows"));
|
||||||
|
|
||||||
|
Interval nullsSchemaDataInterval = Intervals.of("2000-01-01/P1D");
|
||||||
|
|
||||||
|
BenchmarkSchemaInfo nullsSchema = new BenchmarkSchemaInfo(
|
||||||
|
nullsSchemaColumns,
|
||||||
|
simpleNullsSchemaIngestAggs,
|
||||||
|
nullsSchemaDataInterval,
|
||||||
|
// NOTE(review): presumably the withRollup flag — confirm against the BenchmarkSchemaInfo constructor
false
|
||||||
|
);
|
||||||
|
|
||||||
|
// registered under the key "nulls-and-non-nulls"
SCHEMA_MAP.put("nulls-and-non-nulls", nullsSchema);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,16 +19,9 @@
|
||||||
|
|
||||||
package org.apache.druid.benchmark.datagen;
|
package org.apache.druid.benchmark.datagen;
|
||||||
|
|
||||||
import com.google.common.collect.ImmutableList;
|
|
||||||
import com.google.common.hash.Hashing;
|
import com.google.common.hash.Hashing;
|
||||||
import org.apache.druid.common.config.NullHandling;
|
import org.apache.druid.common.config.NullHandling;
|
||||||
import org.apache.druid.data.input.InputRow;
|
import org.apache.druid.data.input.InputRow;
|
||||||
import org.apache.druid.data.input.impl.DimensionSchema;
|
|
||||||
import org.apache.druid.data.input.impl.DimensionsSpec;
|
|
||||||
import org.apache.druid.data.input.impl.DoubleDimensionSchema;
|
|
||||||
import org.apache.druid.data.input.impl.FloatDimensionSchema;
|
|
||||||
import org.apache.druid.data.input.impl.LongDimensionSchema;
|
|
||||||
import org.apache.druid.data.input.impl.StringDimensionSchema;
|
|
||||||
import org.apache.druid.java.util.common.FileUtils;
|
import org.apache.druid.java.util.common.FileUtils;
|
||||||
import org.apache.druid.java.util.common.ISE;
|
import org.apache.druid.java.util.common.ISE;
|
||||||
import org.apache.druid.java.util.common.StringUtils;
|
import org.apache.druid.java.util.common.StringUtils;
|
||||||
|
@ -145,30 +138,8 @@ public class SegmentGenerator implements Closeable
|
||||||
numRows
|
numRows
|
||||||
);
|
);
|
||||||
|
|
||||||
final List<DimensionSchema> dimensions = new ArrayList<>();
|
|
||||||
for (BenchmarkColumnSchema columnSchema : schemaInfo.getColumnSchemas()) {
|
|
||||||
if (schemaInfo.getAggs().stream().noneMatch(agg -> agg.getName().equals(columnSchema.getName()))) {
|
|
||||||
switch (columnSchema.getType()) {
|
|
||||||
case STRING:
|
|
||||||
dimensions.add(new StringDimensionSchema(columnSchema.getName()));
|
|
||||||
break;
|
|
||||||
case LONG:
|
|
||||||
dimensions.add(new LongDimensionSchema(columnSchema.getName()));
|
|
||||||
break;
|
|
||||||
case DOUBLE:
|
|
||||||
dimensions.add(new DoubleDimensionSchema(columnSchema.getName()));
|
|
||||||
break;
|
|
||||||
case FLOAT:
|
|
||||||
dimensions.add(new FloatDimensionSchema(columnSchema.getName()));
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
throw new ISE("Unhandleable type[%s]", columnSchema.getType());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
final IncrementalIndexSchema indexSchema = new IncrementalIndexSchema.Builder()
|
final IncrementalIndexSchema indexSchema = new IncrementalIndexSchema.Builder()
|
||||||
.withDimensionsSpec(new DimensionsSpec(dimensions, ImmutableList.of(), ImmutableList.of()))
|
.withDimensionsSpec(schemaInfo.getDimensionsSpec())
|
||||||
.withMetrics(schemaInfo.getAggsArray())
|
.withMetrics(schemaInfo.getAggsArray())
|
||||||
.withRollup(schemaInfo.isWithRollup())
|
.withRollup(schemaInfo.isWithRollup())
|
||||||
.withQueryGranularity(granularity)
|
.withQueryGranularity(granularity)
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
|
|
||||||
package org.apache.druid.benchmark.query;
|
package org.apache.druid.benchmark.query;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.InjectableValues;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.fasterxml.jackson.dataformat.smile.SmileFactory;
|
import com.fasterxml.jackson.dataformat.smile.SmileFactory;
|
||||||
import com.google.common.base.Supplier;
|
import com.google.common.base.Supplier;
|
||||||
|
@ -41,6 +42,7 @@ import org.apache.druid.java.util.common.granularity.Granularities;
|
||||||
import org.apache.druid.java.util.common.granularity.Granularity;
|
import org.apache.druid.java.util.common.granularity.Granularity;
|
||||||
import org.apache.druid.java.util.common.guava.Sequence;
|
import org.apache.druid.java.util.common.guava.Sequence;
|
||||||
import org.apache.druid.java.util.common.logger.Logger;
|
import org.apache.druid.java.util.common.logger.Logger;
|
||||||
|
import org.apache.druid.math.expr.ExprMacroTable;
|
||||||
import org.apache.druid.offheap.OffheapBufferGenerator;
|
import org.apache.druid.offheap.OffheapBufferGenerator;
|
||||||
import org.apache.druid.query.DruidProcessingConfig;
|
import org.apache.druid.query.DruidProcessingConfig;
|
||||||
import org.apache.druid.query.FinalizeResultsQueryRunner;
|
import org.apache.druid.query.FinalizeResultsQueryRunner;
|
||||||
|
@ -58,6 +60,7 @@ import org.apache.druid.query.aggregation.LongSumAggregatorFactory;
|
||||||
import org.apache.druid.query.aggregation.hyperloglog.HyperUniquesSerde;
|
import org.apache.druid.query.aggregation.hyperloglog.HyperUniquesSerde;
|
||||||
import org.apache.druid.query.context.ResponseContext;
|
import org.apache.druid.query.context.ResponseContext;
|
||||||
import org.apache.druid.query.dimension.DefaultDimensionSpec;
|
import org.apache.druid.query.dimension.DefaultDimensionSpec;
|
||||||
|
import org.apache.druid.query.expression.TestExprMacroTable;
|
||||||
import org.apache.druid.query.filter.BoundDimFilter;
|
import org.apache.druid.query.filter.BoundDimFilter;
|
||||||
import org.apache.druid.query.groupby.GroupByQuery;
|
import org.apache.druid.query.groupby.GroupByQuery;
|
||||||
import org.apache.druid.query.groupby.GroupByQueryConfig;
|
import org.apache.druid.query.groupby.GroupByQueryConfig;
|
||||||
|
@ -166,7 +169,11 @@ public class GroupByBenchmark
|
||||||
static {
|
static {
|
||||||
JSON_MAPPER = new DefaultObjectMapper();
|
JSON_MAPPER = new DefaultObjectMapper();
|
||||||
INDEX_IO = new IndexIO(
|
INDEX_IO = new IndexIO(
|
||||||
JSON_MAPPER,
|
JSON_MAPPER.setInjectableValues(
|
||||||
|
new InjectableValues.Std()
|
||||||
|
.addValue(ExprMacroTable.class.getName(), TestExprMacroTable.INSTANCE)
|
||||||
|
.addValue(ObjectMapper.class.getName(), JSON_MAPPER)
|
||||||
|
),
|
||||||
new ColumnConfig()
|
new ColumnConfig()
|
||||||
{
|
{
|
||||||
@Override
|
@Override
|
||||||
|
@ -391,6 +398,34 @@ public class GroupByBenchmark
|
||||||
simpleFloatQueries.put("A", queryA);
|
simpleFloatQueries.put("A", queryA);
|
||||||
}
|
}
|
||||||
SCHEMA_QUERY_MAP.put("simpleFloat", simpleFloatQueries);
|
SCHEMA_QUERY_MAP.put("simpleFloat", simpleFloatQueries);
|
||||||
|
|
||||||
|
// schema whose string and numeric dimensions contain null values, for testing the performance of grouping on a
|
||||||
|
// string dimension with nulls while summing a double column with nulls
|
||||||
|
Map<String, GroupByQuery> nullQueries = new LinkedHashMap<>();
|
||||||
|
BenchmarkSchemaInfo nullSchema = BenchmarkSchemas.SCHEMA_MAP.get("nulls");
|
||||||
|
|
||||||
|
{ // simple-null
|
||||||
|
QuerySegmentSpec intervalSpec = new MultipleIntervalSegmentSpec(Collections.singletonList(nullSchema.getDataInterval()));
|
||||||
|
List<AggregatorFactory> queryAggs = new ArrayList<>();
|
||||||
|
queryAggs.add(new DoubleSumAggregatorFactory(
|
||||||
|
"doubleSum",
|
||||||
|
"doubleZipf"
|
||||||
|
));
|
||||||
|
GroupByQuery queryA = GroupByQuery
|
||||||
|
.builder()
|
||||||
|
.setDataSource("blah")
|
||||||
|
.setQuerySegmentSpec(intervalSpec)
|
||||||
|
.setDimensions(new DefaultDimensionSpec("stringZipf", "stringZipf", ValueType.STRING))
|
||||||
|
.setAggregatorSpecs(
|
||||||
|
queryAggs
|
||||||
|
)
|
||||||
|
.setGranularity(Granularity.fromString(queryGranularity))
|
||||||
|
.setContext(ImmutableMap.of("vectorize", vectorize))
|
||||||
|
.build();
|
||||||
|
|
||||||
|
nullQueries.put("A", queryA);
|
||||||
|
}
|
||||||
|
SCHEMA_QUERY_MAP.put("nulls", nullQueries);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Setup(Level.Trial)
|
@Setup(Level.Trial)
|
||||||
|
@ -549,6 +584,7 @@ public class GroupByBenchmark
|
||||||
return new IncrementalIndex.Builder()
|
return new IncrementalIndex.Builder()
|
||||||
.setIndexSchema(
|
.setIndexSchema(
|
||||||
new IncrementalIndexSchema.Builder()
|
new IncrementalIndexSchema.Builder()
|
||||||
|
.withDimensionsSpec(schemaInfo.getDimensionsSpec())
|
||||||
.withMetrics(schemaInfo.getAggsArray())
|
.withMetrics(schemaInfo.getAggsArray())
|
||||||
.withRollup(withRollup)
|
.withRollup(withRollup)
|
||||||
.build()
|
.build()
|
||||||
|
|
Loading…
Reference in New Issue