From e6a6284ebd37584c7d56ca3f346be42ea8ca70ff Mon Sep 17 00:00:00 2001
From: jon-wei
Date: Thu, 17 Sep 2015 18:53:03 -0700
Subject: [PATCH] Allow SegmentMetadataQuery to skip cardinality and size calculations

---
 docs/content/querying/segmentmetadataquery.md |  19 ++++
 .../src/main/java/io/druid/query/Druids.java  |   1 +
 .../druid/query/metadata/SegmentAnalyzer.java | 105 +++++++++++++-----
 .../SegmentMetadataQueryRunnerFactory.java    |  15 ++-
 .../metadata/SegmentMetadataQuery.java        |  49 +++++++-
 .../query/metadata/SegmentAnalyzerTest.java   |  59 ++++++++--
 .../metadata/SegmentMetadataQueryTest.java    |  11 +-
 7 files changed, 216 insertions(+), 43 deletions(-)

diff --git a/docs/content/querying/segmentmetadataquery.md b/docs/content/querying/segmentmetadataquery.md
index 3ddf1f2b914..6337c621bc5 100644
--- a/docs/content/querying/segmentmetadataquery.md
+++ b/docs/content/querying/segmentmetadataquery.md
@@ -29,6 +29,7 @@ There are several main parts to a segment metadata query:
 |toInclude|A JSON Object representing what columns should be included in the result. Defaults to "all".|no|
 |merge|Merge all individual segment metadata results into a single result|no|
 |context|See [Context](../querying/query-context.html)|no|
+|analysisTypes|A list of Strings specifying what column properties (e.g. cardinality, size) should be calculated and returned in the result. Defaults to ["cardinality", "size"]. See section [analysisTypes](#analysistypes) for more details.|no|
 
 The format of the result is:
 
@@ -86,3 +87,21 @@ The grammar is as follows:
 ``` json
 "toInclude": { "type": "list", "columns": []}
 ```
+
+### analysisTypes
+
+This is a list of properties that determines the amount of information returned about the columns, i.e. the analyses to be performed on the columns.
+
+By default, all analysis types will be used. If a property is not needed, omitting it from this list will result in a more efficient query.
+
+There are two types of column analyses:
+
+#### cardinality
+
+* Estimated floor of cardinality for each column. Only relevant for dimension columns.
+
+#### size
+
+* Estimated byte size for the segment columns if they were stored in a flat format
+
+* Estimated total segment byte size as if it was stored in a flat format
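+
+As an illustration (the datasource name and interval below are placeholders), a query that needs only cardinality and wants to skip the more expensive size estimation could look like:
+
+``` json
+{
+  "queryType": "segmentMetadata",
+  "dataSource": "sample_datasource",
+  "intervals": ["2013-01-01/2014-01-01"],
+  "analysisTypes": ["cardinality"]
+}
+```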
diff --git a/processing/src/main/java/io/druid/query/Druids.java b/processing/src/main/java/io/druid/query/Druids.java
index 46ae2a0b2bd..32253388782 100644
--- a/processing/src/main/java/io/druid/query/Druids.java
+++ b/processing/src/main/java/io/druid/query/Druids.java
@@ -905,6 +905,7 @@ public class Druids
           toInclude,
           merge,
           context,
+          null,
           false
       );
     }
diff --git a/processing/src/main/java/io/druid/query/metadata/SegmentAnalyzer.java b/processing/src/main/java/io/druid/query/metadata/SegmentAnalyzer.java
index af6d254effd..e086cb9a2d8 100644
--- a/processing/src/main/java/io/druid/query/metadata/SegmentAnalyzer.java
+++ b/processing/src/main/java/io/druid/query/metadata/SegmentAnalyzer.java
@@ -26,6 +26,7 @@ import com.google.common.primitives.Longs;
 import com.metamx.common.logger.Logger;
 import com.metamx.common.StringUtils;
 import io.druid.query.metadata.metadata.ColumnAnalysis;
+import io.druid.query.metadata.metadata.SegmentMetadataQuery;
 import io.druid.segment.QueryableIndex;
 import io.druid.segment.StorageAdapter;
 import io.druid.segment.column.BitmapIndex;
@@ -38,6 +39,7 @@ import io.druid.segment.serde.ComplexMetricSerde;
 import io.druid.segment.serde.ComplexMetrics;
 
 import java.util.Collections;
+import java.util.EnumSet;
 import java.util.List;
 import java.util.Map;
 
@@ -55,7 +57,7 @@ public class SegmentAnalyzer
    */
   private static final int NUM_BYTES_IN_TEXT_FLOAT = 8;
 
-  public Map<String, ColumnAnalysis> analyze(QueryableIndex index)
+  public Map<String, ColumnAnalysis> analyze(QueryableIndex index, EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes)
   {
     Preconditions.checkNotNull(index, "Index cannot be null");
 
@@ -69,16 +71,16 @@
       final ValueType type = capabilities.getType();
       switch (type) {
         case LONG:
-          analysis = analyzeLongColumn(column);
+          analysis = analyzeLongColumn(column, analysisTypes);
           break;
         case FLOAT:
-          analysis = analyzeFloatColumn(column);
+          analysis = analyzeFloatColumn(column, analysisTypes);
           break;
         case STRING:
-          analysis = analyzeStringColumn(column);
+          analysis = analyzeStringColumn(column, analysisTypes);
           break;
         case COMPLEX:
-          analysis = analyzeComplexColumn(column);
+          analysis = analyzeComplexColumn(column, analysisTypes);
           break;
         default:
           log.warn("Unknown column type[%s].", type);
@@ -90,13 +92,13 @@
 
     columns.put(
         Column.TIME_COLUMN_NAME,
-        lengthBasedAnalysis(index.getColumn(Column.TIME_COLUMN_NAME), NUM_BYTES_IN_TIMESTAMP)
+        lengthBasedAnalysis(index.getColumn(Column.TIME_COLUMN_NAME), NUM_BYTES_IN_TIMESTAMP, analysisTypes)
     );
 
     return columns;
   }
 
-  public Map<String, ColumnAnalysis> analyze(StorageAdapter adapter)
+  public Map<String, ColumnAnalysis> analyze(StorageAdapter adapter, EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes)
   {
     Preconditions.checkNotNull(adapter, "Adapter cannot be null");
     Map<String, ColumnAnalysis> columns = Maps.newTreeMap();
@@ -114,16 +116,34 @@
       ValueType capType = capabilities.getType();
       switch (capType) {
         case LONG:
-          analysis = lengthBasedAnalysisForAdapter(capType.name(), capabilities, numRows, Longs.BYTES);
+          analysis = lengthBasedAnalysisForAdapter(
+              analysisTypes,
+              capType.name(), capabilities,
+              numRows, Longs.BYTES
+          );
           break;
         case FLOAT:
-          analysis = lengthBasedAnalysisForAdapter(capType.name(), capabilities, numRows, NUM_BYTES_IN_TEXT_FLOAT);
+          analysis = lengthBasedAnalysisForAdapter(
+              analysisTypes,
+              capType.name(), capabilities,
+              numRows, NUM_BYTES_IN_TEXT_FLOAT
+          );
           break;
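+        // Look up the dimension cardinality only when the cardinality analysis was requested.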
         case STRING:
-          analysis = new ColumnAnalysis(capType.name(), 0, adapter.getDimensionCardinality(columnName), null);
+          analysis = new ColumnAnalysis(
+              capType.name(),
+              0,
+              analysisHasCardinality(analysisTypes) ? adapter.getDimensionCardinality(columnName) : 0,
+              null
+          );
           break;
         case COMPLEX:
-          analysis = new ColumnAnalysis(capType.name(), 0, null, null);
+          analysis = new ColumnAnalysis(
+              capType.name(),
+              0,
+              null,
+              null
+          );
           break;
         default:
           log.warn("Unknown column type[%s].", capType);
@@ -135,33 +155,39 @@
 
     columns.put(
         Column.TIME_COLUMN_NAME,
-        lengthBasedAnalysisForAdapter(ValueType.LONG.name(), null, numRows, NUM_BYTES_IN_TIMESTAMP)
+        lengthBasedAnalysisForAdapter(analysisTypes, ValueType.LONG.name(), null, numRows, NUM_BYTES_IN_TIMESTAMP)
     );
 
     return columns;
   }
 
-  public ColumnAnalysis analyzeLongColumn(Column column)
+
+  public ColumnAnalysis analyzeLongColumn(Column column, EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes)
   {
-    return lengthBasedAnalysis(column, Longs.BYTES);
+    return lengthBasedAnalysis(column, Longs.BYTES, analysisTypes);
   }
 
-  public ColumnAnalysis analyzeFloatColumn(Column column)
+  public ColumnAnalysis analyzeFloatColumn(Column column, EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes)
  {
-    return lengthBasedAnalysis(column, NUM_BYTES_IN_TEXT_FLOAT);
+    return lengthBasedAnalysis(column, NUM_BYTES_IN_TEXT_FLOAT, analysisTypes);
   }
 
-  private ColumnAnalysis lengthBasedAnalysis(Column column, final int numBytes)
+  private ColumnAnalysis lengthBasedAnalysis(Column column, final int numBytes, EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes)
   {
     final ColumnCapabilities capabilities = column.getCapabilities();
     if (capabilities.hasMultipleValues()) {
       return ColumnAnalysis.error("multi_value");
     }
 
-    return new ColumnAnalysis(capabilities.getType().name(), column.getLength() * numBytes, null, null);
+    int size = 0;
+    if (analysisHasSize(analysisTypes)) {
+      size = column.getLength() * numBytes;
+    }
+
+    return new ColumnAnalysis(capabilities.getType().name(), size, null, null);
   }
 
-  public ColumnAnalysis analyzeStringColumn(Column column)
+  public ColumnAnalysis analyzeStringColumn(Column column, EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes)
   {
     final ColumnCapabilities capabilities = column.getCapabilities();
 
@@ -170,21 +196,28 @@
       int cardinality = bitmapIndex.getCardinality();
       long size = 0;
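+      // The size estimate walks every distinct value and its bitmap, so the loop is skipped entirely when "size" was not requested.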
-      for (int i = 0; i < cardinality; ++i) {
-        String value = bitmapIndex.getValue(i);
-        if (value != null) {
-          size += StringUtils.toUtf8(value).length * bitmapIndex.getBitmap(value).size();
+      if (analysisHasSize(analysisTypes)) {
+        for (int i = 0; i < cardinality; ++i) {
+          String value = bitmapIndex.getValue(i);
+          if (value != null) {
+            size += StringUtils.toUtf8(value).length * bitmapIndex.getBitmap(value).size();
+          }
         }
       }
 
-      return new ColumnAnalysis(capabilities.getType().name(), size, cardinality, null);
+      return new ColumnAnalysis(
+          capabilities.getType().name(),
+          size,
+          analysisHasCardinality(analysisTypes) ? cardinality : 0,
+          null
+      );
     }
 
     return ColumnAnalysis.error("string_no_bitmap");
   }
 
-  public ColumnAnalysis analyzeComplexColumn(Column column)
+  public ColumnAnalysis analyzeComplexColumn(Column column, EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes)
   {
     final ColumnCapabilities capabilities = column.getCapabilities();
     final ComplexColumn complexColumn = column.getComplexColumn();
@@ -202,8 +235,10 @@
     final int length = column.getLength();
 
     long size = 0;
-    for (int i = 0; i < length; ++i) {
-      size += inputSizeFn.apply(complexColumn.getRowValue(i));
+    if (analysisHasSize(analysisTypes)) {
+      for (int i = 0; i < length; ++i) {
+        size += inputSizeFn.apply(complexColumn.getRowValue(i));
+      }
     }
 
     return new ColumnAnalysis(typeName, size, null, null);
@@ -220,6 +255,7 @@
   }
 
   private ColumnAnalysis lengthBasedAnalysisForAdapter(
+      EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes,
       String type, ColumnCapabilities capabilities,
       int numRows, final int numBytes
   )
@@ -227,7 +263,20 @@
     if (capabilities != null && capabilities.hasMultipleValues()) {
       return ColumnAnalysis.error("multi_value");
     }
-    return new ColumnAnalysis(type, numRows * numBytes, null, null);
+    return new ColumnAnalysis(
+        type,
+        analysisHasSize(analysisTypes) ? numRows * numBytes : 0,
+        null,
+        null
+    );
+  }
+
+  private boolean analysisHasSize(EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes) {
+    return analysisTypes.contains(SegmentMetadataQuery.AnalysisType.SIZE);
+  }
+
+  private boolean analysisHasCardinality(EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes) {
+    return analysisTypes.contains(SegmentMetadataQuery.AnalysisType.CARDINALITY);
   }
 }
diff --git a/processing/src/main/java/io/druid/query/metadata/SegmentMetadataQueryRunnerFactory.java b/processing/src/main/java/io/druid/query/metadata/SegmentMetadataQueryRunnerFactory.java
index 742b08d26c5..fa60a3344fb 100644
--- a/processing/src/main/java/io/druid/query/metadata/SegmentMetadataQueryRunnerFactory.java
+++ b/processing/src/main/java/io/druid/query/metadata/SegmentMetadataQueryRunnerFactory.java
@@ -42,6 +42,7 @@ import io.druid.query.metadata.metadata.SegmentAnalysis;
 import io.druid.query.metadata.metadata.SegmentMetadataQuery;
 import io.druid.segment.QueryableIndex;
 import io.druid.segment.Segment;
+import io.druid.segment.StorageAdapter;
 
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -82,15 +83,23 @@ public class SegmentMetadataQueryRunnerFactory implements QueryRunnerFactory<SegmentAnalysis, SegmentMetadataQuery>
         final Map<String, ColumnAnalysis> analyzedColumns;
+        final int numRows;
         long totalSize = 0;
+
         if (index == null) {
           // IncrementalIndexSegments (used by in-memory hydrants in the realtime service) do not have a QueryableIndex
-          analyzedColumns = analyzer.analyze(segment.asStorageAdapter());
+          StorageAdapter segmentAdapter = segment.asStorageAdapter();
+          analyzedColumns = analyzer.analyze(segmentAdapter, query.getAnalysisTypes());
+          numRows = segmentAdapter.getNumRows();
         } else {
-          analyzedColumns = analyzer.analyze(index);
+          analyzedColumns = analyzer.analyze(index, query.getAnalysisTypes());
+          numRows = index.getNumRows();
+        }
+
+        if (query.hasSize()) {
           // Initialize with the size of the whitespace, 1 byte per
-          totalSize = analyzedColumns.size() * index.getNumRows();
+          totalSize = analyzedColumns.size() * numRows;
         }
 
         Map<String, ColumnAnalysis> columns = Maps.newTreeMap();
diff --git a/processing/src/main/java/io/druid/query/metadata/metadata/SegmentMetadataQuery.java b/processing/src/main/java/io/druid/query/metadata/metadata/SegmentMetadataQuery.java
index 621e921bec2..8e40fc2b7d9 100644
--- a/processing/src/main/java/io/druid/query/metadata/metadata/SegmentMetadataQuery.java
+++ b/processing/src/main/java/io/druid/query/metadata/metadata/SegmentMetadataQuery.java
@@ -19,6 +19,7 @@ package io.druid.query.metadata.metadata;
 
 import com.fasterxml.jackson.annotation.JsonCreator;
 import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonValue;
 import com.google.common.base.Preconditions;
 import io.druid.common.utils.JodaUtils;
 import io.druid.query.BaseQuery;
@@ -30,17 +31,43 @@ import io.druid.query.spec.QuerySegmentSpec;
 import org.joda.time.Interval;
 
 import java.util.Arrays;
+import java.util.EnumSet;
 import java.util.Map;
 
 public class SegmentMetadataQuery extends BaseQuery<SegmentAnalysis>
 {
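+  // Serialized to and from lowercase JSON strings ("cardinality", "size") via @JsonValue/@JsonCreator below.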
+  public enum AnalysisType
+  {
+    CARDINALITY,
+    SIZE;
+
+    @JsonValue
+    @Override
+    public String toString() {
+      return this.name().toLowerCase();
+    }
+
+    @JsonCreator
+    public static AnalysisType fromString(String name) {
+      return valueOf(name.toUpperCase());
+    }
+  }
+
+
   public static final Interval DEFAULT_INTERVAL = new Interval(
       JodaUtils.MIN_INSTANT, JodaUtils.MAX_INSTANT
   );
 
+  public static final EnumSet<AnalysisType> DEFAULT_ANALYSIS_TYPES = EnumSet.of(
+      AnalysisType.CARDINALITY,
+      AnalysisType.SIZE
+  );
+
   private final ColumnIncluderator toInclude;
   private final boolean merge;
   private final boolean usingDefaultInterval;
+  private final EnumSet<AnalysisType> analysisTypes;
 
   @JsonCreator
   public SegmentMetadataQuery(
@@ -49,6 +76,7 @@ public class SegmentMetadataQuery extends BaseQuery<SegmentAnalysis>
       @JsonProperty("toInclude") ColumnIncluderator toInclude,
       @JsonProperty("merge") Boolean merge,
       @JsonProperty("context") Map<String, Object> context,
+      @JsonProperty("analysisTypes") EnumSet<AnalysisType> analysisTypes,
       @JsonProperty("usingDefaultInterval") Boolean useDefaultInterval
   )
@@ -64,9 +92,9 @@ public class SegmentMetadataQuery extends BaseQuery<SegmentAnalysis>
     } else {
       this.usingDefaultInterval = useDefaultInterval == null ? false : useDefaultInterval;
     }
-
     this.toInclude = toInclude == null ? new AllColumnIncluderator() : toInclude;
     this.merge = merge == null ? false : merge;
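+    // A null analysisTypes (e.g. a query that omits the field) falls back to the default set.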
+    this.analysisTypes = (analysisTypes == null) ? DEFAULT_ANALYSIS_TYPES : analysisTypes;
     Preconditions.checkArgument(
         dataSource instanceof TableDataSource,
         "SegmentMetadataQuery only supports table datasource"
     );
@@ -103,6 +131,22 @@ public class SegmentMetadataQuery extends BaseQuery<SegmentAnalysis>
     return Query.SEGMENT_METADATA;
   }
 
+  @JsonProperty
+  public EnumSet<AnalysisType> getAnalysisTypes()
+  {
+    return analysisTypes;
+  }
+
+  public boolean hasCardinality()
+  {
+    return analysisTypes.contains(AnalysisType.CARDINALITY);
+  }
+
+  public boolean hasSize()
+  {
+    return analysisTypes.contains(AnalysisType.SIZE);
+  }
+
   @Override
   public Query<SegmentAnalysis> withOverriddenContext(Map<String, Object> contextOverride)
   {
@@ -112,6 +156,7 @@ public class SegmentMetadataQuery extends BaseQuery<SegmentAnalysis>
         toInclude,
         merge,
         computeOverridenContext(contextOverride),
+        analysisTypes,
         usingDefaultInterval
     );
   }
@@ -125,6 +170,7 @@ public class SegmentMetadataQuery extends BaseQuery<SegmentAnalysis>
         toInclude,
         merge,
         getContext(),
+        analysisTypes,
         usingDefaultInterval
     );
   }
@@ -138,6 +184,7 @@ public class SegmentMetadataQuery extends BaseQuery<SegmentAnalysis>
         toInclude,
         merge,
         getContext(),
+        analysisTypes,
         usingDefaultInterval
     );
   }
diff --git a/processing/src/test/java/io/druid/query/metadata/SegmentAnalyzerTest.java b/processing/src/test/java/io/druid/query/metadata/SegmentAnalyzerTest.java
index 4722be8d77e..3807bc688ed 100644
--- a/processing/src/test/java/io/druid/query/metadata/SegmentAnalyzerTest.java
+++ b/processing/src/test/java/io/druid/query/metadata/SegmentAnalyzerTest.java
@@ -17,9 +17,11 @@
 
 package io.druid.query.metadata;
 
+import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
 import com.metamx.common.guava.Sequences;
 import io.druid.query.LegacyDataSource;
+import io.druid.query.Query;
 import io.druid.query.QueryRunner;
 import io.druid.query.QueryRunnerFactory;
 import io.druid.query.QueryRunnerTestHelper;
@@ -32,9 +34,11 @@ import io.druid.segment.QueryableIndexSegment;
 import io.druid.segment.Segment;
 import io.druid.segment.TestIndex;
 import io.druid.segment.column.ValueType;
+import org.joda.time.Interval;
 import org.junit.Assert;
 import org.junit.Test;
 
+import java.util.EnumSet;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -43,11 +47,21 @@
  */
 public class SegmentAnalyzerTest
 {
+  private static final EnumSet<SegmentMetadataQuery.AnalysisType> emptyAnalyses =
+      EnumSet.noneOf(SegmentMetadataQuery.AnalysisType.class);
+
   @Test
   public void testIncrementalWorks() throws Exception
+  {
+    testIncrementalWorksHelper(null);
+    testIncrementalWorksHelper(emptyAnalyses);
+  }
+
+  private void testIncrementalWorksHelper(EnumSet<SegmentMetadataQuery.AnalysisType> analyses) throws Exception
   {
     final List<SegmentAnalysis> results = getSegmentAnalysises(
-        new IncrementalIndexSegment(TestIndex.getIncrementalTestIndex(false), null)
+        new IncrementalIndexSegment(TestIndex.getIncrementalTestIndex(false), null),
+        analyses
     );
 
     Assert.assertEquals(1, results.size());
@@ -61,28 +75,44 @@
         TestIndex.COLUMNS.length, columns.size()
     ); // All columns including time and empty/null column
-    
+
     for (String dimension : TestIndex.DIMENSIONS) {
       final ColumnAnalysis columnAnalysis = columns.get(dimension);
 
       Assert.assertEquals(dimension, ValueType.STRING.name(), columnAnalysis.getType());
-      Assert.assertTrue(dimension, columnAnalysis.getCardinality() > 0);
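+      // A null analyses argument exercises the query defaults (all analyses); the empty set should report zeroes.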
+      if (analyses == null) {
+        Assert.assertTrue(dimension, columnAnalysis.getCardinality() > 0);
+      } else {
+        Assert.assertEquals(dimension, 0, columnAnalysis.getCardinality().longValue());
+        Assert.assertEquals(dimension, 0, columnAnalysis.getSize());
+      }
     }
 
     for (String metric : TestIndex.METRICS) {
       final ColumnAnalysis columnAnalysis = columns.get(metric);
 
       Assert.assertEquals(metric, ValueType.FLOAT.name(), columnAnalysis.getType());
-      Assert.assertTrue(metric, columnAnalysis.getSize() > 0);
+      if (analyses == null) {
+        Assert.assertTrue(metric, columnAnalysis.getSize() > 0);
+      } else {
+        Assert.assertEquals(metric, 0, columnAnalysis.getSize());
+      }
       Assert.assertNull(metric, columnAnalysis.getCardinality());
     }
   }
 
   @Test
   public void testMappedWorks() throws Exception
+  {
+    testMappedWorksHelper(null);
+    testMappedWorksHelper(emptyAnalyses);
+  }
+
+  private void testMappedWorksHelper(EnumSet<SegmentMetadataQuery.AnalysisType> analyses) throws Exception
   {
     final List<SegmentAnalysis> results = getSegmentAnalysises(
-        new QueryableIndexSegment("test_1", TestIndex.getMMappedTestIndex())
+        new QueryableIndexSegment("test_1", TestIndex.getMMappedTestIndex()),
+        analyses
     );
 
     Assert.assertEquals(1, results.size());
@@ -102,8 +132,13 @@
         Assert.assertNull(columnAnalysis);
       } else {
         Assert.assertEquals(dimension, ValueType.STRING.name(), columnAnalysis.getType());
-        Assert.assertTrue(dimension, columnAnalysis.getSize() > 0);
-        Assert.assertTrue(dimension, columnAnalysis.getCardinality() > 0);
+        if (analyses == null) {
+          Assert.assertTrue(dimension, columnAnalysis.getSize() > 0);
+          Assert.assertTrue(dimension, columnAnalysis.getCardinality() > 0);
+        } else {
+          Assert.assertEquals(dimension, 0, columnAnalysis.getCardinality().longValue());
+          Assert.assertEquals(dimension, 0, columnAnalysis.getSize());
+        }
       }
     }
 
@@ -111,7 +146,11 @@
       final ColumnAnalysis columnAnalysis = columns.get(metric);
 
       Assert.assertEquals(metric, ValueType.FLOAT.name(), columnAnalysis.getType());
-      Assert.assertTrue(metric, columnAnalysis.getSize() > 0);
+      if (analyses == null) {
+        Assert.assertTrue(metric, columnAnalysis.getSize() > 0);
+      } else {
+        Assert.assertEquals(metric, 0, columnAnalysis.getSize());
+      }
       Assert.assertNull(metric, columnAnalysis.getCardinality());
     }
   }
@@ -123,7 +162,7 @@
    *
    *
    * @return
    */
-  private List<SegmentAnalysis> getSegmentAnalysises(Segment index)
+  private List<SegmentAnalysis> getSegmentAnalysises(Segment index, EnumSet<SegmentMetadataQuery.AnalysisType> analyses)
   {
     final QueryRunner runner = QueryRunnerTestHelper.makeQueryRunner(
         (QueryRunnerFactory) new SegmentMetadataQueryRunnerFactory(
     );
 
     final SegmentMetadataQuery query = new SegmentMetadataQuery(
-        new LegacyDataSource("test"), QuerySegmentSpecs.create("2011/2012"), null, null, null, false
+        new LegacyDataSource("test"), QuerySegmentSpecs.create("2011/2012"), null, null, null, analyses, false
     );
     HashMap<String, Object> context = new HashMap<String, Object>();
     return Sequences.toList(query.run(runner, context), Lists.<SegmentAnalysis>newArrayList());
diff --git a/processing/src/test/java/io/druid/query/metadata/SegmentMetadataQueryTest.java b/processing/src/test/java/io/druid/query/metadata/SegmentMetadataQueryTest.java
index 8125a5dd0ab..535793c5dcc 100644
--- a/processing/src/test/java/io/druid/query/metadata/SegmentMetadataQueryTest.java
+++ b/processing/src/test/java/io/druid/query/metadata/SegmentMetadataQueryTest.java
@@ -50,6 +50,7 @@ import org.junit.Assert;
 import org.junit.Test;
 
 import java.util.Arrays;
+import java.util.EnumSet;
 import java.util.List;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
@@ -164,12 +165,20 @@ public class SegmentMetadataQueryTest
     String queryStr = "{\n"
                       + "  \"queryType\":\"segmentMetadata\",\n"
                       + "  \"dataSource\":\"test_ds\",\n"
\"intervals\":[\"2013-12-04T00:00:00.000Z/2013-12-05T00:00:00.000Z\"]\n" + + " \"intervals\":[\"2013-12-04T00:00:00.000Z/2013-12-05T00:00:00.000Z\"],\n" + + " \"analysisTypes\":[\"cardinality\",\"size\"]\n" + "}"; + + EnumSet expectedAnalysisTypes = EnumSet.of( + SegmentMetadataQuery.AnalysisType.CARDINALITY, + SegmentMetadataQuery.AnalysisType.SIZE + ); + Query query = mapper.readValue(queryStr, Query.class); Assert.assertTrue(query instanceof SegmentMetadataQuery); Assert.assertEquals("test_ds", Iterables.getOnlyElement(query.getDataSource().getNames())); Assert.assertEquals(new Interval("2013-12-04T00:00:00.000Z/2013-12-05T00:00:00.000Z"), query.getIntervals().get(0)); + Assert.assertEquals(expectedAnalysisTypes, ((SegmentMetadataQuery) query).getAnalysisTypes()); // test serialize and deserialize Assert.assertEquals(query, mapper.readValue(mapper.writeValueAsString(query), Query.class));