From b8a4f4ea7b85e96248cfed6f041826697a8c8b0b Mon Sep 17 00:00:00 2001
From: Gian Merlino
Date: Wed, 6 Jul 2016 12:42:50 -0700
Subject: [PATCH] DumpSegment: Add --dump bitmaps option. (#3221)

Also make --dump metadata respect --column.
---
 docs/content/operations/dump-segment.md      |  53 ++++-
 .../main/java/io/druid/cli/DumpSegment.java  | 198 +++++++++++++++---
 2 files changed, 208 insertions(+), 43 deletions(-)

diff --git a/docs/content/operations/dump-segment.md b/docs/content/operations/dump-segment.md
index 869078fd424..443ed2f436b 100644
--- a/docs/content/operations/dump-segment.md
+++ b/docs/content/operations/dump-segment.md
@@ -4,8 +4,8 @@ layout: doc_page
 
 # DumpSegment tool
 
 The DumpSegment tool can be used to dump the metadata or contents of a segment for debugging purposes. Note that the
-dump is not necessarily a full-fidelity translation of the segment. In particular, not all metadata is included, indexes
-are not included, and complex metric values may not be complete.
+dump is not necessarily a full-fidelity translation of the segment. In particular, not all metadata is included, and
+complex metric values may not be complete.
 
 To run the tool, point it at a segment directory and provide a file for writing output:
@@ -17,8 +17,13 @@ java io.druid.cli.Main tools dump-segment \
 
 ### Output format
 
-Data dumps generated by this tool are newline-separate JSON objects, with one object per line. For example, one line
-might look like this when pretty-printed:
+#### Data dumps
+
+By default, or with `--dump rows`, this tool dumps rows of the segment as newline-separated JSON objects, with one
+object per line, using the default serialization for each column. Normally all columns are included, but if you like,
+you can limit the dump to specific columns with `--column name`.
+
+For example, one line might look like this when pretty-printed:
 
 ```
 {
@@ -47,16 +52,44 @@ might look like this when pretty-printed:
 }
 ```
 
-Metadata dumps generated by this tool are in the same format as returned by the
-[SegmentMetadata query](../querying/segmentmetadataquery.html).
+#### Metadata dumps
+
+With `--dump metadata`, this tool dumps metadata instead of rows. Metadata dumps generated by this tool are in the same
+format as returned by the [SegmentMetadata query](../querying/segmentmetadataquery.html).
+
+#### Bitmap dumps
+
+With `--dump bitmaps`, this tool dumps bitmap indexes instead of rows. Bitmap dumps generated by this tool include
+dictionary-encoded string columns only. The output contains a field "bitmapSerdeFactory" describing the type of bitmaps
+used in the segment, and a field "bitmaps" containing the bitmaps for each value of each column. These are
+base64-encoded by default, but you can also dump them as lists of row numbers with `--decompress-bitmaps`.
+
+Normally all columns are included, but if you like, you can limit the dump to specific columns with `--column name`.
+
+Sample output:
+
+```
+{
+  "bitmapSerdeFactory": {
+    "type": "concise"
+  },
+  "bitmaps": {
+    "isRobot": {
+      "false": "//aExfu+Nv3X...",
+      "true": "gAl7OoRByQ..."
+    }
+  }
+}
+```
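+
+A dump like the one above might be produced by an invocation like the following (the paths and the column name are
+illustrative):
+
+```
+java io.druid.cli.Main tools dump-segment \
+  --directory /home/druid/path/to/segment/ \
+  --out /home/druid/output.txt \
+  --dump bitmaps \
+  --column isRobot
+```
+
+Adding `--decompress-bitmaps` to the same invocation would dump lists of row numbers instead of base64 strings.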
 
 ### Command line arguments
 
 |argument|description|required?|
 |--------|-----------|---------|
 |--directory file|Directory containing segment data. This could be generated by unzipping an "index.zip" from deep storage.|yes|
-|--output file|File where output will be written.|yes|
-|--filter json|JSON-encoded [query filter](../querying/filters.html). Omit to include all rows.|no|
+|--out file|File to write to, or omit to write to stdout.|no|
+|--dump TYPE|Dump either 'rows' (default), 'metadata', or 'bitmaps'.|no|
 |--column columnName|Column to include. Specify multiple times for multiple columns, or omit to include all columns.|no|
-|--time-iso8601|Dump __time column in ISO8601 format rather than long.|no|
-|--metadata|Dump metadata instead of actual rows, will ignore --filter and --column selections.|no|
+|--filter json|JSON-encoded [query filter](../querying/filters.html). Omit to include all rows. Only used if dumping rows.|no|
+|--time-iso8601|Format __time column in ISO8601 format rather than long. Only used if dumping rows.|no|
+|--decompress-bitmaps|Dump bitmaps as arrays rather than base64-encoded compressed bitmaps. Only used if dumping bitmaps.|no|
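+
+For example, since `--dump metadata` now respects `--column`, metadata for specific columns can be dumped with an
+invocation like this (the paths and column names are illustrative):
+
+```
+java io.druid.cli.Main tools dump-segment \
+  --directory /home/druid/path/to/segment/ \
+  --out /home/druid/output.txt \
+  --dump metadata \
+  --column isRobot \
+  --column channel
+```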
diff --git a/services/src/main/java/io/druid/cli/DumpSegment.java b/services/src/main/java/io/druid/cli/DumpSegment.java
index 21192a031b5..e88ebf0e477 100644
--- a/services/src/main/java/io/druid/cli/DumpSegment.java
+++ b/services/src/main/java/io/druid/cli/DumpSegment.java
@@ -22,17 +22,25 @@ package io.druid.cli;
 import com.fasterxml.jackson.core.JsonGenerator;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.base.Function;
+import com.google.common.base.Strings;
 import com.google.common.base.Throwables;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
 import com.google.common.util.concurrent.MoreExecutors;
 import com.google.inject.Binder;
 import com.google.inject.Injector;
 import com.google.inject.Key;
 import com.google.inject.Module;
 import com.google.inject.name.Names;
+import com.metamx.collections.bitmap.BitmapFactory;
+import com.metamx.collections.bitmap.ConciseBitmapFactory;
+import com.metamx.collections.bitmap.ImmutableBitmap;
+import com.metamx.collections.bitmap.RoaringBitmapFactory;
+import com.metamx.common.IAE;
+import com.metamx.common.ISE;
 import com.metamx.common.guava.Accumulator;
 import com.metamx.common.guava.Sequence;
 import com.metamx.common.guava.Sequences;
@@ -50,6 +58,7 @@ import io.druid.query.SegmentDescriptor;
 import io.druid.query.TableDataSource;
 import io.druid.query.dimension.DefaultDimensionSpec;
 import io.druid.query.filter.DimFilter;
+import io.druid.query.metadata.metadata.ListColumnIncluderator;
 import io.druid.query.metadata.metadata.SegmentAnalysis;
 import io.druid.query.metadata.metadata.SegmentMetadataQuery;
 import io.druid.query.spec.SpecificSegmentSpec;
@@ -61,13 +70,18 @@ import io.druid.segment.ObjectColumnSelector;
 import io.druid.segment.QueryableIndex;
 import io.druid.segment.QueryableIndexSegment;
 import io.druid.segment.QueryableIndexStorageAdapter;
+import io.druid.segment.column.BitmapIndex;
 import io.druid.segment.column.Column;
 import io.druid.segment.column.ColumnConfig;
+import io.druid.segment.data.BitmapSerdeFactory;
+import io.druid.segment.data.ConciseBitmapSerdeFactory;
 import io.druid.segment.data.IndexedInts;
+import io.druid.segment.data.RoaringBitmapSerdeFactory;
 import io.druid.segment.filter.Filters;
 import org.joda.time.DateTime;
 import org.joda.time.DateTimeZone;
 import org.joda.time.chrono.ISOChronology;
+import org.roaringbitmap.IntIterator;
 
 import java.io.File;
 import java.io.FileOutputStream;
@@ -76,6 +90,7 @@ import java.io.OutputStream;
 import java.util.EnumSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
 @Command(
     name = "dump-segment",
@@ -85,6 +100,13 @@ public class DumpSegment extends GuiceRunnable
 {
   private static final Logger log = new Logger(DumpSegment.class);
 
+  private enum DumpType
+  {
+    ROWS,
+    METADATA,
+    BITMAPS
+  }
+
   public DumpSegment()
   {
     super(log);
@@ -93,54 +115,77 @@ public class DumpSegment extends GuiceRunnable
   @Option(
       name = {"-d", "--directory"},
       title = "directory",
-      description = "Directory containing segment data",
+      description = "Directory containing segment data.",
       required = true)
   public String directory;
 
   @Option(
       name = {"-o", "--out"},
       title = "file",
-      description = "File to write to, or omit to write to stdout",
+      description = "File to write to, or omit to write to stdout.",
       required = false)
   public String outputFileName;
 
   @Option(
       name = {"--filter"},
       title = "json",
-      description = "Filter, JSON encoded, or omit to include all rows",
+      description = "Filter, JSON encoded, or omit to include all rows. Only used if dumping rows.",
       required = false)
   public String filterJson = null;
 
   @Option(
       name = {"-c", "--column"},
       title = "column",
-      description = "Column to include, specify multiple times for multiple columns, or omit to include all columns",
+      description = "Column to include, specify multiple times for multiple columns, or omit to include all columns.",
       required = false)
-  public List<String> columnNames = Lists.newArrayList();
+  public List<String> columnNamesFromCli = Lists.newArrayList();
 
   @Option(
       name = "--time-iso8601",
-      title = "Dump __time column in ISO8601 format rather than long",
+      title = "Format __time column in ISO8601 format rather than long. Only used if dumping rows.",
       required = false)
   public boolean timeISO8601 = false;
 
   @Option(
-      name = "--metadata",
-      title = "Dump metadata instead of actual rows, will ignore --filter and --column selections",
+      name = "--dump",
+      title = "type",
+      description = "Dump either 'rows' (default), 'metadata', or 'bitmaps'",
       required = false)
-  public boolean metadata = false;
+  public String dumpTypeString = DumpType.ROWS.toString();
+
+  @Option(
+      name = "--decompress-bitmaps",
+      title = "Dump bitmaps as arrays rather than base64-encoded compressed bitmaps. Only used if dumping bitmaps.",
+      required = false)
+  public boolean decompressBitmaps = false;
 
   @Override
   public void run()
   {
     final Injector injector = makeInjector();
     final IndexIO indexIO = injector.getInstance(IndexIO.class);
+    final DumpType dumpType;
+
+    try {
+      dumpType = DumpType.valueOf(dumpTypeString.toUpperCase());
+    }
+    catch (Exception e) {
+      throw new IAE("Not a valid dump type: %s", dumpTypeString);
+    }
 
     try (final QueryableIndex index = indexIO.loadIndex(new File(directory))) {
-      if (metadata) {
-        runMetadata(injector, index);
-      } else {
-        runDump(injector, index);
+      switch (dumpType) {
+        case ROWS:
+          runDump(injector, index);
+          break;
+        case METADATA:
+          runMetadata(injector, index);
+          break;
+        case BITMAPS:
+          runBitmaps(injector, index);
+          break;
+        default:
+          throw new ISE("WTF?! dumpType[%s] has no handler?", dumpType);
       }
     }
     catch (Exception e) {
@@ -150,11 +195,13 @@ public class DumpSegment extends GuiceRunnable
 
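+  // Dumps metadata in the same format as a SegmentMetadata query. Column selections from --column are honored
+  // here too, via the ListColumnIncluderator passed to the query below.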
   private void runMetadata(final Injector injector, final QueryableIndex index) throws IOException
   {
-    final ObjectMapper objectMapper = injector.getInstance(Key.get(ObjectMapper.class, Json.class));
+    final ObjectMapper objectMapper = injector.getInstance(Key.get(ObjectMapper.class, Json.class))
+                                              .copy()
+                                              .configure(JsonGenerator.Feature.AUTO_CLOSE_TARGET, false);
     final SegmentMetadataQuery query = new SegmentMetadataQuery(
         new TableDataSource("dataSource"),
         new SpecificSegmentSpec(new SegmentDescriptor(index.getDataInterval(), "0", 0)),
-        null,
+        new ListColumnIncluderator(getColumnsToInclude(index)),
         false,
         null,
         EnumSet.allOf(SegmentMetadataQuery.AnalysisType.class),
@@ -176,9 +223,7 @@ public class DumpSegment extends GuiceRunnable
           public Object apply(SegmentAnalysis analysis)
           {
             try {
-              objectMapper.copy()
-                          .configure(JsonGenerator.Feature.AUTO_CLOSE_TARGET, false)
-                          .writeValue(out, analysis);
+              objectMapper.writeValue(out, analysis);
             }
             catch (IOException e) {
               throw Throwables.propagate(e);
@@ -199,20 +244,7 @@ public class DumpSegment extends GuiceRunnable
   {
     final ObjectMapper objectMapper = injector.getInstance(Key.get(ObjectMapper.class, Json.class));
     final QueryableIndexStorageAdapter adapter = new QueryableIndexStorageAdapter(index);
-
-    // Empty columnNames => include all columns
-    if (columnNames.isEmpty()) {
-      columnNames.add(Column.TIME_COLUMN_NAME);
-      Iterables.addAll(columnNames, index.getColumnNames());
-    } else {
-      // Remove any provided columnNames that do not exist in this segment
-      for (String columnName : ImmutableList.copyOf(columnNames)) {
-        if (index.getColumn(columnName) == null) {
-          columnNames.remove(columnName);
-        }
-      }
-    }
-
+    final List<String> columnNames = getColumnsToInclude(index);
     final DimFilter filter = filterJson != null ? objectMapper.readValue(filterJson, DimFilter.class) : null;
 
     final Sequence<Cursor> cursors = adapter.makeCursors(
@@ -279,6 +311,101 @@ public class DumpSegment extends GuiceRunnable
     );
   }
 
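+  // Dumps bitmap indexes for the selected columns. For each dictionary value of each column, writes either a
+  // base64-encoded serialized bitmap or, when --decompress-bitmaps is set, a JSON array of matching row numbers.
+  // Columns without bitmap indexes (e.g. metrics) are written as null.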
+  private void runBitmaps(final Injector injector, final QueryableIndex index) throws IOException
+  {
+    final ObjectMapper objectMapper = injector.getInstance(Key.get(ObjectMapper.class, Json.class));
+    final BitmapFactory bitmapFactory = index.getBitmapFactoryForDimensions();
+    final BitmapSerdeFactory bitmapSerdeFactory;
+
+    if (bitmapFactory instanceof ConciseBitmapFactory) {
+      bitmapSerdeFactory = new ConciseBitmapSerdeFactory();
+    } else if (bitmapFactory instanceof RoaringBitmapFactory) {
+      bitmapSerdeFactory = new RoaringBitmapSerdeFactory();
+    } else {
+      throw new ISE(
+          "Don't know which BitmapSerdeFactory to use for BitmapFactory[%s]!",
+          bitmapFactory.getClass().getName()
+      );
+    }
+
+    final List<String> columnNames = getColumnsToInclude(index);
+
+    withOutputStream(
+        new Function<OutputStream, Object>()
+        {
+          @Override
+          public Object apply(final OutputStream out)
+          {
+            try {
+              final JsonGenerator jg = objectMapper.getFactory().createGenerator(out);
+
+              jg.writeStartObject();
+              jg.writeObjectField("bitmapSerdeFactory", bitmapSerdeFactory);
+              jg.writeFieldName("bitmaps");
+              jg.writeStartObject();
+
+              for (final String columnName : columnNames) {
+                final Column column = index.getColumn(columnName);
+                final BitmapIndex bitmapIndex = column.getBitmapIndex();
+
+                if (bitmapIndex == null) {
+                  jg.writeNullField(columnName);
+                } else {
+                  jg.writeFieldName(columnName);
+                  jg.writeStartObject();
+                  for (int i = 0; i < bitmapIndex.getCardinality(); i++) {
+                    jg.writeFieldName(Strings.nullToEmpty(bitmapIndex.getValue(i)));
+                    final ImmutableBitmap bitmap = bitmapIndex.getBitmap(i);
+                    if (decompressBitmaps) {
+                      jg.writeStartArray();
+                      final IntIterator iterator = bitmap.iterator();
+                      while (iterator.hasNext()) {
+                        final int rowNum = iterator.next();
+                        jg.writeNumber(rowNum);
+                      }
+                      jg.writeEndArray();
+                    } else {
+                      jg.writeBinary(bitmapSerdeFactory.getObjectStrategy().toBytes(bitmap));
+                    }
+                  }
+                  jg.writeEndObject();
+                }
+              }
+
+              jg.writeEndObject();
+              jg.writeEndObject();
+              jg.close();
+            }
+            catch (IOException e) {
+              throw Throwables.propagate(e);
+            }
+
+            return null;
+          }
+        }
+    );
+  }
+
+  private List<String> getColumnsToInclude(final QueryableIndex index)
+  {
+    final Set<String> columnNames = Sets.newLinkedHashSet(columnNamesFromCli);
+
+    // Empty columnNames => include all columns.
+    if (columnNames.isEmpty()) {
+      columnNames.add(Column.TIME_COLUMN_NAME);
+      Iterables.addAll(columnNames, index.getColumnNames());
+    } else {
+      // Remove any provided columns that do not exist in this segment.
+      for (String columnName : ImmutableList.copyOf(columnNames)) {
+        if (index.getColumn(columnName) == null) {
+          columnNames.remove(columnName);
+        }
+      }
+    }
+
+    return ImmutableList.copyOf(columnNames);
+  }
+
   private <T> T withOutputStream(Function<OutputStream, T> f) throws IOException
   {
     if (outputFileName == null) {
@@ -321,6 +448,12 @@ public class DumpSegment extends GuiceRunnable
             {
               return 1;
             }
+
+            @Override
+            public int columnCacheSizeBytes()
+            {
+              return 25 * 1024 * 1024;
+            }
           }
       );
       binder.bind(ColumnConfig.class).to(DruidProcessingConfig.class);
@@ -329,7 +462,6 @@ public class DumpSegment extends GuiceRunnable
     );
   }
 
-
   private static <T> Sequence<T> executeQuery(final Injector injector, final QueryableIndex index, final Query<T> query)
   {
     final QueryRunnerFactoryConglomerate conglomerate = injector.getInstance(QueryRunnerFactoryConglomerate.class);