From b8a4f4ea7b85e96248cfed6f041826697a8c8b0b Mon Sep 17 00:00:00 2001
From: Gian Merlino
Date: Wed, 6 Jul 2016 12:42:50 -0700
Subject: [PATCH] DumpSegment: Add --dump bitmaps option. (#3221)

Also make --dump metadata respect --column.
---
 docs/content/operations/dump-segment.md      |  53 ++++-
 .../main/java/io/druid/cli/DumpSegment.java  | 198 +++++++++++++++---
 2 files changed, 208 insertions(+), 43 deletions(-)

diff --git a/docs/content/operations/dump-segment.md b/docs/content/operations/dump-segment.md
index 869078fd424..443ed2f436b 100644
--- a/docs/content/operations/dump-segment.md
+++ b/docs/content/operations/dump-segment.md
@@ -4,8 +4,8 @@ layout: doc_page
 
 # DumpSegment tool
 
 The DumpSegment tool can be used to dump the metadata or contents of a segment for debugging purposes. Note that the
-dump is not necessarily a full-fidelity translation of the segment. In particular, not all metadata is included, indexes
-are not included, and complex metric values may not be complete.
+dump is not necessarily a full-fidelity translation of the segment. In particular, not all metadata is included, and
+complex metric values may not be complete.
 
 To run the tool, point it at a segment directory and provide a file for writing output:
@@ -17,8 +17,13 @@ java io.druid.cli.Main tools dump-segment \
 
 ### Output format
 
-Data dumps generated by this tool are newline-separate JSON objects, with one object per line. For example, one line
-might look like this when pretty-printed:
+#### Data dumps
+
+By default, or with `--dump rows`, this tool dumps rows of the segment as newline-separated JSON objects, with one
+object per line, using the default serialization for each column. Normally all columns are included, but if you like,
+you can limit the dump to specific columns with `--column name`.
+
+For example, one line might look like this when pretty-printed:
 
 ```
 {
@@ -47,16 +52,44 @@ might look like this when pretty-printed:
 }
 ```
 
-Metadata dumps generated by this tool are in the same format as returned by the
-[SegmentMetadata query](../querying/segmentmetadataquery.html).
+#### Metadata dumps
+
+With `--dump metadata`, this tool dumps metadata instead of rows. Metadata dumps generated by this tool are in the same
+format as returned by the [SegmentMetadata query](../querying/segmentmetadataquery.html).
+
+#### Bitmap dumps
+
+With `--dump bitmaps`, this tool dumps bitmap indexes instead of rows. Bitmap dumps generated by this tool include
+dictionary-encoded string columns only. The output contains a field "bitmapSerdeFactory" describing the type of bitmaps
+used in the segment, and a field "bitmaps" containing the bitmaps for each value of each column. These are
+base64-encoded by default, but you can also dump them as lists of row numbers with `--decompress-bitmaps`.
+
+Normally all columns are included, but if you like, you can limit the dump to specific columns with `--column name`.
+
+Sample output:
+
+```
+{
+  "bitmapSerdeFactory": {
+    "type": "concise"
+  },
+  "bitmaps": {
+    "isRobot": {
+      "false": "//aExfu+Nv3X...",
+      "true": "gAl7OoRByQ..."
+    }
+  }
+}
+```
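+
+A dump like the one above might be produced by an invocation like the following (the paths and the column name are
+illustrative):
+
+```
+java io.druid.cli.Main tools dump-segment \
+  --directory /home/druid/path/to/segment/ \
+  --out /home/druid/output.txt \
+  --dump bitmaps \
+  --column isRobot
+```
+
+Adding `--decompress-bitmaps` to the same invocation would dump lists of row numbers instead of base64 strings.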
 
 ### Command line arguments
 
 |argument|description|required?|
 |--------|-----------|---------|
 |--directory file|Directory containing segment data. This could be generated by unzipping an "index.zip" from deep storage.|yes|
-|--output file|File where output will be written.|yes|
-|--filter json|JSON-encoded [query filter](../querying/filters.html). Omit to include all rows.|no|
+|--out file|File to write to, or omit to write to stdout.|no|
+|--dump TYPE|Dump either 'rows' (default), 'metadata', or 'bitmaps'.|no|
 |--column columnName|Column to include. Specify multiple times for multiple columns, or omit to include all columns.|no|
-|--time-iso8601|Dump __time column in ISO8601 format rather than long.|no|
-|--metadata|Dump metadata instead of actual rows, will ignore --filter and --column selections.|no|
+|--filter json|JSON-encoded [query filter](../querying/filters.html). Omit to include all rows. Only used if dumping rows.|no|
+|--time-iso8601|Format __time column in ISO8601 format rather than long. Only used if dumping rows.|no|
+|--decompress-bitmaps|Dump bitmaps as arrays rather than base64-encoded compressed bitmaps. Only used if dumping bitmaps.|no|
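+
+For example, since `--dump metadata` now respects `--column`, metadata for specific columns can be dumped with an
+invocation like this (the paths and column names are illustrative):
+
+```
+java io.druid.cli.Main tools dump-segment \
+  --directory /home/druid/path/to/segment/ \
+  --out /home/druid/output.txt \
+  --dump metadata \
+  --column isRobot \
+  --column channel
+```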
diff --git a/services/src/main/java/io/druid/cli/DumpSegment.java b/services/src/main/java/io/druid/cli/DumpSegment.java
index 21192a031b5..e88ebf0e477 100644
--- a/services/src/main/java/io/druid/cli/DumpSegment.java
+++ b/services/src/main/java/io/druid/cli/DumpSegment.java
@@ -22,17 +22,25 @@ package io.druid.cli;
 import com.fasterxml.jackson.core.JsonGenerator;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.base.Function;
+import com.google.common.base.Strings;
 import com.google.common.base.Throwables;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
 import com.google.common.util.concurrent.MoreExecutors;
 import com.google.inject.Binder;
 import com.google.inject.Injector;
 import com.google.inject.Key;
 import com.google.inject.Module;
 import com.google.inject.name.Names;
+import com.metamx.collections.bitmap.BitmapFactory;
+import com.metamx.collections.bitmap.ConciseBitmapFactory;
+import com.metamx.collections.bitmap.ImmutableBitmap;
+import com.metamx.collections.bitmap.RoaringBitmapFactory;
+import com.metamx.common.IAE;
+import com.metamx.common.ISE;
 import com.metamx.common.guava.Accumulator;
 import com.metamx.common.guava.Sequence;
 import com.metamx.common.guava.Sequences;
@@ -50,6 +58,7 @@ import io.druid.query.SegmentDescriptor;
 import io.druid.query.TableDataSource;
 import io.druid.query.dimension.DefaultDimensionSpec;
 import io.druid.query.filter.DimFilter;
+import io.druid.query.metadata.metadata.ListColumnIncluderator;
 import io.druid.query.metadata.metadata.SegmentAnalysis;
 import io.druid.query.metadata.metadata.SegmentMetadataQuery;
 import io.druid.query.spec.SpecificSegmentSpec;
@@ -61,13 +70,18 @@ import io.druid.segment.ObjectColumnSelector;
 import io.druid.segment.QueryableIndex;
 import io.druid.segment.QueryableIndexSegment;
 import io.druid.segment.QueryableIndexStorageAdapter;
+import io.druid.segment.column.BitmapIndex;
 import io.druid.segment.column.Column;
 import io.druid.segment.column.ColumnConfig;
+import io.druid.segment.data.BitmapSerdeFactory;
+import io.druid.segment.data.ConciseBitmapSerdeFactory;
 import io.druid.segment.data.IndexedInts;
+import io.druid.segment.data.RoaringBitmapSerdeFactory;
 import io.druid.segment.filter.Filters;
 import org.joda.time.DateTime;
 import org.joda.time.DateTimeZone;
 import org.joda.time.chrono.ISOChronology;
+import org.roaringbitmap.IntIterator;
 
 import java.io.File;
 import java.io.FileOutputStream;
@@ -76,6 +90,7 @@ import java.io.OutputStream;
 import java.util.EnumSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
 @Command(
     name = "dump-segment",
@@ -85,6 +100,13 @@ public class DumpSegment extends GuiceRunnable
 {
   private static final Logger log = new Logger(DumpSegment.class);
 
+  private enum DumpType
+  {
+    ROWS,
+    METADATA,
+    BITMAPS
+  }
+
   public DumpSegment()
   {
     super(log);
@@ -93,54 +115,77 @@ public class DumpSegment extends GuiceRunnable
   @Option(
       name = {"-d", "--directory"},
       title = "directory",
-      description = "Directory containing segment data",
+      description = "Directory containing segment data.",
       required = true)
   public String directory;
 
   @Option(
       name = {"-o", "--out"},
       title = "file",
-      description = "File to write to, or omit to write to stdout",
+      description = "File to write to, or omit to write to stdout.",
       required = false)
   public String outputFileName;
 
   @Option(
       name = {"--filter"},
       title = "json",
-      description = "Filter, JSON encoded, or omit to include all rows",
+      description = "Filter, JSON encoded, or omit to include all rows. Only used if dumping rows.",
       required = false)
   public String filterJson = null;
 
   @Option(
       name = {"-c", "--column"},
       title = "column",
-      description = "Column to include, specify multiple times for multiple columns, or omit to include all columns",
+      description = "Column to include, specify multiple times for multiple columns, or omit to include all columns.",
       required = false)
-  public List<String> columnNames = Lists.newArrayList();
+  public List<String> columnNamesFromCli = Lists.newArrayList();
 
   @Option(
       name = "--time-iso8601",
-      title = "Dump __time column in ISO8601 format rather than long",
+      title = "Format __time column in ISO8601 format rather than long. Only used if dumping rows.",
       required = false)
   public boolean timeISO8601 = false;
 
   @Option(
-      name = "--metadata",
-      title = "Dump metadata instead of actual rows, will ignore --filter and --column selections",
+      name = "--dump",
+      title = "type",
+      description = "Dump either 'rows' (default), 'metadata', or 'bitmaps'",
       required = false)
-  public boolean metadata = false;
+  public String dumpTypeString = DumpType.ROWS.toString();
+
+  @Option(
+      name = "--decompress-bitmaps",
+      title = "Dump bitmaps as arrays rather than base64-encoded compressed bitmaps. Only used if dumping bitmaps.",
+      required = false)
+  public boolean decompressBitmaps = false;
 
   @Override
   public void run()
   {
     final Injector injector = makeInjector();
     final IndexIO indexIO = injector.getInstance(IndexIO.class);
+    final DumpType dumpType;
+
+    try {
+      dumpType = DumpType.valueOf(dumpTypeString.toUpperCase());
+    }
+    catch (Exception e) {
+      throw new IAE("Not a valid dump type: %s", dumpTypeString);
+    }
 
     try (final QueryableIndex index = indexIO.loadIndex(new File(directory))) {
-      if (metadata) {
-        runMetadata(injector, index);
-      } else {
-        runDump(injector, index);
+      switch (dumpType) {
+        case ROWS:
+          runDump(injector, index);
+          break;
+        case METADATA:
+          runMetadata(injector, index);
+          break;
+        case BITMAPS:
+          runBitmaps(injector, index);
+          break;
+        default:
+          throw new ISE("WTF?! dumpType[%s] has no handler?", dumpType);
       }
     }
     catch (Exception e) {
@@ -150,11 +195,13 @@ public class DumpSegment extends GuiceRunnable
 
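+  // Dumps metadata in the same format as a SegmentMetadata query. Column selections from --column are honored
+  // here too, via the ListColumnIncluderator passed to the query below.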
   private void runMetadata(final Injector injector, final QueryableIndex index) throws IOException
   {
-    final ObjectMapper objectMapper = injector.getInstance(Key.get(ObjectMapper.class, Json.class));
+    final ObjectMapper objectMapper = injector.getInstance(Key.get(ObjectMapper.class, Json.class))
+                                              .copy()
+                                              .configure(JsonGenerator.Feature.AUTO_CLOSE_TARGET, false);
     final SegmentMetadataQuery query = new SegmentMetadataQuery(
         new TableDataSource("dataSource"),
         new SpecificSegmentSpec(new SegmentDescriptor(index.getDataInterval(), "0", 0)),
-        null,
+        new ListColumnIncluderator(getColumnsToInclude(index)),
         false,
         null,
         EnumSet.allOf(SegmentMetadataQuery.AnalysisType.class),
@@ -176,9 +223,7 @@ public class DumpSegment extends GuiceRunnable
           public Object apply(SegmentAnalysis analysis)
           {
             try {
-              objectMapper.copy()
-                          .configure(JsonGenerator.Feature.AUTO_CLOSE_TARGET, false)
-                          .writeValue(out, analysis);
+              objectMapper.writeValue(out, analysis);
             }
             catch (IOException e) {
               throw Throwables.propagate(e);
@@ -199,20 +244,7 @@ public class DumpSegment extends GuiceRunnable
   {
     final ObjectMapper objectMapper = injector.getInstance(Key.get(ObjectMapper.class, Json.class));
     final QueryableIndexStorageAdapter adapter = new QueryableIndexStorageAdapter(index);
-
-    // Empty columnNames => include all columns
-    if (columnNames.isEmpty()) {
-      columnNames.add(Column.TIME_COLUMN_NAME);
-      Iterables.addAll(columnNames, index.getColumnNames());
-    } else {
-      // Remove any provided columnNames that do not exist in this segment
-      for (String columnName : ImmutableList.copyOf(columnNames)) {
-        if (index.getColumn(columnName) == null) {
-          columnNames.remove(columnName);
-        }
-      }
-    }
-
+    final List<String> columnNames = getColumnsToInclude(index);
     final DimFilter filter = filterJson != null ? objectMapper.readValue(filterJson, DimFilter.class) : null;
 
     final Sequence<Cursor> cursors = adapter.makeCursors(
@@ -279,6 +311,101 @@ public class DumpSegment extends GuiceRunnable
     );
   }
 
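+  // Dumps bitmap indexes for the selected columns. For each dictionary value of each column, writes either a
+  // base64-encoded serialized bitmap or, when --decompress-bitmaps is set, a JSON array of matching row numbers.
+  // Columns without bitmap indexes (e.g. metrics) are written as null.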
+  private void runBitmaps(final Injector injector, final QueryableIndex index) throws IOException
+  {
+    final ObjectMapper objectMapper = injector.getInstance(Key.get(ObjectMapper.class, Json.class));
+    final BitmapFactory bitmapFactory = index.getBitmapFactoryForDimensions();
+    final BitmapSerdeFactory bitmapSerdeFactory;
+
+    if (bitmapFactory instanceof ConciseBitmapFactory) {
+      bitmapSerdeFactory = new ConciseBitmapSerdeFactory();
+    } else if (bitmapFactory instanceof RoaringBitmapFactory) {
+      bitmapSerdeFactory = new RoaringBitmapSerdeFactory();
+    } else {
+      throw new ISE(
+          "Don't know which BitmapSerdeFactory to use for BitmapFactory[%s]!",
+          bitmapFactory.getClass().getName()
+      );
+    }
+
+    final List<String> columnNames = getColumnsToInclude(index);
+
+    withOutputStream(
+        new Function<OutputStream, Object>()
+        {
+          @Override
+          public Object apply(final OutputStream out)
+          {
+            try {
+              final JsonGenerator jg = objectMapper.getFactory().createGenerator(out);
+
+              jg.writeStartObject();
+              jg.writeObjectField("bitmapSerdeFactory", bitmapSerdeFactory);
+              jg.writeFieldName("bitmaps");
+              jg.writeStartObject();
+
+              for (final String columnName : columnNames) {
+                final Column column = index.getColumn(columnName);
+                final BitmapIndex bitmapIndex = column.getBitmapIndex();
+
+                if (bitmapIndex == null) {
+                  jg.writeNullField(columnName);
+                } else {
+                  jg.writeFieldName(columnName);
+                  jg.writeStartObject();
+                  for (int i = 0; i < bitmapIndex.getCardinality(); i++) {
+                    jg.writeFieldName(Strings.nullToEmpty(bitmapIndex.getValue(i)));
+                    final ImmutableBitmap bitmap = bitmapIndex.getBitmap(i);
+                    if (decompressBitmaps) {
+                      jg.writeStartArray();
+                      final IntIterator iterator = bitmap.iterator();
+                      while (iterator.hasNext()) {
+                        final int rowNum = iterator.next();
+                        jg.writeNumber(rowNum);
+                      }
+                      jg.writeEndArray();
+                    } else {
+                      jg.writeBinary(bitmapSerdeFactory.getObjectStrategy().toBytes(bitmap));
+                    }
+                  }
+                  jg.writeEndObject();
+                }
+              }
+
+              jg.writeEndObject();
+              jg.writeEndObject();
+              jg.close();
+            }
+            catch (IOException e) {
+              throw Throwables.propagate(e);
+            }
+
+            return null;
+          }
+        }
+    );
+  }
+
+  private List<String> getColumnsToInclude(final QueryableIndex index)
+  {
+    final Set<String> columnNames = Sets.newLinkedHashSet(columnNamesFromCli);
+
+    // Empty columnNames => include all columns.
+    if (columnNames.isEmpty()) {
+      columnNames.add(Column.TIME_COLUMN_NAME);
+      Iterables.addAll(columnNames, index.getColumnNames());
+    } else {
+      // Remove any provided columns that do not exist in this segment.
+      for (String columnName : ImmutableList.copyOf(columnNames)) {
+        if (index.getColumn(columnName) == null) {
+          columnNames.remove(columnName);
+        }
+      }
+    }
+
+    return ImmutableList.copyOf(columnNames);
+  }
+
   private <T> T withOutputStream(Function<OutputStream, T> f) throws IOException
   {
     if (outputFileName == null) {
@@ -321,6 +448,12 @@ public class DumpSegment extends GuiceRunnable
             {
               return 1;
             }
+
+            @Override
+            public int columnCacheSizeBytes()
+            {
+              return 25 * 1024 * 1024;
+            }
           }
       );
       binder.bind(ColumnConfig.class).to(DruidProcessingConfig.class);
@@ -329,7 +462,6 @@ public class DumpSegment extends GuiceRunnable
     );
   }
 
-
   private static <T> Sequence<T> executeQuery(final Injector injector, final QueryableIndex index, final Query<T> query)
   {
     final QueryRunnerFactoryConglomerate conglomerate = injector.getInstance(QueryRunnerFactoryConglomerate.class);