From ea03906fcfb29a70fe7525b2d3114782f7da7903 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Thu, 7 Jul 2016 21:54:19 -0700 Subject: [PATCH] Configurable compressRunOnSerialization for Roaring bitmaps. (#3228) Defaults to true, which is a change in behavior (this used to be false and unconfigurable). --- .../druid/benchmark/BoundFilterBenchmark.java | 2 +- .../DimensionPredicateFilterBenchmark.java | 2 +- .../extensions-core/kafka-ingestion.md | 17 ++++++++++- docs/content/ingestion/batch-ingestion.md | 21 ++++++++++++-- docs/content/ingestion/stream-pull.md | 23 ++++++++++++--- docs/content/ingestion/tasks.md | 29 ++++++++++++++----- .../updater/HadoopConverterJobTest.java | 4 +-- .../indexing/common/task/TaskSerdeTest.java | 2 +- .../java/io/druid/segment/BitmapOffset.java | 6 ++-- .../data/RoaringBitmapSerdeFactory.java | 26 +++++++++++++++-- .../io/druid/segment/IndexMergerTest.java | 2 +- .../java/io/druid/segment/IndexSpecTest.java | 2 +- .../segment/data/BitmapSerdeFactoryTest.java | 16 ++++++++-- .../druid/segment/filter/BaseFilterTest.java | 2 +- .../filter/ExtractionDimFilterTest.java | 2 +- .../main/java/io/druid/cli/DumpSegment.java | 2 +- 16 files changed, 125 insertions(+), 33 deletions(-) diff --git a/benchmarks/src/main/java/io/druid/benchmark/BoundFilterBenchmark.java b/benchmarks/src/main/java/io/druid/benchmark/BoundFilterBenchmark.java index d28a8e8fd4b..e62d507133f 100644 --- a/benchmarks/src/main/java/io/druid/benchmark/BoundFilterBenchmark.java +++ b/benchmarks/src/main/java/io/druid/benchmark/BoundFilterBenchmark.java @@ -153,7 +153,7 @@ public class BoundFilterBenchmark { step = (END_INT - START_INT) / cardinality; final BitmapFactory bitmapFactory = new RoaringBitmapFactory(); - final BitmapSerdeFactory serdeFactory = new RoaringBitmapSerdeFactory(); + final BitmapSerdeFactory serdeFactory = new RoaringBitmapSerdeFactory(null); final List ints = generateInts(); final GenericIndexed dictionary = GenericIndexed.fromIterable( FluentIterable.from(ints) diff --git a/benchmarks/src/main/java/io/druid/benchmark/DimensionPredicateFilterBenchmark.java b/benchmarks/src/main/java/io/druid/benchmark/DimensionPredicateFilterBenchmark.java index 605f26877c0..cf21147fd65 100644 --- a/benchmarks/src/main/java/io/druid/benchmark/DimensionPredicateFilterBenchmark.java +++ b/benchmarks/src/main/java/io/druid/benchmark/DimensionPredicateFilterBenchmark.java @@ -89,7 +89,7 @@ public class DimensionPredicateFilterBenchmark public void setup() throws IOException { final BitmapFactory bitmapFactory = new RoaringBitmapFactory(); - final BitmapSerdeFactory serdeFactory = new RoaringBitmapSerdeFactory(); + final BitmapSerdeFactory serdeFactory = new RoaringBitmapSerdeFactory(null); final List ints = generateInts(); final GenericIndexed dictionary = GenericIndexed.fromIterable( FluentIterable.from(ints) diff --git a/docs/content/development/extensions-core/kafka-ingestion.md b/docs/content/development/extensions-core/kafka-ingestion.md index 30280938173..cea6d56c145 100644 --- a/docs/content/development/extensions-core/kafka-ingestion.md +++ b/docs/content/development/extensions-core/kafka-ingestion.md @@ -128,10 +128,25 @@ The tuningConfig is optional and default parameters will be used if no tuningCon |Field|Type|Description|Required| |-----|----|-----------|--------| -|`bitmap`|String|The type of bitmap index to create. Choose from `roaring` or `concise`.|no (default == `concise`)| +|`bitmap`|Object|Compression format for bitmap indexes. Should be a JSON object; see below for options.|no (defaults to Concise)| |`dimensionCompression`|String|Compression format for dimension columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)| |`metricCompression`|String|Compression format for metric columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)| +##### Bitmap types + +For Concise bitmaps: + +|Field|Type|Description|Required| +|-----|----|-----------|--------| +|`type`|String|Must be `concise`.|yes| + +For Roaring bitmaps: + +|Field|Type|Description|Required| +|-----|----|-----------|--------| +|`type`|String|Must be `roaring`.|yes| +|`compressRunOnSerialization`|Boolean|Use a run-length encoding where it is estimated as more space efficient.|no (default == `true`)| + ### KafkaSupervisorIOConfig |Field|Type|Description|Required| diff --git a/docs/content/ingestion/batch-ingestion.md b/docs/content/ingestion/batch-ingestion.md index 63cc2eb02dd..7bdb46f7a23 100644 --- a/docs/content/ingestion/batch-ingestion.md +++ b/docs/content/ingestion/batch-ingestion.md @@ -191,9 +191,24 @@ The following properties can be used to tune how the MapReduce job is configured |Field|Type|Description|Required| |-----|----|-----------|--------| -|bitmap|String|The type of bitmap index to create. Choose from `roaring` or `concise`, or null to use the default (`concise`).|No| -|dimensionCompression|String|Compression format for dimension columns. Choose from `LZ4`, `LZF`, or `uncompressed`. The default is `LZ4`.|No| -|metricCompression|String|Compression format for metric columns. Choose from `LZ4`, `LZF`, or `uncompressed`. The default is `LZ4`.|No| +|bitmap|Object|Compression format for bitmap indexes. Should be a JSON object; see below for options.|no (defaults to Concise)| +|dimensionCompression|String|Compression format for dimension columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)| +|metricCompression|String|Compression format for metric columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)| + +##### Bitmap types + +For Concise bitmaps: + +|Field|Type|Description|Required| +|-----|----|-----------|--------| +|type|String|Must be `concise`.|yes| + +For Roaring bitmaps: + +|Field|Type|Description|Required| +|-----|----|-----------|--------| +|type|String|Must be `roaring`.|yes| +|compressRunOnSerialization|Boolean|Use a run-length encoding where it is estimated as more space efficient.|no (default == `true`)| ### Partitioning specification diff --git a/docs/content/ingestion/stream-pull.md b/docs/content/ingestion/stream-pull.md index 66dc755ee31..6baf22f1d3b 100644 --- a/docs/content/ingestion/stream-pull.md +++ b/docs/content/ingestion/stream-pull.md @@ -167,13 +167,28 @@ The following policies are available: * `messageTime` – Can be used for non-"current time" as long as that data is relatively in sequence. Events are rejected if they are less than `windowPeriod` from the event with the latest timestamp. Hand off only occurs if an event is seen after the segmentGranularity and `windowPeriod` (hand off will not periodically occur unless you have a constant stream of data). * `none` – All events are accepted. Never hands off data unless shutdown() is called on the configured firehose. -### Index Spec +#### IndexSpec |Field|Type|Description|Required| |-----|----|-----------|--------| -|bitmap|String|The type of bitmap index to create. Choose from `roaring` or `concise`, or null to use the default (`concise`).|No| -|dimensionCompression|String|Compression format for dimension columns. Choose from `LZ4`, `LZF`, or `uncompressed`. The default is `LZ4`.|No| -|metricCompression|String|Compression format for metric columns. Choose from `LZ4`, `LZF`, or `uncompressed`. The default is `LZ4`.|No| +|bitmap|Object|Compression format for bitmap indexes. Should be a JSON object; see below for options.|no (defaults to Concise)| +|dimensionCompression|String|Compression format for dimension columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)| +|metricCompression|String|Compression format for metric columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)| + +##### Bitmap types + +For Concise bitmaps: + +|Field|Type|Description|Required| +|-----|----|-----------|--------| +|type|String|Must be `concise`.|yes| + +For Roaring bitmaps: + +|Field|Type|Description|Required| +|-----|----|-----------|--------| +|type|String|Must be `roaring`.|yes| +|compressRunOnSerialization|Boolean|Use a run-length encoding where it is estimated as more space efficient.|no (default == `true`)| #### Sharding diff --git a/docs/content/ingestion/tasks.md b/docs/content/ingestion/tasks.md index faf481fb84a..c86f3de4446 100644 --- a/docs/content/ingestion/tasks.md +++ b/docs/content/ingestion/tasks.md @@ -116,16 +116,29 @@ The tuningConfig is optional and default parameters will be used if no tuningCon #### IndexSpec -The indexSpec defines segment storage format options to be used at indexing -time, such as bitmap type, and column compression formats. +The indexSpec defines segment storage format options to be used at indexing time, such as bitmap type and column +compression formats. The indexSpec is optional and default parameters will be used if not specified. -The indexSpec is optional and default parameters will be used if not specified. +|Field|Type|Description|Required| +|-----|----|-----------|--------| +|bitmap|Object|Compression format for bitmap indexes. Should be a JSON object; see below for options.|no (defaults to Concise)| +|dimensionCompression|String|Compression format for dimension columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)| +|metricCompression|String|Compression format for metric columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)| -|property|description|possible values|default|required?| -|--------|-----------|---------------|-------|---------| -|bitmap|type of bitmap compression to use for inverted indices.|`"concise"`, `"roaring"`|`"concise"`|no| -|dimensionCompression|compression format for dimension columns|`"uncompressed"`, `"lz4"`, `"lzf"`|`"lz4"`|no| -|metricCompression|compression format for metric columns, defaults to LZ4|`"lz4"`, `"lzf"`|`"lz4"`|no| +##### Bitmap types + +For Concise bitmaps: + +|Field|Type|Description|Required| +|-----|----|-----------|--------| +|type|String|Must be `concise`.|yes| + +For Roaring bitmaps: + +|Field|Type|Description|Required| +|-----|----|-----------|--------| +|type|String|Must be `roaring`.|yes| +|compressRunOnSerialization|Boolean|Use a run-length encoding where it is estimated as more space efficient.|no (default == `true`)| Segment Merging Tasks --------------------- diff --git a/indexing-hadoop/src/test/java/io/druid/indexer/updater/HadoopConverterJobTest.java b/indexing-hadoop/src/test/java/io/druid/indexer/updater/HadoopConverterJobTest.java index 4de688fbd0e..df7ca2148bb 100644 --- a/indexing-hadoop/src/test/java/io/druid/indexer/updater/HadoopConverterJobTest.java +++ b/indexing-hadoop/src/test/java/io/druid/indexer/updater/HadoopConverterJobTest.java @@ -283,7 +283,7 @@ public class HadoopConverterJobTest new HadoopDruidConverterConfig( DATASOURCE, interval, - new IndexSpec(new RoaringBitmapSerdeFactory(), "uncompressed", "uncompressed"), + new IndexSpec(new RoaringBitmapSerdeFactory(null), "uncompressed", "uncompressed"), oldSemgments, true, tmpDir.toURI(), @@ -386,7 +386,7 @@ public class HadoopConverterJobTest new HadoopDruidConverterConfig( DATASOURCE, interval, - new IndexSpec(new RoaringBitmapSerdeFactory(), "uncompressed", "uncompressed"), + new IndexSpec(new RoaringBitmapSerdeFactory(null), "uncompressed", "uncompressed"), oldSemgments, true, tmpDir.toURI(), diff --git a/indexing-service/src/test/java/io/druid/indexing/common/task/TaskSerdeTest.java b/indexing-service/src/test/java/io/druid/indexing/common/task/TaskSerdeTest.java index 4db0d36b3c7..86b584ca8c5 100644 --- a/indexing-service/src/test/java/io/druid/indexing/common/task/TaskSerdeTest.java +++ b/indexing-service/src/test/java/io/druid/indexing/common/task/TaskSerdeTest.java @@ -517,7 +517,7 @@ public class TaskSerdeTest ); final ConvertSegmentTask convertSegmentTaskOriginal = ConvertSegmentTask.create( segment, - new IndexSpec(new RoaringBitmapSerdeFactory(), "lzf", "uncompressed"), + new IndexSpec(new RoaringBitmapSerdeFactory(null), "lzf", "uncompressed"), false, true, null diff --git a/processing/src/main/java/io/druid/segment/BitmapOffset.java b/processing/src/main/java/io/druid/segment/BitmapOffset.java index 9cd77f2402e..1c41ee55718 100644 --- a/processing/src/main/java/io/druid/segment/BitmapOffset.java +++ b/processing/src/main/java/io/druid/segment/BitmapOffset.java @@ -32,6 +32,7 @@ import org.roaringbitmap.IntIterator; public class BitmapOffset implements Offset { private static final int INVALID_VALUE = -1; + private static final BitmapFactory ROARING_BITMAP_FACTORY = new RoaringBitmapSerdeFactory(false).getBitmapFactory(); private final IntIterator itr; private final BitmapFactory bitmapFactory; @@ -44,13 +45,12 @@ public class BitmapOffset implements Offset { ImmutableBitmap roaringBitmap = bitmapIndex; if (!(bitmapIndex instanceof WrappedImmutableRoaringBitmap)) { - final BitmapFactory factory = RoaringBitmapSerdeFactory.bitmapFactory; - final MutableBitmap bitmap = factory.makeEmptyMutableBitmap(); + final MutableBitmap bitmap = ROARING_BITMAP_FACTORY.makeEmptyMutableBitmap(); final IntIterator iterator = bitmapIndex.iterator(); while (iterator.hasNext()) { bitmap.add(iterator.next()); } - roaringBitmap = factory.makeImmutableBitmap(bitmap); + roaringBitmap = ROARING_BITMAP_FACTORY.makeImmutableBitmap(bitmap); } return ((WrappedImmutableRoaringBitmap) roaringBitmap).getBitmap().getReverseIntIterator(); } diff --git a/processing/src/main/java/io/druid/segment/data/RoaringBitmapSerdeFactory.java b/processing/src/main/java/io/druid/segment/data/RoaringBitmapSerdeFactory.java index 28f279c8682..86e6a0d0217 100644 --- a/processing/src/main/java/io/druid/segment/data/RoaringBitmapSerdeFactory.java +++ b/processing/src/main/java/io/druid/segment/data/RoaringBitmapSerdeFactory.java @@ -19,6 +19,8 @@ package io.druid.segment.data; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.collect.Ordering; import com.metamx.collections.bitmap.BitmapFactory; import com.metamx.collections.bitmap.ImmutableBitmap; @@ -32,8 +34,28 @@ import java.nio.ByteBuffer; */ public class RoaringBitmapSerdeFactory implements BitmapSerdeFactory { - public static final ObjectStrategy objectStrategy = new ImmutableRoaringBitmapObjectStrategy(); - public static final BitmapFactory bitmapFactory = new RoaringBitmapFactory(); + private static final boolean DEFAULT_COMPRESS_RUN_ON_SERIALIZATION = true; + private static final ObjectStrategy objectStrategy = new ImmutableRoaringBitmapObjectStrategy(); + + private final boolean compressRunOnSerialization; + private final BitmapFactory bitmapFactory; + + @JsonCreator + public RoaringBitmapSerdeFactory( + @JsonProperty("compressRunOnSerialization") Boolean compressRunOnSerialization + ) + { + this.compressRunOnSerialization = compressRunOnSerialization == null + ? DEFAULT_COMPRESS_RUN_ON_SERIALIZATION + : compressRunOnSerialization; + this.bitmapFactory = new RoaringBitmapFactory(this.compressRunOnSerialization); + } + + @JsonProperty + public boolean getCompressRunOnSerialization() + { + return compressRunOnSerialization; + } @Override public ObjectStrategy getObjectStrategy() diff --git a/processing/src/test/java/io/druid/segment/IndexMergerTest.java b/processing/src/test/java/io/druid/segment/IndexMergerTest.java index 3ebbb30e00f..a06377a4c81 100644 --- a/processing/src/test/java/io/druid/segment/IndexMergerTest.java +++ b/processing/src/test/java/io/druid/segment/IndexMergerTest.java @@ -91,7 +91,7 @@ public class IndexMergerTest false ), ImmutableSet.of( - new RoaringBitmapSerdeFactory(), + new RoaringBitmapSerdeFactory(null), new ConciseBitmapSerdeFactory() ), ImmutableSet.of( diff --git a/processing/src/test/java/io/druid/segment/IndexSpecTest.java b/processing/src/test/java/io/druid/segment/IndexSpecTest.java index 902039d062f..2053c69869a 100644 --- a/processing/src/test/java/io/druid/segment/IndexSpecTest.java +++ b/processing/src/test/java/io/druid/segment/IndexSpecTest.java @@ -35,7 +35,7 @@ public class IndexSpecTest final String json = "{ \"bitmap\" : { \"type\" : \"roaring\" }, \"dimensionCompression\" : \"lz4\", \"metricCompression\" : \"lzf\" }"; final IndexSpec spec = objectMapper.readValue(json, IndexSpec.class); - Assert.assertEquals(new RoaringBitmapSerdeFactory(), spec.getBitmapSerdeFactory()); + Assert.assertEquals(new RoaringBitmapSerdeFactory(null), spec.getBitmapSerdeFactory()); Assert.assertEquals(CompressedObjectStrategy.CompressionStrategy.LZ4, spec.getDimensionCompressionStrategy()); Assert.assertEquals(CompressedObjectStrategy.CompressionStrategy.LZF, spec.getMetricCompressionStrategy()); diff --git a/processing/src/test/java/io/druid/segment/data/BitmapSerdeFactoryTest.java b/processing/src/test/java/io/druid/segment/data/BitmapSerdeFactoryTest.java index 407f3d71535..adc26d70331 100644 --- a/processing/src/test/java/io/druid/segment/data/BitmapSerdeFactoryTest.java +++ b/processing/src/test/java/io/druid/segment/data/BitmapSerdeFactoryTest.java @@ -30,7 +30,9 @@ public class BitmapSerdeFactoryTest public void testSerialization() throws Exception { ObjectMapper mapper = new DefaultObjectMapper(); - Assert.assertEquals("{\"type\":\"roaring\"}", mapper.writeValueAsString(new RoaringBitmapSerdeFactory())); + Assert.assertEquals("{\"type\":\"roaring\",\"compressRunOnSerialization\":true}", mapper.writeValueAsString(new RoaringBitmapSerdeFactory(null))); + Assert.assertEquals("{\"type\":\"roaring\",\"compressRunOnSerialization\":false}", mapper.writeValueAsString(new RoaringBitmapSerdeFactory(false))); + Assert.assertEquals("{\"type\":\"roaring\",\"compressRunOnSerialization\":true}", mapper.writeValueAsString(new RoaringBitmapSerdeFactory(true))); Assert.assertEquals("{\"type\":\"concise\"}", mapper.writeValueAsString(new ConciseBitmapSerdeFactory())); Assert.assertEquals("{\"type\":\"concise\"}", mapper.writeValueAsString(BitmapSerde.createLegacyFactory())); Assert.assertEquals("{\"type\":\"concise\"}", mapper.writeValueAsString(new BitmapSerde.DefaultBitmapSerdeFactory())); @@ -41,7 +43,17 @@ public class BitmapSerdeFactoryTest public void testDeserialization() throws Exception { ObjectMapper mapper = new DefaultObjectMapper(); - Assert.assertTrue(mapper.readValue("{\"type\":\"roaring\"}", BitmapSerdeFactory.class) instanceof RoaringBitmapSerdeFactory); + final BitmapSerdeFactory roaringFactory = mapper.readValue("{\"type\":\"roaring\"}", BitmapSerdeFactory.class); + Assert.assertTrue(roaringFactory instanceof RoaringBitmapSerdeFactory); + Assert.assertTrue(((RoaringBitmapSerdeFactory)roaringFactory).getCompressRunOnSerialization()); + + final BitmapSerdeFactory compressingRoaringFactory = mapper.readValue( + "{\"type\":\"roaring\", \"compressRunOnSerialization\":false}", + BitmapSerdeFactory.class + ); + Assert.assertTrue(compressingRoaringFactory instanceof RoaringBitmapSerdeFactory); + Assert.assertFalse(((RoaringBitmapSerdeFactory)compressingRoaringFactory).getCompressRunOnSerialization()); + Assert.assertTrue(mapper.readValue("{\"type\":\"concise\"}", BitmapSerdeFactory.class) instanceof ConciseBitmapSerdeFactory); Assert.assertTrue(mapper.readValue("{\"type\":\"BitmapSerde$SomeRandomClass\"}", BitmapSerdeFactory.class) instanceof ConciseBitmapSerdeFactory); } diff --git a/processing/src/test/java/io/druid/segment/filter/BaseFilterTest.java b/processing/src/test/java/io/druid/segment/filter/BaseFilterTest.java index 7387ef8616e..3e048c50223 100644 --- a/processing/src/test/java/io/druid/segment/filter/BaseFilterTest.java +++ b/processing/src/test/java/io/druid/segment/filter/BaseFilterTest.java @@ -156,7 +156,7 @@ public abstract class BaseFilterTest final Map bitmapSerdeFactories = ImmutableMap.of( "concise", new ConciseBitmapSerdeFactory(), - "roaring", new RoaringBitmapSerdeFactory() + "roaring", new RoaringBitmapSerdeFactory(true) ); final Map indexMergers = ImmutableMap.of( diff --git a/processing/src/test/java/io/druid/segment/filter/ExtractionDimFilterTest.java b/processing/src/test/java/io/druid/segment/filter/ExtractionDimFilterTest.java index 0e3e61ef541..c45f773bfbe 100644 --- a/processing/src/test/java/io/druid/segment/filter/ExtractionDimFilterTest.java +++ b/processing/src/test/java/io/druid/segment/filter/ExtractionDimFilterTest.java @@ -71,7 +71,7 @@ public class ExtractionDimFilterTest { return ImmutableList.of( new Object[]{new ConciseBitmapFactory(), new ConciseBitmapSerdeFactory()}, - new Object[]{new RoaringBitmapFactory(), new RoaringBitmapSerdeFactory()} + new Object[]{new RoaringBitmapFactory(), new RoaringBitmapSerdeFactory(null)} ); } diff --git a/services/src/main/java/io/druid/cli/DumpSegment.java b/services/src/main/java/io/druid/cli/DumpSegment.java index e88ebf0e477..d4aaa6b1a24 100644 --- a/services/src/main/java/io/druid/cli/DumpSegment.java +++ b/services/src/main/java/io/druid/cli/DumpSegment.java @@ -320,7 +320,7 @@ public class DumpSegment extends GuiceRunnable if (bitmapFactory instanceof ConciseBitmapFactory) { bitmapSerdeFactory = new ConciseBitmapSerdeFactory(); } else if (bitmapFactory instanceof RoaringBitmapFactory) { - bitmapSerdeFactory = new RoaringBitmapSerdeFactory(); + bitmapSerdeFactory = new RoaringBitmapSerdeFactory(null); } else { throw new ISE( "Don't know which BitmapSerdeFactory to use for BitmapFactory[%s]!",