From bf85ea19b283ffc6e26c6aef28d264233a1509db Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Mon, 23 Mar 2020 18:15:57 -0700 Subject: [PATCH] roaring bitmaps by default (#9548) * it is finally time * fix it * more docs * fix doc --- docs/design/index.md | 4 ++-- docs/design/segments.md | 3 +++ .../development/extensions-core/kafka-ingestion.md | 14 +++++++------- .../extensions-core/kinesis-ingestion.md | 14 +++++++------- docs/ingestion/index.md | 4 ++-- docs/ingestion/native-batch.md | 4 ++-- docs/operations/dump-segment.md | 3 ++- .../common/task/CompactionTaskRunTest.java | 2 +- .../java/org/apache/druid/segment/IndexSpec.java | 6 +++--- .../org/apache/druid/segment/data/BitmapSerde.java | 11 ++++++++--- .../segment/data/RoaringBitmapSerdeFactory.java | 2 +- .../druid/segment/data/BitmapSerdeFactoryTest.java | 4 ++-- .../coordinator/duty/CompactSegmentsTest.java | 2 +- web-console/src/utils/ingestion-spec.tsx | 2 +- 14 files changed, 42 insertions(+), 33 deletions(-) diff --git a/docs/design/index.md b/docs/design/index.md index 63c1db52ed4..d803a704abe 100644 --- a/docs/design/index.md +++ b/docs/design/index.md @@ -61,8 +61,8 @@ updates. stored safely in [deep storage](architecture.html#deep-storage) (typically cloud storage, HDFS, or a shared filesystem). Your data can be recovered from deep storage even if every single Druid server fails. For more limited failures affecting just a few Druid servers, replication ensures that queries are still possible while the system recovers. -7. **Indexes for quick filtering.** Druid uses [CONCISE](https://arxiv.org/pdf/1004.0403) or -[Roaring](https://roaringbitmap.org/) compressed bitmap indexes to create indexes that power fast filtering and +7. **Indexes for quick filtering.** Druid uses [Roaring](https://roaringbitmap.org/) or +[CONCISE](https://arxiv.org/pdf/1004.0403) compressed bitmap indexes to create indexes that power fast filtering and searching across multiple columns. 8. 
**Time-based partitioning.** Druid first partitions data by time, and can additionally partition based on other fields. This means time-based queries will only access the partitions that match the time range of the query. This leads to diff --git a/docs/design/segments.md b/docs/design/segments.md index 3d2939a72cc..e54353b4d99 100644 --- a/docs/design/segments.md +++ b/docs/design/segments.md @@ -186,6 +186,9 @@ Each column is stored as two parts: A ColumnDescriptor is essentially an object that allows us to use Jackson's polymorphic deserialization to add new and interesting methods of serialization with minimal impact to the code. It consists of some metadata about the column (what type is it, is it multi-value, etc.) and then a list of serialization/deserialization logic that can deserialize the rest of the binary. +### Compression +Druid compresses blocks of values for string, long, float, and double columns, using [LZ4](https://github.com/lz4/lz4-java) by default, and bitmaps for string columns and numeric null values are compressed using [Roaring](https://github.com/RoaringBitmap/RoaringBitmap). We recommend sticking with these defaults unless experimental verification with your own data and query patterns suggests that non-default options will perform better in your specific case. For example, for bitmaps in string columns, the differences between using Roaring and CONCISE are most pronounced for high cardinality columns. In this case, Roaring is substantially faster on filters that match a lot of values, but in some cases CONCISE can have a lower footprint due to the overhead of the Roaring format (but is still slower when lots of values are matched). Currently, compression is configured at the segment level rather than for individual columns; see [IndexSpec](../ingestion/index.md#indexspec) for more details. 
+ ## Sharding Data to Create Segments ### Sharding diff --git a/docs/development/extensions-core/kafka-ingestion.md b/docs/development/extensions-core/kafka-ingestion.md index a2cff30275c..12d937cee24 100644 --- a/docs/development/extensions-core/kafka-ingestion.md +++ b/docs/development/extensions-core/kafka-ingestion.md @@ -162,19 +162,13 @@ The tuningConfig is optional and default parameters will be used if no tuningCon |Field|Type|Description|Required| |-----|----|-----------|--------| -|bitmap|Object|Compression format for bitmap indexes. Should be a JSON object; see below for options.|no (defaults to Concise)| +|bitmap|Object|Compression format for bitmap indexes. Should be a JSON object; see below for options.|no (defaults to Roaring)| |dimensionCompression|String|Compression format for dimension columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)| |metricCompression|String|Compression format for metric columns. Choose from `LZ4`, `LZF`, `uncompressed`, or `none`.|no (default == `LZ4`)| |longEncoding|String|Encoding format for metric and dimension columns with type long. Choose from `auto` or `longs`. `auto` encodes the values using offset or lookup table depending on column cardinality, and store them with variable size. 
`longs` stores the value as is with 8 bytes each.|no (default == `longs`)| ##### Bitmap types -For Concise bitmaps: - -|Field|Type|Description|Required| -|-----|----|-----------|--------| -|`type`|String|Must be `concise`.|yes| - For Roaring bitmaps: |Field|Type|Description|Required| @@ -182,6 +176,12 @@ For Roaring bitmaps: |`type`|String|Must be `roaring`.|yes| |`compressRunOnSerialization`|Boolean|Use a run-length encoding where it is estimated as more space efficient.|no (default == `true`)| +For Concise bitmaps: + +|Field|Type|Description|Required| +|-----|----|-----------|--------| +|`type`|String|Must be `concise`.|yes| + #### SegmentWriteOutMediumFactory |Field|Type|Description|Required| diff --git a/docs/development/extensions-core/kinesis-ingestion.md b/docs/development/extensions-core/kinesis-ingestion.md index d44d9bfe85e..bb54e4be9cd 100644 --- a/docs/development/extensions-core/kinesis-ingestion.md +++ b/docs/development/extensions-core/kinesis-ingestion.md @@ -161,19 +161,13 @@ The tuningConfig is optional and default parameters will be used if no tuningCon |Field|Type|Description|Required| |-----|----|-----------|--------| -|bitmap|Object|Compression format for bitmap indexes. Should be a JSON object; see below for options.|no (defaults to Concise)| +|bitmap|Object|Compression format for bitmap indexes. Should be a JSON object; see below for options.|no (defaults to Roaring)| |dimensionCompression|String|Compression format for dimension columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)| |metricCompression|String|Compression format for metric columns. Choose from `LZ4`, `LZF`, `uncompressed`, or `none`.|no (default == `LZ4`)| |longEncoding|String|Encoding format for metric and dimension columns with type long. Choose from `auto` or `longs`. `auto` encodes the values using sequence number or lookup table depending on column cardinality, and store them with variable size. 
`longs` stores the value as is with 8 bytes each.|no (default == `longs`)| ##### Bitmap types -For Concise bitmaps: - -|Field|Type|Description|Required| -|-----|----|-----------|--------| -|`type`|String|Must be `concise`.|yes| - For Roaring bitmaps: |Field|Type|Description|Required| @@ -181,6 +175,12 @@ For Roaring bitmaps: |`type`|String|Must be `roaring`.|yes| |`compressRunOnSerialization`|Boolean|Use a run-length encoding where it is estimated as more space efficient.|no (default == `true`)| +For Concise bitmaps: + +|Field|Type|Description|Required| +|-----|----|-----------|--------| +|`type`|String|Must be `concise`.|yes| + #### SegmentWriteOutMediumFactory |Field|Type|Description|Required| diff --git a/docs/ingestion/index.md b/docs/ingestion/index.md index a52081a66a6..5871445a7c0 100644 --- a/docs/ingestion/index.md +++ b/docs/ingestion/index.md @@ -707,7 +707,7 @@ is: "maxRowsInMemory": 1000000, "maxBytesInMemory": , "indexSpec": { - "bitmap": { "type": "concise" }, + "bitmap": { "type": "roaring" }, "dimensionCompression": "lz4", "metricCompression": "lz4", "longEncoding": "longs" @@ -730,7 +730,7 @@ The `indexSpec` object can include the following properties: |Field|Description|Default| |-----|-----------|-------| -|bitmap|Compression format for bitmap indexes. Should be a JSON object with `type` set to `concise` or `roaring`. For type `roaring`, the boolean property `compressRunOnSerialization` (defaults to true) controls whether or not run-length encoding will be used when it is determined to be more space-efficient.|`{"type": "concise"}`| +|bitmap|Compression format for bitmap indexes. Should be a JSON object with `type` set to `roaring` or `concise`. For type `roaring`, the boolean property `compressRunOnSerialization` (defaults to true) controls whether or not run-length encoding will be used when it is determined to be more space-efficient.|`{"type": "roaring"}`| |dimensionCompression|Compression format for dimension columns. 
Options are `lz4`, `lzf`, or `uncompressed`.|`lz4`| |metricCompression|Compression format for metric columns. Options are `lz4`, `lzf`, `uncompressed`, or `none` (which is more efficient than `uncompressed`, but not supported by older versions of Druid).|`lz4`| |longEncoding|Encoding format for long-typed columns. Applies regardless of whether they are dimensions or metrics. Options are `auto` or `longs`. `auto` encodes the values using offset or lookup table depending on column cardinality, and store them with variable size. `longs` stores the value as-is with 8 bytes each.|`longs`| diff --git a/docs/ingestion/native-batch.md b/docs/ingestion/native-batch.md index a2bfe27e6bc..453c26b680c 100644 --- a/docs/ingestion/native-batch.md +++ b/docs/ingestion/native-batch.md @@ -522,7 +522,7 @@ An example of the result is "numShards": null, "indexSpec": { "bitmap": { - "type": "concise" + "type": "roaring" }, "dimensionCompression": "lz4", "metricCompression": "lz4", @@ -530,7 +530,7 @@ An example of the result is }, "indexSpecForIntermediatePersists": { "bitmap": { - "type": "concise" + "type": "roaring" }, "dimensionCompression": "lz4", "metricCompression": "lz4", diff --git a/docs/operations/dump-segment.md b/docs/operations/dump-segment.md index cc6eef107f4..571d7f45ced 100644 --- a/docs/operations/dump-segment.md +++ b/docs/operations/dump-segment.md @@ -92,7 +92,8 @@ Sample output: ``` { "bitmapSerdeFactory": { - "type": "concise" + "type": "roaring", + "compressRunOnSerialization": true }, "bitmaps": { "isRobot": { diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskRunTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskRunTest.java index b67f47d3399..80ea86e8898 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskRunTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/common/task/CompactionTaskRunTest.java @@ 
-129,7 +129,7 @@ public class CompactionTaskRunTest extends IngestionTestBase new DynamicPartitionsSpec(5000000, Long.MAX_VALUE), ImmutableMap.of( "bitmap", - ImmutableMap.of("type", "concise"), + ImmutableMap.of("type", "roaring", "compressRunOnSerialization", true), "dimensionCompression", "lz4", "metricCompression", diff --git a/processing/src/main/java/org/apache/druid/segment/IndexSpec.java b/processing/src/main/java/org/apache/druid/segment/IndexSpec.java index 5aa4aed5a38..d14f927c49d 100644 --- a/processing/src/main/java/org/apache/druid/segment/IndexSpec.java +++ b/processing/src/main/java/org/apache/druid/segment/IndexSpec.java @@ -27,10 +27,8 @@ import org.apache.druid.segment.data.BitmapSerde; import org.apache.druid.segment.data.BitmapSerdeFactory; import org.apache.druid.segment.data.CompressionFactory; import org.apache.druid.segment.data.CompressionStrategy; -import org.apache.druid.segment.data.ConciseBitmapSerdeFactory; import javax.annotation.Nullable; - import java.util.Arrays; import java.util.Objects; import java.util.Set; @@ -107,7 +105,9 @@ public class IndexSpec Preconditions.checkArgument(longEncoding == null || LONG_ENCODING_NAMES.contains(longEncoding), "Unknown long encoding type[%s]", longEncoding); - this.bitmapSerdeFactory = bitmapSerdeFactory != null ? bitmapSerdeFactory : new ConciseBitmapSerdeFactory(); + this.bitmapSerdeFactory = bitmapSerdeFactory != null + ? bitmapSerdeFactory + : new BitmapSerde.DefaultBitmapSerdeFactory(); this.dimensionCompression = dimensionCompression == null ? DEFAULT_DIMENSION_COMPRESSION : dimensionCompression; this.metricCompression = metricCompression == null ? DEFAULT_METRIC_COMPRESSION : metricCompression; this.longEncoding = longEncoding == null ? 
DEFAULT_LONG_ENCODING : longEncoding; diff --git a/processing/src/main/java/org/apache/druid/segment/data/BitmapSerde.java b/processing/src/main/java/org/apache/druid/segment/data/BitmapSerde.java index ff570723131..d4ef7adacfc 100644 --- a/processing/src/main/java/org/apache/druid/segment/data/BitmapSerde.java +++ b/processing/src/main/java/org/apache/druid/segment/data/BitmapSerde.java @@ -24,11 +24,16 @@ import com.fasterxml.jackson.annotation.JsonTypeName; public class BitmapSerde { - // default bitmap indices for Druid >= 0.7.x + // default bitmap indices for Druid + // concise was default from 0.7+, roaring is default 0.18+ // annotation required so Jackson doesn't get confused - @JsonTypeName("concise") - public static class DefaultBitmapSerdeFactory extends ConciseBitmapSerdeFactory + @JsonTypeName("roaring") + public static class DefaultBitmapSerdeFactory extends RoaringBitmapSerdeFactory { + public DefaultBitmapSerdeFactory() + { + super(RoaringBitmapSerdeFactory.DEFAULT_COMPRESS_RUN_ON_SERIALIZATION); + } } // default bitmap indices in Druid <= 0.6.x diff --git a/processing/src/main/java/org/apache/druid/segment/data/RoaringBitmapSerdeFactory.java b/processing/src/main/java/org/apache/druid/segment/data/RoaringBitmapSerdeFactory.java index 0a923ab0b26..92b58d91e99 100644 --- a/processing/src/main/java/org/apache/druid/segment/data/RoaringBitmapSerdeFactory.java +++ b/processing/src/main/java/org/apache/druid/segment/data/RoaringBitmapSerdeFactory.java @@ -34,7 +34,7 @@ import java.nio.ByteBuffer; */ public class RoaringBitmapSerdeFactory implements BitmapSerdeFactory { - private static final boolean DEFAULT_COMPRESS_RUN_ON_SERIALIZATION = true; + public static final boolean DEFAULT_COMPRESS_RUN_ON_SERIALIZATION = true; private static final ObjectStrategy OBJECT_STRATEGY = new ImmutableRoaringBitmapObjectStrategy(); private final boolean compressRunOnSerialization; diff --git 
a/processing/src/test/java/org/apache/druid/segment/data/BitmapSerdeFactoryTest.java b/processing/src/test/java/org/apache/druid/segment/data/BitmapSerdeFactoryTest.java index 10df2506456..66f1e99cd36 100644 --- a/processing/src/test/java/org/apache/druid/segment/data/BitmapSerdeFactoryTest.java +++ b/processing/src/test/java/org/apache/druid/segment/data/BitmapSerdeFactoryTest.java @@ -35,7 +35,7 @@ public class BitmapSerdeFactoryTest Assert.assertEquals("{\"type\":\"roaring\",\"compressRunOnSerialization\":true}", mapper.writeValueAsString(new RoaringBitmapSerdeFactory(true))); Assert.assertEquals("{\"type\":\"concise\"}", mapper.writeValueAsString(new ConciseBitmapSerdeFactory())); Assert.assertEquals("{\"type\":\"concise\"}", mapper.writeValueAsString(BitmapSerde.createLegacyFactory())); - Assert.assertEquals("{\"type\":\"concise\"}", mapper.writeValueAsString(new BitmapSerde.DefaultBitmapSerdeFactory())); + Assert.assertEquals("{\"type\":\"roaring\",\"compressRunOnSerialization\":true}", mapper.writeValueAsString(new BitmapSerde.DefaultBitmapSerdeFactory())); Assert.assertEquals("{\"type\":\"concise\"}", mapper.writeValueAsString(new BitmapSerde.LegacyBitmapSerdeFactory())); } @@ -55,6 +55,6 @@ public class BitmapSerdeFactoryTest Assert.assertFalse(((RoaringBitmapSerdeFactory) compressingRoaringFactory).getCompressRunOnSerialization()); Assert.assertTrue(mapper.readValue("{\"type\":\"concise\"}", BitmapSerdeFactory.class) instanceof ConciseBitmapSerdeFactory); - Assert.assertTrue(mapper.readValue("{\"type\":\"BitmapSerde$SomeRandomClass\"}", BitmapSerdeFactory.class) instanceof ConciseBitmapSerdeFactory); + Assert.assertTrue(mapper.readValue("{\"type\":\"BitmapSerde$SomeRandomClass\"}", BitmapSerdeFactory.class) instanceof RoaringBitmapSerdeFactory); } } diff --git a/server/src/test/java/org/apache/druid/server/coordinator/duty/CompactSegmentsTest.java b/server/src/test/java/org/apache/druid/server/coordinator/duty/CompactSegmentsTest.java index 
6bac4945e57..683e776fc51 100644 --- a/server/src/test/java/org/apache/druid/server/coordinator/duty/CompactSegmentsTest.java +++ b/server/src/test/java/org/apache/druid/server/coordinator/duty/CompactSegmentsTest.java @@ -484,7 +484,7 @@ public class CompactSegmentsTest ), ImmutableMap.of( "bitmap", - ImmutableMap.of("type", "concise"), + ImmutableMap.of("type", "roaring", "compressRunOnSerialization", true), "dimensionCompression", "lz4", "metricCompression", diff --git a/web-console/src/utils/ingestion-spec.tsx b/web-console/src/utils/ingestion-spec.tsx index 92068816537..f0bd50b56af 100644 --- a/web-console/src/utils/ingestion-spec.tsx +++ b/web-console/src/utils/ingestion-spec.tsx @@ -2402,7 +2402,7 @@ const TUNING_CONFIG_FORM_FIELDS: Field[] = [ name: 'indexSpec.bitmap.type', label: 'Index bitmap type', type: 'string', - defaultValue: 'concise', + defaultValue: 'roaring', suggestions: ['concise', 'roaring'], info: <>Compression format for bitmap indexes., },