Configurable compressRunOnSerialization for Roaring bitmaps. (#3228)

Defaults to true, which is a change in behavior (this used to be false and unconfigurable).
This commit is contained in:
Gian Merlino 2016-07-07 21:54:19 -07:00 committed by Nishant
parent 5d9fd0a713
commit ea03906fcf
16 changed files with 125 additions and 33 deletions

View File

@ -153,7 +153,7 @@ public class BoundFilterBenchmark
{
step = (END_INT - START_INT) / cardinality;
final BitmapFactory bitmapFactory = new RoaringBitmapFactory();
final BitmapSerdeFactory serdeFactory = new RoaringBitmapSerdeFactory();
final BitmapSerdeFactory serdeFactory = new RoaringBitmapSerdeFactory(null);
final List<Integer> ints = generateInts();
final GenericIndexed<String> dictionary = GenericIndexed.fromIterable(
FluentIterable.from(ints)

View File

@ -89,7 +89,7 @@ public class DimensionPredicateFilterBenchmark
public void setup() throws IOException
{
final BitmapFactory bitmapFactory = new RoaringBitmapFactory();
final BitmapSerdeFactory serdeFactory = new RoaringBitmapSerdeFactory();
final BitmapSerdeFactory serdeFactory = new RoaringBitmapSerdeFactory(null);
final List<Integer> ints = generateInts();
final GenericIndexed<String> dictionary = GenericIndexed.fromIterable(
FluentIterable.from(ints)

View File

@ -128,10 +128,25 @@ The tuningConfig is optional and default parameters will be used if no tuningCon
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|`bitmap`|String|The type of bitmap index to create. Choose from `roaring` or `concise`.|no (default == `concise`)|
|`bitmap`|Object|Compression format for bitmap indexes. Should be a JSON object; see below for options.|no (defaults to Concise)|
|`dimensionCompression`|String|Compression format for dimension columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)|
|`metricCompression`|String|Compression format for metric columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)|
##### Bitmap types
For Concise bitmaps:
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|`type`|String|Must be `concise`.|yes|
For Roaring bitmaps:
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|`type`|String|Must be `roaring`.|yes|
|`compressRunOnSerialization`|Boolean|Use a run-length encoding where it is estimated to be more space efficient.|no (default == `true`)|
### KafkaSupervisorIOConfig
|Field|Type|Description|Required|

View File

@ -191,9 +191,24 @@ The following properties can be used to tune how the MapReduce job is configured
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|bitmap|String|The type of bitmap index to create. Choose from `roaring` or `concise`, or null to use the default (`concise`).|No|
|dimensionCompression|String|Compression format for dimension columns. Choose from `LZ4`, `LZF`, or `uncompressed`. The default is `LZ4`.|No|
|metricCompression|String|Compression format for metric columns. Choose from `LZ4`, `LZF`, or `uncompressed`. The default is `LZ4`.|No|
|bitmap|Object|Compression format for bitmap indexes. Should be a JSON object; see below for options.|no (defaults to Concise)|
|dimensionCompression|String|Compression format for dimension columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)|
|metricCompression|String|Compression format for metric columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)|
##### Bitmap types
For Concise bitmaps:
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|type|String|Must be `concise`.|yes|
For Roaring bitmaps:
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|type|String|Must be `roaring`.|yes|
|compressRunOnSerialization|Boolean|Use a run-length encoding where it is estimated to be more space efficient.|no (default == `true`)|
### Partitioning specification

View File

@ -167,13 +167,28 @@ The following policies are available:
* `messageTime` &ndash; Can be used for non-"current time" as long as that data is relatively in sequence. Events are rejected if they are less than `windowPeriod` from the event with the latest timestamp. Hand off only occurs if an event is seen after the segmentGranularity and `windowPeriod` (hand off will not periodically occur unless you have a constant stream of data).
* `none` &ndash; All events are accepted. Never hands off data unless shutdown() is called on the configured firehose.
### Index Spec
#### IndexSpec
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|bitmap|String|The type of bitmap index to create. Choose from `roaring` or `concise`, or null to use the default (`concise`).|No|
|dimensionCompression|String|Compression format for dimension columns. Choose from `LZ4`, `LZF`, or `uncompressed`. The default is `LZ4`.|No|
|metricCompression|String|Compression format for metric columns. Choose from `LZ4`, `LZF`, or `uncompressed`. The default is `LZ4`.|No|
|bitmap|Object|Compression format for bitmap indexes. Should be a JSON object; see below for options.|no (defaults to Concise)|
|dimensionCompression|String|Compression format for dimension columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)|
|metricCompression|String|Compression format for metric columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)|
##### Bitmap types
For Concise bitmaps:
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|type|String|Must be `concise`.|yes|
For Roaring bitmaps:
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|type|String|Must be `roaring`.|yes|
|compressRunOnSerialization|Boolean|Use a run-length encoding where it is estimated to be more space efficient.|no (default == `true`)|
#### Sharding

View File

@ -116,16 +116,29 @@ The tuningConfig is optional and default parameters will be used if no tuningCon
#### IndexSpec
The indexSpec defines segment storage format options to be used at indexing
time, such as bitmap type, and column compression formats.
The indexSpec defines segment storage format options to be used at indexing time, such as bitmap type and column
compression formats. The indexSpec is optional and default parameters will be used if not specified.
The indexSpec is optional and default parameters will be used if not specified.
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|bitmap|Object|Compression format for bitmap indexes. Should be a JSON object; see below for options.|no (defaults to Concise)|
|dimensionCompression|String|Compression format for dimension columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)|
|metricCompression|String|Compression format for metric columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)|
|property|description|possible values|default|required?|
|--------|-----------|---------------|-------|---------|
|bitmap|type of bitmap compression to use for inverted indices.|`"concise"`, `"roaring"`|`"concise"`|no|
|dimensionCompression|compression format for dimension columns|`"uncompressed"`, `"lz4"`, `"lzf"`|`"lz4"`|no|
|metricCompression|compression format for metric columns, defaults to LZ4|`"lz4"`, `"lzf"`|`"lz4"`|no|
##### Bitmap types
For Concise bitmaps:
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|type|String|Must be `concise`.|yes|
For Roaring bitmaps:
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|type|String|Must be `roaring`.|yes|
|compressRunOnSerialization|Boolean|Use a run-length encoding where it is estimated to be more space efficient.|no (default == `true`)|
Segment Merging Tasks
---------------------

View File

@ -283,7 +283,7 @@ public class HadoopConverterJobTest
new HadoopDruidConverterConfig(
DATASOURCE,
interval,
new IndexSpec(new RoaringBitmapSerdeFactory(), "uncompressed", "uncompressed"),
new IndexSpec(new RoaringBitmapSerdeFactory(null), "uncompressed", "uncompressed"),
oldSemgments,
true,
tmpDir.toURI(),
@ -386,7 +386,7 @@ public class HadoopConverterJobTest
new HadoopDruidConverterConfig(
DATASOURCE,
interval,
new IndexSpec(new RoaringBitmapSerdeFactory(), "uncompressed", "uncompressed"),
new IndexSpec(new RoaringBitmapSerdeFactory(null), "uncompressed", "uncompressed"),
oldSemgments,
true,
tmpDir.toURI(),

View File

@ -517,7 +517,7 @@ public class TaskSerdeTest
);
final ConvertSegmentTask convertSegmentTaskOriginal = ConvertSegmentTask.create(
segment,
new IndexSpec(new RoaringBitmapSerdeFactory(), "lzf", "uncompressed"),
new IndexSpec(new RoaringBitmapSerdeFactory(null), "lzf", "uncompressed"),
false,
true,
null

View File

@ -32,6 +32,7 @@ import org.roaringbitmap.IntIterator;
public class BitmapOffset implements Offset
{
private static final int INVALID_VALUE = -1;
private static final BitmapFactory ROARING_BITMAP_FACTORY = new RoaringBitmapSerdeFactory(false).getBitmapFactory();
private final IntIterator itr;
private final BitmapFactory bitmapFactory;
@ -44,13 +45,12 @@ public class BitmapOffset implements Offset
{
ImmutableBitmap roaringBitmap = bitmapIndex;
if (!(bitmapIndex instanceof WrappedImmutableRoaringBitmap)) {
final BitmapFactory factory = RoaringBitmapSerdeFactory.bitmapFactory;
final MutableBitmap bitmap = factory.makeEmptyMutableBitmap();
final MutableBitmap bitmap = ROARING_BITMAP_FACTORY.makeEmptyMutableBitmap();
final IntIterator iterator = bitmapIndex.iterator();
while (iterator.hasNext()) {
bitmap.add(iterator.next());
}
roaringBitmap = factory.makeImmutableBitmap(bitmap);
roaringBitmap = ROARING_BITMAP_FACTORY.makeImmutableBitmap(bitmap);
}
return ((WrappedImmutableRoaringBitmap) roaringBitmap).getBitmap().getReverseIntIterator();
}

View File

@ -19,6 +19,8 @@
package io.druid.segment.data;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.collect.Ordering;
import com.metamx.collections.bitmap.BitmapFactory;
import com.metamx.collections.bitmap.ImmutableBitmap;
@ -32,8 +34,28 @@ import java.nio.ByteBuffer;
*/
public class RoaringBitmapSerdeFactory implements BitmapSerdeFactory
{
public static final ObjectStrategy<ImmutableBitmap> objectStrategy = new ImmutableRoaringBitmapObjectStrategy();
public static final BitmapFactory bitmapFactory = new RoaringBitmapFactory();
private static final boolean DEFAULT_COMPRESS_RUN_ON_SERIALIZATION = true;
private static final ObjectStrategy<ImmutableBitmap> objectStrategy = new ImmutableRoaringBitmapObjectStrategy();
private final boolean compressRunOnSerialization;
private final BitmapFactory bitmapFactory;
@JsonCreator
public RoaringBitmapSerdeFactory(
@JsonProperty("compressRunOnSerialization") Boolean compressRunOnSerialization
)
{
this.compressRunOnSerialization = compressRunOnSerialization == null
? DEFAULT_COMPRESS_RUN_ON_SERIALIZATION
: compressRunOnSerialization;
this.bitmapFactory = new RoaringBitmapFactory(this.compressRunOnSerialization);
}
/**
 * Returns the compressRunOnSerialization setting this factory was constructed with
 * (after the default has been applied). Serialized by Jackson as a JSON property.
 *
 * @return the resolved compressRunOnSerialization flag
 */
@JsonProperty
public boolean getCompressRunOnSerialization()
{
  return this.compressRunOnSerialization;
}
@Override
public ObjectStrategy<ImmutableBitmap> getObjectStrategy()

View File

@ -91,7 +91,7 @@ public class IndexMergerTest
false
),
ImmutableSet.of(
new RoaringBitmapSerdeFactory(),
new RoaringBitmapSerdeFactory(null),
new ConciseBitmapSerdeFactory()
),
ImmutableSet.of(

View File

@ -35,7 +35,7 @@ public class IndexSpecTest
final String json = "{ \"bitmap\" : { \"type\" : \"roaring\" }, \"dimensionCompression\" : \"lz4\", \"metricCompression\" : \"lzf\" }";
final IndexSpec spec = objectMapper.readValue(json, IndexSpec.class);
Assert.assertEquals(new RoaringBitmapSerdeFactory(), spec.getBitmapSerdeFactory());
Assert.assertEquals(new RoaringBitmapSerdeFactory(null), spec.getBitmapSerdeFactory());
Assert.assertEquals(CompressedObjectStrategy.CompressionStrategy.LZ4, spec.getDimensionCompressionStrategy());
Assert.assertEquals(CompressedObjectStrategy.CompressionStrategy.LZF, spec.getMetricCompressionStrategy());

View File

@ -30,7 +30,9 @@ public class BitmapSerdeFactoryTest
public void testSerialization() throws Exception
{
ObjectMapper mapper = new DefaultObjectMapper();
Assert.assertEquals("{\"type\":\"roaring\"}", mapper.writeValueAsString(new RoaringBitmapSerdeFactory()));
Assert.assertEquals("{\"type\":\"roaring\",\"compressRunOnSerialization\":true}", mapper.writeValueAsString(new RoaringBitmapSerdeFactory(null)));
Assert.assertEquals("{\"type\":\"roaring\",\"compressRunOnSerialization\":false}", mapper.writeValueAsString(new RoaringBitmapSerdeFactory(false)));
Assert.assertEquals("{\"type\":\"roaring\",\"compressRunOnSerialization\":true}", mapper.writeValueAsString(new RoaringBitmapSerdeFactory(true)));
Assert.assertEquals("{\"type\":\"concise\"}", mapper.writeValueAsString(new ConciseBitmapSerdeFactory()));
Assert.assertEquals("{\"type\":\"concise\"}", mapper.writeValueAsString(BitmapSerde.createLegacyFactory()));
Assert.assertEquals("{\"type\":\"concise\"}", mapper.writeValueAsString(new BitmapSerde.DefaultBitmapSerdeFactory()));
@ -41,7 +43,17 @@ public class BitmapSerdeFactoryTest
public void testDeserialization() throws Exception
{
  final ObjectMapper jsonMapper = new DefaultObjectMapper();

  // "roaring" with no explicit flag: must yield a Roaring factory with the flag enabled.
  Assert.assertTrue(jsonMapper.readValue("{\"type\":\"roaring\"}", BitmapSerdeFactory.class) instanceof RoaringBitmapSerdeFactory);
  final BitmapSerdeFactory defaultRoaring = jsonMapper.readValue("{\"type\":\"roaring\"}", BitmapSerdeFactory.class);
  Assert.assertTrue(defaultRoaring instanceof RoaringBitmapSerdeFactory);
  final RoaringBitmapSerdeFactory typedDefaultRoaring = (RoaringBitmapSerdeFactory) defaultRoaring;
  Assert.assertTrue(typedDefaultRoaring.getCompressRunOnSerialization());

  // "roaring" with the flag explicitly set to false: the flag must be honored.
  final BitmapSerdeFactory nonCompressingRoaring = jsonMapper.readValue(
      "{\"type\":\"roaring\", \"compressRunOnSerialization\":false}",
      BitmapSerdeFactory.class
  );
  Assert.assertTrue(nonCompressingRoaring instanceof RoaringBitmapSerdeFactory);
  final RoaringBitmapSerdeFactory typedNonCompressingRoaring = (RoaringBitmapSerdeFactory) nonCompressingRoaring;
  Assert.assertFalse(typedNonCompressingRoaring.getCompressRunOnSerialization());

  // "concise" and an unrecognized type name are both expected to produce a Concise factory.
  Assert.assertTrue(jsonMapper.readValue("{\"type\":\"concise\"}", BitmapSerdeFactory.class) instanceof ConciseBitmapSerdeFactory);
  Assert.assertTrue(jsonMapper.readValue("{\"type\":\"BitmapSerde$SomeRandomClass\"}", BitmapSerdeFactory.class) instanceof ConciseBitmapSerdeFactory);
}

View File

@ -156,7 +156,7 @@ public abstract class BaseFilterTest
final Map<String, BitmapSerdeFactory> bitmapSerdeFactories = ImmutableMap.<String, BitmapSerdeFactory>of(
"concise", new ConciseBitmapSerdeFactory(),
"roaring", new RoaringBitmapSerdeFactory()
"roaring", new RoaringBitmapSerdeFactory(true)
);
final Map<String, IndexMerger> indexMergers = ImmutableMap.<String, IndexMerger>of(

View File

@ -71,7 +71,7 @@ public class ExtractionDimFilterTest
{
return ImmutableList.of(
new Object[]{new ConciseBitmapFactory(), new ConciseBitmapSerdeFactory()},
new Object[]{new RoaringBitmapFactory(), new RoaringBitmapSerdeFactory()}
new Object[]{new RoaringBitmapFactory(), new RoaringBitmapSerdeFactory(null)}
);
}

View File

@ -320,7 +320,7 @@ public class DumpSegment extends GuiceRunnable
if (bitmapFactory instanceof ConciseBitmapFactory) {
bitmapSerdeFactory = new ConciseBitmapSerdeFactory();
} else if (bitmapFactory instanceof RoaringBitmapFactory) {
bitmapSerdeFactory = new RoaringBitmapSerdeFactory();
bitmapSerdeFactory = new RoaringBitmapSerdeFactory(null);
} else {
throw new ISE(
"Don't know which BitmapSerdeFactory to use for BitmapFactory[%s]!",