port hll from 0.5.x to 0.6.x

This commit is contained in:
fjy 2014-01-21 13:21:31 -08:00
parent 2ff86984da
commit b85a28ab96
9 changed files with 803 additions and 2 deletions

82
hll/pom.xml Normal file
View File

@ -0,0 +1,82 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Druid - a distributed column store.
~ Copyright (C) 2012, 2013 Metamarkets Group Inc.
~
~ This program is free software; you can redistribute it and/or
~ modify it under the terms of the GNU General Public License
~ as published by the Free Software Foundation; either version 2
~ of the License, or (at your option) any later version.
~
~ This program is distributed in the hope that it will be useful,
~ but WITHOUT ANY WARRANTY; without even the implied warranty of
~ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
~ GNU General Public License for more details.
~
~ You should have received a copy of the GNU General Public License
~ along with this program; if not, write to the Free Software
~ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Maven coordinates of the HyperLogLog module; no version is declared here, only on the parent reference below. -->
<groupId>io.druid.extensions</groupId>
<artifactId>druid-hll</artifactId>
<name>druid-hll</name>
<description>druid-hll</description>
<parent>
<groupId>io.druid</groupId>
<artifactId>druid</artifactId>
<version>0.6.53-SNAPSHOT</version>
</parent>
<dependencies>
<!-- Dependencies declared without a version presumably resolve it from the parent's dependencyManagement (not visible in this file); TODO confirm. -->
<dependency>
<groupId>io.druid</groupId>
<artifactId>druid-api</artifactId>
</dependency>
<dependency>
<groupId>io.druid</groupId>
<artifactId>druid-processing</artifactId>
<!-- Tied to the parent POM version so this module always builds against the matching druid-processing. -->
<version>${project.parent.version}</version>
</dependency>
<dependency>
<groupId>com.metamx</groupId>
<artifactId>emitter</artifactId>
</dependency>
<!-- Trove primitive collections; the version is pinned explicitly in this module. -->
<dependency>
<groupId>net.sf.trove4j</groupId>
<artifactId>trove4j</artifactId>
<version>3.0.3</version>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
</dependency>
<!-- Tests -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-jar-plugin</artifactId>
<configuration>
<archive>
<manifest>
<!-- Ask the jar plugin to write its default implementation and specification entries into the jar manifest. -->
<addDefaultImplementationEntries>true</addDefaultImplementationEntries>
<addDefaultSpecificationEntries>true</addDefaultSpecificationEntries>
</manifest>
</archive>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,137 @@
/*
* Druid - a distributed column store.
* Copyright (C) 2012 Metamarkets Group Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
package io.druid.query.aggregation;
import com.google.common.hash.Hashing;
import com.metamx.common.ISE;
import com.metamx.common.logger.Logger;
import gnu.trove.map.TIntByteMap;
import gnu.trove.map.hash.TIntByteHashMap;
import io.druid.segment.ObjectColumnSelector;
import java.util.Comparator;
/**
 * Heap-based HyperLogLog aggregator.
 *
 * <p>State is a sparse register map ({@link TIntByteHashMap}) from register index to the largest
 * rank observed for that register; a register without an entry is treated as never touched.
 * Each selected value is either an already-built register map, which is merged in, or a raw
 * {@code String}, which is hashed and folded into a single register.
 */
public class HyperloglogAggregator implements Aggregator
{
  private static final Logger log = new Logger(HyperloglogAggregator.class);

  /** Number of hash bits used to pick a register. */
  public static final int log2m = 12;
  /** Number of registers, i.e. 2^log2m. */
  public static final int m = (int) Math.pow(2, log2m);
  /** Bias-correction constant multiplied by m^2, as used by the cardinality estimate. */
  public static final double alphaMM = (0.7213 / (1 + 1.079 / m)) * m * m;

  private final String name;
  private final ObjectColumnSelector selector;
  private TIntByteHashMap ibMap;

  // NOTE(review): this only distinguishes "equal" (0) from "not equal" (1); it never returns a
  // negative value, so it does not define an ordering and violates the Comparator contract.
  // Behaviour is intentionally left unchanged here because callers may rely on it.
  static final Comparator COMPARATOR = new Comparator()
  {
    @Override
    public int compare(Object o, Object o1)
    {
      return o.equals(o1) ? 0 : 1;
    }
  };

  /**
   * Merges two register maps into a new map, keeping the larger rank for every register.
   * Neither argument is modified.
   *
   * @param lhs a {@link TIntByteMap}; copied to form the result
   * @param rhs a {@link TIntByteMap} whose entries are folded into the copy
   * @return a new {@link TIntByteMap} holding the register-wise maximum
   */
  static Object combine(Object lhs, Object rhs)
  {
    final TIntByteMap merged = new TIntByteHashMap((TIntByteMap) lhs);
    mergeInto(merged, (TIntByteMap) rhs);
    return merged;
  }

  /**
   * Folds every register of {@code source} into {@code target}, keeping the larger rank.
   * Shared by {@link #combine(Object, Object)} and {@link #aggregate()} so both use one rule.
   */
  private static void mergeInto(TIntByteMap target, TIntByteMap source)
  {
    final byte noEntry = target.getNoEntryValue();
    for (int key : source.keys()) {
      final byte incoming = source.get(key);
      final byte current = target.get(key);
      if (current == noEntry || incoming > current) {
        target.put(key, incoming);
      }
    }
  }

  /**
   * @param name     name reported by {@link #getName()}
   * @param selector source of the values to aggregate
   */
  public HyperloglogAggregator(String name, ObjectColumnSelector selector)
  {
    this.name = name;
    this.selector = selector;
    this.ibMap = new TIntByteHashMap();
  }

  /**
   * Folds the selector's current value into the register map.
   * {@code null} values are ignored.
   *
   * @throws ISE if the value is neither a {@link TIntByteHashMap} nor a {@code String}
   */
  @Override
  public void aggregate()
  {
    final Object value = selector.get();
    if (value == null) {
      return;
    }

    if (value instanceof TIntByteHashMap) {
      mergeInto(ibMap, (TIntByteHashMap) value);
    } else if (value instanceof String) {
      // Log the value already fetched instead of calling selector.get() a second time.
      log.debug("value [%s]", value);

      final long id = Hashing.murmur3_128().hashString((String) (value)).asLong();
      // The top log2m bits of the hash select the register.
      final int bucket = (int) (id >>> (Long.SIZE - log2m));
      // '+' binds tighter than '|', so the value OR-ed in is (1 << (log2m - 1)) + 1. It sets bits
      // in the low log2m positions vacated by the shift, which caps the leading-zero count at
      // Long.SIZE - log2m. The parentheses below only make that existing precedence explicit.
      final int zerolength = Long.numberOfLeadingZeros((id << log2m) | ((1 << (log2m - 1)) + 1)) + 1;

      if (ibMap.get(bucket) == ibMap.getNoEntryValue() || ibMap.get(bucket) < (byte) zerolength) {
        ibMap.put(bucket, (byte) zerolength);
      }
    } else {
      throw new ISE("Aggregate does not support values of type[%s]", value.getClass().getName());
    }
  }

  /** Discards all registers by replacing the map with a fresh, empty one. */
  @Override
  public void reset()
  {
    this.ibMap = new TIntByteHashMap();
  }

  /** @return the live register map (not a copy) */
  @Override
  public Object get()
  {
    return ibMap;
  }

  /** Always throws; this aggregator has no float representation. */
  @Override
  public float getFloat()
  {
    throw new UnsupportedOperationException("HyperloglogAggregator does not support getFloat()");
  }

  @Override
  public String getName()
  {
    return name;
  }

  @Override
  public void close()
  {
    // do nothing
  }
}

View File

@ -0,0 +1,209 @@
/*
* Druid - a distributed column store.
* Copyright (C) 2012 Metamarkets Group Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
package io.druid.query.aggregation;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.Preconditions;
import com.metamx.common.logger.Logger;
import gnu.trove.map.hash.TIntByteHashMap;
import io.druid.segment.ColumnSelectorFactory;
import org.apache.commons.codec.binary.Base64;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
/**
 * Factory for the "hyperloglog" aggregator.
 *
 * <p>Creates heap ({@link HyperloglogAggregator}) and buffer ({@link HyperloglogBufferAggregator})
 * aggregators over {@code fieldName}, decodes the Base64 wire form of a register map, and turns
 * a register map into a cardinality estimate in {@link #finalizeComputation(Object)}.
 */
public class HyperloglogAggregatorFactory implements AggregatorFactory
{
  private static final Logger log = new Logger(HyperloglogAggregatorFactory.class);
  private static final byte[] CACHE_KEY = new byte[]{0x37};

  private final String name;
  private final String fieldName;

  /**
   * @param name      output name of the aggregator; must not be null
   * @param fieldName name of the input column; must not be null
   * @throws NullPointerException if either argument is null
   */
  @JsonCreator
  public HyperloglogAggregatorFactory(
      @JsonProperty("name") final String name,
      @JsonProperty("fieldName") final String fieldName
  )
  {
    Preconditions.checkNotNull(name, "Must have a valid, non-null aggregator name");
    Preconditions.checkNotNull(fieldName, "Must have a valid, non-null fieldName");

    this.name = name;
    this.fieldName = fieldName;
  }

  @Override
  public Aggregator factorize(ColumnSelectorFactory metricFactory)
  {
    return new HyperloglogAggregator(
        name,
        metricFactory.makeObjectColumnSelector(fieldName)
    );
  }

  @Override
  public BufferAggregator factorizeBuffered(
      ColumnSelectorFactory metricFactory
  )
  {
    return new HyperloglogBufferAggregator(
        metricFactory.makeObjectColumnSelector(fieldName)
    );
  }

  @Override
  public Comparator getComparator()
  {
    return HyperloglogAggregator.COMPARATOR;
  }

  /**
   * Merges two register maps; a null side yields the other side unchanged.
   */
  @Override
  public Object combine(Object lhs, Object rhs)
  {
    if (rhs == null) {
      return lhs;
    }
    if (lhs == null) {
      return rhs;
    }
    return HyperloglogAggregator.combine(lhs, rhs);
  }

  @Override
  public AggregatorFactory getCombiningFactory()
  {
    log.debug("factory name: %s", name);
    // NOTE(review): the combining factory reads from fieldName rather than from this factory's
    // output name; confirm that is intended for re-aggregating already-aggregated columns.
    return new HyperloglogAggregatorFactory(name, fieldName);
  }

  /**
   * Decodes a Base64 string into a register map. The decoded bytes are laid out as
   * {@code int keyCount, int valueCount, int[keyCount] keys, byte[valueCount] values}.
   *
   * @param object the Base64 {@code String} to decode
   * @return the decoded {@link TIntByteHashMap} (empty when the key count is 0)
   */
  @Override
  public Object deserialize(Object object)
  {
    log.debug("class name: [%s]:value [%s]", object.getClass().getName(), object);

    final String k = (String) object;
    final byte[] ibmapByte = Base64.decodeBase64(k);

    final ByteBuffer buffer = ByteBuffer.wrap(ibmapByte);
    final int keylength = buffer.getInt();
    final int valuelength = buffer.getInt();

    if (keylength == 0) {
      return new TIntByteHashMap();
    }

    final int[] keys = new int[keylength];
    final byte[] values = new byte[valuelength];
    for (int i = 0; i < keylength; i++) {
      keys[i] = buffer.getInt();
    }
    buffer.get(values);

    return new TIntByteHashMap(keys, values);
  }

  /**
   * Converts a register map into a rounded cardinality estimate.
   *
   * @param object a {@link TIntByteHashMap} of register index to rank
   * @return the estimate as a {@code Long}
   */
  @Override
  public Object finalizeComputation(Object object)
  {
    final TIntByteHashMap ibMap = (TIntByteHashMap) object;
    final int[] keys = ibMap.keys();
    final int count = keys.length;

    double registerSum = 0;
    double zeros = 0.0;

    for (int key : keys) {
      final int val = ibMap.get(key);
      // Ranks come from a 64-bit hash and can exceed 30. An int shift (1 << val) masks the shift
      // distance to 5 bits, so it wraps (and is negative at 31); Math.pow gives the exact 2^-val.
      registerSum += Math.pow(2.0, -val);
      if (val == 0) {
        zeros++;
      }
    }

    // Registers with no entry are implicitly zero: each contributes 2^0 = 1 to the sum.
    registerSum += (HyperloglogAggregator.m - count);
    zeros += HyperloglogAggregator.m - count;

    final double estimate = HyperloglogAggregator.alphaMM * (1.0 / registerSum);

    // The small-range (linear counting) correction needs at least one empty register;
    // with zeros == 0 it would evaluate log(m / 0) = Infinity and round to Long.MAX_VALUE.
    if (estimate <= (5.0 / 2.0) * (HyperloglogAggregator.m) && zeros > 0) {
      // Small Range Estimate
      return Math.round(HyperloglogAggregator.m * Math.log(HyperloglogAggregator.m / zeros));
    }
    return Math.round(estimate);
  }

  @JsonProperty
  public String getFieldName()
  {
    return fieldName;
  }

  @Override
  @JsonProperty
  public String getName()
  {
    return name;
  }

  @Override
  public List<String> requiredFields()
  {
    return Arrays.asList(fieldName);
  }

  /**
   * @return the one-byte type marker followed by the UTF-8 bytes of the field name
   */
  @Override
  public byte[] getCacheKey()
  {
    // Encode with a fixed charset so the key does not depend on the JVM's platform default.
    final byte[] fieldNameBytes = fieldName.getBytes(java.nio.charset.Charset.forName("UTF-8"));
    return ByteBuffer.allocate(1 + fieldNameBytes.length)
                     .put(CACHE_KEY)
                     .put(fieldNameBytes)
                     .array();
  }

  @Override
  public String getTypeName()
  {
    return "hyperloglog";
  }

  /**
   * @return the register count; the buffer aggregator stores one byte per register
   */
  @Override
  public int getMaxIntermediateSize()
  {
    return HyperloglogAggregator.m;
  }

  /** @return a new, empty register map */
  @Override
  public Object getAggregatorStartValue()
  {
    return new TIntByteHashMap();
  }
}

View File

@ -0,0 +1,94 @@
/*
* Druid - a distributed column store.
* Copyright (C) 2012 Metamarkets Group Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
package io.druid.query.aggregation;
import gnu.trove.map.hash.TIntByteHashMap;
import gnu.trove.procedure.TIntByteProcedure;
import io.druid.segment.ObjectColumnSelector;
import java.nio.ByteBuffer;
/**
 * Buffer-backed HyperLogLog aggregator.
 *
 * <p>Buffer layout: {@code HyperloglogAggregator.m} consecutive bytes starting at
 * {@code position}, one byte per register. Byte {@code i} holds the largest rank seen for
 * register {@code i}; 0 means the register has not been touched.
 */
public class HyperloglogBufferAggregator implements BufferAggregator
{
  private final ObjectColumnSelector selector;

  /**
   * @param selector source of the register maps to aggregate
   */
  public HyperloglogBufferAggregator(ObjectColumnSelector selector)
  {
    this.selector = selector;
  }

  /** Zeroes every register byte of the slot starting at {@code position}. */
  @Override
  public void init(ByteBuffer buf, int position)
  {
    for (int i = 0; i < HyperloglogAggregator.m; i++) {
      buf.put(position + i, (byte) 0);
    }
  }

  /**
   * Folds the selector's current register map into the slot, keeping the larger rank per
   * register. A {@code null} value is skipped, matching {@link HyperloglogAggregator#aggregate()}.
   */
  @Override
  public void aggregate(final ByteBuffer buf, final int position)
  {
    // NOTE(review): unlike the heap aggregator, a raw String value is not supported here and
    // would fail this cast.
    final TIntByteHashMap incoming = (TIntByteHashMap) (selector.get());
    if (incoming == null) {
      return;
    }

    incoming.forEachEntry(
        new TIntByteProcedure()
        {
          public boolean execute(int register, byte rank)
          {
            if (rank > buf.get(position + register)) {
              buf.put(position + register, rank);
            }
            return true;
          }
        }
    );
  }

  /**
   * @return a new sparse map containing only the non-zero registers of the slot
   */
  @Override
  public Object get(ByteBuffer buf, int position)
  {
    final TIntByteHashMap ret = new TIntByteHashMap();
    for (int i = 0; i < HyperloglogAggregator.m; i++) {
      final byte rank = buf.get(position + i);
      if (rank != 0) {
        ret.put(i, rank);
      }
    }
    return ret;
  }

  /** Always throws; this aggregator has no float representation. */
  @Override
  public float getFloat(ByteBuffer buf, int position)
  {
    throw new UnsupportedOperationException("HyperloglogAggregator does not support getFloat()");
  }

  @Override
  public void close()
  {
    // do nothing
  }
}

View File

@ -0,0 +1,137 @@
/*
* Druid - a distributed column store.
* Copyright (C) 2012 Metamarkets Group Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
package io.druid.query.aggregation;
import gnu.trove.map.hash.TIntByteHashMap;
import io.druid.data.input.InputRow;
import io.druid.segment.column.ColumnBuilder;
import io.druid.segment.column.ValueType;
import io.druid.segment.data.GenericIndexed;
import io.druid.segment.data.ObjectStrategy;
import io.druid.segment.serde.ColumnPartSerde;
import io.druid.segment.serde.ComplexColumnPartSerde;
import io.druid.segment.serde.ComplexColumnPartSupplier;
import io.druid.segment.serde.ComplexMetricExtractor;
import io.druid.segment.serde.ComplexMetricSerde;
import java.nio.ByteBuffer;
import java.util.List;
/**
 * Complex-metric serde for the "hyperloglog" type: supplies the extractor used at ingestion
 * and the {@link ObjectStrategy} that converts register maps to and from bytes.
 */
public class HyperloglogComplexMetricSerde extends ComplexMetricSerde
{
  @Override
  public String getTypeName()
  {
    return "hyperloglog";
  }

  @Override
  public ComplexMetricExtractor getExtractor()
  {
    return new HyperloglogComplexMetricExtractor();
  }

  /**
   * Reads the column from {@code buffer}, marks the builder as a COMPLEX column backed by it,
   * and returns the matching part serde.
   */
  @Override
  public ColumnPartSerde deserializeColumn(ByteBuffer buffer, ColumnBuilder builder)
  {
    final GenericIndexed indexed = GenericIndexed.read(buffer, getObjectStrategy());

    builder.setType(ValueType.COMPLEX);
    builder.setComplexColumn(new ComplexColumnPartSupplier("hyperloglog", indexed));

    return new ComplexColumnPartSerde(indexed, "hyperloglog");
  }

  @Override
  public ObjectStrategy getObjectStrategy()
  {
    return new HyperloglogObjectStrategy();
  }

  /**
   * Binary codec for register maps. Layout:
   * {@code int keyCount, int valueCount, int[keyCount] keys, byte[valueCount] values}.
   */
  public static class HyperloglogObjectStrategy implements ObjectStrategy<TIntByteHashMap>
  {
    @Override
    public Class<? extends TIntByteHashMap> getClazz()
    {
      return TIntByteHashMap.class;
    }

    /**
     * Decodes a register map from the buffer's current position, advancing it.
     * {@code numBytes} is not consulted; the lengths come from the two leading ints.
     */
    @Override
    public TIntByteHashMap fromByteBuffer(ByteBuffer buffer, int numBytes)
    {
      final int keyCount = buffer.getInt();
      final int valueCount = buffer.getInt();

      if (keyCount != 0) {
        final int[] registerIndexes = new int[keyCount];
        final byte[] ranks = new byte[valueCount];

        for (int idx = 0; idx < keyCount; idx++) {
          registerIndexes[idx] = buffer.getInt();
        }
        buffer.get(ranks);

        return new TIntByteHashMap(registerIndexes, ranks);
      }

      return new TIntByteHashMap();
    }

    /**
     * Encodes a register map using the layout described on this class.
     */
    @Override
    public byte[] toBytes(TIntByteHashMap val)
    {
      final int[] registerIndexes = val.keys();
      final byte[] ranks = val.values();

      // The heap buffer is allocated at exactly the encoded size, so its backing array is the
      // full result once everything has been written.
      final ByteBuffer out = ByteBuffer.allocate(4 * registerIndexes.length + ranks.length + 8);
      out.putInt(registerIndexes.length);
      out.putInt(ranks.length);
      for (int registerIndex : registerIndexes) {
        out.putInt(registerIndex);
      }
      out.put(ranks);

      return out.array();
    }

    // NOTE(review): returns 0 for equal maps and 1 otherwise, never a negative value, so this
    // is an equality test rather than an ordering.
    @Override
    public int compare(TIntByteHashMap o1, TIntByteHashMap o2)
    {
      if (o1.equals(o2)) {
        return 0;
      }
      return 1;
    }
  }

  /**
   * Extractor that hands back the raw row value for the metric without conversion.
   */
  public static class HyperloglogComplexMetricExtractor implements ComplexMetricExtractor
  {
    @Override
    public Class<?> extractedClass()
    {
      return List.class;
    }

    @Override
    public Object extractValue(InputRow inputRow, String metricName)
    {
      return inputRow.getRaw(metricName);
    }
  }
}

View File

@ -0,0 +1,140 @@
/*
* Druid - a distributed column store.
* Copyright (C) 2012 Metamarkets Group Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
package io.druid.query.aggregation;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.Module;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.jsontype.NamedType;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.google.common.collect.ImmutableList;
import com.google.inject.Binder;
import gnu.trove.map.hash.TIntByteHashMap;
import io.druid.initialization.DruidModule;
import io.druid.segment.serde.ComplexMetrics;
import org.apache.commons.codec.binary.Base64;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.List;
/**
*/
/**
 * Druid module that wires in the "hyperloglog" aggregator: it registers the aggregator factory
 * subtype and the Jackson (de)serializers for {@link TIntByteHashMap}, and registers the complex
 * metric serde if none is registered yet.
 */
public class HyperloglogDruidModule implements DruidModule
{
  @Override
  public List<? extends Module> getJacksonModules()
  {
    final NamedType aggregatorType = new NamedType(HyperloglogAggregatorFactory.class, "hyperloglog");
    return ImmutableList.of(
        new HyperloglogJacksonSerdeModule().registerSubtypes(aggregatorType)
    );
  }

  @Override
  public void configure(Binder binder)
  {
    // Register only when no serde is present yet for this type name.
    if (ComplexMetrics.getSerdeForType("hyperloglog") != null) {
      return;
    }
    ComplexMetrics.registerSerde("hyperloglog", new HyperloglogComplexMetricSerde());
  }

  /**
   * Jackson module that reads and writes register maps as a Base64 string whose decoded bytes
   * are {@code int keyCount, int valueCount, int[keyCount] keys, byte[valueCount] values}.
   */
  public static class HyperloglogJacksonSerdeModule extends SimpleModule
  {
    public HyperloglogJacksonSerdeModule()
    {
      super("Hyperloglog deserializers");

      addDeserializer(
          TIntByteHashMap.class,
          new JsonDeserializer<TIntByteHashMap>()
          {
            @Override
            public TIntByteHashMap deserialize(
                JsonParser jp,
                DeserializationContext ctxt
            ) throws IOException
            {
              return decode(jp.getText());
            }
          }
      );

      addSerializer(
          TIntByteHashMap.class,
          new JsonSerializer<TIntByteHashMap>()
          {
            @Override
            public void serialize(
                TIntByteHashMap ibmap,
                JsonGenerator jsonGenerator,
                SerializerProvider serializerProvider
            )
                throws IOException, JsonProcessingException
            {
              jsonGenerator.writeString(encode(ibmap));
            }
          }
      );
    }

    /** Decodes the Base64 text form into a register map; a key count of 0 yields an empty map. */
    private static TIntByteHashMap decode(String base64Text)
    {
      final ByteBuffer in = ByteBuffer.wrap(Base64.decodeBase64(base64Text));
      final int keyCount = in.getInt();
      final int valueCount = in.getInt();

      if (keyCount == 0) {
        return new TIntByteHashMap();
      }

      final int[] registerIndexes = new int[keyCount];
      final byte[] ranks = new byte[valueCount];
      for (int idx = 0; idx < keyCount; idx++) {
        registerIndexes[idx] = in.getInt();
      }
      in.get(ranks);

      return new TIntByteHashMap(registerIndexes, ranks);
    }

    /** Encodes a register map into its Base64 text form. */
    private static String encode(TIntByteHashMap ibmap)
    {
      final int[] registerIndexes = ibmap.keys();
      final byte[] ranks = ibmap.values();

      // Allocated at exactly the encoded size, so the backing array is the complete payload.
      final ByteBuffer out = ByteBuffer.allocate(4 * registerIndexes.length + ranks.length + 8);
      out.putInt(registerIndexes.length);
      out.putInt(ranks.length);
      for (int registerIndex : registerIndexes) {
        out.putInt(registerIndex);
      }
      out.put(ranks);

      return Base64.encodeBase64String(out.array());
    }
  }
}

View File

@ -0,0 +1 @@
io.druid.query.aggregation.HyperloglogDruidModule

View File

@ -41,7 +41,7 @@
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<metamx.java-util.version>0.25.2</metamx.java-util.version>
<apache.curator.version>2.3.0</apache.curator.version>
<druid.api.version>0.1.7</druid.api.version>
<druid.api.version>0.1.8</druid.api.version>
</properties>
<modules>
@ -59,6 +59,7 @@
<module>kafka-seven</module>
<module>kafka-eight</module>
<module>rabbitmq</module>
<module>hll</module>
</modules>
<dependencyManagement>

View File

@ -196,7 +196,7 @@ public class Initialization
if (!exclusions.contains(artifact.getGroupId())) {
urls.add(artifact.getFile().toURI().toURL());
} else {
log.debug("Skipped Artifact[%s]", artifact);
log.info("Skipped Artifact[%s]", artifact);
}
}