diff --git a/build.sh b/build.sh index 727a15d954d..a5b6f7e1366 100755 --- a/build.sh +++ b/build.sh @@ -30,4 +30,4 @@ echo "For examples, see: " echo " " ls -1 examples/*/*sh echo " " -echo "See also http://druid.io/docs/0.6.61" +echo "See also http://druid.io/docs/0.6.65" diff --git a/cassandra-storage/pom.xml b/cassandra-storage/pom.xml index 38ccdc30476..4858c321bad 100644 --- a/cassandra-storage/pom.xml +++ b/cassandra-storage/pom.xml @@ -28,7 +28,7 @@ io.druid druid - 0.6.63-SNAPSHOT + 0.6.66-SNAPSHOT diff --git a/common/pom.xml b/common/pom.xml index fd51a1837d0..51c19714242 100644 --- a/common/pom.xml +++ b/common/pom.xml @@ -28,7 +28,7 @@ io.druid druid - 0.6.63-SNAPSHOT + 0.6.66-SNAPSHOT diff --git a/common/src/main/java/io/druid/concurrent/Execs.java b/common/src/main/java/io/druid/concurrent/Execs.java index 308208ef98d..66af2a196ba 100644 --- a/common/src/main/java/io/druid/concurrent/Execs.java +++ b/common/src/main/java/io/druid/concurrent/Execs.java @@ -22,11 +22,13 @@ package io.druid.concurrent; import com.google.common.util.concurrent.ThreadFactoryBuilder; import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.BlockingQueue; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.RejectedExecutionException; import java.util.concurrent.RejectedExecutionHandler; import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.SynchronousQueue; import java.util.concurrent.ThreadFactory; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; @@ -60,25 +62,29 @@ public class Execs * @param capacity maximum capacity after which the executorService will block on accepting new tasks * @return ExecutorService which blocks accepting new tasks when the capacity reached */ - public static ExecutorService newBlockingSingleThreaded(String nameFormat, int capacity) + public static ExecutorService newBlockingSingleThreaded(final String nameFormat, final int capacity) { - return new ThreadPoolExecutor( - 1, 1, - 0L, TimeUnit.MILLISECONDS, - new ArrayBlockingQueue(capacity), makeThreadFactory(nameFormat) - , new RejectedExecutionHandler() - { - @Override - public void rejectedExecution(Runnable r, ThreadPoolExecutor executor) - { - try { - ((ArrayBlockingQueue) executor.getQueue()).put(r); - } - catch (InterruptedException e) { - throw new RejectedExecutionException("Got Interrupted while adding to the Queue"); - } - } + final BlockingQueue queue; + if (capacity > 0) { + queue = new ArrayBlockingQueue<>(capacity); + } else { + queue = new SynchronousQueue<>(); } + return new ThreadPoolExecutor( + 1, 1, 0L, TimeUnit.MILLISECONDS, queue, makeThreadFactory(nameFormat), + new RejectedExecutionHandler() + { + @Override + public void rejectedExecution(Runnable r, ThreadPoolExecutor executor) + { + try { + executor.getQueue().put(r); + } + catch (InterruptedException e) { + throw new RejectedExecutionException("Got Interrupted while adding to the Queue"); + } + } + } ); } } diff --git a/common/src/test/java/io/druid/concurrent/ExecsTest.java b/common/src/test/java/io/druid/concurrent/ExecsTest.java index 809ed5eac02..ae2b0e15473 100644 --- a/common/src/test/java/io/druid/concurrent/ExecsTest.java +++ b/common/src/test/java/io/druid/concurrent/ExecsTest.java @@ -20,6 +20,8 @@ package io.druid.concurrent; import com.google.common.base.Throwables; +import com.google.common.util.concurrent.ThreadFactoryBuilder; +import com.metamx.common.logger.Logger; import 
org.junit.Assert; import org.junit.Test; @@ -30,23 +32,46 @@ import java.util.concurrent.atomic.AtomicInteger; public class ExecsTest { + private static final Logger log = new Logger(ExecsTest.class); + @Test - public void testBlockingExecutorService() throws Exception + public void testBlockingExecutorServiceZeroCapacity() throws Exception { - final int capacity = 3; - final ExecutorService blockingExecutor = Execs.newBlockingSingleThreaded("test%d", capacity); - final CountDownLatch queueFullSignal = new CountDownLatch(capacity + 1); - final CountDownLatch taskCompletedSignal = new CountDownLatch(2 * capacity); + runTest(0); + } + + @Test + public void testBlockingExecutorServiceOneCapacity() throws Exception + { + runTest(1); + } + + @Test + public void testBlockingExecutorServiceThreeCapacity() throws Exception + { + runTest(3); + } + + private static void runTest(final int capacity) throws Exception + { + final int nTasks = (capacity + 1) * 3; + final ExecutorService blockingExecutor = Execs.newBlockingSingleThreaded("ExecsTest-Blocking-%d", capacity); + final CountDownLatch queueShouldBeFullSignal = new CountDownLatch(capacity + 1); + final CountDownLatch taskCompletedSignal = new CountDownLatch(nTasks); final CountDownLatch taskStartSignal = new CountDownLatch(1); final AtomicInteger producedCount = new AtomicInteger(); final AtomicInteger consumedCount = new AtomicInteger(); - ExecutorService producer = Executors.newSingleThreadExecutor(); + final ExecutorService producer = Executors.newSingleThreadExecutor( + new ThreadFactoryBuilder().setNameFormat( + "ExecsTest-Producer-%d" + ).build() + ); producer.submit( new Runnable() { public void run() { - for (int i = 0; i < 2 * capacity; i++) { + for (int i = 0; i < nTasks; i++) { final int taskID = i; System.out.println("Produced task" + taskID); blockingExecutor.submit( @@ -55,7 +80,7 @@ public class ExecsTest @Override public void run() { - System.out.println("Starting task" + taskID); + log.info("Starting task: %s", taskID); try { taskStartSignal.await(); consumedCount.incrementAndGet(); @@ -64,29 +89,31 @@ public class ExecsTest catch (Exception e) { throw Throwables.propagate(e); } - System.out.println("Completed task" + taskID); + log.info("Completed task: %s", taskID); } } ); producedCount.incrementAndGet(); - queueFullSignal.countDown(); + queueShouldBeFullSignal.countDown(); } } } ); - queueFullSignal.await(); - // verify that the producer blocks + queueShouldBeFullSignal.await(); + // Verify that the producer blocks. I don't think it's possible to be sure that the producer is blocking (since + // it could be doing nothing for any reason). But waiting a short period of time and checking that it hasn't done + // anything should hopefully be sufficient. + Thread.sleep(500); Assert.assertEquals(capacity + 1, producedCount.get()); // let the tasks run taskStartSignal.countDown(); // wait until all tasks complete taskCompletedSignal.await(); // verify all tasks consumed - Assert.assertEquals(2 * capacity, consumedCount.get()); + Assert.assertEquals(nTasks, consumedCount.get()); // cleanup blockingExecutor.shutdown(); producer.shutdown(); - } } diff --git a/docs/content/Aggregations.md b/docs/content/Aggregations.md index 93bfb76c90e..74ad226ff81 100644 --- a/docs/content/Aggregations.md +++ b/docs/content/Aggregations.md @@ -82,3 +82,13 @@ All JavaScript functions must return numerical values. 
"fnReset" : "function() { return 10; }" } ``` + +### Complex aggregators + +#### `hyperUnique` aggregator + +`hyperUnique` uses [Hyperloglog](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf) to compute the estimated cardinality of a dimension. + +```json +{ "type" : "hyperUnique", "name" : , "fieldName" : } +``` diff --git a/docs/content/DataSource.md b/docs/content/DataSource.md new file mode 100644 index 00000000000..49c583561ba --- /dev/null +++ b/docs/content/DataSource.md @@ -0,0 +1,25 @@ +--- +layout: doc_page +--- +A data source is the Druid equivalent of a database table. However, a query can also masquerade as a data source, providing subquery-like functionality. Query data sources are currently only supported by [GroupBy](GroupByQuery.html) queries. + +### Table Data Source +The table data source the most common type. It's represented by a string, or by the full structure: + +```json +{ + "type": "table", + "name": +} +``` + +### Query Data Source +```json +{ + "type": "query", + "query": { + "type": "groupBy", + ... + } +} +``` diff --git a/docs/content/Examples.md b/docs/content/Examples.md index 495891f6da9..6c2589b2160 100644 --- a/docs/content/Examples.md +++ b/docs/content/Examples.md @@ -19,13 +19,13 @@ Clone Druid and build it: git clone https://github.com/metamx/druid.git druid cd druid git fetch --tags -git checkout druid-0.6.61 +git checkout druid-0.6.65 ./build.sh ``` ### Downloading the DSK (Druid Standalone Kit) -[Download](http://static.druid.io/artifacts/releases/druid-services-0.6.61-bin.tar.gz) a stand-alone tarball and run it: +[Download](http://static.druid.io/artifacts/releases/druid-services-0.6.65-bin.tar.gz) a stand-alone tarball and run it: ``` bash tar -xzf druid-services-0.X.X-bin.tar.gz diff --git a/docs/content/GroupByQuery.md b/docs/content/GroupByQuery.md index 9edca5d2861..ca6ef8e277a 100644 --- a/docs/content/GroupByQuery.md +++ b/docs/content/GroupByQuery.md @@ -48,7 +48,7 @@ There are 9 main parts to a groupBy query: |property|description|required?| |--------|-----------|---------| |queryType|This String should always be "groupBy"; this is the first thing Druid looks at to figure out how to interpret the query|yes| -|dataSource|A String defining the data source to query, very similar to a table in a relational database|yes| +|dataSource|A String defining the data source to query, very similar to a table in a relational database, or a [DataSource](DataSource.html) structure.|yes| |dimensions|A JSON list of dimensions to do the groupBy over|yes| |orderBy|See [OrderBy](OrderBy.html).|no| |having|See [Having](Having.html).|no| diff --git a/docs/content/Indexing-Service-Config.md b/docs/content/Indexing-Service-Config.md index 88f89220d21..79c5462584b 100644 --- a/docs/content/Indexing-Service-Config.md +++ b/docs/content/Indexing-Service-Config.md @@ -66,7 +66,7 @@ druid.host=#{IP_ADDR}:8080 druid.port=8080 druid.service=druid/prod/indexer -druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.61"] +druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.65"] druid.zk.service.host=#{ZK_IPs} druid.zk.paths.base=/druid/prod @@ -115,7 +115,7 @@ druid.host=#{IP_ADDR}:8080 druid.port=8080 druid.service=druid/prod/worker -druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.61","io.druid.extensions:druid-kafka-seven:0.6.61"] +druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.65","io.druid.extensions:druid-kafka-seven:0.6.65"] druid.zk.service.host=#{ZK_IPs} 
druid.zk.paths.base=/druid/prod diff --git a/docs/content/Post-aggregations.md b/docs/content/Post-aggregations.md index 527d64e7971..4dce46ceff1 100644 --- a/docs/content/Post-aggregations.md +++ b/docs/content/Post-aggregations.md @@ -64,6 +64,31 @@ Example JavaScript aggregator: "function": "function(delta, total) { return 100 * Math.abs(delta) / total; }" } ``` +### `hyperUniqueCardinality` post-aggregator + +The hyperUniqueCardinality post-aggregator is used to wrap a hyperUnique object so that it can be used in post-aggregations. + +```json +{ "type" : "hyperUniqueCardinality", "fieldName" : <name of the hyperUnique aggregator> } +``` + +It can be used in a sample calculation like so: + +```json + "aggregations" : [ + {"type" : "count", "name" : "rows"}, + {"type" : "hyperUnique", "name" : "unique_users", "fieldName" : "uniques"} + ], + "postAggregations" : [{ + "type" : "arithmetic", + "name" : "average_users_per_row", + "fn" : "/", + "fields" : [ + { "type" : "hyperUniqueCardinality", "fieldName" : "unique_users" }, + { "type" : "fieldAccess", "name" : "rows", "fieldName" : "rows" } + ] + }] +``` ### Example Usage @@ -98,4 +123,4 @@ The format of the query JSON is as follows: ... } -``` +``` \ No newline at end of file diff --git a/docs/content/Realtime-Config.md b/docs/content/Realtime-Config.md index 551d6704ae8..89cdc1dd630 100644 --- a/docs/content/Realtime-Config.md +++ b/docs/content/Realtime-Config.md @@ -27,7 +27,7 @@ druid.host=localhost druid.service=realtime druid.port=8083 -druid.extensions.coordinates=["io.druid.extensions:druid-kafka-seven:0.6.61"] +druid.extensions.coordinates=["io.druid.extensions:druid-kafka-seven:0.6.65"] druid.zk.service.host=localhost @@ -76,7 +76,7 @@ druid.host=#{IP_ADDR}:8080 druid.port=8080 druid.service=druid/prod/realtime -druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.61","io.druid.extensions:druid-kafka-seven:0.6.61"] +druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.65","io.druid.extensions:druid-kafka-seven:0.6.65"] druid.zk.service.host=#{ZK_IPs} druid.zk.paths.base=/druid/prod diff --git a/docs/content/Tutorial:-A-First-Look-at-Druid.md b/docs/content/Tutorial:-A-First-Look-at-Druid.md index cbf04e9a708..8ea276eb0d6 100644 --- a/docs/content/Tutorial:-A-First-Look-at-Druid.md +++ b/docs/content/Tutorial:-A-First-Look-at-Druid.md @@ -49,7 +49,7 @@ There are two ways to setup Druid: download a tarball, or [Build From Source](Bu ### Download a Tarball -We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.61-bin.tar.gz). Download this file to a directory of your choosing. +We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.65-bin.tar.gz). Download this file to a directory of your choosing. You can extract the awesomeness within by issuing: ``` tar -zxvf druid-services-*-bin.tar.gz ``` Not too lost so far right? That's great!
If you cd into the directory: ``` -cd druid-services-0.6.61 +cd druid-services-0.6.65 ``` You should see a bunch of files: diff --git a/docs/content/Tutorial:-The-Druid-Cluster.md b/docs/content/Tutorial:-The-Druid-Cluster.md index 958cd580752..e3a805bb19c 100644 --- a/docs/content/Tutorial:-The-Druid-Cluster.md +++ b/docs/content/Tutorial:-The-Druid-Cluster.md @@ -13,7 +13,7 @@ In this tutorial, we will set up other types of Druid nodes and external depende If you followed the first tutorial, you should already have Druid downloaded. If not, let's go back and do that first. -You can download the latest version of druid [here](http://static.druid.io/artifacts/releases/druid-services-0.6.61-bin.tar.gz) +You can download the latest version of druid [here](http://static.druid.io/artifacts/releases/druid-services-0.6.65-bin.tar.gz) and untar the contents within by issuing: @@ -149,7 +149,7 @@ druid.port=8081 druid.zk.service.host=localhost -druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.61"] +druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.65"] # Dummy read only AWS account (used to download example data) druid.s3.secretKey=QyyfVZ7llSiRg6Qcrql1eEUG7buFpAK6T6engr1b @@ -240,7 +240,7 @@ druid.port=8083 druid.zk.service.host=localhost -druid.extensions.coordinates=["io.druid.extensions:druid-examples:0.6.61","io.druid.extensions:druid-kafka-seven:0.6.61"] +druid.extensions.coordinates=["io.druid.extensions:druid-examples:0.6.65","io.druid.extensions:druid-kafka-seven:0.6.65"] # Change this config to db to hand off to the rest of the Druid cluster druid.publish.type=noop diff --git a/docs/content/Tutorial:-Webstream.md b/docs/content/Tutorial:-Webstream.md index 481b2b897b6..e37c1d4f677 100644 --- a/docs/content/Tutorial:-Webstream.md +++ b/docs/content/Tutorial:-Webstream.md @@ -37,7 +37,7 @@ There are two ways to setup Druid: download a tarball, or [Build From Source](Bu h3. Download a Tarball -We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.61-bin.tar.gz) +We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.65-bin.tar.gz) Download this file to a directory of your choosing. You can extract the awesomeness within by issuing: @@ -48,7 +48,7 @@ tar zxvf druid-services-*-bin.tar.gz Not too lost so far right? That's great! If you cd into the directory: ``` -cd druid-services-0.6.61 +cd druid-services-0.6.65 ``` You should see a bunch of files: diff --git a/docs/content/Twitter-Tutorial.textile b/docs/content/Twitter-Tutorial.textile index 9652bdb2355..a8db1f8ed5d 100644 --- a/docs/content/Twitter-Tutorial.textile +++ b/docs/content/Twitter-Tutorial.textile @@ -9,7 +9,7 @@ There are two ways to setup Druid: download a tarball, or build it from source. h3. Download a Tarball -We've built a tarball that contains everything you'll need. You'll find it "here":http://static.druid.io/artifacts/releases/druid-services-0.6.61-bin.tar.gz. +We've built a tarball that contains everything you'll need. You'll find it "here":http://static.druid.io/artifacts/releases/druid-services-0.6.65-bin.tar.gz. Download this bad boy to a directory of your choosing. 
You can extract the awesomeness within by issuing: diff --git a/examples/bin/run_example_server.sh b/examples/bin/run_example_server.sh index f1adada0bfa..3d63e7cb0c3 100755 --- a/examples/bin/run_example_server.sh +++ b/examples/bin/run_example_server.sh @@ -55,8 +55,6 @@ JAVA_ARGS="${JAVA_ARGS} -Ddruid.realtime.specFile=${SPEC_FILE}" DRUID_CP=${EXAMPLE_LOC} #For a pull -DRUID_CP=${DRUID_CP}:`ls ${SCRIPT_DIR}/../target/druid-examples-*-selfcontained.jar` -DRUID_CP=${DRUID_CP}:`ls ${SCRIPT_DIR}/../../services/target/druid-services-*-selfcontained.jar` DRUID_CP=${DRUID_CP}:${SCRIPT_DIR}/../config/realtime #For the kit DRUID_CP=${DRUID_CP}:${SCRIPT_DIR}/lib/* diff --git a/examples/config/historical/runtime.properties b/examples/config/historical/runtime.properties index a94ec460fb3..8632e8813be 100644 --- a/examples/config/historical/runtime.properties +++ b/examples/config/historical/runtime.properties @@ -4,7 +4,7 @@ druid.port=8081 druid.zk.service.host=localhost -druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.61"] +druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.65"] # Dummy read only AWS account (used to download example data) druid.s3.secretKey=QyyfVZ7llSiRg6Qcrql1eEUG7buFpAK6T6engr1b diff --git a/examples/config/realtime/runtime.properties b/examples/config/realtime/runtime.properties index 45471c989ac..10e5f85c97e 100644 --- a/examples/config/realtime/runtime.properties +++ b/examples/config/realtime/runtime.properties @@ -4,7 +4,7 @@ druid.port=8083 druid.zk.service.host=localhost -druid.extensions.coordinates=["io.druid.extensions:druid-examples:0.6.61","io.druid.extensions:druid-kafka-seven:0.6.61","io.druid.extensions:druid-rabbitmq:0.6.61"] +druid.extensions.coordinates=["io.druid.extensions:druid-examples:0.6.65","io.druid.extensions:druid-kafka-seven:0.6.65","io.druid.extensions:druid-rabbitmq:0.6.65"] # Change this config to db to hand off to the rest of the Druid cluster druid.publish.type=noop diff --git a/examples/pom.xml b/examples/pom.xml index bce8ed35efd..69080729f79 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -28,7 +28,7 @@ io.druid druid - 0.6.63-SNAPSHOT + 0.6.66-SNAPSHOT diff --git a/hdfs-storage/pom.xml b/hdfs-storage/pom.xml index a6990497dd8..e65075d3c9d 100644 --- a/hdfs-storage/pom.xml +++ b/hdfs-storage/pom.xml @@ -28,7 +28,7 @@ io.druid druid - 0.6.63-SNAPSHOT + 0.6.66-SNAPSHOT diff --git a/hll/pom.xml b/hll/pom.xml index 09e6fba02e6..0531ab90758 100644 --- a/hll/pom.xml +++ b/hll/pom.xml @@ -28,7 +28,7 @@ io.druid druid - 0.6.63-SNAPSHOT + 0.6.66-SNAPSHOT diff --git a/indexing-hadoop/pom.xml b/indexing-hadoop/pom.xml index e164043244e..494ba1aa354 100644 --- a/indexing-hadoop/pom.xml +++ b/indexing-hadoop/pom.xml @@ -28,7 +28,7 @@ io.druid druid - 0.6.63-SNAPSHOT + 0.6.66-SNAPSHOT @@ -97,11 +97,6 @@ junit test - - com.clearspring.analytics - stream - 2.5.2 - diff --git a/indexing-hadoop/src/main/java/io/druid/indexer/DetermineHashedPartitionsJob.java b/indexing-hadoop/src/main/java/io/druid/indexer/DetermineHashedPartitionsJob.java index 407cf84dec3..ae2d61a9a93 100644 --- a/indexing-hadoop/src/main/java/io/druid/indexer/DetermineHashedPartitionsJob.java +++ b/indexing-hadoop/src/main/java/io/druid/indexer/DetermineHashedPartitionsJob.java @@ -19,8 +19,6 @@ package io.druid.indexer; -import com.clearspring.analytics.stream.cardinality.CardinalityMergeException; -import com.clearspring.analytics.stream.cardinality.HyperLogLog; import com.fasterxml.jackson.core.type.TypeReference; import 
com.google.common.base.Optional; import com.google.common.base.Throwables; @@ -36,6 +34,7 @@ import io.druid.data.input.InputRow; import io.druid.data.input.Rows; import io.druid.granularity.QueryGranularity; import io.druid.indexer.granularity.UniformGranularitySpec; +import io.druid.query.aggregation.hyperloglog.HyperLogLogCollector; import io.druid.timeline.partition.HashBasedNumberedShardSpec; import io.druid.timeline.partition.NoneShardSpec; import org.apache.hadoop.conf.Configuration; @@ -56,6 +55,7 @@ import org.joda.time.Interval; import java.io.IOException; import java.io.OutputStream; +import java.nio.ByteBuffer; import java.util.List; import java.util.Map; import java.util.Set; @@ -67,7 +67,6 @@ public class DetermineHashedPartitionsJob implements Jobby { private static final int MAX_SHARDS = 128; private static final Logger log = new Logger(DetermineHashedPartitionsJob.class); - private static final int HYPER_LOG_LOG_BIT_SIZE = 20; private final HadoopDruidIndexerConfig config; public DetermineHashedPartitionsJob( @@ -99,8 +98,8 @@ public class DetermineHashedPartitionsJob implements Jobby groupByJob.setOutputKeyClass(NullWritable.class); groupByJob.setOutputValueClass(NullWritable.class); groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class); - if(!config.getSegmentGranularIntervals().isPresent()){ - groupByJob.setNumReduceTasks(1); + if (!config.getSegmentGranularIntervals().isPresent()) { + groupByJob.setNumReduceTasks(1); } JobHelper.setupClasspath(config, groupByJob); @@ -194,7 +193,7 @@ public class DetermineHashedPartitionsJob implements Jobby { private static HashFunction hashFunction = Hashing.murmur3_128(); private QueryGranularity rollupGranularity = null; - private Map hyperLogLogs; + private Map hyperLogLogs; private HadoopDruidIndexerConfig config; private boolean determineIntervals; @@ -208,9 +207,9 @@ public class DetermineHashedPartitionsJob implements Jobby Optional> intervals = config.getSegmentGranularIntervals(); if (intervals.isPresent()) { determineIntervals = false; - final ImmutableMap.Builder builder = ImmutableMap.builder(); + final ImmutableMap.Builder builder = ImmutableMap.builder(); for (final Interval bucketInterval : intervals.get()) { - builder.put(bucketInterval, new HyperLogLog(HYPER_LOG_LOG_BIT_SIZE)); + builder.put(bucketInterval, HyperLogLogCollector.makeLatestCollector()); } hyperLogLogs = builder.build(); } else { @@ -236,7 +235,7 @@ public class DetermineHashedPartitionsJob implements Jobby interval = config.getGranularitySpec().getGranularity().bucket(new DateTime(inputRow.getTimestampFromEpoch())); if (!hyperLogLogs.containsKey(interval)) { - hyperLogLogs.put(interval, new HyperLogLog(HYPER_LOG_LOG_BIT_SIZE)); + hyperLogLogs.put(interval, HyperLogLogCollector.makeLatestCollector()); } } else { final Optional maybeInterval = config.getGranularitySpec() @@ -248,9 +247,9 @@ public class DetermineHashedPartitionsJob implements Jobby interval = maybeInterval.get(); } hyperLogLogs.get(interval) - .offerHashed( + .add( hashFunction.hashBytes(HadoopDruidIndexerConfig.jsonMapper.writeValueAsBytes(groupKey)) - .asLong() + .asBytes() ); } @@ -263,10 +262,10 @@ public class DetermineHashedPartitionsJob implements Jobby map(context.getCurrentKey(), context.getCurrentValue(), context); } - for (Map.Entry entry : hyperLogLogs.entrySet()) { + for (Map.Entry entry : hyperLogLogs.entrySet()) { context.write( new LongWritable(entry.getKey().getStartMillis()), - new BytesWritable(entry.getValue().getBytes()) + new 
BytesWritable(entry.getValue().toByteArray()) ); } cleanup(context); @@ -294,15 +293,9 @@ public class DetermineHashedPartitionsJob implements Jobby Context context ) throws IOException, InterruptedException { - HyperLogLog aggregate = new HyperLogLog(HYPER_LOG_LOG_BIT_SIZE); + HyperLogLogCollector aggregate = HyperLogLogCollector.makeLatestCollector(); for (BytesWritable value : values) { - HyperLogLog logValue = HyperLogLog.Builder.build(getDataBytes(value)); - try { - aggregate.addAll(logValue); - } - catch (CardinalityMergeException e) { - e.printStackTrace(); // TODO: check for better handling - } + aggregate.fold(ByteBuffer.wrap(value.getBytes(), 0, value.getLength())); } Interval interval = config.getGranularitySpec().getGranularity().bucket(new DateTime(key.get())); intervals.add(interval); @@ -318,7 +311,7 @@ public class DetermineHashedPartitionsJob implements Jobby } ).writeValue( out, - aggregate.cardinality() + new Double(aggregate.estimateCardinality()).longValue() ); } finally { diff --git a/indexing-service/pom.xml b/indexing-service/pom.xml index 4b3b68fb4d0..e6d7ea63442 100644 --- a/indexing-service/pom.xml +++ b/indexing-service/pom.xml @@ -28,7 +28,7 @@ io.druid druid - 0.6.63-SNAPSHOT + 0.6.66-SNAPSHOT diff --git a/indexing-service/src/main/java/io/druid/indexing/common/task/HadoopIndexTask.java b/indexing-service/src/main/java/io/druid/indexing/common/task/HadoopIndexTask.java index 30d0750f3d0..878f950f0c4 100644 --- a/indexing-service/src/main/java/io/druid/indexing/common/task/HadoopIndexTask.java +++ b/indexing-service/src/main/java/io/druid/indexing/common/task/HadoopIndexTask.java @@ -281,7 +281,7 @@ public class HadoopIndexTask extends AbstractTask Jobby job = new HadoopDruidDetermineConfigurationJob(config); - log.info("Starting a hadoop index generator job..."); + log.info("Starting a hadoop determine configuration job..."); if (job.run()) { return HadoopDruidIndexerConfig.jsonMapper.writeValueAsString(HadoopDruidIndexerConfigBuilder.toSchema(config)); } diff --git a/indexing-service/src/main/java/io/druid/indexing/common/task/RealtimeIndexTask.java b/indexing-service/src/main/java/io/druid/indexing/common/task/RealtimeIndexTask.java index 7a40035c3e6..09172da3a4d 100644 --- a/indexing-service/src/main/java/io/druid/indexing/common/task/RealtimeIndexTask.java +++ b/indexing-service/src/main/java/io/druid/indexing/common/task/RealtimeIndexTask.java @@ -109,7 +109,7 @@ public class RealtimeIndexTask extends AbstractTask @JsonProperty("firehose") FirehoseFactory firehoseFactory, @JsonProperty("fireDepartmentConfig") FireDepartmentConfig fireDepartmentConfig, @JsonProperty("windowPeriod") Period windowPeriod, - @JsonProperty("maxPendingPersists") int maxPendingPersists, + @JsonProperty("maxPendingPersists") Integer maxPendingPersists, @JsonProperty("segmentGranularity") IndexGranularity segmentGranularity, @JsonProperty("rejectionPolicy") RejectionPolicyFactory rejectionPolicyFactory ) @@ -139,7 +139,7 @@ public class RealtimeIndexTask extends AbstractTask this.firehoseFactory = firehoseFactory; this.fireDepartmentConfig = fireDepartmentConfig; this.windowPeriod = windowPeriod; - this.maxPendingPersists = (maxPendingPersists == 0) + this.maxPendingPersists = (maxPendingPersists == null) ? 
RealtimePlumberSchool.DEFAULT_MAX_PENDING_PERSISTS : maxPendingPersists; this.segmentGranularity = segmentGranularity; @@ -398,6 +398,12 @@ public class RealtimeIndexTask extends AbstractTask return windowPeriod; } + @JsonProperty + public int getMaxPendingPersists() + { + return maxPendingPersists; + } + @JsonProperty public IndexGranularity getSegmentGranularity() { diff --git a/indexing-service/src/main/java/io/druid/indexing/overlord/IndexerDBCoordinator.java b/indexing-service/src/main/java/io/druid/indexing/overlord/IndexerDBCoordinator.java index dc02ee9d4ef..2afa4ed0dd5 100644 --- a/indexing-service/src/main/java/io/druid/indexing/overlord/IndexerDBCoordinator.java +++ b/indexing-service/src/main/java/io/druid/indexing/overlord/IndexerDBCoordinator.java @@ -90,7 +90,7 @@ public class IndexerDBCoordinator final ResultIterator> dbSegments = handle.createQuery( String.format( - "SELECT payload FROM %s WHERE used = 1 AND dataSource = :dataSource", + "SELECT payload FROM %s WHERE used = true AND dataSource = :dataSource", dbTables.getSegmentsTable() ) ) @@ -304,8 +304,8 @@ public class IndexerDBCoordinator return handle.createQuery( String.format( DbConnector.isPostgreSQL(handle)? - "SELECT payload FROM %s WHERE dataSource = :dataSource and start >= :start and \"end\" <= :end and used = 0": - "SELECT payload FROM %s WHERE dataSource = :dataSource and start >= :start and end <= :end and used = 0", + "SELECT payload FROM %s WHERE dataSource = :dataSource and start >= :start and \"end\" <= :end and used = false": + "SELECT payload FROM %s WHERE dataSource = :dataSource and start >= :start and end <= :end and used = false", dbTables.getSegmentsTable() ) ) diff --git a/indexing-service/src/main/java/io/druid/indexing/overlord/ThreadPoolTaskRunner.java b/indexing-service/src/main/java/io/druid/indexing/overlord/ThreadPoolTaskRunner.java index 2cc94ac7400..0d69ed5036c 100644 --- a/indexing-service/src/main/java/io/druid/indexing/overlord/ThreadPoolTaskRunner.java +++ b/indexing-service/src/main/java/io/druid/indexing/overlord/ThreadPoolTaskRunner.java @@ -38,6 +38,7 @@ import io.druid.indexing.common.TaskToolboxFactory; import io.druid.indexing.common.task.Task; import io.druid.query.NoopQueryRunner; import io.druid.query.Query; +import io.druid.query.TableDataSource; import io.druid.query.QueryRunner; import io.druid.query.QuerySegmentWalker; import io.druid.query.SegmentDescriptor; @@ -152,10 +153,17 @@ public class ThreadPoolTaskRunner implements TaskRunner, QuerySegmentWalker private QueryRunner getQueryRunnerImpl(Query query) { QueryRunner queryRunner = null; + String queryDataSource; + try { + queryDataSource = ((TableDataSource)query.getDataSource()).getName(); + } + catch (ClassCastException e) { + throw new IllegalArgumentException("Subqueries are not welcome here"); + } for (final ThreadPoolTaskRunnerWorkItem taskRunnerWorkItem : ImmutableList.copyOf(runningItems)) { final Task task = taskRunnerWorkItem.getTask(); - if (task.getDataSource().equals(query.getDataSource())) { + if (task.getDataSource().equals(queryDataSource)) { final QueryRunner taskQueryRunner = task.getQueryRunner(query); if (taskQueryRunner != null) { @@ -163,7 +171,7 @@ public class ThreadPoolTaskRunner implements TaskRunner, QuerySegmentWalker queryRunner = taskQueryRunner; } else { log.makeAlert("Found too many query runners for datasource") - .addData("dataSource", query.getDataSource()) + .addData("dataSource", queryDataSource) .emit(); } } diff --git 
a/indexing-service/src/test/java/io/druid/indexing/common/task/TaskSerdeTest.java b/indexing-service/src/test/java/io/druid/indexing/common/task/TaskSerdeTest.java index b75169c8b9a..ba5afb04d3b 100644 --- a/indexing-service/src/test/java/io/druid/indexing/common/task/TaskSerdeTest.java +++ b/indexing-service/src/test/java/io/druid/indexing/common/task/TaskSerdeTest.java @@ -198,7 +198,7 @@ public class TaskSerdeTest null, null, new Period("PT10M"), - 1, + 5, IndexGranularity.HOUR, null ); @@ -214,6 +214,7 @@ public class TaskSerdeTest Assert.assertEquals("rofl", task.getTaskResource().getAvailabilityGroup()); Assert.assertEquals(new Period("PT10M"), task.getWindowPeriod()); Assert.assertEquals(IndexGranularity.HOUR, task.getSegmentGranularity()); + Assert.assertEquals(5, task.getMaxPendingPersists()); Assert.assertEquals(task.getId(), task2.getId()); Assert.assertEquals(task.getGroupId(), task2.getGroupId()); @@ -222,6 +223,7 @@ public class TaskSerdeTest Assert.assertEquals(task.getTaskResource().getAvailabilityGroup(), task2.getTaskResource().getAvailabilityGroup()); Assert.assertEquals(task.getWindowPeriod(), task2.getWindowPeriod()); Assert.assertEquals(task.getSegmentGranularity(), task2.getSegmentGranularity()); + Assert.assertEquals(task.getMaxPendingPersists(), task2.getMaxPendingPersists()); } @Test diff --git a/kafka-eight/pom.xml b/kafka-eight/pom.xml index 3457f0cae32..e0aefd8fbd5 100644 --- a/kafka-eight/pom.xml +++ b/kafka-eight/pom.xml @@ -28,7 +28,7 @@ io.druid druid - 0.6.63-SNAPSHOT + 0.6.66-SNAPSHOT diff --git a/kafka-seven/pom.xml b/kafka-seven/pom.xml index 0eef3c4b016..fede6b78fe4 100644 --- a/kafka-seven/pom.xml +++ b/kafka-seven/pom.xml @@ -28,7 +28,7 @@ io.druid druid - 0.6.63-SNAPSHOT + 0.6.66-SNAPSHOT diff --git a/pom.xml b/pom.xml index f83594e8aa0..742ef76d28e 100644 --- a/pom.xml +++ b/pom.xml @@ -23,7 +23,7 @@ io.druid druid pom - 0.6.63-SNAPSHOT + 0.6.66-SNAPSHOT druid druid diff --git a/processing/pom.xml b/processing/pom.xml index bcd1a378018..bb11fe8dd8a 100644 --- a/processing/pom.xml +++ b/processing/pom.xml @@ -28,7 +28,7 @@ io.druid druid - 0.6.63-SNAPSHOT + 0.6.66-SNAPSHOT diff --git a/processing/src/main/java/io/druid/jackson/AggregatorsModule.java b/processing/src/main/java/io/druid/jackson/AggregatorsModule.java index 15a76639997..3029d2bcc4e 100644 --- a/processing/src/main/java/io/druid/jackson/AggregatorsModule.java +++ b/processing/src/main/java/io/druid/jackson/AggregatorsModule.java @@ -22,6 +22,7 @@ package io.druid.jackson; import com.fasterxml.jackson.annotation.JsonSubTypes; import com.fasterxml.jackson.annotation.JsonTypeInfo; import com.fasterxml.jackson.databind.module.SimpleModule; +import com.google.common.hash.Hashing; import io.druid.query.aggregation.AggregatorFactory; import io.druid.query.aggregation.CountAggregatorFactory; import io.druid.query.aggregation.DoubleSumAggregatorFactory; @@ -31,10 +32,14 @@ import io.druid.query.aggregation.LongSumAggregatorFactory; import io.druid.query.aggregation.MaxAggregatorFactory; import io.druid.query.aggregation.MinAggregatorFactory; import io.druid.query.aggregation.PostAggregator; +import io.druid.query.aggregation.hyperloglog.HyperUniqueFinalizingPostAggregator; +import io.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory; +import io.druid.query.aggregation.hyperloglog.HyperUniquesSerde; import io.druid.query.aggregation.post.ArithmeticPostAggregator; import io.druid.query.aggregation.post.ConstantPostAggregator; import 
io.druid.query.aggregation.post.FieldAccessPostAggregator; import io.druid.query.aggregation.post.JavaScriptPostAggregator; +import io.druid.segment.serde.ComplexMetrics; /** */ @@ -44,28 +49,38 @@ public class AggregatorsModule extends SimpleModule { super("AggregatorFactories"); + if (ComplexMetrics.getSerdeForType("hyperUnique") == null) { + ComplexMetrics.registerSerde("hyperUnique", new HyperUniquesSerde(Hashing.murmur3_128())); + } + setMixInAnnotation(AggregatorFactory.class, AggregatorFactoryMixin.class); setMixInAnnotation(PostAggregator.class, PostAggregatorMixin.class); } - @JsonTypeInfo(use= JsonTypeInfo.Id.NAME, property="type") - @JsonSubTypes(value={ - @JsonSubTypes.Type(name="count", value=CountAggregatorFactory.class), - @JsonSubTypes.Type(name="longSum", value=LongSumAggregatorFactory.class), - @JsonSubTypes.Type(name="doubleSum", value=DoubleSumAggregatorFactory.class), - @JsonSubTypes.Type(name="max", value=MaxAggregatorFactory.class), - @JsonSubTypes.Type(name="min", value=MinAggregatorFactory.class), - @JsonSubTypes.Type(name="javascript", value=JavaScriptAggregatorFactory.class), - @JsonSubTypes.Type(name="histogram", value=HistogramAggregatorFactory.class) + @JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "type") + @JsonSubTypes(value = { + @JsonSubTypes.Type(name = "count", value = CountAggregatorFactory.class), + @JsonSubTypes.Type(name = "longSum", value = LongSumAggregatorFactory.class), + @JsonSubTypes.Type(name = "doubleSum", value = DoubleSumAggregatorFactory.class), + @JsonSubTypes.Type(name = "max", value = MaxAggregatorFactory.class), + @JsonSubTypes.Type(name = "min", value = MinAggregatorFactory.class), + @JsonSubTypes.Type(name = "javascript", value = JavaScriptAggregatorFactory.class), + @JsonSubTypes.Type(name = "histogram", value = HistogramAggregatorFactory.class), + @JsonSubTypes.Type(name = "hyperUnique", value = HyperUniquesAggregatorFactory.class) }) - public static interface AggregatorFactoryMixin {} + public static interface AggregatorFactoryMixin + { + } @JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "type") @JsonSubTypes(value = { @JsonSubTypes.Type(name = "arithmetic", value = ArithmeticPostAggregator.class), @JsonSubTypes.Type(name = "fieldAccess", value = FieldAccessPostAggregator.class), @JsonSubTypes.Type(name = "constant", value = ConstantPostAggregator.class), - @JsonSubTypes.Type(name = "javascript", value = JavaScriptPostAggregator.class) + @JsonSubTypes.Type(name = "javascript", value = JavaScriptPostAggregator.class), + @JsonSubTypes.Type(name = "hyperUniqueCardinality", value = HyperUniqueFinalizingPostAggregator.class) }) - public static interface PostAggregatorMixin {} + public static interface PostAggregatorMixin + { + } } diff --git a/processing/src/main/java/io/druid/query/BaseQuery.java b/processing/src/main/java/io/druid/query/BaseQuery.java index 7195e1fcd8e..ed13f9ddf39 100644 --- a/processing/src/main/java/io/druid/query/BaseQuery.java +++ b/processing/src/main/java/io/druid/query/BaseQuery.java @@ -36,13 +36,13 @@ import java.util.Map; public abstract class BaseQuery implements Query { public static String QUERYID = "queryId"; - private final String dataSource; + private final DataSource dataSource; private final Map context; private final QuerySegmentSpec querySegmentSpec; private volatile Duration duration; public BaseQuery( - String dataSource, + DataSource dataSource, QuerySegmentSpec querySegmentSpec, Map context ) @@ -50,14 +50,14 @@ public abstract class BaseQuery implements Query 
Preconditions.checkNotNull(dataSource, "dataSource can't be null"); Preconditions.checkNotNull(querySegmentSpec, "querySegmentSpec can't be null"); - this.dataSource = dataSource.toLowerCase(); + this.dataSource = dataSource; this.context = context; this.querySegmentSpec = querySegmentSpec; } @JsonProperty @Override - public String getDataSource() + public DataSource getDataSource() { return dataSource; } @@ -143,4 +143,31 @@ public abstract class BaseQuery implements Query { return withOverriddenContext(ImmutableMap.of(QUERYID, id)); } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + BaseQuery baseQuery = (BaseQuery) o; + + if (context != null ? !context.equals(baseQuery.context) : baseQuery.context != null) return false; + if (dataSource != null ? !dataSource.equals(baseQuery.dataSource) : baseQuery.dataSource != null) return false; + if (duration != null ? !duration.equals(baseQuery.duration) : baseQuery.duration != null) return false; + if (querySegmentSpec != null ? !querySegmentSpec.equals(baseQuery.querySegmentSpec) : baseQuery.querySegmentSpec != null) + return false; + + return true; + } + + @Override + public int hashCode() + { + int result = dataSource != null ? dataSource.hashCode() : 0; + result = 31 * result + (context != null ? context.hashCode() : 0); + result = 31 * result + (querySegmentSpec != null ? querySegmentSpec.hashCode() : 0); + result = 31 * result + (duration != null ? duration.hashCode() : 0); + return result; + } } diff --git a/processing/src/main/java/io/druid/query/ChainedExecutionQueryRunner.java b/processing/src/main/java/io/druid/query/ChainedExecutionQueryRunner.java index 316c8d8675e..83d2ff48f98 100644 --- a/processing/src/main/java/io/druid/query/ChainedExecutionQueryRunner.java +++ b/processing/src/main/java/io/druid/query/ChainedExecutionQueryRunner.java @@ -25,6 +25,7 @@ import com.google.common.base.Throwables; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.collect.Ordering; +import com.metamx.common.ISE; import com.metamx.common.guava.BaseSequence; import com.metamx.common.guava.MergeIterable; import com.metamx.common.guava.Sequence; @@ -84,11 +85,6 @@ public class ChainedExecutionQueryRunner implements QueryRunner { final int priority = Integer.parseInt(query.getContextValue("priority", "0")); - if (Iterables.isEmpty(queryables)) { - log.warn("No queryables found."); - return Sequences.empty(); - } - return new BaseSequence>( new BaseSequence.IteratorMaker>() { @@ -111,6 +107,9 @@ public class ChainedExecutionQueryRunner implements QueryRunner public List call() throws Exception { try { + if (input == null) { + throw new ISE("Input is null?! How is this possible?!"); + } return Sequences.toList(input.run(query), Lists.newArrayList()); } catch (Exception e) { diff --git a/processing/src/main/java/io/druid/query/DataSource.java b/processing/src/main/java/io/druid/query/DataSource.java new file mode 100644 index 00000000000..a4ef603da1f --- /dev/null +++ b/processing/src/main/java/io/druid/query/DataSource.java @@ -0,0 +1,38 @@ +/* + * Druid - a distributed column store. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * This file Copyright (C) 2014 N3TWORK, Inc. and contributed to the Druid project + * under the Druid Corporate Contributor License Agreement. + */ + +package io.druid.query; + +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; + +@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, + include = JsonTypeInfo.As.PROPERTY, + property = "type", + defaultImpl = LegacyDataSource.class) +@JsonSubTypes({ + @JsonSubTypes.Type(value = TableDataSource.class, name = "table"), + @JsonSubTypes.Type(value = QueryDataSource.class, name = "query") + }) +public interface DataSource +{ + public String getName(); +} diff --git a/processing/src/main/java/io/druid/query/Druids.java b/processing/src/main/java/io/druid/query/Druids.java index 99d5acdc6f6..fd2abd13ab8 100644 --- a/processing/src/main/java/io/druid/query/Druids.java +++ b/processing/src/main/java/io/druid/query/Druids.java @@ -298,7 +298,7 @@ public class Druids */ public static class TimeseriesQueryBuilder { - private String dataSource; + private DataSource dataSource; private QuerySegmentSpec querySegmentSpec; private DimFilter dimFilter; private QueryGranularity granularity; @@ -308,7 +308,7 @@ public class Druids private TimeseriesQueryBuilder() { - dataSource = ""; + dataSource = null; querySegmentSpec = null; dimFilter = null; granularity = QueryGranularity.ALL; @@ -354,7 +354,7 @@ public class Druids .context(builder.context); } - public String getDataSource() + public DataSource getDataSource() { return dataSource; } @@ -390,6 +390,12 @@ public class Druids } public TimeseriesQueryBuilder dataSource(String ds) + { + dataSource = new TableDataSource(ds); + return this; + } + + public TimeseriesQueryBuilder dataSource(DataSource ds) { dataSource = ds; return this; @@ -492,7 +498,7 @@ public class Druids */ public static class SearchQueryBuilder { - private String dataSource; + private DataSource dataSource; private DimFilter dimFilter; private QueryGranularity granularity; private int limit; @@ -503,7 +509,7 @@ public class Druids public SearchQueryBuilder() { - dataSource = ""; + dataSource = null; dimFilter = null; granularity = QueryGranularity.ALL; limit = 0; @@ -531,7 +537,7 @@ public class Druids public SearchQueryBuilder copy(SearchQuery query) { return new SearchQueryBuilder() - .dataSource(query.getDataSource()) + .dataSource(((TableDataSource)query.getDataSource()).getName()) .intervals(query.getQuerySegmentSpec()) .filters(query.getDimensionsFilter()) .granularity(query.getGranularity()) @@ -555,6 +561,12 @@ public class Druids } public SearchQueryBuilder dataSource(String d) + { + dataSource = new TableDataSource(d); + return this; + } + + public SearchQueryBuilder dataSource(DataSource d) { dataSource = d; return this; @@ -676,13 +688,13 @@ public class Druids */ public static class TimeBoundaryQueryBuilder { - private String dataSource; + private DataSource dataSource; private QuerySegmentSpec querySegmentSpec; private Map context; public TimeBoundaryQueryBuilder() { - dataSource = ""; + dataSource = null; 
querySegmentSpec = null; context = null; } @@ -704,9 +716,15 @@ public class Druids .context(builder.context); } - public TimeBoundaryQueryBuilder dataSource(String d) + public TimeBoundaryQueryBuilder dataSource(String ds) { - dataSource = d; + dataSource = new TableDataSource(ds); + return this; + } + + public TimeBoundaryQueryBuilder dataSource(DataSource ds) + { + dataSource = ds; return this; } diff --git a/processing/src/main/java/io/druid/query/LegacyDataSource.java b/processing/src/main/java/io/druid/query/LegacyDataSource.java new file mode 100644 index 00000000000..07a8c647297 --- /dev/null +++ b/processing/src/main/java/io/druid/query/LegacyDataSource.java @@ -0,0 +1,35 @@ +/* + * Druid - a distributed column store. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * This file Copyright (C) 2014 N3TWORK, Inc. and contributed to the Druid project + * under the Druid Corporate Contributor License Agreement. + */ +package io.druid.query; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonTypeName; + +@JsonTypeName("table") +public class LegacyDataSource extends TableDataSource +{ + @JsonCreator + public LegacyDataSource(String name) + { + super(name); + } +} diff --git a/processing/src/main/java/io/druid/query/Query.java b/processing/src/main/java/io/druid/query/Query.java index 6823e1220ae..d58798539ba 100644 --- a/processing/src/main/java/io/druid/query/Query.java +++ b/processing/src/main/java/io/druid/query/Query.java @@ -56,7 +56,7 @@ public interface Query public static final String SELECT = "select"; public static final String TOPN = "topN"; - public String getDataSource(); + public DataSource getDataSource(); public boolean hasFilters(); diff --git a/processing/src/main/java/io/druid/query/QueryDataSource.java b/processing/src/main/java/io/druid/query/QueryDataSource.java new file mode 100644 index 00000000000..3f0c397f6d4 --- /dev/null +++ b/processing/src/main/java/io/druid/query/QueryDataSource.java @@ -0,0 +1,78 @@ +/* + * Druid - a distributed column store. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * This file Copyright (C) 2014 N3TWORK, Inc. and contributed to the Druid project + * under the Druid Corporate Contributor License Agreement. + */ + +package io.druid.query; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonTypeName; + +@JsonTypeName("query") +public class QueryDataSource implements DataSource +{ + @JsonProperty + private final Query query; + + @JsonCreator + public QueryDataSource(@JsonProperty("query") Query query) + { + this.query = query; + } + + @Override + public String getName() + { + return query.getDataSource().getName(); + } + + @JsonProperty + public Query getQuery() + { + return query; + } + + public String toString() { return query.toString(); } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + QueryDataSource that = (QueryDataSource) o; + + if (!query.equals(that.query)) { + return false; + } + + return true; + } + + @Override + public int hashCode() + { + return query.hashCode(); + } +} diff --git a/processing/src/main/java/io/druid/query/SubqueryQueryRunner.java b/processing/src/main/java/io/druid/query/SubqueryQueryRunner.java new file mode 100644 index 00000000000..8e13d9219e9 --- /dev/null +++ b/processing/src/main/java/io/druid/query/SubqueryQueryRunner.java @@ -0,0 +1,48 @@ +/* + * Druid - a distributed column store. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * This file Copyright (C) 2014 N3TWORK, Inc. and contributed to the Druid project + * under the Druid Corporate Contributor License Agreement. + */ + +package io.druid.query; + +import com.metamx.common.guava.Sequence; + +/** + * If there's a subquery, run it instead of the outer query + */ +public class SubqueryQueryRunner implements QueryRunner +{ + private final QueryRunner baseRunner; + + public SubqueryQueryRunner(QueryRunner baseRunner) + { + this.baseRunner = baseRunner; + } + + @Override + public Sequence run(final Query query) + { + DataSource dataSource = query.getDataSource(); + if (dataSource instanceof QueryDataSource) { + return run((Query) ((QueryDataSource) dataSource).getQuery()); + } else { + return baseRunner.run(query); + } + } +} diff --git a/processing/src/main/java/io/druid/query/TableDataSource.java b/processing/src/main/java/io/druid/query/TableDataSource.java new file mode 100644 index 00000000000..b658454cbc1 --- /dev/null +++ b/processing/src/main/java/io/druid/query/TableDataSource.java @@ -0,0 +1,72 @@ +/* + * Druid - a distributed column store. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * This file Copyright (C) 2014 N3TWORK, Inc. and contributed to the Druid project + * under the Druid Corporate Contributor License Agreement. + */ +package io.druid.query; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonTypeName; + +@JsonTypeName("table") +public class TableDataSource implements DataSource +{ + @JsonProperty + private final String name; + + @JsonCreator + public TableDataSource(@JsonProperty("name") String name) + { + this.name = (name == null ? null : name.toLowerCase()); + } + + @JsonProperty + @Override + public String getName() + { + return name; + } + + public String toString() { return name; } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (!(o instanceof TableDataSource)) { + return false; + } + + TableDataSource that = (TableDataSource) o; + + if (!name.equals(that.name)) { + return false; + } + + return true; + } + + @Override + public int hashCode() + { + return name.hashCode(); + } +} diff --git a/processing/src/main/java/io/druid/query/aggregation/CountAggregatorFactory.java b/processing/src/main/java/io/druid/query/aggregation/CountAggregatorFactory.java index 1b02f923996..e47999e8719 100644 --- a/processing/src/main/java/io/druid/query/aggregation/CountAggregatorFactory.java +++ b/processing/src/main/java/io/druid/query/aggregation/CountAggregatorFactory.java @@ -132,4 +132,23 @@ public class CountAggregatorFactory implements AggregatorFactory "name='" + name + '\'' + '}'; } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + CountAggregatorFactory that = (CountAggregatorFactory) o; + + if (name != null ? !name.equals(that.name) : that.name != null) return false; + + return true; + } + + @Override + public int hashCode() + { + return name != null ? name.hashCode() : 0; + } } diff --git a/processing/src/main/java/io/druid/query/aggregation/DoubleSumAggregatorFactory.java b/processing/src/main/java/io/druid/query/aggregation/DoubleSumAggregatorFactory.java index f85d0c677f5..ebd4e185ea3 100644 --- a/processing/src/main/java/io/druid/query/aggregation/DoubleSumAggregatorFactory.java +++ b/processing/src/main/java/io/druid/query/aggregation/DoubleSumAggregatorFactory.java @@ -150,4 +150,26 @@ public class DoubleSumAggregatorFactory implements AggregatorFactory ", name='" + name + '\'' + '}'; } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + DoubleSumAggregatorFactory that = (DoubleSumAggregatorFactory) o; + + if (fieldName != null ? 
!fieldName.equals(that.fieldName) : that.fieldName != null) return false; + if (name != null ? !name.equals(that.name) : that.name != null) return false; + + return true; + } + + @Override + public int hashCode() + { + int result = fieldName != null ? fieldName.hashCode() : 0; + result = 31 * result + (name != null ? name.hashCode() : 0); + return result; + } } diff --git a/processing/src/main/java/io/druid/query/aggregation/HistogramAggregatorFactory.java b/processing/src/main/java/io/druid/query/aggregation/HistogramAggregatorFactory.java index 2abd19d2330..060d40d2798 100644 --- a/processing/src/main/java/io/druid/query/aggregation/HistogramAggregatorFactory.java +++ b/processing/src/main/java/io/druid/query/aggregation/HistogramAggregatorFactory.java @@ -179,4 +179,30 @@ public class HistogramAggregatorFactory implements AggregatorFactory ", breaks=" + Arrays.toString(breaks) + '}'; } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + HistogramAggregatorFactory that = (HistogramAggregatorFactory) o; + + if (!Arrays.equals(breaks, that.breaks)) return false; + if (breaksList != null ? !breaksList.equals(that.breaksList) : that.breaksList != null) return false; + if (fieldName != null ? !fieldName.equals(that.fieldName) : that.fieldName != null) return false; + if (name != null ? !name.equals(that.name) : that.name != null) return false; + + return true; + } + + @Override + public int hashCode() + { + int result = name != null ? name.hashCode() : 0; + result = 31 * result + (fieldName != null ? fieldName.hashCode() : 0); + result = 31 * result + (breaksList != null ? breaksList.hashCode() : 0); + result = 31 * result + (breaks != null ? Arrays.hashCode(breaks) : 0); + return result; + } } diff --git a/processing/src/main/java/io/druid/query/aggregation/JavaScriptAggregatorFactory.java b/processing/src/main/java/io/druid/query/aggregation/JavaScriptAggregatorFactory.java index 927ab89676f..6de6be09ad8 100644 --- a/processing/src/main/java/io/druid/query/aggregation/JavaScriptAggregatorFactory.java +++ b/processing/src/main/java/io/druid/query/aggregation/JavaScriptAggregatorFactory.java @@ -317,4 +317,35 @@ public class JavaScriptAggregatorFactory implements AggregatorFactory } }; } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + JavaScriptAggregatorFactory that = (JavaScriptAggregatorFactory) o; + + if (compiledScript != null ? !compiledScript.equals(that.compiledScript) : that.compiledScript != null) + return false; + if (fieldNames != null ? !fieldNames.equals(that.fieldNames) : that.fieldNames != null) return false; + if (fnAggregate != null ? !fnAggregate.equals(that.fnAggregate) : that.fnAggregate != null) return false; + if (fnCombine != null ? !fnCombine.equals(that.fnCombine) : that.fnCombine != null) return false; + if (fnReset != null ? !fnReset.equals(that.fnReset) : that.fnReset != null) return false; + if (name != null ? !name.equals(that.name) : that.name != null) return false; + + return true; + } + + @Override + public int hashCode() + { + int result = name != null ? name.hashCode() : 0; + result = 31 * result + (fieldNames != null ? fieldNames.hashCode() : 0); + result = 31 * result + (fnAggregate != null ? fnAggregate.hashCode() : 0); + result = 31 * result + (fnReset != null ? fnReset.hashCode() : 0); + result = 31 * result + (fnCombine != null ? 
fnCombine.hashCode() : 0); + result = 31 * result + (compiledScript != null ? compiledScript.hashCode() : 0); + return result; + } } diff --git a/processing/src/main/java/io/druid/query/aggregation/LongSumAggregatorFactory.java b/processing/src/main/java/io/druid/query/aggregation/LongSumAggregatorFactory.java index f1372ff024c..50ef5130756 100644 --- a/processing/src/main/java/io/druid/query/aggregation/LongSumAggregatorFactory.java +++ b/processing/src/main/java/io/druid/query/aggregation/LongSumAggregatorFactory.java @@ -150,4 +150,26 @@ public class LongSumAggregatorFactory implements AggregatorFactory ", name='" + name + '\'' + '}'; } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + LongSumAggregatorFactory that = (LongSumAggregatorFactory) o; + + if (fieldName != null ? !fieldName.equals(that.fieldName) : that.fieldName != null) return false; + if (name != null ? !name.equals(that.name) : that.name != null) return false; + + return true; + } + + @Override + public int hashCode() + { + int result = fieldName != null ? fieldName.hashCode() : 0; + result = 31 * result + (name != null ? name.hashCode() : 0); + return result; + } } diff --git a/processing/src/main/java/io/druid/query/aggregation/MaxAggregatorFactory.java b/processing/src/main/java/io/druid/query/aggregation/MaxAggregatorFactory.java index d66d9c5b2d9..ee8217f820b 100644 --- a/processing/src/main/java/io/druid/query/aggregation/MaxAggregatorFactory.java +++ b/processing/src/main/java/io/druid/query/aggregation/MaxAggregatorFactory.java @@ -150,4 +150,26 @@ public class MaxAggregatorFactory implements AggregatorFactory ", name='" + name + '\'' + '}'; } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + MaxAggregatorFactory that = (MaxAggregatorFactory) o; + + if (fieldName != null ? !fieldName.equals(that.fieldName) : that.fieldName != null) return false; + if (name != null ? !name.equals(that.name) : that.name != null) return false; + + return true; + } + + @Override + public int hashCode() + { + int result = fieldName != null ? fieldName.hashCode() : 0; + result = 31 * result + (name != null ? name.hashCode() : 0); + return result; + } } diff --git a/processing/src/main/java/io/druid/query/aggregation/MinAggregatorFactory.java b/processing/src/main/java/io/druid/query/aggregation/MinAggregatorFactory.java index 0a168114358..9c3d560bacf 100644 --- a/processing/src/main/java/io/druid/query/aggregation/MinAggregatorFactory.java +++ b/processing/src/main/java/io/druid/query/aggregation/MinAggregatorFactory.java @@ -150,4 +150,26 @@ public class MinAggregatorFactory implements AggregatorFactory ", name='" + name + '\'' + '}'; } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + MinAggregatorFactory that = (MinAggregatorFactory) o; + + if (fieldName != null ? !fieldName.equals(that.fieldName) : that.fieldName != null) return false; + if (name != null ? !name.equals(that.name) : that.name != null) return false; + + return true; + } + + @Override + public int hashCode() + { + int result = fieldName != null ? fieldName.hashCode() : 0; + result = 31 * result + (name != null ? 
name.hashCode() : 0); + return result; + } } diff --git a/processing/src/main/java/io/druid/query/aggregation/ToLowerCaseAggregatorFactory.java b/processing/src/main/java/io/druid/query/aggregation/ToLowerCaseAggregatorFactory.java index 457d22f2cdf..6c559ba8ec6 100644 --- a/processing/src/main/java/io/druid/query/aggregation/ToLowerCaseAggregatorFactory.java +++ b/processing/src/main/java/io/druid/query/aggregation/ToLowerCaseAggregatorFactory.java @@ -112,4 +112,24 @@ public class ToLowerCaseAggregatorFactory implements AggregatorFactory { return baseAggregatorFactory.getAggregatorStartValue(); } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + ToLowerCaseAggregatorFactory that = (ToLowerCaseAggregatorFactory) o; + + if (baseAggregatorFactory != null ? !baseAggregatorFactory.equals(that.baseAggregatorFactory) : that.baseAggregatorFactory != null) + return false; + + return true; + } + + @Override + public int hashCode() + { + return baseAggregatorFactory != null ? baseAggregatorFactory.hashCode() : 0; + } } diff --git a/processing/src/main/java/io/druid/query/aggregation/hyperloglog/ByteBitLookup.java b/processing/src/main/java/io/druid/query/aggregation/hyperloglog/ByteBitLookup.java new file mode 100644 index 00000000000..aebc908be9a --- /dev/null +++ b/processing/src/main/java/io/druid/query/aggregation/hyperloglog/ByteBitLookup.java @@ -0,0 +1,288 @@ +/* + * Druid - a distributed column store. + * Copyright (C) 2012, 2013 Metamarkets Group Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +package io.druid.query.aggregation.hyperloglog; + +/** + */ +public class ByteBitLookup +{ + public static final byte[] lookup; + + static { + lookup = new byte[256]; + + lookup[0] = 0; + lookup[1] = 1; + lookup[2] = 2; + lookup[3] = 1; + lookup[4] = 3; + lookup[5] = 1; + lookup[6] = 2; + lookup[7] = 1; + lookup[8] = 4; + lookup[9] = 1; + lookup[10] = 2; + lookup[11] = 1; + lookup[12] = 3; + lookup[13] = 1; + lookup[14] = 2; + lookup[15] = 1; + lookup[16] = 5; + lookup[17] = 1; + lookup[18] = 2; + lookup[19] = 1; + lookup[20] = 3; + lookup[21] = 1; + lookup[22] = 2; + lookup[23] = 1; + lookup[24] = 4; + lookup[25] = 1; + lookup[26] = 2; + lookup[27] = 1; + lookup[28] = 3; + lookup[29] = 1; + lookup[30] = 2; + lookup[31] = 1; + lookup[32] = 6; + lookup[33] = 1; + lookup[34] = 2; + lookup[35] = 1; + lookup[36] = 3; + lookup[37] = 1; + lookup[38] = 2; + lookup[39] = 1; + lookup[40] = 4; + lookup[41] = 1; + lookup[42] = 2; + lookup[43] = 1; + lookup[44] = 3; + lookup[45] = 1; + lookup[46] = 2; + lookup[47] = 1; + lookup[48] = 5; + lookup[49] = 1; + lookup[50] = 2; + lookup[51] = 1; + lookup[52] = 3; + lookup[53] = 1; + lookup[54] = 2; + lookup[55] = 1; + lookup[56] = 4; + lookup[57] = 1; + lookup[58] = 2; + lookup[59] = 1; + lookup[60] = 3; + lookup[61] = 1; + lookup[62] = 2; + lookup[63] = 1; + lookup[64] = 7; + lookup[65] = 1; + lookup[66] = 2; + lookup[67] = 1; + lookup[68] = 3; + lookup[69] = 1; + lookup[70] = 2; + lookup[71] = 1; + lookup[72] = 4; + lookup[73] = 1; + lookup[74] = 2; + lookup[75] = 1; + lookup[76] = 3; + lookup[77] = 1; + lookup[78] = 2; + lookup[79] = 1; + lookup[80] = 5; + lookup[81] = 1; + lookup[82] = 2; + lookup[83] = 1; + lookup[84] = 3; + lookup[85] = 1; + lookup[86] = 2; + lookup[87] = 1; + lookup[88] = 4; + lookup[89] = 1; + lookup[90] = 2; + lookup[91] = 1; + lookup[92] = 3; + lookup[93] = 1; + lookup[94] = 2; + lookup[95] = 1; + lookup[96] = 6; + lookup[97] = 1; + lookup[98] = 2; + lookup[99] = 1; + lookup[100] = 3; + lookup[101] = 1; + lookup[102] = 2; + lookup[103] = 1; + lookup[104] = 4; + lookup[105] = 1; + lookup[106] = 2; + lookup[107] = 1; + lookup[108] = 3; + lookup[109] = 1; + lookup[110] = 2; + lookup[111] = 1; + lookup[112] = 5; + lookup[113] = 1; + lookup[114] = 2; + lookup[115] = 1; + lookup[116] = 3; + lookup[117] = 1; + lookup[118] = 2; + lookup[119] = 1; + lookup[120] = 4; + lookup[121] = 1; + lookup[122] = 2; + lookup[123] = 1; + lookup[124] = 3; + lookup[125] = 1; + lookup[126] = 2; + lookup[127] = 1; + lookup[128] = 8; + lookup[129] = 1; + lookup[130] = 2; + lookup[131] = 1; + lookup[132] = 3; + lookup[133] = 1; + lookup[134] = 2; + lookup[135] = 1; + lookup[136] = 4; + lookup[137] = 1; + lookup[138] = 2; + lookup[139] = 1; + lookup[140] = 3; + lookup[141] = 1; + lookup[142] = 2; + lookup[143] = 1; + lookup[144] = 5; + lookup[145] = 1; + lookup[146] = 2; + lookup[147] = 1; + lookup[148] = 3; + lookup[149] = 1; + lookup[150] = 2; + lookup[151] = 1; + lookup[152] = 4; + lookup[153] = 1; + lookup[154] = 2; + lookup[155] = 1; + lookup[156] = 3; + lookup[157] = 1; + lookup[158] = 2; + lookup[159] = 1; + lookup[160] = 6; + lookup[161] = 1; + lookup[162] = 2; + lookup[163] = 1; + lookup[164] = 3; + lookup[165] = 1; + lookup[166] = 2; + lookup[167] = 1; + lookup[168] = 4; + lookup[169] = 1; + lookup[170] = 2; + lookup[171] = 1; + lookup[172] = 3; + lookup[173] = 1; + lookup[174] = 2; + lookup[175] = 1; + lookup[176] = 5; + lookup[177] = 1; + lookup[178] = 2; + lookup[179] = 1; + lookup[180] = 3; + lookup[181] = 1; + lookup[182] = 2; + 
lookup[183] = 1; + lookup[184] = 4; + lookup[185] = 1; + lookup[186] = 2; + lookup[187] = 1; + lookup[188] = 3; + lookup[189] = 1; + lookup[190] = 2; + lookup[191] = 1; + lookup[192] = 7; + lookup[193] = 1; + lookup[194] = 2; + lookup[195] = 1; + lookup[196] = 3; + lookup[197] = 1; + lookup[198] = 2; + lookup[199] = 1; + lookup[200] = 4; + lookup[201] = 1; + lookup[202] = 2; + lookup[203] = 1; + lookup[204] = 3; + lookup[205] = 1; + lookup[206] = 2; + lookup[207] = 1; + lookup[208] = 5; + lookup[209] = 1; + lookup[210] = 2; + lookup[211] = 1; + lookup[212] = 3; + lookup[213] = 1; + lookup[214] = 2; + lookup[215] = 1; + lookup[216] = 4; + lookup[217] = 1; + lookup[218] = 2; + lookup[219] = 1; + lookup[220] = 3; + lookup[221] = 1; + lookup[222] = 2; + lookup[223] = 1; + lookup[224] = 6; + lookup[225] = 1; + lookup[226] = 2; + lookup[227] = 1; + lookup[228] = 3; + lookup[229] = 1; + lookup[230] = 2; + lookup[231] = 1; + lookup[232] = 4; + lookup[233] = 1; + lookup[234] = 2; + lookup[235] = 1; + lookup[236] = 3; + lookup[237] = 1; + lookup[238] = 2; + lookup[239] = 1; + lookup[240] = 5; + lookup[241] = 1; + lookup[242] = 2; + lookup[243] = 1; + lookup[244] = 3; + lookup[245] = 1; + lookup[246] = 2; + lookup[247] = 1; + lookup[248] = 4; + lookup[249] = 1; + lookup[250] = 2; + lookup[251] = 1; + lookup[252] = 3; + lookup[253] = 1; + lookup[254] = 2; + lookup[255] = 1; + } +} diff --git a/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HLLCV0.java b/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HLLCV0.java new file mode 100644 index 00000000000..281a19fc9ef --- /dev/null +++ b/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HLLCV0.java @@ -0,0 +1,152 @@ +/* + * Druid - a distributed column store. + * Copyright (C) 2012, 2013 Metamarkets Group Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +package io.druid.query.aggregation.hyperloglog; + +import java.nio.ByteBuffer; + +/** + */ +@Deprecated +public class HLLCV0 extends HyperLogLogCollector +{ + /** + * Header: + * Byte 0: registerOffset + * Byte 1-2: numNonZeroRegisters + */ + public static final int NUM_NON_ZERO_REGISTERS_BYTE = 1; + public static final int HEADER_NUM_BYTES = 3; + public static final int NUM_BYTES_FOR_DENSE_STORAGE = NUM_BYTES_FOR_BUCKETS + HEADER_NUM_BYTES; + + private static final ByteBuffer defaultStorageBuffer = ByteBuffer.wrap(new byte[]{0, 0, 0}).asReadOnlyBuffer(); + + public HLLCV0() + { + super(defaultStorageBuffer); + } + + public HLLCV0(ByteBuffer buffer) + { + super(buffer); + } + + @Override + public byte getVersion() + { + return 0; + } + + @Override + public void setVersion(ByteBuffer buffer) + { + } + + @Override + public byte getRegisterOffset() + { + return getStorageBuffer().get(getInitPosition()); + } + + @Override + public void setRegisterOffset(byte registerOffset) + { + getStorageBuffer().put(getInitPosition(), registerOffset); + } + + @Override + public void setRegisterOffset(ByteBuffer buffer, byte registerOffset) + { + buffer.put(buffer.position(), registerOffset); + } + + @Override + public short getNumNonZeroRegisters() + { + return getStorageBuffer().getShort(getInitPosition() + NUM_NON_ZERO_REGISTERS_BYTE); + } + + @Override + public void setNumNonZeroRegisters(short numNonZeroRegisters) + { + getStorageBuffer().putShort(getInitPosition() + NUM_NON_ZERO_REGISTERS_BYTE, numNonZeroRegisters); + } + + @Override + public void setNumNonZeroRegisters(ByteBuffer buffer, short numNonZeroRegisters) + { + buffer.putShort(buffer.position() + NUM_NON_ZERO_REGISTERS_BYTE, numNonZeroRegisters); + } + + @Override + public byte getMaxOverflowValue() + { + return 0; + } + + @Override + public void setMaxOverflowValue(byte value) + { + } + + @Override + public void setMaxOverflowValue(ByteBuffer buffer, byte value) + { + } + + @Override + public short getMaxOverflowRegister() + { + return 0; + } + + @Override + public void setMaxOverflowRegister(short register) + { + } + + @Override + public void setMaxOverflowRegister(ByteBuffer buffer, short register) + { + } + + @Override + public int getNumHeaderBytes() + { + return HEADER_NUM_BYTES; + } + + @Override + public int getNumBytesForDenseStorage() + { + return NUM_BYTES_FOR_DENSE_STORAGE; + } + + @Override + public int getPayloadBytePosition() + { + return getInitPosition() + HEADER_NUM_BYTES; + } + + @Override + public int getPayloadBytePosition(ByteBuffer buffer) + { + return buffer.position() + HEADER_NUM_BYTES; + } +} \ No newline at end of file diff --git a/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HLLCV1.java b/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HLLCV1.java new file mode 100644 index 00000000000..3080275ec82 --- /dev/null +++ b/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HLLCV1.java @@ -0,0 +1,164 @@ +/* + * Druid - a distributed column store. + * Copyright (C) 2012, 2013 Metamarkets Group Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +package io.druid.query.aggregation.hyperloglog; + +import java.nio.ByteBuffer; + +/** + */ +public class HLLCV1 extends HyperLogLogCollector +{ + /** + * Header: + * Byte 0: version + * Byte 1: registerOffset + * Byte 2-3: numNonZeroRegisters + * Byte 4: maxOverflowValue + * Byte 5-6: maxOverflowRegister + */ + public static final byte VERSION = 0x1; + public static final int REGISTER_OFFSET_BYTE = 1; + public static final int NUM_NON_ZERO_REGISTERS_BYTE = 2; + public static final int MAX_OVERFLOW_VALUE_BYTE = 4; + public static final int MAX_OVERFLOW_REGISTER_BYTE = 5; + public static final int HEADER_NUM_BYTES = 7; + public static final int NUM_BYTES_FOR_DENSE_STORAGE = NUM_BYTES_FOR_BUCKETS + HEADER_NUM_BYTES; + + private static final ByteBuffer defaultStorageBuffer = ByteBuffer.wrap(new byte[]{VERSION, 0, 0, 0, 0, 0, 0}) + .asReadOnlyBuffer(); + + public HLLCV1() + { + super(defaultStorageBuffer); + } + + public HLLCV1(ByteBuffer buffer) + { + super(buffer); + } + + @Override + public byte getVersion() + { + return VERSION; + } + + @Override + public void setVersion(ByteBuffer buffer) + { + buffer.put(buffer.position(), VERSION); + } + + @Override + public byte getRegisterOffset() + { + return getStorageBuffer().get(getInitPosition() + REGISTER_OFFSET_BYTE); + } + + @Override + public void setRegisterOffset(byte registerOffset) + { + getStorageBuffer().put(getInitPosition() + REGISTER_OFFSET_BYTE, registerOffset); + } + + @Override + public void setRegisterOffset(ByteBuffer buffer, byte registerOffset) + { + buffer.put(buffer.position() + REGISTER_OFFSET_BYTE, registerOffset); + } + + @Override + public short getNumNonZeroRegisters() + { + return getStorageBuffer().getShort(getInitPosition() + NUM_NON_ZERO_REGISTERS_BYTE); + } + + @Override + public void setNumNonZeroRegisters(short numNonZeroRegisters) + { + getStorageBuffer().putShort(getInitPosition() + NUM_NON_ZERO_REGISTERS_BYTE, numNonZeroRegisters); + } + + @Override + public void setNumNonZeroRegisters(ByteBuffer buffer, short numNonZeroRegisters) + { + buffer.putShort(buffer.position() + NUM_NON_ZERO_REGISTERS_BYTE, numNonZeroRegisters); + } + + @Override + public byte getMaxOverflowValue() + { + return getStorageBuffer().get(getInitPosition() + MAX_OVERFLOW_VALUE_BYTE); + } + + @Override + public void setMaxOverflowValue(byte value) + { + getStorageBuffer().put(getInitPosition() + MAX_OVERFLOW_VALUE_BYTE, value); + } + + @Override + public void setMaxOverflowValue(ByteBuffer buffer, byte value) + { + buffer.put(buffer.position() + MAX_OVERFLOW_VALUE_BYTE, value); + } + + @Override + public short getMaxOverflowRegister() + { + return getStorageBuffer().getShort(getInitPosition() + MAX_OVERFLOW_REGISTER_BYTE); + } + + @Override + public void setMaxOverflowRegister(short register) + { + getStorageBuffer().putShort(getInitPosition() + MAX_OVERFLOW_REGISTER_BYTE, register); + } + + @Override + public void setMaxOverflowRegister(ByteBuffer buffer, short register) + { + buffer.putShort(buffer.position() + MAX_OVERFLOW_REGISTER_BYTE, register); + } + + @Override + public int getNumHeaderBytes() + { + return HEADER_NUM_BYTES; + } + + @Override + public int getNumBytesForDenseStorage() + { + return NUM_BYTES_FOR_DENSE_STORAGE; + } + + @Override + public 
int getPayloadBytePosition() + { + return getInitPosition() + HEADER_NUM_BYTES; + } + + @Override + public int getPayloadBytePosition(ByteBuffer buffer) + { + return buffer.position() + HEADER_NUM_BYTES; + } +} diff --git a/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HyperLogLogCollector.java b/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HyperLogLogCollector.java new file mode 100644 index 00000000000..bf32d85886c --- /dev/null +++ b/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HyperLogLogCollector.java @@ -0,0 +1,673 @@ +/* + * Druid - a distributed column store. + * Copyright (C) 2012, 2013 Metamarkets Group Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +package io.druid.query.aggregation.hyperloglog; + +import com.fasterxml.jackson.annotation.JsonValue; +import com.google.common.primitives.UnsignedBytes; +import com.metamx.common.IAE; +import com.metamx.common.ISE; +import com.metamx.common.logger.Logger; + +import java.nio.ByteBuffer; + +/** + * Implements the HyperLogLog cardinality estimator described in: + *

+ * http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf + *

+ * Run this code to see a simple indication of expected errors based on different m values: + *

+ * for (int i = 1; i < 20; ++i) { + * System.out.printf("i[%,d], val[%,d] => error[%f%%]%n", i, 2 << i, 104 / Math.sqrt(2 << i)); + * } + *

+ * This class is *not* multi-threaded. It can be passed among threads, but it is written with the assumption that + * only one thread is ever calling methods on it. + *

+ * If you have multiple threads calling methods on this concurrently, I hope you manage to get correct behavior + */ +public abstract class HyperLogLogCollector implements Comparable +{ + public static final int DENSE_THRESHOLD = 128; + public static final int BITS_FOR_BUCKETS = 11; + public static final int NUM_BUCKETS = 1 << BITS_FOR_BUCKETS; + public static final int NUM_BYTES_FOR_BUCKETS = NUM_BUCKETS / 2; + + private static final double TWO_TO_THE_SIXTY_FOUR = Math.pow(2, 64); + private static final double ALPHA = 0.7213 / (1 + 1.079 / NUM_BUCKETS); + + public static final double LOW_CORRECTION_THRESHOLD = (5 * NUM_BUCKETS) / 2.0d; + public static final double HIGH_CORRECTION_THRESHOLD = TWO_TO_THE_SIXTY_FOUR / 30.0d; + public static final double CORRECTION_PARAMETER = ALPHA * NUM_BUCKETS * NUM_BUCKETS; + + private static final Logger log = new Logger(HyperLogLogCollector.class); + private static final int bucketMask = 0x7ff; + private static final int minBytesRequired = 10; + private static final int bitsPerBucket = 4; + private static final int range = (int) Math.pow(2, bitsPerBucket) - 1; + + private final static double[][] minNumRegisterLookup = new double[64][256]; + + static { + for (int registerOffset = 0; registerOffset < 64; ++registerOffset) { + for (int register = 0; register < 256; ++register) { + final int upper = ((register & 0xf0) >> 4) + registerOffset; + final int lower = (register & 0x0f) + registerOffset; + minNumRegisterLookup[registerOffset][register] = 1.0d / Math.pow(2, upper) + 1.0d / Math.pow(2, lower); + } + } + } + + // we have to keep track of the number of zeroes in each of the two halves of the byte register (0, 1, or 2) + private final static int[] numZeroLookup = new int[256]; + + static { + for (int i = 0; i < numZeroLookup.length; ++i) { + numZeroLookup[i] = (((i & 0xf0) == 0) ? 1 : 0) + (((i & 0x0f) == 0) ? 1 : 0); + } + } + + // Methods to build the latest HLLC + public static HyperLogLogCollector makeLatestCollector() + { + return new HLLCV1(); + } + + public static HyperLogLogCollector makeCollector(ByteBuffer buffer) + { + int remaining = buffer.remaining(); + return (remaining % 3 == 0 || remaining == 1027) ? new HLLCV0(buffer) : new HLLCV1(buffer); + } + + public static int getLatestNumBytesForDenseStorage() + { + return HLLCV1.NUM_BYTES_FOR_DENSE_STORAGE; + } + + public static byte[] makeEmptyVersionedByteArray() + { + byte[] arr = new byte[getLatestNumBytesForDenseStorage()]; + arr[0] = HLLCV1.VERSION; + return arr; + } + + public static double applyCorrection(double e, int zeroCount) + { + e = CORRECTION_PARAMETER / e; + + if (e <= LOW_CORRECTION_THRESHOLD) { + return zeroCount == 0 ? 
e : NUM_BUCKETS * Math.log(NUM_BUCKETS / (double) zeroCount); + } + + if (e > HIGH_CORRECTION_THRESHOLD) { + final double ratio = e / TWO_TO_THE_SIXTY_FOUR; + if (ratio >= 1) { + // handle very unlikely case that value is > 2^64 + return Double.MAX_VALUE; + } else { + return -TWO_TO_THE_SIXTY_FOUR * Math.log(1 - ratio); + } + } + + return e; + } + + private static double estimateSparse( + final ByteBuffer buf, + final byte minNum, + final byte overflowValue, + final short overflowPosition, + final boolean isUpperNibble + ) + { + final ByteBuffer copy = buf.asReadOnlyBuffer(); + double e = 0.0d; + int zeroCount = NUM_BUCKETS - 2 * (buf.remaining() / 3); + while (copy.hasRemaining()) { + short position = copy.getShort(); + final int register = (int) copy.get() & 0xff; + if (overflowValue != 0 && position == overflowPosition) { + int upperNibble = ((register & 0xf0) >>> bitsPerBucket) + minNum; + int lowerNibble = (register & 0x0f) + minNum; + if (isUpperNibble) { + upperNibble = Math.max(upperNibble, overflowValue); + } else { + lowerNibble = Math.max(lowerNibble, overflowValue); + } + e += 1.0d / Math.pow(2, upperNibble) + 1.0d / Math.pow(2, lowerNibble); + zeroCount += (((upperNibble & 0xf0) == 0) ? 1 : 0) + (((lowerNibble & 0x0f) == 0) ? 1 : 0); + } else { + e += minNumRegisterLookup[minNum][register]; + zeroCount += numZeroLookup[register]; + } + } + + e += zeroCount; + return applyCorrection(e, zeroCount); + } + + private static double estimateDense( + final ByteBuffer buf, + final byte minNum, + final byte overflowValue, + final short overflowPosition, + final boolean isUpperNibble + ) + { + final ByteBuffer copy = buf.asReadOnlyBuffer(); + double e = 0.0d; + int zeroCount = 0; + int position = 0; + while (copy.hasRemaining()) { + final int register = (int) copy.get() & 0xff; + if (overflowValue != 0 && position == overflowPosition) { + int upperNibble = ((register & 0xf0) >>> bitsPerBucket) + minNum; + int lowerNibble = (register & 0x0f) + minNum; + if (isUpperNibble) { + upperNibble = Math.max(upperNibble, overflowValue); + } else { + lowerNibble = Math.max(lowerNibble, overflowValue); + } + e += 1.0d / Math.pow(2, upperNibble) + 1.0d / Math.pow(2, lowerNibble); + zeroCount += (((upperNibble & 0xf0) == 0) ? 1 : 0) + (((lowerNibble & 0x0f) == 0) ? 
1 : 0); + } else { + e += minNumRegisterLookup[minNum][register]; + zeroCount += numZeroLookup[register]; + } + position++; + } + + return applyCorrection(e, zeroCount); + } + + private static boolean isSparse(ByteBuffer buffer) + { + return buffer.remaining() != NUM_BYTES_FOR_BUCKETS; + } + + private volatile ByteBuffer storageBuffer; + private volatile int initPosition; + private volatile Double estimatedCardinality; + + public HyperLogLogCollector(ByteBuffer byteBuffer) + { + storageBuffer = byteBuffer.duplicate(); + initPosition = byteBuffer.position(); + estimatedCardinality = null; + } + + public abstract byte getVersion(); + + public abstract void setVersion(ByteBuffer buffer); + + public abstract byte getRegisterOffset(); + + public abstract void setRegisterOffset(byte registerOffset); + + public abstract void setRegisterOffset(ByteBuffer buffer, byte registerOffset); + + public abstract short getNumNonZeroRegisters(); + + public abstract void setNumNonZeroRegisters(short numNonZeroRegisters); + + public abstract void setNumNonZeroRegisters(ByteBuffer buffer, short numNonZeroRegisters); + + public abstract byte getMaxOverflowValue(); + + public abstract void setMaxOverflowValue(byte value); + + public abstract void setMaxOverflowValue(ByteBuffer buffer, byte value); + + public abstract short getMaxOverflowRegister(); + + public abstract void setMaxOverflowRegister(short register); + + public abstract void setMaxOverflowRegister(ByteBuffer buffer, short register); + + public abstract int getNumHeaderBytes(); + + public abstract int getNumBytesForDenseStorage(); + + public abstract int getPayloadBytePosition(); + + public abstract int getPayloadBytePosition(ByteBuffer buffer); + + protected int getInitPosition() + { + return initPosition; + } + + protected ByteBuffer getStorageBuffer() + { + return storageBuffer; + } + + public void add(byte[] hashedValue) + { + if (hashedValue.length < minBytesRequired) { + throw new IAE("Insufficient bytes, need[%d] got [%d]", minBytesRequired, hashedValue.length); + } + + estimatedCardinality = null; + + final ByteBuffer buffer = ByteBuffer.wrap(hashedValue); + + short bucket = (short) (buffer.getShort(hashedValue.length - 2) & bucketMask); + + byte positionOf1 = 0; + + for (int i = 0; i < 8; ++i) { + byte lookupVal = ByteBitLookup.lookup[UnsignedBytes.toInt(hashedValue[i])]; + switch (lookupVal) { + case 0: + positionOf1 += 8; + continue; + default: + positionOf1 += lookupVal; + i = 8; + break; + } + } + + add(bucket, positionOf1); + } + + public void add(short bucket, byte positionOf1) + { + if (storageBuffer.isReadOnly()) { + convertToMutableByteBuffer(); + } + + byte registerOffset = getRegisterOffset(); + + // discard everything outside of the range we care about + if (positionOf1 <= registerOffset) { + return; + } else if (positionOf1 > (registerOffset + range)) { + byte currMax = getMaxOverflowValue(); + if (positionOf1 > currMax) { + setMaxOverflowValue(positionOf1); + setMaxOverflowRegister(bucket); + } + return; + } + + // whatever value we add must be stored in 4 bits + short numNonZeroRegisters = addNibbleRegister(bucket, (byte) ((0xff & positionOf1) - registerOffset)); + setNumNonZeroRegisters(numNonZeroRegisters); + if (numNonZeroRegisters == NUM_BUCKETS) { + setRegisterOffset(++registerOffset); + setNumNonZeroRegisters(decrementBuckets()); + } + } + + public HyperLogLogCollector fold(HyperLogLogCollector other) + { + if (other == null || other.storageBuffer.remaining() == 0) { + return this; + } + + if (storageBuffer.isReadOnly()) { 
+ convertToMutableByteBuffer(); + } + + estimatedCardinality = null; + + if (getRegisterOffset() < other.getRegisterOffset()) { + // "Swap" the buffers so that we are folding into the one with the higher offset + ByteBuffer newStorage = ByteBuffer.allocate(other.storageBuffer.remaining()); + newStorage.put(other.storageBuffer.asReadOnlyBuffer()); + newStorage.clear(); + + other.storageBuffer = storageBuffer; + other.initPosition = initPosition; + storageBuffer = newStorage; + initPosition = 0; + } + + ByteBuffer otherBuffer = other.storageBuffer.asReadOnlyBuffer(); + byte otherOffset = other.getRegisterOffset(); + + if (storageBuffer.remaining() != getNumBytesForDenseStorage()) { + convertToDenseStorage(); + } + + byte myOffset = getRegisterOffset(); + short numNonZero = getNumNonZeroRegisters(); + + int offsetDiff = myOffset - otherOffset; + if (offsetDiff < 0) { + throw new ISE("offsetDiff[%d] < 0, shouldn't happen because of swap.", offsetDiff); + } + + byte otherOverflowValue = other.getMaxOverflowValue(); + short otherOverflowRegister = other.getMaxOverflowRegister(); + add(otherOverflowRegister, otherOverflowValue); + + int myPayloadStart = getPayloadBytePosition(); + otherBuffer.position(other.getPayloadBytePosition()); + if (isSparse(otherBuffer)) { + while (otherBuffer.hasRemaining()) { + short position = otherBuffer.getShort(); + int payloadStartPosition = position - other.getNumHeaderBytes(); + numNonZero += mergeAndStoreByteRegister( + myPayloadStart + payloadStartPosition, + offsetDiff, + otherBuffer.get() + ); + if (numNonZero == NUM_BUCKETS) { + myOffset += 1; + numNonZero = decrementBuckets(); + setRegisterOffset(myOffset); + setNumNonZeroRegisters(numNonZero); + + offsetDiff = myOffset - otherOffset; + } + } + } else { // dense + int position = getPayloadBytePosition(); + while (otherBuffer.hasRemaining()) { + numNonZero += mergeAndStoreByteRegister( + position, + offsetDiff, + otherBuffer.get() + ); + if (numNonZero == NUM_BUCKETS) { + myOffset += 1; + numNonZero = decrementBuckets(); + setRegisterOffset(myOffset); + setNumNonZeroRegisters(numNonZero); + + offsetDiff = myOffset - otherOffset; + } + position++; + } + } + + setRegisterOffset(myOffset); + setNumNonZeroRegisters(numNonZero); + + return this; + } + + public HyperLogLogCollector fold(ByteBuffer buffer) + { + return fold(makeCollector(buffer)); + } + + public ByteBuffer toByteBuffer() + { + short numNonZeroRegisters = getNumNonZeroRegisters(); + + // store sparsely + if (storageBuffer.remaining() == getNumBytesForDenseStorage() && numNonZeroRegisters < DENSE_THRESHOLD) { + ByteBuffer retVal = ByteBuffer.wrap(new byte[numNonZeroRegisters * 3 + getNumHeaderBytes()]); + setVersion(retVal); + setRegisterOffset(retVal, getRegisterOffset()); + setNumNonZeroRegisters(retVal, numNonZeroRegisters); + setMaxOverflowValue(retVal, getMaxOverflowValue()); + setMaxOverflowRegister(retVal, getMaxOverflowRegister()); + + int startPosition = getPayloadBytePosition(); + retVal.position(getPayloadBytePosition(retVal)); + for (int i = startPosition; i < startPosition + NUM_BYTES_FOR_BUCKETS; i++) { + if (storageBuffer.get(i) != 0) { + retVal.putShort((short) (0xffff & (i - initPosition))); + retVal.put(storageBuffer.get(i)); + } + } + retVal.rewind(); + return retVal.asReadOnlyBuffer(); + } + + return storageBuffer.asReadOnlyBuffer(); + } + + @JsonValue + public byte[] toByteArray() + { + final ByteBuffer buffer = toByteBuffer(); + byte[] theBytes = new byte[buffer.remaining()]; + buffer.get(theBytes); + + return theBytes; + } + + 
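  // Editor's illustrative sketch, not part of this patch: a collector is normally fed hashed values of at
  // least 10 bytes (minBytesRequired), for instance a 128-bit murmur3 hash from Guava, and then asked for
  // its estimate. Duplicate inputs hash identically and are only counted once. The helper name below is
  // hypothetical.
  private static double exampleRoundTrip()
  {
    final com.google.common.hash.HashFunction hashFn = com.google.common.hash.Hashing.murmur3_128();
    final HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
    for (String value : new String[]{"a", "b", "c", "a"}) {
      // add() expects the raw hash bytes; murmur3_128 yields 16 bytes, comfortably above the minimum.
      collector.add(hashFn.hashBytes(value.getBytes(com.google.common.base.Charsets.UTF_8)).asBytes());
    }
    return collector.estimateCardinality(); // close to 3.0 for an input this small
  }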
public double estimateCardinality() + { + if (estimatedCardinality == null) { + byte registerOffset = getRegisterOffset(); + byte overflowValue = getMaxOverflowValue(); + short overflowRegister = getMaxOverflowRegister(); + short overflowPosition = (short) (overflowRegister >>> 1); + boolean isUpperNibble = ((overflowRegister & 0x1) == 0); + + storageBuffer.position(getPayloadBytePosition()); + + if (isSparse(storageBuffer)) { + estimatedCardinality = estimateSparse( + storageBuffer, + registerOffset, + overflowValue, + overflowPosition, + isUpperNibble + ); + } else { + estimatedCardinality = estimateDense( + storageBuffer, + registerOffset, + overflowValue, + overflowPosition, + isUpperNibble + ); + } + + storageBuffer.position(initPosition); + } + return estimatedCardinality; + } + + public double estimateByteBuffer(ByteBuffer buf) + { + return makeCollector(buf).estimateCardinality(); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + HyperLogLogCollector collector = (HyperLogLogCollector) o; + + if (storageBuffer != null ? !storageBuffer.equals(collector.storageBuffer) : collector.storageBuffer != null) { + return false; + } + + return true; + } + + @Override + public int hashCode() + { + int result = storageBuffer != null ? storageBuffer.hashCode() : 0; + result = 31 * result + initPosition; + return result; + } + + @Override + public String toString() + { + return "HyperLogLogCollector{" + + "initPosition=" + initPosition + + ", version=" + getVersion() + + ", registerOffset=" + getRegisterOffset() + + ", numNonZeroRegisters=" + getNumNonZeroRegisters() + + ", maxOverflowValue=" + getMaxOverflowValue() + + ", maxOverflowRegister=" + getMaxOverflowRegister() + + '}'; + } + + private short decrementBuckets() + { + short count = 0; + int startPosition = getPayloadBytePosition(); + for (int i = startPosition; i < startPosition + NUM_BYTES_FOR_BUCKETS; i++) { + byte val = (byte) (storageBuffer.get(i) - 0x11); + if ((val & 0xf0) != 0) { + count++; + } + if ((val & 0x0f) != 0) { + count++; + } + storageBuffer.put(i, val); + } + return count; + } + + private void convertToMutableByteBuffer() + { + ByteBuffer tmpBuffer = ByteBuffer.allocate(storageBuffer.remaining()); + tmpBuffer.put(storageBuffer.asReadOnlyBuffer()); + tmpBuffer.position(0); + storageBuffer = tmpBuffer; + initPosition = 0; + } + + private void convertToDenseStorage() + { + ByteBuffer tmpBuffer = ByteBuffer.wrap(new byte[getNumBytesForDenseStorage()]); + // put header + setVersion(tmpBuffer); + setRegisterOffset(tmpBuffer, getRegisterOffset()); + setNumNonZeroRegisters(tmpBuffer, getNumNonZeroRegisters()); + setMaxOverflowValue(tmpBuffer, getMaxOverflowValue()); + setMaxOverflowRegister(tmpBuffer, getMaxOverflowRegister()); + + storageBuffer.position(getPayloadBytePosition()); + tmpBuffer.position(getPayloadBytePosition(tmpBuffer)); + // put payload + while (storageBuffer.hasRemaining()) { + tmpBuffer.put(storageBuffer.getShort(), storageBuffer.get()); + } + tmpBuffer.rewind(); + storageBuffer = tmpBuffer; + initPosition = 0; + } + + private short addNibbleRegister(short bucket, byte positionOf1) + { + short numNonZeroRegs = getNumNonZeroRegisters(); + final short position = (short) (bucket >> 1); + final boolean isUpperNibble = ((bucket & 0x1) == 0); + + byte shiftedPositionOf1 = (isUpperNibble) ? 
(byte) (positionOf1 << bitsPerBucket) : positionOf1; + + if (storageBuffer.remaining() != getNumBytesForDenseStorage()) { + convertToDenseStorage(); + } + + byte origVal = storageBuffer.get(getPayloadBytePosition() + position); + byte newValueMask = (isUpperNibble) ? (byte) 0xf0 : (byte) 0x0f; + byte originalValueMask = (byte) (newValueMask ^ 0xff); + + // if something was at zero, we have to increase the numNonZeroRegisters + if ((origVal & newValueMask) == 0 && shiftedPositionOf1 != 0) { + numNonZeroRegs++; + } + + storageBuffer.put( + getPayloadBytePosition() + position, + (byte) (UnsignedBytes.max((byte) (origVal & newValueMask), shiftedPositionOf1) | (origVal & originalValueMask)) + ); + + return numNonZeroRegs; + } + + /** + * Returns the number of registers that are no longer zero after the value was added + * + * @param position The position into the byte buffer, this position represents two "registers" + * @param offsetDiff The difference in offset between the byteToAdd and the current HyperLogLogCollector + * @param byteToAdd The byte to merge into the current HyperLogLogCollector + * + * @return + */ + private int mergeAndStoreByteRegister( + int position, + int offsetDiff, + byte byteToAdd + ) + { + if (byteToAdd == 0) { + return 0; + } + + byte currVal = storageBuffer.get(position); + + int upperNibble = currVal & 0xf0; + int lowerNibble = currVal & 0x0f; + + // subtract the differences so that the nibbles align + int otherUpper = (byteToAdd & 0xf0) - (offsetDiff << bitsPerBucket); + int otherLower = (byteToAdd & 0x0f) - offsetDiff; + + final int newUpper = Math.max(upperNibble, otherUpper); + final int newLower = Math.max(lowerNibble, otherLower); + + int numNoLongerZero = 0; + if (upperNibble == 0 && newUpper > 0) { + ++numNoLongerZero; + } + + if (lowerNibble == 0 && newLower > 0) { + ++numNoLongerZero; + } + + storageBuffer.put(position, (byte) ((newUpper | newLower) & 0xff)); + + return numNoLongerZero; + } + + @Override + public int compareTo(HyperLogLogCollector other) + { + final int lhsOffset = (int) this.getRegisterOffset() & 0xffff; + final int rhsOffset = (int) other.getRegisterOffset() & 0xffff; + + if (lhsOffset == rhsOffset) { + final int lhsNumNonZero = (int) this.getNumNonZeroRegisters() & 0xff; + final int rhsNumNonZero = (int) this.getNumNonZeroRegisters() & 0xff; + int retVal = Double.compare(lhsNumNonZero, rhsNumNonZero); + + if (retVal == 0) { + retVal = Double.compare(this.estimateCardinality(), other.estimateCardinality()); + } + + return retVal; + } else { + return Double.compare(lhsOffset, rhsOffset); + } + } +} diff --git a/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HyperUniqueFinalizingPostAggregator.java b/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HyperUniqueFinalizingPostAggregator.java new file mode 100644 index 00000000000..0286ddd70d9 --- /dev/null +++ b/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HyperUniqueFinalizingPostAggregator.java @@ -0,0 +1,69 @@ +/* + * Druid - a distributed column store. + * Copyright (C) 2012, 2013 Metamarkets Group Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +package io.druid.query.aggregation.hyperloglog; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.Sets; +import io.druid.query.aggregation.PostAggregator; + +import java.util.Comparator; +import java.util.Map; +import java.util.Set; + +/** + */ +public class HyperUniqueFinalizingPostAggregator implements PostAggregator +{ + private final String fieldName; + + @JsonCreator + public HyperUniqueFinalizingPostAggregator( + @JsonProperty("fieldName") String fieldName + ) + { + this.fieldName = fieldName; + } + + @Override + public Set getDependentFields() + { + return Sets.newHashSet(fieldName); + } + + @Override + public Comparator getComparator() + { + throw new UnsupportedOperationException(); + } + + @Override + public Object compute(Map combinedAggregators) + { + return HyperUniquesAggregatorFactory.estimateCardinality(combinedAggregators.get(fieldName)); + } + + @Override + @JsonProperty("fieldName") + public String getName() + { + return fieldName; + } +} diff --git a/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HyperUniquesAggregator.java b/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HyperUniquesAggregator.java new file mode 100644 index 00000000000..1aa8f6fd6d2 --- /dev/null +++ b/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HyperUniquesAggregator.java @@ -0,0 +1,86 @@ +/* + * Druid - a distributed column store. + * Copyright (C) 2012, 2013 Metamarkets Group Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +package io.druid.query.aggregation.hyperloglog; + +import io.druid.query.aggregation.Aggregator; +import io.druid.segment.ObjectColumnSelector; + +/** + */ +public class HyperUniquesAggregator implements Aggregator +{ + private final String name; + private final ObjectColumnSelector selector; + + private HyperLogLogCollector collector; + + public HyperUniquesAggregator( + String name, + ObjectColumnSelector selector + ) + { + this.name = name; + this.selector = selector; + + this.collector = HyperLogLogCollector.makeLatestCollector(); + } + + @Override + public void aggregate() + { + collector.fold((HyperLogLogCollector) selector.get()); + } + + @Override + public void reset() + { + collector = HyperLogLogCollector.makeLatestCollector(); + } + + @Override + public Object get() + { + return collector; + } + + @Override + public float getFloat() + { + throw new UnsupportedOperationException(); + } + + @Override + public String getName() + { + return name; + } + + @Override + public Aggregator clone() + { + return new HyperUniquesAggregator(name, selector); + } + + @Override + public void close() + { + // no resources to cleanup + } +} diff --git a/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HyperUniquesAggregatorFactory.java b/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HyperUniquesAggregatorFactory.java new file mode 100644 index 00000000000..5eb45524773 --- /dev/null +++ b/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HyperUniquesAggregatorFactory.java @@ -0,0 +1,232 @@ +/* + * Druid - a distributed column store. + * Copyright (C) 2012, 2013 Metamarkets Group Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +package io.druid.query.aggregation.hyperloglog; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.Charsets; +import com.metamx.common.IAE; +import io.druid.query.aggregation.Aggregator; +import io.druid.query.aggregation.AggregatorFactory; +import io.druid.query.aggregation.BufferAggregator; +import io.druid.query.aggregation.NoopAggregator; +import io.druid.query.aggregation.NoopBufferAggregator; +import io.druid.segment.ColumnSelectorFactory; +import io.druid.segment.ObjectColumnSelector; +import org.apache.commons.codec.binary.Base64; + +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; + +/** + */ +public class HyperUniquesAggregatorFactory implements AggregatorFactory +{ + public static Object estimateCardinality(Object object) + { + if (object == null) { + return 0; + } + + return ((HyperLogLogCollector) object).estimateCardinality(); + } + + private static final byte CACHE_TYPE_ID = 0x5; + + private final String name; + private final String fieldName; + + @JsonCreator + public HyperUniquesAggregatorFactory( + @JsonProperty("name") String name, + @JsonProperty("fieldName") String fieldName + ) + { + this.name = name; + this.fieldName = fieldName.toLowerCase(); + } + + @Override + public Aggregator factorize(ColumnSelectorFactory metricFactory) + { + ObjectColumnSelector selector = metricFactory.makeObjectColumnSelector(fieldName); + + if (selector == null) { + return new NoopAggregator(name); + } + + if (HyperLogLogCollector.class.isAssignableFrom(selector.classOfObject())) { + return new HyperUniquesAggregator(name, selector); + } + + throw new IAE( + "Incompatible type for metric[%s], expected a HyperUnique, got a %s", fieldName, selector.classOfObject() + ); + } + + @Override + public BufferAggregator factorizeBuffered(ColumnSelectorFactory metricFactory) + { + ObjectColumnSelector selector = metricFactory.makeObjectColumnSelector(fieldName); + + if (selector == null) { + return new NoopBufferAggregator(); + } + + if (HyperLogLogCollector.class.isAssignableFrom(selector.classOfObject())) { + return new HyperUniquesBufferAggregator(selector); + } + + throw new IAE( + "Incompatible type for metric[%s], expected a HyperUnique, got a %s", fieldName, selector.classOfObject() + ); + } + + @Override + public Comparator getComparator() + { + return new Comparator() + { + @Override + public int compare(HyperLogLogCollector lhs, HyperLogLogCollector rhs) + { + return lhs.compareTo(rhs); + } + }; + } + + @Override + public Object combine(Object lhs, Object rhs) + { + if (rhs == null) { + return lhs; + } + if (lhs == null) { + return rhs; + } + return ((HyperLogLogCollector) lhs).fold((HyperLogLogCollector) rhs); + } + + @Override + public AggregatorFactory getCombiningFactory() + { + return new HyperUniquesAggregatorFactory(name, name); + } + + @Override + public Object deserialize(Object object) + { + if (object instanceof byte[]) { + return HyperLogLogCollector.makeCollector(ByteBuffer.wrap((byte[]) object)); + } else if (object instanceof ByteBuffer) { + return HyperLogLogCollector.makeCollector((ByteBuffer) object); + } else if (object instanceof String) { + return HyperLogLogCollector.makeCollector( + ByteBuffer.wrap(Base64.decodeBase64(((String) object).getBytes(Charsets.UTF_8))) + ); + } + return object; + } + + @Override + + public Object finalizeComputation(Object object) + { + return estimateCardinality(object); + } + + 
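  // Editor's illustrative sketch, not part of this patch: combine() folds one intermediate
  // HyperLogLogCollector into another (this is how intermediate results from different segments are
  // typically merged), and finalizeComputation() turns the merged collector into a numeric estimate.
  // The method and parameter names below are hypothetical; both arguments are assumed non-null.
  private static double exampleCombineAndFinalize(HyperLogLogCollector fromSegmentA, HyperLogLogCollector fromSegmentB)
  {
    final HyperUniquesAggregatorFactory factory = new HyperUniquesAggregatorFactory("uniques", "user_id");
    final Object merged = factory.combine(fromSegmentA, fromSegmentB);
    return (Double) factory.finalizeComputation(merged);
  }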
@Override + @JsonProperty + public String getName() + { + return name; + } + + @Override + public List requiredFields() + { + return Arrays.asList(fieldName); + } + + @JsonProperty + public String getFieldName() + { + return fieldName; + } + + @Override + public byte[] getCacheKey() + { + byte[] fieldNameBytes = fieldName.getBytes(Charsets.UTF_8); + + return ByteBuffer.allocate(1 + fieldNameBytes.length).put(CACHE_TYPE_ID).put(fieldNameBytes).array(); + } + + @Override + public String getTypeName() + { + return "hyperUnique"; + } + + @Override + public int getMaxIntermediateSize() + { + return HyperLogLogCollector.getLatestNumBytesForDenseStorage(); + } + + @Override + public Object getAggregatorStartValue() + { + return HyperLogLogCollector.makeLatestCollector(); + } + + @Override + public String toString() + { + return "HyperUniquesAggregatorFactory{" + + "name='" + name + '\'' + + ", fieldName='" + fieldName + '\'' + + '}'; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + HyperUniquesAggregatorFactory that = (HyperUniquesAggregatorFactory) o; + + if (!fieldName.equals(that.fieldName)) return false; + if (!name.equals(that.name)) return false; + + return true; + } + + @Override + public int hashCode() + { + int result = name.hashCode(); + result = 31 * result + fieldName.hashCode(); + return result; + } +} diff --git a/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HyperUniquesBufferAggregator.java b/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HyperUniquesBufferAggregator.java new file mode 100644 index 00000000000..290fe35a370 --- /dev/null +++ b/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HyperUniquesBufferAggregator.java @@ -0,0 +1,87 @@ +/* + * Druid - a distributed column store. + * Copyright (C) 2012, 2013 Metamarkets Group Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +package io.druid.query.aggregation.hyperloglog; + +import io.druid.query.aggregation.BufferAggregator; +import io.druid.segment.ObjectColumnSelector; + +import java.nio.ByteBuffer; + +/** + */ +public class HyperUniquesBufferAggregator implements BufferAggregator +{ + private static final byte[] EMPTY_BYTES = HyperLogLogCollector.makeEmptyVersionedByteArray(); + private final ObjectColumnSelector selector; + + public HyperUniquesBufferAggregator( + ObjectColumnSelector selector + ) + { + this.selector = selector; + } + + @Override + public void init(ByteBuffer buf, int position) + { + final ByteBuffer mutationBuffer = buf.duplicate(); + mutationBuffer.position(position); + mutationBuffer.put(EMPTY_BYTES); + } + + @Override + public void aggregate(ByteBuffer buf, int position) + { + HyperLogLogCollector collector = (HyperLogLogCollector) selector.get(); + + if (collector == null) { + return; + } + + HyperLogLogCollector.makeCollector( + (ByteBuffer) buf.duplicate().position(position).limit( + position + + HyperLogLogCollector.getLatestNumBytesForDenseStorage() + ) + ).fold(collector); + } + + @Override + public Object get(ByteBuffer buf, int position) + { + ByteBuffer dataCopyBuffer = ByteBuffer.allocate(HyperLogLogCollector.getLatestNumBytesForDenseStorage()); + ByteBuffer mutationBuffer = buf.duplicate(); + mutationBuffer.position(position); + mutationBuffer.get(dataCopyBuffer.array()); + return HyperLogLogCollector.makeCollector(dataCopyBuffer); + } + + @Override + public float getFloat(ByteBuffer buf, int position) + { + throw new UnsupportedOperationException(); + } + + @Override + public void close() + { + // no resources to cleanup + } +} diff --git a/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HyperUniquesSerde.java b/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HyperUniquesSerde.java new file mode 100644 index 00000000000..b720184fc9e --- /dev/null +++ b/processing/src/main/java/io/druid/query/aggregation/hyperloglog/HyperUniquesSerde.java @@ -0,0 +1,148 @@ +/* + * Druid - a distributed column store. + * Copyright (C) 2012, 2013 Metamarkets Group Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +package io.druid.query.aggregation.hyperloglog; + +import com.google.common.base.Charsets; +import com.google.common.collect.Ordering; +import com.google.common.hash.HashFunction; +import io.druid.data.input.InputRow; +import io.druid.segment.column.ColumnBuilder; +import io.druid.segment.data.GenericIndexed; +import io.druid.segment.data.ObjectStrategy; +import io.druid.segment.serde.ColumnPartSerde; +import io.druid.segment.serde.ComplexColumnPartSerde; +import io.druid.segment.serde.ComplexColumnPartSupplier; +import io.druid.segment.serde.ComplexMetricExtractor; +import io.druid.segment.serde.ComplexMetricSerde; + +import java.nio.ByteBuffer; +import java.util.List; + +/** + */ +public class HyperUniquesSerde extends ComplexMetricSerde +{ + private static Ordering comparator = new Ordering() + { + @Override + public int compare( + HyperLogLogCollector arg1, HyperLogLogCollector arg2 + ) + { + return arg1.toByteBuffer().compareTo(arg2.toByteBuffer()); + } + }.nullsFirst(); + + private final HashFunction hashFn; + + public HyperUniquesSerde( + HashFunction hashFn + ) + { + this.hashFn = hashFn; + } + + @Override + public String getTypeName() + { + return "hyperUnique"; + } + + @Override + public ComplexMetricExtractor getExtractor() + { + return new ComplexMetricExtractor() + { + @Override + public Class extractedClass() + { + return HyperLogLogCollector.class; + } + + @Override + public HyperLogLogCollector extractValue(InputRow inputRow, String metricName) + { + HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); + + List dimValues = inputRow.getDimension(metricName); + if (dimValues == null) { + return collector; + } + + for (String dimensionValue : dimValues) { + collector.add(hashFn.hashBytes(dimensionValue.getBytes(Charsets.UTF_8)).asBytes()); + } + return collector; + } + }; + } + + @Override + public ColumnPartSerde deserializeColumn( + ByteBuffer byteBuffer, ColumnBuilder columnBuilder + ) + { + final GenericIndexed column = GenericIndexed.read(byteBuffer, getObjectStrategy()); + + columnBuilder.setComplexColumn(new ComplexColumnPartSupplier(getTypeName(), column)); + + return new ComplexColumnPartSerde(column, getTypeName()); + } + + @Override + public ObjectStrategy getObjectStrategy() + { + return new ObjectStrategy() + { + @Override + public Class getClazz() + { + return HyperLogLogCollector.class; + } + + @Override + public HyperLogLogCollector fromByteBuffer(ByteBuffer buffer, int numBytes) + { + buffer.limit(buffer.position() + numBytes); + + int remaining = buffer.remaining(); + return (remaining % 3 == 0 || remaining == 1027) ? 
new HLLCV0(buffer) : new HLLCV1(buffer); + } + + @Override + public byte[] toBytes(HyperLogLogCollector collector) + { + if (collector == null) { + return new byte[]{}; + } + ByteBuffer val = collector.toByteBuffer(); + byte[] retVal = new byte[val.remaining()]; + val.asReadOnlyBuffer().get(retVal); + return retVal; + } + + @Override + public int compare(HyperLogLogCollector o1, HyperLogLogCollector o2) + { + return comparator.compare(o1, o2); + } + }; + } +} diff --git a/processing/src/main/java/io/druid/query/aggregation/post/ArithmeticPostAggregator.java b/processing/src/main/java/io/druid/query/aggregation/post/ArithmeticPostAggregator.java index cf6881d18ad..87138dadef8 100644 --- a/processing/src/main/java/io/druid/query/aggregation/post/ArithmeticPostAggregator.java +++ b/processing/src/main/java/io/druid/query/aggregation/post/ArithmeticPostAggregator.java @@ -193,4 +193,30 @@ public class ArithmeticPostAggregator implements PostAggregator return lookupMap.keySet(); } } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + ArithmeticPostAggregator that = (ArithmeticPostAggregator) o; + + if (fields != null ? !fields.equals(that.fields) : that.fields != null) return false; + if (fnName != null ? !fnName.equals(that.fnName) : that.fnName != null) return false; + if (name != null ? !name.equals(that.name) : that.name != null) return false; + if (op != that.op) return false; + + return true; + } + + @Override + public int hashCode() + { + int result = name != null ? name.hashCode() : 0; + result = 31 * result + (fnName != null ? fnName.hashCode() : 0); + result = 31 * result + (fields != null ? fields.hashCode() : 0); + result = 31 * result + (op != null ? 
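+    // Standard null-safe, 31-multiplier hashCode matching the equals() above. The same generated pattern is
+    // added to the other post-aggregators and query types in this patch, presumably so that queries wrapped in
+    // a QueryDataSource can be compared by value (see DataSourceTest below).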
op.hashCode() : 0); + return result; + } } diff --git a/processing/src/main/java/io/druid/query/aggregation/post/ConstantPostAggregator.java b/processing/src/main/java/io/druid/query/aggregation/post/ConstantPostAggregator.java index d5dce4d3140..c61af6312da 100644 --- a/processing/src/main/java/io/druid/query/aggregation/post/ConstantPostAggregator.java +++ b/processing/src/main/java/io/druid/query/aggregation/post/ConstantPostAggregator.java @@ -21,6 +21,7 @@ package io.druid.query.aggregation.post; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.Preconditions; import com.google.common.collect.Sets; import io.druid.query.aggregation.PostAggregator; @@ -41,6 +42,8 @@ public class ConstantPostAggregator implements PostAggregator @JsonProperty("value") Number constantValue ) { + // only value should be required for constants + Preconditions.checkNotNull(constantValue, "Constant value must not be null"); this.name = name; this.constantValue = constantValue; } @@ -77,7 +80,7 @@ public class ConstantPostAggregator implements PostAggregator return name; } - @JsonProperty + @JsonProperty("value") public Number getConstantValue() { return constantValue; @@ -91,4 +94,33 @@ public class ConstantPostAggregator implements PostAggregator ", constantValue=" + constantValue + '}'; } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + ConstantPostAggregator that = (ConstantPostAggregator) o; + + if (constantValue != null && that.constantValue != null) { + if (constantValue.doubleValue() != that.constantValue.doubleValue()) + return false; + } + else if (constantValue != that.constantValue) { + return false; + } + + if (name != null ? !name.equals(that.name) : that.name != null) return false; + + return true; + } + + @Override + public int hashCode() + { + int result = name != null ? name.hashCode() : 0; + result = 31 * result + (constantValue != null ? constantValue.hashCode() : 0); + return result; + } } diff --git a/processing/src/main/java/io/druid/query/aggregation/post/FieldAccessPostAggregator.java b/processing/src/main/java/io/druid/query/aggregation/post/FieldAccessPostAggregator.java index 59962836d2e..6c2321b10fd 100644 --- a/processing/src/main/java/io/druid/query/aggregation/post/FieldAccessPostAggregator.java +++ b/processing/src/main/java/io/druid/query/aggregation/post/FieldAccessPostAggregator.java @@ -84,4 +84,26 @@ public class FieldAccessPostAggregator implements PostAggregator ", fieldName='" + fieldName + '\'' + '}'; } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + FieldAccessPostAggregator that = (FieldAccessPostAggregator) o; + + if (fieldName != null ? !fieldName.equals(that.fieldName) : that.fieldName != null) return false; + if (name != null ? !name.equals(that.name) : that.name != null) return false; + + return true; + } + + @Override + public int hashCode() + { + int result = name != null ? name.hashCode() : 0; + result = 31 * result + (fieldName != null ? 
fieldName.hashCode() : 0); + return result; + } } diff --git a/processing/src/main/java/io/druid/query/aggregation/post/JavaScriptPostAggregator.java b/processing/src/main/java/io/druid/query/aggregation/post/JavaScriptPostAggregator.java index fef4b26f0f2..666dbc64a82 100644 --- a/processing/src/main/java/io/druid/query/aggregation/post/JavaScriptPostAggregator.java +++ b/processing/src/main/java/io/druid/query/aggregation/post/JavaScriptPostAggregator.java @@ -142,4 +142,30 @@ public class JavaScriptPostAggregator implements PostAggregator { return function; } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + JavaScriptPostAggregator that = (JavaScriptPostAggregator) o; + + if (fieldNames != null ? !fieldNames.equals(that.fieldNames) : that.fieldNames != null) return false; + if (fn != null ? !fn.equals(that.fn) : that.fn != null) return false; + if (function != null ? !function.equals(that.function) : that.function != null) return false; + if (name != null ? !name.equals(that.name) : that.name != null) return false; + + return true; + } + + @Override + public int hashCode() + { + int result = name != null ? name.hashCode() : 0; + result = 31 * result + (fieldNames != null ? fieldNames.hashCode() : 0); + result = 31 * result + (function != null ? function.hashCode() : 0); + result = 31 * result + (fn != null ? fn.hashCode() : 0); + return result; + } } diff --git a/processing/src/main/java/io/druid/query/dimension/DefaultDimensionSpec.java b/processing/src/main/java/io/druid/query/dimension/DefaultDimensionSpec.java index 3a9137b008e..8e18ce61228 100644 --- a/processing/src/main/java/io/druid/query/dimension/DefaultDimensionSpec.java +++ b/processing/src/main/java/io/druid/query/dimension/DefaultDimensionSpec.java @@ -84,4 +84,26 @@ public class DefaultDimensionSpec implements DimensionSpec ", outputName='" + outputName + '\'' + '}'; } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + DefaultDimensionSpec that = (DefaultDimensionSpec) o; + + if (dimension != null ? !dimension.equals(that.dimension) : that.dimension != null) return false; + if (outputName != null ? !outputName.equals(that.outputName) : that.outputName != null) return false; + + return true; + } + + @Override + public int hashCode() + { + int result = dimension != null ? dimension.hashCode() : 0; + result = 31 * result + (outputName != null ? outputName.hashCode() : 0); + return result; + } } diff --git a/processing/src/main/java/io/druid/query/dimension/ExtractionDimensionSpec.java b/processing/src/main/java/io/druid/query/dimension/ExtractionDimensionSpec.java index 82fba73d0a1..9fe480e396d 100644 --- a/processing/src/main/java/io/druid/query/dimension/ExtractionDimensionSpec.java +++ b/processing/src/main/java/io/druid/query/dimension/ExtractionDimensionSpec.java @@ -92,4 +92,29 @@ public class ExtractionDimensionSpec implements DimensionSpec ", outputName='" + outputName + '\'' + '}'; } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + ExtractionDimensionSpec that = (ExtractionDimensionSpec) o; + + if (dimExtractionFn != null ? !dimExtractionFn.equals(that.dimExtractionFn) : that.dimExtractionFn != null) + return false; + if (dimension != null ? 
!dimension.equals(that.dimension) : that.dimension != null) return false; + if (outputName != null ? !outputName.equals(that.outputName) : that.outputName != null) return false; + + return true; + } + + @Override + public int hashCode() + { + int result = dimension != null ? dimension.hashCode() : 0; + result = 31 * result + (dimExtractionFn != null ? dimExtractionFn.hashCode() : 0); + result = 31 * result + (outputName != null ? outputName.hashCode() : 0); + return result; + } } diff --git a/processing/src/main/java/io/druid/query/groupby/GroupByQuery.java b/processing/src/main/java/io/druid/query/groupby/GroupByQuery.java index 93db02101ba..80322a29531 100644 --- a/processing/src/main/java/io/druid/query/groupby/GroupByQuery.java +++ b/processing/src/main/java/io/druid/query/groupby/GroupByQuery.java @@ -33,7 +33,11 @@ import com.metamx.common.guava.Sequences; import io.druid.data.input.Row; import io.druid.granularity.QueryGranularity; import io.druid.query.BaseQuery; +import io.druid.query.DataSource; import io.druid.query.Queries; +import io.druid.query.Query; +import io.druid.query.QueryDataSource; +import io.druid.query.TableDataSource; import io.druid.query.aggregation.AggregatorFactory; import io.druid.query.aggregation.PostAggregator; import io.druid.query.dimension.DefaultDimensionSpec; @@ -72,7 +76,7 @@ public class GroupByQuery extends BaseQuery @JsonCreator public GroupByQuery( - @JsonProperty("dataSource") String dataSource, + @JsonProperty("dataSource") DataSource dataSource, @JsonProperty("intervals") QuerySegmentSpec querySegmentSpec, @JsonProperty("filter") DimFilter dimFilter, @JsonProperty("granularity") QueryGranularity granularity, @@ -133,7 +137,7 @@ public class GroupByQuery extends BaseQuery * have already passed in order for the object to exist. 
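+   *
+   * Illustrative sketch of the new DataSource-based construction (assumed usage, not asserted by the patch):
+   * a nested group-by can be supplied straight to the Builder, e.g.
+   *   GroupByQuery.builder().setDataSource(innerGroupByQuery).setInterval("2011/2012")...build();
+   * where the setDataSource(Query) overload below wraps the inner query in a QueryDataSource.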
*/ private GroupByQuery( - String dataSource, + DataSource dataSource, QuerySegmentSpec querySegmentSpec, DimFilter dimFilter, QueryGranularity granularity, @@ -255,7 +259,7 @@ public class GroupByQuery extends BaseQuery public static class Builder { - private String dataSource; + private DataSource dataSource; private QuerySegmentSpec querySegmentSpec; private DimFilter dimFilter; private QueryGranularity granularity; @@ -270,7 +274,9 @@ public class GroupByQuery extends BaseQuery private List orderByColumnSpecs = Lists.newArrayList(); private int limit = Integer.MAX_VALUE; - private Builder() {} + private Builder() + { + } private Builder(Builder builder) { @@ -288,12 +294,24 @@ public class GroupByQuery extends BaseQuery context = builder.context; } - public Builder setDataSource(String dataSource) + public Builder setDataSource(DataSource dataSource) { this.dataSource = dataSource; return this; } + public Builder setDataSource(String dataSource) + { + this.dataSource = new TableDataSource(dataSource); + return this; + } + + public Builder setDataSource(Query query) + { + this.dataSource = new QueryDataSource(query); + return this; + } + public Builder setInterval(Object interval) { return setQuerySegmentSpec(new LegacySegmentSpec(interval)); @@ -479,13 +497,52 @@ public class GroupByQuery extends BaseQuery public String toString() { return "GroupByQuery{" + - "limitSpec=" + limitSpec + - ", dimFilter=" + dimFilter + - ", granularity=" + granularity + - ", dimensions=" + dimensions + - ", aggregatorSpecs=" + aggregatorSpecs + - ", postAggregatorSpecs=" + postAggregatorSpecs + - ", orderByLimitFn=" + orderByLimitFn + - '}'; + "limitSpec=" + limitSpec + + ", dimFilter=" + dimFilter + + ", granularity=" + granularity + + ", dimensions=" + dimensions + + ", aggregatorSpecs=" + aggregatorSpecs + + ", postAggregatorSpecs=" + postAggregatorSpecs + + ", orderByLimitFn=" + orderByLimitFn + + '}'; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + + GroupByQuery that = (GroupByQuery) o; + + if (aggregatorSpecs != null ? !aggregatorSpecs.equals(that.aggregatorSpecs) : that.aggregatorSpecs != null) + return false; + if (dimFilter != null ? !dimFilter.equals(that.dimFilter) : that.dimFilter != null) return false; + if (dimensions != null ? !dimensions.equals(that.dimensions) : that.dimensions != null) return false; + if (granularity != null ? !granularity.equals(that.granularity) : that.granularity != null) return false; + if (havingSpec != null ? !havingSpec.equals(that.havingSpec) : that.havingSpec != null) return false; + if (limitSpec != null ? !limitSpec.equals(that.limitSpec) : that.limitSpec != null) return false; + if (orderByLimitFn != null ? !orderByLimitFn.equals(that.orderByLimitFn) : that.orderByLimitFn != null) + return false; + if (postAggregatorSpecs != null ? !postAggregatorSpecs.equals(that.postAggregatorSpecs) : that.postAggregatorSpecs != null) + return false; + + return true; + } + + @Override + public int hashCode() + { + int result = super.hashCode(); + result = 31 * result + (limitSpec != null ? limitSpec.hashCode() : 0); + result = 31 * result + (havingSpec != null ? havingSpec.hashCode() : 0); + result = 31 * result + (dimFilter != null ? dimFilter.hashCode() : 0); + result = 31 * result + (granularity != null ? granularity.hashCode() : 0); + result = 31 * result + (dimensions != null ? 
dimensions.hashCode() : 0); + result = 31 * result + (aggregatorSpecs != null ? aggregatorSpecs.hashCode() : 0); + result = 31 * result + (postAggregatorSpecs != null ? postAggregatorSpecs.hashCode() : 0); + result = 31 * result + (orderByLimitFn != null ? orderByLimitFn.hashCode() : 0); + return result; } } diff --git a/processing/src/main/java/io/druid/query/groupby/GroupByQueryHelper.java b/processing/src/main/java/io/druid/query/groupby/GroupByQueryHelper.java index cb1f0279d13..00298f18ba0 100644 --- a/processing/src/main/java/io/druid/query/groupby/GroupByQueryHelper.java +++ b/processing/src/main/java/io/druid/query/groupby/GroupByQueryHelper.java @@ -24,9 +24,6 @@ import com.google.common.collect.Lists; import com.metamx.common.ISE; import com.metamx.common.Pair; import com.metamx.common.guava.Accumulator; -import com.metamx.common.guava.Sequence; -import com.metamx.common.guava.Sequences; -import io.druid.data.input.MapBasedRow; import io.druid.data.input.Row; import io.druid.data.input.Rows; import io.druid.granularity.QueryGranularity; diff --git a/processing/src/main/java/io/druid/query/groupby/GroupByQueryQueryToolChest.java b/processing/src/main/java/io/druid/query/groupby/GroupByQueryQueryToolChest.java index d2a338460e2..543ec0ba130 100644 --- a/processing/src/main/java/io/druid/query/groupby/GroupByQueryQueryToolChest.java +++ b/processing/src/main/java/io/druid/query/groupby/GroupByQueryQueryToolChest.java @@ -34,13 +34,17 @@ import com.metamx.common.guava.Sequences; import com.metamx.emitter.service.ServiceMetricEvent; import io.druid.data.input.MapBasedRow; import io.druid.data.input.Row; +import io.druid.query.DataSource; import io.druid.query.IntervalChunkingQueryRunner; import io.druid.query.Query; +import io.druid.query.QueryDataSource; import io.druid.query.QueryRunner; import io.druid.query.QueryToolChest; +import io.druid.query.SubqueryQueryRunner; import io.druid.query.aggregation.AggregatorFactory; import io.druid.query.aggregation.MetricManipulationFn; import io.druid.segment.incremental.IncrementalIndex; +import io.druid.segment.incremental.IncrementalIndexStorageAdapter; import org.joda.time.Interval; import org.joda.time.Minutes; @@ -56,13 +60,16 @@ public class GroupByQueryQueryToolChest extends QueryToolChest NO_MERGE_CONTEXT = ImmutableMap.of(GROUP_BY_MERGE_KEY, "false"); private final Supplier configSupplier; + private GroupByQueryEngine engine; // For running the outer query around a subquery @Inject public GroupByQueryQueryToolChest( - Supplier configSupplier + Supplier configSupplier, + GroupByQueryEngine engine ) { this.configSupplier = configSupplier; + this.engine = engine; } @Override @@ -84,13 +91,32 @@ public class GroupByQueryQueryToolChest extends QueryToolChest mergeGroupByResults(final GroupByQuery query, QueryRunner runner) { - final GroupByQueryConfig config = configSupplier.get(); - Pair> indexAccumulatorPair = GroupByQueryHelper.createIndexAccumulatorPair( - query, - config - ); - IncrementalIndex index = runner.run(query).accumulate(indexAccumulatorPair.lhs, indexAccumulatorPair.rhs); + Sequence result; + + // If there's a subquery, merge subquery results and then apply the aggregator + DataSource dataSource = query.getDataSource(); + if (dataSource instanceof QueryDataSource) { + GroupByQuery subquery; + try { + subquery = (GroupByQuery) ((QueryDataSource) dataSource).getQuery(); + } catch (ClassCastException e) { + throw new UnsupportedOperationException("Subqueries must be of type 'group by'"); + } + Sequence subqueryResult = 
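+      // Recursive step: the inner group-by is merged first, materialized into an IncrementalIndex, and the outer
+      // query is then evaluated over that index through the injected GroupByQueryEngine.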
mergeGroupByResults(subquery, runner); + IncrementalIndexStorageAdapter adapter + = new IncrementalIndexStorageAdapter(makeIncrementalIndex(subquery, subqueryResult)); + result = engine.process(query, adapter); + } else { + result = runner.run(query); + } + + return postAggregate(query, makeIncrementalIndex(query, result)); + } + + + private Sequence postAggregate(final GroupByQuery query, IncrementalIndex index) + { Sequence sequence = Sequences.map( Sequences.simple(index.iterableWithPostAggregations(query.getPostAggregatorSpecs())), new Function() @@ -101,7 +127,7 @@ public class GroupByQueryQueryToolChest extends QueryToolChest rows) + { + final GroupByQueryConfig config = configSupplier.get(); + Pair> indexAccumulatorPair = GroupByQueryHelper.createIndexAccumulatorPair( + query, + config + ); + + return rows.accumulate(indexAccumulatorPair.lhs, indexAccumulatorPair.rhs); + } + + @Override public Sequence mergeSequences(Sequence> seqOfSequences) { @@ -125,7 +163,7 @@ public class GroupByQueryQueryToolChest extends QueryToolChest preMergeQueryDecoration(QueryRunner runner) { - return new IntervalChunkingQueryRunner(runner, configSupplier.get().getChunkPeriod()); + return new SubqueryQueryRunner( + new IntervalChunkingQueryRunner(runner, configSupplier.get().getChunkPeriod())); } } diff --git a/processing/src/main/java/io/druid/query/groupby/orderby/DefaultLimitSpec.java b/processing/src/main/java/io/druid/query/groupby/orderby/DefaultLimitSpec.java index 3bb79ed9617..eda54ea0dc3 100644 --- a/processing/src/main/java/io/druid/query/groupby/orderby/DefaultLimitSpec.java +++ b/processing/src/main/java/io/druid/query/groupby/orderby/DefaultLimitSpec.java @@ -196,6 +196,25 @@ public class DefaultLimitSpec implements LimitSpec { return Sequences.limit(input, limit); } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + LimitingFn that = (LimitingFn) o; + + if (limit != that.limit) return false; + + return true; + } + + @Override + public int hashCode() + { + return limit; + } } private static class SortingFn implements Function, Sequence> @@ -209,6 +228,25 @@ public class DefaultLimitSpec implements LimitSpec { return Sequences.sort(input, ordering); } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + SortingFn sortingFn = (SortingFn) o; + + if (ordering != null ? !ordering.equals(sortingFn.ordering) : sortingFn.ordering != null) return false; + + return true; + } + + @Override + public int hashCode() + { + return ordering != null ? ordering.hashCode() : 0; + } } private static class TopNFunction implements Function, Sequence> @@ -231,5 +269,49 @@ public class DefaultLimitSpec implements LimitSpec final ArrayList materializedList = Sequences.toList(input, Lists.newArrayList()); return Sequences.simple(sorter.toTopN(materializedList, limit)); } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + TopNFunction that = (TopNFunction) o; + + if (limit != that.limit) return false; + if (sorter != null ? !sorter.equals(that.sorter) : that.sorter != null) return false; + + return true; + } + + @Override + public int hashCode() + { + int result = sorter != null ? 
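+      // Value-based equals/hashCode on these limit functions lets the orderByLimitFn field participate in
+      // GroupByQuery's own equals/hashCode above, so two structurally identical queries compare equal.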
sorter.hashCode() : 0; + result = 31 * result + limit; + return result; + } + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + DefaultLimitSpec that = (DefaultLimitSpec) o; + + if (limit != that.limit) return false; + if (columns != null ? !columns.equals(that.columns) : that.columns != null) return false; + + return true; + } + + @Override + public int hashCode() + { + int result = columns != null ? columns.hashCode() : 0; + result = 31 * result + limit; + return result; } } diff --git a/processing/src/main/java/io/druid/query/groupby/orderby/NoopLimitSpec.java b/processing/src/main/java/io/druid/query/groupby/orderby/NoopLimitSpec.java index 6fbc063c72d..d975e24a65f 100644 --- a/processing/src/main/java/io/druid/query/groupby/orderby/NoopLimitSpec.java +++ b/processing/src/main/java/io/druid/query/groupby/orderby/NoopLimitSpec.java @@ -46,4 +46,15 @@ public class NoopLimitSpec implements LimitSpec { return "NoopLimitSpec"; } + + @Override + public boolean equals(Object other) + { + return (other instanceof NoopLimitSpec); + } + + @Override + public int hashCode() { + return 0; + } } diff --git a/processing/src/main/java/io/druid/query/metadata/SegmentMetadataQueryQueryToolChest.java b/processing/src/main/java/io/druid/query/metadata/SegmentMetadataQueryQueryToolChest.java index 00642baee3c..85fa61d26e9 100644 --- a/processing/src/main/java/io/druid/query/metadata/SegmentMetadataQueryQueryToolChest.java +++ b/processing/src/main/java/io/druid/query/metadata/SegmentMetadataQueryQueryToolChest.java @@ -147,7 +147,7 @@ public class SegmentMetadataQueryQueryToolChest extends QueryToolChest @JsonProperty("context") Map context ) { - super(dataSource, querySegmentSpec, context); + super(new TableDataSource(dataSource), querySegmentSpec, context); this.toInclude = toInclude == null ? new AllColumnIncluderator() : toInclude; this.merge = merge == null ? false : merge; @@ -76,13 +77,40 @@ public class SegmentMetadataQuery extends BaseQuery public Query withOverriddenContext(Map contextOverride) { return new SegmentMetadataQuery( - getDataSource(), getQuerySegmentSpec(), toInclude, merge, computeOverridenContext(contextOverride) + ((TableDataSource)getDataSource()).getName(), + getQuerySegmentSpec(), toInclude, merge, computeOverridenContext(contextOverride) ); } @Override public Query withQuerySegmentSpec(QuerySegmentSpec spec) { - return new SegmentMetadataQuery(getDataSource(), spec, toInclude, merge, getContext()); + return new SegmentMetadataQuery( + ((TableDataSource)getDataSource()).getName(), + spec, toInclude, merge, getContext()); + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + + SegmentMetadataQuery that = (SegmentMetadataQuery) o; + + if (merge != that.merge) return false; + if (toInclude != null ? !toInclude.equals(that.toInclude) : that.toInclude != null) return false; + + return true; + } + + @Override + public int hashCode() + { + int result = super.hashCode(); + result = 31 * result + (toInclude != null ? toInclude.hashCode() : 0); + result = 31 * result + (merge ? 
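+    // Note: withOverriddenContext/withQuerySegmentSpec above cast getDataSource() back to TableDataSource, so
+    // segment metadata queries still accept only plain table data sources despite the new DataSource type.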
1 : 0); + return result; } } diff --git a/processing/src/main/java/io/druid/query/search/SearchQueryQueryToolChest.java b/processing/src/main/java/io/druid/query/search/SearchQueryQueryToolChest.java index 980e6dc2e5c..a11a4c2ac01 100644 --- a/processing/src/main/java/io/druid/query/search/SearchQueryQueryToolChest.java +++ b/processing/src/main/java/io/druid/query/search/SearchQueryQueryToolChest.java @@ -121,7 +121,7 @@ public class SearchQueryQueryToolChest extends QueryToolChest> @JsonCreator public SearchQuery( - @JsonProperty("dataSource") String dataSource, + @JsonProperty("dataSource") DataSource dataSource, @JsonProperty("filter") DimFilter dimFilter, @JsonProperty("granularity") QueryGranularity granularity, @JsonProperty("limit") int limit, @@ -181,13 +182,45 @@ public class SearchQuery extends BaseQuery> public String toString() { return "SearchQuery{" + - "dataSource='" + getDataSource() + '\'' + - ", dimFilter=" + dimFilter + - ", granularity='" + granularity + '\'' + - ", dimensions=" + dimensions + - ", querySpec=" + querySpec + - ", querySegmentSpec=" + getQuerySegmentSpec() + - ", limit=" + limit + - '}'; + "dataSource='" + getDataSource() + '\'' + + ", dimFilter=" + dimFilter + + ", granularity='" + granularity + '\'' + + ", dimensions=" + dimensions + + ", querySpec=" + querySpec + + ", querySegmentSpec=" + getQuerySegmentSpec() + + ", limit=" + limit + + '}'; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + + SearchQuery that = (SearchQuery) o; + + if (limit != that.limit) return false; + if (dimFilter != null ? !dimFilter.equals(that.dimFilter) : that.dimFilter != null) return false; + if (dimensions != null ? !dimensions.equals(that.dimensions) : that.dimensions != null) return false; + if (granularity != null ? !granularity.equals(that.granularity) : that.granularity != null) return false; + if (querySpec != null ? !querySpec.equals(that.querySpec) : that.querySpec != null) return false; + if (sortSpec != null ? !sortSpec.equals(that.sortSpec) : that.sortSpec != null) return false; + + return true; + } + + @Override + public int hashCode() + { + int result = super.hashCode(); + result = 31 * result + (dimFilter != null ? dimFilter.hashCode() : 0); + result = 31 * result + (sortSpec != null ? sortSpec.hashCode() : 0); + result = 31 * result + (granularity != null ? granularity.hashCode() : 0); + result = 31 * result + (dimensions != null ? dimensions.hashCode() : 0); + result = 31 * result + (querySpec != null ? 
querySpec.hashCode() : 0); + result = 31 * result + limit; + return result; } } diff --git a/processing/src/main/java/io/druid/query/select/SelectQuery.java b/processing/src/main/java/io/druid/query/select/SelectQuery.java index 8c5eb2ba59f..bcd29cb7f96 100644 --- a/processing/src/main/java/io/druid/query/select/SelectQuery.java +++ b/processing/src/main/java/io/druid/query/select/SelectQuery.java @@ -24,6 +24,7 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonTypeName; import io.druid.granularity.QueryGranularity; import io.druid.query.BaseQuery; +import io.druid.query.DataSource; import io.druid.query.Query; import io.druid.query.Result; import io.druid.query.filter.DimFilter; @@ -45,7 +46,7 @@ public class SelectQuery extends BaseQuery> @JsonCreator public SelectQuery( - @JsonProperty("dataSource") String dataSource, + @JsonProperty("dataSource") DataSource dataSource, @JsonProperty("intervals") QuerySegmentSpec querySegmentSpec, @JsonProperty("filter") DimFilter dimFilter, @JsonProperty("granularity") QueryGranularity granularity, @@ -146,4 +147,34 @@ public class SelectQuery extends BaseQuery> ", pagingSpec=" + pagingSpec + '}'; } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + + SelectQuery that = (SelectQuery) o; + + if (dimFilter != null ? !dimFilter.equals(that.dimFilter) : that.dimFilter != null) return false; + if (dimensions != null ? !dimensions.equals(that.dimensions) : that.dimensions != null) return false; + if (granularity != null ? !granularity.equals(that.granularity) : that.granularity != null) return false; + if (metrics != null ? !metrics.equals(that.metrics) : that.metrics != null) return false; + if (pagingSpec != null ? !pagingSpec.equals(that.pagingSpec) : that.pagingSpec != null) return false; + + return true; + } + + @Override + public int hashCode() + { + int result = super.hashCode(); + result = 31 * result + (dimFilter != null ? dimFilter.hashCode() : 0); + result = 31 * result + (granularity != null ? granularity.hashCode() : 0); + result = 31 * result + (dimensions != null ? dimensions.hashCode() : 0); + result = 31 * result + (metrics != null ? metrics.hashCode() : 0); + result = 31 * result + (pagingSpec != null ? 
pagingSpec.hashCode() : 0); + return result; + } } diff --git a/processing/src/main/java/io/druid/query/select/SelectQueryQueryToolChest.java b/processing/src/main/java/io/druid/query/select/SelectQueryQueryToolChest.java index 8a728749b22..4e45ffac9e5 100644 --- a/processing/src/main/java/io/druid/query/select/SelectQueryQueryToolChest.java +++ b/processing/src/main/java/io/druid/query/select/SelectQueryQueryToolChest.java @@ -123,7 +123,7 @@ public class SelectQueryQueryToolChest extends QueryToolChest context ) diff --git a/processing/src/main/java/io/druid/query/timeboundary/TimeBoundaryQueryQueryToolChest.java b/processing/src/main/java/io/druid/query/timeboundary/TimeBoundaryQueryQueryToolChest.java index 7ccba774799..352cd38b023 100644 --- a/processing/src/main/java/io/druid/query/timeboundary/TimeBoundaryQueryQueryToolChest.java +++ b/processing/src/main/java/io/druid/query/timeboundary/TimeBoundaryQueryQueryToolChest.java @@ -78,7 +78,7 @@ public class TimeBoundaryQueryQueryToolChest public boolean apply(T input) { return input.getInterval().overlaps(first.getInterval()) || input.getInterval() - .overlaps(second.getInterval()); + .overlaps(second.getInterval()); } } ) @@ -117,7 +117,7 @@ public class TimeBoundaryQueryQueryToolChest public ServiceMetricEvent.Builder makeMetricBuilder(TimeBoundaryQuery query) { return new ServiceMetricEvent.Builder() - .setUser2(query.getDataSource()) + .setUser2(query.getDataSource().toString()) .setUser4(query.getType()) .setUser6("false") .setUser10(query.getId()); @@ -146,9 +146,9 @@ public class TimeBoundaryQueryQueryToolChest public byte[] computeCacheKey(TimeBoundaryQuery query) { return ByteBuffer.allocate(2) - .put(TIMEBOUNDARY_QUERY) - .put(query.getCacheKey()) - .array(); + .put(TIMEBOUNDARY_QUERY) + .put(query.getCacheKey()) + .array(); } @Override diff --git a/processing/src/main/java/io/druid/query/timeseries/TimeseriesQuery.java b/processing/src/main/java/io/druid/query/timeseries/TimeseriesQuery.java index ab5b649a896..a1de320f5ec 100644 --- a/processing/src/main/java/io/druid/query/timeseries/TimeseriesQuery.java +++ b/processing/src/main/java/io/druid/query/timeseries/TimeseriesQuery.java @@ -25,6 +25,7 @@ import com.fasterxml.jackson.annotation.JsonTypeName; import com.google.common.collect.ImmutableList; import io.druid.granularity.QueryGranularity; import io.druid.query.BaseQuery; +import io.druid.query.DataSource; import io.druid.query.Queries; import io.druid.query.Query; import io.druid.query.Result; @@ -48,7 +49,7 @@ public class TimeseriesQuery extends BaseQuery> @JsonCreator public TimeseriesQuery( - @JsonProperty("dataSource") String dataSource, + @JsonProperty("dataSource") DataSource dataSource, @JsonProperty("intervals") QuerySegmentSpec querySegmentSpec, @JsonProperty("filter") DimFilter dimFilter, @JsonProperty("granularity") QueryGranularity granularity, @@ -132,14 +133,43 @@ public class TimeseriesQuery extends BaseQuery> public String toString() { return "TimeseriesQuery{" + - "dataSource='" + getDataSource() + '\'' + - ", querySegmentSpec=" + getQuerySegmentSpec() + - ", dimFilter=" + dimFilter + - ", granularity='" + granularity + '\'' + - ", aggregatorSpecs=" + aggregatorSpecs + - ", postAggregatorSpecs=" + postAggregatorSpecs + - ", context=" + getContext() + - '}'; + "dataSource='" + getDataSource() + '\'' + + ", querySegmentSpec=" + getQuerySegmentSpec() + + ", dimFilter=" + dimFilter + + ", granularity='" + granularity + '\'' + + ", aggregatorSpecs=" + aggregatorSpecs + + ", postAggregatorSpecs=" + 
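+           // (dataSource is no longer a plain String, so it is rendered through DataSource.toString() here and
+           // in the metric builders above.)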
postAggregatorSpecs + + ", context=" + getContext() + + '}'; } + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + + TimeseriesQuery that = (TimeseriesQuery) o; + + if (aggregatorSpecs != null ? !aggregatorSpecs.equals(that.aggregatorSpecs) : that.aggregatorSpecs != null) + return false; + if (dimFilter != null ? !dimFilter.equals(that.dimFilter) : that.dimFilter != null) return false; + if (granularity != null ? !granularity.equals(that.granularity) : that.granularity != null) return false; + if (postAggregatorSpecs != null ? !postAggregatorSpecs.equals(that.postAggregatorSpecs) : that.postAggregatorSpecs != null) + return false; + + return true; + } + + @Override + public int hashCode() + { + int result = super.hashCode(); + result = 31 * result + (dimFilter != null ? dimFilter.hashCode() : 0); + result = 31 * result + (granularity != null ? granularity.hashCode() : 0); + result = 31 * result + (aggregatorSpecs != null ? aggregatorSpecs.hashCode() : 0); + result = 31 * result + (postAggregatorSpecs != null ? postAggregatorSpecs.hashCode() : 0); + return result; + } } diff --git a/processing/src/main/java/io/druid/query/timeseries/TimeseriesQueryQueryToolChest.java b/processing/src/main/java/io/druid/query/timeseries/TimeseriesQueryQueryToolChest.java index d5de1bda272..cefe50c5384 100644 --- a/processing/src/main/java/io/druid/query/timeseries/TimeseriesQueryQueryToolChest.java +++ b/processing/src/main/java/io/druid/query/timeseries/TimeseriesQueryQueryToolChest.java @@ -123,7 +123,7 @@ public class TimeseriesQueryQueryToolChest extends QueryToolChest condensedAggs = Lists.newArrayList(); diff --git a/processing/src/main/java/io/druid/query/topn/BaseTopNAlgorithm.java b/processing/src/main/java/io/druid/query/topn/BaseTopNAlgorithm.java index 47093cea8a5..0c32d8db676 100644 --- a/processing/src/main/java/io/druid/query/topn/BaseTopNAlgorithm.java +++ b/processing/src/main/java/io/druid/query/topn/BaseTopNAlgorithm.java @@ -78,8 +78,6 @@ public abstract class BaseTopNAlgorithm> @JsonCreator public TopNQuery( - @JsonProperty("dataSource") String dataSource, + @JsonProperty("dataSource") DataSource dataSource, @JsonProperty("dimension") DimensionSpec dimensionSpec, @JsonProperty("metric") TopNMetricSpec topNMetricSpec, @JsonProperty("threshold") int threshold, @@ -208,4 +209,42 @@ public class TopNQuery extends BaseQuery> ", postAggregatorSpecs=" + postAggregatorSpecs + '}'; } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + + TopNQuery topNQuery = (TopNQuery) o; + + if (threshold != topNQuery.threshold) return false; + if (aggregatorSpecs != null ? !aggregatorSpecs.equals(topNQuery.aggregatorSpecs) : topNQuery.aggregatorSpecs != null) + return false; + if (dimFilter != null ? !dimFilter.equals(topNQuery.dimFilter) : topNQuery.dimFilter != null) return false; + if (dimensionSpec != null ? !dimensionSpec.equals(topNQuery.dimensionSpec) : topNQuery.dimensionSpec != null) + return false; + if (granularity != null ? !granularity.equals(topNQuery.granularity) : topNQuery.granularity != null) return false; + if (postAggregatorSpecs != null ? !postAggregatorSpecs.equals(topNQuery.postAggregatorSpecs) : topNQuery.postAggregatorSpecs != null) + return false; + if (topNMetricSpec != null ? 
!topNMetricSpec.equals(topNQuery.topNMetricSpec) : topNQuery.topNMetricSpec != null) + return false; + + return true; + } + + @Override + public int hashCode() + { + int result = super.hashCode(); + result = 31 * result + (dimensionSpec != null ? dimensionSpec.hashCode() : 0); + result = 31 * result + (topNMetricSpec != null ? topNMetricSpec.hashCode() : 0); + result = 31 * result + threshold; + result = 31 * result + (dimFilter != null ? dimFilter.hashCode() : 0); + result = 31 * result + (granularity != null ? granularity.hashCode() : 0); + result = 31 * result + (aggregatorSpecs != null ? aggregatorSpecs.hashCode() : 0); + result = 31 * result + (postAggregatorSpecs != null ? postAggregatorSpecs.hashCode() : 0); + return result; + } } diff --git a/processing/src/main/java/io/druid/query/topn/TopNQueryBuilder.java b/processing/src/main/java/io/druid/query/topn/TopNQueryBuilder.java index 1bfb690f490..21efd3b8351 100644 --- a/processing/src/main/java/io/druid/query/topn/TopNQueryBuilder.java +++ b/processing/src/main/java/io/druid/query/topn/TopNQueryBuilder.java @@ -21,6 +21,8 @@ package io.druid.query.topn; import com.google.common.collect.Lists; import io.druid.granularity.QueryGranularity; +import io.druid.query.DataSource; +import io.druid.query.TableDataSource; import io.druid.query.aggregation.AggregatorFactory; import io.druid.query.aggregation.PostAggregator; import io.druid.query.dimension.DefaultDimensionSpec; @@ -58,7 +60,7 @@ import java.util.Map; */ public class TopNQueryBuilder { - private String dataSource; + private DataSource dataSource; private DimensionSpec dimensionSpec; private TopNMetricSpec topNMetricSpec; private int threshold; @@ -71,7 +73,7 @@ public class TopNQueryBuilder public TopNQueryBuilder() { - dataSource = ""; + dataSource = null; dimensionSpec = null; topNMetricSpec = null; threshold = 0; @@ -83,7 +85,7 @@ public class TopNQueryBuilder context = null; } - public String getDataSource() + public DataSource getDataSource() { return dataSource; } @@ -152,7 +154,7 @@ public class TopNQueryBuilder public TopNQueryBuilder copy(TopNQuery query) { return new TopNQueryBuilder() - .dataSource(query.getDataSource()) + .dataSource(query.getDataSource().toString()) .dimension(query.getDimensionSpec()) .metric(query.getTopNMetricSpec()) .threshold(query.getThreshold()) @@ -180,6 +182,12 @@ public class TopNQueryBuilder } public TopNQueryBuilder dataSource(String d) + { + dataSource = new TableDataSource(d); + return this; + } + + public TopNQueryBuilder dataSource(DataSource d) { dataSource = d; return this; diff --git a/processing/src/main/java/io/druid/query/topn/TopNQueryQueryToolChest.java b/processing/src/main/java/io/druid/query/topn/TopNQueryQueryToolChest.java index dc4dd1b4855..de0f28dc204 100644 --- a/processing/src/main/java/io/druid/query/topn/TopNQueryQueryToolChest.java +++ b/processing/src/main/java/io/druid/query/topn/TopNQueryQueryToolChest.java @@ -128,7 +128,7 @@ public class TopNQueryQueryToolChest extends QueryToolChest() - { - @Override - public boolean apply(@Nullable String input) - { - return predicate.applyInContext(cx, input); - } - }) - .transform( - new com.google.common.base.Function() - { - @Override - public ImmutableConciseSet apply(@Nullable String input) - { - return selector.getConciseInvertedIndex(dimension, input); - } - } - ) - ); + final Indexed dimValues = selector.getDimensionValues(dimension); + ImmutableConciseSet conciseSet; + if (dimValues == null) { + conciseSet = new ImmutableConciseSet(); + } else { + conciseSet = 
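+        // With the null check above, a dimension that does not exist in the segment yields an empty concise set
+        // instead of attempting to union over a null Indexed of values.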
ImmutableConciseSet.union( + FunctionalIterable.create(dimValues) + .filter(new Predicate() + { + @Override + public boolean apply(@Nullable String input) + { + return predicate.applyInContext(cx, input); + } + }) + .transform( + new com.google.common.base.Function() + { + @Override + public ImmutableConciseSet apply(@Nullable String input) + { + return selector.getConciseInvertedIndex(dimension, input); + } + } + ) + ); + } return conciseSet; } finally { Context.exit(); @@ -83,12 +90,14 @@ public class JavaScriptFilter implements Filter return factory.makeValueMatcher(dimension, predicate); } - static class JavaScriptPredicate implements Predicate { + static class JavaScriptPredicate implements Predicate + { final ScriptableObject scope; final Function fnApply; final String script; - public JavaScriptPredicate(final String script) { + public JavaScriptPredicate(final String script) + { Preconditions.checkNotNull(script, "script must not be null"); this.script = script; diff --git a/processing/src/main/java/io/druid/segment/incremental/IncrementalIndexStorageAdapter.java b/processing/src/main/java/io/druid/segment/incremental/IncrementalIndexStorageAdapter.java index d0243f39123..e765ab7867d 100644 --- a/processing/src/main/java/io/druid/segment/incremental/IncrementalIndexStorageAdapter.java +++ b/processing/src/main/java/io/druid/segment/incremental/IncrementalIndexStorageAdapter.java @@ -126,8 +126,13 @@ public class IncrementalIndexStorageAdapter implements StorageAdapter @Override public Iterable makeCursors(final Filter filter, final Interval interval, final QueryGranularity gran) { + if (index.isEmpty()) { + return ImmutableList.of(); + } + Interval actualIntervalTmp = interval; + final Interval dataInterval = new Interval(getMinTime().getMillis(), gran.next(getMaxTime().getMillis())); if (!actualIntervalTmp.overlaps(dataInterval)) { return ImmutableList.of(); @@ -236,23 +241,22 @@ public class IncrementalIndexStorageAdapter implements StorageAdapter if (numAdvanced == -1) { numAdvanced = 0; - while (baseIter.hasNext()) { - currEntry.set(baseIter.next()); - if (filterMatcher.matches()) { - return; - } - - numAdvanced++; - } } else { Iterators.advance(baseIter, numAdvanced); - if (baseIter.hasNext()) { - currEntry.set(baseIter.next()); - } } - done = cursorMap.size() == 0 || !baseIter.hasNext(); + boolean foundMatched = false; + while (baseIter.hasNext()) { + currEntry.set(baseIter.next()); + if (filterMatcher.matches()) { + foundMatched = true; + break; + } + numAdvanced++; + } + + done = !foundMatched && (cursorMap.size() == 0 || !baseIter.hasNext()); } @Override diff --git a/processing/src/test/java/io/druid/query/DataSourceTest.java b/processing/src/test/java/io/druid/query/DataSourceTest.java new file mode 100644 index 00000000000..5819fc49701 --- /dev/null +++ b/processing/src/test/java/io/druid/query/DataSourceTest.java @@ -0,0 +1,88 @@ +/* + * Druid - a distributed column store. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * This file Copyright (C) 2014 N3TWORK, Inc. and contributed to the Druid project + * under the Druid Corporate Contributor License Agreement. + */ + +package io.druid.query; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; +import io.druid.jackson.DefaultObjectMapper; +import io.druid.query.aggregation.AggregatorFactory; +import io.druid.query.aggregation.LongSumAggregatorFactory; +import io.druid.query.dimension.DefaultDimensionSpec; +import io.druid.query.dimension.DimensionSpec; +import io.druid.query.groupby.GroupByQuery; +import org.junit.Assert; +import org.junit.Test; + +import java.io.IOException; +import java.util.Arrays; + +public class DataSourceTest +{ + private static final ObjectMapper jsonMapper = new DefaultObjectMapper(); + + @Test + public void testSerialization() throws IOException + { + DataSource dataSource = new TableDataSource("somedatasource"); + String json = jsonMapper.writeValueAsString(dataSource); + DataSource serdeDataSource = jsonMapper.readValue(json, DataSource.class); + Assert.assertEquals(dataSource, serdeDataSource); + } + + @Test + public void testLegacyDataSource() throws IOException + { + DataSource dataSource = jsonMapper.readValue("\"somedatasource\"", DataSource.class); + Assert.assertEquals(new TableDataSource("somedatasource"), dataSource); + } + + @Test + public void testTableDataSource() throws IOException + { + DataSource dataSource = jsonMapper.readValue("{\"type\":\"table\", \"name\":\"somedatasource\"}", DataSource.class); + Assert.assertEquals(new TableDataSource("somedatasource"), dataSource); + } + + @Test + public void testQueryDataSource() throws IOException + { + GroupByQuery query = GroupByQuery + .builder() + .setDataSource(QueryRunnerTestHelper.dataSource) + .setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) + .setDimensions(Lists.newArrayList(new DefaultDimensionSpec("quality", "alias"))) + .setAggregatorSpecs( + Arrays.asList( + QueryRunnerTestHelper.rowsCount, + new LongSumAggregatorFactory("idx", "index") + ) + ) + .setGranularity(QueryRunnerTestHelper.dayGran) + .build(); + + String dataSourceJSON = "{\"type\":\"query\", \"query\":" + jsonMapper.writeValueAsString(query) + "}"; + + DataSource dataSource = jsonMapper.readValue(dataSourceJSON, DataSource.class); + Assert.assertEquals(new QueryDataSource(query), dataSource); + } + +} diff --git a/processing/src/test/java/io/druid/query/QueryRunnerTestHelper.java b/processing/src/test/java/io/druid/query/QueryRunnerTestHelper.java index 2f51162f076..519c7375b10 100644 --- a/processing/src/test/java/io/druid/query/QueryRunnerTestHelper.java +++ b/processing/src/test/java/io/druid/query/QueryRunnerTestHelper.java @@ -25,6 +25,7 @@ import io.druid.query.aggregation.AggregatorFactory; import io.druid.query.aggregation.CountAggregatorFactory; import io.druid.query.aggregation.DoubleSumAggregatorFactory; import io.druid.query.aggregation.LongSumAggregatorFactory; +import io.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory; import io.druid.query.aggregation.post.ArithmeticPostAggregator; import io.druid.query.aggregation.post.ConstantPostAggregator; import io.druid.query.aggregation.post.FieldAccessPostAggregator; @@ -48,7 +49,7 @@ import java.util.List; */ public class QueryRunnerTestHelper { - 
public static final String segmentId= "testSegment"; + public static final String segmentId = "testSegment"; public static final String dataSource = "testing"; public static final QueryGranularity dayGran = QueryGranularity.DAY; public static final QueryGranularity allGran = QueryGranularity.ALL; @@ -57,9 +58,15 @@ public class QueryRunnerTestHelper public static final String placementDimension = "placement"; public static final String placementishDimension = "placementish"; public static final String indexMetric = "index"; + public static final String uniqueMetric = "uniques"; + public static final String addRowsIndexConstantMetric = "addRowsIndexConstant"; public static final CountAggregatorFactory rowsCount = new CountAggregatorFactory("rows"); public static final LongSumAggregatorFactory indexLongSum = new LongSumAggregatorFactory("index", "index"); public static final DoubleSumAggregatorFactory indexDoubleSum = new DoubleSumAggregatorFactory("index", "index"); + public static final HyperUniquesAggregatorFactory qualityUniques = new HyperUniquesAggregatorFactory( + "uniques", + "quality_uniques" + ); public static final ConstantPostAggregator constant = new ConstantPostAggregator("const", 1L); public static final FieldAccessPostAggregator rowsPostAgg = new FieldAccessPostAggregator("rows", "rows"); public static final FieldAccessPostAggregator indexPostAgg = new FieldAccessPostAggregator("index", "index"); @@ -67,7 +74,15 @@ public class QueryRunnerTestHelper new ArithmeticPostAggregator( "addRowsIndexConstant", "+", Lists.newArrayList(constant, rowsPostAgg, indexPostAgg) ); - public static final List commonAggregators = Arrays.asList(rowsCount, indexDoubleSum); + public static final List commonAggregators = Arrays.asList( + rowsCount, + indexDoubleSum, + qualityUniques + ); + + public static final double UNIQUES_9 = 9.019833517963864; + public static final double UNIQUES_2 = 2.000977198748901d; + public static final double UNIQUES_1 = 1.0002442201269182d; public static final String[] expectedFullOnIndexValues = new String[]{ "4500.0", "6077.949111938477", "4922.488838195801", "5726.140853881836", "4698.468170166016", @@ -96,9 +111,15 @@ public class QueryRunnerTestHelper public static final QuerySegmentSpec firstToThird = new MultipleIntervalSegmentSpec( Arrays.asList(new Interval("2011-04-01T00:00:00.000Z/2011-04-03T00:00:00.000Z")) ); + public static final QuerySegmentSpec secondOnly = new MultipleIntervalSegmentSpec( + Arrays.asList(new Interval("2011-04-02T00:00:00.000Z/P1D")) + ); public static final QuerySegmentSpec fullOnInterval = new MultipleIntervalSegmentSpec( Arrays.asList(new Interval("1970-01-01T00:00:00.000Z/2020-01-01T00:00:00.000Z")) ); + public static final QuerySegmentSpec emptyInterval = new MultipleIntervalSegmentSpec( + Arrays.asList(new Interval("2020-04-02T00:00:00.000Z/P1D")) + ); @SuppressWarnings("unchecked") public static Collection makeQueryRunners( diff --git a/processing/src/test/java/io/druid/query/aggregation/hyperloglog/HyperLogLogCollectorTest.java b/processing/src/test/java/io/druid/query/aggregation/hyperloglog/HyperLogLogCollectorTest.java new file mode 100644 index 00000000000..8968c9a8675 --- /dev/null +++ b/processing/src/test/java/io/druid/query/aggregation/hyperloglog/HyperLogLogCollectorTest.java @@ -0,0 +1,793 @@ +/* + * Druid - a distributed column store. + * Copyright (C) 2012, 2013 Metamarkets Group Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +package io.druid.query.aggregation.hyperloglog; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import org.junit.Assert; +import org.junit.Test; + +import java.nio.ByteBuffer; +import java.security.MessageDigest; +import java.util.Arrays; +import java.util.Comparator; +import java.util.Random; + +/** + */ +public class HyperLogLogCollectorTest +{ + + private final HashFunction fn = Hashing.murmur3_128(); + private final Random random = new Random(); + + @Test + public void testFolding() throws Exception + { + final int[] numValsToCheck = {10, 20, 50, 100, 1000, 2000}; + for (int numThings : numValsToCheck) { + HyperLogLogCollector allCombined = HyperLogLogCollector.makeLatestCollector(); + HyperLogLogCollector oneHalf = HyperLogLogCollector.makeLatestCollector(); + HyperLogLogCollector otherHalf = HyperLogLogCollector.makeLatestCollector(); + + for (int i = 0; i < numThings; ++i) { + byte[] hashedVal = fn.hashLong(random.nextLong()).asBytes(); + + allCombined.add(hashedVal); + if (i % 2 == 0) { + oneHalf.add(hashedVal); + } else { + otherHalf.add(hashedVal); + } + } + + HyperLogLogCollector folded = HyperLogLogCollector.makeLatestCollector(); + + folded.fold(oneHalf); + Assert.assertEquals(oneHalf, folded); + Assert.assertEquals(oneHalf.estimateCardinality(), folded.estimateCardinality(), 0.0d); + + folded.fold(otherHalf); + Assert.assertEquals(allCombined, folded); + Assert.assertEquals(allCombined.estimateCardinality(), folded.estimateCardinality(), 0.0d); + } + } + + // @Test + public void testHighCardinalityRollingFold() throws Exception + { + final HyperLogLogCollector rolling = HyperLogLogCollector.makeLatestCollector(); + final HyperLogLogCollector simple = HyperLogLogCollector.makeLatestCollector(); + + int count; + MessageDigest md = MessageDigest.getInstance("SHA-1"); + HyperLogLogCollector tmp = HyperLogLogCollector.makeLatestCollector(); + + for (count = 0; count < 5000000; ++count) { + md.update(Integer.toString(count).getBytes()); + + byte[] hashed = fn.hashBytes(md.digest()).asBytes(); + + tmp.add(hashed); + simple.add(hashed); + + if (count % 100 == 0) { + rolling.fold(tmp); + tmp = HyperLogLogCollector.makeLatestCollector(); + } + } + + int n = count; + + System.out.println("True cardinality " + n); + System.out.println("Rolling buffer cardinality " + rolling.estimateCardinality()); + System.out.println("Simple buffer cardinality " + simple.estimateCardinality()); + System.out.println( + String.format( + "Rolling cardinality estimate off by %4.1f%%", + 100 * (1 - rolling.estimateCardinality() / n) + ) + ); + + Assert.assertEquals(n, simple.estimateCardinality(), n * 0.05); + Assert.assertEquals(n, rolling.estimateCardinality(), n * 0.05); + } + + //@Test + public void testHighCardinalityRollingFold2() 
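+  // (Left without an active @Test annotation, like testHighCardinalityRollingFold above; presumably a
+  // long-running manual benchmark rather than part of the regular suite.)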
throws Exception + { + final HyperLogLogCollector rolling = HyperLogLogCollector.makeLatestCollector(); + int count; + long start = System.currentTimeMillis(); + + for (count = 0; count < 5000000; ++count) { + HyperLogLogCollector theCollector = HyperLogLogCollector.makeLatestCollector(); + theCollector.add(fn.hashLong(count).asBytes()); + rolling.fold(theCollector); + } + System.out.printf("testHighCardinalityRollingFold2 took %d ms%n", System.currentTimeMillis() - start); + + int n = count; + + System.out.println("True cardinality " + n); + System.out.println("Rolling buffer cardinality " + rolling.estimateCardinality()); + System.out.println( + String.format( + "Rolling cardinality estimate off by %4.1f%%", + 100 * (1 - rolling.estimateCardinality() / n) + ) + ); + + Assert.assertEquals(n, rolling.estimateCardinality(), n * 0.05); + } + + @Test + public void testFoldingByteBuffers() throws Exception + { + final int[] numValsToCheck = {10, 20, 50, 100, 1000, 2000}; + for (int numThings : numValsToCheck) { + HyperLogLogCollector allCombined = HyperLogLogCollector.makeLatestCollector(); + HyperLogLogCollector oneHalf = HyperLogLogCollector.makeLatestCollector(); + HyperLogLogCollector otherHalf = HyperLogLogCollector.makeLatestCollector(); + + for (int i = 0; i < numThings; ++i) { + byte[] hashedVal = fn.hashLong(random.nextLong()).asBytes(); + + allCombined.add(hashedVal); + if (i % 2 == 0) { + oneHalf.add(hashedVal); + } else { + otherHalf.add(hashedVal); + } + } + + HyperLogLogCollector folded = HyperLogLogCollector.makeLatestCollector(); + + folded.fold(oneHalf.toByteBuffer()); + Assert.assertEquals(oneHalf, folded); + Assert.assertEquals(oneHalf.estimateCardinality(), folded.estimateCardinality(), 0.0d); + + folded.fold(otherHalf.toByteBuffer()); + Assert.assertEquals(allCombined, folded); + Assert.assertEquals(allCombined.estimateCardinality(), folded.estimateCardinality(), 0.0d); + } + } + + @Test + public void testFoldingReadOnlyByteBuffers() throws Exception + { + final int[] numValsToCheck = {10, 20, 50, 100, 1000, 2000}; + for (int numThings : numValsToCheck) { + HyperLogLogCollector allCombined = HyperLogLogCollector.makeLatestCollector(); + HyperLogLogCollector oneHalf = HyperLogLogCollector.makeLatestCollector(); + HyperLogLogCollector otherHalf = HyperLogLogCollector.makeLatestCollector(); + + for (int i = 0; i < numThings; ++i) { + byte[] hashedVal = fn.hashLong(random.nextLong()).asBytes(); + + allCombined.add(hashedVal); + if (i % 2 == 0) { + oneHalf.add(hashedVal); + } else { + otherHalf.add(hashedVal); + } + } + + HyperLogLogCollector folded = HyperLogLogCollector.makeCollector( + ByteBuffer.wrap(HyperLogLogCollector.makeEmptyVersionedByteArray()) + .asReadOnlyBuffer() + ); + + folded.fold(oneHalf.toByteBuffer()); + Assert.assertEquals(oneHalf, folded); + Assert.assertEquals(oneHalf.estimateCardinality(), folded.estimateCardinality(), 0.0d); + + folded.fold(otherHalf.toByteBuffer()); + Assert.assertEquals(allCombined, folded); + Assert.assertEquals(allCombined.estimateCardinality(), folded.estimateCardinality(), 0.0d); + } + } + + @Test + public void testFoldingReadOnlyByteBuffersWithArbitraryPosition() throws Exception + { + final int[] numValsToCheck = {10, 20, 50, 100, 1000, 2000}; + for (int numThings : numValsToCheck) { + HyperLogLogCollector allCombined = HyperLogLogCollector.makeLatestCollector(); + HyperLogLogCollector oneHalf = HyperLogLogCollector.makeLatestCollector(); + HyperLogLogCollector otherHalf = HyperLogLogCollector.makeLatestCollector(); + + for (int 
i = 0; i < numThings; ++i) { + byte[] hashedVal = fn.hashLong(random.nextLong()).asBytes(); + + allCombined.add(hashedVal); + if (i % 2 == 0) { + oneHalf.add(hashedVal); + } else { + otherHalf.add(hashedVal); + } + } + + HyperLogLogCollector folded = HyperLogLogCollector.makeCollector( + shiftedBuffer( + ByteBuffer.wrap(HyperLogLogCollector.makeEmptyVersionedByteArray()) + .asReadOnlyBuffer(), + 17 + ) + ); + + folded.fold(oneHalf.toByteBuffer()); + Assert.assertEquals(oneHalf, folded); + Assert.assertEquals(oneHalf.estimateCardinality(), folded.estimateCardinality(), 0.0d); + + folded.fold(otherHalf.toByteBuffer()); + Assert.assertEquals(allCombined, folded); + Assert.assertEquals(allCombined.estimateCardinality(), folded.estimateCardinality(), 0.0d); + } + } + + @Test + public void testFoldWithDifferentOffsets1() throws Exception + { + ByteBuffer biggerOffset = makeCollectorBuffer(1, (byte) 0x00, 0x11); + ByteBuffer smallerOffset = makeCollectorBuffer(0, (byte) 0x20, 0x00); + + HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); + collector.fold(biggerOffset); + collector.fold(smallerOffset); + + ByteBuffer outBuffer = collector.toByteBuffer(); + + Assert.assertEquals(outBuffer.get(), collector.getVersion()); + Assert.assertEquals(outBuffer.get(), 1); + Assert.assertEquals(outBuffer.getShort(), 2047); + outBuffer.get(); + outBuffer.getShort(); + Assert.assertEquals(outBuffer.get(), 0x10); + while (outBuffer.hasRemaining()) { + Assert.assertEquals(outBuffer.get(), 0x11); + } + + collector = HyperLogLogCollector.makeLatestCollector(); + collector.fold(smallerOffset); + collector.fold(biggerOffset); + + outBuffer = collector.toByteBuffer(); + + Assert.assertEquals(outBuffer.get(), collector.getVersion()); + Assert.assertEquals(outBuffer.get(), 1); + Assert.assertEquals(outBuffer.getShort(), 2047); + Assert.assertEquals(outBuffer.get(), 0); + Assert.assertEquals(outBuffer.getShort(), 0); + Assert.assertEquals(outBuffer.get(), 0x10); + while (outBuffer.hasRemaining()) { + Assert.assertEquals(outBuffer.get(), 0x11); + } + } + + @Test + public void testFoldWithArbitraryInitialPositions() throws Exception + { + ByteBuffer biggerOffset = shiftedBuffer(makeCollectorBuffer(1, (byte) 0x00, 0x11), 10); + ByteBuffer smallerOffset = shiftedBuffer(makeCollectorBuffer(0, (byte) 0x20, 0x00), 15); + + HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); + collector.fold(biggerOffset); + collector.fold(smallerOffset); + + ByteBuffer outBuffer = collector.toByteBuffer(); + + Assert.assertEquals(outBuffer.get(), collector.getVersion()); + Assert.assertEquals(outBuffer.get(), 1); + Assert.assertEquals(outBuffer.getShort(), 2047); + outBuffer.get(); + outBuffer.getShort(); + Assert.assertEquals(outBuffer.get(), 0x10); + while (outBuffer.hasRemaining()) { + Assert.assertEquals(outBuffer.get(), 0x11); + } + + collector = HyperLogLogCollector.makeLatestCollector(); + collector.fold(smallerOffset); + collector.fold(biggerOffset); + + outBuffer = collector.toByteBuffer(); + + Assert.assertEquals(outBuffer.get(), collector.getVersion()); + Assert.assertEquals(outBuffer.get(), 1); + Assert.assertEquals(outBuffer.getShort(), 2047); + outBuffer.get(); + outBuffer.getShort(); + Assert.assertEquals(outBuffer.get(), 0x10); + while (outBuffer.hasRemaining()) { + Assert.assertEquals(outBuffer.get(), 0x11); + } + } + + protected ByteBuffer shiftedBuffer(ByteBuffer buf, int offset) + { + ByteBuffer shifted = ByteBuffer.allocate(buf.remaining() + offset); + 
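// shiftedBuffer pads the front of the destination with `offset` unused bytes: advance the
// write position past the padding, copy the source collector bytes, then reset the position
// back to the padding boundary so callers get a buffer whose readable content starts at a
// non-zero position — the situation exercised by the *ArbitraryPosition(s) tests above.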
shifted.position(offset); + shifted.put(buf); + shifted.position(offset); + return shifted; + } + + @Test + public void testFoldWithDifferentOffsets2() throws Exception + { + ByteBuffer biggerOffset = makeCollectorBuffer(1, (byte) 0x01, 0x11); + ByteBuffer smallerOffset = makeCollectorBuffer(0, (byte) 0x20, 0x00); + + HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); + collector.fold(biggerOffset); + collector.fold(smallerOffset); + + ByteBuffer outBuffer = collector.toByteBuffer(); + + Assert.assertEquals(outBuffer.get(), collector.getVersion()); + Assert.assertEquals(outBuffer.get(), 2); + Assert.assertEquals(outBuffer.getShort(), 0); + outBuffer.get(); + outBuffer.getShort(); + Assert.assertFalse(outBuffer.hasRemaining()); + + collector = HyperLogLogCollector.makeLatestCollector(); + collector.fold(smallerOffset); + collector.fold(biggerOffset); + + outBuffer = collector.toByteBuffer(); + + Assert.assertEquals(outBuffer.get(), collector.getVersion()); + Assert.assertEquals(outBuffer.get(), 2); + Assert.assertEquals(outBuffer.getShort(), 0); + outBuffer.get(); + outBuffer.getShort(); + Assert.assertFalse(outBuffer.hasRemaining()); + } + + @Test + public void testFoldWithUpperNibbleTriggersOffsetChange() throws Exception + { + byte[] arr1 = new byte[HyperLogLogCollector.getLatestNumBytesForDenseStorage()]; + Arrays.fill(arr1, (byte) 0x11); + ByteBuffer buffer1 = ByteBuffer.wrap(arr1); + buffer1.put(0, HLLCV1.VERSION); + buffer1.put(1, (byte) 0); + buffer1.putShort(2, (short) (2047)); + buffer1.put(HLLCV1.HEADER_NUM_BYTES, (byte) 0x1); + + byte[] arr2 = new byte[HyperLogLogCollector.getLatestNumBytesForDenseStorage()]; + Arrays.fill(arr2, (byte) 0x11); + ByteBuffer buffer2 = ByteBuffer.wrap(arr2); + buffer2.put(0, HLLCV1.VERSION); + buffer2.put(1, (byte) 0); + buffer2.putShort(2, (short) (2048)); + + HyperLogLogCollector collector = HyperLogLogCollector.makeCollector(buffer1); + collector.fold(buffer2); + + ByteBuffer outBuffer = collector.toByteBuffer(); + + Assert.assertEquals(outBuffer.get(), HLLCV1.VERSION); + Assert.assertEquals(outBuffer.get(), 1); + Assert.assertEquals(outBuffer.getShort(), 0); + outBuffer.get(); + outBuffer.getShort(); + Assert.assertFalse(outBuffer.hasRemaining()); + } + + @Test + public void testSparseFoldWithDifferentOffsets1() throws Exception + { + ByteBuffer biggerOffset = makeCollectorBuffer(1, new byte[]{0x11, 0x10}, 0x11); + ByteBuffer sparse = HyperLogLogCollector.makeCollector(makeCollectorBuffer(0, new byte[]{0x00, 0x02}, 0x00)) + .toByteBuffer(); + + HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); + collector.fold(biggerOffset); + collector.fold(sparse); + + ByteBuffer outBuffer = collector.toByteBuffer(); + + Assert.assertEquals(outBuffer.get(), collector.getVersion()); + Assert.assertEquals(outBuffer.get(), 2); + Assert.assertEquals(outBuffer.getShort(), 0); + Assert.assertEquals(outBuffer.get(), 0); + Assert.assertEquals(outBuffer.getShort(), 0); + Assert.assertFalse(outBuffer.hasRemaining()); + + collector = HyperLogLogCollector.makeLatestCollector(); + collector.fold(sparse); + collector.fold(biggerOffset); + + outBuffer = collector.toByteBuffer(); + + Assert.assertEquals(outBuffer.get(), collector.getVersion()); + Assert.assertEquals(outBuffer.get(), 2); + Assert.assertEquals(outBuffer.getShort(), 0); + Assert.assertEquals(outBuffer.get(), 0); + Assert.assertEquals(outBuffer.getShort(), 0); + Assert.assertFalse(outBuffer.hasRemaining()); + } + + private ByteBuffer makeCollectorBuffer(int 
offset, byte initialBytes, int remainingBytes) + { + return makeCollectorBuffer(offset, new byte[]{initialBytes}, remainingBytes); + } + + private ByteBuffer makeCollectorBuffer(int offset, byte[] initialBytes, int remainingBytes) + { + short numNonZero = 0; + for (byte initialByte : initialBytes) { + numNonZero += computeNumNonZero(initialByte); + } + + final short numNonZeroInRemaining = computeNumNonZero((byte) remainingBytes); + numNonZero += (HyperLogLogCollector.NUM_BYTES_FOR_BUCKETS - initialBytes.length) * numNonZeroInRemaining; + + ByteBuffer biggerOffset = ByteBuffer.allocate(HyperLogLogCollector.getLatestNumBytesForDenseStorage()); + biggerOffset.put(HLLCV1.VERSION); + biggerOffset.put((byte) offset); + biggerOffset.putShort(numNonZero); + biggerOffset.put((byte) 0); + biggerOffset.putShort((short) 0); + biggerOffset.put(initialBytes); + while (biggerOffset.hasRemaining()) { + biggerOffset.put((byte) remainingBytes); + } + biggerOffset.clear(); + return biggerOffset.asReadOnlyBuffer(); + } + + private short computeNumNonZero(byte theByte) + { + short retVal = 0; + if ((theByte & 0x0f) > 0) { + ++retVal; + } + if ((theByte & 0xf0) > 0) { + ++retVal; + } + return retVal; + } + + //@Test // This test can help when finding potential combinations that are weird, but it's non-deterministic + public void testFoldingwithDifferentOffsets() throws Exception + { + for (int j = 0; j < 10; j++) { + HyperLogLogCollector smallVals = HyperLogLogCollector.makeLatestCollector(); + HyperLogLogCollector bigVals = HyperLogLogCollector.makeLatestCollector(); + HyperLogLogCollector all = HyperLogLogCollector.makeLatestCollector(); + + int numThings = 500000; + for (int i = 0; i < numThings; i++) { + byte[] hashedVal = fn.hashLong(random.nextLong()).asBytes(); + + if (i < 1000) { + smallVals.add(hashedVal); + } else { + bigVals.add(hashedVal); + } + all.add(hashedVal); + } + + HyperLogLogCollector folded = HyperLogLogCollector.makeLatestCollector(); + folded.fold(smallVals); + folded.fold(bigVals); + final double expected = all.estimateCardinality(); + Assert.assertEquals(expected, folded.estimateCardinality(), expected * 0.025); + Assert.assertEquals(numThings, folded.estimateCardinality(), numThings * 0.05); + } + } + + //@Test + public void testFoldingwithDifferentOffsets2() throws Exception + { + MessageDigest md = MessageDigest.getInstance("SHA-1"); + + for (int j = 0; j < 1; j++) { + HyperLogLogCollector evenVals = HyperLogLogCollector.makeLatestCollector(); + HyperLogLogCollector oddVals = HyperLogLogCollector.makeLatestCollector(); + HyperLogLogCollector all = HyperLogLogCollector.makeLatestCollector(); + + int numThings = 500000; + for (int i = 0; i < numThings; i++) { + md.update(Integer.toString(random.nextInt()).getBytes()); + byte[] hashedVal = fn.hashBytes(md.digest()).asBytes(); + + if (i % 2 == 0) { + evenVals.add(hashedVal); + } else { + oddVals.add(hashedVal); + } + all.add(hashedVal); + } + + HyperLogLogCollector folded = HyperLogLogCollector.makeLatestCollector(); + folded.fold(evenVals); + folded.fold(oddVals); + final double expected = all.estimateCardinality(); + Assert.assertEquals(expected, folded.estimateCardinality(), expected * 0.025); + Assert.assertEquals(numThings, folded.estimateCardinality(), numThings * 0.05); + } + } + + @Test + public void testEstimation() throws Exception + { + Random random = new Random(0l); + + final int[] valsToCheck = {10, 20, 50, 100, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 1000000, 2000000}; + final double[] expectedVals = { + 
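// Expected cardinality estimates recorded for the fixed Random(0) seed and the test's hash
// function; asserting the exact double values (delta 0.0d) keeps these estimation tests
// deterministic across runs.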
11.029647221949576, 21.108407720752034, 51.64575281885815, 100.42231726408892, + 981.8579991802412, 1943.1337257462792, 4946.192042635218, 9935.088157579434, + 20366.1486889433, 49433.56029693898, 100615.26273314281, 980831.624899156000, + 1982408.2608981386 + }; + + int valsToCheckIndex = 0; + HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); + for (int i = 0; i < valsToCheck[valsToCheck.length - 1]; ++i) { + collector.add(fn.hashLong(random.nextLong()).asBytes()); + if (i == valsToCheck[valsToCheckIndex]) { + Assert.assertEquals(expectedVals[valsToCheckIndex], collector.estimateCardinality(), 0.0d); + ++valsToCheckIndex; + } + } + Assert.assertEquals(expectedVals.length, valsToCheckIndex + 1); + Assert.assertEquals(expectedVals[valsToCheckIndex], collector.estimateCardinality(), 0.0d); + } + + @Test + public void testEstimationReadOnlyByteBuffers() throws Exception + { + Random random = new Random(0l); + + final int[] valsToCheck = {10, 20, 50, 100, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 1000000, 2000000}; + final double[] expectedVals = { + 11.029647221949576, 21.108407720752034, 51.64575281885815, 100.42231726408892, + 981.8579991802412, 1943.1337257462792, 4946.192042635218, 9935.088157579434, + 20366.1486889433, 49433.56029693898, 100615.26273314281, 980831.624899156000, + 1982408.2608981386 + }; + + int valsToCheckIndex = 0; + HyperLogLogCollector collector = HyperLogLogCollector.makeCollector( + ByteBuffer.allocateDirect( + HyperLogLogCollector.getLatestNumBytesForDenseStorage() + ) + ); + for (int i = 0; i < valsToCheck[valsToCheck.length - 1]; ++i) { + collector.add(fn.hashLong(random.nextLong()).asBytes()); + if (i == valsToCheck[valsToCheckIndex]) { + Assert.assertEquals(expectedVals[valsToCheckIndex], collector.estimateCardinality(), 0.0d); + ++valsToCheckIndex; + } + } + Assert.assertEquals(expectedVals.length, valsToCheckIndex + 1); + Assert.assertEquals(expectedVals[valsToCheckIndex], collector.estimateCardinality(), 0.0d); + } + + @Test + public void testEstimationLimitDifferentFromCapacity() throws Exception + { + Random random = new Random(0l); + + final int[] valsToCheck = {10, 20, 50, 100, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 1000000, 2000000}; + final double[] expectedVals = { + 11.029647221949576, 21.108407720752034, 51.64575281885815, 100.42231726408892, + 981.8579991802412, 1943.1337257462792, 4946.192042635218, 9935.088157579434, + 20366.1486889433, 49433.56029693898, 100615.26273314281, 980831.624899156000, + 1982408.2608981386 + }; + + int valsToCheckIndex = 0; + HyperLogLogCollector collector = HyperLogLogCollector.makeCollector( + (ByteBuffer) ByteBuffer.allocate(10000) + .position(0) + .limit(HyperLogLogCollector.getLatestNumBytesForDenseStorage()) + ); + for (int i = 0; i < valsToCheck[valsToCheck.length - 1]; ++i) { + collector.add(fn.hashLong(random.nextLong()).asBytes()); + if (i == valsToCheck[valsToCheckIndex]) { + Assert.assertEquals(expectedVals[valsToCheckIndex], collector.estimateCardinality(), 0.0d); + ++valsToCheckIndex; + } + } + Assert.assertEquals(expectedVals.length, valsToCheckIndex + 1); + Assert.assertEquals(expectedVals[valsToCheckIndex], collector.estimateCardinality(), 0.0d); + } + + @Test + public void testSparseEstimation() throws Exception + { + HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); + + for (int i = 0; i < 100; ++i) { + collector.add(fn.hashLong(random.nextLong()).asBytes()); + } + + Assert.assertEquals( + collector.estimateCardinality(), 
collector.estimateByteBuffer(collector.toByteBuffer()), 0.0d + ); + } + + @Test + public void testHighBits() throws Exception + { + HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); + + // fill up all the buckets so we reach a registerOffset of 49 + fillBuckets(collector, (byte) 0, (byte) 49); + + // highest possible bit position is 64 + collector.add(new byte[]{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + Assert.assertEquals(8.5089685793441677E17, collector.estimateCardinality(), 1000); + + // this might happen once in a million years if you hash a billion values a second + fillBuckets(collector, (byte) 0, (byte) 63); + collector.add(new byte[]{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + + Assert.assertEquals(Double.MAX_VALUE, collector.estimateCardinality(), 1000); + } + + @Test + public void testCompare1() throws Exception + { + HyperLogLogCollector collector1 = HyperLogLogCollector.makeLatestCollector(); + HyperLogLogCollector collector2 = HyperLogLogCollector.makeLatestCollector(); + collector1.add(fn.hashLong(0).asBytes()); + HyperUniquesAggregatorFactory factory = new HyperUniquesAggregatorFactory("foo", "bar"); + Comparator comparator = factory.getComparator(); + for (int i = 1; i < 100; i = i + 2) { + collector1.add(fn.hashLong(i).asBytes()); + collector2.add(fn.hashLong(i + 1).asBytes()); + Assert.assertEquals(1, comparator.compare(collector1, collector2)); + Assert.assertEquals(1, Double.compare(collector1.estimateCardinality(), collector2.estimateCardinality())); + } + } + + @Test + public void testCompare2() throws Exception + { + Random rand = new Random(0); + HyperUniquesAggregatorFactory factory = new HyperUniquesAggregatorFactory("foo", "bar"); + Comparator comparator = factory.getComparator(); + for (int i = 1; i < 1000; ++i) { + HyperLogLogCollector collector1 = HyperLogLogCollector.makeLatestCollector(); + int j = rand.nextInt(50); + for (int l = 0; l < j; ++l) { + collector1.add(fn.hashLong(rand.nextLong()).asBytes()); + } + + HyperLogLogCollector collector2 = HyperLogLogCollector.makeLatestCollector(); + int k = j + 1 + rand.nextInt(5); + for (int l = 0; l < k; ++l) { + collector2.add(fn.hashLong(rand.nextLong()).asBytes()); + } + + Assert.assertEquals( + Double.compare(collector1.estimateCardinality(), collector2.estimateCardinality()), + comparator.compare(collector1, collector2) + ); + } + + for (int i = 1; i < 100; ++i) { + HyperLogLogCollector collector1 = HyperLogLogCollector.makeLatestCollector(); + int j = rand.nextInt(500); + for (int l = 0; l < j; ++l) { + collector1.add(fn.hashLong(rand.nextLong()).asBytes()); + } + + HyperLogLogCollector collector2 = HyperLogLogCollector.makeLatestCollector(); + int k = j + 2 + rand.nextInt(5); + for (int l = 0; l < k; ++l) { + collector2.add(fn.hashLong(rand.nextLong()).asBytes()); + } + + Assert.assertEquals( + Double.compare(collector1.estimateCardinality(), collector2.estimateCardinality()), + comparator.compare(collector1, collector2) + ); + } + + for (int i = 1; i < 10; ++i) { + HyperLogLogCollector collector1 = HyperLogLogCollector.makeLatestCollector(); + int j = rand.nextInt(100000); + for (int l = 0; l < j; ++l) { + collector1.add(fn.hashLong(rand.nextLong()).asBytes()); + } + + HyperLogLogCollector collector2 = HyperLogLogCollector.makeLatestCollector(); + int k = j + 20000 + rand.nextInt(100000); + for (int l = 0; l < k; ++l) { + collector2.add(fn.hashLong(rand.nextLong()).asBytes()); + } + + Assert.assertEquals( + Double.compare(collector1.estimateCardinality(), collector2.estimateCardinality()), + 
comparator.compare(collector1, collector2) + ); + } + } + + + private static void fillBuckets(HyperLogLogCollector collector, byte startOffset, byte endOffset) + { + byte offset = startOffset; + while (offset <= endOffset) { + // fill buckets to shift registerOffset + for (short bucket = 0; bucket < 2048; ++bucket) { + collector.add(bucket, offset); + } + offset++; + } + } + + // Provides a nice printout of error rates as a function of cardinality + //@Test + public void showErrorRate() throws Exception + { + HashFunction fn = Hashing.murmur3_128(); + Random random = new Random(); + + double error = 0.0d; + int count = 0; + + final int[] valsToCheck = { + 10, 20, 50, 100, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 1000000, 2000000, 10000000, Integer.MAX_VALUE + }; + + for (int numThings : valsToCheck) { + long startTime = System.currentTimeMillis(); + HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); + + for (int i = 0; i < numThings; ++i) { + if (i != 0 && i % 100000000 == 0) { + ++count; + error = computeError(error, count, i, startTime, collector); + } + collector.add(fn.hashLong(random.nextLong()).asBytes()); + } + + ++count; + error = computeError(error, count, numThings, startTime, collector); + } + } + + private double computeError(double error, int count, int numThings, long startTime, HyperLogLogCollector collector) + { + final double estimatedValue = collector.estimateCardinality(); + final double errorThisTime = Math.abs((double) numThings - estimatedValue) / numThings; + + error += errorThisTime; + + System.out.printf( + "%,d ==? %,f in %,d millis. actual error[%,f%%], avg. error [%,f%%]%n", + numThings, + estimatedValue, + System.currentTimeMillis() - startTime, + 100 * errorThisTime, + (error / count) * 100 + ); + return error; + } +} diff --git a/processing/src/test/java/io/druid/query/aggregation/hyperloglog/HyperUniqueFinalizingPostAggregatorTest.java b/processing/src/test/java/io/druid/query/aggregation/hyperloglog/HyperUniqueFinalizingPostAggregatorTest.java new file mode 100644 index 00000000000..5a0f0de6617 --- /dev/null +++ b/processing/src/test/java/io/druid/query/aggregation/hyperloglog/HyperUniqueFinalizingPostAggregatorTest.java @@ -0,0 +1,54 @@ +/* + * Druid - a distributed column store. + * Copyright (C) 2012, 2013 Metamarkets Group Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +package io.druid.query.aggregation.hyperloglog; + +import com.google.common.collect.ImmutableMap; +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import org.junit.Assert; +import org.junit.Test; + +import java.util.Random; + +/** + */ +public class HyperUniqueFinalizingPostAggregatorTest +{ + private final HashFunction fn = Hashing.murmur3_128(); + + @Test + public void testCompute() throws Exception + { + Random random = new Random(0l); + HyperUniqueFinalizingPostAggregator postAggregator = new HyperUniqueFinalizingPostAggregator( + "uniques" + ); + HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); + + for (int i = 0; i < 100; ++i) { + byte[] hashedVal = fn.hashLong(random.nextLong()).asBytes(); + collector.add(hashedVal); + } + + double cardinality = (Double) postAggregator.compute(ImmutableMap.of("uniques", collector)); + + Assert.assertTrue(cardinality == 99.37233005831612); + } +} diff --git a/processing/src/test/java/io/druid/query/aggregation/hyperloglog/HyperUniquesAggregatorFactoryTest.java b/processing/src/test/java/io/druid/query/aggregation/hyperloglog/HyperUniquesAggregatorFactoryTest.java new file mode 100644 index 00000000000..44162e8db51 --- /dev/null +++ b/processing/src/test/java/io/druid/query/aggregation/hyperloglog/HyperUniquesAggregatorFactoryTest.java @@ -0,0 +1,48 @@ +/* + * Druid - a distributed column store. + * Copyright (C) 2012, 2013 Metamarkets Group Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +package io.druid.query.aggregation.hyperloglog; + +import junit.framework.Assert; +import org.junit.Test; + +public class HyperUniquesAggregatorFactoryTest +{ + final static HyperUniquesAggregatorFactory aggregatorFactory = new HyperUniquesAggregatorFactory( + "hyperUnique", + "uniques" + ); + final static String V0_BASE64 = "AAYbEyQwFyQVASMCVFEQQgEQIxIhM4ISAQMhUkICEDFDIBMhMgFQFAFAMjAAEhEREyVAEiUBAhIjISATMCECMiERIRIiVRFRAyIAEgFCQSMEJAITATAAEAMQgCEBEjQiAyUTAyEQASJyAGURAAISAwISATETQhAREBYDIVIlFTASAzJgERIgRCcmUyAwNAMyEJMjIhQXQhEWECABQDETATEREjIRAgEyIiMxMBQiAkBBMDYAMEQQACMzMhIkMTQSkYIRABIBADMBAhIEISAENkEBQDAxETMAIEEwEzQiQSEVQSFBBAQDICIiAVIAMTAQIQYBIRABADMDEzEAQSMkEiAYFBAQI0AmECEyQSARRTIVMhEkMiKAMCUBxUghAkIBI3EmMAQiACEAJDJCAAADOzESEDBCRjMgEUQQETQwEWIhA6MlAiAAZDI1AgEIIDUyFDIHMQEEAwIRBRABBStCZCQhAgJSMQIiQEEURTBmM1MxACIAETGhMgQnBRICNiIREyIUNAEAAkABAwQSEBJBIhIhIRERAiIRACUhEUAVMkQGEVMjECYjACBwEQQSIRIgAAEyExQUFSEAIBJCIDIDYTAgMiNBIUADUiETADMoFEADETMCIwUEQkIAESMSIzIABDERIXEhIiACQgUSEgJiQCAUARIRAREDQiEUAkQgAgQiIEAzIxRCARIgBAAVAzMAECEwE0Qh8gAAASEhEiAiMhUxcRImIVABATYyUBAwIoE1QhRDIiYBIBEBEiQSQyERAAADMAARAEACFYUwQSQBIRIgURITARFSEzEHEBACOTMREBIAMjIgEhU0cxEQIRIhIi1wEgMRUBEgMQIRAnAVASURMHQBAiEyBSAAEBQTAWQ5EQA0IUMSISAUEiASIjIhMhMFJBBSEjEAECEwACASEQFBAjARITEQIgYTEKEAeAAiMkEyARowARFBAicRISIBIxAQAgEBARMCIRQgMSIVIAkjMxIAIEMyADASMgFRIjEyKjEjBBIEQCUAARYBEQMxMCIBACNCACRCMlEzUUAAUDM1MhAjEgAxAAISAVFQECAhQAMBMhEzEgASNxAhFRIxECMRJBQAERAToBgQMhJSRQFAEhAwMiIhMQAwAgQiBQJiIGMQQhEiQxR1MiAjIAIEEiAkARECEzQlMjECIRATBgIhEBQAIQAEATEjBCMwAgMBMhAhIyFBIxQAARI1AAEABCIDFBIRUzMBIgAgEiARQCASMQQDQCFBAQAUJwMUElAyIAIRBSIRITICEAIxMAEUBEYTcBMBEEIxMREwIRIDAGIAEgYxBAEANCAhBAI2UhIiIgIRABIEVRAwNEIQERQgEFMhFCQSIAEhQDMTEQMiAjJyEQ=="; + + @Test + public void testDeserializeV0() throws Exception + { + Object v0 = aggregatorFactory.deserialize(V0_BASE64); + Assert.assertEquals("deserialized value is HLLCV0", HLLCV0.class, v0.getClass()); + } + + @Test + public void testCombineStartValueV0() throws Exception + { + Object combined = aggregatorFactory.getAggregatorStartValue(); + aggregatorFactory.combine(combined, aggregatorFactory.deserialize(V0_BASE64)); + } + + +} diff --git a/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerTest.java b/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerTest.java index e1556e94da7..f336a26884e 100644 --- a/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerTest.java +++ b/processing/src/test/java/io/druid/query/groupby/GroupByQueryRunnerTest.java @@ -38,11 +38,14 @@ import io.druid.granularity.QueryGranularity; import io.druid.query.Query; import io.druid.query.QueryRunner; import io.druid.query.QueryRunnerTestHelper; +import io.druid.query.QueryToolChest; import io.druid.query.aggregation.AggregatorFactory; import io.druid.query.aggregation.DoubleSumAggregatorFactory; import io.druid.query.aggregation.LongSumAggregatorFactory; +import io.druid.query.aggregation.MaxAggregatorFactory; import io.druid.query.dimension.DefaultDimensionSpec; import io.druid.query.dimension.DimensionSpec; +import io.druid.query.filter.JavaScriptDimFilter; import io.druid.query.filter.RegexDimFilter; import io.druid.query.groupby.having.EqualToHavingSpec; import io.druid.query.groupby.having.GreaterThanHavingSpec; @@ -56,7 +59,9 @@ import org.joda.time.DateTime; import org.joda.time.DateTimeZone; import org.joda.time.Interval; import org.joda.time.Period; +import org.junit.Assert; import org.junit.Before; +import org.junit.Ignore; import org.junit.Test; import org.junit.runner.RunWith; 
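// The imports added in this hunk support the new subquery tests further down: QueryToolChest
// for merging results in runQuery, MaxAggregatorFactory for the outer aggregations, and
// JavaScriptDimFilter for the identical-subquery case.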
import org.junit.runners.Parameterized; @@ -90,22 +95,24 @@ public class GroupByQueryRunnerTest config.setMaxIntermediateRows(10000); final Supplier configSupplier = Suppliers.ofInstance(config); - final GroupByQueryRunnerFactory factory = new GroupByQueryRunnerFactory( - new GroupByQueryEngine( - configSupplier, - new StupidPool( - new Supplier() - { - @Override - public ByteBuffer get() - { - return ByteBuffer.allocate(1024 * 1024); - } - } - ) - ), + final GroupByQueryEngine engine = new GroupByQueryEngine( configSupplier, - new GroupByQueryQueryToolChest(configSupplier) + new StupidPool( + new Supplier() + { + @Override + public ByteBuffer get() + { + return ByteBuffer.allocate(1024 * 1024); + } + } + ) + ); + + final GroupByQueryRunnerFactory factory = new GroupByQueryRunnerFactory( + engine, + configSupplier, + new GroupByQueryQueryToolChest(configSupplier, engine) ); return Lists.newArrayList( @@ -167,8 +174,7 @@ public class GroupByQueryRunnerTest createExpectedRow("2011-04-02", "alias", "travel", "rows", 1L, "idx", 126L) ); - Iterable results = Sequences.toList(runner.run(query), Lists.newArrayList()); - + Iterable results = runQuery(query); TestHelper.assertExpectedObjects(expectedResults, results, ""); } @@ -178,33 +184,33 @@ public class GroupByQueryRunnerTest DateTimeZone tz = DateTimeZone.forID("America/Los_Angeles"); GroupByQuery query = GroupByQuery.builder() - .setDataSource(QueryRunnerTestHelper.dataSource) - .setInterval("2011-03-31T00:00:00-07:00/2011-04-02T00:00:00-07:00") - .setDimensions( - Lists.newArrayList( - (DimensionSpec) new DefaultDimensionSpec( - "quality", - "alias" - ) - ) - ) - .setAggregatorSpecs( - Arrays.asList( - QueryRunnerTestHelper.rowsCount, - new LongSumAggregatorFactory( - "idx", - "index" - ) - ) - ) - .setGranularity( - new PeriodGranularity( - new Period("P1D"), - null, - tz - ) - ) - .build(); + .setDataSource(QueryRunnerTestHelper.dataSource) + .setInterval("2011-03-31T00:00:00-07:00/2011-04-02T00:00:00-07:00") + .setDimensions( + Lists.newArrayList( + (DimensionSpec) new DefaultDimensionSpec( + "quality", + "alias" + ) + ) + ) + .setAggregatorSpecs( + Arrays.asList( + QueryRunnerTestHelper.rowsCount, + new LongSumAggregatorFactory( + "idx", + "index" + ) + ) + ) + .setGranularity( + new PeriodGranularity( + new Period("P1D"), + null, + tz + ) + ) + .build(); List expectedResults = Arrays.asList( createExpectedRow(new DateTime("2011-03-31", tz), "alias", "automotive", "rows", 1L, "idx", 135L), @@ -228,11 +234,7 @@ public class GroupByQueryRunnerTest createExpectedRow(new DateTime("2011-04-01", tz), "alias", "travel", "rows", 1L, "idx", 126L) ); - Iterable results = Sequences.toList( - runner.run(query), - Lists.newArrayList() - ); - + Iterable results = runQuery(query); TestHelper.assertExpectedObjects(expectedResults, results, ""); } @@ -661,7 +663,21 @@ public class GroupByQueryRunnerTest createExpectedRow("2011-04-01", "quality", "automotive", "rows", 2L) ); - QueryRunner mergeRunner = new GroupByQueryQueryToolChest(configSupplier).mergeResults(runner); + final GroupByQueryEngine engine = new GroupByQueryEngine( + configSupplier, + new StupidPool( + new Supplier() + { + @Override + public ByteBuffer get() + { + return ByteBuffer.allocate(1024 * 1024); + } + } + ) + ); + + QueryRunner mergeRunner = new GroupByQueryQueryToolChest(configSupplier, engine).mergeResults(runner); TestHelper.assertExpectedObjects(expectedResults, mergeRunner.run(query), "no-limit"); } @@ -696,7 +712,21 @@ public class GroupByQueryRunnerTest ); 
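// GroupByQueryQueryToolChest now takes the GroupByQueryEngine as a second constructor
// argument, so the mergeResults tests in this file each build an engine inline before
// creating the tool chest. A minimal sketch of that construction as a hypothetical shared
// helper (not part of this patch; the generic type parameters and the GroupByQueryConfig
// name are assumptions, since the flattened diff text drops angle-bracketed generics):
//
//   import com.google.common.base.Supplier;
//   import io.druid.collections.StupidPool;
//   import java.nio.ByteBuffer;
//
//   private static GroupByQueryEngine makeTestEngine(final Supplier<GroupByQueryConfig> configSupplier)
//   {
//     return new GroupByQueryEngine(
//         configSupplier,
//         new StupidPool<ByteBuffer>(
//             new Supplier<ByteBuffer>()
//             {
//               @Override
//               public ByteBuffer get()
//               {
//                 // 1 MB intermediate buffer, matching the value used throughout these tests.
//                 return ByteBuffer.allocate(1024 * 1024);
//               }
//             }
//         )
//     );
//   }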
TestHelper.assertExpectedObjects(expectedResults, runner.run(query), "normal"); - QueryRunner mergeRunner = new GroupByQueryQueryToolChest(configSupplier).mergeResults(runner); + final GroupByQueryEngine engine = new GroupByQueryEngine( + configSupplier, + new StupidPool( + new Supplier() + { + @Override + public ByteBuffer get() + { + return ByteBuffer.allocate(1024 * 1024); + } + } + ) + ); + + QueryRunner mergeRunner = new GroupByQueryQueryToolChest(configSupplier, engine).mergeResults(runner); TestHelper.assertExpectedObjects(expectedResults, mergeRunner.run(query), "no-limit"); } @@ -731,10 +761,200 @@ public class GroupByQueryRunnerTest ); TestHelper.assertExpectedObjects(expectedResults, runner.run(query), "normal"); - QueryRunner mergeRunner = new GroupByQueryQueryToolChest(configSupplier).mergeResults(runner); + final GroupByQueryEngine engine = new GroupByQueryEngine( + configSupplier, + new StupidPool( + new Supplier() + { + @Override + public ByteBuffer get() + { + return ByteBuffer.allocate(1024 * 1024); + } + } + ) + ); + + QueryRunner mergeRunner = new GroupByQueryQueryToolChest(configSupplier, engine).mergeResults(runner); TestHelper.assertExpectedObjects(expectedResults, mergeRunner.run(query), "no-limit"); } + // A subquery identical to the query should yield identical results + @Test + public void testIdenticalSubquery() + { + GroupByQuery subquery = GroupByQuery + .builder() + .setDataSource(QueryRunnerTestHelper.dataSource) + .setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) + .setDimensions(Lists.newArrayList(new DefaultDimensionSpec("quality", "alias"))) + .setDimFilter(new JavaScriptDimFilter("quality", "function(dim){ return true; }")) + .setAggregatorSpecs( + Arrays.asList( + QueryRunnerTestHelper.rowsCount, + new LongSumAggregatorFactory("idx", "index") + ) + ) + .setGranularity(QueryRunnerTestHelper.dayGran) + .build(); + + GroupByQuery query = GroupByQuery + .builder() + .setDataSource(subquery) + .setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) + .setDimensions(Lists.newArrayList(new DefaultDimensionSpec("alias", "alias"))) + .setAggregatorSpecs( + Arrays.asList( + new LongSumAggregatorFactory("rows", "rows"), + new LongSumAggregatorFactory("idx", "idx") + ) + ) + .setGranularity(QueryRunnerTestHelper.dayGran) + .build(); + + List expectedResults = Arrays.asList( + createExpectedRow("2011-04-01", "alias", "automotive", "rows", 1L, "idx", 135L), + createExpectedRow("2011-04-01", "alias", "business", "rows", 1L, "idx", 118L), + createExpectedRow("2011-04-01", "alias", "entertainment", "rows", 1L, "idx", 158L), + createExpectedRow("2011-04-01", "alias", "health", "rows", 1L, "idx", 120L), + createExpectedRow("2011-04-01", "alias", "mezzanine", "rows", 3L, "idx", 2870L), + createExpectedRow("2011-04-01", "alias", "news", "rows", 1L, "idx", 121L), + createExpectedRow("2011-04-01", "alias", "premium", "rows", 3L, "idx", 2900L), + createExpectedRow("2011-04-01", "alias", "technology", "rows", 1L, "idx", 78L), + createExpectedRow("2011-04-01", "alias", "travel", "rows", 1L, "idx", 119L), + + createExpectedRow("2011-04-02", "alias", "automotive", "rows", 1L, "idx", 147L), + createExpectedRow("2011-04-02", "alias", "business", "rows", 1L, "idx", 112L), + createExpectedRow("2011-04-02", "alias", "entertainment", "rows", 1L, "idx", 166L), + createExpectedRow("2011-04-02", "alias", "health", "rows", 1L, "idx", 113L), + createExpectedRow("2011-04-02", "alias", "mezzanine", "rows", 3L, "idx", 2447L), + createExpectedRow("2011-04-02", "alias", "news", "rows", 
1L, "idx", 114L), + createExpectedRow("2011-04-02", "alias", "premium", "rows", 3L, "idx", 2505L), + createExpectedRow("2011-04-02", "alias", "technology", "rows", 1L, "idx", 97L), + createExpectedRow("2011-04-02", "alias", "travel", "rows", 1L, "idx", 126L) + ); + + // Subqueries are handled by the ToolChest + Iterable results = runQuery(query); + TestHelper.assertExpectedObjects(expectedResults, results, ""); + } + + @Test + public void testDifferentGroupingSubquery() + { + GroupByQuery subquery = GroupByQuery + .builder() + .setDataSource(QueryRunnerTestHelper.dataSource) + .setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) + .setDimensions(Lists.newArrayList(new DefaultDimensionSpec("quality", "alias"))) + .setAggregatorSpecs( + Arrays.asList( + QueryRunnerTestHelper.rowsCount, + new LongSumAggregatorFactory("idx", "index") + ) + ) + .setGranularity(QueryRunnerTestHelper.dayGran) + .build(); + + GroupByQuery query = GroupByQuery + .builder() + .setDataSource(subquery) + .setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) + .setAggregatorSpecs( + Arrays.asList( + new MaxAggregatorFactory("idx", "idx") + ) + ) + .setGranularity(QueryRunnerTestHelper.dayGran) + .build(); + + List expectedResults = Arrays.asList( + createExpectedRow("2011-04-01", "idx", 2900.0), + createExpectedRow("2011-04-02", "idx", 2505.0) + ); + + Iterable results = runQuery(query); + TestHelper.assertExpectedObjects(expectedResults, results, ""); + } + + @Test + public void testDifferentIntervalSubquery() + { + GroupByQuery subquery = GroupByQuery + .builder() + .setDataSource(QueryRunnerTestHelper.dataSource) + .setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) + .setDimensions(Lists.newArrayList(new DefaultDimensionSpec("quality", "alias"))) + .setAggregatorSpecs( + Arrays.asList( + QueryRunnerTestHelper.rowsCount, + new LongSumAggregatorFactory("idx", "index") + ) + ) + .setGranularity(QueryRunnerTestHelper.dayGran) + .build(); + + GroupByQuery query = GroupByQuery + .builder() + .setDataSource(subquery) + .setQuerySegmentSpec(QueryRunnerTestHelper.secondOnly) + .setAggregatorSpecs( + Arrays.asList( + new MaxAggregatorFactory("idx", "idx") + ) + ) + .setGranularity(QueryRunnerTestHelper.dayGran) + .build(); + + List expectedResults = Arrays.asList( + createExpectedRow("2011-04-02", "idx", 2505.0) + ); + + Iterable results = runQuery(query); + TestHelper.assertExpectedObjects(expectedResults, results, ""); + } + + @Test + public void testEmptySubquery() + { + GroupByQuery subquery = GroupByQuery + .builder() + .setDataSource(QueryRunnerTestHelper.dataSource) + .setQuerySegmentSpec(QueryRunnerTestHelper.emptyInterval) + .setDimensions(Lists.newArrayList(new DefaultDimensionSpec("quality", "alias"))) + .setAggregatorSpecs( + Arrays.asList( + QueryRunnerTestHelper.rowsCount, + new LongSumAggregatorFactory("idx", "index") + ) + ) + .setGranularity(QueryRunnerTestHelper.dayGran) + .build(); + + GroupByQuery query = GroupByQuery + .builder() + .setDataSource(subquery) + .setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) + .setAggregatorSpecs( + Arrays.asList( + new MaxAggregatorFactory("idx", "idx") + ) + ) + .setGranularity(QueryRunnerTestHelper.dayGran) + .build(); + + Iterable results = runQuery(query); + Assert.assertFalse(results.iterator().hasNext()); + } + + private Iterable runQuery(GroupByQuery query) + { + QueryToolChest toolChest = factory.getToolchest(); + Sequence queryResult = toolChest.mergeResults(toolChest.preMergeQueryDecoration(runner)).run(query); + return 
Sequences.toList(queryResult, Lists.newArrayList()); + } + + private Row createExpectedRow(final String timestamp, Object... vals) { return createExpectedRow(new DateTime(timestamp), vals); diff --git a/processing/src/test/java/io/druid/query/groupby/GroupByQueryTest.java b/processing/src/test/java/io/druid/query/groupby/GroupByQueryTest.java new file mode 100644 index 00000000000..8557ba04c24 --- /dev/null +++ b/processing/src/test/java/io/druid/query/groupby/GroupByQueryTest.java @@ -0,0 +1,68 @@ +/* + * Druid - a distributed column store. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * This file Copyright (C) 2014 N3TWORK, Inc. and contributed to the Druid project + * under the Druid Corporate Contributor License Agreement. + */ + +package io.druid.query.groupby; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; +import io.druid.jackson.DefaultObjectMapper; +import io.druid.query.Druids; +import io.druid.query.Query; +import io.druid.query.QueryRunnerTestHelper; +import io.druid.query.aggregation.AggregatorFactory; +import io.druid.query.aggregation.LongSumAggregatorFactory; +import io.druid.query.aggregation.PostAggregator; +import io.druid.query.dimension.DefaultDimensionSpec; +import io.druid.query.dimension.DimensionSpec; +import org.junit.Assert; +import org.junit.Test; + +import java.io.IOException; +import java.util.Arrays; + +public class GroupByQueryTest +{ + private static final ObjectMapper jsonMapper = new DefaultObjectMapper(); + + @Test + public void testQuerySerialization() throws IOException + { + Query query = GroupByQuery + .builder() + .setDataSource(QueryRunnerTestHelper.dataSource) + .setQuerySegmentSpec(QueryRunnerTestHelper.firstToThird) + .setDimensions(Lists.newArrayList(new DefaultDimensionSpec("quality", "alias"))) + .setAggregatorSpecs( + Arrays.asList( + QueryRunnerTestHelper.rowsCount, + new LongSumAggregatorFactory("idx", "index") + ) + ) + .setGranularity(QueryRunnerTestHelper.dayGran) + .build(); + + String json = jsonMapper.writeValueAsString(query); + Query serdeQuery = jsonMapper.readValue(json, Query.class); + + Assert.assertEquals(query, serdeQuery); + } + +} diff --git a/processing/src/test/java/io/druid/query/groupby/GroupByTimeseriesQueryRunnerTest.java b/processing/src/test/java/io/druid/query/groupby/GroupByTimeseriesQueryRunnerTest.java index 562527556a9..2538e91bc76 100644 --- a/processing/src/test/java/io/druid/query/groupby/GroupByTimeseriesQueryRunnerTest.java +++ b/processing/src/test/java/io/druid/query/groupby/GroupByTimeseriesQueryRunnerTest.java @@ -56,22 +56,24 @@ public class GroupByTimeseriesQueryRunnerTest extends TimeseriesQueryRunnerTest config.setMaxIntermediateRows(10000); final Supplier configSupplier = Suppliers.ofInstance(config); - final GroupByQueryRunnerFactory factory = new 
GroupByQueryRunnerFactory( - new GroupByQueryEngine( - configSupplier, - new StupidPool( - new Supplier() - { - @Override - public ByteBuffer get() - { - return ByteBuffer.allocate(1024 * 1024); - } - } - ) - ), + final GroupByQueryEngine engine = new GroupByQueryEngine( configSupplier, - new GroupByQueryQueryToolChest(configSupplier) + new StupidPool( + new Supplier() + { + @Override + public ByteBuffer get() + { + return ByteBuffer.allocate(1024 * 1024); + } + } + ) + ); + + final GroupByQueryRunnerFactory factory = new GroupByQueryRunnerFactory( + engine, + configSupplier, + new GroupByQueryQueryToolChest(configSupplier, engine) ); final Collection objects = QueryRunnerTestHelper.makeQueryRunners(factory); @@ -95,13 +97,13 @@ public class GroupByTimeseriesQueryRunnerTest extends TimeseriesQueryRunnerTest return Sequences.map( groupByRunner.run( GroupByQuery.builder() - .setDataSource(tsQuery.getDataSource()) - .setQuerySegmentSpec(tsQuery.getQuerySegmentSpec()) - .setGranularity(tsQuery.getGranularity()) - .setDimFilter(tsQuery.getDimensionsFilter()) - .setAggregatorSpecs(tsQuery.getAggregatorSpecs()) - .setPostAggregatorSpecs(tsQuery.getPostAggregatorSpecs()) - .build() + .setDataSource(tsQuery.getDataSource()) + .setQuerySegmentSpec(tsQuery.getQuerySegmentSpec()) + .setGranularity(tsQuery.getGranularity()) + .setDimFilter(tsQuery.getDimensionsFilter()) + .setAggregatorSpecs(tsQuery.getAggregatorSpecs()) + .setPostAggregatorSpecs(tsQuery.getPostAggregatorSpecs()) + .build() ), new Function>() { diff --git a/processing/src/test/java/io/druid/query/search/SearchQueryTest.java b/processing/src/test/java/io/druid/query/search/SearchQueryTest.java new file mode 100644 index 00000000000..87bae26f0fd --- /dev/null +++ b/processing/src/test/java/io/druid/query/search/SearchQueryTest.java @@ -0,0 +1,54 @@ +/* + * Druid - a distributed column store. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * This file Copyright (C) 2014 N3TWORK, Inc. and contributed to the Druid project + * under the Druid Corporate Contributor License Agreement. 
+ */ + +package io.druid.query.search; + +import com.fasterxml.jackson.databind.ObjectMapper; +import io.druid.jackson.DefaultObjectMapper; +import io.druid.query.Druids; +import io.druid.query.Query; +import io.druid.query.QueryRunnerTestHelper; +import org.junit.Assert; +import org.junit.Test; + +import java.io.IOException; + +public class SearchQueryTest +{ + private static final ObjectMapper jsonMapper = new DefaultObjectMapper(); + + @Test + public void testQuerySerialization() throws IOException + { + Query query = Druids.newSearchQueryBuilder() + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) + .intervals(QueryRunnerTestHelper.fullOnInterval) + .query("a") + .build(); + + String json = jsonMapper.writeValueAsString(query); + Query serdeQuery = jsonMapper.readValue(json, Query.class); + + Assert.assertEquals(query, serdeQuery); + } + +} diff --git a/processing/src/test/java/io/druid/query/select/SelectQueryRunnerTest.java b/processing/src/test/java/io/druid/query/select/SelectQueryRunnerTest.java index 6c7b26d6059..5015239870e 100644 --- a/processing/src/test/java/io/druid/query/select/SelectQueryRunnerTest.java +++ b/processing/src/test/java/io/druid/query/select/SelectQueryRunnerTest.java @@ -28,6 +28,7 @@ import io.druid.jackson.DefaultObjectMapper; import io.druid.query.QueryRunner; import io.druid.query.QueryRunnerTestHelper; import io.druid.query.Result; +import io.druid.query.TableDataSource; import io.druid.query.filter.SelectorDimFilter; import io.druid.query.spec.LegacySegmentSpec; import org.joda.time.DateTime; @@ -72,7 +73,7 @@ public class SelectQueryRunnerTest public void testFullOnSelect() { SelectQuery query = new SelectQuery( - QueryRunnerTestHelper.dataSource, + new TableDataSource(QueryRunnerTestHelper.dataSource), QueryRunnerTestHelper.fullOnInterval, null, QueryRunnerTestHelper.allGran, @@ -141,7 +142,7 @@ public class SelectQueryRunnerTest public void testSelectWithDimsAndMets() { SelectQuery query = new SelectQuery( - QueryRunnerTestHelper.dataSource, + new TableDataSource(QueryRunnerTestHelper.dataSource), QueryRunnerTestHelper.fullOnInterval, null, QueryRunnerTestHelper.allGran, @@ -201,7 +202,7 @@ public class SelectQueryRunnerTest public void testSelectPagination() { SelectQuery query = new SelectQuery( - QueryRunnerTestHelper.dataSource, + new TableDataSource(QueryRunnerTestHelper.dataSource), QueryRunnerTestHelper.fullOnInterval, null, QueryRunnerTestHelper.allGran, @@ -261,7 +262,7 @@ public class SelectQueryRunnerTest public void testFullOnSelectWithFilter() { SelectQuery query = new SelectQuery( - QueryRunnerTestHelper.dataSource, + new TableDataSource(QueryRunnerTestHelper.dataSource), new LegacySegmentSpec(new Interval("2011-01-12/2011-01-14")), new SelectorDimFilter(QueryRunnerTestHelper.providerDimension, "spot"), QueryRunnerTestHelper.dayGran, diff --git a/processing/src/test/java/io/druid/query/timeboundary/TimeBoundaryQueryTest.java b/processing/src/test/java/io/druid/query/timeboundary/TimeBoundaryQueryTest.java new file mode 100644 index 00000000000..1dd50e9493d --- /dev/null +++ b/processing/src/test/java/io/druid/query/timeboundary/TimeBoundaryQueryTest.java @@ -0,0 +1,50 @@ +/* + * Druid - a distributed column store. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * This file Copyright (C) 2014 N3TWORK, Inc. and contributed to the Druid project + * under the Druid Corporate Contributor License Agreement. + */ + +package io.druid.query.timeboundary; + +import com.fasterxml.jackson.databind.ObjectMapper; +import io.druid.jackson.DefaultObjectMapper; +import io.druid.query.Druids; +import io.druid.query.Query; +import org.junit.Assert; +import org.junit.Test; + +import java.io.IOException; + +public class TimeBoundaryQueryTest +{ + private static final ObjectMapper jsonMapper = new DefaultObjectMapper(); + + @Test + public void testQuerySerialization() throws IOException + { + Query query = Druids.newTimeBoundaryQueryBuilder() + .dataSource("testing") + .build(); + + String json = jsonMapper.writeValueAsString(query); + Query serdeQuery = jsonMapper.readValue(json, Query.class); + + Assert.assertEquals(query, serdeQuery); + } + +} diff --git a/processing/src/test/java/io/druid/query/timeseries/TimeseriesQueryRunnerTest.java b/processing/src/test/java/io/druid/query/timeseries/TimeseriesQueryRunnerTest.java index 00aaca6f9e5..a8d626ae044 100644 --- a/processing/src/test/java/io/druid/query/timeseries/TimeseriesQueryRunnerTest.java +++ b/processing/src/test/java/io/druid/query/timeseries/TimeseriesQueryRunnerTest.java @@ -85,7 +85,8 @@ public class TimeseriesQueryRunnerTest .aggregators( Arrays.asList( QueryRunnerTestHelper.rowsCount, - QueryRunnerTestHelper.indexDoubleSum + QueryRunnerTestHelper.indexDoubleSum, + QueryRunnerTestHelper.qualityUniques ) ) .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) @@ -128,6 +129,11 @@ public class TimeseriesQueryRunnerTest value.getDoubleMetric("addRowsIndexConstant"), 0.0 ); + Assert.assertEquals( + value.getDoubleMetric("uniques"), + QueryRunnerTestHelper.skippedDay.equals(result.getTimestamp()) ? 0.0d : 9.0d, + 0.02 + ); expectedEarliest = gran.toDateTime(gran.next(expectedEarliest.getMillis())); ++count; @@ -182,7 +188,12 @@ public class TimeseriesQueryRunnerTest .granularity(QueryRunnerTestHelper.dayGran) .filters(QueryRunnerTestHelper.providerDimension, "upfront") .intervals(QueryRunnerTestHelper.fullOnInterval) - .aggregators(Arrays.asList(QueryRunnerTestHelper.rowsCount)) + .aggregators( + Arrays.asList( + QueryRunnerTestHelper.rowsCount, + QueryRunnerTestHelper.qualityUniques + ) + ) .build(); Assert.assertEquals( @@ -215,6 +226,14 @@ public class TimeseriesQueryRunnerTest QueryRunnerTestHelper.skippedDay.equals(result.getTimestamp()) ? 0L : 2L, value.getLongMetric("rows").longValue() ); + Assert.assertEquals( + result.toString(), + QueryRunnerTestHelper.skippedDay.equals(result.getTimestamp()) ? 
0.0d : 2.0d, + value.getDoubleMetric( + "uniques" + ), + 0.01 + ); expectedEarliest = gran.toDateTime(gran.next(expectedEarliest.getMillis())); } @@ -233,7 +252,8 @@ public class TimeseriesQueryRunnerTest new LongSumAggregatorFactory( "idx", "index" - ) + ), + QueryRunnerTestHelper.qualityUniques ) ) .build(); @@ -242,13 +262,13 @@ public class TimeseriesQueryRunnerTest new Result( new DateTime("2011-04-01"), new TimeseriesResultValue( - ImmutableMap.of("rows", 13L, "idx", 6619L) + ImmutableMap.of("rows", 13L, "idx", 6619L, "uniques", QueryRunnerTestHelper.UNIQUES_9) ) ), new Result( new DateTime("2011-04-02"), new TimeseriesResultValue( - ImmutableMap.of("rows", 13L, "idx", 5827L) + ImmutableMap.of("rows", 13L, "idx", 5827L, "uniques", QueryRunnerTestHelper.UNIQUES_9) ) ) ); @@ -327,7 +347,8 @@ public class TimeseriesQueryRunnerTest new LongSumAggregatorFactory( "idx", "index" - ) + ), + QueryRunnerTestHelper.qualityUniques ) ) .build(); @@ -336,7 +357,7 @@ public class TimeseriesQueryRunnerTest new Result( new DateTime("2011-04-01"), new TimeseriesResultValue( - ImmutableMap.of("rows", 13L, "idx", 5827L) + ImmutableMap.of("rows", 13L, "idx", 5827L, "uniques", QueryRunnerTestHelper.UNIQUES_9) ) ) ); @@ -363,7 +384,8 @@ public class TimeseriesQueryRunnerTest new LongSumAggregatorFactory( "idx", "index" - ) + ), + QueryRunnerTestHelper.qualityUniques ) ) .build(); @@ -372,7 +394,7 @@ public class TimeseriesQueryRunnerTest new Result( new DateTime("2011-04-02"), new TimeseriesResultValue( - ImmutableMap.of("rows", 13L, "idx", 5827L) + ImmutableMap.of("rows", 13L, "idx", 5827L, "uniques", QueryRunnerTestHelper.UNIQUES_9) ) ) ); @@ -457,7 +479,8 @@ public class TimeseriesQueryRunnerTest new LongSumAggregatorFactory( "idx", "index" - ) + ), + QueryRunnerTestHelper.qualityUniques ) ) .build(); @@ -466,7 +489,7 @@ public class TimeseriesQueryRunnerTest new Result( new DateTime("2011-04-01"), new TimeseriesResultValue( - ImmutableMap.of("rows", 13L, "idx", 5827L) + ImmutableMap.of("rows", 13L, "idx", 5827L, "uniques", QueryRunnerTestHelper.UNIQUES_9) ) ) ); @@ -494,7 +517,8 @@ public class TimeseriesQueryRunnerTest new LongSumAggregatorFactory( "idx", "index" - ) + ), + QueryRunnerTestHelper.qualityUniques ) ) .build(); @@ -503,7 +527,7 @@ public class TimeseriesQueryRunnerTest new Result( new DateTime("2011-04-02"), new TimeseriesResultValue( - ImmutableMap.of("rows", 13L, "idx", 5827L) + ImmutableMap.of("rows", 13L, "idx", 5827L, "uniques", QueryRunnerTestHelper.UNIQUES_9) ) ) ); @@ -561,7 +585,8 @@ public class TimeseriesQueryRunnerTest .aggregators( Arrays.asList( QueryRunnerTestHelper.rowsCount, - QueryRunnerTestHelper.indexLongSum + QueryRunnerTestHelper.indexLongSum, + QueryRunnerTestHelper.qualityUniques ) ) .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) @@ -574,7 +599,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 13L, "index", 6619L, - "addRowsIndexConstant", 6633.0 + "addRowsIndexConstant", 6633.0, + "uniques", QueryRunnerTestHelper.UNIQUES_9 ) ) ), @@ -584,7 +610,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 13L, "index", 5827L, - "addRowsIndexConstant", 5841.0 + "addRowsIndexConstant", 5841.0, + "uniques", QueryRunnerTestHelper.UNIQUES_9 ) ) ) @@ -608,7 +635,8 @@ public class TimeseriesQueryRunnerTest .aggregators( Arrays.asList( QueryRunnerTestHelper.rowsCount, - QueryRunnerTestHelper.indexLongSum + QueryRunnerTestHelper.indexLongSum, + QueryRunnerTestHelper.qualityUniques ) ) 
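// QueryRunnerTestHelper.qualityUniques is the new aggregator threaded through these
// timeseries tests; presumably a hyperUnique count over the quality dimension, so full days
// report roughly nine distinct values (UNIQUES_9) while provider-filtered runs report
// UNIQUES_2 or UNIQUES_1, and empty intervals report 0.0.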
.postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) @@ -621,7 +649,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 11L, "index", 3783L, - "addRowsIndexConstant", 3795.0 + "addRowsIndexConstant", 3795.0, + "uniques", QueryRunnerTestHelper.UNIQUES_9 ) ) ), @@ -631,7 +660,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 11L, "index", 3313L, - "addRowsIndexConstant", 3325.0 + "addRowsIndexConstant", 3325.0, + "uniques", QueryRunnerTestHelper.UNIQUES_9 ) ) ) @@ -655,7 +685,8 @@ public class TimeseriesQueryRunnerTest .aggregators( Arrays.asList( QueryRunnerTestHelper.rowsCount, - QueryRunnerTestHelper.indexLongSum + QueryRunnerTestHelper.indexLongSum, + QueryRunnerTestHelper.qualityUniques ) ) .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) @@ -668,7 +699,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 9L, "index", 1102L, - "addRowsIndexConstant", 1112.0 + "addRowsIndexConstant", 1112.0, + "uniques", QueryRunnerTestHelper.UNIQUES_9 ) ) ), @@ -678,7 +710,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 9L, "index", 1120L, - "addRowsIndexConstant", 1130.0 + "addRowsIndexConstant", 1130.0, + "uniques", QueryRunnerTestHelper.UNIQUES_9 ) ) ) @@ -702,7 +735,8 @@ public class TimeseriesQueryRunnerTest .aggregators( Arrays.asList( QueryRunnerTestHelper.rowsCount, - QueryRunnerTestHelper.indexLongSum + QueryRunnerTestHelper.indexLongSum, + QueryRunnerTestHelper.qualityUniques ) ) .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) @@ -715,7 +749,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 2L, "index", 2681L, - "addRowsIndexConstant", 2684.0 + "addRowsIndexConstant", 2684.0, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ) ) ), @@ -725,7 +760,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 2L, "index", 2193L, - "addRowsIndexConstant", 2196.0 + "addRowsIndexConstant", 2196.0, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ) ) ) @@ -749,7 +785,8 @@ public class TimeseriesQueryRunnerTest .aggregators( Arrays.asList( QueryRunnerTestHelper.rowsCount, - QueryRunnerTestHelper.indexLongSum + QueryRunnerTestHelper.indexLongSum, + QueryRunnerTestHelper.qualityUniques ) ) .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) @@ -762,7 +799,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 2L, "index", 2836L, - "addRowsIndexConstant", 2839.0 + "addRowsIndexConstant", 2839.0, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ) ) ), @@ -772,7 +810,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 2L, "index", 2514L, - "addRowsIndexConstant", 2517.0 + "addRowsIndexConstant", 2517.0, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ) ) ) @@ -818,7 +857,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 2L, "index", 254.4554443359375D, - "addRowsIndexConstant", 257.4554443359375D + "addRowsIndexConstant", 257.4554443359375D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ) ) ), @@ -828,7 +868,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 2L, "index", 260.4129638671875D, - "addRowsIndexConstant", 263.4129638671875D + "addRowsIndexConstant", 263.4129638671875D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ) ) ) @@ -874,7 +915,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 1L, "index", new Float(135.885094).doubleValue(), - "addRowsIndexConstant", new Float(137.885094).doubleValue() + "addRowsIndexConstant", new 
Float(137.885094).doubleValue(), + "uniques", QueryRunnerTestHelper.UNIQUES_1 ) ) ), @@ -884,7 +926,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 1L, "index", new Float(147.425935).doubleValue(), - "addRowsIndexConstant", new Float(149.425935).doubleValue() + "addRowsIndexConstant", new Float(149.425935).doubleValue(), + "uniques", QueryRunnerTestHelper.UNIQUES_1 ) ) ) @@ -930,7 +973,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 1L, "index", new Float(118.570340).doubleValue(), - "addRowsIndexConstant", new Float(120.570340).doubleValue() + "addRowsIndexConstant", new Float(120.570340).doubleValue(), + "uniques", QueryRunnerTestHelper.UNIQUES_1 ) ) ), @@ -940,7 +984,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 1L, "index", new Float(112.987027).doubleValue(), - "addRowsIndexConstant", new Float(114.987027).doubleValue() + "addRowsIndexConstant", new Float(114.987027).doubleValue(), + "uniques", QueryRunnerTestHelper.UNIQUES_1 ) ) ) @@ -970,7 +1015,8 @@ public class TimeseriesQueryRunnerTest .aggregators( Arrays.asList( QueryRunnerTestHelper.rowsCount, - QueryRunnerTestHelper.indexLongSum + QueryRunnerTestHelper.indexLongSum, + QueryRunnerTestHelper.qualityUniques ) ) .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) @@ -983,7 +1029,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 13L, "index", 6619L, - "addRowsIndexConstant", 6633.0 + "addRowsIndexConstant", 6633.0, + "uniques", QueryRunnerTestHelper.UNIQUES_9 ) ) ), @@ -993,7 +1040,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 13L, "index", 5827L, - "addRowsIndexConstant", 5841.0 + "addRowsIndexConstant", 5841.0, + "uniques", QueryRunnerTestHelper.UNIQUES_9 ) ) ) @@ -1043,7 +1091,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 2L, "index", 254.4554443359375D, - "addRowsIndexConstant", 257.4554443359375D + "addRowsIndexConstant", 257.4554443359375D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ) ) ), @@ -1053,7 +1102,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 2L, "index", 260.4129638671875D, - "addRowsIndexConstant", 263.4129638671875D + "addRowsIndexConstant", 263.4129638671875D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ) ) ) @@ -1085,7 +1135,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 0L, "index", 0.0, - "addRowsIndexConstant", 1.0 + "addRowsIndexConstant", 1.0, + "uniques", 0.0 ) ) ), @@ -1095,7 +1146,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 0L, "index", 0.0, - "addRowsIndexConstant", 1.0 + "addRowsIndexConstant", 1.0, + "uniques", 0.0 ) ) ) @@ -1127,7 +1179,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 0L, "index", 0.0, - "addRowsIndexConstant", 1.0 + "addRowsIndexConstant", 1.0, + "uniques", 0.0 ) ) ), @@ -1137,7 +1190,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 0L, "index", 0.0, - "addRowsIndexConstant", 1.0 + "addRowsIndexConstant", 1.0, + "uniques", 0.0 ) ) ) @@ -1183,7 +1237,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 0L, "index", 0.0, - "addRowsIndexConstant", 1.0 + "addRowsIndexConstant", 1.0, + "uniques", 0.0 ) ) ), @@ -1193,7 +1248,8 @@ public class TimeseriesQueryRunnerTest ImmutableMap.of( "rows", 0L, "index", 0.0, - "addRowsIndexConstant", 1.0 + "addRowsIndexConstant", 1.0, + "uniques", 0.0 ) ) ) diff --git a/processing/src/test/java/io/druid/query/timeseries/TimeseriesQueryTest.java 
b/processing/src/test/java/io/druid/query/timeseries/TimeseriesQueryTest.java new file mode 100644 index 00000000000..43e22ddfb94 --- /dev/null +++ b/processing/src/test/java/io/druid/query/timeseries/TimeseriesQueryTest.java @@ -0,0 +1,62 @@ +/* + * Druid - a distributed column store. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * This file Copyright (C) 2014 N3TWORK, Inc. and contributed to the Druid project + * under the Druid Corporate Contributor License Agreement. + */ + +package io.druid.query.timeseries; + +import com.fasterxml.jackson.databind.ObjectMapper; +import io.druid.jackson.DefaultObjectMapper; +import io.druid.query.Druids; +import io.druid.query.Query; +import io.druid.query.QueryRunnerTestHelper; +import io.druid.query.aggregation.PostAggregator; +import org.junit.Assert; +import org.junit.Test; + +import java.io.IOException; +import java.util.Arrays; + +public class TimeseriesQueryTest +{ + private static final ObjectMapper jsonMapper = new DefaultObjectMapper(); + + @Test + public void testQuerySerialization() throws IOException + { + Query query = Druids.newTimeseriesQueryBuilder() + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.dayGran) + .intervals(QueryRunnerTestHelper.fullOnInterval) + .aggregators( + Arrays.asList( + QueryRunnerTestHelper.rowsCount, + QueryRunnerTestHelper.indexDoubleSum + ) + ) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) + .build(); + + String json = jsonMapper.writeValueAsString(query); + Query serdeQuery = jsonMapper.readValue(json, Query.class); + + Assert.assertEquals(query, serdeQuery); + } + +} diff --git a/processing/src/test/java/io/druid/query/topn/TopNQueryRunnerTest.java b/processing/src/test/java/io/druid/query/topn/TopNQueryRunnerTest.java index 839f82c3cbc..291eab8171a 100644 --- a/processing/src/test/java/io/druid/query/topn/TopNQueryRunnerTest.java +++ b/processing/src/test/java/io/druid/query/topn/TopNQueryRunnerTest.java @@ -25,27 +25,20 @@ import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.metamx.common.guava.Sequences; import io.druid.collections.StupidPool; -import io.druid.granularity.QueryGranularity; import io.druid.query.Druids; import io.druid.query.QueryRunner; +import io.druid.query.QueryRunnerTestHelper; import io.druid.query.Result; import io.druid.query.TestQueryRunners; import io.druid.query.aggregation.AggregatorFactory; -import io.druid.query.aggregation.CountAggregatorFactory; -import io.druid.query.aggregation.DoubleSumAggregatorFactory; -import io.druid.query.aggregation.LongSumAggregatorFactory; import io.druid.query.aggregation.MaxAggregatorFactory; import io.druid.query.aggregation.MinAggregatorFactory; import io.druid.query.aggregation.PostAggregator; -import 
io.druid.query.aggregation.post.ArithmeticPostAggregator; -import io.druid.query.aggregation.post.ConstantPostAggregator; -import io.druid.query.aggregation.post.FieldAccessPostAggregator; import io.druid.query.dimension.ExtractionDimensionSpec; import io.druid.query.extraction.RegexDimExtractionFn; import io.druid.query.filter.AndDimFilter; import io.druid.query.filter.DimFilter; import io.druid.query.spec.MultipleIntervalSegmentSpec; -import io.druid.query.spec.QuerySegmentSpec; import io.druid.segment.TestHelper; import org.joda.time.DateTime; import org.joda.time.Interval; @@ -108,73 +101,22 @@ public class TopNQueryRunnerTest this.runner = runner; } - final String dataSource = "testing"; - final QueryGranularity gran = QueryGranularity.DAY; - final QueryGranularity allGran = QueryGranularity.ALL; - final String providerDimension = "provider"; - final String qualityDimension = "quality"; - final String placementishDimension = "placementish"; - final String indexMetric = "index"; - final String addRowsIndexConstantMetric = "addRowsIndexConstant"; - final CountAggregatorFactory rowsCount = new CountAggregatorFactory("rows"); - final LongSumAggregatorFactory indexLongSum = new LongSumAggregatorFactory("index", "index"); - final DoubleSumAggregatorFactory indexDoubleSum = new DoubleSumAggregatorFactory("index", "index"); - final ConstantPostAggregator constant = new ConstantPostAggregator("const", 1L); - final FieldAccessPostAggregator rowsPostAgg = new FieldAccessPostAggregator("rows", "rows"); - final FieldAccessPostAggregator indexPostAgg = new FieldAccessPostAggregator("index", "index"); - final ArithmeticPostAggregator addRowsIndexConstant = - new ArithmeticPostAggregator( - "addRowsIndexConstant", "+", Lists.newArrayList(constant, rowsPostAgg, indexPostAgg) - ); - final List commonAggregators = Arrays.asList(rowsCount, indexDoubleSum); - - - final String[] expectedFullOnIndexValues = new String[]{ - "4500.0", "6077.949111938477", "4922.488838195801", "5726.140853881836", "4698.468170166016", - "4651.030891418457", "4398.145851135254", "4596.068244934082", "4434.630561828613", "0.0", - "6162.801361083984", "5590.292701721191", "4994.298484802246", "5179.679672241211", "6288.556800842285", - "6025.663551330566", "5772.855537414551", "5346.517524719238", "5497.331253051758", "5909.684387207031", - "5862.711364746094", "5958.373008728027", "5224.882194519043", "5456.789611816406", "5456.095397949219", - "4642.481948852539", "5023.572692871094", "5155.821723937988", "5350.3723220825195", "5236.997489929199", - "4910.097717285156", "4507.608840942383", "4659.80500793457", "5354.878845214844", "4945.796455383301", - "6459.080368041992", "4390.493583679199", "6545.758262634277", "6922.801231384277", "6023.452911376953", - "6812.107475280762", "6368.713348388672", "6381.748748779297", "5631.245086669922", "4976.192253112793", - "6541.463027954102", "5983.8513107299805", "5967.189498901367", "5567.139289855957", "4863.5944747924805", - "4681.164360046387", "6122.321441650391", "5410.308860778809", "4846.676376342773", "5333.872688293457", - "5013.053741455078", "4836.85563659668", "5264.486434936523", "4581.821243286133", "4680.233596801758", - "4771.363662719727", "5038.354717254639", "4816.808464050293", "4684.095504760742", "5023.663467407227", - "5889.72257232666", "4984.973915100098", "5664.220512390137", "5572.653915405273", "5537.123138427734", - "5980.422874450684", "6243.834693908691", "5372.147285461426", "5690.728981018066", "5827.796455383301", - "6141.0769119262695", 
"6082.3237228393555", "5678.771339416504", "6814.467971801758", "6626.151596069336", - "5833.2095947265625", "4679.222328186035", "5367.9403076171875", "5410.445640563965", "5689.197135925293", - "5240.5018310546875", "4790.912239074707", "4992.670921325684", "4796.888023376465", "5479.439590454102", - "5506.567192077637", "4743.144546508789", "4913.282669067383", "4723.869743347168" - }; - - final DateTime skippedDay = new DateTime("2011-01-21T00:00:00.000Z"); - - final QuerySegmentSpec firstToThird = new MultipleIntervalSegmentSpec( - Arrays.asList(new Interval("2011-04-01T00:00:00.000Z/2011-04-03T00:00:00.000Z")) - ); - final QuerySegmentSpec fullOnInterval = new MultipleIntervalSegmentSpec( - Arrays.asList(new Interval("1970-01-01T00:00:00.000Z/2020-01-01T00:00:00.000Z")) - ); - + private static final String providerDimension = "provider"; @Test public void testFullOnTopN() { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) .dimension(providerDimension) - .metric(indexMetric) + .metric(QueryRunnerTestHelper.indexMetric) .threshold(4) - .intervals(fullOnInterval) + .intervals(QueryRunnerTestHelper.fullOnInterval) .aggregators( Lists.newArrayList( Iterables.concat( - commonAggregators, + QueryRunnerTestHelper.commonAggregators, Lists.newArrayList( new MaxAggregatorFactory("maxIndex", "index"), new MinAggregatorFactory("minIndex", "index") @@ -182,7 +124,7 @@ public class TopNQueryRunnerTest ) ) ) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); List> expectedResults = Arrays.asList( @@ -191,29 +133,32 @@ public class TopNQueryRunnerTest new TopNResultValue( Arrays.>asList( ImmutableMap.builder() - .put("provider", "total_market") - .put("rows", 186L) - .put("index", 215679.82879638672D) - .put("addRowsIndexConstant", 215866.82879638672D) - .put("maxIndex", 1743.9217529296875D) - .put("minIndex", 792.3260498046875D) - .build(), + .put(providerDimension, "total_market") + .put("rows", 186L) + .put("index", 215679.82879638672D) + .put("addRowsIndexConstant", 215866.82879638672D) + .put("uniques", QueryRunnerTestHelper.UNIQUES_2) + .put("maxIndex", 1743.9217529296875D) + .put("minIndex", 792.3260498046875D) + .build(), ImmutableMap.builder() - .put("provider", "upfront") - .put("rows", 186L) - .put("index", 192046.1060180664D) - .put("addRowsIndexConstant", 192233.1060180664D) - .put("maxIndex", 1870.06103515625D) - .put("minIndex", 545.9906005859375D) - .build(), + .put(providerDimension, "upfront") + .put("rows", 186L) + .put("index", 192046.1060180664D) + .put("addRowsIndexConstant", 192233.1060180664D) + .put("uniques", QueryRunnerTestHelper.UNIQUES_2) + .put("maxIndex", 1870.06103515625D) + .put("minIndex", 545.9906005859375D) + .build(), ImmutableMap.builder() - .put("provider", "spot") - .put("rows", 837L) - .put("index", 95606.57232284546D) - .put("addRowsIndexConstant", 96444.57232284546D) - .put("maxIndex", 277.2735290527344D) - .put("minIndex", 59.02102279663086D) - .build() + .put(providerDimension, "spot") + .put("rows", 837L) + .put("index", 95606.57232284546D) + .put("addRowsIndexConstant", 96444.57232284546D) + .put("uniques", QueryRunnerTestHelper.UNIQUES_9) + .put("maxIndex", 277.2735290527344D) + .put("minIndex", 59.02102279663086D) + .build() ) ) ) @@ -226,16 +171,16 @@ public class TopNQueryRunnerTest public void testFullOnTopNOverPostAggs() { 
TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) .dimension(providerDimension) - .metric(addRowsIndexConstantMetric) + .metric(QueryRunnerTestHelper.addRowsIndexConstantMetric) .threshold(4) - .intervals(fullOnInterval) + .intervals(QueryRunnerTestHelper.fullOnInterval) .aggregators( Lists.newArrayList( Iterables.concat( - commonAggregators, + QueryRunnerTestHelper.commonAggregators, Lists.newArrayList( new MaxAggregatorFactory("maxIndex", "index"), new MinAggregatorFactory("minIndex", "index") @@ -243,7 +188,7 @@ public class TopNQueryRunnerTest ) ) ) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); List> expectedResults = Arrays.asList( @@ -252,29 +197,32 @@ public class TopNQueryRunnerTest new TopNResultValue( Arrays.>asList( ImmutableMap.builder() - .put("provider", "total_market") - .put("rows", 186L) - .put("index", 215679.82879638672D) - .put("addRowsIndexConstant", 215866.82879638672D) - .put("maxIndex", 1743.9217529296875D) - .put("minIndex", 792.3260498046875D) - .build(), + .put(providerDimension, "total_market") + .put("rows", 186L) + .put("index", 215679.82879638672D) + .put("addRowsIndexConstant", 215866.82879638672D) + .put("uniques", QueryRunnerTestHelper.UNIQUES_2) + .put("maxIndex", 1743.9217529296875D) + .put("minIndex", 792.3260498046875D) + .build(), ImmutableMap.builder() - .put("provider", "upfront") - .put("rows", 186L) - .put("index", 192046.1060180664D) - .put("addRowsIndexConstant", 192233.1060180664D) - .put("maxIndex", 1870.06103515625D) - .put("minIndex", 545.9906005859375D) - .build(), + .put(providerDimension, "upfront") + .put("rows", 186L) + .put("index", 192046.1060180664D) + .put("addRowsIndexConstant", 192233.1060180664D) + .put("uniques", QueryRunnerTestHelper.UNIQUES_2) + .put("maxIndex", 1870.06103515625D) + .put("minIndex", 545.9906005859375D) + .build(), ImmutableMap.builder() - .put("provider", "spot") - .put("rows", 837L) - .put("index", 95606.57232284546D) - .put("addRowsIndexConstant", 96444.57232284546D) - .put("maxIndex", 277.2735290527344D) - .put("minIndex", 59.02102279663086D) - .build() + .put(providerDimension, "spot") + .put("rows", 837L) + .put("index", 95606.57232284546D) + .put("addRowsIndexConstant", 96444.57232284546D) + .put("uniques", QueryRunnerTestHelper.UNIQUES_9) + .put("maxIndex", 277.2735290527344D) + .put("minIndex", 59.02102279663086D) + .build() ) ) ) @@ -283,18 +231,84 @@ public class TopNQueryRunnerTest TestHelper.assertExpectedResults(expectedResults, runner.run(query)); } + + @Test + public void testFullOnTopNOverUniques() + { + TopNQuery query = new TopNQueryBuilder() + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) + .dimension(providerDimension) + .metric(QueryRunnerTestHelper.uniqueMetric) + .threshold(3) + .intervals(QueryRunnerTestHelper.fullOnInterval) + .aggregators( + Lists.newArrayList( + Iterables.concat( + QueryRunnerTestHelper.commonAggregators, + Lists.newArrayList( + new MaxAggregatorFactory("maxIndex", "index"), + new MinAggregatorFactory("minIndex", "index") + ) + ) + ) + ) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) + .build(); + + List> expectedResults = Arrays.asList( + new Result( + new DateTime("2011-01-12T00:00:00.000Z"), + new TopNResultValue( + Arrays.>asList( + ImmutableMap.builder() + 
.put("provider", "spot") + .put("rows", 837L) + .put("index", 95606.57232284546D) + .put("addRowsIndexConstant", 96444.57232284546D) + .put("uniques", QueryRunnerTestHelper.UNIQUES_9) + .put("maxIndex", 277.2735290527344D) + .put("minIndex", 59.02102279663086D) + .build(), + ImmutableMap.builder() + .put("provider", "total_market") + .put("rows", 186L) + .put("index", 215679.82879638672D) + .put("addRowsIndexConstant", 215866.82879638672D) + .put("uniques", QueryRunnerTestHelper.UNIQUES_2) + .put("maxIndex", 1743.9217529296875D) + .put("minIndex", 792.3260498046875D) + .build(), + ImmutableMap.builder() + .put("provider", "upfront") + .put("rows", 186L) + .put("index", 192046.1060180664D) + .put("addRowsIndexConstant", 192233.1060180664D) + .put("uniques", QueryRunnerTestHelper.UNIQUES_2) + .put("maxIndex", 1870.06103515625D) + .put("minIndex", 545.9906005859375D) + .build() + ) + ) + ) + ); + + TestHelper.assertExpectedResults(expectedResults, runner.run(query)); + } + + @Test public void testTopN() { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) .dimension(providerDimension) - .metric(indexMetric) + .metric(QueryRunnerTestHelper.indexMetric) .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); @@ -303,23 +317,75 @@ public class TopNQueryRunnerTest new DateTime("2011-04-01T00:00:00.000Z"), new TopNResultValue( Arrays.>asList( + ImmutableMap.of( + providerDimension, "total_market", + "rows", 4L, + "index", 5351.814697265625D, + "addRowsIndexConstant", 5356.814697265625D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 + ), + ImmutableMap.of( + providerDimension, "upfront", + "rows", 4L, + "index", 4875.669677734375D, + "addRowsIndexConstant", 4880.669677734375D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 + ), + ImmutableMap.of( + providerDimension, "spot", + "rows", 18L, + "index", 2231.8768157958984D, + "addRowsIndexConstant", 2250.8768157958984D, + "uniques", QueryRunnerTestHelper.UNIQUES_9 + ) + ) + ) + ) + ); + + TestHelper.assertExpectedResults(expectedResults, runner.run(query)); + } + + @Test + public void testTopNByUniques() + { + TopNQuery query = new TopNQueryBuilder() + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) + .dimension(providerDimension) + .metric(new NumericTopNMetricSpec("uniques")) + .threshold(4) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) + .build(); + + + List> expectedResults = Arrays.asList( + new Result( + new DateTime("2011-04-01T00:00:00.000Z"), + new TopNResultValue( + Arrays.>asList( + ImmutableMap.of( + "provider", "spot", + "rows", 18L, + "index", 2231.8768157958984D, + "addRowsIndexConstant", 2250.8768157958984D, + "uniques", QueryRunnerTestHelper.UNIQUES_9 + ), ImmutableMap.of( "provider", "total_market", "rows", 4L, "index", 5351.814697265625D, - "addRowsIndexConstant", 5356.814697265625D + "addRowsIndexConstant", 5356.814697265625D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ), ImmutableMap.of( "provider", "upfront", "rows", 4L, "index", 4875.669677734375D, - 
"addRowsIndexConstant", 4880.669677734375D - ), - ImmutableMap.of( - "provider", "spot", - "rows", 18L, - "index", 2231.8768157958984D, - "addRowsIndexConstant", 2250.8768157958984D + "addRowsIndexConstant", 4880.669677734375D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ) ) ) @@ -333,15 +399,15 @@ public class TopNQueryRunnerTest public void testTopNWithOrFilter1() { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) .filters(providerDimension, "total_market", "upfront", "spot") .dimension(providerDimension) - .metric(indexMetric) + .metric(QueryRunnerTestHelper.indexMetric) .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); List> expectedResults = Arrays.asList( @@ -350,22 +416,25 @@ public class TopNQueryRunnerTest new TopNResultValue( Arrays.>asList( ImmutableMap.of( - "provider", "total_market", + providerDimension, "total_market", "rows", 4L, "index", 5351.814697265625D, - "addRowsIndexConstant", 5356.814697265625D + "addRowsIndexConstant", 5356.814697265625D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ), ImmutableMap.of( - "provider", "upfront", + providerDimension, "upfront", "rows", 4L, "index", 4875.669677734375D, - "addRowsIndexConstant", 4880.669677734375D + "addRowsIndexConstant", 4880.669677734375D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ), ImmutableMap.of( - "provider", "spot", + providerDimension, "spot", "rows", 18L, "index", 2231.8768157958984D, - "addRowsIndexConstant", 2250.8768157958984D + "addRowsIndexConstant", 2250.8768157958984D, + "uniques", QueryRunnerTestHelper.UNIQUES_9 ) ) ) @@ -379,15 +448,15 @@ public class TopNQueryRunnerTest public void testTopNWithOrFilter2() { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) .filters(providerDimension, "total_market", "upfront") .dimension(providerDimension) - .metric(indexMetric) + .metric(QueryRunnerTestHelper.indexMetric) .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); List> expectedResults = Arrays.asList( @@ -396,16 +465,18 @@ public class TopNQueryRunnerTest new TopNResultValue( Arrays.>asList( ImmutableMap.of( - "provider", "total_market", + providerDimension, "total_market", "rows", 4L, "index", 5351.814697265625D, - "addRowsIndexConstant", 5356.814697265625D + "addRowsIndexConstant", 5356.814697265625D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ), ImmutableMap.of( - "provider", "upfront", + providerDimension, "upfront", "rows", 4L, "index", 4875.669677734375D, - "addRowsIndexConstant", 4880.669677734375D + "addRowsIndexConstant", 4880.669677734375D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ) ) ) @@ -419,15 +490,15 @@ public class TopNQueryRunnerTest public void testTopNWithFilter1() { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) + 
.dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) .filters(providerDimension, "upfront") .dimension(providerDimension) - .metric(indexMetric) + .metric(QueryRunnerTestHelper.indexMetric) .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); List> expectedResults = Arrays.asList( @@ -436,10 +507,11 @@ public class TopNQueryRunnerTest new TopNResultValue( Arrays.>asList( ImmutableMap.of( - "provider", "upfront", + providerDimension, "upfront", "rows", 4L, "index", 4875.669677734375D, - "addRowsIndexConstant", 4880.669677734375D + "addRowsIndexConstant", 4880.669677734375D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ) ) ) @@ -453,15 +525,15 @@ public class TopNQueryRunnerTest public void testTopNWithFilter2() { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) - .filters(qualityDimension, "mezzanine") + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) + .filters(QueryRunnerTestHelper.qualityDimension, "mezzanine") .dimension(providerDimension) - .metric(indexMetric) + .metric(QueryRunnerTestHelper.indexMetric) .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); List> expectedResults = Arrays.asList( @@ -470,22 +542,25 @@ public class TopNQueryRunnerTest new TopNResultValue( Arrays.>asList( ImmutableMap.of( - "provider", "upfront", + providerDimension, "upfront", "rows", 2L, "index", 2591.68359375D, - "addRowsIndexConstant", 2594.68359375D + "addRowsIndexConstant", 2594.68359375D, + "uniques", QueryRunnerTestHelper.UNIQUES_1 ), ImmutableMap.of( - "provider", "total_market", + providerDimension, "total_market", "rows", 2L, "index", 2508.39599609375D, - "addRowsIndexConstant", 2511.39599609375D + "addRowsIndexConstant", 2511.39599609375D, + "uniques", QueryRunnerTestHelper.UNIQUES_1 ), ImmutableMap.of( - "provider", "spot", + providerDimension, "spot", "rows", 2L, "index", 220.63774871826172D, - "addRowsIndexConstant", 223.63774871826172D + "addRowsIndexConstant", 223.63774871826172D, + "uniques", QueryRunnerTestHelper.UNIQUES_1 ) ) ) @@ -499,19 +574,19 @@ public class TopNQueryRunnerTest public void testTopNWithFilter2OneDay() { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) - .filters(qualityDimension, "mezzanine") + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) + .filters(QueryRunnerTestHelper.qualityDimension, "mezzanine") .dimension(providerDimension) - .metric(indexMetric) + .metric(QueryRunnerTestHelper.indexMetric) .threshold(4) .intervals( new MultipleIntervalSegmentSpec( Arrays.asList(new Interval("2011-04-01T00:00:00.000Z/2011-04-02T00:00:00.000Z")) ) ) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); List> expectedResults = Arrays.asList( @@ -520,22 +595,25 
@@ public class TopNQueryRunnerTest new TopNResultValue( Arrays.>asList( ImmutableMap.of( - "provider", "upfront", + providerDimension, "upfront", "rows", 1L, "index", new Float(1447.341160).doubleValue(), - "addRowsIndexConstant", new Float(1449.341160).doubleValue() + "addRowsIndexConstant", new Float(1449.341160).doubleValue(), + "uniques", QueryRunnerTestHelper.UNIQUES_1 ), ImmutableMap.of( - "provider", "total_market", + providerDimension, "total_market", "rows", 1L, "index", new Float(1314.839715).doubleValue(), - "addRowsIndexConstant", new Float(1316.839715).doubleValue() + "addRowsIndexConstant", new Float(1316.839715).doubleValue(), + "uniques", QueryRunnerTestHelper.UNIQUES_1 ), ImmutableMap.of( - "provider", "spot", + providerDimension, "spot", "rows", 1L, "index", new Float(109.705815).doubleValue(), - "addRowsIndexConstant", new Float(111.705815).doubleValue() + "addRowsIndexConstant", new Float(111.705815).doubleValue(), + "uniques", QueryRunnerTestHelper.UNIQUES_1 ) ) ) @@ -549,15 +627,15 @@ public class TopNQueryRunnerTest public void testTopNWithNonExistentFilterInOr() { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) .filters(providerDimension, "total_market", "upfront", "billyblank") .dimension(providerDimension) - .metric(indexMetric) + .metric(QueryRunnerTestHelper.indexMetric) .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); List> expectedResults = Arrays.asList( @@ -566,16 +644,18 @@ public class TopNQueryRunnerTest new TopNResultValue( Arrays.>asList( ImmutableMap.of( - "provider", "total_market", + providerDimension, "total_market", "rows", 4L, "index", 5351.814697265625D, - "addRowsIndexConstant", 5356.814697265625D + "addRowsIndexConstant", 5356.814697265625D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ), ImmutableMap.of( - "provider", "upfront", + providerDimension, "upfront", "rows", 4L, "index", 4875.669677734375D, - "addRowsIndexConstant", 4880.669677734375D + "addRowsIndexConstant", 4880.669677734375D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ) ) ) @@ -589,15 +669,15 @@ public class TopNQueryRunnerTest public void testTopNWithNonExistentFilter() { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) .filters(providerDimension, "billyblank") .dimension(providerDimension) - .metric(indexMetric) + .metric(QueryRunnerTestHelper.indexMetric) .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); TestHelper.assertExpectedResults( @@ -615,28 +695,28 @@ public class TopNQueryRunnerTest public void testTopNWithNonExistentFilterMultiDim() { AndDimFilter andDimFilter = Druids.newAndDimFilterBuilder() - .fields( - Lists.newArrayList( - Druids.newSelectorDimFilterBuilder() - .dimension(providerDimension) - .value("billyblank") - .build(), - 
Druids.newSelectorDimFilterBuilder() - .dimension(qualityDimension) - .value("mezzanine") - .build() - ) - ).build(); + .fields( + Lists.newArrayList( + Druids.newSelectorDimFilterBuilder() + .dimension(providerDimension) + .value("billyblank") + .build(), + Druids.newSelectorDimFilterBuilder() + .dimension(QueryRunnerTestHelper.qualityDimension) + .value("mezzanine") + .build() + ) + ).build(); TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) .filters(andDimFilter) .dimension(providerDimension) - .metric(indexMetric) + .metric(QueryRunnerTestHelper.indexMetric) .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); TestHelper.assertExpectedResults( @@ -654,30 +734,30 @@ public class TopNQueryRunnerTest public void testTopNWithMultiValueDimFilter1() { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) - .filters(placementishDimension, "m") + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) + .filters(QueryRunnerTestHelper.placementishDimension, "m") .dimension(providerDimension) - .metric(indexMetric) + .metric(QueryRunnerTestHelper.indexMetric) .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); TestHelper.assertExpectedResults( Sequences.toList( runner.run( new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) - .filters(qualityDimension, "mezzanine") + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) + .filters(QueryRunnerTestHelper.qualityDimension, "mezzanine") .dimension(providerDimension) - .metric(indexMetric) + .metric(QueryRunnerTestHelper.indexMetric) .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build() ), Lists.>newArrayList() ), runner.run(query) @@ -688,30 +768,30 @@ public class TopNQueryRunnerTest public void testTopNWithMultiValueDimFilter2() { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) - .filters(placementishDimension, "m", "a", "b") - .dimension(qualityDimension) - .metric(indexMetric) + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) + .filters(QueryRunnerTestHelper.placementishDimension, "m", "a", "b") + .dimension(QueryRunnerTestHelper.qualityDimension) + .metric(QueryRunnerTestHelper.indexMetric) .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) 
.build(); TestHelper.assertExpectedResults( Sequences.toList( runner.run( new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) - .filters(qualityDimension, "mezzanine", "automotive", "business") - .dimension(qualityDimension) - .metric(indexMetric) + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) + .filters(QueryRunnerTestHelper.qualityDimension, "mezzanine", "automotive", "business") + .dimension(QueryRunnerTestHelper.qualityDimension) + .metric(QueryRunnerTestHelper.indexMetric) .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build() ), Lists.>newArrayList() ) @@ -723,15 +803,15 @@ public class TopNQueryRunnerTest public void testTopNWithMultiValueDimFilter3() { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) - .filters(placementishDimension, "a") - .dimension(placementishDimension) - .metric(indexMetric) + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) + .filters(QueryRunnerTestHelper.placementishDimension, "a") + .dimension(QueryRunnerTestHelper.placementishDimension) + .metric(QueryRunnerTestHelper.indexMetric) .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); final ArrayList> expectedResults = Lists.newArrayList( @@ -743,13 +823,15 @@ public class TopNQueryRunnerTest "placementish", "a", "rows", 2L, "index", 283.31103515625D, - "addRowsIndexConstant", 286.31103515625D + "addRowsIndexConstant", 286.31103515625D, + "uniques", QueryRunnerTestHelper.UNIQUES_1 ), ImmutableMap.of( "placementish", "preferred", "rows", 2L, "index", 283.31103515625D, - "addRowsIndexConstant", 286.31103515625D + "addRowsIndexConstant", 286.31103515625D, + "uniques", QueryRunnerTestHelper.UNIQUES_1 ) ) ) @@ -763,15 +845,15 @@ public class TopNQueryRunnerTest public void testTopNWithMultiValueDimFilter4() { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) - .filters(placementishDimension, "a", "b") - .dimension(placementishDimension) - .metric(indexMetric) + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) + .filters(QueryRunnerTestHelper.placementishDimension, "a", "b") + .dimension(QueryRunnerTestHelper.placementishDimension) + .metric(QueryRunnerTestHelper.indexMetric) .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); final ArrayList> expectedResults = Lists.newArrayList( @@ -783,19 +865,22 @@ public class TopNQueryRunnerTest "placementish", "preferred", "rows", 4L, "index", 514.868408203125D, - "addRowsIndexConstant", 519.868408203125D + "addRowsIndexConstant", 519.868408203125D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ), ImmutableMap.of( "placementish", "a", "rows", 2L, 
"index", 283.31103515625D, - "addRowsIndexConstant", 286.31103515625D + "addRowsIndexConstant", 286.31103515625D, + "uniques", QueryRunnerTestHelper.UNIQUES_1 ), ImmutableMap.of( "placementish", "b", "rows", 2L, "index", 231.557373046875D, - "addRowsIndexConstant", 234.557373046875D + "addRowsIndexConstant", 234.557373046875D, + "uniques", QueryRunnerTestHelper.UNIQUES_1 ) ) ) @@ -809,15 +894,15 @@ public class TopNQueryRunnerTest public void testTopNWithMultiValueDimFilter5() { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) - .filters(placementishDimension, "preferred") - .dimension(placementishDimension) - .metric(indexMetric) + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) + .filters(QueryRunnerTestHelper.placementishDimension, "preferred") + .dimension(QueryRunnerTestHelper.placementishDimension) + .metric(QueryRunnerTestHelper.indexMetric) .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); final ArrayList> expectedResults = Lists.newArrayList( @@ -829,25 +914,29 @@ public class TopNQueryRunnerTest "placementish", "preferred", "rows", 26L, "index", 12459.361190795898D, - "addRowsIndexConstant", 12486.361190795898D + "addRowsIndexConstant", 12486.361190795898D, + "uniques", QueryRunnerTestHelper.UNIQUES_9 ), ImmutableMap.of( "placementish", "p", "rows", 6L, "index", 5407.213653564453D, - "addRowsIndexConstant", 5414.213653564453D + "addRowsIndexConstant", 5414.213653564453D, + "uniques", QueryRunnerTestHelper.UNIQUES_1 ), ImmutableMap.of( "placementish", "m", "rows", 6L, "index", 5320.717338562012D, - "addRowsIndexConstant", 5327.717338562012D + "addRowsIndexConstant", 5327.717338562012D, + "uniques", QueryRunnerTestHelper.UNIQUES_1 ), ImmutableMap.of( "placementish", "t", "rows", 4L, "index", 422.3440856933594D, - "addRowsIndexConstant", 427.3440856933594D + "addRowsIndexConstant", 427.3440856933594D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ) ) ) @@ -861,14 +950,14 @@ public class TopNQueryRunnerTest public void testTopNLexicographic() { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) .dimension(providerDimension) .metric(new LexicographicTopNMetricSpec("")) .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); List> expectedResults = Arrays.asList( @@ -877,22 +966,25 @@ public class TopNQueryRunnerTest new TopNResultValue( Arrays.>asList( ImmutableMap.of( - "provider", "spot", + providerDimension, "spot", "rows", 18L, "index", 2231.8768157958984D, - "addRowsIndexConstant", 2250.8768157958984D + "addRowsIndexConstant", 2250.8768157958984D, + "uniques", QueryRunnerTestHelper.UNIQUES_9 ), ImmutableMap.of( - "provider", "total_market", + providerDimension, "total_market", "rows", 4L, "index", 5351.814697265625D, - "addRowsIndexConstant", 5356.814697265625D + "addRowsIndexConstant", 5356.814697265625D, + "uniques", 
QueryRunnerTestHelper.UNIQUES_2 ), ImmutableMap.of( - "provider", "upfront", + providerDimension, "upfront", "rows", 4L, "index", 4875.669677734375D, - "addRowsIndexConstant", 4880.669677734375D + "addRowsIndexConstant", 4880.669677734375D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ) ) ) @@ -906,14 +998,14 @@ public class TopNQueryRunnerTest public void testTopNLexicographicWithPreviousStop() { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) .dimension(providerDimension) .metric(new LexicographicTopNMetricSpec("spot")) .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); List> expectedResults = Arrays.asList( @@ -922,16 +1014,18 @@ public class TopNQueryRunnerTest new TopNResultValue( Arrays.>asList( ImmutableMap.of( - "provider", "total_market", + providerDimension, "total_market", "rows", 4L, "index", 5351.814697265625D, - "addRowsIndexConstant", 5356.814697265625D + "addRowsIndexConstant", 5356.814697265625D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ), ImmutableMap.of( - "provider", "upfront", + providerDimension, "upfront", "rows", 4L, "index", 4875.669677734375D, - "addRowsIndexConstant", 4880.669677734375D + "addRowsIndexConstant", 4880.669677734375D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ) ) ) @@ -945,14 +1039,14 @@ public class TopNQueryRunnerTest public void testTopNLexicographicWithNonExistingPreviousStop() { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) .dimension(providerDimension) .metric(new LexicographicTopNMetricSpec("t")) .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); List> expectedResults = Arrays.asList( @@ -961,16 +1055,18 @@ public class TopNQueryRunnerTest new TopNResultValue( Arrays.>asList( ImmutableMap.of( - "provider", "total_market", + providerDimension, "total_market", "rows", 4L, "index", 5351.814697265625D, - "addRowsIndexConstant", 5356.814697265625D + "addRowsIndexConstant", 5356.814697265625D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ), ImmutableMap.of( - "provider", "upfront", + providerDimension, "upfront", "rows", 4L, "index", 4875.669677734375D, - "addRowsIndexConstant", 4880.669677734375D + "addRowsIndexConstant", 4880.669677734375D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ) ) ) @@ -984,8 +1080,8 @@ public class TopNQueryRunnerTest public void testTopNDimExtraction() { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) .dimension( new ExtractionDimensionSpec( providerDimension, providerDimension, new RegexDimExtractionFn("(.)") @@ -993,9 +1089,9 @@ public class TopNQueryRunnerTest ) .metric("rows") .threshold(4) - .intervals(firstToThird) - .aggregators(commonAggregators) - 
.postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); List> expectedResults = Arrays.asList( @@ -1004,22 +1100,25 @@ public class TopNQueryRunnerTest new TopNResultValue( Arrays.>asList( ImmutableMap.of( - "provider", "s", + providerDimension, "s", "rows", 18L, "index", 2231.8768157958984D, - "addRowsIndexConstant", 2250.8768157958984D + "addRowsIndexConstant", 2250.8768157958984D, + "uniques", QueryRunnerTestHelper.UNIQUES_9 ), ImmutableMap.of( - "provider", "t", + providerDimension, "t", "rows", 4L, "index", 5351.814697265625D, - "addRowsIndexConstant", 5356.814697265625D + "addRowsIndexConstant", 5356.814697265625D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ), ImmutableMap.of( - "provider", "u", + providerDimension, "u", "rows", 4L, "index", 4875.669677734375D, - "addRowsIndexConstant", 4880.669677734375D + "addRowsIndexConstant", 4880.669677734375D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ) ) ) @@ -1034,14 +1133,14 @@ public class TopNQueryRunnerTest { TopNQuery query = new TopNQueryBuilder() - .dataSource(dataSource) - .granularity(allGran) + .dataSource(QueryRunnerTestHelper.dataSource) + .granularity(QueryRunnerTestHelper.allGran) .dimension(providerDimension) - .metric(new InvertedTopNMetricSpec(new NumericTopNMetricSpec(indexMetric))) + .metric(new InvertedTopNMetricSpec(new NumericTopNMetricSpec(QueryRunnerTestHelper.indexMetric))) .threshold(3) - .intervals(firstToThird) - .aggregators(commonAggregators) - .postAggregators(Arrays.asList(addRowsIndexConstant)) + .intervals(QueryRunnerTestHelper.firstToThird) + .aggregators(QueryRunnerTestHelper.commonAggregators) + .postAggregators(Arrays.asList(QueryRunnerTestHelper.addRowsIndexConstant)) .build(); List> expectedResults = Arrays.asList( @@ -1050,22 +1149,25 @@ public class TopNQueryRunnerTest new TopNResultValue( Arrays.>asList( ImmutableMap.of( - "provider", "spot", + providerDimension, "spot", "rows", 18L, "index", 2231.8768157958984D, - "addRowsIndexConstant", 2250.8768157958984D + "addRowsIndexConstant", 2250.8768157958984D, + "uniques", QueryRunnerTestHelper.UNIQUES_9 ), ImmutableMap.of( - "provider", "upfront", + providerDimension, "upfront", "rows", 4L, "index", 4875.669677734375D, - "addRowsIndexConstant", 4880.669677734375D + "addRowsIndexConstant", 4880.669677734375D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ), ImmutableMap.of( - "provider", "total_market", + providerDimension, "total_market", "rows", 4L, "index", 5351.814697265625D, - "addRowsIndexConstant", 5356.814697265625D + "addRowsIndexConstant", 5356.814697265625D, + "uniques", QueryRunnerTestHelper.UNIQUES_2 ) ) ) @@ -1074,4 +1176,4 @@ public class TopNQueryRunnerTest TestHelper.assertExpectedResults(expectedResults, runner.run(query)); } -} \ No newline at end of file +} diff --git a/processing/src/test/java/io/druid/query/topn/TopNQueryTest.java b/processing/src/test/java/io/druid/query/topn/TopNQueryTest.java new file mode 100644 index 00000000000..f2f4ac22f3d --- /dev/null +++ b/processing/src/test/java/io/druid/query/topn/TopNQueryTest.java @@ -0,0 +1,81 @@ +/* + * Druid - a distributed column store. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * This file Copyright (C) 2014 N3TWORK, Inc. and contributed to the Druid project + * under the Druid Corporate Contributor License Agreement. + */ + +package io.druid.query.topn; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import io.druid.jackson.DefaultObjectMapper; +import io.druid.query.Query; +import io.druid.query.aggregation.AggregatorFactory; +import io.druid.query.aggregation.MaxAggregatorFactory; +import io.druid.query.aggregation.MinAggregatorFactory; +import io.druid.query.aggregation.PostAggregator; +import org.junit.Assert; +import org.junit.Test; + +import java.io.IOException; +import java.util.Arrays; + +import static io.druid.query.QueryRunnerTestHelper.addRowsIndexConstant; +import static io.druid.query.QueryRunnerTestHelper.allGran; +import static io.druid.query.QueryRunnerTestHelper.commonAggregators; +import static io.druid.query.QueryRunnerTestHelper.dataSource; +import static io.druid.query.QueryRunnerTestHelper.fullOnInterval; +import static io.druid.query.QueryRunnerTestHelper.indexMetric; +import static io.druid.query.QueryRunnerTestHelper.providerDimension; + +public class TopNQueryTest +{ + private static final ObjectMapper jsonMapper = new DefaultObjectMapper(); + + @Test + public void testQuerySerialization() throws IOException + { + Query query = new TopNQueryBuilder() + .dataSource(dataSource) + .granularity(allGran) + .dimension(providerDimension) + .metric(indexMetric) + .threshold(4) + .intervals(fullOnInterval) + .aggregators( + Lists.newArrayList( + Iterables.concat( + commonAggregators, + Lists.newArrayList( + new MaxAggregatorFactory("maxIndex", "index"), + new MinAggregatorFactory("minIndex", "index") + ) + ) + ) + ) + .postAggregators(Arrays.asList(addRowsIndexConstant)) + .build(); + + String json = jsonMapper.writeValueAsString(query); + Query serdeQuery = jsonMapper.readValue(json, Query.class); + + Assert.assertEquals(query, serdeQuery); + } + +} diff --git a/processing/src/test/java/io/druid/segment/TestIndex.java b/processing/src/test/java/io/druid/segment/TestIndex.java index 0fd53185727..aba067d6f93 100644 --- a/processing/src/test/java/io/druid/segment/TestIndex.java +++ b/processing/src/test/java/io/druid/segment/TestIndex.java @@ -21,6 +21,7 @@ package io.druid.segment; import com.google.common.base.Charsets; import com.google.common.base.Throwables; +import com.google.common.hash.Hashing; import com.google.common.io.CharStreams; import com.google.common.io.InputSupplier; import com.google.common.io.LineProcessor; @@ -31,7 +32,10 @@ import io.druid.data.input.impl.TimestampSpec; import io.druid.granularity.QueryGranularity; import io.druid.query.aggregation.AggregatorFactory; import 
io.druid.query.aggregation.DoubleSumAggregatorFactory; +import io.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory; +import io.druid.query.aggregation.hyperloglog.HyperUniquesSerde; import io.druid.segment.incremental.IncrementalIndex; +import io.druid.segment.serde.ComplexMetrics; import org.joda.time.DateTime; import org.joda.time.Interval; @@ -52,14 +56,29 @@ public class TestIndex private static QueryableIndex mmappedIndex = null; private static QueryableIndex mergedRealtime = null; - public static final String[] COLUMNS = new String[]{"ts", "provider", "quALIty", "plAcEmEnT", "pLacementish", "iNdEx"}; + public static final String[] COLUMNS = new String[]{ + "ts", + "provider", + "quALIty", + "plAcEmEnT", + "pLacementish", + "iNdEx", + "qualiTy_Uniques" + }; public static final String[] DIMENSIONS = new String[]{"provider", "quALIty", "plAcEmEnT", "pLacementish"}; public static final String[] METRICS = new String[]{"iNdEx"}; private static final Interval DATA_INTERVAL = new Interval("2011-01-12T00:00:00.000Z/2011-04-16T00:00:00.000Z"); private static final AggregatorFactory[] METRIC_AGGS = new AggregatorFactory[]{ - new DoubleSumAggregatorFactory(METRICS[0], METRICS[0]) + new DoubleSumAggregatorFactory(METRICS[0], METRICS[0]), + new HyperUniquesAggregatorFactory("quality_uniques", "quality") }; + static { + if (ComplexMetrics.getSerdeForType("hyperUnique") == null) { + ComplexMetrics.registerSerde("hyperUnique", new HyperUniquesSerde(Hashing.murmur3_128())); + } + } + public static IncrementalIndex getIncrementalTestIndex() { synchronized (log) { diff --git a/processing/src/test/java/io/druid/segment/incremental/IncrementalIndexStorageAdapterTest.java b/processing/src/test/java/io/druid/segment/incremental/IncrementalIndexStorageAdapterTest.java index e3b603af9ae..3c52b2b322f 100644 --- a/processing/src/test/java/io/druid/segment/incremental/IncrementalIndexStorageAdapterTest.java +++ b/processing/src/test/java/io/druid/segment/incremental/IncrementalIndexStorageAdapterTest.java @@ -22,6 +22,7 @@ package io.druid.segment.incremental; import com.google.common.base.Supplier; import com.google.common.base.Suppliers; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.metamx.common.guava.Sequence; import com.metamx.common.guava.Sequences; @@ -30,6 +31,7 @@ import io.druid.data.input.MapBasedInputRow; import io.druid.data.input.MapBasedRow; import io.druid.data.input.Row; import io.druid.granularity.QueryGranularity; +import io.druid.query.Result; import io.druid.query.aggregation.AggregatorFactory; import io.druid.query.aggregation.CountAggregatorFactory; import io.druid.query.aggregation.LongSumAggregatorFactory; @@ -37,9 +39,15 @@ import io.druid.query.filter.DimFilters; import io.druid.query.groupby.GroupByQuery; import io.druid.query.groupby.GroupByQueryConfig; import io.druid.query.groupby.GroupByQueryEngine; -import junit.framework.Assert; +import io.druid.query.topn.TopNQueryBuilder; +import io.druid.query.topn.TopNQueryEngine; +import io.druid.query.topn.TopNResultValue; +import io.druid.segment.Cursor; +import io.druid.segment.DimensionSelector; +import io.druid.segment.filter.SelectorFilter; import org.joda.time.DateTime; import org.joda.time.Interval; +import org.junit.Assert; import org.junit.Test; import java.nio.ByteBuffer; @@ -116,9 +124,111 @@ public class IncrementalIndexStorageAdapterTest } @Test - public void testFilterByNull() throws Exception - { - 
IncrementalIndex index = new IncrementalIndex( + public void testResetSanity() { + IncrementalIndex index = new IncrementalIndex( + 0, QueryGranularity.MINUTE, new AggregatorFactory[]{new CountAggregatorFactory("cnt")} + ); + + + DateTime t = DateTime.now(); + Interval interval = new Interval(t.minusMinutes(1), t.plusMinutes(1)); + + index.add( + new MapBasedInputRow( + t.minus(1).getMillis(), + Lists.newArrayList("billy"), + ImmutableMap.of("billy", "hi") + ) + ); + index.add( + new MapBasedInputRow( + t.minus(1).getMillis(), + Lists.newArrayList("sally"), + ImmutableMap.of("sally", "bo") + ) + ); + + IncrementalIndexStorageAdapter adapter = new IncrementalIndexStorageAdapter(index); + Iterable cursorIterable = adapter.makeCursors(new SelectorFilter("sally", "bo"), + interval, + QueryGranularity.NONE); + Cursor cursor = cursorIterable.iterator().next(); + DimensionSelector dimSelector; + + dimSelector = cursor.makeDimensionSelector("sally"); + Assert.assertEquals("bo", dimSelector.lookupName(dimSelector.getRow().get(0))); + + index.add( + new MapBasedInputRow( + t.minus(1).getMillis(), + Lists.newArrayList("sally"), + ImmutableMap.of("sally", "ah") + ) + ); + + // Cursor reset should not be affected by out of order values + cursor.reset(); + + dimSelector = cursor.makeDimensionSelector("sally"); + Assert.assertEquals("bo", dimSelector.lookupName(dimSelector.getRow().get(0))); + } + + @Test + public void testSingleValueTopN() + { + IncrementalIndex index = new IncrementalIndex( + 0, QueryGranularity.MINUTE, new AggregatorFactory[]{new CountAggregatorFactory("cnt")} + ); + + DateTime t = DateTime.now(); + index.add( + new MapBasedInputRow( + t.minus(1).getMillis(), + Lists.newArrayList("sally"), + ImmutableMap.of("sally", "bo") + ) + ); + + TopNQueryEngine engine = new TopNQueryEngine( + new StupidPool( + new Supplier() + { + @Override + public ByteBuffer get() + { + return ByteBuffer.allocate(50000); + } + } + ) + ); + + final Iterable> results = engine.query( + new TopNQueryBuilder().dataSource("test") + .granularity(QueryGranularity.ALL) + .intervals(Lists.newArrayList(new Interval(0, new DateTime().getMillis()))) + .dimension("sally") + .metric("cnt") + .threshold(10) + .aggregators( + Lists.newArrayList( + new LongSumAggregatorFactory( + "cnt", + "cnt" + ) + ) + ) + .build(), + new IncrementalIndexStorageAdapter(index) + ); + + Assert.assertEquals(1, Iterables.size(results)); + Assert.assertEquals(1, results.iterator().next().getValue().getValue().size()); + } + + @Test + public void testFilterByNull() throws Exception + { + IncrementalIndex index = new IncrementalIndex( 0, QueryGranularity.MINUTE, new AggregatorFactory[]{new CountAggregatorFactory("cnt")} ); diff --git a/publications/whitepaper/druid.pdf b/publications/whitepaper/druid.pdf index 7cfc29d8390..dd2900af392 100644 Binary files a/publications/whitepaper/druid.pdf and b/publications/whitepaper/druid.pdf differ diff --git a/publications/whitepaper/druid.tex b/publications/whitepaper/druid.tex index 8f417cfbfcc..e27c12404e9 100644 --- a/publications/whitepaper/druid.tex +++ b/publications/whitepaper/druid.tex @@ -5,6 +5,7 @@ \setmainfont[Ligatures={TeX}]{Times} \usepackage{hyperref} \graphicspath{{figures/}} +\usepackage{enumitem} \hyphenation{metamarkets nelson} @@ -18,8 +19,8 @@ \numberofauthors{6} \author{ -\alignauthor Fangjin Yang, Eric Tschetter, Gian Merlino, Nelson Ray, Xavier Léauté, Deep Ganguli, Himadri Singh\\ -\email{\{fangjin, cheddar, gian, nelson, xavier, deep, himadri\}@metamarkets.com} +\alignauthor Fangjin 
Yang, Eric Tschetter, Xavier Léauté, Nelson Ray, Gian Merlino, Deep Ganguli\\ +\email{\{fangjin, cheddar, xavier, nelson, gian, deep\}@metamarkets.com} } \date{21 March 2013} @@ -42,10 +43,10 @@ created a surge in machine-generated events. Individually, these events contain minimal useful information and are of low value. Given the time and resources required to extract meaning from large collections of events, many companies were willing to discard this data instead. Although -infrastructure has been built handle event based data (e.g. IBM's +infrastructure has been built to handle event based data (e.g. IBM's Netezza\cite{singh2011introduction}, HP's Vertica\cite{bear2012vertica}, and EMC's Greenplum\cite{miner2012unified}), they are largely sold at high price points -and are only targeted towards those companies who can afford the offerings. +and are only targeted towards those companies who can afford the offering. A few years ago, Google introduced MapReduce \cite{dean2008mapreduce} as their mechanism of leveraging commodity hardware to index the internet and analyze @@ -96,18 +97,18 @@ Section \ref{sec:problem-definition}. Next, we detail system architecture from the point of view of how data flows through the system in Section \ref{sec:architecture}. We then discuss how and why data gets converted into a binary format in Section \ref{sec:storage-format}. We briefly describe the -query API in Section \ref{sec:query-api} and present our experimental results -in Section \ref{sec:benchmarks}. Lastly, we leave off with our learnings from -running Druid in production in Section \ref{sec:production}, related work -in Section \ref{sec:related}, and conclusions in Section \ref{sec:conclusions}. +query API in Section \ref{sec:query-api} and present performance results +in Section \ref{sec:benchmarks}. Lastly, we leave off with our lessons from +running Druid in production in Section \ref{sec:production}, and related work +in Section \ref{sec:related}. \section{Problem Definition} \label{sec:problem-definition} Druid was originally designed to solve problems around ingesting and exploring -large quantities of transactional events (log data). This form of timeseries data is -commonly found in OLAP workflows and the nature of the data tends to be very -append heavy. For example, consider the data shown in +large quantities of transactional events (log data). This form of timeseries +data is commonly found in OLAP workflows and the nature of the data tends to be +very append heavy. For example, consider the data shown in Table~\ref{tab:sample_data}. Table~\ref{tab:sample_data} contains data for edits that have occurred on Wikipedia. Each time a user edits a page in Wikipedia, an event is generated that contains metadata about the edit. This @@ -115,12 +116,12 @@ metadata is comprised of 3 distinct components. First, there is a timestamp column indicating when the edit was made. Next, there are a set dimension columns indicating various attributes about the edit such as the page that was edited, the user who made the edit, and the location of the user. Finally, -there are a set of metric columns that contain values (usually numeric) to -aggregate over, such as the number of characters added or removed in an edit. +there are a set of metric columns that contain values (usually numeric) that +can be aggregated, such as the number of characters added or removed in an +edit. 
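As an editorial aside for concreteness: the test changes earlier in this patch build events with MapBasedInputRow, and one row of the sample Wikipedia table can be sketched in that same form — a timestamp, a few dimensions, and metric values to aggregate. This is only an illustration; the io.druid.data.input import path is an assumption, and the field values are taken from the sample data.
{\small\begin{verbatim}
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import io.druid.data.input.MapBasedInputRow;   // assumed import path
import org.joda.time.DateTime;

public class SampleEventSketch
{
  public static void main(String[] args)
  {
    // One Wikipedia edit: timestamp column, dimension columns, metric columns.
    MapBasedInputRow row = new MapBasedInputRow(
        new DateTime("2011-01-01T01:00:00Z").getMillis(),   // timestamp
        Lists.newArrayList("page", "gender", "city"),       // dimensions
        ImmutableMap.<String, Object>of(
            "page", "Justin Bieber",
            "gender", "Male",
            "city", "San Francisco",
            "added", 1800,                                  // metric: characters added
            "removed", 25                                   // metric: characters removed
        )
    );
    System.out.println(row);
  }
}
\end{verbatim}}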
\begin{table*} \centering - \caption{Sample Druid data for edits that have occurred on Wikipedia.} \label{tab:sample_data} \begin{tabular}{| l | l | l | l | l | l | l | l |} \hline @@ -130,6 +131,7 @@ aggregate over, such as the number of characters added or removed in an edit. 2011-01-01T02:00:00Z & Ke\$ha & Helz & Male & Calgary & 1953 & 17 \\ \hline 2011-01-01T02:00:00Z & Ke\$ha & Xeno & Male & Taiyuan & 3194 & 170 \\ \hline \end{tabular} + \caption{Sample Druid data for edits that have occurred on Wikipedia.} \end{table*} Our goal is to rapidly compute drill-downs and aggregates over this data. We @@ -173,14 +175,14 @@ analytics platform in multiple companies. \label{sec:architecture} A Druid cluster consists of different types of nodes and each node type is designed to perform a specific set of things. We believe this design separates -concerns and simplifies the complexity of the system. The different node types +concerns and simplifies the complexity of the system. The different node types operate fairly independent of each other and there is minimal interaction between them. Hence, intra-cluster communication failures have minimal impact on data availability. To solve complex data analysis problems, the different node types come together to form a fully working system. The name Druid comes from the Druid class in many role-playing games: it is a shape-shifter, capable of taking on many different forms to fulfill various different roles in a -group. The composition of and flow of data in a Druid cluster are shown in +group. The composition of and flow of data in a Druid cluster are shown in Figure~\ref{fig:cluster}. \begin{figure*} @@ -205,7 +207,7 @@ Zookeeper. Real-time nodes maintain an in-memory index buffer for all incoming events. These indexes are incrementally populated as new events are ingested and the -indexes are also directly queryable. Druid virtually behaves as a row store +indexes are also directly queryable. Druid behaves as a row store for queries on events that exist in this JVM heap-based buffer. To avoid heap overflow problems, real-time nodes persist their in-memory indexes to disk either periodically or after some maximum row limit is reached. This persist @@ -217,10 +219,10 @@ in \cite{o1996log} and is illustrated in Figure~\ref{fig:realtime_flow}. \begin{figure} \centering -\includegraphics[width = 2.8in]{realtime_flow} +\includegraphics[width = 2.6in]{realtime_flow} \caption{Real-time nodes first buffer events in memory. On a periodic basis, the in-memory index is persisted to disk. On another periodic basis, all -persisted indexes are merged together and handed off. Queries for data will hit the +persisted indexes are merged together and handed off. Queries will hit the in-memory index and the persisted indexes.} \label{fig:realtime_flow} \end{figure} @@ -236,25 +238,23 @@ file system such as S3 \cite{decandia2007dynamo} or HDFS persist, merge, and handoff steps are fluid; there is no data loss during any of the processes. -To better understand the flow of data through a real-time node, consider the -following example. First, we start a real-time node at 13:37. The node will -only accept events for the current hour or the next hour. When the node begins -ingesting events, it will announce that it is serving a segment of data for a -time window from 13:00 to 14:00. Every 10 minutes (the persist period is -configurable), the node will flush and persist its in-memory buffer to disk. 
-Near the end of the hour, the node will likely see events with timestamps from -14:00 to 15:00. When this occurs, the node prepares to serve data for the next -hour and creates a new in-memory index. The node then announces that it is also -serving a segment for data from 14:00 to 15:00. The node does not immediately -merge the indexes it persisted from 13:00 to 14:00, instead it waits for a -configurable window period for straggling events from 13:00 to 14:00 to come -in. Having a window period minimizes the risk of data loss from delays in event -delivery. At the end of the window period, the real-time node merges all -persisted indexes from 13:00 to 14:00 into a single immutable segment and hands -the segment off. Once this segment is loaded and queryable somewhere else in -the Druid cluster, the real-time node flushes all information about the data it -collected for 13:00 to 14:00 and unannounces it is serving this data. This -process is shown in Figure~\ref{fig:realtime_timeline}. +Figure~\ref{fig:realtime_timeline} illustrates the operations of a real-time +node. The node starts at 13:37 and will only accept events for the current hour +or the next hour. When events are ingested, the node announces that it is +serving a segment of data for an interval from 13:00 to 14:00. Every 10 +minutes (the persist period is configurable), the node will flush and persist +its in-memory buffer to disk. Near the end of the hour, the node will likely +see events for 14:00 to 15:00. When this occurs, the node prepares to serve +data for the next hour and creates a new in-memory index. The node then +announces that it is also serving a segment from 14:00 to 15:00. The node does +not immediately merge persisted indexes from 13:00 to 14:00, instead it waits +for a configurable window period for straggling events from 13:00 to 14:00 to +arrive. This window period minimizes the risk of data loss from delays in event +delivery. At the end of the window period, the node merges all persisted +indexes from 13:00 to 14:00 into a single immutable segment and hands the +segment off. Once this segment is loaded and queryable somewhere else in the +Druid cluster, the real-time node flushes all information about the data it +collected for 13:00 to 14:00 and unannounces it is serving this data. \begin{figure*} \centering @@ -283,26 +283,26 @@ milliseconds. The purpose of the message bus in Figure~\ref{fig:realtime_pipeline} is two-fold. First, the message bus acts as a buffer for incoming events. A -message bus such as Kafka maintains offsets indicating the position in an event -stream that a consumer (a real-time node) has read up to and consumers can -programmatically update these offsets. Typically, real-time nodes update this -offset each time they persist their in-memory buffers to disk. In a fail and -recover scenario, if a node has not lost disk, it can reload all persisted -indexes from disk and continue reading events from the last offset it -committed. Ingesting events from a recently committed offset greatly reduces a -node's recovery time. In practice, we see real-time nodes recover from such -failure scenarios in an order of seconds. +message bus such as Kafka maintains positional offsets indicating how far a +consumer (a real-time node) has read in an event stream. Consumers can +programmatically update these offsets. Real-time nodes update this offset each +time they persist their in-memory buffers to disk. 
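A minimal sketch of the persist-then-commit ordering just described, using hypothetical interfaces rather than Druid's real-time classes: the offset is only advanced after the in-memory index has safely reached disk, so a restart replays at most the events received since the last persist.
{\small\begin{verbatim}
// Hypothetical interfaces for illustration only; these are not Druid types.
interface InMemoryIndex { void persistToDisk(); }
interface MessageBusConsumer { long currentOffset(); void commitOffset(long offset); }

public final class PersistThenCommit
{
  public static void persistAndCommit(InMemoryIndex index, MessageBusConsumer consumer)
  {
    final long offsetBeforePersist = consumer.currentOffset(); // events up to here are buffered
    index.persistToDisk();                                     // make them durable first
    consumer.commitOffset(offsetBeforePersist);                // only then record progress
  }
}
\end{verbatim}}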
In a fail and recover +scenario, if a node has not lost disk, it can reload all persisted indexes from +disk and continue reading events from the last offset it committed. Ingesting +events from a recently committed offset greatly reduces a node's recovery time. +In practice, we see nodes recover from such failure scenarios in a +few seconds. The second purpose of the message bus is to act as a single endpoint from which multiple real-time nodes can read events. Multiple real-time nodes can ingest -the same set of events from the bus, thus creating a replication of events. In -a scenario where a node completely fails and does not recover, replicated -streams ensure that no data is lost. A single ingestion endpoint also allows -for data streams for be partitioned such that multiple real-time nodes each -ingest a portion of a stream. This allows additional real-time nodes to be -seamlessly added. In practice, this model has allowed one of the largest -production Druid clusters to be able to consume raw data at approximately 500 -MB/s (150,000 events/s or 2 TB/hour). +the same set of events from the bus, creating a replication of events. In a +scenario where a node completely fails and loses disk, replicated streams +ensure that no data is lost. A single ingestion endpoint also allows for data +streams for be partitioned such that multiple real-time nodes each ingest a +portion of a stream. This allows additional real-time nodes to be seamlessly +added. In practice, this model has allowed one of the largest production Druid +clusters to be able to consume raw data at approximately 500 MB/s (150,000 +events/s or 2 TB/hour). \subsection{Historical Nodes} Historical nodes encapsulate the functionality to load and serve the immutable @@ -373,7 +373,7 @@ a final consolidated result to the caller. \label{sec:caching} Broker nodes contain a cache with a LRU \cite{o1993lru, kim2001lrfu} invalidation strategy. The cache can use local heap memory or an external -distributed key/value store such as memcached +distributed key/value store such as Memcached \cite{fitzpatrick2004distributed}. Each time a broker node receives a query, it first maps the query to a set of segments. Results for certain segments may already exist in the cache and there is no need to recompute them. For any @@ -489,7 +489,7 @@ information and segment metadata information about what segments should exist in the cluster. If MySQL goes down, this information becomes unavailable to coordinator nodes. However, this does not mean data itself is unavailable. If coordinator nodes cannot communicate to MySQL, they will cease to assign new -segments and drop outdated ones. Broker, historical and real-time nodes are still +segments and drop outdated ones. Broker, historical, and real-time nodes are still queryable during MySQL outages. \section{Storage Format} @@ -505,9 +505,7 @@ Druid always requires a timestamp column as a method of simplifying data distribution policies, data retention policies, and first-level query pruning. Druid partitions its data sources into well-defined time intervals, typically an hour or a day, and may further partition on values from other columns to -achieve the desired segment size. For example, partitioning the data in -Table~\ref{tab:sample_data} by hour results in two segments for 2011-01-01, and -partitioning the data by day results in a single segment. The time granularity +achieve the desired segment size. The time granularity to partition segments is a function of data volume and time range. 
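To make the time-based pruning concrete, here is a small illustration using the Joda-Time types already present in this patch; it is not Druid's partitioning code, just a sketch of the idea that an event is bucketed into a time-granular interval and a query whose interval does not overlap that bucket never touches the segment.
{\small\begin{verbatim}
import org.joda.time.DateTime;
import org.joda.time.Interval;

public class TimePartitionSketch
{
  public static void main(String[] args)
  {
    DateTime eventTime = new DateTime("2011-01-01T01:00:00Z");
    DateTime dayStart = eventTime.dayOfYear().roundFloorCopy();       // floor to the day
    Interval segmentInterval = new Interval(dayStart, dayStart.plusDays(1));

    Interval queryInterval = new Interval(new DateTime("2013-01-01"),
                                          new DateTime("2013-01-08"));
    System.out.println("segment " + segmentInterval + " overlaps query: "
                       + segmentInterval.overlaps(queryInterval));    // false -> segment pruned
  }
}
\end{verbatim}}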
A data set with timestamps spread over a year is better partitioned by day, and a data set with timestamps spread over a day is better partitioned by hour. @@ -540,17 +538,17 @@ method to compress data and has been used in other data stores such as PowerDrill \cite{hall2012processing}. In the example in Table~\ref{tab:sample_data}, we can map each page to an unique integer identifier. -\begin{verbatim} +{\small\begin{verbatim} Justin Bieber -> 0 Ke$ha -> 1 -\end{verbatim} +\end{verbatim}} This mapping allows us to represent the page column as an integer array where the array indices correspond to the rows of the original data set. For the page column, we can represent the unique pages as follows: -\begin{verbatim} +{\small\begin{verbatim} [0, 0, 1, 1] -\end{verbatim} +\end{verbatim}} The resulting integer array lends itself very well to compression methods. Generic compression algorithms on top of encodings are @@ -561,17 +559,17 @@ Similar compression methods can be applied to numeric columns. For example, the characters added and characters removed columns in Table~\ref{tab:sample_data} can also be expressed as individual arrays. -\begin{verbatim} -Characters Added -> [1800, 2912, 1953, 3194] +{\small\begin{verbatim} +Characters Added -> [1800, 2912, 1953, 3194] Characters Removed -> [25, 42, 17, 170] -\end{verbatim} +\end{verbatim}} In this case, we compress the raw values as opposed to their dictionary representations. \subsection{Indices for Filtering Data} In many real world OLAP workflows, queries are issued for the aggregated results of some set of metrics where some set of dimension specifications are -met. An example query may be asked is: "How many Wikipedia edits were done by users in +met. An example query is: "How many Wikipedia edits were done by users in San Francisco who are also male?". This query is filtering the Wikipedia data set in Table~\ref{tab:sample_data} based on a Boolean expression of dimension values. In many real world data sets, dimension columns contain strings and @@ -586,22 +584,22 @@ indicating in which table rows a particular page is seen. We can store this information in a binary array where the array indices represent our rows. If a particular page is seen in a certain row, that array index is marked as \texttt{1}. For example: -\begin{verbatim} -Justin Bieber -> rows [0, 1] -> [1][1][0][0] -Ke$ha -> rows [2, 3] -> [0][0][1][1] -\end{verbatim} +{\small\begin{verbatim} +Justin Bieber -> rows [0, 1] -> [1][1][0][0] +Ke$ha -> rows [2, 3] -> [0][0][1][1] +\end{verbatim}} \texttt{Justin Bieber} is seen in rows \texttt{0} and \texttt{1}. This mapping of column values to row indices forms an inverted index \cite{tomasic1993performance}. To know which rows contain {\ttfamily Justin Bieber} or {\ttfamily Ke\$ha}, we can \texttt{OR} together the two arrays. -\begin{verbatim} +{\small\begin{verbatim} [0][1][0][1] OR [1][0][1][0] = [1][1][1][1] -\end{verbatim} +\end{verbatim}} \begin{figure} \centering -\includegraphics[width = 3in]{concise_plot} +\includegraphics[width = 2.8in]{concise_plot} \caption{Integer array size versus Concise set size.} \label{fig:concise_plot} \end{figure} @@ -609,31 +607,24 @@ the two arrays. This approach of performing Boolean operations on large bitmap sets is commonly used in search engines. Bitmap indices for OLAP workloads is described in detail in \cite{o1997improved}. Bitmap compression algorithms are a -well-defined area of research and often utilize run-length encoding. 
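Pulling the dictionary encoding and inverted indexes described above together, a plain-Java sketch of the page column from the example, using java.util.BitSet in place of the Concise sets Druid actually uses; this is an illustration only, not Druid's column code.
{\small\begin{verbatim}
import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;

public class DictionaryBitmapSketch
{
  public static void main(String[] args)
  {
    String[] pageColumn = {"Justin Bieber", "Justin Bieber", "Ke$ha", "Ke$ha"};

    Map<String, Integer> dictionary = new HashMap<String, Integer>();
    int[] encoded = new int[pageColumn.length];              // becomes [0, 0, 1, 1]
    Map<Integer, BitSet> invertedIndex = new HashMap<Integer, BitSet>();

    for (int row = 0; row < pageColumn.length; row++) {
      Integer id = dictionary.get(pageColumn[row]);
      if (id == null) {
        id = dictionary.size();                              // next unused integer id
        dictionary.put(pageColumn[row], id);
        invertedIndex.put(id, new BitSet());
      }
      encoded[row] = id;
      invertedIndex.get(id).set(row);                        // row bitmap for this value
    }

    // "Justin Bieber OR Ke$ha" is a bitwise OR of the two value bitmaps.
    BitSet either = (BitSet) invertedIndex.get(dictionary.get("Justin Bieber")).clone();
    either.or(invertedIndex.get(dictionary.get("Ke$ha")));
    System.out.println(either);                              // {0, 1, 2, 3}
  }
}
\end{verbatim}}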
Popular -algorithms include Byte-aligned Bitmap Code \cite{antoshenkov1995byte}, -Word-Aligned Hybrid (WAH) code \cite{wu2006optimizing}, and Partitioned -Word-Aligned Hybrid (PWAH) compression \cite{van2011memory}. Druid opted to use -the Concise algorithm \cite{colantonio2010concise} as it can outperform WAH by -reducing the size of the compressed bitmaps by up to 50\%. -Figure~\ref{fig:concise_plot} illustrates the number of bytes using Concise -compression versus using an integer array. The results were generated on a -cc2.8xlarge system with a single thread, 2G heap, 512m young gen, and a forced -GC between each run. The data set is a single day’s worth of data collected -from the Twitter garden hose \cite{twitter2013} data stream. The data set -contains 2,272,295 rows and 12 dimensions of varying cardinality. As an -additional comparison, we also resorted the data set rows to maximize -compression. +well-defined area of research \cite{antoshenkov1995byte, wu2006optimizing, +van2011memory} and often utilize run-length encoding. Druid opted to use the +Concise algorithm \cite{colantonio2010concise} as it can outperform WAH by +reducing compressed bitmap size by up to 50\%. Figure~\ref{fig:concise_plot} +illustrates the number of bytes using Concise compression versus using an +integer array. The results were generated on a \texttt{cc2.8xlarge} system with a single +thread, 2G heap, 512m young gen, and a forced GC between each run. The data set +is a single day’s worth of data collected from the Twitter garden hose +\cite{twitter2013} data stream. The data set contains 2,272,295 rows and 12 +dimensions of varying cardinality. As an additional comparison, we also +resorted the data set rows to maximize compression. In the unsorted case, the total Concise size was 53,451,144 bytes and the total integer array size was 127,248,520 bytes. Overall, Concise compressed sets are about 42\% smaller than integer arrays. In the sorted case, the total Concise compressed size was 43,832,884 bytes and the total integer array size was 127,248,520 bytes. What is interesting to note is that after sorting, global -compression only increased minimally. The total Concise set size to total -integer array size is 34\%. It is also interesting to note that as the -cardinality of a dimension approaches the total number of rows in a data set, -integer arrays require less space than Concise sets and become a better -alternative. +compression only increased minimally. \subsection{Storage Engine} Druid’s persistence components allows for different storage engines to be @@ -674,253 +665,323 @@ into data at any depth. The exact query syntax depends on the query type and the information requested. 
A sample count query over a week of data is as follows: -\newpage -\begin{verbatim} +{\scriptsize\begin{verbatim} { - "queryType" : "timeseries", - "dataSource" : "wikipedia", - "intervals" : "2013-01-01/2013-01-08", - "filter" : { - "type" : "selector", - "dimension" : "page", - "value" : "Ke$ha" - }, - "granularity" : "day", - "aggregations" : [ { - "type" : "count", - "name" : "rows" - } ] + "queryType" : "timeseries", + "dataSource" : "wikipedia", + "intervals" : "2013-01-01/2013-01-08", + "filter" : { + "type" : "selector", + "dimension" : "page", + "value" : "Ke$ha" + }, + "granularity" : "day", + "aggregations" : [{"type":"count", "name":"rows"}] } -\end{verbatim} +\end{verbatim}} The query shown above will return a count of the number of rows in the Wikipedia datasource from 2013-01-01 to 2013-01-08, filtered for only those rows where the value of the "page" dimension is equal to "Ke\$ha". The results will be bucketed by day and will be a JSON array of the following form: -\begin{verbatim} +{\scriptsize\begin{verbatim} [ { "timestamp": "2012-01-01T00:00:00.000Z", - "result": { - "rows": 393298 - } + "result": {"rows":393298} }, { "timestamp": "2012-01-02T00:00:00.000Z", - "result": { - "rows": 382932 - } + "result": {"rows":382932} }, ... { "timestamp": "2012-01-07T00:00:00.000Z", - "result": { - "rows": 1337 - } + "result": {"rows": 1337} } ] -\end{verbatim} +\end{verbatim}} + Druid supports many types of aggregations including double sums, long sums, -minimums, maximums, and several others. Druid also supports complex aggregations -such as cardinality estimation and approximate quantile estimation. The -results of aggregations can be combined in mathematical expressions to form -other aggregations. The query API is highly customizable and can be extended to -filter and group results based on almost any arbitrary condition. It is beyond -the scope of this paper to fully describe the query API but more information -can be found +minimums, maximums, and complex aggregations such as cardinality estimation and +approximate quantile estimation. The results of aggregations can be combined +in mathematical expressions to form other aggregations. It is beyond the scope +of this paper to fully describe the query API but more information can be found online\footnote{\href{http://druid.io/docs/latest/Querying.html}{http://druid.io/docs/latest/Querying.html}}. -At the time of writing, the query language does not support joins. Although the -storage format is able to support joins, we've targeted Druid at user-facing -workloads that must return in a matter of seconds, and as such, we've chosen to -not spend the time to implement joins as it has been our experience that -requiring joins on your queries often limits the performance you can achieve. -Implemting joins and extending the Druid API to understand SQL is something -we'd like to do in future work. -\section{Experimental Results} +As of this writing, a join query for Druid is not yet implemented. This has +been a function of engineering resource allocation decisions and use case more +than a decision driven by technical merit. Indeed, Druid's storage format +would allow for the implementation of joins (there is no loss of fidelity for +columns included as dimensions) and the implementation of them has been a +conversation that we have every few months. To date, we have made the choice +that the implementation cost is not worth the investment for our organization. +The reasons for this decision are generally two-fold. 
+ +\begin{enumerate} +\item Scaling join queries has been, in our professional experience, a constant bottleneck of working with distributed databases. +\item The incremental gains in functionality are perceived to be of less value than the anticipated problems with managing highly concurrent, join-heavy workloads. +\end{enumerate} + +A join query is essentially the merging of two or more streams of data based on +a shared set of keys. The primary high-level strategies for join queries the +authors are aware of are a hash-based strategy or a sorted-merge strategy. The +hash-based strategy requires that all but one data set be available as +something that looks like a hash table, a lookup operation is then performed on +this hash table for every row in the "primary" stream. The sorted-merge +strategy assumes that each stream is sorted by the join key and thus allows for +the incremental joining of the streams. Each of these strategies, however, +requires the materialization of some number of the streams either in sorted +order or in a hash table form. + +When all sides of the join are significantly large tables (> 1 billion records), +materializing the pre-join streams requires complex distributed memory +management. The complexity of the memory management is only amplified by +the fact that we are targeting highly concurrent, multitenant workloads. +This is, as far as the authors are aware, an active academic research +problem that we would be more than willing to engage with the academic +community to help resolving in a scalable manner. + + +\section{Performance} \label{sec:benchmarks} -To illustrate Druid's performance, we conducted a series of experiments that -focused on measuring Druid's query and data ingestion capabilities. +Druid runs in production at several organizations, and to demonstrate its +performance, we have chosen to share some real world numbers for the main production +cluster running at Metamarkets in early 2014. For comparison with other databases +we also include results from synthetic workloads on TPC-H data. -\subsection{Query Performance} -To benchmark Druid query performance, we created a large test cluster with 6TB -of uncompressed data, representing tens of billions of fact rows. The data set -contained more than a dozen dimensions, with cardinalities ranging from the -double digits to tens of millions. We computed four metrics for each row -(counts, sums, and averages). The data was sharded first on timestamp and then -on dimension values, creating thousands of shards roughly 8 million fact rows -apiece. +\subsection{Query Performance in Production} +Druid query performance can vary signficantly depending on the query +being issued. For example, sorting the values of a high cardinality dimension +based on a given metric is much more expensive than a simple count over a time +range. To showcase the average query latencies in a production Druid cluster, +we selected 8 of our most queried data sources, described in +Table~\ref{tab:datasources}. -The cluster used in the benchmark consisted of 100 historical nodes, each with -16 cores, 60GB of RAM, 10 GigE Ethernet, and 1TB of disk space. Collectively, -the cluster comprised of 1600 cores, 6TB or RAM, sufficiently fast Ethernet and -more than enough disk space. 
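Returning to the join strategies discussed above, the hash-based variant can be illustrated in a few lines of plain Java: the smaller side is materialized as a hash table and the "primary" stream is driven through it one lookup per row. The lookup table and row shapes here are invented purely for illustration.
{\small\begin{verbatim}
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class HashJoinSketch
{
  public static void main(String[] args)
  {
    // Smaller side, fully materialized as a hash table keyed on the join key.
    Map<String, String> cityToRegion = new HashMap<String, String>();
    cityToRegion.put("San Francisco", "North America");
    cityToRegion.put("Calgary", "North America");
    cityToRegion.put("Taiyuan", "Asia");

    // "Primary" stream: processed row by row, one hash lookup per row.
    String[][] edits = {
        {"Justin Bieber", "San Francisco"},
        {"Ke$ha", "Calgary"}
    };
    List<String> joined = new ArrayList<String>();
    for (String[] edit : edits) {
      String region = cityToRegion.get(edit[1]);   // lookup on the shared key
      if (region != null) {
        joined.add(edit[0] + " -> " + region);
      }
    }
    System.out.println(joined);
  }
}
\end{verbatim}}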
+Approximately 30\% of the queries are standard +aggregates involving different types of metrics and filters, 60\% of queries +are ordered group bys over one or more dimensions with aggregates, and 10\% of +queries are search queries and metadata retrieval queries. The number of +columns scanned in aggregate queries roughly follows an exponential +distribution. Queries involving a single column are very frequent, and queries +involving all columns are very rare. -SQL statements are included in Table~\ref{tab:sql_queries}. These queries are -meant to represent some common queries that are made against Druid for typical data -analysis workflows. Although Druid has its own query language, we choose to -translate the queries into SQL to better describe what the queries are doing. -Please note: -\begin{itemize} -\item The timestamp range of the queries encompassed all data. -\item Each machine was a 16-core machine with 60GB RAM and 1TB of local - disk. The machine was configured to only use 15 threads for - processing queries. -\item A memory-mapped storage engine was used (the machine was configured to memory map the data - instead of loading it into the Java heap.) +\begin{table} + \centering + \label{tab:datasources} + \begin{tabular}{| l | l | l |} + \hline + \textbf{Data Source} & \textbf{Dimensions} & \textbf{Metrics} \\ \hline + \texttt{a} & 25 & 21 \\ \hline + \texttt{b} & 30 & 26 \\ \hline + \texttt{c} & 71 & 35 \\ \hline + \texttt{d} & 60 & 19 \\ \hline + \texttt{e} & 29 & 8 \\ \hline + \texttt{f} & 30 & 16 \\ \hline + \texttt{g} & 26 & 18 \\ \hline + \texttt{h} & 78 & 14 \\ \hline + \end{tabular} + \caption{Characteristics of production data sources.} +\end{table} + +A few notes about our results: +\begin{itemize}[leftmargin=*,beginpenalty=5000,topsep=0pt] +\item The results are from a "hot" tier in our production cluster. We run +several tiers of varying performance in production. + +\item There is approximately 10.5TB of RAM available in the "hot" tier and +approximately 10TB of segments loaded (including replication). Collectively, +there are about 50 billion Druid rows in this tier. Results for +every data source are not shown. + +\item The hot tier uses Xeon E5-2670 processors and consists of 1302 processing +threads and 672 total cores (hyperthreaded). + +\item A memory-mapped storage engine was used (the machine was configured to + memory map the data instead of loading it into the Java heap.) \end{itemize} -\begin{table*} - \centering - \caption{Druid Queries} - \label{tab:sql_queries} - \begin{tabular}{| l | p{15cm} |} - \hline - \textbf{Query \#} & \textbf{Query} \\ \hline - 1 & \texttt{SELECT count(*) FROM \_table\_ WHERE timestamp $\geq$ ? AND timestamp < ?} \\ \hline - 2 & \texttt{SELECT count(*), sum(metric1) FROM \_table\_ WHERE timestamp $\geq$ ? AND timestamp < ?} \\ \hline - 3 & \texttt{SELECT count(*), sum(metric1), sum(metric2), sum(metric3), sum(metric4) FROM \_table\_ WHERE timestamp $\geq$ ? AND timestamp < ?} \\ \hline - 4 & \texttt{SELECT high\_card\_dimension, count(*) AS cnt FROM \_table\_ -WHERE timestamp $\geq$ ? AND timestamp < ? GROUP BY high\_card\_dimension ORDER -BY cnt limit 100} \\ \hline 5 & \texttt{SELECT high\_card\_dimension, count(*) -AS cnt, sum(metric1) FROM \_table\_ WHERE timestamp $\geq$ ? AND timestamp < ? -GROUP BY high\_card\_dimension ORDER BY cnt limit 100} \\ \hline 6 & -\texttt{SELECT high\_card\_dimension, count(*) AS cnt, sum(metric1), -sum(metric2), sum(metric3), sum(metric4) FROM \_table\_ WHERE timestamp $\geq$ -? 
AND timestamp < ? GROUP BY high\_card\_dimension ORDER BY cnt limit 100} \\ -\hline \end{tabular} \end{table*} +Query latencies are shown in Figure~\ref{fig:query_latency} and the queries per +minute is shown in Figure~\ref{fig:queries_per_min}. Across all the various +data sources, average query latency is approximately 550 milliseconds, with +90\% of queries returning in less than 1 second, 95\% in under 2 seconds, and +99\% of queries taking less than 10 seconds to complete. +Occasionally we observe spikes in latency, as observed on February 19, +in which case network issues on the cache nodes were compounded by very high +query load on one of our largest datasources. -Figure~\ref{fig:cluster_scan_rate} shows the cluster scan rate and -Figure~\ref{fig:core_scan_rate} shows the core scan rate. In -Figure~\ref{fig:cluster_scan_rate} we also include projected linear scaling -based on the results of the 25 core cluster. In particular, we observe -diminishing marginal returns to performance in the size of the cluster. Under -linear scaling, the first SQL count query (query 1) would have achieved a speed -of 37 billion rows per second on our 75 node cluster. In fact, the speed was -26 billion rows per second. However, queries 2-6 maintain a near-linear -speedup up to 50 nodes: the core scan rates in Figure~\ref{fig:core_scan_rate} -remain nearly constant. The increase in speed of a parallel computing system -is often limited by the time needed for the sequential operations of the -system, in accordance with Amdahl's law \cite{amdahl1967validity}. +\begin{figure} +\centering +\includegraphics[width = 2.3in]{avg_query_latency} +\includegraphics[width = 2.3in]{query_percentiles} +\caption{Query latencies of production data sources.} +\label{fig:query_latency} +\end{figure} -\begin{figure} \centering \includegraphics[width = 2.8in]{cluster_scan_rate} -\caption{Druid cluster scan rate with lines indicating linear scaling from 25 -nodes.} \label{fig:cluster_scan_rate} \end{figure} +\begin{figure} +\centering +\includegraphics[width = 2.8in]{queries_per_min} +\caption{Queries per minute of production data sources.} +\label{fig:queries_per_min} +\end{figure} -\begin{figure} \centering \includegraphics[width = 2.8in]{core_scan_rate} -\caption{Druid core scan rate.} \label{fig:core_scan_rate} \end{figure} +\subsection{Query Benchmarks on TPC-H Data} +We also present Druid benchmarks on TPC-H data. +Most TPC-H queries do not directly apply to Druid, so we +selected queries more typical of Druid's workload to demonstrate query performance. As a +comparison, we also provide the results of the same queries using MySQL using the +MyISAM engine (InnoDB was slower in our experiments). -The first query listed in Table~\ref{tab:sql_queries} is a simple -count, achieving scan rates of 33M rows/second/core. We believe -the 75 node cluster was actually overprovisioned for the test -dataset, explaining the modest improvement over the 50 node cluster. -Druid's concurrency model is based on shards: one thread will scan one -shard. If the number of segments on a historical node modulo the number -of cores is small (e.g. 17 segments and 15 cores), then many of the -cores will be idle during the last round of the computation. +We selected MySQL to benchmark +against because of its universal popularity. We choose not to select another +open source column store because we were not confident we could correctly tune +it for optimal performance. -When we include more aggregations we see performance degrade. 
This is -because of the column-oriented storage format Druid employs. For the -\texttt{count(*)} queries, Druid only has to check the timestamp column to satisfy -the ``where'' clause. As we add metrics, it has to also load those metric -values and scan over them, increasing the amount of memory scanned. +Our Druid setup used Amazon EC2 +\texttt{m3.2xlarge} (Intel(R) Xeon(R) CPU E5-2680 v2 @ 2.80GHz) instances for +historical nodes and \texttt{c3.2xlarge} (Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz) instances for broker +nodes. Our MySQL setup was an Amazon RDS instance that ran on the same \texttt{m3.2xlarge} instance type. + +The results for the 1 GB TPC-H data set are shown +in Figure~\ref{fig:tpch_1gb} and the results of the 100 GB data set are shown +in Figure~\ref{fig:tpch_100gb}. We benchmarked Druid's scan rate at +53,539,211 rows/second/core for \texttt{select count(*)} equivalent query over a given time interval +and 36,246,530 rows/second/core for a \texttt{select sum(float)} type query. + +\begin{figure} +\centering +\includegraphics[width = 2.3in]{tpch_1gb} +\caption{Druid \& MySQL benchmarks -- 1GB TPC-H data.} +\label{fig:tpch_1gb} +\end{figure} + +\begin{figure} +\centering +\includegraphics[width = 2.3in]{tpch_100gb} +\caption{Druid \& MySQL benchmarks -- 100GB TPC-H data.} +\label{fig:tpch_100gb} +\end{figure} + +Finally, we present our results of scaling Druid to meet increasing data +volumes with the TPC-H 100 GB data set. We observe that when we +increased the number of cores from 8 to 48, not all types of queries +achieve linear scaling, but the simpler aggregation queries do, +as shown in Figure~\ref{fig:tpch_scaling}. + +The increase in speed of a parallel computing system is often limited by the +time needed for the sequential operations of the system. In this case, queries +requiring a substantial amount of work at the broker level do not parallelize as +well. + +\begin{figure} +\centering +\includegraphics[width = 2.3in]{tpch_scaling} +\caption{Druid scaling benchmarks -- 100GB TPC-H data.} +\label{fig:tpch_scaling} +\end{figure} \subsection{Data Ingestion Performance} -To measure Druid's data latency latency, we spun up a single real-time node -with the following configurations: -\begin{itemize} -\item JVM arguments: -Xmx2g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -XX:+HeapDumpOnOutOfMemoryError -\item CPU: 2.3 GHz Intel Core i7 -\end{itemize} +To showcase Druid's data ingestion latency, we selected several production +datasources of varying dimensions, metrics, and event volumes. Our production +ingestion setup consists of 6 nodes, totalling 360GB of RAM and 96 cores +(12 x Intel Xeon E5-2670). + +Note that in this setup, several other data sources were being ingested and +many other Druid related ingestion tasks were running concurrently on those machines. Druid's data ingestion latency is heavily dependent on the complexity of the data set being ingested. The data complexity is determined by the number of dimensions in each event, the number of metrics in each event, and the types of aggregations we want to perform on those metrics. With the most basic data set (one that only has a timestamp column), our setup can ingest data at a rate of -800k events/sec/node, which is really just a measurement of how fast we can -deserialize events. Real world data sets are never this simple. To simulate -real-world ingestion rates, we created a data set with 5 dimensions and a -single metric. 
4 out of the 5 dimensions have a cardinality less than 100, and -we varied the cardinality of the final dimension. The results of varying the -cardinality of a dimension is shown in -Figure~\ref{fig:throughput_vs_cardinality}. +800,000 events/second/core, which is really just a measurement of how fast we can +deserialize events. Real world data sets are never this simple. +Table~\ref{tab:ingest_datasources} shows a selection of data sources and their +chracteristics. + +\begin{table} + \centering + \label{tab:ingest_datasources} + \begin{tabular}{| l | l | l | l |} + \hline + \scriptsize\textbf{Data Source} & \scriptsize\textbf{Dimensions} & \scriptsize\textbf{Metrics} & \scriptsize\textbf{Peak events/s} \\ \hline + \texttt{s} & 7 & 2 & 28334.60 \\ \hline + \texttt{t} & 10 & 7 & 68808.70 \\ \hline + \texttt{u} & 5 & 1 & 49933.93 \\ \hline + \texttt{v} & 30 & 10 & 22240.45 \\ \hline + \texttt{w} & 35 & 14 & 135763.17 \\ \hline + \texttt{x} & 28 & 6 & 46525.85 \\ \hline + \texttt{y} & 33 & 24 & 162462.41 \\ \hline + \texttt{z} & 33 & 24 & 95747.74 \\ \hline + \end{tabular} + \caption{Ingestion characteristics of various data sources.} +\end{table} + +We can see that, based on the descriptions in +Table~\ref{tab:ingest_datasources}, latencies vary significantly and the +ingestion latency is not always a factor of the number of dimensions and +metrics. We see some lower latencies on simple data sets because that was the +rate that the data producer was delivering data. The results are shown in +Figure~\ref{fig:ingestion_rate}. + +We define throughput as the number of events a +real-time node can ingest and also make queryable. If too many events are sent +to the real-time node, those events are blocked until the real-time node has +capacity to accept them. The peak ingestion latency we measured in production +was 22914.43 events/second/core on a datasource with 30 dimensions and 19 metrics, +running an Amazon \texttt{cc2.8xlarge} instance. \begin{figure} \centering -\includegraphics[width = 2.8in]{throughput_vs_cardinality} -\caption{When we vary the cardinality of a single dimension, we can see monotonically decreasing throughput.} -\label{fig:throughput_vs_cardinality} +\includegraphics[width = 2.8in]{ingestion_rate} +\caption{Combined cluster ingestion rates.} +\label{fig:ingestion_rate} \end{figure} -In Figure~\ref{fig:throughput_vs_num_dims}, we instead vary the number of -dimensions in our data set. Each dimension has a cardinality less than 100. We -can see a similar decline in ingestion throughput as the number of dimensions -increases. +The latency measurements we presented are sufficient to address the our stated +problems of interactivity. We would prefer the variability in the latencies to +be less. It is still very possible to possible to decrease latencies by adding +additional hardware, but we have not chosen to do so because infrastructure +cost is still a consideration to us. -\begin{figure} -\centering -\includegraphics[width = 2.8in]{throughput_vs_num_dims} -\caption{Increasing the number of dimensions of our data set also leads to a decline in throughput.} -\label{fig:throughput_vs_num_dims} -\end{figure} +\section{Druid in Production}\label{sec:production} +Over the last few years, we have gained tremendous knowledge about handling +production workloads with Druid and have made a couple of interesting observations. 
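The blocking behavior described for ingestion can be illustrated with a plain-JDK sketch; this is not Druid's ingestion code, only a demonstration that a producer blocks on a bounded buffer until the consumer drains events.
{\small\begin{verbatim}
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

public class BackpressureSketch
{
  public static void main(String[] args) throws InterruptedException
  {
    final BlockingQueue<String> buffer = new ArrayBlockingQueue<String>(2); // small buffer

    Thread consumer = new Thread(
        new Runnable()
        {
          @Override
          public void run()
          {
            try {
              while (true) {
                Thread.sleep(100);                            // deliberately slow consumer
                System.out.println("consumed " + buffer.take());
              }
            }
            catch (InterruptedException e) {
              // exit quietly
            }
          }
        }
    );
    consumer.setDaemon(true);
    consumer.start();

    for (int i = 0; i < 5; i++) {
      buffer.put("event-" + i);                               // blocks while the buffer is full
      System.out.println("produced event-" + i);
    }
  }
}
\end{verbatim}}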
-Finally, keeping our number of dimensions constant at 5, with four dimensions -having a cardinality in the 0-100 range and the final dimension having a -cardinality of 10,000, we can see a similar decline in throughput when we -increase the number of metrics/aggregators in the data set. We used random -types of metrics/aggregators in this experiment, and they vary from longs, -doubles, and other more complex types. The randomization introduces more noise -in the results, leading to a graph that is not strictly decreasing. These -results are shown in Figure~\ref{fig:throughput_vs_num_metrics}. For most real -world data sets, the number of metrics tends to be less than the number of -dimensions. Hence, we can see that introducing a few new metrics does not -impact the ingestion latency as severely as in the other graphs. +\paragraph{Query Patterns} +Druid is often used to explore data and generate reports on data. In the +explore use case, the number of queries issued by a single user is much higher +than in the reporting use case. Exploratory queries often involve progressively +adding filters for the same time range to narrow down results. Users tend to +explore short time intervals of recent data. In the generate report use case, +users query for much longer data intervals, but users also already have the +queries they want to issue in mind. -\begin{figure} -\centering -\includegraphics[width = 2.8in]{throughput_vs_num_metrics} -\caption{Adding new metrics to a data set decreases ingestion latency. In most -real world data sets, the number of metrics in a data set tends to be lower -than the number of dimensions.} -\label{fig:throughput_vs_num_metrics} -\end{figure} +\paragraph{Multitenancy} +Expensive concurrent queries can be problematic in a multitenant +environment. Queries for large datasources may end up hitting every historical +node in a cluster and consume all cluster resources. Smaller, cheaper queries +may be blocked from executing in such cases. We introduced query prioritization +to address these issues. Each historical node is able to prioritize which +segments it needs to scan. Proper query planning is critical for production +workloads. Thankfully, queries for a significant amount of data tend to be for +reporting use cases, and users are not expecting the same level of +interactivity as when they are querying to explore data. -\section{Druid in Production} -\label{sec:production} -Over the last few years of using Druid, we've gained tremendous -knowledge about handling production workloads, setting up correct operational -monitoring, integrating Druid with other products as part of a more -sophisticated data analytics stack, and distributing data to handle entire data -center outages. One of the most important lessons we've learned is that no -amount of testing can accurately simulate a production environment, and failures -will occur for every imaginable and unimaginable reason. Interestingly, most of -our most severe crashes were due to misunderstanding the impacts a -seemingly small feature would have on the overall system. +\paragraph{Node failures} +Single node failures are common in distributed environments, but many nodes +failing at once are not. If historical nodes completely fail and do not +recover, their segments need to reassigned, which means we need excess cluster +capacity to load this data. The amount of additional capacity to have at any +time contributes to the cost of running a cluster. 
From our experiences, it is +extremely rare to see more than 2 nodes completely fail at once and hence, we +leave enough capacity in our cluster to completely reassign the data from 2 +historical nodes. -Some of our more interesting observations include: -\begin{itemize} -\item Druid is most often used in production to power exploratory dashboards. -Interestingly, because many users of explatory dashboards are not from -technical backgrounds, they often issue queries without understanding the -impacts to the underlying system. For example, some users become impatient that -their queries for terabytes of data do not return in milliseconds and -continously refresh their dashboard view, generating heavy load to Druid. This -type of usage forced Druid to better defend itself against expensive repetitive -queries. - -\item Cluster query performance benefits from multitenancy. Hosting every -production datasource in the same cluster leads to better data parallelization -as additional nodes are added. - -\item Even if you provide users with the ability to arbitrarily explore data, they -often only have a few questions in mind. Caching is extremely important, and in -fact we see a very high percentage of our query results come from the broker cache. - -\item When using a memory mapped storage engine, even a small amount of paging -data from disk can severely impact query performance. SSDs can greatly solve -this problem. - -\item Leveraging approximate algorithms can greatly reduce data storage costs and -improve query performance. Many users do not care about exact answers to their -questions and are comfortable with a few percentage points of error. -\end{itemize} +\paragraph{Data Center Outages} +Complete cluster failures are possible, but extremely rare. If Druid is +deployed only in a single data center, it is possible for the entire data +center to fail. In such cases, new machines need to be provisioned. As long as +deep storage is still available, cluster recovery time is network bound as +historical nodes simply need to redownload every segment from deep storage. We +have experienced such failures in the past, and the recovery time was around +several hours in the AWS ecosystem on several TBs of data. \subsection{Operational Monitoring} Proper monitoring is critical to run a large scale distributed cluster. @@ -928,29 +989,27 @@ Each Druid node is designed to periodically emit a set of operational metrics. These metrics may include system level data such as CPU usage, available memory, and disk capacity, JVM statistics such as garbage collection time, and heap usage, or node specific metrics such as segment scan time, cache -hit rates, and data ingestion latencies. For each query, Druid nodes can also -emit metrics about the details of the query such as the number of filters -applied, or the interval of data requested. +hit rates, and data ingestion latencies. Druid also emits per query metrics. -Metrics can be emitted from a production Druid cluster into a dedicated metrics -Druid cluster. Queries can be made to the metrics Druid cluster to explore -production cluster performance and stability. Leveraging a dedicated metrics +We emit metrics from a production Druid cluster and load them into a dedicated +metrics Druid cluster. The metrics Druid cluster is used to explore the +performance and stability of the production cluster. 
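For a concrete picture of the metrics pipeline described above, here is a hypothetical sketch of a single emitted metric event, shaped as a timestamp plus dimensions plus a value so that the dedicated metrics Druid cluster can ingest it like any other data source. Every field name and value here is invented for illustration.
{\small\begin{verbatim}
import com.google.common.collect.ImmutableMap;
import java.util.Map;

public class MetricEventSketch
{
  public static void main(String[] args)
  {
    Map<String, Object> metricEvent = ImmutableMap.<String, Object>of(
        "timestamp", "2014-02-19T00:00:00Z",
        "service", "historical",           // dimension: which node type emitted it (hypothetical)
        "host", "historical-01",           // dimension: which machine (hypothetical)
        "metric", "segment/scan/time",     // dimension: what is measured (hypothetical name)
        "value", 42                        // the measurement itself, e.g. milliseconds
    );
    System.out.println(metricEvent);
  }
}
\end{verbatim}}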
This dedicated metrics cluster has allowed us to find numerous production problems, such as gradual query speed degregations, less than optimally tuned hardware, and various other system bottlenecks. We also use a metrics cluster to analyze what queries are -made in production. This analysis allows us to determine what our users are -most often doing and we use this information to drive our road map. +made in production and what users are most interested in. \subsection{Pairing Druid with a Stream Processor} At the time of writing, Druid can only understand fully denormalized data streams. In order to provide full business logic in production, Druid can be -paired with a stream processor such as Apache Storm \cite{marz2013storm}. A -Storm topology consumes events from a data stream, retains only those that are +paired with a stream processor such as Apache Storm \cite{marz2013storm}. + +A Storm topology consumes events from a data stream, retains only those that are “on-time”, and applies any relevant business logic. This could range from simple transformations, such as id to name lookups, up to complex operations such as multi-stream joins. The Storm topology forwards the processed event stream to Druid in real-time. Storm handles the streaming data processing work, -and Druid is used for responding to queries on top of both real-time and +and Druid is used for responding to queries for both real-time and historical data. \subsection{Multiple Data Center Distribution} diff --git a/publications/whitepaper/figures/90th_percentile.pdf b/publications/whitepaper/figures/90th_percentile.pdf new file mode 100644 index 00000000000..78d53d63571 Binary files /dev/null and b/publications/whitepaper/figures/90th_percentile.pdf differ diff --git a/publications/whitepaper/figures/95th_percentile.pdf b/publications/whitepaper/figures/95th_percentile.pdf new file mode 100644 index 00000000000..0ce91b4aced Binary files /dev/null and b/publications/whitepaper/figures/95th_percentile.pdf differ diff --git a/publications/whitepaper/figures/99th_percentile.pdf b/publications/whitepaper/figures/99th_percentile.pdf new file mode 100644 index 00000000000..8cddbd61377 Binary files /dev/null and b/publications/whitepaper/figures/99th_percentile.pdf differ diff --git a/publications/whitepaper/figures/avg_query_latency.pdf b/publications/whitepaper/figures/avg_query_latency.pdf new file mode 100644 index 00000000000..5ae784e97c5 Binary files /dev/null and b/publications/whitepaper/figures/avg_query_latency.pdf differ diff --git a/publications/whitepaper/figures/ingestion_rate.pdf b/publications/whitepaper/figures/ingestion_rate.pdf new file mode 100644 index 00000000000..fe14933c7c1 Binary files /dev/null and b/publications/whitepaper/figures/ingestion_rate.pdf differ diff --git a/publications/whitepaper/figures/queries_per_min.pdf b/publications/whitepaper/figures/queries_per_min.pdf new file mode 100644 index 00000000000..b1782ca1db9 Binary files /dev/null and b/publications/whitepaper/figures/queries_per_min.pdf differ diff --git a/publications/whitepaper/figures/query_percentiles.pdf b/publications/whitepaper/figures/query_percentiles.pdf new file mode 100644 index 00000000000..35c2e560afe Binary files /dev/null and b/publications/whitepaper/figures/query_percentiles.pdf differ diff --git a/publications/whitepaper/figures/tpch_100gb.pdf b/publications/whitepaper/figures/tpch_100gb.pdf new file mode 100644 index 00000000000..ea73efc056d Binary files /dev/null and b/publications/whitepaper/figures/tpch_100gb.pdf 
differ diff --git a/publications/whitepaper/figures/tpch_1gb.pdf b/publications/whitepaper/figures/tpch_1gb.pdf new file mode 100644 index 00000000000..df1c74bc369 Binary files /dev/null and b/publications/whitepaper/figures/tpch_1gb.pdf differ diff --git a/publications/whitepaper/figures/tpch_scaling.png b/publications/whitepaper/figures/tpch_scaling.png new file mode 100644 index 00000000000..e929da0c5dd Binary files /dev/null and b/publications/whitepaper/figures/tpch_scaling.png differ diff --git a/rabbitmq/pom.xml b/rabbitmq/pom.xml index d146f4b5444..a6320647205 100644 --- a/rabbitmq/pom.xml +++ b/rabbitmq/pom.xml @@ -9,7 +9,7 @@ io.druid druid - 0.6.63-SNAPSHOT + 0.6.66-SNAPSHOT diff --git a/s3-extensions/pom.xml b/s3-extensions/pom.xml index 5cde0904da6..93e8125ee7d 100644 --- a/s3-extensions/pom.xml +++ b/s3-extensions/pom.xml @@ -28,7 +28,7 @@ io.druid druid - 0.6.63-SNAPSHOT + 0.6.66-SNAPSHOT diff --git a/server/pom.xml b/server/pom.xml index 0892a6771f0..844fc87f01e 100644 --- a/server/pom.xml +++ b/server/pom.xml @@ -28,7 +28,7 @@ io.druid druid - 0.6.63-SNAPSHOT + 0.6.66-SNAPSHOT diff --git a/server/src/main/java/io/druid/client/BrokerServerView.java b/server/src/main/java/io/druid/client/BrokerServerView.java index d09a6d11ec6..be6a6553ae7 100644 --- a/server/src/main/java/io/druid/client/BrokerServerView.java +++ b/server/src/main/java/io/druid/client/BrokerServerView.java @@ -25,13 +25,16 @@ import com.google.common.collect.Ordering; import com.google.inject.Inject; import com.metamx.common.logger.Logger; import com.metamx.http.client.HttpClient; -import io.druid.client.selector.ServerSelector; import io.druid.client.selector.QueryableDruidServer; +import io.druid.client.selector.ServerSelector; import io.druid.client.selector.ServerSelectorStrategy; import io.druid.concurrent.Execs; import io.druid.guice.annotations.Client; +import io.druid.query.DataSource; +import io.druid.query.QueryDataSource; import io.druid.query.QueryRunner; import io.druid.query.QueryToolChestWarehouse; +import io.druid.query.TableDataSource; import io.druid.timeline.DataSegment; import io.druid.timeline.VersionedIntervalTimeline; import io.druid.timeline.partition.PartitionChunk; @@ -232,10 +235,21 @@ public class BrokerServerView implements TimelineServerView @Override - public VersionedIntervalTimeline getTimeline(String dataSource) + public VersionedIntervalTimeline getTimeline(DataSource dataSource) { + String table; + while (dataSource instanceof QueryDataSource) { + dataSource = ((QueryDataSource) dataSource).getQuery().getDataSource(); + } + + if (dataSource instanceof TableDataSource) { + table = ((TableDataSource) dataSource).getName(); + } else { + throw new UnsupportedOperationException("Unsupported data source type: " + dataSource.getClass().getSimpleName()); + } + synchronized (lock) { - return timelines.get(dataSource); + return timelines.get(table); } } diff --git a/server/src/main/java/io/druid/client/CachingClusteredClient.java b/server/src/main/java/io/druid/client/CachingClusteredClient.java index dacfc7938ef..a9d6538b200 100644 --- a/server/src/main/java/io/druid/client/CachingClusteredClient.java +++ b/server/src/main/java/io/druid/client/CachingClusteredClient.java @@ -40,8 +40,8 @@ import com.metamx.common.guava.Sequence; import com.metamx.common.guava.Sequences; import com.metamx.emitter.EmittingLogger; import io.druid.client.cache.Cache; -import io.druid.client.selector.ServerSelector; import io.druid.client.selector.QueryableDruidServer; +import 
io.druid.client.selector.ServerSelector; import io.druid.guice.annotations.Smile; import io.druid.query.BySegmentResultValueClass; import io.druid.query.CacheStrategy; @@ -124,11 +124,11 @@ public class CachingClusteredClient implements QueryRunner final boolean useCache = Boolean.parseBoolean(query.getContextValue("useCache", "true")) && strategy != null; final boolean populateCache = Boolean.parseBoolean(query.getContextValue("populateCache", "true")) - && strategy != null; + && strategy != null; final boolean isBySegment = Boolean.parseBoolean(query.getContextValue("bySegment", "false")); - ImmutableMap.Builder contextBuilder = new ImmutableMap.Builder(); + ImmutableMap.Builder contextBuilder = new ImmutableMap.Builder<>(); final String priority = query.getContextValue("priority", "0"); contextBuilder.put("priority", priority); @@ -140,6 +140,7 @@ public class CachingClusteredClient implements QueryRunner final Query rewrittenQuery = query.withOverriddenContext(contextBuilder.build()); + VersionedIntervalTimeline timeline = serverView.getTimeline(query.getDataSource()); if (timeline == null) { return Sequences.empty(); @@ -176,32 +177,37 @@ public class CachingClusteredClient implements QueryRunner queryCacheKey = null; } - // Pull cached segments from cache and remove from set of segments to query - if (useCache && queryCacheKey != null) { + if (queryCacheKey != null) { Map, Cache.NamedKey> cacheKeys = Maps.newHashMap(); - for (Pair e : segments) { - cacheKeys.put(e, computeSegmentCacheKey(e.lhs.getSegment().getIdentifier(), e.rhs, queryCacheKey)); + for (Pair segment : segments) { + final Cache.NamedKey segmentCacheKey = computeSegmentCacheKey( + segment.lhs.getSegment().getIdentifier(), + segment.rhs, + queryCacheKey + ); + cacheKeys.put(segment, segmentCacheKey); } - Map cachedValues = cache.getBulk(cacheKeys.values()); + // Pull cached segments from cache and remove from set of segments to query + final Map cachedValues; + if (useCache) { + cachedValues = cache.getBulk(cacheKeys.values()); + } else { + cachedValues = ImmutableMap.of(); + } for (Map.Entry, Cache.NamedKey> entry : cacheKeys.entrySet()) { Pair segment = entry.getKey(); Cache.NamedKey segmentCacheKey = entry.getValue(); - - final ServerSelector selector = segment.lhs; - final SegmentDescriptor descriptor = segment.rhs; - final Interval segmentQueryInterval = descriptor.getInterval(); + final Interval segmentQueryInterval = segment.rhs.getInterval(); final byte[] cachedValue = cachedValues.get(segmentCacheKey); - if (cachedValue != null) { - cachedResults.add(Pair.of(segmentQueryInterval.getStart(), cachedValue)); - // remove cached segment from set of segments to query segments.remove(segment); - } else { - final String segmentIdentifier = selector.getSegment().getIdentifier(); + cachedResults.add(Pair.of(segmentQueryInterval.getStart(), cachedValue)); + } else if (populateCache) { + final String segmentIdentifier = segment.lhs.getSegment().getIdentifier(); cachePopulatorMap.put( String.format("%s_%s", segmentIdentifier, segmentQueryInterval), new CachePopulator(cache, objectMapper, segmentCacheKey) @@ -229,7 +235,7 @@ public class CachingClusteredClient implements QueryRunner } } - return new LazySequence( + return new LazySequence<>( new Supplier>() { @Override @@ -265,7 +271,7 @@ public class CachingClusteredClient implements QueryRunner final TypeReference cacheObjectClazz = strategy.getCacheObjectClazz(); for (Pair cachedResultPair : cachedResults) { final byte[] cachedResult = cachedResultPair.rhs; - Sequence 
cachedSequence = new BaseSequence>( + Sequence cachedSequence = new BaseSequence<>( new BaseSequence.IteratorMaker>() { @Override @@ -280,8 +286,7 @@ public class CachingClusteredClient implements QueryRunner objectMapper.getFactory().createParser(cachedResult), cacheObjectClazz ); - } - catch (IOException e) { + } catch (IOException e) { throw Throwables.propagate(e); } } @@ -331,9 +336,12 @@ public class CachingClusteredClient implements QueryRunner String segmentIdentifier = value.getSegmentId(); final Iterable segmentResults = value.getResults(); - cachePopulatorMap.get( + CachePopulator cachePopulator = cachePopulatorMap.get( String.format("%s_%s", segmentIdentifier, value.getInterval()) - ).populate(Iterables.transform(segmentResults, prepareForCache)); + ); + if (cachePopulator != null) { + cachePopulator.populate(Iterables.transform(segmentResults, prepareForCache)); + } return Sequences.simple( Iterables.transform( @@ -416,8 +424,7 @@ public class CachingClusteredClient implements QueryRunner } cache.put(key, valueBytes); - } - catch (IOException e) { + } catch (IOException e) { throw Throwables.propagate(e); } } diff --git a/server/src/main/java/io/druid/client/TimelineServerView.java b/server/src/main/java/io/druid/client/TimelineServerView.java index 7082c599c75..0a6a43c8fdb 100644 --- a/server/src/main/java/io/druid/client/TimelineServerView.java +++ b/server/src/main/java/io/druid/client/TimelineServerView.java @@ -20,6 +20,7 @@ package io.druid.client; import io.druid.client.selector.ServerSelector; +import io.druid.query.DataSource; import io.druid.query.QueryRunner; import io.druid.timeline.VersionedIntervalTimeline; @@ -27,6 +28,6 @@ import io.druid.timeline.VersionedIntervalTimeline; */ public interface TimelineServerView extends ServerView { - VersionedIntervalTimeline getTimeline(String dataSource); + VersionedIntervalTimeline getTimeline(DataSource dataSource); QueryRunner getQueryRunner(DruidServer server); } diff --git a/server/src/main/java/io/druid/client/cache/BytesBoundedLinkedQueue.java b/server/src/main/java/io/druid/client/cache/BytesBoundedLinkedQueue.java index 84c5f83d6a2..2d892bc00d9 100644 --- a/server/src/main/java/io/druid/client/cache/BytesBoundedLinkedQueue.java +++ b/server/src/main/java/io/druid/client/cache/BytesBoundedLinkedQueue.java @@ -22,9 +22,11 @@ package io.druid.client.cache; import java.util.AbstractQueue; import java.util.Collection; import java.util.Iterator; -import java.util.LinkedList; +import java.util.Queue; import java.util.concurrent.BlockingQueue; +import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.Lock; @@ -36,17 +38,18 @@ import java.util.concurrent.locks.ReentrantLock; */ public abstract class BytesBoundedLinkedQueue extends AbstractQueue implements BlockingQueue { - private final LinkedList delegate; + private final Queue delegate; private final AtomicLong currentSize = new AtomicLong(0); private final Lock putLock = new ReentrantLock(); private final Condition notFull = putLock.newCondition(); private final Lock takeLock = new ReentrantLock(); private final Condition notEmpty = takeLock.newCondition(); + private final AtomicInteger elementCount = new AtomicInteger(0); private long capacity; public BytesBoundedLinkedQueue(long capacity) { - delegate = new LinkedList<>(); + delegate = new 
ConcurrentLinkedQueue<>(); this.capacity = capacity; } @@ -71,11 +74,13 @@ public abstract class BytesBoundedLinkedQueue extends AbstractQueue implem public void elementAdded(E e) { currentSize.addAndGet(getBytesSize(e)); + elementCount.getAndIncrement(); } public void elementRemoved(E e) { currentSize.addAndGet(-1 * getBytesSize(e)); + elementCount.getAndDecrement(); } private void fullyUnlock() @@ -115,7 +120,7 @@ public abstract class BytesBoundedLinkedQueue extends AbstractQueue implem @Override public int size() { - return delegate.size(); + return elementCount.get(); } @Override @@ -163,7 +168,7 @@ public abstract class BytesBoundedLinkedQueue extends AbstractQueue implem E e; takeLock.lockInterruptibly(); try { - while (delegate.size() == 0) { + while (elementCount.get() == 0) { notEmpty.await(); } e = delegate.remove(); @@ -181,8 +186,16 @@ public abstract class BytesBoundedLinkedQueue extends AbstractQueue implem @Override public int remainingCapacity() { - int delegateSize = delegate.size(); - long currentByteSize = currentSize.get(); + int delegateSize; + long currentByteSize; + fullyLock(); + try { + delegateSize = elementCount.get(); + currentByteSize = currentSize.get(); + } + finally { + fullyUnlock(); + } // return approximate remaining capacity based on current data if (delegateSize == 0) { return (int) Math.min(capacity, Integer.MAX_VALUE); @@ -214,13 +227,13 @@ public abstract class BytesBoundedLinkedQueue extends AbstractQueue implem int n = 0; takeLock.lock(); try { - n = Math.min(maxElements, delegate.size()); + // elementCount.get provides visibility to first n Nodes + n = Math.min(maxElements, elementCount.get()); if (n < 0) { return 0; } - // count.get provides visibility to first n Nodes for (int i = 0; i < n; i++) { - E e = delegate.remove(0); + E e = delegate.remove(); elementRemoved(e); c.add(e); } @@ -287,7 +300,7 @@ public abstract class BytesBoundedLinkedQueue extends AbstractQueue implem E e = null; takeLock.lockInterruptibly(); try { - while (delegate.size() == 0) { + while (elementCount.get() == 0) { if (nanos <= 0) { return null; } diff --git a/server/src/main/java/io/druid/client/cache/MemcachedCache.java b/server/src/main/java/io/druid/client/cache/MemcachedCache.java index abbfb54139c..8ffe995ee76 100644 --- a/server/src/main/java/io/druid/client/cache/MemcachedCache.java +++ b/server/src/main/java/io/druid/client/cache/MemcachedCache.java @@ -32,6 +32,8 @@ import net.spy.memcached.FailureMode; import net.spy.memcached.MemcachedClient; import net.spy.memcached.MemcachedClientIF; import net.spy.memcached.internal.BulkFuture; +import net.spy.memcached.ops.LinkedOperationQueueFactory; +import net.spy.memcached.ops.OperationQueueFactory; import org.apache.commons.codec.digest.DigestUtils; import javax.annotation.Nullable; @@ -56,7 +58,15 @@ public class MemcachedCache implements Cache // always use compression transcoder.setCompressionThreshold(0); - MemcachedOperationQueueFactory queueFactory = new MemcachedOperationQueueFactory(config.getMaxOperationQueueSize()); + + OperationQueueFactory opQueueFactory; + long maxQueueBytes = config.getMaxOperationQueueSize(); + if(maxQueueBytes > 0) { + opQueueFactory = new MemcachedOperationQueueFactory(maxQueueBytes); + } else { + opQueueFactory = new LinkedOperationQueueFactory(); + } + return new MemcachedCache( new MemcachedClient( new ConnectionFactoryBuilder().setProtocol(ConnectionFactoryBuilder.Protocol.BINARY) @@ -68,7 +78,8 @@ public class MemcachedCache implements Cache .setShouldOptimize(true) 
.setOpQueueMaxBlockTime(config.getTimeout()) .setOpTimeout(config.getTimeout()) - .setOpQueueFactory(queueFactory) + .setReadBufferSize(config.getReadBufferSize()) + .setOpQueueFactory(opQueueFactory) .build(), AddrUtil.getAddresses(config.getHosts()) ), diff --git a/server/src/main/java/io/druid/client/cache/MemcachedCacheConfig.java b/server/src/main/java/io/druid/client/cache/MemcachedCacheConfig.java index 2d8674cdd24..4a573e5d7d2 100644 --- a/server/src/main/java/io/druid/client/cache/MemcachedCacheConfig.java +++ b/server/src/main/java/io/druid/client/cache/MemcachedCacheConfig.java @@ -20,24 +20,38 @@ package io.druid.client.cache; import com.fasterxml.jackson.annotation.JsonProperty; +import net.spy.memcached.DefaultConnectionFactory; import javax.validation.constraints.NotNull; public class MemcachedCacheConfig { + // default to 30 day expiration for cache entries + // values greater than 30 days are interpreted by memcached as absolute POSIX timestamps instead of duration @JsonProperty - private int expiration = 2592000; // What is this number? + private int expiration = 30 * 24 * 3600; + @JsonProperty private int timeout = 500; + + // comma delimited list of memcached servers, given as host:port combination @JsonProperty @NotNull private String hosts; + @JsonProperty private int maxObjectSize = 50 * 1024 * 1024; + + // memcached client read buffer size, -1 uses the spymemcached library default + @JsonProperty + private int readBufferSize = DefaultConnectionFactory.DEFAULT_READ_BUFFER_SIZE; + @JsonProperty private String memcachedPrefix = "druid"; + + // maximum size in bytes of memcached client operation queue. 0 means unbounded @JsonProperty - private long maxOperationQueueSize = 256 * 1024 * 1024L; // 256 MB + private long maxOperationQueueSize = 0; public int getExpiration() { @@ -68,4 +82,9 @@ public class MemcachedCacheConfig { return maxOperationQueueSize; } + + public int getReadBufferSize() + { + return readBufferSize; + } } diff --git a/server/src/main/java/io/druid/guice/DataSegmentPusherPullerModule.java b/server/src/main/java/io/druid/guice/LocalDataStorageDruidModule.java similarity index 86% rename from server/src/main/java/io/druid/guice/DataSegmentPusherPullerModule.java rename to server/src/main/java/io/druid/guice/LocalDataStorageDruidModule.java index af45ea18f17..4511359a522 100644 --- a/server/src/main/java/io/druid/guice/DataSegmentPusherPullerModule.java +++ b/server/src/main/java/io/druid/guice/LocalDataStorageDruidModule.java @@ -23,15 +23,17 @@ import com.google.inject.Binder; import com.google.inject.Key; import com.google.inject.Module; import io.druid.segment.loading.DataSegmentPusher; +import io.druid.segment.loading.DataSegmentKiller; import io.druid.segment.loading.LocalDataSegmentPuller; import io.druid.segment.loading.LocalDataSegmentPusher; import io.druid.segment.loading.LocalDataSegmentPusherConfig; +import io.druid.segment.loading.LocalDataSegmentKiller; import io.druid.segment.loading.OmniSegmentLoader; import io.druid.segment.loading.SegmentLoader; /** */ -public class DataSegmentPusherPullerModule implements Module +public class LocalDataStorageDruidModule implements Module { @Override public void configure(Binder binder) @@ -52,6 +54,11 @@ public class DataSegmentPusherPullerModule implements Module .to(LocalDataSegmentPuller.class) .in(LazySingleton.class); + PolyBind.optionBinder(binder, Key.get(DataSegmentKiller.class)) + .addBinding("local") + .to(LocalDataSegmentKiller.class) + .in(LazySingleton.class); + 
PolyBind.optionBinder(binder, Key.get(DataSegmentPusher.class)) .addBinding("local") .to(LocalDataSegmentPusher.class) diff --git a/server/src/main/java/io/druid/initialization/Initialization.java b/server/src/main/java/io/druid/initialization/Initialization.java index 5e8e0461202..37dcc5821b0 100644 --- a/server/src/main/java/io/druid/initialization/Initialization.java +++ b/server/src/main/java/io/druid/initialization/Initialization.java @@ -36,7 +36,7 @@ import io.druid.curator.CuratorModule; import io.druid.curator.discovery.DiscoveryModule; import io.druid.guice.AWSModule; import io.druid.guice.AnnouncerModule; -import io.druid.guice.DataSegmentPusherPullerModule; +import io.druid.guice.LocalDataStorageDruidModule; import io.druid.guice.DbConnectorModule; import io.druid.guice.DruidGuiceExtensions; import io.druid.guice.DruidProcessingModule; @@ -316,7 +316,7 @@ public class Initialization new DbConnectorModule(), new JacksonConfigManagerModule(), new IndexingServiceDiscoveryModule(), - new DataSegmentPusherPullerModule(), + new LocalDataStorageDruidModule(), new FirehoseModule() ); diff --git a/server/src/main/java/io/druid/segment/loading/LocalDataSegmentKiller.java b/server/src/main/java/io/druid/segment/loading/LocalDataSegmentKiller.java new file mode 100644 index 00000000000..014805e4d0b --- /dev/null +++ b/server/src/main/java/io/druid/segment/loading/LocalDataSegmentKiller.java @@ -0,0 +1,68 @@ +package io.druid.segment.loading; + +import com.google.inject.Inject; +import com.metamx.common.MapUtils; +import com.metamx.common.logger.Logger; +import io.druid.segment.loading.DataSegmentKiller; +import io.druid.segment.loading.SegmentLoadingException; +import io.druid.timeline.DataSegment; + +import java.io.File; +import java.util.Map; + +/** + */ +public class LocalDataSegmentKiller implements DataSegmentKiller +{ + private static final Logger log = new Logger(LocalDataSegmentKiller.class); + + @Override + public void kill(DataSegment segment) throws SegmentLoadingException + { + final File path = getDirectory(segment); + log.info("segment[%s] maps to path[%s]", segment.getIdentifier(), path); + + if (!path.isDirectory()) { + if (!path.delete()) { + log.error("Unable to delete file[%s].", path); + throw new SegmentLoadingException("Couldn't kill segment[%s]", segment.getIdentifier()); + } + + return; + } + + final File[] files = path.listFiles(); + int success = 0; + + for (File file : files) { + if (!file.delete()) { + log.error("Unable to delete file[%s].", file); + } else { + ++success; + } + } + + if (success == 0 && files.length != 0) { + throw new SegmentLoadingException("Couldn't kill segment[%s]", segment.getIdentifier()); + } + + if (success < files.length) { + log.warn("Couldn't completely kill segment[%s]", segment.getIdentifier()); + } else if (!path.delete()) { + log.warn("Unable to delete directory[%s].", path); + log.warn("Couldn't completely kill segment[%s]", segment.getIdentifier()); + } + } + + private File getDirectory(DataSegment segment) throws SegmentLoadingException + { + final Map loadSpec = segment.getLoadSpec(); + final File path = new File(MapUtils.getString(loadSpec, "path")); + + if (!path.exists()) { + throw new SegmentLoadingException("Asked to load path[%s], but it doesn't exist.", path); + } + + return path.getParentFile(); + } +} diff --git a/server/src/main/java/io/druid/segment/realtime/RealtimeManager.java b/server/src/main/java/io/druid/segment/realtime/RealtimeManager.java index bb2b666db26..e54ff4bcb9c 100644 --- 
a/server/src/main/java/io/druid/segment/realtime/RealtimeManager.java +++ b/server/src/main/java/io/druid/segment/realtime/RealtimeManager.java @@ -30,6 +30,7 @@ import com.metamx.common.lifecycle.LifecycleStop; import com.metamx.emitter.EmittingLogger; import io.druid.data.input.Firehose; import io.druid.data.input.InputRow; +import io.druid.query.DataSource; import io.druid.query.FinalizeResultsQueryRunner; import io.druid.query.NoopQueryRunner; import io.druid.query.Query; @@ -39,6 +40,7 @@ import io.druid.query.QueryRunnerFactoryConglomerate; import io.druid.query.QuerySegmentWalker; import io.druid.query.QueryToolChest; import io.druid.query.SegmentDescriptor; +import io.druid.query.TableDataSource; import io.druid.segment.realtime.plumber.Plumber; import io.druid.segment.realtime.plumber.Sink; import org.joda.time.DateTime; @@ -96,6 +98,7 @@ public class RealtimeManager implements QuerySegmentWalker Closeables.closeQuietly(chief); } } + public FireDepartmentMetrics getMetrics(String datasource) { FireChief chief = chiefs.get(datasource); @@ -108,7 +111,7 @@ public class RealtimeManager implements QuerySegmentWalker @Override public QueryRunner getQueryRunnerForIntervals(Query query, Iterable intervals) { - final FireChief chief = chiefs.get(query.getDataSource()); + final FireChief chief = chiefs.get(getDataSourceName(query)); return chief == null ? new NoopQueryRunner() : chief.getQueryRunner(query); } @@ -116,11 +119,28 @@ public class RealtimeManager implements QuerySegmentWalker @Override public QueryRunner getQueryRunnerForSegments(Query query, Iterable specs) { - final FireChief chief = chiefs.get(query.getDataSource()); + final FireChief chief = chiefs.get(getDataSourceName(query)); return chief == null ? new NoopQueryRunner() : chief.getQueryRunner(query); } + private String getDataSourceName(Query query) + { + DataSource dataSource = query.getDataSource(); + if (!(dataSource instanceof TableDataSource)) { + throw new UnsupportedOperationException("data source type '" + dataSource.getClass().getName() + "' unsupported"); + } + + String dataSourceName; + try { + dataSourceName = ((TableDataSource) query.getDataSource()).getName(); + } catch (ClassCastException e) { + throw new UnsupportedOperationException("Subqueries are only supported in the broker"); + } + return dataSourceName; + } + + private class FireChief extends Thread implements Closeable { private final FireDepartment fireDepartment; @@ -152,8 +172,7 @@ public class RealtimeManager implements QuerySegmentWalker log.info("Someone get us a plumber!"); plumber = fireDepartment.findPlumber(); log.info("We have our plumber!"); - } - catch (IOException e) { + } catch (IOException e) { throw Throwables.propagate(e); } } @@ -180,8 +199,7 @@ public class RealtimeManager implements QuerySegmentWalker try { try { inputRow = firehose.nextRow(); - } - catch (Exception e) { + } catch (Exception e) { log.debug(e, "thrown away line due to exception, considering unparseable"); metrics.incrementUnparseable(); continue; @@ -206,8 +224,7 @@ public class RealtimeManager implements QuerySegmentWalker nextFlush = new DateTime().plus(intermediatePersistPeriod).getMillis(); } metrics.incrementProcessed(); - } - catch (FormattedException e) { + } catch (FormattedException e) { log.info(e, "unparseable line: %s", e.getDetails()); metrics.incrementUnparseable(); continue; @@ -215,16 +232,15 @@ public class RealtimeManager implements QuerySegmentWalker } } catch (RuntimeException e) { log.makeAlert(e, "RuntimeException aborted realtime 
processing[%s]", fireDepartment.getSchema().getDataSource()) - .emit(); + .emit(); normalExit = false; throw e; } catch (Error e) { log.makeAlert(e, "Exception aborted realtime processing[%s]", fireDepartment.getSchema().getDataSource()) - .emit(); + .emit(); normalExit = false; throw e; - } - finally { + } finally { Closeables.closeQuietly(firehose); if (normalExit) { plumber.finishJob(); diff --git a/server/src/main/java/io/druid/segment/realtime/plumber/RealtimePlumberSchool.java b/server/src/main/java/io/druid/segment/realtime/plumber/RealtimePlumberSchool.java index f7d6398a194..4a8332137d4 100644 --- a/server/src/main/java/io/druid/segment/realtime/plumber/RealtimePlumberSchool.java +++ b/server/src/main/java/io/druid/segment/realtime/plumber/RealtimePlumberSchool.java @@ -44,7 +44,7 @@ import java.util.concurrent.ExecutorService; */ public class RealtimePlumberSchool implements PlumberSchool { - public static final int DEFAULT_MAX_PENDING_PERSISTS = 2; + public static final int DEFAULT_MAX_PENDING_PERSISTS = 0; private static final EmittingLogger log = new EmittingLogger(RealtimePlumberSchool.class); diff --git a/server/src/main/java/io/druid/server/QueryResource.java b/server/src/main/java/io/druid/server/QueryResource.java index 97bca835ddb..051a56e4465 100644 --- a/server/src/main/java/io/druid/server/QueryResource.java +++ b/server/src/main/java/io/druid/server/QueryResource.java @@ -131,7 +131,7 @@ public class QueryResource emitter.emit( new ServiceMetricEvent.Builder() - .setUser2(query.getDataSource()) + .setUser2(query.getDataSource().toString()) .setUser4(query.getType()) .setUser5(query.getIntervals().get(0).toString()) .setUser6(String.valueOf(query.hasFilters())) diff --git a/server/src/main/java/io/druid/server/coordination/ServerManager.java b/server/src/main/java/io/druid/server/coordination/ServerManager.java index 950be651e86..3ee0c108f8a 100644 --- a/server/src/main/java/io/druid/server/coordination/ServerManager.java +++ b/server/src/main/java/io/druid/server/coordination/ServerManager.java @@ -31,6 +31,7 @@ import com.metamx.emitter.service.ServiceMetricEvent; import io.druid.collections.CountingMap; import io.druid.guice.annotations.Processing; import io.druid.query.BySegmentQueryRunner; +import io.druid.query.DataSource; import io.druid.query.FinalizeResultsQueryRunner; import io.druid.query.MetricsEmittingQueryRunner; import io.druid.query.NoopQueryRunner; @@ -42,6 +43,7 @@ import io.druid.query.QuerySegmentWalker; import io.druid.query.QueryToolChest; import io.druid.query.ReferenceCountingSegmentQueryRunner; import io.druid.query.SegmentDescriptor; +import io.druid.query.TableDataSource; import io.druid.query.spec.QuerySegmentSpec; import io.druid.query.spec.SpecificSegmentQueryRunner; import io.druid.query.spec.SpecificSegmentSpec; @@ -118,6 +120,7 @@ public class ServerManager implements QuerySegmentWalker /** * Load a single segment. 
+ * * @param segment segment to load * @return true if the segment was newly loaded, false if it was already loaded * @throws SegmentLoadingException if the segment cannot be loaded @@ -127,12 +130,10 @@ public class ServerManager implements QuerySegmentWalker final Segment adapter; try { adapter = segmentLoader.getSegment(segment); - } - catch (SegmentLoadingException e) { + } catch (SegmentLoadingException e) { try { segmentLoader.cleanup(segment); - } - catch (SegmentLoadingException e1) { + } catch (SegmentLoadingException e1) { // ignore } throw e; @@ -204,12 +205,11 @@ public class ServerManager implements QuerySegmentWalker try { log.info("Attempting to close segment %s", segment.getIdentifier()); oldQueryable.close(); - } - catch (IOException e) { + } catch (IOException e) { log.makeAlert(e, "Exception closing segment") - .addData("dataSource", dataSource) - .addData("segmentId", segment.getIdentifier()) - .emit(); + .addData("dataSource", dataSource) + .addData("segmentId", segment.getIdentifier()) + .emit(); } } else { log.info( @@ -233,7 +233,19 @@ public class ServerManager implements QuerySegmentWalker final QueryToolChest> toolChest = factory.getToolchest(); - final VersionedIntervalTimeline timeline = dataSources.get(query.getDataSource()); + DataSource dataSource = query.getDataSource(); + if (!(dataSource instanceof TableDataSource)) { + throw new UnsupportedOperationException("data source type '" + dataSource.getClass().getName() + "' unsupported"); + } + + String dataSourceName; + try { + dataSourceName = ((TableDataSource) query.getDataSource()).getName(); + } catch (ClassCastException e) { + throw new UnsupportedOperationException("Subqueries are only supported in the broker"); + } + + final VersionedIntervalTimeline timeline = dataSources.get(dataSourceName); if (timeline == null) { return new NoopQueryRunner(); @@ -294,6 +306,7 @@ public class ServerManager implements QuerySegmentWalker Predicates.>notNull() ); + return new FinalizeResultsQueryRunner(toolChest.mergeResults(factory.mergeRunners(exec, adapters)), toolChest); } @@ -303,14 +316,21 @@ public class ServerManager implements QuerySegmentWalker final QueryRunnerFactory> factory = conglomerate.findFactory(query); if (factory == null) { log.makeAlert("Unknown query type, [%s]", query.getClass()) - .addData("dataSource", query.getDataSource()) - .emit(); + .addData("dataSource", query.getDataSource()) + .emit(); return new NoopQueryRunner(); } final QueryToolChest> toolChest = factory.getToolchest(); - final VersionedIntervalTimeline timeline = dataSources.get(query.getDataSource()); + String dataSourceName; + try { + dataSourceName = ((TableDataSource) query.getDataSource()).getName(); + } catch (ClassCastException e) { + throw new UnsupportedOperationException("Subqueries are only supported in the broker"); + } + + final VersionedIntervalTimeline timeline = dataSources.get(dataSourceName); if (timeline == null) { return new NoopQueryRunner(); diff --git a/server/src/main/java/io/druid/server/coordinator/DruidCoordinator.java b/server/src/main/java/io/druid/server/coordinator/DruidCoordinator.java index f5f202b5904..4b241060b0a 100644 --- a/server/src/main/java/io/druid/server/coordinator/DruidCoordinator.java +++ b/server/src/main/java/io/druid/server/coordinator/DruidCoordinator.java @@ -81,7 +81,6 @@ import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.atomic.AtomicLong; import 
java.util.concurrent.atomic.AtomicReference; /** @@ -95,10 +94,6 @@ public class DruidCoordinator private final Object lock = new Object(); - private volatile boolean started = false; - private volatile int leaderCounter = 0; - private volatile boolean leader = false; - private final DruidCoordinatorConfig config; private final ZkPathsConfig zkPaths; private final JacksonConfigManager configManager; @@ -115,6 +110,12 @@ public class DruidCoordinator private final ServiceAnnouncer serviceAnnouncer; private final DruidNode self; + private volatile boolean started = false; + private volatile int leaderCounter = 0; + private volatile boolean leader = false; + private volatile SegmentReplicantLookup segmentReplicantLookup = null; + + @Inject public DruidCoordinator( DruidCoordinatorConfig config, @@ -197,39 +198,55 @@ public class DruidCoordinator return loadManagementPeons; } - public Map getReplicationStatus() + public Map> getReplicationStatus() { - // find expected load per datasource - final CountingMap expectedSegmentsInCluster = new CountingMap<>(); + final Map> retVal = Maps.newHashMap(); + + if (segmentReplicantLookup == null) { + return retVal; + } + final DateTime now = new DateTime(); for (DataSegment segment : getAvailableDataSegments()) { List rules = databaseRuleManager.getRulesWithDefault(segment.getDataSource()); for (Rule rule : rules) { if (rule instanceof LoadRule && rule.appliesTo(segment, now)) { - for (Integer numReplicants : ((LoadRule) rule).getTieredReplicants().values()) { - expectedSegmentsInCluster.add(segment.getDataSource(), numReplicants); + for (Map.Entry entry : ((LoadRule) rule).getTieredReplicants().entrySet()) { + CountingMap dataSourceMap = retVal.get(entry.getKey()); + if (dataSourceMap == null) { + dataSourceMap = new CountingMap<>(); + retVal.put(entry.getKey(), dataSourceMap); + } + + int diff = Math.max( + entry.getValue() - segmentReplicantLookup.getTotalReplicants(segment.getIdentifier(), entry.getKey()), + 0 + ); + dataSourceMap.add(segment.getDataSource(), diff); } break; } } } - // find segments currently loaded per datasource - CountingMap segmentsInCluster = new CountingMap<>(); - for (DruidServer druidServer : serverInventoryView.getInventory()) { - for (DataSegment segment : druidServer.getSegments().values()) { - segmentsInCluster.add(segment.getDataSource(), 1); - } + return retVal; + } + + + public CountingMap getSegmentAvailability() + { + final CountingMap retVal = new CountingMap<>(); + + if (segmentReplicantLookup == null) { + return retVal; } - // compare available segments with currently loaded - Map loadStatus = Maps.newHashMap(); - for (Map.Entry entry : expectedSegmentsInCluster.entrySet()) { - Long actual = segmentsInCluster.get(entry.getKey()).get(); - loadStatus.put(entry.getKey(), 100 * (actual == null ? 0.0D : (double) actual) / entry.getValue().get()); + for (DataSegment segment : getAvailableDataSegments()) { + int available = (segmentReplicantLookup.getTotalReplicants(segment.getIdentifier()) == 0) ? 0 : 1; + retVal.add(segment.getDataSource(), 1 - available); } - return loadStatus; + return retVal; } public Map getLoadStatus() @@ -808,7 +825,7 @@ public class DruidCoordinator cluster.add(new ServerHolder(server, loadManagementPeons.get(server.getName()))); } - SegmentReplicantLookup segmentReplicantLookup = SegmentReplicantLookup.make(cluster); + segmentReplicantLookup = SegmentReplicantLookup.make(cluster); // Stop peons for servers that aren't there anymore. 
final Set disappeared = Sets.newHashSet(loadManagementPeons.keySet()); diff --git a/server/src/main/java/io/druid/server/http/CoordinatorResource.java b/server/src/main/java/io/druid/server/http/CoordinatorResource.java index aea61681183..218cd81904b 100644 --- a/server/src/main/java/io/druid/server/http/CoordinatorResource.java +++ b/server/src/main/java/io/druid/server/http/CoordinatorResource.java @@ -20,6 +20,7 @@ package io.druid.server.http; import com.google.common.base.Function; +import com.google.common.collect.Collections2; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; import com.google.inject.Inject; @@ -60,9 +61,14 @@ public class CoordinatorResource @Path("/loadstatus") @Produces("application/json") public Response getLoadStatus( + @QueryParam("simple") String simple, @QueryParam("full") String full ) { + if (simple != null) { + return Response.ok(coordinator.getSegmentAvailability()).build(); + } + if (full != null) { return Response.ok(coordinator.getReplicationStatus()).build(); } @@ -73,7 +79,8 @@ public class CoordinatorResource @Path("/loadqueue") @Produces("application/json") public Response getLoadQueue( - @QueryParam("simple") String simple + @QueryParam("simple") String simple, + @QueryParam("full") String full ) { if (simple != null) { @@ -106,6 +113,51 @@ public class CoordinatorResource ) ).build(); } - return Response.ok(coordinator.getLoadManagementPeons()).build(); + + if (full != null) { + return Response.ok(coordinator.getLoadManagementPeons()).build(); + } + + return Response.ok( + Maps.transformValues( + coordinator.getLoadManagementPeons(), + new Function() + { + @Override + public Object apply(LoadQueuePeon input) + { + return new ImmutableMap.Builder<>() + .put( + "segmentsToLoad", + Collections2.transform( + input.getSegmentsToLoad(), + new Function() + { + @Override + public String apply(DataSegment segment) + { + return segment.getIdentifier(); + } + } + ) + ) + .put( + "segmentsToDrop", Collections2.transform( + input.getSegmentsToDrop(), + new Function() + { + @Override + public String apply(DataSegment segment) + { + return segment.getIdentifier(); + } + } + ) + ) + .build(); + } + } + ) + ).build(); } } \ No newline at end of file diff --git a/server/src/main/java/io/druid/server/http/DBResource.java b/server/src/main/java/io/druid/server/http/DBResource.java index f979b76961b..02277f9e79d 100644 --- a/server/src/main/java/io/druid/server/http/DBResource.java +++ b/server/src/main/java/io/druid/server/http/DBResource.java @@ -52,7 +52,6 @@ public class DBResource this.databaseSegmentManager = databaseSegmentManager; } - @GET @Path("/datasources") @Produces("application/json") diff --git a/server/src/main/java/io/druid/server/http/DatasourcesResource.java b/server/src/main/java/io/druid/server/http/DatasourcesResource.java index c77e6fee9bb..3a1a316e658 100644 --- a/server/src/main/java/io/druid/server/http/DatasourcesResource.java +++ b/server/src/main/java/io/druid/server/http/DatasourcesResource.java @@ -31,7 +31,6 @@ import io.druid.client.DruidServer; import io.druid.client.InventoryView; import io.druid.client.indexing.IndexingServiceClient; import io.druid.db.DatabaseSegmentManager; -import io.druid.segment.IndexGranularity; import io.druid.timeline.DataSegment; import org.joda.time.Interval; @@ -125,6 +124,35 @@ public class DatasourcesResource ).build(); } + @GET + @Path("/{dataSourceName}") + @Consumes("application/json") + public Response getTheDataSource( + @PathParam("dataSourceName") final
String dataSourceName + ) + { + DruidDataSource dataSource = getDataSource(dataSourceName.toLowerCase()); + if (dataSource == null) { + return Response.status(Response.Status.NOT_FOUND).build(); + } + + return Response.ok(dataSource).build(); + } + + @POST + @Path("/{dataSourceName}") + @Consumes("application/json") + public Response enableDataSource( + @PathParam("dataSourceName") final String dataSourceName + ) + { + if (!databaseSegmentManager.enableDatasource(dataSourceName)) { + return Response.status(Response.Status.NOT_FOUND).build(); + } + + return Response.status(Response.Status.OK).build(); + } + @DELETE @Path("/{dataSourceName}") @Produces("application/json") @@ -160,20 +188,6 @@ public class DatasourcesResource return Response.status(Response.Status.OK).build(); } - @POST - @Path("/{dataSourceName}") - @Consumes("application/json") - public Response enableDataSource( - @PathParam("dataSourceName") final String dataSourceName - ) - { - if (!databaseSegmentManager.enableDatasource(dataSourceName)) { - return Response.status(Response.Status.NOT_FOUND).build(); - } - - return Response.status(Response.Status.OK).build(); - } - @GET @Path("/{dataSourceName}/segments") @Produces("application/json") diff --git a/server/src/main/java/io/druid/server/http/TiersResource.java b/server/src/main/java/io/druid/server/http/TiersResource.java index 2ec84d79e32..81698e4ff3d 100644 --- a/server/src/main/java/io/druid/server/http/TiersResource.java +++ b/server/src/main/java/io/druid/server/http/TiersResource.java @@ -25,6 +25,7 @@ import com.google.common.collect.ImmutableMap; import com.google.common.collect.Sets; import com.google.common.collect.Table; import com.google.inject.Inject; +import com.metamx.common.MapUtils; import io.druid.client.DruidServer; import io.druid.client.InventoryView; @@ -70,10 +71,10 @@ public class TiersResource } Long currSize = tierMetadata.get("currSize"); - tierMetadata.put("currSize", (currSize == null) ? 0 : currSize + druidServer.getCurrSize()); + tierMetadata.put("currSize", ((currSize == null) ? 0 : currSize) + druidServer.getCurrSize()); Long maxSize = tierMetadata.get("maxSize"); - tierMetadata.put("maxSize", (maxSize == null) ? 0 : maxSize + druidServer.getMaxSize()); + tierMetadata.put("maxSize", ((maxSize == null) ? 0 : maxSize) + druidServer.getMaxSize()); } return builder.entity(metadata).build(); } diff --git a/server/src/main/resources/static/js/rules-0.0.2.js b/server/src/main/resources/static/js/rules-0.0.2.js index 78ff948484a..e7268620a43 100644 --- a/server/src/main/resources/static/js/rules-0.0.2.js +++ b/server/src/main/resources/static/js/rules-0.0.2.js @@ -140,6 +140,10 @@ function makeJSON() { function makeTiersDropdown(selTier) { var retVal = "tier