diff --git a/cassandra-storage/pom.xml b/cassandra-storage/pom.xml
index e07536e0705..5682138b49d 100644
--- a/cassandra-storage/pom.xml
+++ b/cassandra-storage/pom.xml
@@ -28,7 +28,7 @@
io.druiddruid
- 0.6.157-SNAPSHOT
+ 0.6.160-SNAPSHOT
diff --git a/common/pom.xml b/common/pom.xml
index 687d67afe92..da437e99e1f 100644
--- a/common/pom.xml
+++ b/common/pom.xml
@@ -28,7 +28,7 @@
io.druiddruid
- 0.6.157-SNAPSHOT
+ 0.6.160-SNAPSHOT
diff --git a/docs/content/Aggregations.md b/docs/content/Aggregations.md
index 29740a2858c..abd4780b025 100644
--- a/docs/content/Aggregations.md
+++ b/docs/content/Aggregations.md
@@ -159,4 +159,4 @@ Uses [HyperLogLog](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf) to
```json
{ "type" : "hyperUnique", "name" : , "fieldName" : }
-```
\ No newline at end of file
+```
diff --git a/docs/content/Batch-ingestion.md b/docs/content/Batch-ingestion.md
index 2f53eb48b4e..0326d1e1a66 100644
--- a/docs/content/Batch-ingestion.md
+++ b/docs/content/Batch-ingestion.md
@@ -162,37 +162,58 @@ The indexing process has the ability to roll data up as it processes the incomin
### Partitioning specification
-Segments are always partitioned based on timestamp (according to the granularitySpec) and may be further partitioned in some other way depending on partition type.
-Druid supports two types of partitions spec - singleDimension and hashed.
+Segments are always partitioned based on timestamp (according to the granularitySpec) and may be further partitioned in
+some other way depending on partition type. Druid supports two types of partitioning strategies: "hashed" (based on the
+hash of all dimensions in each row), and "dimension" (based on ranges of a single dimension).
-In SingleDimension partition type data is partitioned based on the values in that dimension.
-For example, data for a day may be split by the dimension "last\_name" into two segments: one with all values from A-M and one with all values from N-Z.
+Hashed partitioning is recommended in most cases, as it will improve indexing performance and create more uniformly
+sized data segments relative to single-dimension partitioning.
-In hashed partition type, the number of partitions is determined based on the targetPartitionSize and cardinality of input set and the data is partitioned based on the hashcode of the row.
-
-It is recommended to use Hashed partition as it is more efficient than singleDimension since it does not need to determine the dimension for creating partitions.
-Hashing also gives better distribution of data resulting in equal sized partitions and improving query performance
-
-To use this druid to automatically determine optimal partitions indexer must be given a target partition size. It can then find a good set of partition ranges on its own.
-
-#### Configuration for disabling auto-sharding and creating Fixed number of partitions
- Druid can be configured to NOT run determine partitions and create a fixed number of shards by specifying numShards in hashed partitionsSpec.
- e.g This configuration will skip determining optimal partitions and always create 4 shards for every segment granular interval
+#### Hash-based partitioning
```json
"partitionsSpec": {
- "type": "hashed"
- "numShards": 4
+ "type": "hashed",
+ "targetPartitionSize": 5000000
}
```
+Hashed partitioning works by first selecting a number of segments, and then partitioning rows across those segments
+according to the hash of all dimensions in each row. The number of segments is determined automatically based on the
+cardinality of the input set and a target partition size.
+
+The configuration options are:
+
|property|description|required?|
|--------|-----------|---------|
-|type|type of partitionSpec to be used |no, default : singleDimension|
-|targetPartitionSize|target number of rows to include in a partition, should be a number that targets segments of 700MB\~1GB.|yes|
+|type|type of partitionSpec to be used |"hashed"|
+|targetPartitionSize|target number of rows to include in a partition, should be a number that targets segments of 500MB\~1GB.|either this or numShards|
+|numShards|specify the number of partitions directly, instead of a target partition size. Ingestion will run faster, since it can skip the step necessary to select a number of partitions automatically.|either this or targetPartitionSize|
+
+#### Single-dimension partitioning
+
+```json
+ "partitionsSpec": {
+ "type": "dimension",
+ "targetPartitionSize": 5000000
+ }
+```
+
+Single-dimension partitioning works by first selecting a dimension to partition on, and then separating that dimension
+into contiguous ranges. Each segment will contain all rows with values of that dimension in that range. For example,
+your segments may be partitioned on the dimension "host" using the ranges "a.example.com" to "f.example.com" and
+"f.example.com" to "z.example.com". By default, the dimension to use is determined automatically, although you can
+override it with a specific dimension.
+
+The configuration options are:
+
+|property|description|required?|
+|--------|-----------|---------|
+|type|type of partitionSpec to be used |"dimension"|
+|targetPartitionSize|target number of rows to include in a partition, should be a number that targets segments of 500MB\~1GB.|yes|
+|maxPartitionSize|maximum number of rows to include in a partition. Defaults to 50% larger than the targetPartitionSize.|no|
|partitionDimension|the dimension to partition on. Leave blank to select a dimension automatically.|no|
-|assumeGrouped|assume input data has already been grouped on time and dimensions. This is faster, but can choose suboptimal partitions if the assumption is violated.|no|
-|numShards|provides a way to manually override druid-auto sharding and specify the number of shards to create for each segment granular interval.It is only supported by hashed partitionSpec and targetPartitionSize must be set to -1|no|
+|assumeGrouped|assume input data has already been grouped on time and dimensions. Ingestion will run faster, but can choose suboptimal partitions if the assumption is violated.|no|
### Updater job spec
diff --git a/docs/content/Coordinator.md b/docs/content/Coordinator.md
index 9021cbe1dff..42d1d17041d 100644
--- a/docs/content/Coordinator.md
+++ b/docs/content/Coordinator.md
@@ -20,9 +20,7 @@ io.druid.cli.Main server coordinator
Rules
-----
-Segments are loaded and dropped from the cluster based on a set of rules. Rules indicate how segments should be assigned to different historical node tiers and how many replicants of a segment should exist in each tier. Rules may also indicate when segments should be dropped entirely from the cluster. The coordinator loads a set of rules from the database. Rules may be specific to a certain datasource and/or a default set of rules can be configured. Rules are read in order and hence the ordering of rules is important. The coordinator will cycle through all available segments and match each segment with the first rule that applies. Each segment may only match a single rule.
-
-For more information on rules, see [Rule Configuration](Rule-Configuration.html).
+Segments can be automatically loaded and dropped from the cluster based on a set of rules. For more information on rules, see [Rule Configuration](Rule-Configuration.html).
Cleaning Up Segments
--------------------
diff --git a/docs/content/Druid-vs-Cassandra.md b/docs/content/Druid-vs-Cassandra.md
index 4ef29a1ab36..f8d1d59a473 100644
--- a/docs/content/Druid-vs-Cassandra.md
+++ b/docs/content/Druid-vs-Cassandra.md
@@ -1,6 +1,11 @@
---
layout: doc_page
---
+
+Druid vs Cassandra
+==================
+
+
We are not experts on Cassandra, if anything is incorrect about our portrayal, please let us know on the mailing list or via some other means. We will fix this page.
Druid is highly optimized for scans and aggregations, it supports arbitrarily deep drill downs into data sets without the need to pre-compute, and it can ingest event streams in real-time and allow users to query events as they come in. Cassandra is a great key-value store and it has some features that allow you to use it to do more interesting things than what you can do with a pure key-value store. But, it is not built for the same use cases that Druid handles, namely regularly scanning over billions of entries per query.
diff --git a/docs/content/Druid-vs-Hadoop.md b/docs/content/Druid-vs-Hadoop.md
index 3b4ef1df76c..47d03dd7704 100644
--- a/docs/content/Druid-vs-Hadoop.md
+++ b/docs/content/Druid-vs-Hadoop.md
@@ -2,6 +2,10 @@
layout: doc_page
---
+Druid vs Hadoop
+===============
+
+
Hadoop has shown the world that it’s possible to house your data warehouse on commodity hardware for a fraction of the price of typical solutions. As people adopt Hadoop for their data warehousing needs, they find two things.
1. They can now query all of their data in a fairly flexible manner and answer any question they have
diff --git a/docs/content/Druid-vs-Impala-or-Shark.md b/docs/content/Druid-vs-Impala-or-Shark.md
index cb658c8e087..ab2cb122524 100644
--- a/docs/content/Druid-vs-Impala-or-Shark.md
+++ b/docs/content/Druid-vs-Impala-or-Shark.md
@@ -1,6 +1,10 @@
---
layout: doc_page
---
+
+Druid vs Impala or Shark
+========================
+
The question of Druid versus Impala or Shark basically comes down to your product requirements and what the systems were designed to do.
Druid was designed to
diff --git a/docs/content/Druid-vs-Redshift.md b/docs/content/Druid-vs-Redshift.md
index 4fe06586467..faaaa3f0f33 100644
--- a/docs/content/Druid-vs-Redshift.md
+++ b/docs/content/Druid-vs-Redshift.md
@@ -1,6 +1,10 @@
---
layout: doc_page
---
+Druid vs Redshift
+=================
+
+
###How does Druid compare to Redshift?
In terms of drawing a differentiation, Redshift is essentially ParAccel (Actian) which Amazon is licensing.
diff --git a/docs/content/Druid-vs-Vertica.md b/docs/content/Druid-vs-Vertica.md
index e6971ae03d9..4fe0eb78892 100644
--- a/docs/content/Druid-vs-Vertica.md
+++ b/docs/content/Druid-vs-Vertica.md
@@ -1,6 +1,11 @@
---
layout: doc_page
---
+
+Druid vs Vertica
+================
+
+
How does Druid compare to Vertica?
Vertica is similar to ParAccel/Redshift ([Druid-vs-Redshift](Druid-vs-Redshift.html)) described above in that it wasn’t built for real-time streaming data ingestion and it supports full SQL.
diff --git a/docs/content/Examples.md b/docs/content/Examples.md
index 468bdb8f41a..dfd44ffe927 100644
--- a/docs/content/Examples.md
+++ b/docs/content/Examples.md
@@ -19,13 +19,13 @@ Clone Druid and build it:
git clone https://github.com/metamx/druid.git druid
cd druid
git fetch --tags
-git checkout druid-0.6.156
+git checkout druid-0.6.159
./build.sh
```
### Downloading the DSK (Druid Standalone Kit)
-[Download](http://static.druid.io/artifacts/releases/druid-services-0.6.156-bin.tar.gz) a stand-alone tarball and run it:
+[Download](http://static.druid.io/artifacts/releases/druid-services-0.6.159-bin.tar.gz) a stand-alone tarball and run it:
``` bash
tar -xzf druid-services-0.X.X-bin.tar.gz
diff --git a/docs/content/Granularities.md b/docs/content/Granularities.md
index d4f3ea73141..e8a3e4fc2bf 100644
--- a/docs/content/Granularities.md
+++ b/docs/content/Granularities.md
@@ -21,13 +21,13 @@ Duration granularities are specified as an exact duration in milliseconds and ti
They also support specifying an optional origin, which defines where to start counting time buckets from (defaults to 1970-01-01T00:00:00Z).
-```
+```javascript
{"type": "duration", "duration": "7200000"}
```
This chunks up every 2 hours.
-```
+```javascript
{"type": "duration", "duration": "3600000", "origin": "2012-01-01T00:30:00Z"}
```
@@ -39,13 +39,13 @@ Period granularities are specified as arbitrary period combinations of years, mo
Time zone is optional (defaults to UTC). Origin is optional (defaults to 1970-01-01T00:00:00 in the given time zone).
-```
+```javascript
{"type": "period", "period": "P2D", "timeZone": "America/Los_Angeles"}
```
This will bucket by two-day chunks in the Pacific timezone.
-```
+```javascript
{"type": "period", "period": "P3M", "timeZone": "America/Los_Angeles", "origin": "2012-02-01T00:00:00-08:00"}
```
diff --git a/docs/content/Hadoop-Configuration.md b/docs/content/Hadoop-Configuration.md
new file mode 100644
index 00000000000..ef8c48e167a
--- /dev/null
+++ b/docs/content/Hadoop-Configuration.md
@@ -0,0 +1,360 @@
+---
+layout: doc_page
+---
+
+Example Production Hadoop Configuration
+=======================================
+
+The following configuration should work relatively well for Druid indexing and Hadoop. In the example, we are using Hadoop 2.4 with EC2 m1.xlarge nodes for NameNodes and cc2.8xlarge nodes for DataNodes.
+
+### Core-site.xml
+
+```
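+<configuration>
+
+
+  <property>
+    <name>hadoop.tmp.dir</name>
+    <value>/mnt/persistent/hadoop</value>
+  </property>
+
+
+  <property>
+    <name>fs.defaultFS</name>
+    <value>hdfs://#{IP}:9000</value>
+  </property>
+  <property>
+    <name>fs.s3.impl</name>
+    <value>org.apache.hadoop.fs.s3native.NativeS3FileSystem</value>
+  </property>
+  <property>
+    <name>fs.s3.awsAccessKeyId</name>
+    <value>#{S3_ACCESS_KEY}</value>
+  </property>
+  <property>
+    <name>fs.s3.awsSecretAccessKey</name>
+    <value>#{S3_SECRET_KEY}</value>
+  </property>
+  <property>
+    <name>fs.s3.buffer.dir</name>
+    <value>/mnt/persistent/hadoop-s3n</value>
+  </property>
+  <property>
+    <name>fs.s3n.awsAccessKeyId</name>
+    <value>#{S3N_ACCESS_KEY}</value>
+  </property>
+  <property>
+    <name>fs.s3n.awsSecretAccessKey</name>
+    <value>#{S3N_SECRET_KEY}</value>
+  </property>
+
+
+  <property>
+    <name>io.compression.codecs</name>
+    <value>org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.Lz4Codec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec</value>
+  </property>
+
+
+  <property>
+    <name>io.seqfile.local.dir</name>
+    <value>/mnt/persistent/hadoop/io/local</value>
+  </property>
+
+</configuration>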
+```
+
+### Mapred-site.xml
+
+```
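+<configuration>
+
+  <property>
+    <name>mapreduce.framework.name</name>
+    <value>yarn</value>
+  </property>
+
+  <property>
+    <name>mapreduce.jobtracker.address</name>
+    <value>#{JT_ADDR}:9001</value>
+  </property>
+  <property>
+    <name>mapreduce.jobtracker.http.address</name>
+    <value>#{JT_HTTP_ADDR}:9100</value>
+  </property>
+  <property>
+    <name>mapreduce.jobhistory.address</name>
+    <value>#{JH_ADDR}:10020</value>
+  </property>
+  <property>
+    <name>mapreduce.jobhistory.webapp.address</name>
+    <value>#{JH_WEBAPP_ADDR}:19888</value>
+  </property>
+  <property>
+    <name>mapreduce.tasktracker.http.address</name>
+    <value>#{TT_ADDR}:9103</value>
+  </property>
+
+
+  <property>
+    <name>mapreduce.job.reduces</name>
+    <value>21</value>
+  </property>
+
+  <property>
+    <name>mapreduce.job.jvm.numtasks</name>
+    <value>20</value>
+  </property>
+  <property>
+    <name>mapreduce.map.memory.mb</name>
+    <value>2048</value>
+  </property>
+  <property>
+    <name>mapreduce.map.java.opts</name>
+    <value>-server -Xmx1536m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps</value>
+  </property>
+  <property>
+    <name>mapreduce.reduce.memory.mb</name>
+    <value>6144</value>
+  </property>
+  <property>
+    <name>mapreduce.reduce.java.opts</name>
+    <value>-server -Xmx2560m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps</value>
+  </property>
+  <property>
+    <name>mapreduce.reduce.shuffle.parallelcopies</name>
+    <value>50</value>
+  </property>
+  <property>
+    <name>mapreduce.reduce.shuffle.input.buffer.percent</name>
+    <value>0.5</value>
+  </property>
+  <property>
+    <name>mapreduce.task.io.sort.mb</name>
+    <value>256</value>
+  </property>
+  <property>
+    <name>mapreduce.task.io.sort.factor</name>
+    <value>100</value>
+  </property>
+  <property>
+    <name>mapreduce.jobtracker.handler.count</name>
+    <value>64</value>
+  </property>
+  <property>
+    <name>mapreduce.tasktracker.http.threads</name>
+    <value>20</value>
+  </property>
+
+
+  <property>
+    <name>mapreduce.cluster.local.dir</name>
+    <value>/mnt/persistent/hadoop/mapred/local</value>
+  </property>
+
+
+  <property>
+    <name>mapreduce.jobhistory.recovery.enable</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>mapreduce.jobhistory.recovery.store.class</name>
+    <value>org.apache.hadoop.mapreduce.v2.hs.HistoryServerFileSystemStateStoreService</value>
+  </property>
+  <property>
+    <name>mapreduce.jobhistory.recovery.store.fs.uri</name>
+    <value>file://${hadoop.tmp.dir}/mapred-jobhistory-state</value>
+  </property>
+
+
+
+  <property>
+    <name>mapreduce.output.fileoutputformat.compress</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>mapreduce.map.output.compress</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>mapreduce.output.fileoutputformat.compress.type</name>
+    <value>BLOCK</value>
+  </property>
+  <property>
+    <name>mapreduce.map.output.compress.codec</name>
+    <value>org.apache.hadoop.io.compress.Lz4Codec</value>
+  </property>
+  <property>
+    <name>mapreduce.output.fileoutputformat.compress.codec</name>
+    <value>org.apache.hadoop.io.compress.GzipCodec</value>
+  </property>
+
+  <property>
+    <name>mapreduce.map.speculative</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>mapreduce.reduce.speculative</name>
+    <value>false</value>
+  </property>
+
+
+  <property>
+    <name>mapreduce.task.timeout</name>
+    <value>1800000</value>
+  </property>
+
+</configuration>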
+```
+
+### Yarn-site.xml
+
+```
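+<configuration>
+
+  <property>
+    <name>yarn.resourcemanager.hostname</name>
+    <value>#{RM_HOSTNAME}</value>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.scheduler.class</name>
+    <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</value>
+  </property>
+  <property>
+    <name>yarn.nodemanager.aux-services</name>
+    <value>mapreduce_shuffle</value>
+  </property>
+  <property>
+    <name>yarn.log-aggregation-enable</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>yarn.log.server.url</name>
+    <value>http://#{IP_LOG_SERVER}:19888/jobhistory/logs/</value>
+  </property>
+  <property>
+    <name>yarn.nodemanager.hostname</name>
+    <value>#{IP_ADDR}</value>
+  </property>
+  <property>
+    <name>yarn.scheduler.minimum-allocation-mb</name>
+    <value>512</value>
+  </property>
+  <property>
+    <name>yarn.nodemanager.resource.memory-mb</name>
+    <value>1024</value>
+  </property>
+  <property>
+    <name>yarn.nodemanager.resource.cpu-vcores</name>
+    <value>1</value>
+  </property>
+  <property>
+    <name>yarn.nodemanager.vmem-check-enabled</name>
+    <value>false</value>
+  </property>
+
+
+  <property>
+    <name>yarn.nodemanager.local-dirs</name>
+    <value>/mnt/persistent/hadoop/nm-local-dir</value>
+  </property>
+
+
+  <property>
+    <name>yarn.resourcemanager.recovery.enabled</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.store.class</name>
+    <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore</value>
+  </property>
+  <property>
+    <name>yarn.resourcemanager.fs.state-store.uri</name>
+    <value>file://${hadoop.tmp.dir}/yarn-resourcemanager-state</value>
+  </property>
+
+
+  <property>
+    <name>yarn.resourcemanager.nodes.exclude-path</name>
+    <value>/mnt/persistent/hadoop/yarn-exclude.txt</value>
+  </property>
+
+</configuration>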
+```
+
+### HDFS-site.xml
+
+```
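+<configuration>
+
+  <property>
+    <name>dfs.replication</name>
+    <value>3</value>
+  </property>
+  <property>
+    <name>dfs.namenode.datanode.registration.ip-hostname-check</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>dfs.hosts.exclude</name>
+    <value>/mnt/persistent/hadoop/hdfs-exclude.txt</value>
+  </property>
+
+
+  <property>
+    <name>dfs.datanode.data.dir</name>
+    <value>file:///mnt/persistent/hadoop/dfs/data</value>
+  </property>
+
+</configuration>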
+```
+
+### Capacity-scheduler.xml
+
+```
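+<configuration>
+
+  <property>
+    <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
+    <value>0.1</value>
+  </property>
+  <property>
+    <name>yarn.scheduler.capacity.root.queues</name>
+    <value>default</value>
+  </property>
+  <property>
+    <name>yarn.scheduler.capacity.root.default.capacity</name>
+    <value>100</value>
+  </property>
+  <property>
+    <name>yarn.scheduler.capacity.root.default.user-limit-factor</name>
+    <value>1</value>
+  </property>
+  <property>
+    <name>yarn.scheduler.capacity.root.queues</name>
+    <value>default</value>
+  </property>
+  <property>
+    <name>yarn.scheduler.capacity.root.default.maximum-capacity</name>
+    <value>100</value>
+  </property>
+  <property>
+    <name>yarn.scheduler.capacity.root.default.state</name>
+    <value>RUNNING</value>
+  </property>
+  <property>
+    <name>yarn.scheduler.capacity.root.default.acl_submit_applications</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>yarn.scheduler.capacity.root.default.acl_administer_queue</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>yarn.scheduler.capacity.node-locality-delay</name>
+    <value>-1</value>
+  </property>
+
+</configuration>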
+```
\ No newline at end of file
diff --git a/docs/content/Ingestion-FAQ.md b/docs/content/Ingestion-FAQ.md
index fc83907c47a..972e62a6a23 100644
--- a/docs/content/Ingestion-FAQ.md
+++ b/docs/content/Ingestion-FAQ.md
@@ -1,6 +1,11 @@
---
layout: doc_page
---
+
+## What types of data does Druid support?
+
+Druid can ingest JSON, CSV, TSV and other delimited data out of the box. Druid supports single dimension values, or multiple dimension values (an array of strings). Druid supports long and float numeric columns.
+
## Where do my Druid segments end up after ingestion?
Depending on what `druid.storage.type` is set to, Druid will upload segments to some [Deep Storage](Deep-Storage.html). Local disk is used as the default deep storage.
@@ -21,6 +26,14 @@ druid.storage.bucket=druid
druid.storage.baseKey=sample
```
+Other common reasons that hand-off fails are as follows:
+
+1) Historical nodes are out of capacity and cannot download any more segments. You'll see exceptions in the coordinator logs if this occurs.
+
+2) Segments are corrupt and cannot be downloaded. You'll see exceptions in your historical node logs if this occurs.
+
+3) Deep storage is improperly configured. Make sure that your segment actually exists in deep storage and that the coordinator logs have no errors.
+
## How do I get HDFS to work?
Make sure to include the `druid-hdfs-storage` module as one of your extensions and set `druid.storage.type=hdfs`.
@@ -35,7 +48,7 @@ You can check the coordinator console located at `:/cluste
## My queries are returning empty results
-You can check `:/druid/v2/datasources/?interval=0/3000` for the dimensions and metrics that have been created for your datasource. Make sure that the name of the aggregators you use in your query match one of these metrics. Also make sure that the query interval you specify match a valid time range where data exists. Note: the broker endpoint will only return valid results on historical segments.
+You can check `<BROKER_IP>:<PORT>/druid/v2/datasources/<YOUR_DATASOURCE>?interval=0/3000` for the dimensions and metrics that have been created for your datasource. Make sure that the name of each aggregator you use in your query matches one of these metrics. Also make sure that the query interval you specify matches a valid time range where data exists. Note: the broker endpoint will only return valid results for historical segments, not for segments served by real-time nodes.
## How can I Reindex existing data in Druid with schema changes?
@@ -50,6 +63,9 @@ To do this use the IngestSegmentFirehose and run an indexer task. The IngestSegm
Typically the above will be run as a batch job to say everyday feed in a chunk of data and aggregate it.
+## Real-time ingestion seems to be stuck
+
+There are a few ways this can occur. Druid will throttle ingestion to prevent out-of-memory problems if the intermediate persists are taking too long or if hand-off is taking too long. If your node logs indicate that certain columns are taking a very long time to build (for example, if your segment granularity is hourly, but creating a single column takes 30 minutes), you should re-evaluate your configuration or scale up your real-time ingestion.
## More information
diff --git a/docs/content/Kafka-Eight.md b/docs/content/Kafka-Eight.md
index d8b3c80232e..5819a931b2a 100644
--- a/docs/content/Kafka-Eight.md
+++ b/docs/content/Kafka-Eight.md
@@ -8,9 +8,9 @@ The previous examples are for Kafka 7. To support Kafka 8, a couple changes need
- Update realtime node's configs for Kafka 8 extensions
- e.g.
- - `druid.extensions.coordinates=[...,"io.druid.extensions:druid-kafka-seven:0.6.156",...]`
+ - `druid.extensions.coordinates=[...,"io.druid.extensions:druid-kafka-seven:0.6.159",...]`
- becomes
- - `druid.extensions.coordinates=[...,"io.druid.extensions:druid-kafka-eight:0.6.156",...]`
+ - `druid.extensions.coordinates=[...,"io.druid.extensions:druid-kafka-eight:0.6.159",...]`
- Update realtime task config for changed keys
- `firehose.type`, `plumber.rejectionPolicyFactory`, and all of `firehose.consumerProps` changes.
diff --git a/docs/content/Production-Cluster-Configuration.md b/docs/content/Production-Cluster-Configuration.md
index d8ed57ab048..6fc319712de 100644
--- a/docs/content/Production-Cluster-Configuration.md
+++ b/docs/content/Production-Cluster-Configuration.md
@@ -57,7 +57,7 @@ druid.host=#{IP_ADDR}:8080
druid.port=8080
druid.service=druid/prod/overlord
-druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.156"]
+druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.159"]
druid.zk.service.host=#{ZK_IPs}
druid.zk.paths.base=/druid/prod
@@ -139,7 +139,7 @@ druid.host=#{IP_ADDR}:8080
druid.port=8080
druid.service=druid/prod/middlemanager
-druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.156","io.druid.extensions:druid-kafka-seven:0.6.156"]
+druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.159","io.druid.extensions:druid-kafka-seven:0.6.159"]
druid.zk.service.host=#{ZK_IPs}
druid.zk.paths.base=/druid/prod
@@ -286,7 +286,7 @@ druid.host=#{IP_ADDR}:8080
druid.port=8080
druid.service=druid/prod/historical
-druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.156"]
+druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.159"]
druid.zk.service.host=#{ZK_IPs}
druid.zk.paths.base=/druid/prod
diff --git a/docs/content/Querying.md b/docs/content/Querying.md
index 2b2981b96f7..dd556b2740f 100644
--- a/docs/content/Querying.md
+++ b/docs/content/Querying.md
@@ -66,7 +66,7 @@ The dataSource JSON field shown next identifies where to apply the query. In thi
"dataSource": "randSeq",
```
-The granularity JSON field specifies the bucket size for values. It could be a built-in time interval like "second", "minute", "fifteen_minute", "thirty_minute", "hour" or "day". It can also be an expression like `{"type": "period", "period":"PT6m"}` meaning "6 minute buckets". See [Granularities](Granularities.html) for more information on the different options for this field. In this example, it is set to the special value "all" which means [bucket all data points together into the same time bucket]()
+The granularity JSON field specifies the bucket size for values. It could be a built-in time interval like "second", "minute", "fifteen_minute", "thirty_minute", "hour" or "day". It can also be an expression like `{"type": "period", "period":"PT6m"}` meaning "6 minute buckets". See [Granularities](Granularities.html) for more information on the different options for this field. In this example, it is set to the special value "all" which means bucket all data points together into the same time bucket.
```javascript
"granularity": "all",
@@ -88,7 +88,7 @@ A groupBy also requires the JSON field "aggregations" (See [Aggregations](Aggreg
],
```
-You can also specify postAggregations, which are applied after data has been aggregated for the current granularity and dimensions bucket. See [Post Aggregations](Post Aggregations.html) for a detailed description. In the rand example, an arithmetic type operation (division, as specified by "fn") is performed with the result "name" of "avg_random". The "fields" field specifies the inputs from the aggregation stage to this expression. Note that identifiers corresponding to "name" JSON field inside the type "fieldAccess" are required but not used outside this expression, so they are prefixed with "dummy" for clarity:
+You can also specify postAggregations, which are applied after data has been aggregated for the current granularity and dimensions bucket. See [Post Aggregations](Post-aggregations.html) for a detailed description. In the rand example, an arithmetic type operation (division, as specified by "fn") is performed with the result "name" of "avg_random". The "fields" field specifies the inputs from the aggregation stage to this expression. Note that identifiers corresponding to "name" JSON field inside the type "fieldAccess" are required but not used outside this expression, so they are prefixed with "dummy" for clarity:
```javascript
"postAggregations": [{
@@ -127,13 +127,13 @@ Properties shared by all query types
|timeseries, topN, groupBy, search|filter|Specifies the filter (the "WHERE" clause in SQL) for the query. See [Filters](Filters.html)|no|
|timeseries, topN, groupBy, search|granularity|the timestamp granularity to bucket results into (i.e. "hour"). See [Granularities](Granularities.html) for more information.|no|
|timeseries, topN, groupBy|aggregations|aggregations that combine values in a bucket. See [Aggregations](Aggregations.html).|yes|
-|timeseries, topN, groupBy|postAggregations|aggregations of aggregations. See [Post Aggregations](Post Aggregations.html).|yes|
+|timeseries, topN, groupBy|postAggregations|aggregations of aggregations. See [Post Aggregations](Post-aggregations.html).|yes|
|groupBy|dimensions|constrains the groupings; if empty, then one value per time granularity bucket|yes|
|search|limit|maximum number of results (default is 1000), a system-level maximum can also be set via `com.metamx.query.search.maxSearchLimit`|no|
|search|searchDimensions|Dimensions to apply the search query to. If not specified, it will search through all dimensions.|no|
|search|query|The query portion of the search query. This is essentially a predicate that specifies if something matches.|yes|
-Query Context
+Query Context
-------------
|property |default | description |
diff --git a/docs/content/Realtime-Config.md b/docs/content/Realtime-Config.md
index 6cb566fac16..12ab55ff8e3 100644
--- a/docs/content/Realtime-Config.md
+++ b/docs/content/Realtime-Config.md
@@ -27,7 +27,7 @@ druid.host=localhost
druid.service=realtime
druid.port=8083
-druid.extensions.coordinates=["io.druid.extensions:druid-kafka-seven:0.6.156"]
+druid.extensions.coordinates=["io.druid.extensions:druid-kafka-seven:0.6.159"]
druid.zk.service.host=localhost
@@ -76,7 +76,7 @@ druid.host=#{IP_ADDR}:8080
druid.port=8080
druid.service=druid/prod/realtime
-druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.156","io.druid.extensions:druid-kafka-seven:0.6.156"]
+druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.159","io.druid.extensions:druid-kafka-seven:0.6.159"]
druid.zk.service.host=#{ZK_IPs}
druid.zk.paths.base=/druid/prod
diff --git a/docs/content/Recommendations.md b/docs/content/Recommendations.md
new file mode 100644
index 00000000000..bf764ffe6c2
--- /dev/null
+++ b/docs/content/Recommendations.md
@@ -0,0 +1,35 @@
+---
+layout: doc_page
+---
+
+Recommendations
+===============
+
+# Use UTC Timezone
+
+We recommend using the UTC timezone for all your events and across all of your nodes, not just for Druid, but for all data infrastructure. This can greatly mitigate potential query problems with inconsistent timezones.
+
+# Use Lowercase Strings for Column Names
+
+Druid is not perfect in how it handles mixed-case dimension and metric names. This will hopefully change very soon, but for the time being, lowercasing your column names is recommended.
+
+# SSDs
+
+SSDs are highly recommended for historical and real-time nodes if you are not running a cluster that is entirely in memory. SSDs can greatly reduce the time required to page data in and out of memory.
+
+# Provide Column Names in Lexicographic Order
+
+Although Druid supports schema-less ingestion of dimensions, because of [https://github.com/metamx/druid/issues/658](https://github.com/metamx/druid/issues/658), you may sometimes get bigger segments than necessary. To ensure segments are as compact as possible, providing dimension names in lexicographic order is recommended.
+
+
+# Use Timeseries and TopN Queries Instead of GroupBy Where Possible
+
+Timeseries and TopN queries are much more optimized and significantly faster than groupBy queries for their designed use cases. Issuing multiple topN or timeseries queries from your application can potentially be more efficient than a single groupBy query.
+
+# Read FAQs
+
+You should read about the common problems people encounter here:
+
+1) [Ingestion-FAQ](Ingestion-FAQ.html)
+
+2) [Performance-FAQ](Performance-FAQ.html)
\ No newline at end of file
diff --git a/docs/content/Router.md b/docs/content/Router.md
new file mode 100644
index 00000000000..f046fe4b468
--- /dev/null
+++ b/docs/content/Router.md
@@ -0,0 +1,133 @@
+---
+layout: doc_page
+---
+
+Router Node
+===========
+
+You should only ever need the router node if you have a Druid cluster well into the terabyte range. The router node can be used to route queries to different broker nodes. By default, the broker routes queries based on how [Rules](Rules.html) are set up. For example, if 1 month of recent data is loaded into a `hot` cluster, queries that fall within the recent month can be routed to a dedicated set of brokers. Queries outside this range are routed to another set of brokers. This setup provides query isolation such that queries for more important data are not impacted by queries for less important data.
+
+Running
+-------
+
+```
+io.druid.cli.Main server router
+```
+
+Example Production Configuration
+--------------------------------
+
+In this example, we have two tiers in our production cluster: `hot` and `_default_tier`. Queries for the `hot` tier are routed through the `broker-hot` set of brokers, and queries for the `_default_tier` are routed through the `broker-cold` set of brokers. If any exceptions or network problems occur, queries are routed to the `broker-cold` set of brokers. In our example, we are running with a c3.2xlarge EC2 node.
+
+JVM settings:
+
+```
+-server
+-Xmx13g
+-Xms13g
+-XX:NewSize=256m
+-XX:MaxNewSize=256m
+-XX:+UseConcMarkSweepGC
+-XX:+PrintGCDetails
+-XX:+PrintGCTimeStamps
+-XX:+UseLargePages
+-XX:+HeapDumpOnOutOfMemoryError
+-XX:HeapDumpPath=/mnt/galaxy/deploy/current/
+-Duser.timezone=UTC
+-Dfile.encoding=UTF-8
+-Djava.io.tmpdir=/mnt/tmp
+
+-Dcom.sun.management.jmxremote.port=17071
+-Dcom.sun.management.jmxremote.authenticate=false
+-Dcom.sun.management.jmxremote.ssl=false
+```
+
+Runtime.properties:
+
+```
+druid.host=#{IP_ADDR}:8080
+druid.port=8080
+druid.service=druid/prod/router
+
+druid.extensions.remoteRepositories=[]
+druid.extensions.localRepository=lib
+druid.extensions.coordinates=["io.druid.extensions:druid-histogram:0.6.159"]
+
+druid.zk.service.host=#{ZK_IPs}
+druid.zk.paths.base=/druid/prod
+
+druid.discovery.curator.path=/prod/discovery
+
+druid.processing.numThreads=1
+druid.router.defaultBrokerServiceName=druid:prod:broker-cold
+druid.router.coordinatorServiceName=druid:prod:coordinator
+druid.router.tierToBrokerMap={"hot":"druid:prod:broker-hot","_default_tier":"druid:prod:broker-cold"}
+druid.router.http.numConnections=50
+druid.router.http.readTimeout=PT5M
+
+druid.server.http.numThreads=100
+
+druid.request.logging.type=emitter
+druid.request.logging.feed=druid_requests
+
+druid.monitoring.monitors=["com.metamx.metrics.SysMonitor","com.metamx.metrics.JvmMonitor"]
+
+druid.emitter=http
+druid.emitter.http.recipientBaseUrl=#{URL}
+
+druid.curator.compress=true
+```
+
+Runtime Configuration
+---------------------
+
+The router module uses several of the default modules in [Configuration](Configuration.html) and has the following set of configurations as well:
+
+|Property|Possible Values|Description|Default|
+|--------|---------------|-----------|-------|
+|`druid.router.defaultBrokerServiceName`|Any string.|The default broker to connect to in case service discovery fails.|"". Must be set.|
+|`druid.router.tierToBrokerMap`|An ordered JSON map of tiers to broker names. The priority of brokers is based on the ordering.|Queries for a certain tier of data are routed to their appropriate broker.|{"_default": ""}|
+|`druid.router.defaultRule`|Any string.|The default rule for all datasources.|"_default"|
+|`druid.router.rulesEndpoint`|Any string.|The coordinator endpoint to extract rules from.|"/druid/coordinator/v1/rules"|
+|`druid.router.coordinatorServiceName`|Any string.|The service discovery name of the coordinator.|null. Must be set.|
+|`druid.router.pollPeriod`|Any ISO8601 duration.|How often to poll for new rules.|PT1M|
+|`druid.router.strategies`|An ordered JSON array of objects.|All custom strategies to use for routing.|[{"type":"timeBoundary"},{"type":"priority"}]|
+
+Router Strategies
+-----------------
+The router has a configurable list of strategies for how it selects which brokers to route queries to. The order of the strategies matters because as soon as a strategy condition is matched, a broker is selected.
+
+### timeBoundary
+
+```json
+{
+ "type":"timeBoundary"
+}
+```
+
+Including this strategy means all timeBoundary queries are always routed to the highest priority broker.
+
+### priority
+
+```json
+{
+ "type":"priority",
+ "minPriority":0,
+ "maxPriority":1
+}
+```
+
+Queries with a priority set to less than minPriority are routed to the lowest priority broker. Queries with priority set to greater than maxPriority are routed to the highest priority broker. By default, minPriority is 0 and maxPriority is 1. Using these default values, if a query with priority 0 (the default query priority is 0) is sent, the query skips the priority selection logic.
+
+### javascript
+
+Allows defining arbitrary routing rules using a JavaScript function. The function is passed the configuration and the query to be executed, and returns the tier it should be routed to, or null for the default tier.
+
+*Example*: a function that returns the highest priority broker unless the given query has more than two aggregators.
+
+```json
+{
+ "type" : "javascript",
+ "function" : "function (config, query) { if (config.getTierToBrokerMap().values().size() > 0 && query.getAggregatorSpecs && query.getAggregatorSpecs().size() <= 2) { return config.getTierToBrokerMap().values().toArray()[0] } else { return config.getDefaultBrokerServiceName() } }"
+}
+```
diff --git a/docs/content/Rule-Configuration.md b/docs/content/Rule-Configuration.md
index bf8b8a9792d..c25c9e62b70 100644
--- a/docs/content/Rule-Configuration.md
+++ b/docs/content/Rule-Configuration.md
@@ -2,12 +2,34 @@
layout: doc_page
---
# Configuring Rules for Coordinator Nodes
+
+Rules indicate how segments should be assigned to different historical node tiers and how many replicas of a segment should exist in each tier. Rules may also indicate when segments should be dropped entirely from the cluster. The coordinator loads a set of rules from the metadata storage. Rules may be specific to a certain datasource and/or a default set of rules can be configured. Rules are read in order and hence the ordering of rules is important. The coordinator will cycle through all available segments and match each segment with the first rule that applies. Each segment may only match a single rule.
+
Note: It is recommended that the coordinator console is used to configure rules. However, the coordinator node does have HTTP endpoints to programmatically configure rules.
+
Load Rules
----------
-Load rules indicate how many replicants of a segment should exist in a server tier.
+Load rules indicate how many replicas of a segment should exist in a server tier.
+
+### Forever Load Rule
+
+Forever load rules are of the form:
+
+```json
+{
+ "type" : "loadForever",
+ "tieredReplicants": {
+ "hot": 1,
+ "_default_tier" : 1
+ }
+}
+```
+
+* `type` - this should always be "loadForever"
+* `tieredReplicants` - A JSON Object where the keys are the tier names and values are the number of replicas for that tier.
+
### Interval Load Rule
@@ -16,14 +38,17 @@ Interval load rules are of the form:
```json
{
"type" : "loadByInterval",
- "interval" : "2012-01-01/2013-01-01",
- "tier" : "hot"
+ "interval": "2012-01-01/2013-01-01",
+ "tieredReplicants": {
+ "hot": 1,
+ "_default_tier" : 1
+ }
}
```
* `type` - this should always be "loadByInterval"
* `interval` - A JSON Object representing ISO-8601 Intervals
-* `tier` - the configured historical node tier
+* `tieredReplicants` - A JSON Object where the keys are the tier names and values are the number of replicas for that tier.
### Period Load Rule
@@ -33,13 +58,16 @@ Period load rules are of the form:
{
"type" : "loadByPeriod",
"period" : "P1M",
- "tier" : "hot"
+ "tieredReplicants": {
+ "hot": 1,
+ "_default_tier" : 1
+ }
}
```
* `type` - this should always be "loadByPeriod"
* `period` - A JSON Object representing ISO-8601 Periods
-* `tier` - the configured historical node tier
+* `tieredReplicants` - A JSON Object where the keys are the tier names and values are the number of replicas for that tier.
The interval of a segment will be compared against the specified period. The rule matches if the period overlaps the interval.
@@ -48,6 +76,21 @@ Drop Rules
Drop rules indicate when segments should be dropped from the cluster.
+### Forever Drop Rule
+
+Forever drop rules are of the form:
+
+```json
+{
+ "type" : "dropForever"
+}
+```
+
+* `type` - this should always be "dropForever"
+
+All segments that match this rule are dropped from the cluster.
+
+
### Interval Drop Rule
Interval drop rules are of the form:
diff --git a/docs/content/Simple-Cluster-Configuration.md b/docs/content/Simple-Cluster-Configuration.md
index 788f2d65af8..8bfade9ab9a 100644
--- a/docs/content/Simple-Cluster-Configuration.md
+++ b/docs/content/Simple-Cluster-Configuration.md
@@ -28,7 +28,7 @@ Configuration:
-Ddruid.zk.service.host=localhost
--Ddruid.extensions.coordinates=["io.druid.extensions:druid-kafka-seven:0.6.156"]
+-Ddruid.extensions.coordinates=["io.druid.extensions:druid-kafka-seven:0.6.159"]
-Ddruid.db.connector.connectURI=jdbc:mysql://localhost:3306/druid
-Ddruid.db.connector.user=druid
diff --git a/docs/content/Thanks.md b/docs/content/Thanks.md
deleted file mode 100644
index 97ec7e0904a..00000000000
--- a/docs/content/Thanks.md
+++ /dev/null
@@ -1,12 +0,0 @@
----
-layout: doc_page
----
-
-YourKit supports the Druid open source projects with its
-full-featured Java Profiler.
-YourKit, LLC is the creator of innovative and intelligent tools for profiling
-Java and .NET applications. Take a look at YourKit's software products:
-YourKit Java
-Profiler and
-YourKit .NET
-Profiler.
diff --git a/docs/content/Tutorial:-A-First-Look-at-Druid.md b/docs/content/Tutorial:-A-First-Look-at-Druid.md
index a0c89c0fafb..4a74b4f5b20 100644
--- a/docs/content/Tutorial:-A-First-Look-at-Druid.md
+++ b/docs/content/Tutorial:-A-First-Look-at-Druid.md
@@ -49,7 +49,7 @@ There are two ways to setup Druid: download a tarball, or [Build From Source](Bu
### Download a Tarball
-We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.156-bin.tar.gz). Download this file to a directory of your choosing.
+We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.159-bin.tar.gz). Download this file to a directory of your choosing.
You can extract the awesomeness within by issuing:
@@ -60,7 +60,7 @@ tar -zxvf druid-services-*-bin.tar.gz
Not too lost so far right? That's great! If you cd into the directory:
```
-cd druid-services-0.6.156
+cd druid-services-0.6.159
```
You should see a bunch of files:
diff --git a/docs/content/Tutorial:-Loading-Your-Data-Part-1.md b/docs/content/Tutorial:-Loading-Your-Data-Part-1.md
index 399af3cc786..7ef8648cc0f 100644
--- a/docs/content/Tutorial:-Loading-Your-Data-Part-1.md
+++ b/docs/content/Tutorial:-Loading-Your-Data-Part-1.md
@@ -91,7 +91,7 @@ druid.service=overlord
druid.zk.service.host=localhost
-druid.extensions.coordinates=["io.druid.extensions:druid-kafka-seven:0.6.156"]
+druid.extensions.coordinates=["io.druid.extensions:druid-kafka-seven:0.6.159"]
druid.db.connector.connectURI=jdbc:mysql://localhost:3306/druid
druid.db.connector.user=druid
diff --git a/docs/content/Tutorial:-The-Druid-Cluster.md b/docs/content/Tutorial:-The-Druid-Cluster.md
index 13483109143..3797746ca20 100644
--- a/docs/content/Tutorial:-The-Druid-Cluster.md
+++ b/docs/content/Tutorial:-The-Druid-Cluster.md
@@ -13,7 +13,7 @@ In this tutorial, we will set up other types of Druid nodes and external depende
If you followed the first tutorial, you should already have Druid downloaded. If not, let's go back and do that first.
-You can download the latest version of druid [here](http://static.druid.io/artifacts/releases/druid-services-0.6.156-bin.tar.gz)
+You can download the latest version of druid [here](http://static.druid.io/artifacts/releases/druid-services-0.6.159-bin.tar.gz)
and untar the contents within by issuing:
@@ -149,7 +149,7 @@ druid.port=8081
druid.zk.service.host=localhost
-druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.156"]
+druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.159"]
# Dummy read only AWS account (used to download example data)
druid.s3.secretKey=QyyfVZ7llSiRg6Qcrql1eEUG7buFpAK6T6engr1b
@@ -240,7 +240,7 @@ druid.port=8083
druid.zk.service.host=localhost
-druid.extensions.coordinates=["io.druid.extensions:druid-examples:0.6.156","io.druid.extensions:druid-kafka-seven:0.6.156"]
+druid.extensions.coordinates=["io.druid.extensions:druid-examples:0.6.159","io.druid.extensions:druid-kafka-seven:0.6.159"]
# Change this config to db to hand off to the rest of the Druid cluster
druid.publish.type=noop
diff --git a/docs/content/Tutorial:-Webstream.md b/docs/content/Tutorial:-Webstream.md
index d28dc27923c..47ebec7e049 100644
--- a/docs/content/Tutorial:-Webstream.md
+++ b/docs/content/Tutorial:-Webstream.md
@@ -37,7 +37,7 @@ There are two ways to setup Druid: download a tarball, or [Build From Source](Bu
h3. Download a Tarball
-We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.156-bin.tar.gz)
+We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.159-bin.tar.gz)
Download this file to a directory of your choosing.
You can extract the awesomeness within by issuing:
@@ -48,7 +48,7 @@ tar zxvf druid-services-*-bin.tar.gz
Not too lost so far right? That's great! If you cd into the directory:
```
-cd druid-services-0.6.156
+cd druid-services-0.6.159
```
You should see a bunch of files:
diff --git a/docs/content/Twitter-Tutorial.md b/docs/content/Twitter-Tutorial.md
index db0eab430c6..95993732449 100644
--- a/docs/content/Twitter-Tutorial.md
+++ b/docs/content/Twitter-Tutorial.md
@@ -9,7 +9,7 @@ There are two ways to setup Druid: download a tarball, or build it from source.
# Download a Tarball
-We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.156-bin.tar.gz).
+We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.159-bin.tar.gz).
Download this bad boy to a directory of your choosing.
You can extract the awesomeness within by issuing:
diff --git a/docs/content/index.md b/docs/content/index.md
index 529a2325436..3c236cc81be 100644
--- a/docs/content/index.md
+++ b/docs/content/index.md
@@ -37,17 +37,6 @@ When Druid?
* You want to do your analysis on data as it’s happening (in real-time)
* You need a data store that is always available, 24x7x365, and years into the future.
-
-Not Druid?
-----------
-
-* The amount of data you have can easily be handled by MySQL
-* You're querying for individual entries or doing lookups (not analytics)
-* Batch ingestion is good enough
-* Canned queries are good enough
-* Downtime is no big deal
-
-
Druid vs…
----------
@@ -60,7 +49,7 @@ Druid vs…
About This Page
----------
-The data store world is vast, confusing and constantly in flux. This page is meant to help potential evaluators decide whether Druid is a good fit for the problem one needs to solve. If anything about it is incorrect please provide that feedback on the mailing list or via some other means so we can fix it.
+The data infrastructure world is vast, confusing and constantly in flux. This page is meant to help potential evaluators decide whether Druid is a good fit for the problem one needs to solve. If anything about it is incorrect please provide that feedback on the mailing list or via some other means so we can fix it.
diff --git a/docs/content/toc.textile b/docs/content/toc.textile
index 437e383b760..29c08ab4d0e 100644
--- a/docs/content/toc.textile
+++ b/docs/content/toc.textile
@@ -17,7 +17,9 @@ h2. Getting Started
h2. Booting a Druid Cluster
* "Simple Cluster Configuration":Simple-Cluster-Configuration.html
* "Production Cluster Configuration":Production-Cluster-Configuration.html
+* "Production Hadoop Configuration":Hadoop-Configuration.html
* "Rolling Cluster Updates":Rolling-Updates.html
+* "Recommendations":Recommendations.html
h2. Configuration
* "Common Configuration":Configuration.html
@@ -84,6 +86,7 @@ h2. Experimental
* "Geographic Queries":./GeographicQueries.html
* "Select Query":./SelectQuery.html
* "Approximate Histograms and Quantiles":./ApproxHisto.html
+* "Router node":./Router.html
h2. Development
* "Versioning":./Versioning.html
@@ -91,4 +94,4 @@ h2. Development
* "Libraries":./Libraries.html
h2. Misc
-* "Thanks":./Thanks.html
+* "Thanks":/thanks.html
diff --git a/examples/pom.xml b/examples/pom.xml
index ed6bfaadae4..fb2597326e7 100644
--- a/examples/pom.xml
+++ b/examples/pom.xml
@@ -28,7 +28,7 @@
io.druiddruid
- 0.6.157-SNAPSHOT
+ 0.6.160-SNAPSHOT
diff --git a/hdfs-storage/pom.xml b/hdfs-storage/pom.xml
index 2ab577dd4ef..6cb8f84483e 100644
--- a/hdfs-storage/pom.xml
+++ b/hdfs-storage/pom.xml
@@ -28,7 +28,7 @@
io.druiddruid
- 0.6.157-SNAPSHOT
+ 0.6.160-SNAPSHOT
diff --git a/histogram/pom.xml b/histogram/pom.xml
index e2420e915b3..a26ce5f5601 100644
--- a/histogram/pom.xml
+++ b/histogram/pom.xml
@@ -27,7 +27,7 @@
io.druiddruid
- 0.6.157-SNAPSHOT
+ 0.6.160-SNAPSHOT
diff --git a/histogram/src/main/java/io/druid/query/aggregation/histogram/ApproximateHistogram.java b/histogram/src/main/java/io/druid/query/aggregation/histogram/ApproximateHistogram.java
index b44fbd524e1..6f481d89161 100644
--- a/histogram/src/main/java/io/druid/query/aggregation/histogram/ApproximateHistogram.java
+++ b/histogram/src/main/java/io/druid/query/aggregation/histogram/ApproximateHistogram.java
@@ -1019,8 +1019,6 @@ public class ApproximateHistogram
* @param count current size of the heap
* @param heapIndex index of the item to be deleted
* @param values values stored in the heap
- *
- * @return
*/
private static int heapDelete(int[] heap, int[] reverseIndex, int count, int heapIndex, float[] values)
{
diff --git a/histogram/src/main/java/io/druid/query/aggregation/histogram/ApproximateHistogramAggregatorFactory.java b/histogram/src/main/java/io/druid/query/aggregation/histogram/ApproximateHistogramAggregatorFactory.java
index 458486423f9..c704c72ba2f 100644
--- a/histogram/src/main/java/io/druid/query/aggregation/histogram/ApproximateHistogramAggregatorFactory.java
+++ b/histogram/src/main/java/io/druid/query/aggregation/histogram/ApproximateHistogramAggregatorFactory.java
@@ -29,8 +29,14 @@ import com.google.common.primitives.Floats;
import com.google.common.primitives.Ints;
import io.druid.query.aggregation.Aggregator;
import io.druid.query.aggregation.AggregatorFactory;
+import io.druid.query.aggregation.Aggregators;
import io.druid.query.aggregation.BufferAggregator;
+import io.druid.query.aggregation.hyperloglog.HyperLogLogCollector;
+import io.druid.query.aggregation.hyperloglog.HyperUniquesAggregator;
+import io.druid.query.aggregation.hyperloglog.HyperUniquesBufferAggregator;
import io.druid.segment.ColumnSelectorFactory;
+import io.druid.segment.FloatColumnSelector;
+import io.druid.segment.ObjectColumnSelector;
import org.apache.commons.codec.binary.Base64;
import java.nio.ByteBuffer;
@@ -113,7 +119,7 @@ public class ApproximateHistogramAggregatorFactory implements AggregatorFactory
@Override
public AggregatorFactory getCombiningFactory()
{
- return new ApproximateHistogramAggregatorFactory(name, name, resolution, numBuckets, lowerLimit, upperLimit);
+ return new ApproximateHistogramFoldingAggregatorFactory(name, name, resolution, numBuckets, lowerLimit, upperLimit);
}
@Override
diff --git a/histogram/src/main/java/io/druid/query/aggregation/histogram/ApproximateHistogramFoldingAggregatorFactory.java b/histogram/src/main/java/io/druid/query/aggregation/histogram/ApproximateHistogramFoldingAggregatorFactory.java
index 04dc43a804b..50a9e6ba973 100644
--- a/histogram/src/main/java/io/druid/query/aggregation/histogram/ApproximateHistogramFoldingAggregatorFactory.java
+++ b/histogram/src/main/java/io/druid/query/aggregation/histogram/ApproximateHistogramFoldingAggregatorFactory.java
@@ -76,7 +76,8 @@ public class ApproximateHistogramFoldingAggregatorFactory extends ApproximateHis
};
}
- if (ApproximateHistogram.class.isAssignableFrom(selector.classOfObject())) {
+ final Class cls = selector.classOfObject();
+ if (cls.equals(Object.class) || ApproximateHistogram.class.isAssignableFrom(cls)) {
return new ApproximateHistogramFoldingAggregator(
name,
selector,
@@ -89,7 +90,7 @@ public class ApproximateHistogramFoldingAggregatorFactory extends ApproximateHis
throw new IAE(
"Incompatible type for metric[%s], expected a ApproximateHistogram, got a %s",
fieldName,
- selector.classOfObject()
+ cls
);
}
@@ -117,14 +118,15 @@ public class ApproximateHistogramFoldingAggregatorFactory extends ApproximateHis
};
}
- if (ApproximateHistogram.class.isAssignableFrom(selector.classOfObject())) {
+ final Class cls = selector.classOfObject();
+ if (cls.equals(Object.class) || ApproximateHistogram.class.isAssignableFrom(cls)) {
return new ApproximateHistogramFoldingBufferAggregator(selector, resolution, lowerLimit, upperLimit);
}
throw new IAE(
"Incompatible type for metric[%s], expected a ApproximateHistogram, got a %s",
fieldName,
- selector.classOfObject()
+ cls
);
}
diff --git a/histogram/src/main/java/io/druid/query/aggregation/histogram/Histogram.java b/histogram/src/main/java/io/druid/query/aggregation/histogram/Histogram.java
index 384e6eb1fbf..6785ceb9c10 100644
--- a/histogram/src/main/java/io/druid/query/aggregation/histogram/Histogram.java
+++ b/histogram/src/main/java/io/druid/query/aggregation/histogram/Histogram.java
@@ -84,5 +84,12 @@ public class Histogram
return result;
}
-
-}
\ No newline at end of file
+ @Override
+ public String toString()
+ {
+ return "Histogram{" +
+ "breaks=" + Arrays.toString(breaks) +
+ ", counts=" + Arrays.toString(counts) +
+ '}';
+ }
+}
diff --git a/histogram/src/test/java/io/druid/query/aggregation/histogram/ApproximateHistogramGroupByQueryTest.java b/histogram/src/test/java/io/druid/query/aggregation/histogram/ApproximateHistogramGroupByQueryTest.java
new file mode 100644
index 00000000000..d623b9b6649
--- /dev/null
+++ b/histogram/src/test/java/io/druid/query/aggregation/histogram/ApproximateHistogramGroupByQueryTest.java
@@ -0,0 +1,215 @@
+/*
+ * Druid - a distributed column store.
+ * Copyright (C) 2014 Metamarkets Group Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+package io.druid.query.aggregation.histogram;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.base.Function;
+import com.google.common.base.Supplier;
+import com.google.common.base.Suppliers;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import io.druid.collections.StupidPool;
+import io.druid.data.input.Row;
+import io.druid.jackson.DefaultObjectMapper;
+import io.druid.query.QueryRunner;
+import io.druid.query.QueryRunnerTestHelper;
+import io.druid.query.aggregation.PostAggregator;
+import io.druid.query.dimension.DefaultDimensionSpec;
+import io.druid.query.dimension.DimensionSpec;
+import io.druid.query.groupby.GroupByQuery;
+import io.druid.query.groupby.GroupByQueryConfig;
+import io.druid.query.groupby.GroupByQueryEngine;
+import io.druid.query.groupby.GroupByQueryQueryToolChest;
+import io.druid.query.groupby.GroupByQueryRunnerFactory;
+import io.druid.query.groupby.GroupByQueryRunnerTestHelper;
+import io.druid.query.groupby.orderby.DefaultLimitSpec;
+import io.druid.query.groupby.orderby.OrderByColumnSpec;
+import io.druid.segment.TestHelper;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import javax.annotation.Nullable;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+
+/**
+ */
+@RunWith(Parameterized.class)
+public class ApproximateHistogramGroupByQueryTest
+{
+ private final QueryRunner runner;
+ private GroupByQueryRunnerFactory factory;
+
+ @Parameterized.Parameters
+ public static Collection> constructorFeeder() throws IOException
+ {
+ final ObjectMapper mapper = new DefaultObjectMapper();
+ final StupidPool pool = new StupidPool(
+ new Supplier()
+ {
+ @Override
+ public ByteBuffer get()
+ {
+ return ByteBuffer.allocate(1024 * 1024);
+ }
+ }
+ );
+
+ final GroupByQueryConfig config = new GroupByQueryConfig();
+ config.setMaxIntermediateRows(10000);
+
+ final Supplier configSupplier = Suppliers.ofInstance(config);
+ final GroupByQueryEngine engine = new GroupByQueryEngine(configSupplier, pool);
+
+ final GroupByQueryRunnerFactory factory = new GroupByQueryRunnerFactory(
+ engine,
+ QueryRunnerTestHelper.NOOP_QUERYWATCHER,
+ configSupplier,
+ new GroupByQueryQueryToolChest(configSupplier, mapper, engine)
+ );
+
+ GroupByQueryConfig singleThreadedConfig = new GroupByQueryConfig()
+ {
+ @Override
+ public boolean isSingleThreaded()
+ {
+ return true;
+ }
+ };
+ singleThreadedConfig.setMaxIntermediateRows(10000);
+
+ final Supplier singleThreadedConfigSupplier = Suppliers.ofInstance(singleThreadedConfig);
+ final GroupByQueryEngine singleThreadEngine = new GroupByQueryEngine(singleThreadedConfigSupplier, pool);
+
+ final GroupByQueryRunnerFactory singleThreadFactory = new GroupByQueryRunnerFactory(
+ singleThreadEngine,
+ QueryRunnerTestHelper.NOOP_QUERYWATCHER,
+ singleThreadedConfigSupplier,
+ new GroupByQueryQueryToolChest(singleThreadedConfigSupplier, mapper, singleThreadEngine)
+ );
+
+
+ Function