Merge branch 'master' into router-node

This commit is contained in:
fjy 2014-03-24 13:43:52 -07:00
commit fcd7522596
69 changed files with 1919 additions and 142 deletions

View File

@ -30,4 +30,4 @@ echo "For examples, see: "
echo " "
ls -1 examples/*/*sh
echo " "
echo "See also http://druid.io/docs/0.6.72"
echo "See also http://druid.io/docs/0.6.73"

View File

@ -28,7 +28,7 @@
<parent>
<groupId>io.druid</groupId>
<artifactId>druid</artifactId>
<version>0.6.73-SNAPSHOT</version>
<version>0.6.74-SNAPSHOT</version>
</parent>
<dependencies>

View File

@ -28,7 +28,7 @@
<parent>
<groupId>io.druid</groupId>
<artifactId>druid</artifactId>
<version>0.6.73-SNAPSHOT</version>
<version>0.6.74-SNAPSHOT</version>
</parent>
<dependencies>

View File

@ -20,7 +20,7 @@ io.druid.cli.Main server coordinator
Rules
-----
Segments are loaded and dropped from the cluster based on a set of rules. Rules indicate how segments should be assigned to different historical node tiers and how many replicants of a segment should exist in each tier. Rules may also indicate when segments should be dropped entirely from the cluster. The coordinator loads a set of rules from the database. Rules may be specific to a certain datasource and/or a default set of rules can be configured. Rules are read in order and hence the ordering of rules is important. The coordinator will cycle through all available segments and match each segment with the first rule that applies. Each segment may only match a single rule
Segments are loaded and dropped from the cluster based on a set of rules. Rules indicate how segments should be assigned to different historical node tiers and how many replicants of a segment should exist in each tier. Rules may also indicate when segments should be dropped entirely from the cluster. The coordinator loads a set of rules from the database. Rules may be specific to a certain datasource and/or a default set of rules can be configured. Rules are read in order and hence the ordering of rules is important. The coordinator will cycle through all available segments and match each segment with the first rule that applies. Each segment may only match a single rule.
For more information on rules, see [Rule Configuration](Rule-Configuration.html).

View File

@ -19,13 +19,13 @@ Clone Druid and build it:
git clone https://github.com/metamx/druid.git druid
cd druid
git fetch --tags
git checkout druid-0.6.72
git checkout druid-0.6.73
./build.sh
```
### Downloading the DSK (Druid Standalone Kit)
[Download](http://static.druid.io/artifacts/releases/druid-services-0.6.72-bin.tar.gz) a stand-alone tarball and run it:
[Download](http://static.druid.io/artifacts/releases/druid-services-0.6.73-bin.tar.gz) a stand-alone tarball and run it:
``` bash
tar -xzf druid-services-0.X.X-bin.tar.gz

View File

@ -2,13 +2,13 @@
layout: doc_page
---
# Aggregation Granularity
The granularity field determines how data gets bucketed across the time dimension, i.e how it gets aggregated by hour, day, minute, etc.
The granularity field determines how data gets bucketed across the time dimension, or how it gets aggregated by hour, day, minute, etc.
It can be specified either as a string for simple granularities or as an object for arbitrary granularities.
### Simple Granularities
Simple granularities are specified as a string and bucket timestamps by their UTC time (i.e. days start at 00:00 UTC).
Simple granularities are specified as a string and bucket timestamps by their UTC time (e.g., days start at 00:00 UTC).
Supported granularity strings are: `all`, `none`, `minute`, `fifteen_minute`, `thirty_minute`, `hour` and `day`
@ -35,25 +35,21 @@ This chunks up every hour on the half-hour.
### Period Granularities
Period granularities are specified as arbitrary period combinations of years, months, weeks, hours, minutes and seconds (e.g. P2W, P3M, PT1H30M, PT0.750S) in ISO8601 format.
Period granularities are specified as arbitrary period combinations of years, months, weeks, hours, minutes and seconds (e.g. P2W, P3M, PT1H30M, PT0.750S) in ISO8601 format. They support specifying a time zone which determines where period boundaries start as well as the timezone of the returned timestamps. By default, years start on the first of January, months start on the first of the month and weeks start on Mondays unless an origin is specified.
They support specifying a time zone which determines where period boundaries start and also determines the timezone of the returned timestamps.
By default years start on the first of January, months start on the first of the month and weeks start on Mondays unless an origin is specified.
Time zone is optional (defaults to UTC)
Origin is optional (defaults to 1970-01-01T00:00:00 in the given time zone)
Time zone is optional (defaults to UTC). Origin is optional (defaults to 1970-01-01T00:00:00 in the given time zone).
```
{"type": "period", "period": "P2D", "timeZone": "America/Los_Angeles"}
```
This will bucket by two day chunks in the Pacific timezone.
This will bucket by two-day chunks in the Pacific timezone.
```
{"type": "period", "period": "P3M", "timeZone": "America/Los_Angeles", "origin": "2012-02-01T00:00:00-08:00"}
```
This will bucket by 3 month chunks in the Pacific timezone where the three-month quarters are defined as starting from February.
This will bucket by 3-month chunks in the Pacific timezone where the three-month quarters are defined as starting from February.
Supported time zones: timezone support is provided by the [Joda Time library](http://www.joda.org), which uses the standard IANA time zones. [Joda Time supported timezones](http://joda-time.sourceforge.net/timezones.html)
#### Supported Time Zones
Timezone support is provided by the [Joda Time library](http://www.joda.org), which uses the standard IANA time zones. See the [Joda Time supported timezones](http://joda-time.sourceforge.net/timezones.html).

View File

@ -2,7 +2,7 @@
layout: doc_page
---
# groupBy Queries
These types of queries take a groupBy query object and return an array of JSON objects where each object represents a grouping asked for by the query. Note: If you only want to do straight aggreagates for some time range, we highly recommend using [TimeseriesQueries](TimeseriesQuery.html) instead. The performance will be substantially better.
These types of queries take a groupBy query object and return an array of JSON objects where each object represents a grouping asked for by the query. Note: If you only want to do straight aggregates for some time range, we highly recommend using [TimeseriesQueries](TimeseriesQuery.html) instead. The performance will be substantially better.
An example groupBy query object is shown below:
``` json

View File

@ -66,7 +66,7 @@ druid.host=#{IP_ADDR}:8080
druid.port=8080
druid.service=druid/prod/indexer
druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.72"]
druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.73"]
druid.zk.service.host=#{ZK_IPs}
druid.zk.paths.base=/druid/prod
@ -115,7 +115,7 @@ druid.host=#{IP_ADDR}:8080
druid.port=8080
druid.service=druid/prod/worker
druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.72","io.druid.extensions:druid-kafka-seven:0.6.72"]
druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.73","io.druid.extensions:druid-kafka-seven:0.6.73"]
druid.zk.service.host=#{ZK_IPs}
druid.zk.paths.base=/druid/prod

View File

@ -22,7 +22,7 @@ druid.storage.baseKey=sample
```
## I don't see my Druid segments on my historical nodes
You can check the coordinator console located at <COORDINATOR_IP>:<PORT>/cluster.html. Make sure that your segments have actually loaded on [historical nodes](Historical.html). If your segments are not present, check the coordinator logs for messages about capacity of replication errors. One reason that segments are not downloaded is because historical nodes have maxSizes that are too small, making them incapable of downloading more data. You can change that with (for example):
You can check the coordinator console located at `<COORDINATOR_IP>:<PORT>/cluster.html`. Make sure that your segments have actually loaded on [historical nodes](Historical.html). If your segments are not present, check the coordinator logs for messages about capacity of replication errors. One reason that segments are not downloaded is because historical nodes have maxSizes that are too small, making them incapable of downloading more data. You can change that with (for example):
```
-Ddruid.segmentCache.locations=[{"path":"/tmp/druid/storageLocation","maxSize":"500000000000"}]
@ -31,7 +31,7 @@ You can check the coordinator console located at <COORDINATOR_IP>:<PORT>/cluster
## My queries are returning empty results
You can check <BROKER_IP>:<PORT>/druid/v2/datasources/<YOUR_DATASOURCE> for the dimensions and metrics that have been created for your datasource. Make sure that the name of the aggregators you use in your query match one of these metrics. Also make sure that the query interval you specify match a valid time range where data exists.
You can check `<BROKER_IP>:<PORT>/druid/v2/datasources/<YOUR_DATASOURCE>` for the dimensions and metrics that have been created for your datasource. Make sure that the name of the aggregators you use in your query match one of these metrics. Also make sure that the query interval you specify match a valid time range where data exists.
## More information

View File

@ -27,7 +27,7 @@ druid.host=localhost
druid.service=realtime
druid.port=8083
druid.extensions.coordinates=["io.druid.extensions:druid-kafka-seven:0.6.72"]
druid.extensions.coordinates=["io.druid.extensions:druid-kafka-seven:0.6.73"]
druid.zk.service.host=localhost
@ -76,7 +76,7 @@ druid.host=#{IP_ADDR}:8080
druid.port=8080
druid.service=druid/prod/realtime
druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.72","io.druid.extensions:druid-kafka-seven:0.6.72"]
druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.73","io.druid.extensions:druid-kafka-seven:0.6.73"]
druid.zk.service.host=#{ZK_IPs}
druid.zk.paths.base=/druid/prod

View File

@ -37,7 +37,7 @@ There are several main parts to a search query:
|intervals|A JSON Object representing ISO-8601 Intervals. This defines the time ranges to run the query over.|yes|
|searchDimensions|The dimensions to run the search over. Excluding this means the search is run over all dimensions.|no|
|query|See [SearchQuerySpec](SearchQuerySpec.html).|yes|
|sort|How the results of the search should sorted. Two possible types here are "lexicographic" and "strlen".|yes|
|sort|How the results of the search should be sorted. Two possible types here are "lexicographic" and "strlen".|yes|
|context|An additional JSON Object which can be used to specify certain flags.|no|
The format of the result is:

View File

@ -15,7 +15,7 @@ Segment metadata queries return per segment information about:
{
"queryType":"segmentMetadata",
"dataSource":"sample_datasource",
"intervals":["2013-01-01/2014-01-01"],
"intervals":["2013-01-01/2014-01-01"]
}
```

View File

@ -9,7 +9,7 @@ There are several different types of tasks.
Segment Creation Tasks
----------------------
#### Index Task
### Index Task
The Index Task is a simpler variation of the Index Hadoop task that is designed to be used for smaller data sets. The task executes within the indexing service and does not require an external Hadoop setup to use. The grammar of the index task is as follows:
@ -51,15 +51,15 @@ The Index Task is a simpler variation of the Index Hadoop task that is designed
|--------|-----------|---------|
|type|The task type, this should always be "index".|yes|
|id|The task ID.|no|
|granularitySpec|See [granularitySpec](Tasks.html)|yes|
|spatialDimensions|Dimensions to build spatial indexes over. See [Spatial-Indexing](Spatial-Indexing.html)|no|
|granularitySpec|Specifies the segment chunks that the task will process. `type` is always "uniform"; `gran` sets the granularity of the chunks ("DAY" means all segments containing timestamps in the same day, while `intervals` sets the interval that the chunks will cover.|yes|
|spatialDimensions|Dimensions to build spatial indexes over. See [Geographic Queries](GeographicQueries.html).|no|
|aggregators|The metrics to aggregate in the data set. For more info, see [Aggregations](Aggregations.html)|yes|
|indexGranularity|The rollup granularity for timestamps.|no|
|targetPartitionSize|Used in sharding. Determines how many rows are in each segment.|no|
|firehose|The input source of data. For more info, see [Firehose](Firehose.html)|yes|
|rowFlushBoundary|Used in determining when intermediate persist should occur to disk.|no|
#### Index Hadoop Task
### Index Hadoop Task
The Hadoop Index Task is used to index larger data sets that require the parallelization and processing power of a Hadoop cluster.
@ -79,11 +79,11 @@ The Hadoop Index Task is used to index larger data sets that require the paralle
The Hadoop Index Config submitted as part of an Hadoop Index Task is identical to the Hadoop Index Config used by the `HadoopBatchIndexer` except that three fields must be omitted: `segmentOutputPath`, `workingPath`, `updaterJobSpec`. The Indexing Service takes care of setting these fields internally.
##### Using your own Hadoop distribution
#### Using your own Hadoop distribution
Druid is compiled against Apache hadoop-core 1.0.3. However, if you happen to use a different flavor of hadoop that is API compatible with hadoop-core 1.0.3, you should only have to change the hadoopCoordinates property to point to the maven artifact used by your distribution.
##### Resolving dependency conflicts running HadoopIndexTask
#### Resolving dependency conflicts running HadoopIndexTask
Currently, the HadoopIndexTask creates a single classpath to run the HadoopDruidIndexerJob, which can lead to version conflicts between various dependencies of Druid, extension modules, and Hadoop's own dependencies.
@ -91,7 +91,7 @@ The Hadoop index task will put Druid's dependencies first on the classpath, foll
If you are having trouble with any extensions in HadoopIndexTask, it may be the case that Druid, or one of its dependencies, depends on a different version of a library than what you are using as part of your extensions, but Druid's version overrides the one in your extension. In that case you probably want to build your own Druid version and override the offending library by adding an explicit dependency to the pom.xml of each druid sub-module that depends on it.
#### Realtime Index Task
### Realtime Index Task
The indexing service can also run real-time tasks. These tasks effectively transform a middle manager into a real-time node. We introduced real-time tasks as a way to programmatically add new real-time data sources without needing to manually add nodes. The grammar for the real-time task is as follows:
@ -169,7 +169,7 @@ For schema, fireDepartmentConfig, windowPeriod, segmentGranularity, and rejectio
Segment Merging Tasks
---------------------
#### Append Task
### Append Task
Append tasks append a list of segments together into a single segment (one after the other). The grammar is:
@ -181,7 +181,7 @@ Append tasks append a list of segments together into a single segment (one after
}
```
#### Merge Task
### Merge Task
Merge tasks merge a list of segments together. Any common timestamps are merged. The grammar is:
@ -196,7 +196,7 @@ Merge tasks merge a list of segments together. Any common timestamps are merged.
Segment Destroying Tasks
------------------------
#### Delete Task
### Delete Task
Delete tasks create empty segments with no data. The grammar is:
@ -208,7 +208,7 @@ Delete tasks create empty segments with no data. The grammar is:
}
```
#### Kill Task
### Kill Task
Kill tasks delete all information about a segment and removes it from deep storage. Killable segments must be disabled (used==0) in the Druid segment table. The available grammar is:
@ -223,7 +223,7 @@ Kill tasks delete all information about a segment and removes it from deep stora
Misc. Tasks
-----------
#### Version Converter Task
### Version Converter Task
These tasks convert segments from an existing older index version to the latest index version. The available grammar is:
@ -237,7 +237,7 @@ These tasks convert segments from an existing older index version to the latest
}
```
#### Noop Task
### Noop Task
These tasks start, sleep for a time and are used only for testing. The available grammar is:

View File

@ -9,6 +9,7 @@ TopN queries return a sorted set of results for the values in a given dimension
A topN query object looks like:
```json
{
"queryType": "topN",
"dataSource": "sample_data",
"dimension": "sample_dim",

View File

@ -49,7 +49,7 @@ There are two ways to setup Druid: download a tarball, or [Build From Source](Bu
### Download a Tarball
We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.72-bin.tar.gz). Download this file to a directory of your choosing.
We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.73-bin.tar.gz). Download this file to a directory of your choosing.
You can extract the awesomeness within by issuing:
@ -60,7 +60,7 @@ tar -zxvf druid-services-*-bin.tar.gz
Not too lost so far right? That's great! If you cd into the directory:
```
cd druid-services-0.6.72
cd druid-services-0.6.73
```
You should see a bunch of files:

View File

@ -13,7 +13,7 @@ In this tutorial, we will set up other types of Druid nodes and external depende
If you followed the first tutorial, you should already have Druid downloaded. If not, let's go back and do that first.
You can download the latest version of druid [here](http://static.druid.io/artifacts/releases/druid-services-0.6.72-bin.tar.gz)
You can download the latest version of druid [here](http://static.druid.io/artifacts/releases/druid-services-0.6.73-bin.tar.gz)
and untar the contents within by issuing:
@ -149,7 +149,7 @@ druid.port=8081
druid.zk.service.host=localhost
druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.72"]
druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.73"]
# Dummy read only AWS account (used to download example data)
druid.s3.secretKey=QyyfVZ7llSiRg6Qcrql1eEUG7buFpAK6T6engr1b
@ -240,7 +240,7 @@ druid.port=8083
druid.zk.service.host=localhost
druid.extensions.coordinates=["io.druid.extensions:druid-examples:0.6.72","io.druid.extensions:druid-kafka-seven:0.6.72"]
druid.extensions.coordinates=["io.druid.extensions:druid-examples:0.6.73","io.druid.extensions:druid-kafka-seven:0.6.73"]
# Change this config to db to hand off to the rest of the Druid cluster
druid.publish.type=noop

View File

@ -37,7 +37,7 @@ There are two ways to setup Druid: download a tarball, or [Build From Source](Bu
h3. Download a Tarball
We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.72-bin.tar.gz)
We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.73-bin.tar.gz)
Download this file to a directory of your choosing.
You can extract the awesomeness within by issuing:
@ -48,7 +48,7 @@ tar zxvf druid-services-*-bin.tar.gz
Not too lost so far right? That's great! If you cd into the directory:
```
cd druid-services-0.6.72
cd druid-services-0.6.73
```
You should see a bunch of files:

View File

@ -9,7 +9,7 @@ There are two ways to setup Druid: download a tarball, or build it from source.
h3. Download a Tarball
We've built a tarball that contains everything you'll need. You'll find it "here":http://static.druid.io/artifacts/releases/druid-services-0.6.72-bin.tar.gz.
We've built a tarball that contains everything you'll need. You'll find it "here":http://static.druid.io/artifacts/releases/druid-services-0.6.73-bin.tar.gz.
Download this bad boy to a directory of your choosing.
You can extract the awesomeness within by issuing:

View File

@ -31,20 +31,20 @@ We have more details about the general design of the system and why you might wa
When Druid?
----------
* You need to do interactive, fast, exploration of large amounts of data
* You need analytics (not key value store)
* You have a lot of data (10s of Billions of events added per day, 10s of TB of data added per day)
* You want to do your analysis on data as its happening (realtime)
* Your store needs to be always-on, 24x7x365 and years into the future.
* You need to do interactive, fast, exploration on large amounts of data
* You need analytics (not a key-value store)
* You have a lot of data (10s of billions of events added per day, 10s of TB of data added per day)
* You want to do your analysis on data as its happening (in real-time)
* You need a data store that is always available, 24x7x365, and years into the future.
Not Druid?
----------
* The amount of data you have can easily be handled by MySql
* Your querying for individual entries or doing lookups (Not Analytics)
* Batch is good enough
* Canned queries is good enough
* The amount of data you have can easily be handled by MySQL
* You're querying for individual entries or doing lookups (not analytics)
* Batch ingestion is good enough
* Canned queries are good enough
* Downtime is no big deal

View File

@ -4,7 +4,7 @@ druid.port=8081
druid.zk.service.host=localhost
druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.72"]
druid.extensions.coordinates=["io.druid.extensions:druid-s3-extensions:0.6.73"]
# Dummy read only AWS account (used to download example data)
druid.s3.secretKey=QyyfVZ7llSiRg6Qcrql1eEUG7buFpAK6T6engr1b

View File

@ -4,7 +4,7 @@ druid.port=8083
druid.zk.service.host=localhost
druid.extensions.coordinates=["io.druid.extensions:druid-examples:0.6.72","io.druid.extensions:druid-kafka-seven:0.6.72","io.druid.extensions:druid-rabbitmq:0.6.72"]
druid.extensions.coordinates=["io.druid.extensions:druid-examples:0.6.73","io.druid.extensions:druid-kafka-seven:0.6.73","io.druid.extensions:druid-rabbitmq:0.6.73"]
# Change this config to db to hand off to the rest of the Druid cluster
druid.publish.type=noop

View File

@ -28,7 +28,7 @@
<parent>
<groupId>io.druid</groupId>
<artifactId>druid</artifactId>
<version>0.6.73-SNAPSHOT</version>
<version>0.6.74-SNAPSHOT</version>
</parent>
<dependencies>

View File

@ -28,7 +28,7 @@
<parent>
<groupId>io.druid</groupId>
<artifactId>druid</artifactId>
<version>0.6.73-SNAPSHOT</version>
<version>0.6.74-SNAPSHOT</version>
</parent>
<dependencies>

View File

@ -28,7 +28,7 @@
<parent>
<groupId>io.druid</groupId>
<artifactId>druid</artifactId>
<version>0.6.73-SNAPSHOT</version>
<version>0.6.74-SNAPSHOT</version>
</parent>
<dependencies>

View File

@ -28,7 +28,7 @@
<parent>
<groupId>io.druid</groupId>
<artifactId>druid</artifactId>
<version>0.6.73-SNAPSHOT</version>
<version>0.6.74-SNAPSHOT</version>
</parent>
<dependencies>

View File

@ -28,7 +28,7 @@
<parent>
<groupId>io.druid</groupId>
<artifactId>druid</artifactId>
<version>0.6.73-SNAPSHOT</version>
<version>0.6.74-SNAPSHOT</version>
</parent>
<dependencies>

View File

@ -28,7 +28,7 @@
<parent>
<groupId>io.druid</groupId>
<artifactId>druid</artifactId>
<version>0.6.73-SNAPSHOT</version>
<version>0.6.74-SNAPSHOT</version>
</parent>
<dependencies>

View File

@ -28,7 +28,7 @@
<parent>
<groupId>io.druid</groupId>
<artifactId>druid</artifactId>
<version>0.6.73-SNAPSHOT</version>
<version>0.6.74-SNAPSHOT</version>
</parent>
<dependencies>

View File

@ -23,14 +23,14 @@
<groupId>io.druid</groupId>
<artifactId>druid</artifactId>
<packaging>pom</packaging>
<version>0.6.73-SNAPSHOT</version>
<version>0.6.74-SNAPSHOT</version>
<name>druid</name>
<description>druid</description>
<scm>
<connection>scm:git:ssh://git@github.com/metamx/druid.git</connection>
<developerConnection>scm:git:ssh://git@github.com/metamx/druid.git</developerConnection>
<url>http://www.github.com/metamx/druid</url>
<tag>druid-0.6.72-SNAPSHOT</tag>
<tag>druid-0.6.73-SNAPSHOT</tag>
</scm>
<prerequisites>
@ -94,7 +94,7 @@
<dependency>
<groupId>com.metamx</groupId>
<artifactId>server-metrics</artifactId>
<version>0.0.5</version>
<version>0.0.9</version>
</dependency>
<dependency>

View File

@ -28,7 +28,7 @@
<parent>
<groupId>io.druid</groupId>
<artifactId>druid</artifactId>
<version>0.6.73-SNAPSHOT</version>
<version>0.6.74-SNAPSHOT</version>
</parent>
<dependencies>

View File

@ -34,4 +34,16 @@ public class AllColumnIncluderator implements ColumnIncluderator
{
return ALL_CACHE_PREFIX;
}
@Override
public boolean equals(Object obj)
{
return obj instanceof AllColumnIncluderator;
}
@Override
public int hashCode()
{
return AllColumnIncluderator.class.hashCode();
}
}

View File

@ -21,7 +21,9 @@ package io.druid.query.metadata.metadata;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.Preconditions;
import io.druid.query.BaseQuery;
import io.druid.query.DataSource;
import io.druid.query.Query;
import io.druid.query.TableDataSource;
import io.druid.query.spec.QuerySegmentSpec;
@ -36,17 +38,18 @@ public class SegmentMetadataQuery extends BaseQuery<SegmentAnalysis>
@JsonCreator
public SegmentMetadataQuery(
@JsonProperty("dataSource") String dataSource,
@JsonProperty("dataSource") DataSource dataSource,
@JsonProperty("intervals") QuerySegmentSpec querySegmentSpec,
@JsonProperty("toInclude") ColumnIncluderator toInclude,
@JsonProperty("merge") Boolean merge,
@JsonProperty("context") Map<String, String> context
)
{
super(new TableDataSource(dataSource), querySegmentSpec, context);
super(dataSource, querySegmentSpec, context);
this.toInclude = toInclude == null ? new AllColumnIncluderator() : toInclude;
this.merge = merge == null ? false : merge;
Preconditions.checkArgument(dataSource instanceof TableDataSource, "SegmentMetadataQuery only supports table datasource");
}
@JsonProperty
@ -77,7 +80,7 @@ public class SegmentMetadataQuery extends BaseQuery<SegmentAnalysis>
public Query<SegmentAnalysis> withOverriddenContext(Map<String, String> contextOverride)
{
return new SegmentMetadataQuery(
((TableDataSource)getDataSource()).getName(),
getDataSource(),
getQuerySegmentSpec(), toInclude, merge, computeOverridenContext(contextOverride)
);
}
@ -86,7 +89,7 @@ public class SegmentMetadataQuery extends BaseQuery<SegmentAnalysis>
public Query<SegmentAnalysis> withQuerySegmentSpec(QuerySegmentSpec spec)
{
return new SegmentMetadataQuery(
((TableDataSource)getDataSource()).getName(),
getDataSource(),
spec, toInclude, merge, getContext());
}

View File

@ -21,6 +21,7 @@ package io.druid.query.metadata;
import com.google.common.collect.Lists;
import com.metamx.common.guava.Sequences;
import io.druid.query.LegacyDataSource;
import io.druid.query.QueryRunner;
import io.druid.query.QueryRunnerFactory;
import io.druid.query.QueryRunnerTestHelper;
@ -98,7 +99,7 @@ public class SegmentAnalyzerTest
);
final SegmentMetadataQuery query = new SegmentMetadataQuery(
"test", QuerySegmentSpecs.create("2011/2012"), null, null, null
new LegacyDataSource("test"), QuerySegmentSpecs.create("2011/2012"), null, null, null
);
return Sequences.toList(query.run(runner), Lists.<SegmentAnalysis>newArrayList());
}

View File

@ -0,0 +1,51 @@
/*
* Druid - a distributed column store.
* Copyright (C) 2012, 2013 Metamarkets Group Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
package io.druid.query.metadata;
import com.fasterxml.jackson.databind.ObjectMapper;
import io.druid.jackson.DefaultObjectMapper;
import io.druid.query.Query;
import io.druid.query.metadata.metadata.SegmentMetadataQuery;
import org.joda.time.Interval;
import org.junit.Assert;
import org.junit.Test;
public class SegmentMetadataQueryTest
{
private ObjectMapper mapper = new DefaultObjectMapper();
@Test
public void testSerde() throws Exception
{
String queryStr = "{\n"
+ " \"queryType\":\"segmentMetadata\",\n"
+ " \"dataSource\":\"test_ds\",\n"
+ " \"intervals\":[\"2013-12-04T00:00:00.000Z/2013-12-05T00:00:00.000Z\"]\n"
+ "}";
Query query = mapper.readValue(queryStr, Query.class);
Assert.assertTrue(query instanceof SegmentMetadataQuery);
Assert.assertEquals("test_ds", query.getDataSource().getName());
Assert.assertEquals(new Interval("2013-12-04T00:00:00.000Z/2013-12-05T00:00:00.000Z"), query.getIntervals().get(0));
// test serialize and deserialize
Assert.assertEquals(query, mapper.readValue(mapper.writeValueAsString(query), Query.class));
}
}

View File

@ -1,7 +1,18 @@
all : druid.pdf
all : druid
druid : druid.pdf
sigmod : sgmd0658-yang.pdf
zip : sgmd0658-yang.zip
%.zip : %.pdf
@rm -f dummy.ps
@touch dummy.ps
zip $@ $*.pdf $*.tex dummy.ps
clean :
@rm -f *.aux *.bbl *.blg *.log
@rm -f *.aux *.bbl *.blg *.log dummy.ps *.zip
%.tex : %.bib

Binary file not shown.

View File

@ -1,4 +1,18 @@
\documentclass{acm_proc_article-sp}
\documentclass{sig-alternate-2013}
\newfont{\mycrnotice}{ptmr8t at 7pt}
\newfont{\myconfname}{ptmri8t at 7pt}
\let\crnotice\mycrnotice%
\let\confname\myconfname%
\permission{Permission to make digital or hard copies of all or part of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for components of this work owned by others than the author(s) must be honored. Abstracting with credit is permitted. To copy otherwise, or republish, to post on servers or to redistribute to lists, requires prior specific permission and/or a fee. Request permissions from permissions@acm.org.}
\conferenceinfo{SIGMOD/PODS'14,}{June 22--27, 2014, Salt Lake City, UT, USA. \\
{\mycrnotice{Copyright is held by the owner/author(s). Publication rights licensed to ACM.}}}
\copyrightetc{ACM \the\acmcopyr}
\crdata{978-1-4503-2376-5/14/06\ ...\$15.00.\\
\href{http://dx.doi.org/10.1145/2588555.2595631}{http://dx.doi.org/10.1145/2588555.2595631}}
\clubpenalty=10000
\widowpenalty = 10000
\usepackage{graphicx}
\usepackage{balance}
\usepackage{fontspec}
@ -7,28 +21,48 @@
\graphicspath{{figures/}}
\usepackage{enumitem}
\hyphenation{metamarkets nelson}
\hyphenation{metamarkets nelson cheddar}
\begin{document}
% ****************** TITLE ****************************************
\title{Druid: A Real-time Analytical Data Store}
\title{Druid}
\subtitle{A Real-time Analytical Data Store}
% ****************** AUTHORS **************************************
\numberofauthors{6}
\author{
\alignauthor Fangjin Yang, Eric Tschetter, Xavier Léauté, Nelson Ray, Gian Merlino, Deep Ganguli\\
\email{\{fangjin, cheddar, xavier, nelson, gian, deep\}@metamarkets.com}
\alignauthor Fangjin Yang\\
\affaddr{Metamarkets Group, Inc.}\\
% \affaddr{625 2nd St Suite 230}\\
% \affaddr{San Francisco, CA 94107}\\
\email{fangjin@metamarkets.com}
\alignauthor Eric Tschetter\\
\email{echeddar@gmail.com}
\alignauthor Xavier Léauté\\
\affaddr{Metamarkets Group, Inc.}\\
\email{xavier@metamarkets.com}
\and
\alignauthor Nelson Ray\\
\email{ncray86@gmail.com}
\alignauthor Gian Merlino\\
\affaddr{Metamarkets Group, Inc.}\\
\email{gian@metamarkets.com}
\alignauthor Deep Ganguli\\
\affaddr{Metamarkets Group, Inc.}\\
\email{deep@metamarkets.com}
}
\date{21 March 2013}
% ****************** AUTHORS **************************************
\maketitle
\begin{abstract}
Druid is an open
source\footnote{\href{https://github.com/metamx/druid}{https://github.com/metamx/druid}}
source\footnote{\href{http://druid.io/}{http://druid.io/} \href{https://github.com/metamx/druid}{https://github.com/metamx/druid}}
data store designed for real-time exploratory analytics on large data sets.
The system combines a column-oriented storage layout, a distributed,
shared-nothing architecture, and an advanced indexing structure to allow for
@ -37,13 +71,19 @@ this paper, we describe Druid's architecture, and detail how it supports fast
aggregations, flexible filters, and low latency data ingestion.
\end{abstract}
% A category with the (minimum) three required fields
\category{H.2.4}{Database Management}{Systems}[Distributed databases]
% \category{D.2.8}{Software Engineering}{Metrics}[complexity measures, performance measures]
\keywords{distributed; real-time; fault-tolerant; analytics; OLAP; columnar}
\section{Introduction}
In recent years, the proliferation of internet technology has
created a surge in machine-generated events. Individually, these
events contain minimal useful information and are of low value. Given the
time and resources required to extract meaning from large collections of
events, many companies were willing to discard this data instead. Although
infrastructure has been built to handle event based data (e.g. IBM's
infrastructure has been built to handle event-based data (e.g. IBM's
Netezza\cite{singh2011introduction}, HP's Vertica\cite{bear2012vertica}, and EMC's
Greenplum\cite{miner2012unified}), they are largely sold at high price points
and are only targeted towards those companies who can afford the offering.
@ -146,7 +186,7 @@ Relational Database Management Systems (RDBMS) and NoSQL key/value stores were
unable to provide a low latency data ingestion and query platform for
interactive applications \cite{tschetter2011druid}. In the early days of
Metamarkets, we were focused on building a hosted dashboard that would allow
users to arbitrary explore and visualize event streams. The data store
users to arbitrarily explore and visualize event streams. The data store
powering the dashboard needed to return queries fast enough that the data
visualizations built on top of it could provide users with an interactive
experience.
@ -187,7 +227,7 @@ Figure~\ref{fig:cluster}.
\begin{figure*}
\centering
\includegraphics[width = 4.5in]{cluster}
\includegraphics[width = 4.51in]{cluster}
\caption{An overview of a Druid cluster and the flow of data through the cluster.}
\label{fig:cluster}
\end{figure*}
@ -198,7 +238,7 @@ Figure~\ref{fig:cluster}.
Real-time nodes encapsulate the functionality to ingest and query event
streams. Events indexed via these nodes are immediately available for querying.
The nodes are only concerned with events for some small time range and
periodically hand off immutable batches of events they've collected over this
periodically hand off immutable batches of events they have collected over this
small time range to other nodes in the Druid cluster that are specialized in
dealing with batches of immutable events. Real-time nodes leverage Zookeeper
\cite{hunt2010zookeeper} for coordination with the rest of the Druid cluster.
@ -789,7 +829,7 @@ approximately 10TB of segments loaded. Collectively,
there are about 50 billion Druid rows in this tier. Results for
every data source are not shown.
\item The hot tier uses Xeon E5-2670 processors and consists of 1302 processing
\item The hot tier uses Intel Xeon E5-2670 processors and consists of 1302 processing
threads and 672 total cores (hyperthreaded).
\item A memory-mapped storage engine was used (the machine was configured to
@ -828,7 +868,7 @@ comparison, we also provide the results of the same queries using MySQL using th
MyISAM engine (InnoDB was slower in our experiments).
We selected MySQL to benchmark
against because of its universal popularity. We choose not to select another
against because of its universal popularity. We chose not to select another
open source column store because we were not confident we could correctly tune
it for optimal performance.
@ -871,6 +911,7 @@ well.
\begin{figure}
\centering
\includegraphics[width = 2.3in]{tpch_scaling}
\includegraphics[width = 2.3in]{tpch_scaling_factor}
\caption{Druid scaling benchmarks -- 100GB TPC-H data.}
\label{fig:tpch_scaling}
\end{figure}
@ -933,9 +974,9 @@ running an Amazon \texttt{cc2.8xlarge} instance.
\label{fig:ingestion_rate}
\end{figure}
The latency measurements we presented are sufficient to address the our stated
The latency measurements we presented are sufficient to address the stated
problems of interactivity. We would prefer the variability in the latencies to
be less. It is still very possible to possible to decrease latencies by adding
be less. It is still very possible to decrease latencies by adding
additional hardware, but we have not chosen to do so because infrastructure
costs are still a consideration to us.
@ -1017,7 +1058,7 @@ data centers as well. The tier configuration in Druid coordinator nodes allow
for segments to be replicated across multiple tiers. Hence, segments can be
exactly replicated across historical nodes in multiple data centers.
Similarily, query preference can be assigned to different tiers. It is possible
to have nodes in one data center act as a primary cluster (and recieve all
to have nodes in one data center act as a primary cluster (and receive all
queries) and have a redundant cluster in another data center. Such a setup may
be desired if one data center is situated much closer to users.

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 35 KiB

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 53 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 51 KiB

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 35 KiB

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 43 KiB

View File

@ -0,0 +1 @@
druid.bib

Binary file not shown.

View File

@ -0,0 +1 @@
druid.tex

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@ -9,7 +9,7 @@
<parent>
<groupId>io.druid</groupId>
<artifactId>druid</artifactId>
<version>0.6.73-SNAPSHOT</version>
<version>0.6.74-SNAPSHOT</version>
</parent>
<dependencies>

View File

@ -28,7 +28,7 @@
<parent>
<groupId>io.druid</groupId>
<artifactId>druid</artifactId>
<version>0.6.73-SNAPSHOT</version>
<version>0.6.74-SNAPSHOT</version>
</parent>
<dependencies>

View File

@ -28,7 +28,7 @@
<parent>
<groupId>io.druid</groupId>
<artifactId>druid</artifactId>
<version>0.6.73-SNAPSHOT</version>
<version>0.6.74-SNAPSHOT</version>
</parent>
<dependencies>

View File

@ -36,7 +36,6 @@ import io.druid.curator.CuratorModule;
import io.druid.curator.discovery.DiscoveryModule;
import io.druid.guice.AWSModule;
import io.druid.guice.AnnouncerModule;
import io.druid.guice.LocalDataStorageDruidModule;
import io.druid.guice.DbConnectorModule;
import io.druid.guice.DruidGuiceExtensions;
import io.druid.guice.DruidProcessingModule;
@ -47,6 +46,7 @@ import io.druid.guice.IndexingServiceDiscoveryModule;
import io.druid.guice.JacksonConfigManagerModule;
import io.druid.guice.JsonConfigProvider;
import io.druid.guice.LifecycleModule;
import io.druid.guice.LocalDataStorageDruidModule;
import io.druid.guice.QueryRunnerFactoryModule;
import io.druid.guice.QueryableModule;
import io.druid.guice.ServerModule;
@ -107,9 +107,10 @@ public class Initialization
/**
* @param clazz Module class
* @param <T>
*
* @return Returns the set of modules loaded.
*/
public static<T> Set<T> getLoadedModules(Class<T> clazz)
public static <T> Set<T> getLoadedModules(Class<T> clazz)
{
Set<T> retVal = extensionsMap.get(clazz);
if (retVal == null) {
@ -190,7 +191,9 @@ public class Initialization
)
);
try {
final List<Artifact> artifacts = aether.resolveArtifacts(dependencyRequest);
List<URL> urls = Lists.newArrayListWithExpectedSize(artifacts.size());
for (Artifact artifact : artifacts) {
if (!exclusions.contains(artifact.getGroupId())) {
@ -207,6 +210,11 @@ public class Initialization
loader = new URLClassLoader(urls.toArray(new URL[urls.size()]), Initialization.class.getClassLoader());
loadersMap.put(coordinate, loader);
}
catch (Exception e) {
log.error(e, "Unable to resolve artifacts for [%s].", dependencyRequest);
throw Throwables.propagate(e);
}
}
return loader;
}
@ -232,9 +240,9 @@ public class Initialization
URI u = new URI(uri);
Repository r = new Repository(uri);
if(u.getUserInfo() != null) {
if (u.getUserInfo() != null) {
String[] auth = u.getUserInfo().split(":", 2);
if(auth.length == 2) {
if (auth.length == 2) {
r.setUsername(auth[0]);
r.setPassword(auth[1]);
} else {
@ -247,7 +255,7 @@ public class Initialization
}
remoteRepositories.add(r);
}
catch(URISyntaxException e) {
catch (URISyntaxException e) {
throw Throwables.propagate(e);
}
}
@ -261,7 +269,8 @@ public class Initialization
PrintStream oldOut = System.out;
try {
System.setOut(new PrintStream(
System.setOut(
new PrintStream(
new OutputStream()
{
@Override
@ -282,7 +291,8 @@ public class Initialization
}
}
));
)
);
return new DefaultTeslaAether(
config.getLocalRepository(),
remoteRepositories.toArray(new Repository[remoteRepositories.size()])

View File

@ -27,7 +27,7 @@
<parent>
<groupId>io.druid</groupId>
<artifactId>druid</artifactId>
<version>0.6.73-SNAPSHOT</version>
<version>0.6.74-SNAPSHOT</version>
</parent>
<dependencies>

View File

@ -54,7 +54,7 @@ import java.util.List;
*/
@Command(
name = "broker",
description = "Runs a broker node, see http://druid.io/docs/0.6.72/Broker.html for a description"
description = "Runs a broker node, see http://druid.io/docs/0.6.73/Broker.html for a description"
)
public class CliBroker extends ServerRunnable
{

View File

@ -66,7 +66,7 @@ import java.util.List;
*/
@Command(
name = "coordinator",
description = "Runs the Coordinator, see http://druid.io/docs/0.6.72/Coordinator.html for a description."
description = "Runs the Coordinator, see http://druid.io/docs/0.6.73/Coordinator.html for a description."
)
public class CliCoordinator extends ServerRunnable
{

View File

@ -41,7 +41,7 @@ import java.util.List;
*/
@Command(
name = "hadoop",
description = "Runs the batch Hadoop Druid Indexer, see http://druid.io/docs/0.6.72/Batch-ingestion.html for a description."
description = "Runs the batch Hadoop Druid Indexer, see http://druid.io/docs/0.6.73/Batch-ingestion.html for a description."
)
public class CliHadoopIndexer implements Runnable
{

View File

@ -42,7 +42,7 @@ import java.util.List;
*/
@Command(
name = "historical",
description = "Runs a Historical node, see http://druid.io/docs/0.6.72/Historical.html for a description"
description = "Runs a Historical node, see http://druid.io/docs/0.6.73/Historical.html for a description"
)
public class CliHistorical extends ServerRunnable
{

View File

@ -93,7 +93,7 @@ import java.util.List;
*/
@Command(
name = "overlord",
description = "Runs an Overlord node, see http://druid.io/docs/0.6.72/Indexing-Service.html for a description"
description = "Runs an Overlord node, see http://druid.io/docs/0.6.73/Indexing-Service.html for a description"
)
public class CliOverlord extends ServerRunnable
{

View File

@ -30,7 +30,7 @@ import java.util.List;
*/
@Command(
name = "realtime",
description = "Runs a realtime node, see http://druid.io/docs/0.6.72/Realtime.html for a description"
description = "Runs a realtime node, see http://druid.io/docs/0.6.73/Realtime.html for a description"
)
public class CliRealtime extends ServerRunnable
{

View File

@ -42,7 +42,7 @@ import java.util.concurrent.Executor;
*/
@Command(
name = "realtime",
description = "Runs a standalone realtime node for examples, see http://druid.io/docs/0.6.72/Realtime.html for a description"
description = "Runs a standalone realtime node for examples, see http://druid.io/docs/0.6.73/Realtime.html for a description"
)
public class CliRealtimeExample extends ServerRunnable
{