Merge pull request #266 from metamx/more-docs

a ton of fixes to docs
This commit is contained in:
fjy 2013-10-10 15:05:34 -07:00
commit 38e9ff5984
24 changed files with 767 additions and 724 deletions

View File

@ -23,6 +23,7 @@ import com.google.common.base.Supplier;
import com.google.common.base.Throwables;
import com.google.common.collect.Maps;
import com.google.inject.Inject;
import com.metamx.common.ISE;
import com.metamx.common.concurrent.ScheduledExecutors;
import com.metamx.common.lifecycle.LifecycleStart;
import com.metamx.common.lifecycle.LifecycleStop;
@ -38,6 +39,7 @@ import org.skife.jdbi.v2.tweak.ResultSetMapper;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentMap;
@ -76,7 +78,7 @@ public class ConfigManager
final String configTable = dbTables.get().getConfigTable();
this.selectStatement = String.format("SELECT payload FROM %s WHERE name = :name", configTable);
insertStatement = String.format(
this.insertStatement = String.format(
"INSERT INTO %s (name, payload) VALUES (:name, :payload) ON DUPLICATE KEY UPDATE payload = :payload",
configTable
);
@ -186,19 +188,29 @@ public class ConfigManager
@Override
public byte[] withHandle(Handle handle) throws Exception
{
return handle.createQuery(selectStatement)
.bind("name", key)
.map(
new ResultSetMapper<byte[]>()
{
@Override
public byte[] map(int index, ResultSet r, StatementContext ctx) throws SQLException
{
return r.getBytes("payload");
}
}
)
.first();
List<byte[]> matched = handle.createQuery(selectStatement)
.bind("name", key)
.map(
new ResultSetMapper<byte[]>()
{
@Override
public byte[] map(int index, ResultSet r, StatementContext ctx)
throws SQLException
{
return r.getBytes("payload");
}
}
).list();
if (matched.isEmpty()) {
return null;
}
if (matched.size() > 1) {
throw new ISE("Error! More than one matching entry[%d] found for [%s]?!", matched.size(), key);
}
return matched.get(0);
}
}
);

View File

@ -11,9 +11,6 @@
<a href="mailto:info@druid.io">info@druid.io</a>
</address>
<address>
<strong>Metamarkets</strong>
625 2nd Street, Suite #230<br>
San Francisco, CA 94017<br>
<div class="soc">
<a href="https://twitter.com/druidio"></a>
<a href="https://github.com/metamx/druid" class="github"></a>
@ -25,7 +22,7 @@
<li><a href="/"><strong>DRUID</strong></a></li>
<li><a href="/druid.html">What is Druid?</a></li>
<li><a href="/downloads.html">Downloads</a></li>
<li><a target="_blank" href="https://github.com/metamx/druid/wiki">Documentation</a></li>
<li><a target="_blank" href="Home.html">Documentation</a></li>
</ul>
<ul class="col-md-4 list-unstyled">
<li><a href="/community.html"><strong>SUPPORT</strong></a></li>

View File

@ -3,7 +3,7 @@ layout: doc_page
---
# Booting a Single Node Cluster #
[Loading Your Data](Loading-Your-Data.html) and [Querying Your Data](Querying-Your-Data.html) contain recipes to boot a small druid cluster on localhost. Here we will boot a small cluster on EC2. You can checkout the code, or download a tarball from [here](http://static.druid.io/artifacts/druid-services-0.6.0-bin.tar.gz).
[Loading Your Data](Tutorial%3A-Loading-Your-Data-Part-2.html) and [All About Queries](Tutorial%3A-All-About-Queries.html) contain recipes to boot a small druid cluster on localhost. Here we will boot a small cluster on EC2. You can checkout the code, or download a tarball from [here](http://static.druid.io/artifacts/druid-services-0.6.0-bin.tar.gz).
The [ec2 run script](https://github.com/metamx/druid/blob/master/examples/bin/run_ec2.sh), run_ec2.sh, is located at 'examples/bin' if you have checked out the code, or at the root of the project if you've downloaded a tarball. The scripts rely on the [Amazon EC2 API Tools](http://aws.amazon.com/developertools/351), and you will need to set three environment variables:

View File

@ -0,0 +1,105 @@
---
layout: doc_page
---
A Druid cluster consists of various node types that need to be set up depending on your use case. See our [[Design]] docs for a description of the different node types.
h2. Setup Scripts
One of our community members, "housejester":https://github.com/housejester/, contributed some scripts to help with setting up a cluster. Checkout the "github":https://github.com/housejester/druid-test-harness and "wiki":https://github.com/housejester/druid-test-harness/wiki/Druid-Test-Harness.
h2. Minimum Physical Layout: Absolute Minimum
As a special case, the absolute minimum setup is one of the standalone examples for realtime ingestion and querying (see [[Examples]]), which can easily run on one machine with one core and 1GB of RAM. This layout can be set up to try some basic queries with Druid.
h2. Minimum Physical Layout: Experimental Testing with 4GB of RAM
This layout can be used to load some data from deep storage onto a Druid compute node for the first time. A minimal physical layout for a 1 or 2 core machine with 4GB of RAM is:
# node1: [[Master]] + metadata service + zookeeper + [[Compute]]
# transient nodes: indexer
This setup is only reasonable to prove that a configuration works. It would not be worthwhile to use this layout for performance measurement.
h2. Comfortable Physical Layout: Pilot Project with Multiple Machines
_The machine size "flavors" use AWS/EC2 terminology for descriptive purposes only and are not meant to imply that AWS/EC2 is required or recommended. Another cloud provider or your own hardware can also work._
A minimal physical layout, not constrained by cores, that demonstrates parallel querying and realtime ingestion, using AWS EC2 "small"/m1.small (one core, with 1.7GB of RAM) or larger instances, is:
# node1: [[Master]] (m1.small)
# node2: metadata service (m1.small)
# node3: zookeeper (m1.small)
# node4: [[Broker]] (m1.small or m1.medium or m1.large)
# node5: [[Compute]] (m1.small or m1.medium or m1.large)
# node6: [[Compute]] (m1.small or m1.medium or m1.large)
# node7: [[Realtime]] (m1.small or m1.medium or m1.large)
# transient nodes: indexer
This layout naturally lends itself to adding more RAM and cores to Compute nodes, and to adding many more Compute nodes. Depending on the actual load, the Master, metadata server, and Zookeeper might need to use larger machines.
h2. High Availability Physical Layout
_The machine size "flavors" use AWS/EC2 terminology for descriptive purposes only and are not meant to imply that AWS/EC2 is required or recommended. Another cloud provider or your own hardware can also work._
An HA layout allows full rolling restarts and heavy volume:
# node1: [[Master]] (m1.small or m1.medium or m1.large)
# node2: [[Master]] (m1.small or m1.medium or m1.large) (backup)
# node3: metadata service (c1.medium or m1.large)
# node4: metadata service (c1.medium or m1.large) (backup)
# node5: zookeeper (c1.medium)
# node6: zookeeper (c1.medium)
# node7: zookeeper (c1.medium)
# node8: [[Broker]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge)
# node9: [[Broker]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge) (backup)
# node10: [[Compute]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge)
# node11: [[Compute]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge)
# node12: [[Realtime]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge)
# transient nodes: indexer
h2. Sizing for Cores and RAM
The Compute and Broker nodes will use as many cores as are available, depending on usage, so it is best to keep these on dedicated machines. The upper limit of effectively utilized cores is not well characterized yet and would depend on types of queries, query load, and the schema. Compute daemons should have a heap size of at least 1GB per core for normal usage, but could be squeezed into a smaller heap for testing. Since in-memory caching is essential for good performance, even more RAM is better. Broker nodes will use RAM for caching, so they do more than just route queries.
The effective utilization of cores by Zookeeper, MySQL, and Master nodes is likely to be between 1 and 2 for each process/daemon, so these could potentially share a machine with lots of cores. These daemons work with a heap size between 500MB and 1GB.
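As a rough illustration of the 1GB-per-core guideline, a hypothetical 8-core Compute machine could be launched with a heap along these lines; the exact @-Xmx@ value is an assumption to adapt to your own hardware, not a tuned recommendation (see the Build/Run section below for where the jar comes from):
<pre>
<code>
# Hypothetical sizing for an 8-core Compute machine: roughly 1GB of heap per core.
java -Xmx8g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -cp compute/:druid-services/target/druid-services-*-selfcontained.jar com.metamx.druid.http.ComputeMain
</code>
</pre>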
h2. Storage
Indexed segments should be kept in a permanent store accessible by all nodes, such as AWS S3, HDFS, or an equivalent. Currently Druid supports S3, but this will be extended soon.
Local disk ("ephemeral" on AWS EC2) for caching is recommended over network-mounted storage (for example, AWS EBS, Elastic Block Store) in order to avoid network delays during times of heavy usage. If your data center is suitably provisioned for networked storage, perhaps with separate LAN/NICs just for storage, then mounted storage might work fine.
h2. Setup
Setting up a cluster is essentially just firing up all of the nodes you want with the proper [[configuration]]. One thing to be aware of is that there are a few properties in the configuration that potentially need to be set individually for each process:
<pre>
<code>
druid.server.type=historical|realtime
druid.host=someHostOrIPaddrWithPort
druid.port=8080
</code>
</pre>
@druid.server.type@ should be set to "historical" for your compute nodes and "realtime" for the realtime nodes. The master will only assign segments to a "historical" node, and the broker has some intelligence around its ability to cache results when talking to a realtime node. This property does not need to be set for the master or the broker.
@druid.host@ should be set to the hostname and port that can be used to talk to the given server process. Basically, someone should be able to send a request to http://${druid.host}/ and actually talk to the process.
@druid.port@ should be set to the port that the server should listen on. In the vast majority of cases, this port should be the same as what is on @druid.host@.
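As a concrete sketch, the per-process settings for a single Compute (historical) node might be written like this, assuming the process reads a @runtime.properties@ from a @compute/@ directory on its classpath as in the Build/Run example below; the host and port values are placeholders for your environment:
<pre>
<code>
# Hypothetical per-process settings for one Compute (historical) node.
# Replace the host/port placeholders with values reachable by the other nodes.
cat > compute/runtime.properties <<'EOF'
druid.server.type=historical
druid.host=10.0.0.12:8080
druid.port=8080
EOF
</code>
</pre>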
h2. Build/Run
The simplest way to build and run from the repository is to run @mvn package@ from the base directory and then take @druid-services/target/druid-services-*-selfcontained.jar@ and push that around to your machines; the jar does not need to be expanded, and since it contains the main() methods for each kind of service, it is *not* invoked with java -jar. It can be run from a normal java command-line by just including it on the classpath and then giving it the main class that you want to run. For example one instance of the Compute node/service can be started like this:
<pre>
<code>
java -Duser.timezone=UTC -Dfile.encoding=UTF-8 -cp compute/:druid-services/target/druid-services-*-selfcontained.jar com.metamx.druid.http.ComputeMain
</code>
</pre>
The following table shows the possible services and fully qualified class for main().
|_. service |_. main class |
| [[ Realtime ]] | com.metamx.druid.realtime.RealtimeMain |
| [[ Master ]] | com.metamx.druid.http.MasterMain |
| [[ Broker ]] | com.metamx.druid.http.BrokerMain |
| [[ Compute ]] | com.metamx.druid.http.ComputeMain |
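Any of the other services can be launched with the same command pattern as the Compute example above, swapping in the main class from this table; for instance, a Broker might hypothetically be started with (the @broker/@ config directory is a placeholder):
<pre>
<code>
# Same launch pattern as the Compute example, using the Broker's main class from the table above.
java -Duser.timezone=UTC -Dfile.encoding=UTF-8 -cp broker/:druid-services/target/druid-services-*-selfcontained.jar com.metamx.druid.http.BrokerMain
</code>
</pre>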

View File

@ -2,9 +2,8 @@
layout: doc_page
---
Druid is an open-source analytics datastore designed for realtime, exploratory, queries on large-scale data sets (100s of Billions entries, 100s TB data). Druid provides for cost effective, always-on, realtime data ingestion and arbitrary data exploration.
Druid is an open-source analytics data store designed for real-time, exploratory queries on large-scale data sets (100s of billions of entries, 100s of TB of data). Druid provides cost-effective, always-on, real-time data ingestion and arbitrary data exploration.
- Check out some [Examples](Examples.html)
- Try out Druid with our Getting Started [Tutorial](./Tutorial%3A-A-First-Look-at-Druid.html)
- Learn more by reading the [White Paper](http://static.druid.io/docs/druid.pdf)

View File

@ -6,7 +6,7 @@ The indexing service is a highly-available, distributed service that runs indexi
The indexing service is composed of three main components: a peon component that can run a single task, a middle manager component that manages peons, and an overlord component that manages task distribution to middle managers.
Overlords and middle managers may run on the same node or across multiple nodes while middle managers and peons always run on the same node.
Most Basic Getting Started Configuration
Quick Start
----------------------------------------
Run:
@ -149,7 +149,7 @@ http://<COORDINATOR_IP>:<port>/druid/indexer/v1/worker/setup
A sample worker setup spec is shown below:
```
```json
{
"minVersion":"some_version",
"minNumWorkers":"0",

View File

@ -1,293 +0,0 @@
---
layout: doc_page
---
# Setup #
Before we start querying druid, we're going to finish setting up a complete cluster on localhost. In [Loading Your Data](Loading-Your-Data.html) we setup a [Realtime](Realtime.html), [Historical](Historical.html) and [Coordinator](Coordinator.html) node. If you've already completed that tutorial, you need only follow the directions for 'Booting a Broker Node'.
## Booting a Broker Node ##
1. Setup a config file at config/broker/runtime.properties that looks like this:
```
druid.host=localhost
druid.service=broker
druid.port=8080
druid.zk.service.host=localhost
```
2. Run the broker node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/broker io.druid.cli.Main server broker
```
With the Broker node and the other Druid node types up and running, you have a fully functional Druid cluster and are ready to query your data!
# Querying Your Data #
Now that we have a complete cluster setup on localhost, we need to load data. To do so, refer to [Loading Your Data](Loading-Your-Data.html). Having done that, it's time to query our data! For a complete specification of queries, see [Querying](Querying.html).
## Querying Different Nodes ##
As a shared-nothing system, there are three ways to query druid, against the [Realtime](Realtime.html), [Historical](Historical.html) or [Broker](Broker.html) node. Querying a Realtime node returns only realtime data, querying a historical node returns only historical segments. Querying the broker may query both realtime and historical segments and compose an overall result for the query. This is the normal mode of operation for queries in Druid.
### Construct a Query ###
For constructing this query, see: Querying against the realtime.spec
```json
{
"queryType": "groupBy",
"dataSource": "druidtest",
"granularity": "all",
"dimensions": [],
"aggregations": [
{"type": "count", "name": "rows"},
{"type": "longSum", "name": "imps", "fieldName": "impressions"},
{"type": "doubleSum", "name": "wp", "fieldName": "wp"}
],
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
### Querying the Realtime Node ###
Run our query against port 8080:
```bash
curl -X POST "http://localhost:8080/druid/v2/?pretty" -H 'content-type: application/json' -d @query.body
```
See our result:
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : { "imps" : 5, "wp" : 15000.0, "rows" : 5 }
} ]
```
### Querying the Historical node ###
Run the query against port 8082:
```bash
curl -X POST "http://localhost:8082/druid/v2/?pretty" -H 'content-type: application/json' -d @query.body
```
And get (similar to):
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : { "imps" : 27, "wp" : 77000.0, "rows" : 9 }
} ]
```
### Querying the Broker ###
Run the query against port 8083:
```bash
curl -X POST "http://localhost:8083/druid/v2/?pretty" -H 'content-type: application/json' -d @query.body
```
And get:
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : { "imps" : 5, "wp" : 15000.0, "rows" : 5 }
} ]
```
Now that we know what nodes can be queried (although you should usually use the broker node), let's learn what queries are available.
## Examining the realtime.spec ##
How are we to know what queries we can run? Although [Querying](Querying.html) is a helpful index, to get a handle on querying our data we need to look at our [Realtime](Realtime.html) node's realtime.spec file:
```json
[
{
"schema": {
"dataSource": "druidtest",
"aggregators": [
{
"type": "count",
"name": "impressions"
},
{
"type": "doubleSum",
"name": "wp",
"fieldName": "wp"
}
],
"indexGranularity": "minute",
"shardSpec": {
"type": "none"
}
},
"config": {
"maxRowsInMemory": 500000,
"intermediatePersistPeriod": "PT10m"
},
"firehose": {
"type": "kafka-0.7.2",
"consumerProps": {
"zk.connect": "localhost:2181",
"zk.connectiontimeout.ms": "15000",
"zk.sessiontimeout.ms": "15000",
"zk.synctime.ms": "5000",
"groupid": "topic-pixel-local",
"fetch.size": "1048586",
"autooffset.reset": "largest",
"autocommit.enable": "false"
},
"feed": "druidtest",
"parser": {
"timestampSpec": {
"column": "utcdt",
"format": "iso"
},
"data": {
"format": "json"
},
"dimensionExclusions": [
"wp"
]
}
},
"plumber": {
"type": "realtime",
"windowPeriod": "PT10m",
"segmentGranularity": "hour",
"basePersistDirectory": "\/tmp\/realtime\/basePersist",
"rejectionPolicy": {
"type": "messageTime"
}
}
}
]
```
### dataSource ###
```json
"dataSource":"druidtest"
```
Our dataSource tells us the name of the relation/table, or 'source of data', to query in both our realtime.spec and query.body!
### aggregations ###
Note the [Aggregations](Aggregations.html) in our query:
```json
"aggregations": [
{"type": "count", "name": "rows"},
{"type": "longSum", "name": "imps", "fieldName": "impressions"},
{"type": "doubleSum", "name": "wp", "fieldName": "wp"}
],
```
this matches up to the aggregators in the schema of our realtime.spec!
```json
"aggregators":[ {"type":"count", "name":"impressions"},
{"type":"doubleSum","name":"wp","fieldName":"wp"}],
```
### dimensions ###
Let's look back at our actual records (from [Loading Your Data](Loading-Your-Data.html)):
```json
{"utcdt": "2010-01-01T01:01:01", "wp": 1000, "gender": "male", "age": 100}
{"utcdt": "2010-01-01T01:01:02", "wp": 2000, "gender": "female", "age": 50}
{"utcdt": "2010-01-01T01:01:03", "wp": 3000, "gender": "male", "age": 20}
{"utcdt": "2010-01-01T01:01:04", "wp": 4000, "gender": "female", "age": 30}
{"utcdt": "2010-01-01T01:01:05", "wp": 5000, "gender": "male", "age": 40}
```
Note that we have two dimensions to our data, other than our primary metric, wp. They are 'gender' and 'age'. We can specify these in our query! Note that we have added a dimension: age, below.
```json
{
"queryType": "groupBy",
"dataSource": "druidtest",
"granularity": "all",
"dimensions": ["age"],
"aggregations": [
{"type": "count", "name": "rows"},
{"type": "longSum", "name": "imps", "fieldName": "impressions"},
{"type": "doubleSum", "name": "wp", "fieldName": "wp"}
],
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
Which gets us grouped data in return!
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : { "imps" : 1, "age" : "100", "wp" : 1000.0, "rows" : 1 }
}, {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : { "imps" : 1, "age" : "20", "wp" : 3000.0, "rows" : 1 }
}, {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : { "imps" : 1, "age" : "30", "wp" : 4000.0, "rows" : 1 }
}, {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : { "imps" : 1, "age" : "40", "wp" : 5000.0, "rows" : 1 }
}, {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : { "imps" : 1, "age" : "50", "wp" : 2000.0, "rows" : 1 }
} ]
```
### filtering ###
Now that we've observed our dimensions, we can also filter:
```json
{
"queryType": "groupBy",
"dataSource": "druidtest",
"granularity": "all",
"filter": { "type": "selector", "dimension": "gender", "value": "male" },
"aggregations": [
{"type": "count", "name": "rows"},
{"type": "longSum", "name": "imps", "fieldName": "impressions"},
{"type": "doubleSum", "name": "wp", "fieldName": "wp"}
],
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
Which gets us only the male edits:
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : { "imps" : 3, "wp" : 9000.0, "rows" : 3 }
} ]
```
Check out [Filters](Filters.html) for more information.
## Learn More ##
You can learn more about querying at [Querying](Querying.html)! Now check out [Booting a production cluster](Booting-a-production-cluster.html)!

View File

@ -12,7 +12,7 @@ Segment Creation Tasks
The Index Task is a simpler variation of the Index Hadoop task that is designed to be used for smaller data sets. The task executes within the indexing service and does not require an external Hadoop setup to use. The grammar of the index task is as follows:
```
```json
{
"type" : "index",
"dataSource" : "example",
@ -50,7 +50,7 @@ The Index Task is a simpler variation of the Index Hadoop task that is designed
|--------|-----------|---------|
|type|The task type, this should always be "index".|yes|
|id|The task ID.|no|
|granularitySpec|See [granularitySpec](Tasks.html#Granularity-Spec)|yes|
|granularitySpec|See [granularitySpec](Tasks.html)|yes|
|spatialDimensions|Dimensions to build spatial indexes over. See [Spatial-Indexing](Spatial-Indexing.html)|no|
|aggregators|The metrics to aggregate in the data set. For more info, see [Aggregations](Aggregations.html)|yes|
|indexGranularity|The rollup granularity for timestamps.|no|
@ -78,10 +78,10 @@ The Hadoop Index Task is used to index larger data sets that require the paralle
The indexing service can also run real-time tasks. These tasks effectively transform a middle manager into a real-time node. We introduced real-time tasks as a way to programmatically add new real-time data sources without needing to manually add nodes. The grammar for the real-time task is as follows:
```
```json
{
"type" : "index_realtime",
"id": "example,
"id": "example",
"resource": {
"availabilityGroup" : "someGroup",
"requiredCapacity" : 1
@ -154,10 +154,10 @@ A JSON object used for high availability purposes. Not required.
|requiredCapacity|Integer|How much middle manager capacity this task will take.|yes|
Schema:
See [Schema](Realtime.html#Schema).
See [Schema](Realtime.html).
Fire Department Config:
See [Config](Realtime.html#Config).
See [Config](Realtime.html).
Firehose:
See [Firehose](Firehose.html).
@ -178,7 +178,7 @@ Segment Merging Tasks
Append tasks append a list of segments together into a single segment (one after the other). The grammar is:
```
```json
{
"id": <task_id>,
"dataSource": <task_datasource>,
@ -190,7 +190,7 @@ Append tasks append a list of segments together into a single segment (one after
Merge tasks merge a list of segments together. Any common timestamps are merged. The grammar is:
```
```json
{
"id": <task_id>,
"dataSource": <task_datasource>,
@ -205,7 +205,7 @@ Segment Destroying Tasks
Delete tasks create empty segments with no data. The grammar is:
```
```json
{
"id": <task_id>,
"dataSource": <task_datasource>,
@ -217,7 +217,7 @@ Delete tasks create empty segments with no data. The grammar is:
Kill tasks delete all information about a segment and removes it from deep storage. Killable segments must be disabled (used==0) in the Druid segment table. The available grammar is:
```
```json
{
"id": <task_id>,
"dataSource": <task_datasource>,
@ -232,7 +232,7 @@ Misc. Tasks
These tasks convert segments from an existing older index version to the latest index version. The available grammar is:
```
```json
{
"id": <task_id>,
"groupId" : <task_group_id>,
@ -246,7 +246,7 @@ These tasks convert segments from an existing older index version to the latest
These tasks start, sleep for a time and are used only for testing. The available grammar is:
```
```json
{
"id": <optional_task_id>,
"interval" : <optional_segment_interval>,

View File

@ -43,12 +43,11 @@ These metrics track the number of characters added, deleted, and changed.
Setting Up
----------
There are two ways to setup Druid: download a tarball, or [Build From Source](Build From Source.html). You only need to do one of these.
There are two ways to setup Druid: download a tarball, or [Build From Source](Build-from-source.html). You only need to do one of these.
### Download a Tarball
We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.0-bin.tar.gz)
Download this file to a directory of your choosing.
We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.0-bin.tar.gz). Download this file to a directory of your choosing.
You can extract the awesomeness within by issuing:
@ -98,7 +97,7 @@ Okay, things are about to get real-time. To query the real-time node you've spun
./run_example_client.sh
```
Select "wikipedia" once again. This script issues [GroupByQuery](GroupByQuery.html)s to the data we've been ingesting. The query looks like this:
Select "wikipedia" once again. This script issues [GroupByQueries](GroupByQuery.html) to the data we've been ingesting. The query looks like this:
```json
{
@ -108,7 +107,7 @@ Select "wikipedia" once again. This script issues [GroupByQuery](GroupByQuery.ht
"dimensions":[ "page" ],
"aggregations":[
{"type":"count", "name":"rows"},
{"type":"longSum", "fieldName":"edit_count", "name":"count"}
{"type":"longSum", "fieldName":"count", "name":"edit_count"}
],
"filter":{ "type":"selector", "dimension":"namespace", "value":"article" },
"intervals":[ "2013-06-01T00:00/2020-01-01T00" ]
@ -151,7 +150,7 @@ time_boundary_query.body
Druid queries are JSON blobs which are relatively painless to create programmatically, but an absolute pain to write by hand. So anyway, we are going to create a Druid query by hand. Add the following to the file you just created:
```
```json
{
"queryType": "timeBoundary",
"dataSource": "wikipedia"
@ -186,7 +185,7 @@ timeseries_query.body
We are going to make a slightly more complicated query, the [TimeseriesQuery](TimeseriesQuery.html). Copy and paste the following into the file:
```
```json
{
"queryType": "timeseries",
"dataSource": "wikipedia",
@ -221,7 +220,7 @@ Right now all the results you are getting back are being aggregated into a singl
If you loudly exclaimed "we can change granularity to minute", you are absolutely correct! We can specify different granularities to bucket our results, like so:
```
```json
{
"queryType": "timeseries",
"dataSource": "wikipedia",
@ -267,7 +266,7 @@ group_by_query.body
and put the following in there:
```
```json
{
"queryType": "groupBy",
"dataSource": "wikipedia",
@ -321,13 +320,13 @@ Feel free to tweak other query parameters to answer other questions you may have
Next Steps
----------
What to know even more information about the Druid Cluster? Check out [Tutorial: The Druid Cluster](Tutorial:-The-Druid-Cluster.html)
Want to know even more about the Druid Cluster? Check out [Tutorial: The Druid Cluster](Tutorial%3A-The-Druid-Cluster.html)
Druid is even more fun if you load your own data into it! To learn how to load your data, see [Loading Your Data](Loading-Your-Data.html).
Druid is even more fun if you load your own data into it! To learn how to load your data, see [Loading Your Data](Tutorial%3A-Loading-Your-Data-Part-1.html).
Additional Information
----------------------
This tutorial is merely showcasing a small fraction of what Druid can do. If you are interested in more information about Druid, including setting up a more sophisticated Druid cluster, please read the other links in our wiki.
And thus concludes our journey! Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our [google groups page](http://www.groups.google.com/forum/#!forum/druid-development).
And thus concludes our journey! Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our [google groups page](https://groups.google.com/forum/#!forum/druid-development).

View File

@ -0,0 +1,197 @@
---
layout: doc_page
---
Hello! This tutorial is meant to provide a more in-depth look into Druid queries. The tutorial is somewhat incomplete right now but we hope to add more content to it in the near future.
Setup
-----
Before we start digging into how to query Druid, make sure you've gone through the other tutorials and are comfortable with spinning up a local cluster and loading data into Druid.
#### Booting a Druid Cluster
Let's start up a simple Druid cluster so we can query all the things.
To start a Coordinator node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/coordinator io.druid.cli.Main server coordinator
```
To start a Historical node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/historical io.druid.cli.Main server historical
```
To start a Broker node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/broker io.druid.cli.Main server broker
```
Querying Your Data
------------------
Make sure you've completed [Loading Your Data](Tutorial%3A-Loading-Your-Data-Part-1.html) so we have some data to query. Having done that, it's time to query our data! For a complete specification of queries, see [Querying](Querying.html).
#### Construct a Query
```json
{
"queryType": "groupBy",
"dataSource": "wikipedia",
"granularity": "all",
"dimensions": [],
"aggregations": [
{"type": "count", "name": "rows"},
{"type": "longSum", "name": "edit_count", "fieldName": "count"},
{"type": "doubleSum", "name": "chars_added", "fieldName": "added"}
],
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
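If you're following along, save this query to a file so it can be POSTed; the curl command in the next step reads it from a file named query.body in your working directory:

```bash
# Save the groupBy query above into query.body; the next step POSTs it with curl.
cat > query.body <<'EOF'
{
    "queryType": "groupBy",
    "dataSource": "wikipedia",
    "granularity": "all",
    "dimensions": [],
    "aggregations": [
        {"type": "count", "name": "rows"},
        {"type": "longSum", "name": "edit_count", "fieldName": "count"},
        {"type": "doubleSum", "name": "chars_added", "fieldName": "added"}
    ],
    "intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
EOF
```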
#### Query That Data
Run the query against your broker:
```bash
curl -X POST "http://localhost:8080/druid/v2/?pretty" -H 'Content-type: application/json' -d @query.body
```
And get:
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : {
"chars_added" : 1545.0,
"edit_count" : 5,
"rows" : 5
}
} ]
```
This result tells us that the data set contains 5 edits and 5 rows of data, and that those 5 edits added 1545 characters in total.
#### What can I query for?
How are we to know what queries we can run? Although [Querying](Querying.html) is a helpful index, to get a handle on querying our data we need to look at our ingestion schema. There are a few particular fields we care about in the ingestion schema. All of these fields should be present in both the real-time ingestion schema and the batch ingestion schema.
Datasource:
```json
"dataSource":"wikipedia"
```
Our dataSource tells us the name of the relation/table, or 'source of data'. What we decide to name our data source must match the data source we are going to be querying.
Granularity:
```json
"indexGranularity": "none",
```
Druid will roll up data at ingestion time unless the index/rollup granularity is specified as "none". Your query granularity cannot be lower than your index granularity.
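For example, because this data was indexed with granularity "none", any coarser query granularity is allowed. A minimal sketch of the earlier groupBy bucketed per hour instead of over "all" (the file name here is arbitrary, and the broker port assumes the setup above):

```bash
# Hypothetical variation of the earlier query: bucket results by hour.
# Legal because the query granularity ("hour") is coarser than the index granularity ("none").
cat > hourly_query.body <<'EOF'
{
    "queryType": "groupBy",
    "dataSource": "wikipedia",
    "granularity": "hour",
    "dimensions": [],
    "aggregations": [
        {"type": "longSum", "name": "edit_count", "fieldName": "count"},
        {"type": "doubleSum", "name": "chars_added", "fieldName": "added"}
    ],
    "intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
EOF
curl -X POST "http://localhost:8080/druid/v2/?pretty" -H 'Content-type: application/json' -d @hourly_query.body
```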
Aggregators:
```json
"aggregators" : [{
"type" : "count",
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
}, {
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
}, {
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
}]
```
The [Aggregations](Aggregations.html) specified at ingestion time correspond directly to the metrics that can be queried.
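For instance, because "delta" is ingested with a doubleSum aggregator above, a query is free to aggregate over it as well; a hedged sketch (the output names and file name are arbitrary):

```bash
# Every fieldName below ("count", "delta") refers to a metric defined in the ingestion schema above.
cat > delta_query.body <<'EOF'
{
    "queryType": "groupBy",
    "dataSource": "wikipedia",
    "granularity": "all",
    "dimensions": [],
    "aggregations": [
        {"type": "longSum", "name": "edit_count", "fieldName": "count"},
        {"type": "doubleSum", "name": "net_chars", "fieldName": "delta"}
    ],
    "intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
EOF
curl -X POST "http://localhost:8080/druid/v2/?pretty" -H 'Content-type: application/json' -d @delta_query.body
```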
Dimensions:
```json
"dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"]
```
These specify the dimensions that we can filter our data on. If we add a dimension to our groupBy query, we get:
```json
{
"queryType": "groupBy",
"dataSource": "wikipedia",
"granularity": "all",
"dimensions": ["namespace"],
"aggregations": [
{"type": "longSum", "name": "edit_count", "fieldName": "count"},
{"type": "doubleSum", "name": "chars_added", "fieldName": "added"}
],
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
Which gets us data grouped over the namespace dimension in return!
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : {
"chars_added" : 180.0,
"edit_count" : 2,
"namespace" : "article"
}
}, {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : {
"chars_added" : 1365.0,
"edit_count" : 3,
"namespace" : "wikipedia"
}
} ]
```
Additionally, we can filter our query to narrow down our metric values:
```json
{
"queryType": "groupBy",
"dataSource": "wikipedia",
"granularity": "all",
"filter": { "type": "selector", "dimension": "namespace", "value": "article" },
"aggregations": [
{"type": "longSum", "name": "edit_count", "fieldName": "count"},
{"type": "doubleSum", "name": "chars_added", "fieldName": "added"}
],
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
Which gets us metrics about only those edits where the namespace is 'article':
```json
[ {
"version" : "v1",
"timestamp" : "2010-01-01T00:00:00.000Z",
"event" : {
"chars_added" : 180.0,
"edit_count" : 2
}
} ]
```
Check out [Filters](Filters.html) for more information.
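Filters can also be combined. As a rough sketch, assuming the standard "and" filter type, restricting the previous query to robot edits within the article namespace might look like this:

```bash
# Hypothetical combined filter: article-namespace edits made by robots.
cat > filtered_query.body <<'EOF'
{
    "queryType": "groupBy",
    "dataSource": "wikipedia",
    "granularity": "all",
    "filter": {
        "type": "and",
        "fields": [
            { "type": "selector", "dimension": "namespace", "value": "article" },
            { "type": "selector", "dimension": "robot", "value": "true" }
        ]
    },
    "aggregations": [
        {"type": "longSum", "name": "edit_count", "fieldName": "count"},
        {"type": "doubleSum", "name": "chars_added", "fieldName": "added"}
    ],
    "intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
EOF
curl -X POST "http://localhost:8080/druid/v2/?pretty" -H 'Content-type: application/json' -d @filtered_query.body
```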
## Learn More ##
You can learn more about querying at [Querying](Querying.html)! If you are ready to evaluate Druid more in depth, check out [Booting a production cluster](Booting-a-production-cluster.html)!

View File

@ -1,7 +1,7 @@
---
layout: doc_page
---
In our last [tutorial](Tutorial:-The-Druid-Cluster.html), we setup a complete Druid cluster. We created all the Druid dependencies and loaded some batched data. Druid shards data into self-contained chunks known as [segments](Segments.html). Segments are the fundamental unit of storage in Druid and all Druid nodes only understand segments.
In our last [tutorial](Tutorial%3A-The-Druid-Cluster.html), we set up a complete Druid cluster. We created all the Druid dependencies and loaded some batched data. Druid shards data into self-contained chunks known as [segments](Segments.html). Segments are the fundamental unit of storage in Druid and all Druid nodes only understand segments.
In this tutorial, we will learn about batch ingestion (as opposed to real-time ingestion) and how to create segments using the final piece of the Druid Cluster, the [indexing service](Indexing-Service.html). The indexing service is a standalone service that accepts [tasks](Tasks.html) in the form of POST requests. The output of most tasks are segments.
@ -50,12 +50,12 @@ examples/indexing/wikipedia_data.json
Open the file and make sure the following events exist:
```
```json
{"timestamp": "2013-08-31T01:02:33Z", "page": "Gypsy Danger", "language" : "en", "user" : "nuclear", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143}
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Dingo Land", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Vodka Land", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Cantebury", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Oblast", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
{"timestamp": "2013-08-31T11:58:39Z", "page": "Crimson Typhoon", "language" : "zh", "user" : "triplets", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"China", "region":"Shanxi", "city":"Taiyuan", "added": 905, "deleted": 5, "delta": 900}
{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "cancer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9}
{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "stringer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9}
```
There are five data points spread across the day of 2013-08-31. Talk about big data right? Thankfully, we don't need a ton of data to introduce how batch ingestion works.
@ -71,12 +71,14 @@ java -Xmx2g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/ov
```
The overlord configurations should already exist in:
```
config/overlord/runtime.properties
```
The configurations for the overlord node are as follows:
```
```bash
druid.host=localhost
druid.port=8087
druid.service=overlord
@ -96,8 +98,9 @@ druid.indexer.fork.property.druid.computation.buffer.size=268435456
If you are interested in reading more about these configurations, see [here](Indexing-Service.html).
When the overlord node is ready for tasks, you should see a message like the following:
```
013-10-09 21:30:32,817 INFO [Thread-14] io.druid.indexing.overlord.TaskQueue - Waiting for work...
```bash
2013-10-09 21:30:32,817 INFO [Thread-14] io.druid.indexing.overlord.TaskQueue - Waiting for work...
```
#### Starting Other Nodes
@ -111,6 +114,7 @@ java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/
```
Historical node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/historical io.druid.cli.Main server historical
```
@ -130,7 +134,7 @@ examples/indexing/index_task.json
Open up the file to see the following:
```
```json
{
"type" : "index",
"dataSource" : "wikipedia",
@ -141,7 +145,7 @@ Open up the file to see the following:
},
"aggregators" : [{
"type" : "count",
"name" : "edit_count"
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",
@ -176,21 +180,21 @@ Okay, so what is happening here? The "type" field indicates the type of task we
Let's send our task to the indexing service now:
```
```bash
curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedia_index_task.json localhost:8087/druid/indexer/v1/task
```
Issuing the request should return a task ID like so:
```
fjy$ curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedia_index_task.json localhost:8087/druid/indexer/v1/task
```bash
$ curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedia_index_task.json localhost:8087/druid/indexer/v1/task
{"task":"index_wikipedia_2013-10-09T21:30:32.802Z"}
fjy$
$
```
In your indexing service logs, you should see the following:
````
```bash
2013-10-09 21:41:41,150 INFO [qtp300448720-21] io.druid.indexing.overlord.HeapMemoryTaskStorage - Inserting task index_wikipedia_2013-10-09T21:41:41.147Z with status: TaskStatus{id=index_wikipedia_2013-10-09T21:41:41.147Z, status=RUNNING, duration=-1}
2013-10-09 21:41:41,151 INFO [qtp300448720-21] io.druid.indexing.overlord.TaskLockbox - Created new TaskLockPosse: TaskLockPosse{taskLock=TaskLock{groupId=index_wikipedia_2013-10-09T21:41:41.147Z, dataSource=wikipedia, interval=2013-08-31T00:00:00.000Z/2013-09-01T00:00:00.000Z, version=2013-10-09T21:41:41.151Z}, taskIds=[]}
...
@ -201,7 +205,7 @@ In your indexing service logs, you should see the following:
After a few seconds, the task should complete and you should see in the indexing service logs:
```
```bash
2013-10-09 21:41:45,765 INFO [pool-6-thread-1] io.druid.indexing.overlord.exec.TaskConsumer - Received SUCCESS status for task: IndexGeneratorTask{id=index_wikipedia_2013-10-09T21:41:41.147Z_generator_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_0, type=index_generator, dataSource=wikipedia, interval=Optional.of(2013-08-31T00:00:00.000Z/2013-09-01T00:00:00.000Z)}
```
@ -209,7 +213,7 @@ Congratulations! The segment has completed building. Once a segment is built, a
You should see the following logs on the coordinator:
```
```bash
2013-10-09 21:41:54,368 INFO [Coordinator-Exec--0] io.druid.server.coordinator.DruidCoordinatorLogger - [_default_tier] : Assigned 1 segments among 1 servers
2013-10-09 21:41:54,369 INFO [Coordinator-Exec--0] io.druid.server.coordinator.DruidCoordinatorLogger - Load Queues:
2013-10-09 21:41:54,369 INFO [Coordinator-Exec--0] io.druid.server.coordinator.DruidCoordinatorLogger - Server[localhost:8081, historical, _default_tier] has 1 left to load, 0 left to drop, 4,477 bytes queued, 4,477 bytes served.
@ -217,7 +221,7 @@ You should see the following logs on the coordinator:
These logs indicate that the coordinator has assigned our new segment to the historical node to download and serve. If you look at the historical node logs, you should see:
```
```bash
2013-10-09 21:41:54,369 INFO [ZkCoordinator-0] io.druid.server.coordination.ZkCoordinator - Loading segment wikipedia_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_2013-10-09T21:41:41.151Z
2013-10-09 21:41:54,369 INFO [ZkCoordinator-0] io.druid.segment.loading.LocalDataSegmentPuller - Unzipping local file[/tmp/druid/localStorage/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0/index.zip] to [/tmp/druid/indexCache/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0]
2013-10-09 21:41:54,370 INFO [ZkCoordinator-0] io.druid.utils.CompressionUtils - Unzipping file[/tmp/druid/localStorage/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0/index.zip] to [/tmp/druid/indexCache/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0]
@ -228,7 +232,7 @@ Once the segment is announced the segment is queryable. Now you should be able t
Issuing a [TimeBoundaryQuery](TimeBoundaryQuery.html) should yield:
```
```json
[ {
"timestamp" : "2013-08-31T01:02:33.000Z",
"result" : {
@ -241,9 +245,9 @@ Issuing a [TimeBoundaryQuery](TimeBoundaryQuery.html) should yield:
Next Steps
----------
This tutorial covered ingesting a small batch data set and loading it into Druid. In [Loading Your Data Part 2](Tutorial-Loading-Your-Data-Part-2.html), we will cover how to ingest data using Hadoop for larger data sets.
This tutorial covered ingesting a small batch data set and loading it into Druid. In [Loading Your Data Part 2](Tutorial%3A-Loading-Your-Data-Part-2.html), we will cover how to ingest data using Hadoop for larger data sets.
Additional Information
----------------------
Getting data into Druid can definitely be difficult for first time users. Please don't hesitate to ask questions in our IRC channel or on our [google groups page](http://www.groups.google.com/forum/#!forum/druid-development).
Getting data into Druid can definitely be difficult for first time users. Please don't hesitate to ask questions in our IRC channel or on our [google groups page](https://groups.google.com/forum/#!forum/druid-development).

View File

@ -1,26 +1,48 @@
---
layout: doc_page
---
Once you have a real-time node working, it is time to load your own data to see how Druid performs.
In this tutorial we will cover more advanced/real-world ingestion topics.
Druid can ingest data in three ways: via Kafka and a realtime node, via the indexing service, and via the Hadoop batch loader. Data is ingested in real-time using a [Firehose](Firehose.html).
Druid can ingest streaming or batch data. Streaming data is ingested via the real-time node, and batch data is ingested via the Hadoop batch indexer. Druid also has a standalone ingestion service called the [indexing service](Indexing-Service.html).
## Create Config Directories ##
Each type of node needs its own config file and directory, so create them as subdirectories under the druid directory if they not already exist.
The Data
--------
The data source we'll be using is (surprise!) Wikipedia edits. The data schema is still:
```bash
mkdir config
mkdir config/realtime
mkdir config/coordinator
mkdir config/historical
mkdir config/broker
Dimensions (things to filter on):
```json
"page"
"language"
"user"
"unpatrolled"
"newPage"
"robot"
"anonymous"
"namespace"
"continent"
"country"
"region"
"city"
```
## Loading Data with Kafka ##
Metrics (things to aggregate over):
[KafkaFirehoseFactory](https://github.com/metamx/druid/blob/druid-0.6.0/realtime/src/main/java/com/metamx/druid/realtime/firehose/KafkaFirehoseFactory.java) is how druid communicates with Kafka. Using this [Firehose](Firehose.html) with the right configuration, we can import data into Druid in realtime without writing any code. To load data to a realtime node via Kafka, we'll first need to initialize Zookeeper and Kafka, and then configure and initialize a [Realtime](Realtime.html) node.
```json
"count"
"added"
"delta"
"deleted"
```
### Booting Kafka ###
Streaming Event Ingestion
-------------------------
With real-world data, we recommend having a message bus such as [Apache Kafka](http://kafka.apache.org/) sit between the data stream and the real-time node. The message bus provides higher availability for production environments. [Firehoses](Firehose.html) are the key abstraction for real-time ingestion.
#### Setting up Kafka
[KafkaFirehoseFactory](https://github.com/metamx/druid/blob/druid-0.6.0/realtime/src/main/java/com/metamx/druid/realtime/firehose/KafkaFirehoseFactory.java) is how druid communicates with Kafka. Using this [Firehose](Firehose.html) with the right configuration, we can import data into Druid in real-time without writing any code. To load data to a real-time node via Kafka, we'll first need to initialize Zookeeper and Kafka, and then configure and initialize a [Realtime](Realtime.html) node.
Instructions for booting a Zookeeper and then Kafka cluster are available [here](http://kafka.apache.org/07/quickstart.html).
@ -44,6 +66,7 @@ Instructions for booting a Zookeeper and then Kafka cluster are available [here]
```bash
cat config/zookeeper.properties
bin/zookeeper-server-start.sh config/zookeeper.properties
# in a new console
bin/kafka-server-start.sh config/server.properties
```
@ -51,56 +74,55 @@ Instructions for booting a Zookeeper and then Kafka cluster are available [here]
4. Launch the console producer (so you can type in JSON kafka messages in a bit)
```bash
bin/kafka-console-producer.sh --zookeeper localhost:2181 --topic druidtest
bin/kafka-console-producer.sh --zookeeper localhost:2181 --topic wikipedia
```
### Launching a Realtime Node
When things are ready, you should see log messages such as:
1. Create a valid configuration file similar to this called config/realtime/runtime.properties:
```properties
druid.host=localhost
druid.service=example
druid.port=8080
druid.zk.service.host=localhost
druid.s3.accessKey=AKIAIMKECRUYKDQGR6YQ
druid.s3.secretKey=QyyfVZ7llSiRg6Qcrql1eEUG7buFpAK6T6engr1b
druid.db.connector.connectURI=jdbc\:mysql\://localhost\:3306/druid
druid.db.connector.user=druid
druid.db.connector.password=diurd
druid.realtime.specFile=config/realtime/realtime.spec
druid.processing.buffer.sizeBytes=10000000
druid.processing.numThreads=3
```
[2013-10-09 22:03:07,802] INFO zookeeper state changed (SyncConnected) (org.I0Itec.zkclient.ZkClient)
```
2. Create a valid realtime configuration file similar to this called realtime.spec:
#### Launch a Realtime Node
You should be comfortable starting Druid nodes at this point. If not, it may be worthwhile to revisit the first few tutorials.
1. Real-time nodes can be started with:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -Ddruid.realtime.specFile=examples/indexing/wikipedia.spec -classpath lib/*:config/realtime io.druid.cli.Main server realtime
```
2. A realtime.spec should already exist for the data source in the Druid tarball. You should be able to find it at:
```bash
examples/indexing/wikipedia.spec
```
The contents of the file should match:
```json
[
{
"schema": {
"dataSource": "druidtest",
"aggregators": [
{
"type": "count",
"name": "impressions"
},
{
"type": "doubleSum",
"name": "wp",
"fieldName": "wp"
}
],
"indexGranularity": "minute",
"shardSpec": {
"type": "none"
}
"dataSource": "wikipedia",
"aggregators" : [{
"type" : "count",
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
}, {
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
}, {
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
}],
"indexGranularity": "none"
},
"config": {
"maxRowsInMemory": 500000,
@ -113,23 +135,20 @@ Instructions for booting a Zookeeper and then Kafka cluster are available [here]
"zk.connectiontimeout.ms": "15000",
"zk.sessiontimeout.ms": "15000",
"zk.synctime.ms": "5000",
"groupid": "topic-pixel-local",
"groupid": "druid-example",
"fetch.size": "1048586",
"autooffset.reset": "largest",
"autocommit.enable": "false"
},
"feed": "druidtest",
"feed": "wikipedia",
"parser": {
"timestampSpec": {
"column": "utcdt",
"format": "iso"
"column": "timestamp"
},
"data": {
"format": "json"
},
"dimensionExclusions": [
"wp"
]
"format": "json",
"dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"]
}
}
},
"plumber": {
@ -138,256 +157,163 @@ Instructions for booting a Zookeeper and then Kafka cluster are available [here]
"segmentGranularity": "hour",
"basePersistDirectory": "\/tmp\/realtime\/basePersist",
"rejectionPolicy": {
"type": "messageTime"
"type": "none"
}
}
}
]
```
3. Launch the realtime node
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 \
-Ddruid.realtime.specFile=config/realtime/realtime.spec \
-classpath lib/*:config/realtime io.druid.cli.Main server realtime
```
4. Paste data into the Kafka console producer
3. Let's copy and paste some data into the Kafka console producer
```json
{"utcdt": "2010-01-01T01:01:01", "wp": 1000, "gender": "male", "age": 100}
{"utcdt": "2010-01-01T01:01:02", "wp": 2000, "gender": "female", "age": 50}
{"utcdt": "2010-01-01T01:01:03", "wp": 3000, "gender": "male", "age": 20}
{"utcdt": "2010-01-01T01:01:04", "wp": 4000, "gender": "female", "age": 30}
{"utcdt": "2010-01-01T01:01:05", "wp": 5000, "gender": "male", "age": 40}
{"timestamp": "2013-08-31T01:02:33Z", "page": "Gypsy Danger", "language" : "en", "user" : "nuclear", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143}
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Cantebury", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Oblast", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
{"timestamp": "2013-08-31T11:58:39Z", "page": "Crimson Typhoon", "language" : "zh", "user" : "triplets", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"China", "region":"Shanxi", "city":"Taiyuan", "added": 905, "deleted": 5, "delta": 900}
{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "stringer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9}
```
5. Watch the events as they are ingested by Druid's realtime node
Disclaimer: We recognize the timestamps of these events aren't actually recent.
5. Watch the events as they are ingested by Druid's real-time node:
```bash
...
2013-06-17 21:41:55,569 INFO [Global--0] com.metamx.emitter.core.LoggingEmitter - Event [{"feed":"metrics","timestamp":"2013-06-17T21:41:55.569Z","service":"example","host":"127.0.0.1","metric":"events/processed","value":5,"user2":"druidtest"}]
2013-10-10 05:13:18,976 INFO [chief-wikipedia] io.druid.server.coordination.BatchDataSegmentAnnouncer - Announcing segment[wikipedia_2013-08-31T01:00:00.000Z_2013-08-31T02:00:00.000Z_2013-08-31T01:00:00.000Z] at path[/druid/segments/localhost:8083/2013-10-10T05:13:18.972Z0]
2013-10-10 05:13:18,992 INFO [chief-wikipedia] io.druid.server.coordination.BatchDataSegmentAnnouncer - Announcing segment[wikipedia_2013-08-31T03:00:00.000Z_2013-08-31T04:00:00.000Z_2013-08-31T03:00:00.000Z] at path[/druid/segments/localhost:8083/2013-10-10T05:13:18.972Z0]
2013-10-10 05:13:18,997 INFO [chief-wikipedia] io.druid.server.coordination.BatchDataSegmentAnnouncer - Announcing segment[wikipedia_2013-08-31T07:00:00.000Z_2013-08-31T08:00:00.000Z_2013-08-31T07:00:00.000Z] at path[/druid/segments/localhost:8083/2013-10-10T05:13:18.972Z0]
2013-10-10 05:13:19,003 INFO [chief-wikipedia] io.druid.server.coordination.BatchDataSegmentAnnouncer - Announcing segment[wikipedia_2013-08-31T11:00:00.000Z_2013-08-31T12:00:00.000Z_2013-08-31T11:00:00.000Z] at path[/druid/segments/localhost:8083/2013-10-10T05:13:18.972Z0]
2013-10-10 05:13:19,008 INFO [chief-wikipedia] io.druid.server.coordination.BatchDataSegmentAnnouncer - Announcing segment[wikipedia_2013-08-31T12:00:00.000Z_2013-08-31T13:00:00.000Z_2013-08-31T12:00:00.000Z] at path[/druid/segments/localhost:8083/2013-10-10T05:13:18.972Z0]
...
```
6. In a new console, edit a file called query.body:
```json
{
"queryType": "groupBy",
"dataSource": "druidtest",
"granularity": "all",
"dimensions": [],
"aggregations": [
{ "type": "count", "name": "rows" },
{"type": "longSum", "name": "imps", "fieldName": "impressions"},
{"type": "doubleSum", "name": "wp", "fieldName": "wp"}
],
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
7. Submit the query via curl
```bash
curl -X POST "http://localhost:8080/druid/v2/?pretty" \
-H 'content-type: application/json' -d @query.body
```
8. View Result!
```json
[ {
"timestamp" : "2010-01-01T01:01:00.000Z",
"result" : {
"imps" : 20,
"wp" : 60000.0,
"rows" : 5
}
} ]
```
Now you're ready for [Querying Your Data](Querying-Your-Data.html)!
## Loading Data with the HadoopDruidIndexer ##
Historical data can be loaded via a Hadoop job.
The setup for a single node, 'standalone' Hadoop cluster is available at [http://hadoop.apache.org/docs/stable/single_node_setup.html](http://hadoop.apache.org/docs/stable/single_node_setup.html).
### Setup MySQL ###
1. If you don't already have it, download MySQL Community Server here: [http://dev.mysql.com/downloads/mysql/](http://dev.mysql.com/downloads/mysql/)
2. Install MySQL
3. Create a druid user and database
```bash
mysql -u root
```
```sql
GRANT ALL ON druid.* TO 'druid'@'localhost' IDENTIFIED BY 'diurd';
CREATE database druid;
```
The [Coordinator](Coordinator.html) node will create the tables it needs based on its configuration.
### Make sure you have ZooKeeper Running ###
Make sure that you have a zookeeper instance running. If you followed the instructions for Kafka, it is probably running. If you are unsure if you have zookeeper running, try running
```bash
ps auxww | grep zoo | grep -v grep
```
If you get any result back, then zookeeper is most likely running. If you haven't setup Kafka or do not have zookeeper running, then you can download it and start it up with
```bash
curl http://www.motorlogy.com/apache/zookeeper/zookeeper-3.4.5/zookeeper-3.4.5.tar.gz -o zookeeper-3.4.5.tar.gz
tar xzf zookeeper-3.4.5.tar.gz
cd zookeeper-3.4.5
cp conf/zoo_sample.cfg conf/zoo.cfg
./bin/zkServer.sh start
cd ..
```
### Launch a Coordinator Node ###
If you've already setup a realtime node, be aware that although you can run multiple node types on one physical computer, you must assign them unique ports. Having used 8080 for the [Realtime](Realtime.html) node, we use 8081 for the [Coordinator](Coordinator.html).
1. Setup a configuration file called config/coordinator/runtime.properties similar to:
```properties
druid.host=localhost
druid.service=coordinator
druid.port=8081
druid.zk.service.host=localhost
druid.s3.accessKey=AKIAIMKECRUYKDQGR6YQ
druid.s3.secretKey=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
druid.db.connector.connectURI=jdbc\:mysql\://localhost\:3306/druid
druid.db.connector.user=druid
druid.db.connector.password=diurd
druid.coordinator.startDelay=PT60s
```
2. Launch the [Coordinator](Coordinator.html) node
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 \
-classpath lib/*:config/coordinator \
io.druid.cli.Main server coordinator
```
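A rough way to confirm the coordinator came up (not part of the official setup, and assuming `nc` is available) is to check that something is listening on the port we configured:

```bash
# Exits successfully once the coordinator is bound to port 8081.
nc -z localhost 8081 && echo "coordinator is listening on 8081"
```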
### Launch a Historical Node ###
1. Create a configuration file in config/historical/runtime.properties similar to:
```properties
druid.host=localhost
druid.service=historical
druid.port=8082
druid.zk.service.host=localhost
druid.s3.secretKey=QyyfVZ7llSiRg6Qcrql1eEUG7buFpAK6T6engr1b
druid.s3.accessKey=AKIAIMKECRUYKDQGR6YQ
druid.server.maxSize=100000000
druid.processing.buffer.sizeBytes=10000000
druid.segmentCache.infoPath=/tmp/druid/segmentInfoCache
druid.segmentCache.locations=[{"path": "/tmp/druid/indexCache", "maxSize"\: 100000000}]
```
2. Launch the historical node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 \
-classpath lib/*:config/historical \
io.druid.cli.Main server historical
```
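If you are curious, you can also peek at what the nodes have announced in ZooKeeper. This is only a sketch: it assumes you downloaded ZooKeeper into `./zookeeper-3.4.5` as above, and the exact znode layout can differ between Druid versions:

```bash
# List the znodes Druid has created under /druid (running nodes and served segments show up here).
./zookeeper-3.4.5/bin/zkCli.sh -server localhost:2181 ls /druid
```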
### Create a File of Records ###
We can use the same records we have been using, in a file called records.json:

```json
{"utcdt": "2010-01-01T01:01:01", "wp": 1000, "gender": "male", "age": 100}
{"utcdt": "2010-01-01T01:01:02", "wp": 2000, "gender": "female", "age": 50}
{"utcdt": "2010-01-01T01:01:03", "wp": 3000, "gender": "male", "age": 20}
{"utcdt": "2010-01-01T01:01:04", "wp": 4000, "gender": "female", "age": 30}
{"utcdt": "2010-01-01T01:01:05", "wp": 5000, "gender": "male", "age": 40}
```

Issuing a [TimeBoundaryQuery](TimeBoundaryQuery.html) to the real-time node should yield valid results:

```json
[ {
  "timestamp" : "2013-08-31T01:02:33.000Z",
  "result" : {
    "minTime" : "2013-08-31T01:02:33.000Z",
    "maxTime" : "2013-08-31T12:41:27.000Z"
  }
} ]
```
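A minimal sketch of issuing that query yourself with curl, assuming the realtime node is listening on port 8080 and your datasource is named `wikipedia` (substitute your own datasource name if it differs):

```bash
curl -X POST "http://localhost:8080/druid/v2/?pretty" \
  -H 'content-type: application/json' \
  -d '{"queryType": "timeBoundary", "dataSource": "wikipedia"}'
```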
### Run the Hadoop Job ###
Batch Ingestion
---------------
Druid is designed for large data volumes, and most real-world data sets require that batch indexing be done through a Hadoop job.
Now it's time to run the Hadoop [Batch-ingestion](Batch-ingestion.html) job, HadoopDruidIndexer, which will fill a [Historical](Historical.html) node with data. First we'll need to configure the job.
The setup for a single node, 'standalone' Hadoop cluster is available [here](http://hadoop.apache.org/docs/stable/single_node_setup.html).
1. Create a config called batchConfig.json similar to:
For the purposes of this tutorial, we are going to use our very small and simple Wikipedia data set. This data can be ingested directly via other means, as shown in the previous [tutorial](Tutorial%3A-Loading-Your-Data-Part-1.html), but we are going to use Hadoop here for demonstration purposes.
Our data is located at:
```
examples/indexing/wikipedia_data.json
```
The following events should exist in the file:
```json
{"timestamp": "2013-08-31T01:02:33Z", "page": "Gypsy Danger", "language" : "en", "user" : "nuclear", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143}
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Cantebury", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Oblast", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
{"timestamp": "2013-08-31T11:58:39Z", "page": "Crimson Typhoon", "language" : "zh", "user" : "triplets", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"China", "region":"Shanxi", "city":"Taiyuan", "added": 905, "deleted": 5, "delta": 900}
{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "stringer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9}
```
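Before indexing, it can be worth a quick sanity check that the file is where we expect it and that every line parses as JSON; this sketch assumes a standard Python installation:

```bash
# Count the events and flag any line that is not valid JSON.
wc -l examples/indexing/wikipedia_data.json
while read -r line; do
  echo "$line" | python -m json.tool > /dev/null || echo "bad line: $line"
done < examples/indexing/wikipedia_data.json
```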
#### Setup a Druid Cluster
To index the data, we are going to need an indexing service, a historical node, and a coordinator node.
To start the Indexing Service:
```bash
java -Xmx2g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:<hadoop_config_path>:config/overlord io.druid.cli.Main server overlord
```
To start the Coordinator Node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/coordinator io.druid.cli.Main server coordinator
```
To start the Historical Node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/historical io.druid.cli.Main server historical
```
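Once all three processes have started, a rough check that they are listening (ports assumed from the configs used in this tutorial and the task-submission URL below; adjust if your runtime.properties differ) is:

```bash
# Coordinator (8081), historical (8082), and overlord (8087) ports assumed from this tutorial.
for port in 8081 8082 8087; do
  nc -z localhost $port && echo "listening on $port" || echo "nothing on $port yet"
done
```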
#### Index the Data
Before indexing the data, make sure you have a valid Hadoop cluster running. To build our Druid segment, we are going to submit a [Hadoop index task](Tasks.html) to the indexing service. The grammar for the Hadoop index task is very similar to the index task of the last tutorial. The tutorial Hadoop index task should be located at:
```
examples/indexing/wikipedia_index_hadoop_task.json
```
Examining the contents of the file, you should find:
```json
{
"dataSource": "druidtest",
"timestampColumn": "utcdt",
"timestampFormat": "iso",
"dataSpec": {
"format": "json",
"dimensions": [
"gender",
"age"
]
},
"granularitySpec": {
"type": "uniform",
"intervals": [
"2010-01-01T01\/PT1H"
],
"gran": "hour"
},
"pathSpec": {
"type": "static",
"paths": "\/druid\/records.json"
},
"rollupSpec": {
"aggs": [
{
"type": "count",
"name": "impressions"
},
{
"type": "doubleSum",
"name": "wp",
"fieldName": "wp"
}
],
"rollupGranularity": "minute"
},
"workingPath": "\/tmp\/working_path",
"segmentOutputPath": "\/tmp\/segments",
"partitionsSpec": {
"targetPartitionSize": 5000000
},
"updaterJobSpec": {
"type": "db",
"connectURI": "jdbc:mysql:\/\/localhost:3306\/druid",
"user": "druid",
"password": "diurd",
"segmentTable": "druid_segments"
"type" : "index_hadoop",
"config": {
"dataSource" : "wikipedia",
"timestampColumn" : "timestamp",
"timestampFormat" : "auto",
"dataSpec" : {
"format" : "json",
"dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"]
},
"granularitySpec" : {
"type" : "uniform",
"gran" : "DAY",
"intervals" : [ "2013-08-31/2013-09-01" ]
},
"pathSpec" : {
"type" : "static",
"paths" : "examples/indexing/wikipedia_data.json"
},
"targetPartitionSize" : 5000000,
"rollupSpec" : {
"aggs": [{
"type" : "count",
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
}, {
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
}, {
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
}],
"rollupGranularity" : "none"
}
}
}
```
2. Now run the job, with the config pointing at batchConfig.json:

```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 \
-classpath `echo lib/* | tr ' ' ':'` \
io.druid.cli.Main index hadoop batchConfig.json
```

If you are curious about what all this configuration means, see [here](Tasks.html).

To submit the task:

```bash
curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedia_index_hadoop_task.json localhost:8087/druid/indexer/v1/task
```

After the task completes, the segment should be assigned to your historical node, and you should be able to query it.
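As a rough check that the segment is queryable, you can issue a [TimeBoundaryQuery](TimeBoundaryQuery.html) directly against the historical node; this sketch assumes the historical node is on port 8082 as configured earlier (in a full cluster you would normally query through a broker):

```bash
curl -X POST "http://localhost:8082/druid/v2/?pretty" \
  -H 'content-type: application/json' \
  -d '{"queryType": "timeBoundary", "dataSource": "wikipedia"}'
```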
Next Steps
----------
For more information on querying, check out this [tutorial](Tutorial%3A-All-About-Queries.html).
Additional Information
----------------------
Getting data into Druid can definitely be difficult for first-time users. Please don't hesitate to ask questions in our IRC channel or on our [google groups page](https://groups.google.com/forum/#!forum/druid-development).
View File
@ -1,13 +1,13 @@
---
layout: doc_page
---
Welcome back! In our first [tutorial](Tutorial:-A-First-Look-at-Druid.html), we introduced you to the most basic Druid setup: a single realtime node. We streamed in some data and queried it. Realtime nodes collect very recent data and periodically hand that data off to the rest of the Druid cluster. Some questions about the architecture must naturally come to mind. What does the rest of Druid cluster look like? How does Druid load available static data?
Welcome back! In our first [tutorial](Tutorial%3A-A-First-Look-at-Druid.html), we introduced you to the most basic Druid setup: a single realtime node. We streamed in some data and queried it. Realtime nodes collect very recent data and periodically hand that data off to the rest of the Druid cluster. Some questions about the architecture must naturally come to mind. What does the rest of Druid cluster look like? How does Druid load available static data?
This tutorial will hopefully answer these questions!
In this tutorial, we will set up other types of Druid nodes as well as the external dependencies for a fully functional Druid cluster. The architecture of Druid is very much like the [Megazord](http://www.youtube.com/watch?v=7mQuHh1X4H4) from the popular 90s show Mighty Morphin' Power Rangers. Each Druid node has a specific purpose and the nodes come together to form a fully functional system.
## Downloading Druid ##
## Downloading Druid
If you followed the first tutorial, you should already have Druid downloaded. If not, let's go back and do that first.
@ -20,15 +20,15 @@ tar -zxvf druid-services-*-bin.tar.gz
cd druid-services-*
```
You can also [Build From Source](Build-From-Source.html).
You can also [Build From Source](Build-from-source.html).
## External Dependencies ##
## External Dependencies
Druid requires 3 external dependencies. A "deep" storage that acts as a backup data repository, a relational database such as MySQL to hold configuration and metadata information, and [Apache Zookeeper](http://zookeeper.apache.org/) for coordination among different pieces of the cluster.
For deep storage, we have made a public S3 bucket (static.druid.io) available where data for this particular tutorial can be downloaded. More on the data [later](Tutorial-Part-2.html#the-data).
For deep storage, we have made a public S3 bucket (static.druid.io) available where data for this particular tutorial can be downloaded. More on the data later.
### Setting up MySQL ###
#### Setting up MySQL
1. If you don't already have it, download MySQL Community Server here: [http://dev.mysql.com/downloads/mysql/](http://dev.mysql.com/downloads/mysql/)
2. Install MySQL
@ -43,7 +43,7 @@ GRANT ALL ON druid.* TO 'druid'@'localhost' IDENTIFIED BY 'diurd';
CREATE database druid;
```
### Setting up Zookeeper ###
#### Setting up Zookeeper
```bash
curl http://www.motorlogy.com/apache/zookeeper/zookeeper-3.4.5/zookeeper-3.4.5.tar.gz -o zookeeper-3.4.5.tar.gz
@ -54,9 +54,9 @@ cp conf/zoo_sample.cfg conf/zoo.cfg
cd ..
```
## The Data ##
## The Data
Similar to the first tutorial, the data we will be loading is based on edits that have occurred on Wikipedia. Every time someone edits a page in Wikipedia, metadata is generated about the editor and edited page. Druid collects each individual event and packages them together in a container known as a [segment](https://github.com/metamx/druid/wiki/Segments). Segments contain data over some span of time. We've prebuilt a segment for this tutorial and will cover making your own segments in other [pages](Loading-Your-Data.html).The segment we are going to work with has the following format:
Similar to the first tutorial, the data we will be loading is based on edits that have occurred on Wikipedia. Every time someone edits a page in Wikipedia, metadata is generated about the editor and edited page. Druid collects each individual event and packages them together in a container known as a [segment](Segments.html). Segments contain data over some span of time. We've prebuilt a segment for this tutorial and will cover making your own segments in other [pages](Tutorial%3A-Loading-Your-Data-Part-1.html). The segment we are going to work with has the following format:
Dimensions (things to filter on):
@ -84,28 +84,28 @@ Metrics (things to aggregate over):
"deleted"
```
## The Cluster ##
## The Cluster
Let's start up a few nodes and download our data. First things though, let's create a config directory where we will store configs for our various nodes:
Let's start up a few nodes and download our data. First things first, though: let's make sure we have a config directory where we will store configs for our various nodes:
```
mkdir config
ls config
```
If you are interested in learning more about Druid configuration files, check out this [link](Configuration.html). Many aspects of Druid are customizable. For the purposes of this tutorial, we are going to use default values for most things.
### Start a Coordinator Node ###
#### Start a Coordinator Node
Coordinator nodes are in charge of load assignment and distribution. Coordinator nodes monitor the status of the cluster and command historical nodes to assign and drop segments.
For more information about coordinator nodes, see [here](Coordinator.html).
To create the coordinator config file:
The coordinator config file should already exist at:
```
mkdir config/coordinator
config/coordinator
```
Under the directory we just created, create the file `runtime.properties` with the following contents if it does not exist:
In the directory, there should be a `runtime.properties` file with the following contents:
```
druid.host=localhost
@ -130,18 +130,18 @@ To start the coordinator node:
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/coordinator io.druid.cli.Main server coordinator
```
### Start a historical node ###
#### Start a Historical Node
Historical nodes are the workhorses of a cluster and are in charge of loading historical segments and making them available for queries. Our Wikipedia segment will be downloaded by a historical node.
For more information about Historical nodes, see [here](Historical.html).
To create the historical config file:
The historical config file should exist at:
```
mkdir config/historical
config/historical
```
Under the directory we just created, create the file `runtime.properties` with the following contents:
In the directory, there should be a `runtime.properties` file with the following contents:
```
druid.host=localhost
@ -167,18 +167,18 @@ To start the historical node:
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/historical io.druid.cli.Main server historical
```
### Start a Broker Node ###
#### Start a Broker Node
Broker nodes are responsible for figuring out which historical and/or realtime nodes correspond to which queries. They also merge partial results from these nodes in a scatter/gather fashion.
For more information about Broker nodes, see [here](Broker.html).
To create the broker config file:
The broker config file should exist at:
```
mkdir config/broker
config/broker
```
Under the directory we just created, create the file ```runtime.properties``` with the following contents:
In the directory, there should be a `runtime.properties` file with the following contents:
```
druid.host=localhost
@ -194,7 +194,7 @@ To start the broker node:
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/broker io.druid.cli.Main server broker
```
## Loading the Data ##
## Loading the Data
The MySQL dependency we introduced earlier contains a 'segments' table with entries for segments that should be loaded into our cluster. The Druid coordinator compares this table with segments that already exist in the cluster to determine what should be loaded and dropped. To load our wikipedia segment, we need to create an entry in our MySQL segment table.
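If you are curious what the coordinator sees, you can peek at the segment table directly. This is only a sketch: it assumes the MySQL setup from earlier and the default `druid_segments` table name, and column names can vary between versions:

```bash
# Show which segments the cluster knows about and whether they are marked as used.
mysql -u druid -pdiurd druid -e "SELECT id, dataSource, used FROM druid_segments;"
```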
@ -220,7 +220,8 @@ When the segment completes downloading and ready for queries, you should see the
At this point, we can query the segment. For more information on querying, see this [link](Querying.html).
## Next Steps ##
Next Steps
----------
Now that you have an understanding of what the Druid clsuter looks like, why not load some of your own data?
Check out the [Loading Your Own Data](Loading-Your-Data.html) section for more info!
Now that you have an understanding of what the Druid cluster looks like, why not load some of your own data?
Check out the next [tutorial](Tutorial%3A-Loading-Your-Data-Part-1.html) section for more info!
View File
@ -80,7 +80,7 @@ Okay, things are about to get real. To query the real-time node you've spun up,
./run_example_client.sh
```
Select "webstream" once again. This script issues [GroupByQuery](GroupByQuery.html)s to the data we've been ingesting. The query looks like this:
Select "webstream" once again. This script issues [GroupByQueries](GroupByQuery.html) to the data we've been ingesting. The query looks like this:
```json
{
@ -304,15 +304,9 @@ You should see an answer to our question. For my stream, it looks like this:
Feel free to tweak other query parameters to answer other questions you may have about the data.
Next Steps
----------
Want to know even more about the Druid Cluster? Check out [Tutorial: The Druid Cluster](Tutorial:-The-Druid-Cluster.html).
Druid is even more fun if you load your own data into it! To learn how to load your data, see [Loading Your Data](Loading-Your-Data.html).
Additional Information
----------------------
This tutorial is merely showcasing a small fraction of what Druid can do. If you are interested in more information about Druid, including setting up a more sophisticated Druid cluster, please read the other links in our wiki.
And thus concludes our journey! Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our [google groups page](http://www.groups.google.com/forum/#!forum/druid-development).
And thus concludes our journey! Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our [google groups page](https://groups.google.com/forum/#!forum/druid-development).
View File
@ -322,6 +322,6 @@ Feel free to tweak other query parameters to answer other questions you may have
h2. Additional Information
This tutorial is merely showcasing a small fraction of what Druid can do. Next, continue on to "Loading Your Data":./Loading-Your-Data.html.
This tutorial is merely showcasing a small fraction of what Druid can do. Next, continue on to "The Druid Cluster":./Tutorial:-The-Druid-Cluster.html.
And thus concludes our journey! Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our "google groups page":http://www.groups.google.com/forum/#!forum/druid-development.
View File
@ -10,18 +10,27 @@
Getting Started
* [Tutorial: A First Look at Druid](Tutorial:-A-First-Look-at-Druid.html)
* [Tutorial: The Druid Cluster](Tutorial:-The-Druid-Cluster.html)
* [Loading Your Data](Loading-Your-Data.html)
* [Querying Your Data](Querying-Your-Data.html)
* [Booting a Production Cluster](Booting-a-Production-Cluster.html)
* [Examples](Examples.html)
* [Cluster Setup](Cluster-Setup.html)
* [Configuration](Configuration.html)
* [Tutorial: Loading Your Data Part 1](Tutorial:-Loading-Your-Data-Part-1.html)
* [Tutorial: Loading Your Data Part 2](Tutorial:-Loading-Your-Data-Part-2.html)
* [Tutorial: All About Queries](Tutorial:-All-About-Queries.html)
--------------------------------------
Evaluate Druid
* [Cluster Setup](Cluster-setup.html)
* [Booting a Production Cluster](Booting-a-production-cluster.html)
--------------------------------------
Configuration
* [Configuration](Configuration.html)
-------------------------------------
Data Ingestion
* [Realtime](Realtime.html)
* [Batch|Batch Ingestion](Batch|Batch-Ingestion.html)
* [Batch Ingestion](Batch-ingestion.html)
* [Indexing Service](Indexing-Service.html)
*** [Tasks](Tasks.html)
----------------------------
Querying
View File
@ -12,16 +12,22 @@ h1. Contents
h2. Getting Started
* "Tutorial: A First Look at Druid":./Tutorial:-A-First-Look-at-Druid.html
* "Tutorial: The Druid Cluster":./Tutorial:-The-Druid-Cluster.html
* "Loading Your Data":./Loading-Your-Data.html
* "Querying Your Data":./Querying-your-data.html
* "Tutorial: Loading Your Data Part 1":./Tutorial:-Loading-Your-Data-Part-1.html
* "Tutorial: Loading Your Data Part 2":./Tutorial:-Loading-Your-Data-Part-2.html
* "Tutorial: All About Queries":./Tutorial:-All-About-Queries.html
h2. Evaluate Druid
* "Cluster Setup":./Cluster-setup.html
* "Booting a Production Cluster":./Booting-a-production-cluster.html
* "Examples":./Examples.html
h2. Configuration
* "Configuration":Configuration.html
h2. Data Ingestion
* "Realtime":./Realtime.html
* "Batch":./Batch-ingestion.html
* "Indexing Service":./Indexing-Service.html
** "Tasks":./Tasks.html
h2. Querying
* "Querying":./Querying.html
View File
@ -1,22 +1,24 @@
[
{
"schema": {
"dataSource": "druidtest",
"aggregators": [
{
"type": "count",
"name": "impressions"
},
{
"type": "doubleSum",
"name": "wp",
"fieldName": "wp"
}
],
"indexGranularity": "minute",
"shardSpec": {
"type": "none"
}
"dataSource": "wikipedia",
"aggregators" : [{
"type" : "count",
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
}, {
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
}, {
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
}],
"indexGranularity": "none"
},
"config": {
"maxRowsInMemory": 500000,
@ -29,23 +31,20 @@
"zk.connectiontimeout.ms": "15000",
"zk.sessiontimeout.ms": "15000",
"zk.synctime.ms": "5000",
"groupid": "topic-pixel-local",
"groupid": "druid-example",
"fetch.size": "1048586",
"autooffset.reset": "largest",
"autocommit.enable": "false"
},
"feed": "druidtest",
"feed": "wikipedia",
"parser": {
"timestampSpec": {
"column": "utcdt",
"format": "iso"
"column": "timestamp"
},
"data": {
"format": "json"
},
"dimensionExclusions": [
"wp"
]
"format": "json",
"dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"]
}
}
},
"plumber": {
View File
@ -1,5 +1,5 @@
{"timestamp": "2013-08-31T01:02:33Z", "page": "Gypsy Danger", "language" : "en", "user" : "nuclear", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143}
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Dingo Land", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Vodka Land", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Cantebury", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Oblast", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
{"timestamp": "2013-08-31T11:58:39Z", "page": "Crimson Typhoon", "language" : "zh", "user" : "triplets", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"China", "region":"Shanxi", "city":"Taiyuan", "added": 905, "deleted": 5, "delta": 900}
{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "cancer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9}
View File
@ -0,0 +1,49 @@
{
"dataSource": "wikipedia",
"timestampColumn": "timestamp",
"timestampFormat": "iso",
"dataSpec": {
"format": "json",
"dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"]
},
"granularitySpec" : {
"type" : "uniform",
"gran" : "DAY",
"intervals" : [ "2013-08-31/2013-09-01" ]
},
"pathSpec": {
"type": "static",
"paths": "examples/indexing/wikipedia_data.json"
},
"rollupSpec": {
"aggs": [{
"type" : "count",
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
}, {
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
}, {
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
}],
"rollupGranularity": "none"
},
"workingPath": "\/tmp\/working_path",
"segmentOutputPath": "\/tmp\/segments",
"partitionsSpec": {
"targetPartitionSize": 5000000
},
"updaterJobSpec": {
"type": "db",
"connectURI": "jdbc:mysql:\/\/localhost:3306\/druid",
"user": "druid",
"password": "diurd",
"segmentTable": "druid_segments"
}
}
View File
@ -0,0 +1,41 @@
{
"type" : "index_hadoop",
"config": {
"dataSource" : "wikipedia",
"timestampColumn" : "timestamp",
"timestampFormat" : "auto",
"dataSpec" : {
"format" : "json",
"dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"]
},
"granularitySpec" : {
"type" : "uniform",
"gran" : "DAY",
"intervals" : [ "2013-08-31/2013-09-01" ]
},
"pathSpec" : {
"type" : "static",
"paths" : "examples/indexing/wikipedia_data.json"
},
"targetPartitionSize" : 5000000,
"rollupSpec" : {
"aggs": [{
"type" : "count",
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
}, {
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
}, {
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
}],
"rollupGranularity" : "none"
}
}
}
View File
@ -8,7 +8,7 @@
},
"aggregators" : [{
"type" : "count",
"name" : "edit_count"
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",
View File
@ -12,8 +12,8 @@
},
{
"type":"longSum",
"fieldName":"edit_count",
"name":"count"
"fieldName":"count",
"name":"edit_count"
}
],
"filter":{
View File
@ -11,6 +11,4 @@ druid.db.connector.connectURI=jdbc\:mysql\://localhost\:3306/druid
druid.db.connector.user=druid
druid.db.connector.password=diurd
druid.realtime.specFile=config/realtime/realtime.spec
druid.processing.buffer.sizeBytes=10000000