mirror of https://github.com/apache/druid.git
commit
38e9ff5984
|
@ -23,6 +23,7 @@ import com.google.common.base.Supplier;
|
|||
import com.google.common.base.Throwables;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.google.inject.Inject;
|
||||
import com.metamx.common.ISE;
|
||||
import com.metamx.common.concurrent.ScheduledExecutors;
|
||||
import com.metamx.common.lifecycle.LifecycleStart;
|
||||
import com.metamx.common.lifecycle.LifecycleStop;
|
||||
|
@ -38,6 +39,7 @@ import org.skife.jdbi.v2.tweak.ResultSetMapper;
|
|||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
|
@ -76,7 +78,7 @@ public class ConfigManager
|
|||
final String configTable = dbTables.get().getConfigTable();
|
||||
|
||||
this.selectStatement = String.format("SELECT payload FROM %s WHERE name = :name", configTable);
|
||||
insertStatement = String.format(
|
||||
this.insertStatement = String.format(
|
||||
"INSERT INTO %s (name, payload) VALUES (:name, :payload) ON DUPLICATE KEY UPDATE payload = :payload",
|
||||
configTable
|
||||
);
|
||||
|
@ -186,19 +188,29 @@ public class ConfigManager
|
|||
@Override
|
||||
public byte[] withHandle(Handle handle) throws Exception
|
||||
{
|
||||
return handle.createQuery(selectStatement)
|
||||
.bind("name", key)
|
||||
.map(
|
||||
new ResultSetMapper<byte[]>()
|
||||
{
|
||||
@Override
|
||||
public byte[] map(int index, ResultSet r, StatementContext ctx) throws SQLException
|
||||
{
|
||||
return r.getBytes("payload");
|
||||
}
|
||||
}
|
||||
)
|
||||
.first();
|
||||
List<byte[]> matched = handle.createQuery(selectStatement)
|
||||
.bind("name", key)
|
||||
.map(
|
||||
new ResultSetMapper<byte[]>()
|
||||
{
|
||||
@Override
|
||||
public byte[] map(int index, ResultSet r, StatementContext ctx)
|
||||
throws SQLException
|
||||
{
|
||||
return r.getBytes("payload");
|
||||
}
|
||||
}
|
||||
).list();
|
||||
|
||||
if (matched.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (matched.size() > 1) {
|
||||
throw new ISE("Error! More than one matching entry[%d] found for [%s]?!", matched.size(), key);
|
||||
}
|
||||
|
||||
return matched.get(0);
|
||||
}
|
||||
}
|
||||
);
|
||||
|
|
|
@ -11,9 +11,6 @@
|
|||
<a href="mailto:info@druid.io">info@druid.io</a>
|
||||
</address>
|
||||
<address>
|
||||
<strong>Metamarkets</strong>
|
||||
625 2nd Street, Suite #230<br>
|
||||
San Francisco, CA 94017<br>
|
||||
<div class="soc">
|
||||
<a href="https://twitter.com/druidio"></a>
|
||||
<a href="https://github.com/metamx/druid" class="github"></a>
|
||||
|
@ -25,7 +22,7 @@
|
|||
<li><a href="/"><strong>DRUID</strong></a></li>
|
||||
<li><a href="/druid.html">What is Druid?</a></li>
|
||||
<li><a href="/downloads.html">Downloads</a></li>
|
||||
<li><a target="_blank" href="https://github.com/metamx/druid/wiki">Documentation</a></li>
|
||||
<li><a target="_blank" href="Home.html">Documentation</a></li>
|
||||
</ul>
|
||||
<ul class="col-md-4 list-unstyled">
|
||||
<li><a href="/community.html"><strong>SUPPORT</strong></a></li>
|
||||
|
|
|
@ -3,7 +3,7 @@ layout: doc_page
|
|||
---
|
||||
# Booting a Single Node Cluster #
|
||||
|
||||
[Loading Your Data](Loading-Your-Data.html) and [Querying Your Data](Querying-Your-Data.html) contain recipes to boot a small druid cluster on localhost. Here we will boot a small cluster on EC2. You can checkout the code, or download a tarball from [here](http://static.druid.io/artifacts/druid-services-0.6.0-bin.tar.gz).
|
||||
[Loading Your Data](Tutorial%3A-Loading-Your-Data-Part-2.html) and [All About Queries](Tutorial%3A-All-About-Queries.html) contain recipes to boot a small druid cluster on localhost. Here we will boot a small cluster on EC2. You can checkout the code, or download a tarball from [here](http://static.druid.io/artifacts/druid-services-0.6.0-bin.tar.gz).
|
||||
|
||||
The [ec2 run script](https://github.com/metamx/druid/blob/master/examples/bin/run_ec2.sh), run_ec2.sh, is located at 'examples/bin' if you have checked out the code, or at the root of the project if you've downloaded a tarball. The scripts rely on the [Amazon EC2 API Tools](http://aws.amazon.com/developertools/351), and you will need to set three environment variables:
|
||||
|
||||
|
|
|
@ -0,0 +1,105 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
A Druid cluster consists of various node types that need to be set up depending on your use case. See our [[Design]] docs for a description of the different node types.
|
||||
|
||||
h2. Setup Scripts
|
||||
|
||||
One of our community members, "housejester":https://github.com/housejester/, contributed some scripts to help with setting up a cluster. Check out the "github":https://github.com/housejester/druid-test-harness and "wiki":https://github.com/housejester/druid-test-harness/wiki/Druid-Test-Harness.
|
||||
|
||||
h2. Minimum Physical Layout: Absolute Minimum
|
||||
|
||||
As a special case, the absolute minimum setup is one of the standalone examples for realtime ingestion and querying (see [[Examples]]), which can easily run on one machine with one core and 1GB RAM. This layout can be set up to try some basic queries with Druid.
|
||||
|
||||
h2. Minimum Physical Layout: Experimental Testing with 4GB of RAM
|
||||
|
||||
This layout can be used to load some data from deep storage onto a Druid compute node for the first time. A minimal physical layout for a 1 or 2 core machine with 4GB of RAM is:
|
||||
|
||||
# node1: [[Master]] + metadata service + zookeeper + [[Compute]]
|
||||
# transient nodes: indexer
|
||||
|
||||
This setup is only reasonable to prove that a configuration works. It would not be worthwhile to use this layout for performance measurement.
|
||||
|
||||
h2. Comfortable Physical Layout: Pilot Project with Multiple Machines
|
||||
|
||||
_The machine size "flavors" use AWS/EC2 terminology for descriptive purposes only and are not meant to imply that AWS/EC2 is required or recommended. Another cloud provider or your own hardware can also work._
|
||||
|
||||
A minimal physical layout, not constrained by cores, that demonstrates parallel querying and realtime ingestion, using AWS/EC2 "small"/m1.small (one core, 1.7GB of RAM) or larger, is:
|
||||
|
||||
# node1: [[Master]] (m1.small)
|
||||
# node2: metadata service (m1.small)
|
||||
# node3: zookeeper (m1.small)
|
||||
# node4: [[Broker]] (m1.small or m1.medium or m1.large)
|
||||
# node5: [[Compute]] (m1.small or m1.medium or m1.large)
|
||||
# node6: [[Compute]] (m1.small or m1.medium or m1.large)
|
||||
# node7: [[Realtime]] (m1.small or m1.medium or m1.large)
|
||||
# transient nodes: indexer
|
||||
|
||||
This layout naturally lends itself to adding more RAM and cores to Compute nodes, and to adding many more Compute nodes. Depending on the actual load, the Master, metadata server, and Zookeeper might need to use larger machines.
|
||||
|
||||
h2. High Availability Physical Layout
|
||||
|
||||
_The machine size "flavors" use AWS/EC2 terminology for descriptive purposes only and are not meant to imply that AWS/EC2 is required or recommended. Another cloud provider or your own hardware can also work._
|
||||
|
||||
An HA layout allows full rolling restarts and heavy volume:
|
||||
|
||||
# node1: [[Master]] (m1.small or m1.medium or m1.large)
|
||||
# node2: [[Master]] (m1.small or m1.medium or m1.large) (backup)
|
||||
# node3: metadata service (c1.medium or m1.large)
|
||||
# node4: metadata service (c1.medium or m1.large) (backup)
|
||||
# node5: zookeeper (c1.medium)
|
||||
# node6: zookeeper (c1.medium)
|
||||
# node7: zookeeper (c1.medium)
|
||||
# node8: [[Broker]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge)
|
||||
# node9: [[Broker]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge) (backup)
|
||||
# node10: [[Compute]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge)
|
||||
# node11: [[Compute]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge)
|
||||
# node12: [[Realtime]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge)
|
||||
# transient nodes: indexer
|
||||
|
||||
h2. Sizing for Cores and RAM
|
||||
|
||||
The Compute and Broker nodes will use as many cores as are available, depending on usage, so it is best to keep these on dedicated machines. The upper limit of effectively utilized cores is not well characterized yet and would depend on the types of queries, the query load, and the schema. Compute daemons should have a heap size of at least 1GB per core for normal usage, but could be squeezed into a smaller heap for testing. Since in-memory caching is essential for good performance, even more RAM is better. Broker nodes will use RAM for caching, so they do more than just route queries.
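As a rough illustration of the 1GB-per-core guideline (a sketch only: the 4GB heap and the classpath layout borrowed from the Build/Run section below are assumptions, not a tested configuration), a Compute daemon on a 4-core machine could be started with a 4GB heap:

<pre>
<code>
java -Xmx4g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -cp compute/:druid-services/target/druid-services-*-selfcontained.jar com.metamx.druid.http.ComputeMain
</code>
</pre>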
|
||||
|
||||
The effective utilization of cores by Zookeeper, MySQL, and Master nodes is likely to be between 1 and 2 for each process/daemon, so these could potentially share a machine with lots of cores. These daemons work with a heap size between 500MB and 1GB.
|
||||
|
||||
h2. Storage
|
||||
|
||||
Indexed segments should be kept in a permanent store accessible by all nodes, such as AWS S3, HDFS, or an equivalent. Currently Druid supports S3, but this will be extended soon.
|
||||
|
||||
Local disk ("ephemeral" on AWS EC2) for caching is recommended over network mounted storage (example of mounted: AWS EBS, Elastic Block Store) in order to avoid network delays during times of heavy usage. If your data center is suitably provisioned for networked storage, perhaps with separate LAN/NICs just for storage, then mounted might work fine.
|
||||
|
||||
h2. Setup
|
||||
|
||||
Setting up a cluster is essentially just firing up all of the nodes you want with the proper [[configuration]]. One thing to be aware of is that there are a few properties in the configuration that potentially need to be set individually for each process:
|
||||
|
||||
<pre>
|
||||
<code>
|
||||
druid.server.type=historical|realtime
|
||||
druid.host=someHostOrIPaddrWithPort
|
||||
druid.port=8080
|
||||
</code>
|
||||
</pre>
|
||||
|
||||
@druid.server.type@ should be set to "historical" for your compute nodes and "realtime" for the realtime nodes. The master will only assign segments to a "historical" node and the broker has some intelligence around its ability to cache results when talking to a realtime node. This does not need to be set for the master or the broker.
|
||||
|
||||
@druid.host@ should be set to the hostname and port that can be used to talk to the given server process. Basically, someone should be able to send a request to http://${druid.host}/ and actually talk to the process.
|
||||
|
||||
@druid.port@ should be set to the port that the server should listen on. In the vast majority of cases, this port should be the same as what is on @druid.host@.
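For instance (an illustrative sketch only; the host names are made up), a compute process and a realtime process on different machines might use:

<pre>
<code>
# on the compute machine
druid.server.type=historical
druid.host=compute1.example.com:8080
druid.port=8080

# on the realtime machine
druid.server.type=realtime
druid.host=realtime1.example.com:8080
druid.port=8080
</code>
</pre>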
|
||||
|
||||
h2. Build/Run
|
||||
|
||||
The simplest way to build and run from the repository is to run @mvn package@ from the base directory and then take @druid-services/target/druid-services-*-selfcontained.jar@ and push that around to your machines. The jar does not need to be expanded, and since it contains the main() methods for each kind of service, it is *not* invoked with java -jar. It can be run from a normal java command line by including it on the classpath and giving it the main class that you want to run. For example, one instance of the Compute node/service can be started like this:
|
||||
|
||||
<pre>
|
||||
<code>
|
||||
java -Duser.timezone=UTC -Dfile.encoding=UTF-8 -cp compute/:druid-services/target/druid-services-*-selfcontained.jar com.metamx.druid.http.ComputeMain
|
||||
</code>
|
||||
</pre>
|
||||
The following table shows the possible services and fully qualified class for main().
|
||||
|
||||
|_. service |_. main class |
|
||||
| [[ Realtime ]] | com.metamx.druid.realtime.RealtimeMain |
|
||||
| [[ Master ]] | com.metamx.druid.http.MasterMain |
|
||||
| [[ Broker ]] | com.metamx.druid.http.BrokerMain |
|
||||
| [[ Compute ]] | com.metamx.druid.http.ComputeMain |
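The other services are launched the same way by swapping in the corresponding main class from the table. As a sketch (the per-service config directory on the classpath, broker/ here, is an assumption mirroring the Compute example above), a Broker could be started like this:

<pre>
<code>
java -Duser.timezone=UTC -Dfile.encoding=UTF-8 -cp broker/:druid-services/target/druid-services-*-selfcontained.jar com.metamx.druid.http.BrokerMain
</code>
</pre>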
|
|
@ -2,9 +2,8 @@
|
|||
layout: doc_page
|
||||
---
|
||||
|
||||
Druid is an open-source analytics datastore designed for realtime, exploratory, queries on large-scale data sets (100’s of Billions entries, 100’s TB data). Druid provides for cost effective, always-on, realtime data ingestion and arbitrary data exploration.
|
||||
Druid is an open-source analytics data store designed for real-time, exploratory queries on large-scale data sets (hundreds of billions of entries, hundreds of terabytes of data). Druid provides cost-effective, always-on, real-time data ingestion and arbitrary data exploration.
|
||||
|
||||
- Check out some [Examples](Examples.html)
|
||||
- Try out Druid with our Getting Started [Tutorial](./Tutorial%3A-A-First-Look-at-Druid.html)
|
||||
- Learn more by reading the [White Paper](http://static.druid.io/docs/druid.pdf)
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ The indexing service is a highly-available, distributed service that runs indexi
|
|||
The indexing service is composed of three main components: a peon component that can run a single task, a middle manager component that manages peons, and an overlord component that manages task distribution to middle managers.
|
||||
Overlords and middle managers may run on the same node or across multiple nodes, while middle managers and peons always run on the same node.
|
||||
|
||||
Most Basic Getting Started Configuration
|
||||
Quick Start
|
||||
----------------------------------------
|
||||
Run:
|
||||
|
||||
|
@ -149,7 +149,7 @@ http://<COORDINATOR_IP>:<port>/druid/indexer/v1/worker/setup
|
|||
|
||||
A sample worker setup spec is shown below:
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"minVersion":"some_version",
|
||||
"minNumWorkers":"0",
|
||||
|
|
|
@ -1,293 +0,0 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
# Setup #
|
||||
|
||||
Before we start querying druid, we're going to finish setting up a complete cluster on localhost. In [Loading Your Data](Loading-Your-Data.html) we setup a [Realtime](Realtime.html), [Historical](Historical.html) and [Coordinator](Coordinator.html) node. If you've already completed that tutorial, you need only follow the directions for 'Booting a Broker Node'.
|
||||
|
||||
## Booting a Broker Node ##
|
||||
|
||||
1. Setup a config file at config/broker/runtime.properties that looks like this:
|
||||
|
||||
```
|
||||
druid.host=localhost
|
||||
druid.service=broker
|
||||
druid.port=8080
|
||||
|
||||
druid.zk.service.host=localhost
|
||||
|
||||
```
|
||||
|
||||
2. Run the broker node:
|
||||
|
||||
```bash
|
||||
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/broker io.druid.cli.Main server broker
|
||||
```
|
||||
|
||||
With the Broker node and the other Druid nodes types up and running, you have a fully functional Druid Cluster and are ready to query your data!
|
||||
|
||||
# Querying Your Data #
|
||||
|
||||
Now that we have a complete cluster setup on localhost, we need to load data. To do so, refer to [Loading Your Data](Loading-Your-Data.html). Having done that, its time to query our data! For a complete specification of queries, see [Querying](Querying.html).
|
||||
|
||||
## Querying Different Nodes ##
|
||||
|
||||
As a shared-nothing system, there are three ways to query druid, against the [Realtime](Realtime.html), [Historical](Historical.html) or [Broker](Broker.html) node. Querying a Realtime node returns only realtime data, querying a historical node returns only historical segments. Querying the broker may query both realtime and historical segments and compose an overall result for the query. This is the normal mode of operation for queries in Druid.
|
||||
|
||||
### Construct a Query ###
|
||||
|
||||
For constructing this query, see: Querying against the realtime.spec
|
||||
|
||||
```json
|
||||
{
|
||||
"queryType": "groupBy",
|
||||
"dataSource": "druidtest",
|
||||
"granularity": "all",
|
||||
"dimensions": [],
|
||||
"aggregations": [
|
||||
{"type": "count", "name": "rows"},
|
||||
{"type": "longSum", "name": "imps", "fieldName": "impressions"},
|
||||
{"type": "doubleSum", "name": "wp", "fieldName": "wp"}
|
||||
],
|
||||
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
|
||||
}
|
||||
```
|
||||
|
||||
### Querying the Realtime Node ###
|
||||
|
||||
Run our query against port 8080:
|
||||
|
||||
```bash
|
||||
curl -X POST "http://localhost:8080/druid/v2/?pretty" -H 'content-type: application/json' -d @query.body
|
||||
```
|
||||
|
||||
See our result:
|
||||
|
||||
```json
|
||||
[ {
|
||||
"version" : "v1",
|
||||
"timestamp" : "2010-01-01T00:00:00.000Z",
|
||||
"event" : { "imps" : 5, "wp" : 15000.0, "rows" : 5 }
|
||||
} ]
|
||||
```
|
||||
|
||||
### Querying the Historical node ###
|
||||
Run the query against port 8082:
|
||||
|
||||
```bash
|
||||
curl -X POST "http://localhost:8082/druid/v2/?pretty" -H 'content-type: application/json' -d @query.body
|
||||
```
|
||||
|
||||
And get (similar to):
|
||||
|
||||
```json
|
||||
[ {
|
||||
"version" : "v1",
|
||||
"timestamp" : "2010-01-01T00:00:00.000Z",
|
||||
"event" : { "imps" : 27, "wp" : 77000.0, "rows" : 9 }
|
||||
} ]
|
||||
```
|
||||
|
||||
### Querying the Broker ###
|
||||
Run the query against port 8083:
|
||||
|
||||
```bash
|
||||
curl -X POST "http://localhost:8083/druid/v2/?pretty" -H 'content-type: application/json' -d @query.body
|
||||
```
|
||||
|
||||
And get:
|
||||
|
||||
```json
|
||||
[ {
|
||||
"version" : "v1",
|
||||
"timestamp" : "2010-01-01T00:00:00.000Z",
|
||||
"event" : { "imps" : 5, "wp" : 15000.0, "rows" : 5 }
|
||||
} ]
|
||||
```
|
||||
|
||||
Now that we know what nodes can be queried (although you should usually use the broker node), lets learn how to know what queries are available.
|
||||
|
||||
## Examining the realtime.spec ##
|
||||
|
||||
How are we to know what queries we can run? Although [Querying](Querying.html) is a helpful index, to get a handle on querying our data we need to look at our [Realtime](Realtime.html) node's realtime.spec file:
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"schema": {
|
||||
"dataSource": "druidtest",
|
||||
"aggregators": [
|
||||
{
|
||||
"type": "count",
|
||||
"name": "impressions"
|
||||
},
|
||||
{
|
||||
"type": "doubleSum",
|
||||
"name": "wp",
|
||||
"fieldName": "wp"
|
||||
}
|
||||
],
|
||||
"indexGranularity": "minute",
|
||||
"shardSpec": {
|
||||
"type": "none"
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"maxRowsInMemory": 500000,
|
||||
"intermediatePersistPeriod": "PT10m"
|
||||
},
|
||||
"firehose": {
|
||||
"type": "kafka-0.7.2",
|
||||
"consumerProps": {
|
||||
"zk.connect": "localhost:2181",
|
||||
"zk.connectiontimeout.ms": "15000",
|
||||
"zk.sessiontimeout.ms": "15000",
|
||||
"zk.synctime.ms": "5000",
|
||||
"groupid": "topic-pixel-local",
|
||||
"fetch.size": "1048586",
|
||||
"autooffset.reset": "largest",
|
||||
"autocommit.enable": "false"
|
||||
},
|
||||
"feed": "druidtest",
|
||||
"parser": {
|
||||
"timestampSpec": {
|
||||
"column": "utcdt",
|
||||
"format": "iso"
|
||||
},
|
||||
"data": {
|
||||
"format": "json"
|
||||
},
|
||||
"dimensionExclusions": [
|
||||
"wp"
|
||||
]
|
||||
}
|
||||
},
|
||||
"plumber": {
|
||||
"type": "realtime",
|
||||
"windowPeriod": "PT10m",
|
||||
"segmentGranularity": "hour",
|
||||
"basePersistDirectory": "\/tmp\/realtime\/basePersist",
|
||||
"rejectionPolicy": {
|
||||
"type": "messageTime"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
### dataSource ###
|
||||
|
||||
```json
|
||||
"dataSource":"druidtest"
|
||||
```
|
||||
|
||||
Our dataSource tells us the name of the relation/table, or 'source of data', to query in both our realtime.spec and query.body!
|
||||
|
||||
### aggregations ###
|
||||
|
||||
Note the [Aggregations](Aggregations.html) in our query:
|
||||
|
||||
```json
|
||||
"aggregations": [
|
||||
{"type": "count", "name": "rows"},
|
||||
{"type": "longSum", "name": "imps", "fieldName": "impressions"},
|
||||
{"type": "doubleSum", "name": "wp", "fieldName": "wp"}
|
||||
],
|
||||
```
|
||||
|
||||
this matches up to the aggregators in the schema of our realtime.spec!
|
||||
|
||||
```json
|
||||
"aggregators":[ {"type":"count", "name":"impressions"},
|
||||
{"type":"doubleSum","name":"wp","fieldName":"wp"}],
|
||||
```
|
||||
|
||||
### dimensions ###
|
||||
|
||||
Lets look back at our actual records (from [Loading Your Data](Loading Your Data.html)):
|
||||
|
||||
```json
|
||||
{"utcdt": "2010-01-01T01:01:01", "wp": 1000, "gender": "male", "age": 100}
|
||||
{"utcdt": "2010-01-01T01:01:02", "wp": 2000, "gender": "female", "age": 50}
|
||||
{"utcdt": "2010-01-01T01:01:03", "wp": 3000, "gender": "male", "age": 20}
|
||||
{"utcdt": "2010-01-01T01:01:04", "wp": 4000, "gender": "female", "age": 30}
|
||||
{"utcdt": "2010-01-01T01:01:05", "wp": 5000, "gender": "male", "age": 40}
|
||||
```
|
||||
|
||||
Note that we have two dimensions to our data, other than our primary metric, wp. They are 'gender' and 'age'. We can specify these in our query! Note that we have added a dimension: age, below.
|
||||
|
||||
```json
|
||||
{
|
||||
"queryType": "groupBy",
|
||||
"dataSource": "druidtest",
|
||||
"granularity": "all",
|
||||
"dimensions": ["age"],
|
||||
"aggregations": [
|
||||
{"type": "count", "name": "rows"},
|
||||
{"type": "longSum", "name": "imps", "fieldName": "impressions"},
|
||||
{"type": "doubleSum", "name": "wp", "fieldName": "wp"}
|
||||
],
|
||||
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
|
||||
}
|
||||
```
|
||||
|
||||
Which gets us grouped data in return!
|
||||
|
||||
```json
|
||||
[ {
|
||||
"version" : "v1",
|
||||
"timestamp" : "2010-01-01T00:00:00.000Z",
|
||||
"event" : { "imps" : 1, "age" : "100", "wp" : 1000.0, "rows" : 1 }
|
||||
}, {
|
||||
"version" : "v1",
|
||||
"timestamp" : "2010-01-01T00:00:00.000Z",
|
||||
"event" : { "imps" : 1, "age" : "20", "wp" : 3000.0, "rows" : 1 }
|
||||
}, {
|
||||
"version" : "v1",
|
||||
"timestamp" : "2010-01-01T00:00:00.000Z",
|
||||
"event" : { "imps" : 1, "age" : "30", "wp" : 4000.0, "rows" : 1 }
|
||||
}, {
|
||||
"version" : "v1",
|
||||
"timestamp" : "2010-01-01T00:00:00.000Z",
|
||||
"event" : { "imps" : 1, "age" : "40", "wp" : 5000.0, "rows" : 1 }
|
||||
}, {
|
||||
"version" : "v1",
|
||||
"timestamp" : "2010-01-01T00:00:00.000Z",
|
||||
"event" : { "imps" : 1, "age" : "50", "wp" : 2000.0, "rows" : 1 }
|
||||
} ]
|
||||
```
|
||||
|
||||
### filtering ###
|
||||
|
||||
Now that we've observed our dimensions, we can also filter:
|
||||
|
||||
```json
|
||||
{
|
||||
"queryType": "groupBy",
|
||||
"dataSource": "druidtest",
|
||||
"granularity": "all",
|
||||
"filter": { "type": "selector", "dimension": "gender", "value": "male" },
|
||||
"aggregations": [
|
||||
{"type": "count", "name": "rows"},
|
||||
{"type": "longSum", "name": "imps", "fieldName": "impressions"},
|
||||
{"type": "doubleSum", "name": "wp", "fieldName": "wp"}
|
||||
],
|
||||
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
|
||||
}
|
||||
```
|
||||
|
||||
Which gets us just people aged 40:
|
||||
|
||||
```json
|
||||
[ {
|
||||
"version" : "v1",
|
||||
"timestamp" : "2010-01-01T00:00:00.000Z",
|
||||
"event" : { "imps" : 3, "wp" : 9000.0, "rows" : 3 }
|
||||
} ]
|
||||
```
|
||||
|
||||
Check out [Filters](Filters.html) for more information.
|
||||
|
||||
## Learn More ##
|
||||
|
||||
You can learn more about querying at [Querying](Querying.html)! Now check out [Booting a production cluster](Booting-a-production-cluster.html)!
|
|
@ -12,7 +12,7 @@ Segment Creation Tasks
|
|||
|
||||
The Index Task is a simpler variation of the Index Hadoop task that is designed to be used for smaller data sets. The task executes within the indexing service and does not require an external Hadoop setup to use. The grammar of the index task is as follows:
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"type" : "index",
|
||||
"dataSource" : "example",
|
||||
|
@ -50,7 +50,7 @@ The Index Task is a simpler variation of the Index Hadoop task that is designed
|
|||
|--------|-----------|---------|
|
||||
|type|The task type, this should always be "index".|yes|
|
||||
|id|The task ID.|no|
|
||||
|granularitySpec|See [granularitySpec](Tasks.html#Granularity-Spec)|yes|
|
||||
|granularitySpec|See [granularitySpec](Tasks.html)|yes|
|
||||
|spatialDimensions|Dimensions to build spatial indexes over. See [Spatial-Indexing](Spatial-Indexing.html)|no|
|
||||
|aggregators|The metrics to aggregate in the data set. For more info, see [Aggregations](Aggregations.html)|yes|
|
||||
|indexGranularity|The rollup granularity for timestamps.|no|
|
||||
|
@ -78,10 +78,10 @@ The Hadoop Index Task is used to index larger data sets that require the paralle
|
|||
|
||||
The indexing service can also run real-time tasks. These tasks effectively transform a middle manager into a real-time node. We introduced real-time tasks as a way to programmatically add new real-time data sources without needing to manually add nodes. The grammar for the real-time task is as follows:
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"type" : "index_realtime",
|
||||
"id": "example,
|
||||
"id": "example",
|
||||
"resource": {
|
||||
"availabilityGroup" : "someGroup",
|
||||
"requiredCapacity" : 1
|
||||
|
@ -154,10 +154,10 @@ A JSON object used for high availability purposes. Not required.
|
|||
|requiredCapacity|Integer|How much middle manager capacity this task will take.|yes|
|
||||
|
||||
Schema:
|
||||
See [Schema](Realtime.html#Schema).
|
||||
See [Schema](Realtime.html).
|
||||
|
||||
Fire Department Config:
|
||||
See [Config](Realtime.html#Config).
|
||||
See [Config](Realtime.html).
|
||||
|
||||
Firehose:
|
||||
See [Firehose](Firehose.html).
|
||||
|
@ -178,7 +178,7 @@ Segment Merging Tasks
|
|||
|
||||
Append tasks append a list of segments together into a single segment (one after the other). The grammar is:
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"id": <task_id>,
|
||||
"dataSource": <task_datasource>,
|
||||
|
@ -190,7 +190,7 @@ Append tasks append a list of segments together into a single segment (one after
|
|||
|
||||
Merge tasks merge a list of segments together. Any common timestamps are merged. The grammar is:
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"id": <task_id>,
|
||||
"dataSource": <task_datasource>,
|
||||
|
@ -205,7 +205,7 @@ Segment Destroying Tasks
|
|||
|
||||
Delete tasks create empty segments with no data. The grammar is:
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"id": <task_id>,
|
||||
"dataSource": <task_datasource>,
|
||||
|
@ -217,7 +217,7 @@ Delete tasks create empty segments with no data. The grammar is:
|
|||
|
||||
Kill tasks delete all information about a segment and remove it from deep storage. Killable segments must be disabled (used==0) in the Druid segment table. The available grammar is:
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"id": <task_id>,
|
||||
"dataSource": <task_datasource>,
|
||||
|
@ -232,7 +232,7 @@ Misc. Tasks
|
|||
|
||||
These tasks convert segments from an existing older index version to the latest index version. The available grammar is:
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"id": <task_id>,
|
||||
"groupId" : <task_group_id>,
|
||||
|
@ -246,7 +246,7 @@ These tasks convert segments from an existing older index version to the latest
|
|||
|
||||
These tasks start, sleep for a time, and are used only for testing. The available grammar is:
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"id": <optional_task_id>,
|
||||
"interval" : <optional_segment_interval>,
|
||||
|
|
|
@ -43,12 +43,11 @@ These metrics track the number of characters added, deleted, and changed.
|
|||
Setting Up
|
||||
----------
|
||||
|
||||
There are two ways to setup Druid: download a tarball, or [Build From Source](Build From Source.html). You only need to do one of these.
|
||||
There are two ways to set up Druid: download a tarball, or [Build From Source](Build-from-source.html). You only need to do one of these.
|
||||
|
||||
### Download a Tarball
|
||||
|
||||
We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.0-bin.tar.gz)
|
||||
Download this file to a directory of your choosing.
|
||||
We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.0-bin.tar.gz). Download this file to a directory of your choosing.
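For example, from a shell (any HTTP client works; `curl -O` is just one option):

```bash
curl -O http://static.druid.io/artifacts/releases/druid-services-0.6.0-bin.tar.gz
```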
|
||||
|
||||
You can extract the awesomeness within by issuing:
|
||||
|
||||
|
@ -98,7 +97,7 @@ Okay, things are about to get real-time. To query the real-time node you've spun
|
|||
./run_example_client.sh
|
||||
```
|
||||
|
||||
Select "wikipedia" once again. This script issues [GroupByQuery](GroupByQuery.html)s to the data we've been ingesting. The query looks like this:
|
||||
Select "wikipedia" once again. This script issues [GroupByQueries](GroupByQuery.html) to the data we've been ingesting. The query looks like this:
|
||||
|
||||
```json
|
||||
{
|
||||
|
@ -108,7 +107,7 @@ Select "wikipedia" once again. This script issues [GroupByQuery](GroupByQuery.ht
|
|||
"dimensions":[ "page" ],
|
||||
"aggregations":[
|
||||
{"type":"count", "name":"rows"},
|
||||
{"type":"longSum", "fieldName":"edit_count", "name":"count"}
|
||||
{"type":"longSum", "fieldName":"count", "name":"edit_count"}
|
||||
],
|
||||
"filter":{ "type":"selector", "dimension":"namespace", "value":"article" },
|
||||
"intervals":[ "2013-06-01T00:00/2020-01-01T00" ]
|
||||
|
@ -151,7 +150,7 @@ time_boundary_query.body
|
|||
|
||||
Druid queries are JSON blobs which are relatively painless to create programmatically, but an absolute pain to write by hand. So anyway, we are going to create a Druid query by hand. Add the following to the file you just created:
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"queryType": "timeBoundary",
|
||||
"dataSource": "wikipedia"
|
||||
|
@ -186,7 +185,7 @@ timeseries_query.body
|
|||
|
||||
We are going to make a slightly more complicated query, the [TimeseriesQuery](TimeseriesQuery.html). Copy and paste the following into the file:
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"queryType": "timeseries",
|
||||
"dataSource": "wikipedia",
|
||||
|
@ -221,7 +220,7 @@ Right now all the results you are getting back are being aggregated into a singl
|
|||
|
||||
If you loudly exclaimed "we can change granularity to minute", you are absolutely correct! We can specify different granularities to bucket our results, like so:
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"queryType": "timeseries",
|
||||
"dataSource": "wikipedia",
|
||||
|
@ -267,7 +266,7 @@ group_by_query.body
|
|||
|
||||
and put the following in there:
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"queryType": "groupBy",
|
||||
"dataSource": "wikipedia",
|
||||
|
@ -321,13 +320,13 @@ Feel free to tweak other query parameters to answer other questions you may have
|
|||
Next Steps
|
||||
----------
|
||||
|
||||
What to know even more information about the Druid Cluster? Check out [Tutorial: The Druid Cluster](Tutorial:-The-Druid-Cluster.html)
|
||||
Want to know even more about the Druid cluster? Check out [Tutorial: The Druid Cluster](Tutorial%3A-The-Druid-Cluster.html).
|
||||
|
||||
Druid is even more fun if you load your own data into it! To learn how to load your data, see [Loading Your Data](Loading-Your-Data.html).
|
||||
Druid is even more fun if you load your own data into it! To learn how to load your data, see [Loading Your Data](Tutorial%3A-Loading-Your-Data-Part-1.html).
|
||||
|
||||
Additional Information
|
||||
----------------------
|
||||
|
||||
This tutorial is merely showcasing a small fraction of what Druid can do. If you are interested in more information about Druid, including setting up a more sophisticated Druid cluster, please read the other links in our wiki.
|
||||
|
||||
And thus concludes our journey! Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our [google groups page](http://www.groups.google.com/forum/#!forum/druid-development).
|
||||
And thus concludes our journey! Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our [google groups page](https://groups.google.com/forum/#!forum/druid-development).
|
||||
|
|
|
@ -0,0 +1,197 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
Hello! This tutorial is meant to provide a more in-depth look into Druid queries. The tutorial is somewhat incomplete right now but we hope to add more content to it in the near future.
|
||||
|
||||
Setup
|
||||
-----
|
||||
|
||||
Before we start digging into how to query Druid, make sure you've gone through the other tutorials and are comfortable with spinning up a local cluster and loading data into Druid.
|
||||
|
||||
#### Booting a Druid Cluster
|
||||
|
||||
Let's start up a simple Druid cluster so we can query all the things.
|
||||
|
||||
To start a Coordinator node:
|
||||
|
||||
```bash
|
||||
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/coordinator io.druid.cli.Main server coordinator
|
||||
```
|
||||
|
||||
To start a Historical node:
|
||||
|
||||
```bash
|
||||
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/historical io.druid.cli.Main server historical
|
||||
```
|
||||
|
||||
To start a Broker node:
|
||||
|
||||
```bash
|
||||
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/broker io.druid.cli.Main server broker
|
||||
```
|
||||
|
||||
Querying Your Data
|
||||
------------------
|
||||
|
||||
Make sure you've completed [Loading Your Data](Tutorial%3A-Loading-Your-Data-Part-1.html) so we have some data to query. Having done that, it's time to query our data! For a complete specification of queries, see [Querying](Querying.html).
|
||||
|
||||
#### Construct a Query
|
||||
```json
|
||||
{
|
||||
"queryType": "groupBy",
|
||||
"dataSource": "wikipedia",
|
||||
"granularity": "all",
|
||||
"dimensions": [],
|
||||
"aggregations": [
|
||||
{"type": "count", "name": "rows"},
|
||||
{"type": "longSum", "name": "edit_count", "fieldName": "count"},
|
||||
{"type": "doubleSum", "name": "chars_added", "fieldName": "added"}
|
||||
],
|
||||
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
|
||||
}
|
||||
```
|
||||
|
||||
#### Query That Data
|
||||
Run the query against your broker:
|
||||
|
||||
```bash
|
||||
curl -X POST "http://localhost:8080/druid/v2/?pretty" -H 'Content-type: application/json' -d @query.body
|
||||
```
|
||||
|
||||
And get:
|
||||
|
||||
```json
|
||||
[ {
|
||||
"version" : "v1",
|
||||
"timestamp" : "2010-01-01T00:00:00.000Z",
|
||||
"event" : {
|
||||
"chars_added" : 1545.0,
|
||||
"edit_count" : 5,
|
||||
"rows" : 5
|
||||
}
|
||||
} ]
|
||||
```
|
||||
|
||||
This result tells us that our query matched 5 edits, spread across 5 rows of data. In those 5 edits, a total of 1545 characters were added (the sum of the "added" values across the five sample events).
|
||||
|
||||
#### What can I query for?
|
||||
|
||||
How are we to know what queries we can run? Although [Querying](Querying.html) is a helpful index, to get a handle on querying our data we need to look at our ingestion schema. There are a few particular fields we care about in the ingestion schema. All of these fields should be present in both the real-time ingestion schema and the batch ingestion schema.
|
||||
|
||||
Datasource:
|
||||
|
||||
```json
|
||||
"dataSource":"wikipedia"
|
||||
```
|
||||
|
||||
Our dataSource tells us the name of the relation/table, or 'source of data'. What we decide to name our data source must match the data source we are going to be querying.
|
||||
|
||||
Granularity:
|
||||
|
||||
```json
|
||||
"indexGranularity": "none",
|
||||
```
|
||||
|
||||
Druid will roll up data at ingestion time unless the index/rollup granularity is specified as "none". Your query granularity cannot be lower than your index granularity.
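For example, since the wikipedia data above is indexed with a granularity of "none", we are free to bucket query results at, say, hourly granularity. This is just the earlier groupBy query with one field changed (a sketch for illustration):

```json
{
  "queryType": "groupBy",
  "dataSource": "wikipedia",
  "granularity": "hour",
  "dimensions": [],
  "aggregations": [
    {"type": "longSum", "name": "edit_count", "fieldName": "count"},
    {"type": "doubleSum", "name": "chars_added", "fieldName": "added"}
  ],
  "intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```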
|
||||
|
||||
Aggregators:
|
||||
|
||||
```json
|
||||
"aggregators" : [{
|
||||
"type" : "count",
|
||||
"name" : "count"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "added",
|
||||
"fieldName" : "added"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "deleted",
|
||||
"fieldName" : "deleted"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "delta",
|
||||
"fieldName" : "delta"
|
||||
}]
|
||||
```
|
||||
|
||||
The [Aggregations](Aggregations.html) specified at ingestion time correlate directly to the metrics that can be queried.
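For example, the doubleSum metric ingested above under the name "added" is referenced at query time through `fieldName` (this snippet is taken from the query earlier in this tutorial):

```json
{"type": "doubleSum", "name": "chars_added", "fieldName": "added"}
```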
|
||||
|
||||
Dimensions:
|
||||
|
||||
```json
|
||||
"dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"]
|
||||
```
|
||||
|
||||
These specify the dimensions that we can filter our data on. If we add a dimension to our groupBy query, we get:
|
||||
|
||||
```json
|
||||
{
|
||||
"queryType": "groupBy",
|
||||
"dataSource": "wikipedia",
|
||||
"granularity": "all",
|
||||
"dimensions": ["namespace"],
|
||||
"aggregations": [
|
||||
{"type": "longSum", "name": "edit_count", "fieldName": "count"},
|
||||
{"type": "doubleSum", "name": "chars_added", "fieldName": "added"}
|
||||
],
|
||||
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
|
||||
}
|
||||
```
|
||||
|
||||
Which gets us data grouped over the namespace dimension in return!
|
||||
|
||||
```json
|
||||
[ {
|
||||
"version" : "v1",
|
||||
"timestamp" : "2010-01-01T00:00:00.000Z",
|
||||
"event" : {
|
||||
"chars_added" : 180.0,
|
||||
"edit_count" : 2,
|
||||
"namespace" : "article"
|
||||
}
|
||||
}, {
|
||||
"version" : "v1",
|
||||
"timestamp" : "2010-01-01T00:00:00.000Z",
|
||||
"event" : {
|
||||
"chars_added" : 1365.0,
|
||||
"edit_count" : 3,
|
||||
"namespace" : "wikipedia"
|
||||
}
|
||||
} ]
|
||||
```
|
||||
|
||||
We can also filter our query to narrow down our metric values:
|
||||
|
||||
```json
|
||||
{
|
||||
"queryType": "groupBy",
|
||||
"dataSource": "wikipedia",
|
||||
"granularity": "all",
|
||||
"filter": { "type": "selector", "dimension": "namespace", "value": "article" },
|
||||
"aggregations": [
|
||||
{"type": "longSum", "name": "edit_count", "fieldName": "count"},
|
||||
{"type": "doubleSum", "name": "chars_added", "fieldName": "added"}
|
||||
],
|
||||
"intervals": ["2010-01-01T00:00/2020-01-01T00"]
|
||||
}
|
||||
```
|
||||
|
||||
Which gets us metrics about only those edits where the namespace is 'article':
|
||||
|
||||
```json
|
||||
[ {
|
||||
"version" : "v1",
|
||||
"timestamp" : "2010-01-01T00:00:00.000Z",
|
||||
"event" : {
|
||||
"chars_added" : 180.0,
|
||||
"edit_count" : 2
|
||||
}
|
||||
} ]
|
||||
```
|
||||
|
||||
Check out [Filters](Filters.html) for more information.
|
||||
|
||||
## Learn More ##
|
||||
|
||||
You can learn more about querying at [Querying](Querying.html)! If you are ready to evaluate Druid more in depth, check out [Booting a production cluster](Booting-a-production-cluster.html)!
|
|
@ -1,7 +1,7 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
In our last [tutorial](Tutorial:-The-Druid-Cluster.html), we setup a complete Druid cluster. We created all the Druid dependencies and loaded some batched data. Druid shards data into self-contained chunks known as [segments](Segments.html). Segments are the fundamental unit of storage in Druid and all Druid nodes only understand segments.
|
||||
In our last [tutorial](Tutorial%3A-The-Druid-Cluster.html), we set up a complete Druid cluster. We created all the Druid dependencies and loaded some batched data. Druid shards data into self-contained chunks known as [segments](Segments.html). Segments are the fundamental unit of storage in Druid, and all Druid nodes only understand segments.
|
||||
|
||||
In this tutorial, we will learn about batch ingestion (as opposed to real-time ingestion) and how to create segments using the final piece of the Druid Cluster, the [indexing service](Indexing-Service.html). The indexing service is a standalone service that accepts [tasks](Tasks.html) in the form of POST requests. The output of most tasks is a set of segments.
|
||||
|
||||
|
@ -50,12 +50,12 @@ examples/indexing/wikipedia_data.json
|
|||
|
||||
Open the file and make sure the following events exist:
|
||||
|
||||
```
|
||||
```json
|
||||
{"timestamp": "2013-08-31T01:02:33Z", "page": "Gypsy Danger", "language" : "en", "user" : "nuclear", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143}
|
||||
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Dingo Land", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
|
||||
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Vodka Land", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
|
||||
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Cantebury", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
|
||||
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Oblast", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
|
||||
{"timestamp": "2013-08-31T11:58:39Z", "page": "Crimson Typhoon", "language" : "zh", "user" : "triplets", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"China", "region":"Shanxi", "city":"Taiyuan", "added": 905, "deleted": 5, "delta": 900}
|
||||
{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "cancer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9}
|
||||
{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "stringer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9}
|
||||
```
|
||||
|
||||
There are five data points spread across the day of 2013-08-31. Talk about big data, right? Thankfully, we don't need a ton of data to introduce how batch ingestion works.
|
||||
|
@ -71,12 +71,14 @@ java -Xmx2g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/ov
|
|||
```
|
||||
|
||||
The overlord configurations should already exist in:
|
||||
|
||||
```
|
||||
config/overlord/runtime.properties
|
||||
```
|
||||
|
||||
The configurations for the overlord node are as follows:
|
||||
```
|
||||
|
||||
```bash
|
||||
druid.host=localhost
|
||||
druid.port=8087
|
||||
druid.service=overlord
|
||||
|
@ -96,8 +98,9 @@ druid.indexer.fork.property.druid.computation.buffer.size=268435456
|
|||
If you are interested in reading more about these configurations, see [here](Indexing-Service.html).
|
||||
|
||||
When the overlord node is ready for tasks, you should see a message like the following:
|
||||
```
|
||||
013-10-09 21:30:32,817 INFO [Thread-14] io.druid.indexing.overlord.TaskQueue - Waiting for work...
|
||||
|
||||
```bash
|
||||
2013-10-09 21:30:32,817 INFO [Thread-14] io.druid.indexing.overlord.TaskQueue - Waiting for work...
|
||||
```
|
||||
|
||||
#### Starting Other Nodes
|
||||
|
@ -111,6 +114,7 @@ java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/
|
|||
```
|
||||
|
||||
Historical node:
|
||||
|
||||
```bash
|
||||
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/historical io.druid.cli.Main server historical
|
||||
```
|
||||
|
@ -130,7 +134,7 @@ examples/indexing/index_task.json
|
|||
|
||||
Open up the file to see the following:
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"type" : "index",
|
||||
"dataSource" : "wikipedia",
|
||||
|
@ -141,7 +145,7 @@ Open up the file to see the following:
|
|||
},
|
||||
"aggregators" : [{
|
||||
"type" : "count",
|
||||
"name" : "edit_count"
|
||||
"name" : "count"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "added",
|
||||
|
@ -176,21 +180,21 @@ Okay, so what is happening here? The "type" field indicates the type of task we
|
|||
|
||||
Let's send our task to the indexing service now:
|
||||
|
||||
```
|
||||
```bash
|
||||
curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedia_index_task.json localhost:8087/druid/indexer/v1/task
|
||||
```
|
||||
|
||||
Issuing the request should return a task ID like so:
|
||||
|
||||
```
|
||||
fjy$ curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedia_index_task.json localhost:8087/druid/indexer/v1/task
|
||||
```bash
|
||||
$ curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedia_index_task.json localhost:8087/druid/indexer/v1/task
|
||||
{"task":"index_wikipedia_2013-10-09T21:30:32.802Z"}
|
||||
fjy$
|
||||
$
|
||||
```
|
||||
|
||||
In your indexing service logs, you should see the following:
|
||||
|
||||
````
|
||||
```bash
|
||||
2013-10-09 21:41:41,150 INFO [qtp300448720-21] io.druid.indexing.overlord.HeapMemoryTaskStorage - Inserting task index_wikipedia_2013-10-09T21:41:41.147Z with status: TaskStatus{id=index_wikipedia_2013-10-09T21:41:41.147Z, status=RUNNING, duration=-1}
|
||||
2013-10-09 21:41:41,151 INFO [qtp300448720-21] io.druid.indexing.overlord.TaskLockbox - Created new TaskLockPosse: TaskLockPosse{taskLock=TaskLock{groupId=index_wikipedia_2013-10-09T21:41:41.147Z, dataSource=wikipedia, interval=2013-08-31T00:00:00.000Z/2013-09-01T00:00:00.000Z, version=2013-10-09T21:41:41.151Z}, taskIds=[]}
|
||||
...
|
||||
|
@ -201,7 +205,7 @@ In your indexing service logs, you should see the following:
|
|||
|
||||
After a few seconds, the task should complete and you should see in the indexing service logs:
|
||||
|
||||
```
|
||||
```bash
|
||||
2013-10-09 21:41:45,765 INFO [pool-6-thread-1] io.druid.indexing.overlord.exec.TaskConsumer - Received SUCCESS status for task: IndexGeneratorTask{id=index_wikipedia_2013-10-09T21:41:41.147Z_generator_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_0, type=index_generator, dataSource=wikipedia, interval=Optional.of(2013-08-31T00:00:00.000Z/2013-09-01T00:00:00.000Z)}
|
||||
```
|
||||
|
||||
|
@ -209,7 +213,7 @@ Congratulations! The segment has completed building. Once a segment is built, a
|
|||
|
||||
You should see the following logs on the coordinator:
|
||||
|
||||
```
|
||||
```bash
|
||||
2013-10-09 21:41:54,368 INFO [Coordinator-Exec--0] io.druid.server.coordinator.DruidCoordinatorLogger - [_default_tier] : Assigned 1 segments among 1 servers
|
||||
2013-10-09 21:41:54,369 INFO [Coordinator-Exec--0] io.druid.server.coordinator.DruidCoordinatorLogger - Load Queues:
|
||||
2013-10-09 21:41:54,369 INFO [Coordinator-Exec--0] io.druid.server.coordinator.DruidCoordinatorLogger - Server[localhost:8081, historical, _default_tier] has 1 left to load, 0 left to drop, 4,477 bytes queued, 4,477 bytes served.
|
||||
|
@ -217,7 +221,7 @@ You should see the following logs on the coordinator:
|
|||
|
||||
These logs indicate that the coordinator has assigned our new segment to the historical node to download and serve. If you look at the historical node logs, you should see:
|
||||
|
||||
```
|
||||
```bash
|
||||
2013-10-09 21:41:54,369 INFO [ZkCoordinator-0] io.druid.server.coordination.ZkCoordinator - Loading segment wikipedia_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_2013-10-09T21:41:41.151Z
|
||||
2013-10-09 21:41:54,369 INFO [ZkCoordinator-0] io.druid.segment.loading.LocalDataSegmentPuller - Unzipping local file[/tmp/druid/localStorage/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0/index.zip] to [/tmp/druid/indexCache/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0]
|
||||
2013-10-09 21:41:54,370 INFO [ZkCoordinator-0] io.druid.utils.CompressionUtils - Unzipping file[/tmp/druid/localStorage/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0/index.zip] to [/tmp/druid/indexCache/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0]
|
||||
|
@ -228,7 +232,7 @@ Once the segment is announced the segment is queryable. Now you should be able t
|
|||
|
||||
Issuing a [TimeBoundaryQuery](TimeBoundaryQuery.html) should yield:
|
||||
|
||||
```
|
||||
```json
|
||||
[ {
|
||||
"timestamp" : "2013-08-31T01:02:33.000Z",
|
||||
"result" : {
|
||||
|
@ -241,9 +245,9 @@ Issuing a [TimeBoundaryQuery](TimeBoundaryQuery.html) should yield:
|
|||
Next Steps
|
||||
----------
|
||||
|
||||
This tutorial covered ingesting a small batch data set and loading it into Druid. In [Loading Your Data Part 2](Tutorial-Loading-Your-Data-Part-2.html), we will cover how to ingest data using Hadoop for larger data sets.
|
||||
This tutorial covered ingesting a small batch data set and loading it into Druid. In [Loading Your Data Part 2](Tutorial%3A-Loading-Your-Data-Part-2.html), we will cover how to ingest data using Hadoop for larger data sets.
|
||||
|
||||
Additional Information
|
||||
----------------------
|
||||
|
||||
Getting data into Druid can definitely be difficult for first time users. Please don't hesitate to ask questions in our IRC channel or on our [google groups page](http://www.groups.google.com/forum/#!forum/druid-development).
|
||||
Getting data into Druid can definitely be difficult for first time users. Please don't hesitate to ask questions in our IRC channel or on our [google groups page](https://groups.google.com/forum/#!forum/druid-development).
|
||||
|
|
|
@ -1,26 +1,48 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
Once you have a real-time node working, it is time to load your own data to see how Druid performs.
|
||||
In this tutorial we will cover more advanced/real-world ingestion topics.
|
||||
|
||||
Druid can ingest data in three ways: via Kafka and a realtime node, via the indexing service, and via the Hadoop batch loader. Data is ingested in real-time using a [Firehose](Firehose.html).
|
||||
Druid can ingest streaming or batch data. Streaming data is ingested via the real-time node, and batch data is ingested via the Hadoop batch indexer. Druid also has a standalone ingestion service called the [indexing service](Indexing-Service.html).
|
||||
|
||||
## Create Config Directories ##
|
||||
Each type of node needs its own config file and directory, so create them as subdirectories under the druid directory if they not already exist.
|
||||
The Data
|
||||
--------
|
||||
The data source we'll be using is (surprise!) Wikipedia edits. The data schema is still:
|
||||
|
||||
```bash
|
||||
mkdir config
|
||||
mkdir config/realtime
|
||||
mkdir config/coordinator
|
||||
mkdir config/historical
|
||||
mkdir config/broker
|
||||
Dimensions (things to filter on):
|
||||
|
||||
```json
|
||||
"page"
|
||||
"language"
|
||||
"user"
|
||||
"unpatrolled"
|
||||
"newPage"
|
||||
"robot"
|
||||
"anonymous"
|
||||
"namespace"
|
||||
"continent"
|
||||
"country"
|
||||
"region"
|
||||
"city"
|
||||
```
|
||||
|
||||
## Loading Data with Kafka ##
|
||||
Metrics (things to aggregate over):
|
||||
|
||||
[KafkaFirehoseFactory](https://github.com/metamx/druid/blob/druid-0.6.0/realtime/src/main/java/com/metamx/druid/realtime/firehose/KafkaFirehoseFactory.java) is how druid communicates with Kafka. Using this [Firehose](Firehose.html) with the right configuration, we can import data into Druid in realtime without writing any code. To load data to a realtime node via Kafka, we'll first need to initialize Zookeeper and Kafka, and then configure and initialize a [Realtime](Realtime.html) node.
|
||||
```json
|
||||
"count"
|
||||
"added"
|
||||
"delta"
|
||||
"deleted"
|
||||
```
|
||||
|
||||
### Booting Kafka ###
|
||||
Streaming Event Ingestion
|
||||
-------------------------
|
||||
|
||||
With real-world data, we recommend having a message bus such as [Apache Kafka](http://kafka.apache.org/) sit between the data stream and the real-time node. The message bus provides higher availability for production environments. [Firehoses](Firehose.html) are the key abstraction for real-time ingestion.
|
||||
|
||||
#### Setting up Kafka
|
||||
|
||||
[KafkaFirehoseFactory](https://github.com/metamx/druid/blob/druid-0.6.0/realtime/src/main/java/com/metamx/druid/realtime/firehose/KafkaFirehoseFactory.java) is how Druid communicates with Kafka. Using this [Firehose](Firehose.html) with the right configuration, we can import data into Druid in real-time without writing any code. To load data to a real-time node via Kafka, we'll first need to initialize Zookeeper and Kafka, and then configure and initialize a [Realtime](Realtime.html) node.
|
||||
|
||||
Instructions for booting a Zookeeper and then Kafka cluster are available [here](http://kafka.apache.org/07/quickstart.html).
|
||||
|
||||
|
@ -44,6 +66,7 @@ Instructions for booting a Zookeeper and then Kafka cluster are available [here]
|
|||
```bash
|
||||
cat config/zookeeper.properties
|
||||
bin/zookeeper-server-start.sh config/zookeeper.properties
|
||||
|
||||
# in a new console
|
||||
bin/kafka-server-start.sh config/server.properties
|
||||
```
|
||||
|
@ -51,56 +74,55 @@ Instructions for booting a Zookeeper and then Kafka cluster are available [here]
|
|||
4. Launch the console producer (so you can type in JSON kafka messages in a bit)
|
||||
|
||||
```bash
|
||||
bin/kafka-console-producer.sh --zookeeper localhost:2181 --topic druidtest
|
||||
bin/kafka-console-producer.sh --zookeeper localhost:2181 --topic wikipedia
|
||||
```
|
||||
|
||||
### Launching a Realtime Node
|
||||
When things are ready, you should see log messages such as:
|
||||
|
||||
1. Create a valid configuration file similar to this called config/realtime/runtime.properties:
|
||||
|
||||
```properties
|
||||
druid.host=localhost
|
||||
druid.service=example
|
||||
druid.port=8080
|
||||
|
||||
druid.zk.service.host=localhost
|
||||
|
||||
druid.s3.accessKey=AKIAIMKECRUYKDQGR6YQ
|
||||
druid.s3.secretKey=QyyfVZ7llSiRg6Qcrql1eEUG7buFpAK6T6engr1b
|
||||
|
||||
druid.db.connector.connectURI=jdbc\:mysql\://localhost\:3306/druid
|
||||
druid.db.connector.user=druid
|
||||
druid.db.connector.password=diurd
|
||||
|
||||
druid.realtime.specFile=config/realtime/realtime.spec
|
||||
|
||||
druid.processing.buffer.sizeBytes=10000000
|
||||
|
||||
druid.processing.numThreads=3
|
||||
```
|
||||
[2013-10-09 22:03:07,802] INFO zookeeper state changed (SyncConnected) (org.I0Itec.zkclient.ZkClient)
|
||||
```
|
||||
|
||||
2. Create a valid realtime configuration file similar to this called realtime.spec:
|
||||
#### Launch a Realtime Node
|
||||
|
||||
You should be comfortable starting Druid nodes at this point. If not, it may be worthwhile to revisit the first few tutorials.
|
||||
|
||||
1. Real-time nodes can be started with:
|
||||
|
||||
```bash
|
||||
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -Ddruid.realtime.specFile=examples/indexing/wikipedia.spec -classpath lib/*:config/realtime io.druid.cli.Main server realtime
|
||||
```
|
||||
|
||||
2. A realtime.spec should already exist for the data source in the Druid tarball. You should be able to find it at:
|
||||
|
||||
```bash
|
||||
examples/indexing/wikipedia.spec
|
||||
```
|
||||
|
||||
The contents of the file should match:
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"schema": {
|
||||
"dataSource": "druidtest",
|
||||
"aggregators": [
|
||||
{
|
||||
"type": "count",
|
||||
"name": "impressions"
|
||||
},
|
||||
{
|
||||
"type": "doubleSum",
|
||||
"name": "wp",
|
||||
"fieldName": "wp"
|
||||
}
|
||||
],
|
||||
"indexGranularity": "minute",
|
||||
"shardSpec": {
|
||||
"type": "none"
|
||||
}
|
||||
"dataSource": "wikipedia",
|
||||
"aggregators" : [{
|
||||
"type" : "count",
|
||||
"name" : "count"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "added",
|
||||
"fieldName" : "added"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "deleted",
|
||||
"fieldName" : "deleted"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "delta",
|
||||
"fieldName" : "delta"
|
||||
}],
|
||||
"indexGranularity": "none"
|
||||
},
|
||||
"config": {
|
||||
"maxRowsInMemory": 500000,
|
||||
|
@ -113,23 +135,20 @@ Instructions for booting a Zookeeper and then Kafka cluster are available [here]
|
|||
"zk.connectiontimeout.ms": "15000",
|
||||
"zk.sessiontimeout.ms": "15000",
|
||||
"zk.synctime.ms": "5000",
|
||||
"groupid": "topic-pixel-local",
|
||||
"groupid": "druid-example",
|
||||
"fetch.size": "1048586",
|
||||
"autooffset.reset": "largest",
|
||||
"autocommit.enable": "false"
|
||||
},
|
||||
"feed": "druidtest",
|
||||
"feed": "wikipedia",
|
||||
"parser": {
|
||||
"timestampSpec": {
|
||||
"column": "utcdt",
|
||||
"format": "iso"
|
||||
"column": "timestamp"
|
||||
},
|
||||
"data": {
|
||||
"format": "json"
|
||||
},
|
||||
"dimensionExclusions": [
|
||||
"wp"
|
||||
]
|
||||
"format": "json",
|
||||
"dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"]
|
||||
}
|
||||
}
|
||||
},
|
||||
"plumber": {
|
||||
|
@ -138,256 +157,163 @@ Instructions for booting a Zookeeper and then Kafka cluster are available [here]
|
|||
"segmentGranularity": "hour",
|
||||
"basePersistDirectory": "\/tmp\/realtime\/basePersist",
|
||||
"rejectionPolicy": {
|
||||
"type": "messageTime"
|
||||
"type": "none"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
```

3. Launch the realtime node

```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 \
  -Ddruid.realtime.specFile=config/realtime/realtime.spec \
  -classpath lib/*:config/realtime io.druid.cli.Main server realtime
```

4. Paste data into the Kafka console producer
3. Let's copy and paste some data into the Kafka console producer

```json
|
||||
{"utcdt": "2010-01-01T01:01:01", "wp": 1000, "gender": "male", "age": 100}
|
||||
{"utcdt": "2010-01-01T01:01:02", "wp": 2000, "gender": "female", "age": 50}
|
||||
{"utcdt": "2010-01-01T01:01:03", "wp": 3000, "gender": "male", "age": 20}
|
||||
{"utcdt": "2010-01-01T01:01:04", "wp": 4000, "gender": "female", "age": 30}
|
||||
{"utcdt": "2010-01-01T01:01:05", "wp": 5000, "gender": "male", "age": 40}
|
||||
{"timestamp": "2013-08-31T01:02:33Z", "page": "Gypsy Danger", "language" : "en", "user" : "nuclear", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143}
|
||||
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Cantebury", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
|
||||
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Oblast", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
|
||||
{"timestamp": "2013-08-31T11:58:39Z", "page": "Crimson Typhoon", "language" : "zh", "user" : "triplets", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"China", "region":"Shanxi", "city":"Taiyuan", "added": 905, "deleted": 5, "delta": 900}
|
||||
{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "stringer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9}
|
||||
```

5. Watch the events as they are ingested by Druid's realtime node
Disclaimer: We recognize the timestamps of these events aren't actually recent.

5. Watch the events as they are ingested by Druid's real-time node:

```bash
|
||||
...
|
||||
2013-06-17 21:41:55,569 INFO [Global--0] com.metamx.emitter.core.LoggingEmitter - Event [{"feed":"metrics","timestamp":"2013-06-17T21:41:55.569Z","service":"example","host":"127.0.0.1","metric":"events/processed","value":5,"user2":"druidtest"}]
|
||||
2013-10-10 05:13:18,976 INFO [chief-wikipedia] io.druid.server.coordination.BatchDataSegmentAnnouncer - Announcing segment[wikipedia_2013-08-31T01:00:00.000Z_2013-08-31T02:00:00.000Z_2013-08-31T01:00:00.000Z] at path[/druid/segments/localhost:8083/2013-10-10T05:13:18.972Z0]
|
||||
2013-10-10 05:13:18,992 INFO [chief-wikipedia] io.druid.server.coordination.BatchDataSegmentAnnouncer - Announcing segment[wikipedia_2013-08-31T03:00:00.000Z_2013-08-31T04:00:00.000Z_2013-08-31T03:00:00.000Z] at path[/druid/segments/localhost:8083/2013-10-10T05:13:18.972Z0]
|
||||
2013-10-10 05:13:18,997 INFO [chief-wikipedia] io.druid.server.coordination.BatchDataSegmentAnnouncer - Announcing segment[wikipedia_2013-08-31T07:00:00.000Z_2013-08-31T08:00:00.000Z_2013-08-31T07:00:00.000Z] at path[/druid/segments/localhost:8083/2013-10-10T05:13:18.972Z0]
|
||||
2013-10-10 05:13:19,003 INFO [chief-wikipedia] io.druid.server.coordination.BatchDataSegmentAnnouncer - Announcing segment[wikipedia_2013-08-31T11:00:00.000Z_2013-08-31T12:00:00.000Z_2013-08-31T11:00:00.000Z] at path[/druid/segments/localhost:8083/2013-10-10T05:13:18.972Z0]
|
||||
2013-10-10 05:13:19,008 INFO [chief-wikipedia] io.druid.server.coordination.BatchDataSegmentAnnouncer - Announcing segment[wikipedia_2013-08-31T12:00:00.000Z_2013-08-31T13:00:00.000Z_2013-08-31T12:00:00.000Z] at path[/druid/segments/localhost:8083/2013-10-10T05:13:18.972Z0]
|
||||
...
|
||||
```

6. In a new console, edit a file called query.body:

```json
{
  "queryType": "groupBy",
  "dataSource": "druidtest",
  "granularity": "all",
  "dimensions": [],
  "aggregations": [
    { "type": "count", "name": "rows" },
    { "type": "longSum", "name": "imps", "fieldName": "impressions" },
    { "type": "doubleSum", "name": "wp", "fieldName": "wp" }
  ],
  "intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```

7. Submit the query via curl:

```bash
curl -X POST "http://localhost:8080/druid/v2/?pretty" \
  -H 'content-type: application/json' -d @query.body
```

8. View the result:

```json
[ {
  "timestamp" : "2010-01-01T01:01:00.000Z",
  "result" : {
    "imps" : 20,
    "wp" : 60000.0,
    "rows" : 5
  }
} ]
```

Now you're ready for [Querying Your Data](Querying-Your-Data.html)!

## Loading Data with the HadoopDruidIndexer ##

Historical data can be loaded via a Hadoop job.

The setup for a single node, 'standalone' Hadoop cluster is available at [http://hadoop.apache.org/docs/stable/single_node_setup.html](http://hadoop.apache.org/docs/stable/single_node_setup.html).

### Setup MySQL ###

1. If you don't already have it, download MySQL Community Server here: [http://dev.mysql.com/downloads/mysql/](http://dev.mysql.com/downloads/mysql/)
2. Install MySQL
3. Create a druid user and database

```bash
mysql -u root
```

```sql
GRANT ALL ON druid.* TO 'druid'@'localhost' IDENTIFIED BY 'diurd';
CREATE database druid;
```
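
To sanity-check the grant, you can log in as the new user; a minimal sketch, assuming MySQL is running locally and the credentials above were used:

```bash
# Should list the 'druid' database if the user and grant were created correctly.
mysql -u druid -pdiurd -e "SHOW DATABASES;"
```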

The [Coordinator](Coordinator.html) node will create the tables it needs based on its configuration.

### Make sure you have ZooKeeper Running ###

Make sure that you have a ZooKeeper instance running. If you followed the instructions for Kafka, it is probably running already. If you are unsure whether you have ZooKeeper running, try running:

```bash
ps auxww | grep zoo | grep -v grep
```

If you get any result back, then ZooKeeper is most likely running. If you haven't set up Kafka or do not have ZooKeeper running, then you can download it and start it up with:

```bash
curl http://www.motorlogy.com/apache/zookeeper/zookeeper-3.4.5/zookeeper-3.4.5.tar.gz -o zookeeper-3.4.5.tar.gz
tar xzf zookeeper-3.4.5.tar.gz
cd zookeeper-3.4.5
cp conf/zoo_sample.cfg conf/zoo.cfg
./bin/zkServer.sh start
cd ..
```
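
Once ZooKeeper is up, you can optionally confirm it is healthy with the standard `ruok` four-letter command; a minimal sketch, assuming `nc` is installed and ZooKeeper is on the default port:

```bash
# A healthy ZooKeeper answers "imok".
echo ruok | nc localhost 2181
```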

### Launch a Coordinator Node ###

If you've already set up a realtime node, be aware that although you can run multiple node types on one physical computer, you must assign them unique ports. Having used 8080 for the [Realtime](Realtime.html) node, we use 8081 for the [Coordinator](Coordinator.html).

1. Set up a configuration file called config/coordinator/runtime.properties similar to:

```properties
druid.host=localhost
druid.service=coordinator
druid.port=8081

druid.zk.service.host=localhost

druid.s3.accessKey=AKIAIMKECRUYKDQGR6YQ
druid.s3.secretKey=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

druid.db.connector.connectURI=jdbc\:mysql\://localhost\:3306/druid
druid.db.connector.user=druid
druid.db.connector.password=diurd

druid.coordinator.startDelay=PT60s
```

2. Launch the [Coordinator](Coordinator.html) node:

```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 \
  -classpath lib/*:config/coordinator \
  io.druid.cli.Main server coordinator
```

### Launch a Historical Node ###

1. Create a configuration file in config/historical/runtime.properties similar to:

```properties
druid.host=localhost
druid.service=historical
druid.port=8082

druid.zk.service.host=localhost

druid.s3.secretKey=QyyfVZ7llSiRg6Qcrql1eEUG7buFpAK6T6engr1b
druid.s3.accessKey=AKIAIMKECRUYKDQGR6YQ

druid.server.maxSize=100000000

druid.processing.buffer.sizeBytes=10000000

druid.segmentCache.infoPath=/tmp/druid/segmentInfoCache
druid.segmentCache.locations=[{"path": "/tmp/druid/indexCache", "maxSize"\: 100000000}]
```

2. Launch the historical node:

```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 \
  -classpath lib/*:config/historical \
  io.druid.cli.Main server historical
```

### Create a File of Records ###

We can use the same records we have been using, in a file called records.json:
Issuing a [TimeBoundaryQuery](TimeBoundaryQuery.html) to the real-time node should yield valid results:

```json
{"utcdt": "2010-01-01T01:01:01", "wp": 1000, "gender": "male", "age": 100}
{"utcdt": "2010-01-01T01:01:02", "wp": 2000, "gender": "female", "age": 50}
{"utcdt": "2010-01-01T01:01:03", "wp": 3000, "gender": "male", "age": 20}
{"utcdt": "2010-01-01T01:01:04", "wp": 4000, "gender": "female", "age": 30}
{"utcdt": "2010-01-01T01:01:05", "wp": 5000, "gender": "male", "age": 40}
[ {
  "timestamp" : "2013-08-31T01:02:33.000Z",
  "result" : {
    "minTime" : "2013-08-31T01:02:33.000Z",
    "maxTime" : "2013-08-31T12:41:27.000Z"
  }
} ]
```
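
For reference, a result like the one above comes from a time-boundary query. A minimal sketch of issuing one with curl, assuming the real-time node is listening on port 8083 as suggested by the announcement log output earlier (adjust the port to match your runtime.properties):

```bash
# Ask the real-time node for the min/max timestamps it currently holds for the wikipedia data source.
curl -X POST "http://localhost:8083/druid/v2/?pretty" \
  -H 'content-type: application/json' \
  -d '{"queryType": "timeBoundary", "dataSource": "wikipedia"}'
```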

### Run the Hadoop Job ###
Batch Ingestion
---------------
Druid is designed for large data volumes, and most real-world data sets require that batch indexing be done through a Hadoop job.

Now it's time to run the Hadoop [Batch-ingestion](Batch-ingestion.html) job, HadoopDruidIndexer, which will fill a [Historical](Historical.html) node with data. First we'll need to configure the job.
The setup for a single node, 'standalone' Hadoop cluster is available [here](http://hadoop.apache.org/docs/stable/single_node_setup.html).

1. Create a config called batchConfig.json similar to:
For the purposes of this tutorial, we are going to use our very small and simple Wikipedia data set. This data can be ingested directly via other means as shown in the previous [tutorial](Tutorial%3A-Loading-Your-Data-Part-1.html), but we are going to use Hadoop here for demonstration purposes.

Our data is located at:

```
examples/indexing/wikipedia_data.json
```

The following events should exist in the file:

```json
|
||||
{"timestamp": "2013-08-31T01:02:33Z", "page": "Gypsy Danger", "language" : "en", "user" : "nuclear", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143}
|
||||
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Cantebury", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
|
||||
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Oblast", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
|
||||
{"timestamp": "2013-08-31T11:58:39Z", "page": "Crimson Typhoon", "language" : "zh", "user" : "triplets", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"China", "region":"Shanxi", "city":"Taiyuan", "added": 905, "deleted": 5, "delta": 900}
|
||||
{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "stringer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9}
|
||||
```

#### Setup a Druid Cluster

To index the data, we are going to need an indexing service, a historical node, and a coordinator node.

To start the Indexing Service:

```bash
java -Xmx2g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:<hadoop_config_path>:config/overlord io.druid.cli.Main server overlord
```

To start the Coordinator Node:

```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/coordinator io.druid.cli.Main server coordinator
```

To start the Historical Node:

```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/historical io.druid.cli.Main server historical
```
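
Before submitting any work, it can be worth checking that the three processes are actually listening. A minimal sketch, where the ports are assumptions taken from the configs used earlier in this document (8081 for the coordinator, 8082 for the historical node) and from the task-submission command shown later (8087 for the overlord); adjust them to whatever your runtime.properties files specify:

```bash
# Each check prints the port if something is listening there.
for port in 8081 8082 8087; do
  nc -z localhost $port && echo "port $port is up"
done
```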

#### Index the Data

Before indexing the data, make sure you have a valid Hadoop cluster running. To build our Druid segment, we are going to submit a [Hadoop index task](Tasks.html) to the indexing service. The grammar for the Hadoop index task is very similar to the index task of the last tutorial. The tutorial Hadoop index task should be located at:

```
examples/indexing/wikipedia_index_hadoop_task.json
```

Examining the contents of the file, you should find:

```json
|
||||
{
|
||||
"dataSource": "druidtest",
|
||||
"timestampColumn": "utcdt",
|
||||
"timestampFormat": "iso",
|
||||
"dataSpec": {
|
||||
"format": "json",
|
||||
"dimensions": [
|
||||
"gender",
|
||||
"age"
|
||||
]
|
||||
},
|
||||
"granularitySpec": {
|
||||
"type": "uniform",
|
||||
"intervals": [
|
||||
"2010-01-01T01\/PT1H"
|
||||
],
|
||||
"gran": "hour"
|
||||
},
|
||||
"pathSpec": {
|
||||
"type": "static",
|
||||
"paths": "\/druid\/records.json"
|
||||
},
|
||||
"rollupSpec": {
|
||||
"aggs": [
|
||||
{
|
||||
"type": "count",
|
||||
"name": "impressions"
|
||||
},
|
||||
{
|
||||
"type": "doubleSum",
|
||||
"name": "wp",
|
||||
"fieldName": "wp"
|
||||
}
|
||||
],
|
||||
"rollupGranularity": "minute"
|
||||
},
|
||||
"workingPath": "\/tmp\/working_path",
|
||||
"segmentOutputPath": "\/tmp\/segments",
|
||||
"partitionsSpec": {
|
||||
"targetPartitionSize": 5000000
|
||||
},
|
||||
"updaterJobSpec": {
|
||||
"type": "db",
|
||||
"connectURI": "jdbc:mysql:\/\/localhost:3306\/druid",
|
||||
"user": "druid",
|
||||
"password": "diurd",
|
||||
"segmentTable": "druid_segments"
|
||||
"type" : "index_hadoop",
|
||||
"config": {
|
||||
"dataSource" : "wikipedia",
|
||||
"timestampColumn" : "timestamp",
|
||||
"timestampFormat" : "auto",
|
||||
"dataSpec" : {
|
||||
"format" : "json",
|
||||
"dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"]
|
||||
},
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"gran" : "DAY",
|
||||
"intervals" : [ "2013-08-31/2013-09-01" ]
|
||||
},
|
||||
"pathSpec" : {
|
||||
"type" : "static",
|
||||
"paths" : "examples/indexing/wikipedia_data.json"
|
||||
},
|
||||
"targetPartitionSize" : 5000000,
|
||||
"rollupSpec" : {
|
||||
"aggs": [{
|
||||
"type" : "count",
|
||||
"name" : "count"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "added",
|
||||
"fieldName" : "added"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "deleted",
|
||||
"fieldName" : "deleted"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "delta",
|
||||
"fieldName" : "delta"
|
||||
}],
|
||||
"rollupGranularity" : "none"
|
||||
}
|
||||
}
|
||||
}
|
||||
```

2. Now run the job, with the config pointing at batchConfig.json:
If you are curious about what all this configuration means, see [here](Tasks.html).
To submit the task:

```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 \
  -classpath `echo lib/* | tr ' ' ':'` \
  io.druid.cli.Main index hadoop batchConfig.json
```

```bash
curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedia_index_hadoop_task.json localhost:8087/druid/indexer/v1/task
```
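
The overlord should respond with a small JSON object containing the id of the task it created. A minimal sketch of capturing that id and polling for completion; the `/status` endpoint shown here is an assumption based on the same `/druid/indexer/v1` prefix used above, so double-check it against your version's indexing service API:

```bash
# Submit the task and keep the returned id, e.g. {"task":"index_hadoop_wikipedia_..."}
TASK_ID=$(curl -s -X 'POST' -H 'Content-Type:application/json' \
  -d @examples/indexing/wikipedia_index_hadoop_task.json \
  localhost:8087/druid/indexer/v1/task | sed 's/.*"task":"\([^"]*\)".*/\1/')

# Poll the (assumed) status endpoint until the task reports success or failure.
curl -s "localhost:8087/druid/indexer/v1/task/$TASK_ID/status"
```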

You can now move on to [Querying Your Data](Querying-Your-Data.html)!
After the task is completed, the segment should be assigned to your historical node. You should be able to query the segment.

Next Steps
----------
For more information on querying, check out this [tutorial](Tutorial%3A-All-About-Queries.html).

Additional Information
----------------------

Getting data into Druid can definitely be difficult for first-time users. Please don't hesitate to ask questions in our IRC channel or on our [google groups page](https://groups.google.com/forum/#!forum/druid-development).

@ -1,13 +1,13 @@
---
layout: doc_page
---
Welcome back! In our first [tutorial](Tutorial:-A-First-Look-at-Druid.html), we introduced you to the most basic Druid setup: a single realtime node. We streamed in some data and queried it. Realtime nodes collect very recent data and periodically hand that data off to the rest of the Druid cluster. Some questions about the architecture must naturally come to mind. What does the rest of Druid cluster look like? How does Druid load available static data?
Welcome back! In our first [tutorial](Tutorial%3A-A-First-Look-at-Druid.html), we introduced you to the most basic Druid setup: a single realtime node. We streamed in some data and queried it. Realtime nodes collect very recent data and periodically hand that data off to the rest of the Druid cluster. Some questions about the architecture must naturally come to mind. What does the rest of Druid cluster look like? How does Druid load available static data?

This tutorial will hopefully answer these questions!

In this tutorial, we will set up other types of Druid nodes as well as external dependencies for a fully functional Druid cluster. The architecture of Druid is very much like the [Megazord](http://www.youtube.com/watch?v=7mQuHh1X4H4) from the popular 90s show Mighty Morphin' Power Rangers. Each Druid node has a specific purpose and the nodes come together to form a fully functional system.

## Downloading Druid ##
## Downloading Druid

If you followed the first tutorial, you should already have Druid downloaded. If not, let's go back and do that first.

@ -20,15 +20,15 @@ tar -zxvf druid-services-*-bin.tar.gz
cd druid-services-*
```

You can also [Build From Source](Build-From-Source.html).
You can also [Build From Source](Build-from-source.html).

## External Dependencies ##
## External Dependencies

Druid requires 3 external dependencies: a "deep" storage that acts as a backup data repository, a relational database such as MySQL to hold configuration and metadata information, and [Apache Zookeeper](http://zookeeper.apache.org/) for coordination among different pieces of the cluster.

For deep storage, we have made a public S3 bucket (static.druid.io) available where data for this particular tutorial can be downloaded. More on the data [later](Tutorial-Part-2.html#the-data).
For deep storage, we have made a public S3 bucket (static.druid.io) available where data for this particular tutorial can be downloaded. More on the data later.

### Setting up MySQL ###
#### Setting up MySQL

1. If you don't already have it, download MySQL Community Server here: [http://dev.mysql.com/downloads/mysql/](http://dev.mysql.com/downloads/mysql/)
2. Install MySQL

@ -43,7 +43,7 @@ GRANT ALL ON druid.* TO 'druid'@'localhost' IDENTIFIED BY 'diurd';
CREATE database druid;
```

### Setting up Zookeeper ###
#### Setting up Zookeeper

```bash
curl http://www.motorlogy.com/apache/zookeeper/zookeeper-3.4.5/zookeeper-3.4.5.tar.gz -o zookeeper-3.4.5.tar.gz

@ -54,9 +54,9 @@ cp conf/zoo_sample.cfg conf/zoo.cfg
cd ..
```

## The Data ##
## The Data

Similar to the first tutorial, the data we will be loading is based on edits that have occurred on Wikipedia. Every time someone edits a page in Wikipedia, metadata is generated about the editor and edited page. Druid collects each individual event and packages them together in a container known as a [segment](https://github.com/metamx/druid/wiki/Segments). Segments contain data over some span of time. We've prebuilt a segment for this tutorial and will cover making your own segments in other [pages](Loading-Your-Data.html). The segment we are going to work with has the following format:
Similar to the first tutorial, the data we will be loading is based on edits that have occurred on Wikipedia. Every time someone edits a page in Wikipedia, metadata is generated about the editor and edited page. Druid collects each individual event and packages them together in a container known as a [segment](Segments.html). Segments contain data over some span of time. We've prebuilt a segment for this tutorial and will cover making your own segments in other [pages](Tutorial%3A-Loading-Your-Data-Part-1.html). The segment we are going to work with has the following format:

Dimensions (things to filter on):

@ -84,28 +84,28 @@ Metrics (things to aggregate over):
"deleted"
```

## The Cluster ##
## The Cluster

Let's start up a few nodes and download our data. First things though, let's create a config directory where we will store configs for our various nodes:
Let's start up a few nodes and download our data. First things though, let's make sure we have a config directory where we will store configs for our various nodes:

```
mkdir config
ls config
```

If you are interested in learning more about Druid configuration files, check out this [link](Configuration.html). Many aspects of Druid are customizable. For the purposes of this tutorial, we are going to use default values for most things.

### Start a Coordinator Node ###
#### Start a Coordinator Node

Coordinator nodes are in charge of load assignment and distribution. Coordinator nodes monitor the status of the cluster and command historical nodes to assign and drop segments.
For more information about coordinator nodes, see [here](Coordinator.html).

To create the coordinator config file:
The coordinator config file should already exist at:

```
mkdir config/coordinator
config/coordinator
```

Under the directory we just created, create the file `runtime.properties` with the following contents if it does not exist:
In the directory, there should be a `runtime.properties` file with the following contents:

```
druid.host=localhost

@ -130,18 +130,18 @@ To start the coordinator node:
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/coordinator io.druid.cli.Main server coordinator
```

### Start a historical node ###
#### Start a Historical Node

Historical nodes are the workhorses of a cluster and are in charge of loading historical segments and making them available for queries. Our Wikipedia segment will be downloaded by a historical node.
For more information about Historical nodes, see [here](Historical.html).

To create the historical config file:
The historical config file should exist at:

```
mkdir config/historical
config/historical
```

Under the directory we just created, create the file `runtime.properties` with the following contents:
In the directory we just created, we should have the file `runtime.properties` with the following contents:

```
druid.host=localhost

@ -167,18 +167,18 @@ To start the historical node:
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/historical io.druid.cli.Main server historical
```

### Start a Broker Node ###
#### Start a Broker Node

Broker nodes are responsible for figuring out which historical and/or realtime nodes correspond to which queries. They also merge partial results from these nodes in a scatter/gather fashion.
For more information about Broker nodes, see [here](Broker.html).

To create the broker config file:
The broker config file should exist at:

```
mkdir config/broker
config/broker
```

Under the directory we just created, create the file `runtime.properties` with the following contents:
In the directory, there should be a `runtime.properties` file with the following contents:

```
druid.host=localhost

@ -194,7 +194,7 @@ To start the broker node:
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/broker io.druid.cli.Main server broker
```

## Loading the Data ##
## Loading the Data

The MySQL dependency we introduced earlier contains a 'segments' table with entries for segments that should be loaded into our cluster. The Druid coordinator compares this table with segments that already exist in the cluster to determine what should be loaded and dropped. To load our wikipedia segment, we need to create an entry in our MySQL segment table.

@ -220,7 +220,8 @@ When the segment completes downloading and ready for queries, you should see the

At this point, we can query the segment. For more information on querying, see this [link](Querying.html).
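
As a quick end-to-end check that the segment is queryable, you can send a simple time-boundary query through the broker; a minimal sketch, where the port is an assumption (use whatever druid.port is set to in config/broker/runtime.properties):

```bash
# Query the wikipedia data source through the broker; adjust the port to your broker config.
curl -X POST "http://localhost:8080/druid/v2/?pretty" \
  -H 'content-type: application/json' \
  -d '{"queryType": "timeBoundary", "dataSource": "wikipedia"}'
```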

## Next Steps ##
Next Steps
----------

Now that you have an understanding of what the Druid clsuter looks like, why not load some of your own data?
Check out the [Loading Your Own Data](Loading-Your-Data.html) section for more info!
Now that you have an understanding of what the Druid cluster looks like, why not load some of your own data?
Check out the next [tutorial](Tutorial%3A-Loading-Your-Data-Part-1.html) section for more info!

@ -80,7 +80,7 @@ Okay, things are about to get real. To query the real-time node you've spun up,
./run_example_client.sh
```

Select "webstream" once again. This script issues [GroupByQuery](GroupByQuery.html)s to the data we've been ingesting. The query looks like this:
Select "webstream" once again. This script issues [GroupByQueries](GroupByQuery.html) to the data we've been ingesting. The query looks like this:

```json
{

@ -304,15 +304,9 @@ You should see an answer to our question. For my stream, it looks like this:

Feel free to tweak other query parameters to answer other questions you may have about the data.

Next Steps
----------

What to know even more information about the Druid Cluster? Check out [Tutorial: The Druid Cluster](Tutorial:-The-Druid-Cluster.html)
Druid is even more fun if you load your own data into it! To learn how to load your data, see [Loading Your Data](Loading-Your-Data.html).

Additional Information
----------------------

This tutorial is merely showcasing a small fraction of what Druid can do. If you are interested in more information about Druid, including setting up a more sophisticated Druid cluster, please read the other links in our wiki.

And thus concludes our journey! Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our [google groups page](http://www.groups.google.com/forum/#!forum/druid-development).
And thus concludes our journey! Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our [google groups page](https://groups.google.com/forum/#!forum/druid-development).

@ -322,6 +322,6 @@ Feel free to tweak other query parameters to answer other questions you may have

h2. Additional Information

This tutorial is merely showcasing a small fraction of what Druid can do. Next, continue on to "Loading Your Data":./Loading-Your-Data.html.
This tutorial is merely showcasing a small fraction of what Druid can do. Next, continue on to "The Druid Cluster":./Tutorial:-The-Druid-Cluster.html.

And thus concludes our journey! Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our "google groups page":http://www.groups.google.com/forum/#!forum/druid-development.

@ -10,18 +10,27 @@

Getting Started
* [Tutorial: A First Look at Druid](Tutorial:-A-First-Look-at-Druid.html)
* [Tutorial: The Druid Cluster](Tutorial:-The-Druid-Cluster.html)
* [Loading Your Data](Loading-Your-Data.html)
* [Querying Your Data](Querying-Your-Data.html)
* [Booting a Production Cluster](Booting-a-Production-Cluster.html)
* [Examples](Examples.html)
* [Cluster Setup](Cluster-Setup.html)
* [Configuration](Configuration.html)
* [Tutorial: Loading Your Data Part 1](Tutorial:-Loading-Your-Data-Part-1.html)
* [Tutorial: Loading Your Data Part 2](Tutorial:-Loading-Your-Data-Part-2.html)
* [Tutorial: All About Queries](Tutorial:-All-About-Queries.html)
--------------------------------------

Evaluate Druid
* [Cluster Setup](Cluster-setup.html)
* [Booting a Production Cluster](Booting-a-production-cluster.html)
--------------------------------------

Configuration
* [Configuration](Configuration.html)
-------------------------------------

Data Ingestion
* [Realtime](Realtime.html)
* [Batch|Batch Ingestion](Batch|Batch-Ingestion.html)
* [Batch Ingestion](Batch-ingestion.html)
* [Indexing Service](Indexing-Service.html)
* [Indexing Service](Indexing-Service.html)
*** ]
*** [Tasks](Tasks.html)
----------------------------

Querying

@ -12,16 +12,22 @@ h1. Contents

h2. Getting Started
* "Tutorial: A First Look at Druid":./Tutorial:-A-First-Look-at-Druid.html
* "Tutorial: The Druid Cluster":./Tutorial:-The-Druid-Cluster.html
* "Loading Your Data":./Loading-Your-Data.html
* "Querying Your Data":./Querying-your-data.html
* "Tutorial: Loading Your Data Part 1":./Tutorial:-Loading-Your-Data-Part-1.html
* "Tutorial: Loading Your Data Part 2":./Tutorial:-Loading-Your-Data-Part-2.html
* "Tutorial: All About Queries":./Tutorial:-All-About-Queries.html

h2. Evaluate Druid
* "Cluster Setup":./Cluster-setup.html
* "Booting a Production Cluster":./Booting-a-production-cluster.html
* "Examples":./Examples.html

h2. Configuration
* "Configuration":Configuration.html

h2. Data Ingestion
* "Realtime":./Realtime.html
* "Batch":./Batch-ingestion.html
* "Indexing Service":./Indexing-Service.html
** "Tasks":./Tasks.html

h2. Querying
* "Querying":./Querying.html

@ -1,22 +1,24 @@
|
|||
[
|
||||
{
|
||||
"schema": {
|
||||
"dataSource": "druidtest",
|
||||
"aggregators": [
|
||||
{
|
||||
"type": "count",
|
||||
"name": "impressions"
|
||||
},
|
||||
{
|
||||
"type": "doubleSum",
|
||||
"name": "wp",
|
||||
"fieldName": "wp"
|
||||
}
|
||||
],
|
||||
"indexGranularity": "minute",
|
||||
"shardSpec": {
|
||||
"type": "none"
|
||||
}
|
||||
"dataSource": "wikipedia",
|
||||
"aggregators" : [{
|
||||
"type" : "count",
|
||||
"name" : "count"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "added",
|
||||
"fieldName" : "added"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "deleted",
|
||||
"fieldName" : "deleted"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "delta",
|
||||
"fieldName" : "delta"
|
||||
}],
|
||||
"indexGranularity": "none"
|
||||
},
|
||||
"config": {
|
||||
"maxRowsInMemory": 500000,
|
||||
|
@ -29,23 +31,20 @@
|
|||
"zk.connectiontimeout.ms": "15000",
|
||||
"zk.sessiontimeout.ms": "15000",
|
||||
"zk.synctime.ms": "5000",
|
||||
"groupid": "topic-pixel-local",
|
||||
"groupid": "druid-example",
|
||||
"fetch.size": "1048586",
|
||||
"autooffset.reset": "largest",
|
||||
"autocommit.enable": "false"
|
||||
},
|
||||
"feed": "druidtest",
|
||||
"feed": "wikipedia",
|
||||
"parser": {
|
||||
"timestampSpec": {
|
||||
"column": "utcdt",
|
||||
"format": "iso"
|
||||
"column": "timestamp"
|
||||
},
|
||||
"data": {
|
||||
"format": "json"
|
||||
},
|
||||
"dimensionExclusions": [
|
||||
"wp"
|
||||
]
|
||||
"format": "json",
|
||||
"dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"]
|
||||
}
|
||||
}
|
||||
},
|
||||
"plumber": {
|
|
@ -1,5 +1,5 @@
|
|||
{"timestamp": "2013-08-31T01:02:33Z", "page": "Gypsy Danger", "language" : "en", "user" : "nuclear", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143}
|
||||
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Dingo Land", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
|
||||
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Vodka Land", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
|
||||
{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Cantebury", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330}
|
||||
{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Oblast", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111}
|
||||
{"timestamp": "2013-08-31T11:58:39Z", "page": "Crimson Typhoon", "language" : "zh", "user" : "triplets", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"China", "region":"Shanxi", "city":"Taiyuan", "added": 905, "deleted": 5, "delta": 900}
|
||||
{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "cancer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9}
|
|
@ -0,0 +1,49 @@
|
|||
{
|
||||
"dataSource": "wikipedia",
|
||||
"timestampColumn": "timestamp",
|
||||
"timestampFormat": "iso",
|
||||
"dataSpec": {
|
||||
"format": "json",
|
||||
"dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"]
|
||||
},
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"gran" : "DAY",
|
||||
"intervals" : [ "2013-08-31/2013-09-01" ]
|
||||
},
|
||||
"pathSpec": {
|
||||
"type": "static",
|
||||
"paths": "examples/indexing/wikipedia_data.json"
|
||||
},
|
||||
"rollupSpec": {
|
||||
"aggs": [{
|
||||
"type" : "count",
|
||||
"name" : "count"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "added",
|
||||
"fieldName" : "added"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "deleted",
|
||||
"fieldName" : "deleted"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "delta",
|
||||
"fieldName" : "delta"
|
||||
}],
|
||||
"rollupGranularity": "none"
|
||||
},
|
||||
"workingPath": "\/tmp\/working_path",
|
||||
"segmentOutputPath": "\/tmp\/segments",
|
||||
"partitionsSpec": {
|
||||
"targetPartitionSize": 5000000
|
||||
},
|
||||
"updaterJobSpec": {
|
||||
"type": "db",
|
||||
"connectURI": "jdbc:mysql:\/\/localhost:3306\/druid",
|
||||
"user": "druid",
|
||||
"password": "diurd",
|
||||
"segmentTable": "druid_segments"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
{
|
||||
"type" : "index_hadoop",
|
||||
"config": {
|
||||
"dataSource" : "wikipedia",
|
||||
"timestampColumn" : "timestamp",
|
||||
"timestampFormat" : "auto",
|
||||
"dataSpec" : {
|
||||
"format" : "json",
|
||||
"dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"]
|
||||
},
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"gran" : "DAY",
|
||||
"intervals" : [ "2013-08-31/2013-09-01" ]
|
||||
},
|
||||
"pathSpec" : {
|
||||
"type" : "static",
|
||||
"paths" : "examples/indexing/wikipedia_data.json"
|
||||
},
|
||||
"targetPartitionSize" : 5000000,
|
||||
"rollupSpec" : {
|
||||
"aggs": [{
|
||||
"type" : "count",
|
||||
"name" : "count"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "added",
|
||||
"fieldName" : "added"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "deleted",
|
||||
"fieldName" : "deleted"
|
||||
}, {
|
||||
"type" : "doubleSum",
|
||||
"name" : "delta",
|
||||
"fieldName" : "delta"
|
||||
}],
|
||||
"rollupGranularity" : "none"
|
||||
}
|
||||
}
|
||||
}
|
|
@ -8,7 +8,7 @@
},
"aggregators" : [{
"type" : "count",
"name" : "edit_count"
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",

@ -12,8 +12,8 @@
},
{
"type":"longSum",
"fieldName":"edit_count",
"name":"count"
"fieldName":"count",
"name":"edit_count"
}
],
"filter":{

@ -11,6 +11,4 @@ druid.db.connector.connectURI=jdbc\:mysql\://localhost\:3306/druid
druid.db.connector.user=druid
druid.db.connector.password=diurd

druid.realtime.specFile=config/realtime/realtime.spec

druid.processing.buffer.sizeBytes=10000000