fix layout

Xavier Léauté 2013-09-26 17:38:11 -07:00
parent a3e16335ef
commit b68c9fee83
1 changed file with 277 additions and 237 deletions


@@ -7,6 +7,7 @@ Druid can ingest data in three ways: via Kafka and a realtime node, via the inde
## Create Config Directories ##
Each type of node needs its own config file and directory, so create them as subdirectories under the druid directory.
```bash
mkdir config
mkdir config/realtime
@@ -24,147 +25,171 @@ mkdir config/broker
Instructions for booting a ZooKeeper and then a Kafka cluster are available [here](http://kafka.apache.org/07/quickstart.html).
1. Download Apache Kafka 0.7.2 from [http://kafka.apache.org/downloads.html](http://kafka.apache.org/downloads.html)
```bash
wget http://apache.spinellicreations.com/incubator/kafka/kafka-0.7.2-incubating/kafka-0.7.2-incubating-src.tgz
tar -xvzf kafka-0.7.2-incubating-src.tgz
cd kafka-0.7.2-incubating-src
```
2. Build Kafka
```bash
./sbt update
./sbt package
```
3. Boot Kafka
```bash
cat config/zookeeper.properties
bin/zookeeper-server-start.sh config/zookeeper.properties
# in a new console
bin/kafka-server-start.sh config/server.properties
```
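Both ZooKeeper and the Kafka broker stay attached to their consoles. Before moving on, you can check that the broker is accepting connections (a generic sketch, assuming the default broker port 9092 from config/server.properties and that nc is installed):
```bash
# Exits 0 and prints the message once the Kafka broker is listening
nc -z localhost 9092 && echo "Kafka broker is up"
```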
4. Launch the console producer (so you can type in JSON Kafka messages in a bit)
```bash
bin/kafka-console-producer.sh --zookeeper localhost:2181 --topic druidtest
```
### Launching a Realtime Node ###
1. Create a valid configuration file called config/realtime/runtime.properties, similar to the following:
```
druid.host=0.0.0.0:8080
druid.port=8080

com.metamx.emitter.logging=true

druid.processing.formatString=processing_%s
druid.processing.numThreads=1
druid.processing.buffer.sizeBytes=10000000

#emitting, opaque marker
druid.service=example

druid.request.logging.dir=/tmp/example/log
druid.realtime.specFile=realtime.spec
com.metamx.emitter.logging=true
com.metamx.emitter.logging.level=debug

# below are dummy values when operating a realtime only node
druid.processing.numThreads=3

com.metamx.aws.accessKey=dummy_access_key
com.metamx.aws.secretKey=dummy_secret_key
druid.pusher.s3.bucket=dummy_s3_bucket

druid.zk.service.host=localhost
druid.server.maxSize=300000000000
druid.zk.paths.base=/druid
druid.database.segmentTable=prod_segments
druid.database.user=user
druid.database.password=diurd
druid.database.connectURI=
druid.host=127.0.0.1:8080
```
2. Create a valid realtime configuration file called realtime.spec, similar to the following:
```json
[{
  "schema" : {
    "dataSource" : "druidtest",
    "aggregators" : [
      {"type" : "count", "name" : "impressions"},
      {"type" : "doubleSum", "name" : "wp", "fieldName" : "wp"}
    ],
    "indexGranularity" : "minute",
    "shardSpec" : { "type" : "none" }
  },
  "config" : {
    "maxRowsInMemory" : 500000,
    "intermediatePersistPeriod" : "PT10m"
  },
  "firehose" : {
    "type" : "kafka-0.7.2",
    "consumerProps" : {
      "zk.connect" : "localhost:2181",
      "zk.connectiontimeout.ms" : "15000",
      "zk.sessiontimeout.ms" : "15000",
      "zk.synctime.ms" : "5000",
      "groupid" : "topic-pixel-local",
      "fetch.size" : "1048586",
      "autooffset.reset" : "largest",
      "autocommit.enable" : "false"
    },
    "feed" : "druidtest",
    "parser" : {
      "timestampSpec" : { "column" : "utcdt", "format" : "iso" },
      "data" : { "format" : "json" },
      "dimensionExclusions" : ["wp"]
    }
  },
  "plumber" : {
    "type" : "realtime",
    "windowPeriod" : "PT10m",
    "segmentGranularity" : "hour",
    "basePersistDirectory" : "/tmp/realtime/basePersist",
    "rejectionPolicy" : {"type" : "messageTime"}
  }
}]
```
3. Launch the realtime node
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 \
-Ddruid.realtime.specFile=config/realtime/realtime.spec \
-classpath lib/*:config/realtime com.metamx.druid.realtime.RealtimeMain
```
4. Paste data into the Kafka console producer
```json
{"utcdt": "2010-01-01T01:01:01", "wp": 1000, "gender": "male", "age": 100}
{"utcdt": "2010-01-01T01:01:02", "wp": 2000, "gender": "female", "age": 50}
{"utcdt": "2010-01-01T01:01:03", "wp": 3000, "gender": "male", "age": 20}
{"utcdt": "2010-01-01T01:01:04", "wp": 4000, "gender": "female", "age": 30}
{"utcdt": "2010-01-01T01:01:05", "wp": 5000, "gender": "male", "age": 40}
```
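If you would rather not paste by hand, the console producer also reads newline-delimited messages from stdin, so you can pipe a file instead. A minimal sketch, assuming you saved the five records above to a hypothetical file named events.json:
```bash
# Send every line of events.json as one Kafka message to the druidtest topic
bin/kafka-console-producer.sh --zookeeper localhost:2181 --topic druidtest < events.json
```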
5. Watch the events as they are ingested by Druid's realtime node
```bash
...
2013-06-17 21:41:55,569 INFO [Global--0] com.metamx.emitter.core.LoggingEmitter - Event [{"feed":"metrics","timestamp":"2013-06-17T21:41:55.569Z","service":"example","host":"127.0.0.1","metric":"events/processed","value":5,"user2":"druidtest"}]
...
```
6. In a new console, edit a file called query.body:
```json
{
  "queryType": "groupBy",
  "dataSource": "druidtest",
  "granularity": "all",
  "dimensions": [],
  "aggregations": [
    { "type": "count", "name": "rows" },
    {"type": "longSum", "name": "imps", "fieldName": "impressions"},
    {"type": "doubleSum", "name": "wp", "fieldName": "wp"}
  ],
  "intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
7. Submit the query via curl
```bash
curl -X POST "http://localhost:8080/druid/v2/?pretty" \
-H 'content-type: application/json' -d @query.body
```
8. View Result!
```json
[ {
  "timestamp" : "2010-01-01T01:01:00.000Z",
  "result" : {
    "imps" : 20,
    "wp" : 60000.0,
    "rows" : 5
  }
} ]
```
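As a quick way to see that dimensions were picked up during ingestion, you can group the same interval by one of them. This is a sketch of a variant query, not part of the original walkthrough; it assumes the gender field from the sample events was auto-discovered as a dimension and reuses the aggregators from query.body:
```json
{
  "queryType": "groupBy",
  "dataSource": "druidtest",
  "granularity": "all",
  "dimensions": ["gender"],
  "aggregations": [
    { "type": "count", "name": "rows" },
    {"type": "longSum", "name": "imps", "fieldName": "impressions"},
    {"type": "doubleSum", "name": "wp", "fieldName": "wp"}
  ],
  "intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
Submit it with the same curl command as query.body; you should get one result row per gender value.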
Now you're ready for [Querying Your Data](Querying-Your-Data.html)!
## Loading Data with the HadoopDruidIndexer ##
@@ -177,13 +202,16 @@ The setup for a single node, 'standalone' Hadoop cluster is available at [http:/
1. If you don't already have it, download MySQL Community Server here: [http://dev.mysql.com/downloads/mysql/](http://dev.mysql.com/downloads/mysql/)
2. Install MySQL
3. Create a druid user and database
```bash
mysql -u root
```
```sql
GRANT ALL ON druid.* TO 'druid'@'localhost' IDENTIFIED BY 'diurd';
CREATE database druid;
```
The [Master](Master.html) node will create the tables it needs based on its configuration.
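You can verify the grant immediately, and re-run the same command after the Master has started to see the tables it created. A quick sanity check, assuming the credentials from the SQL above and the table names used later in this tutorial:
```bash
# Once the Master is running, the tables named in its runtime.properties
# (prod_segments, rules, config) should show up here
mysql -u druid -pdiurd druid -e "SHOW TABLES;"
```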
### Make sure you have ZooKeeper Running ###
@@ -206,114 +234,123 @@ cd ..
```
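Before launching any Druid nodes, it is worth confirming that ZooKeeper is actually answering. This is a generic ZooKeeper health check, not a Druid command; it assumes the default client port 2181 and that nc is installed:
```bash
# A healthy ZooKeeper answers the four-letter "ruok" command with "imok"
echo ruok | nc localhost 2181
```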
### Launch a Master Node ###
If you've already set up a realtime node, be aware that although you can run multiple node types on one physical computer, you must assign them unique ports. Having used 8080 for the [Realtime](Realtime.html) node, we use 8081 for the [Master](Master.html).
1. Set up a configuration file called config/master/runtime.properties similar to the following:
```bash
druid.host=0.0.0.0:8081
druid.port=8081

com.metamx.emitter.logging=true

druid.processing.formatString=processing_%s
druid.processing.numThreads=1
druid.processing.buffer.sizeBytes=10000000

#emitting, opaque marker
druid.service=example

druid.master.startDelay=PT60s
druid.request.logging.dir=/tmp/example/log
druid.realtime.specFile=realtime.spec
com.metamx.emitter.logging=true
com.metamx.emitter.logging.level=debug

# below are dummy values when operating a realtime only node
druid.processing.numThreads=3

com.metamx.aws.accessKey=dummy_access_key
com.metamx.aws.secretKey=dummy_secret_key
druid.pusher.s3.bucket=dummy_s3_bucket

druid.zk.service.host=localhost
druid.server.maxSize=300000000000
druid.zk.paths.base=/druid
druid.database.segmentTable=prod_segments
druid.database.user=druid
druid.database.password=diurd
druid.database.connectURI=jdbc:mysql://localhost:3306/druid
druid.zk.paths.discoveryPath=/druid/discoveryPath
druid.database.ruleTable=rules
druid.database.configTable=config

# Path on local FS for storage of segments; dir will be created if needed
druid.paths.indexCache=/tmp/druid/indexCache
# Path on local FS for storage of segment metadata; dir will be created if needed
druid.paths.segmentInfoCache=/tmp/druid/segmentInfoCache
```
2. Launch the [Master](Master.html) node
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 \
-classpath lib/*:config/master \
com.metamx.druid.http.MasterMain
```
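To confirm the Master process is up before moving on, a plain port check is enough (a generic sketch, not a Druid-specific endpoint; it assumes nc is installed):
```bash
nc -z localhost 8081 && echo "Master is listening on 8081"
```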
### Launch a Compute/Historical Node ###
1. Create a configuration file in config/compute/runtime.properties similar to:
```bash
druid.host=0.0.0.0:8082
druid.port=8082

com.metamx.emitter.logging=true

druid.processing.formatString=processing_%s
druid.processing.numThreads=1
druid.processing.buffer.sizeBytes=10000000

#emitting, opaque marker
druid.service=example

druid.request.logging.dir=/tmp/example/log
druid.realtime.specFile=realtime.spec
com.metamx.emitter.logging=true
com.metamx.emitter.logging.level=debug

# below are dummy values when operating a realtime only node
druid.processing.numThreads=3

com.metamx.aws.accessKey=dummy_access_key
com.metamx.aws.secretKey=dummy_secret_key
druid.pusher.s3.bucket=dummy_s3_bucket

druid.zk.service.host=localhost
druid.server.maxSize=300000000000
druid.zk.paths.base=/druid
druid.database.segmentTable=prod_segments
druid.database.user=druid
druid.database.password=diurd
druid.database.connectURI=jdbc:mysql://localhost:3306/druid
druid.zk.paths.discoveryPath=/druid/discoveryPath
druid.database.ruleTable=rules
druid.database.configTable=config

# Path on local FS for storage of segments; dir will be created if needed
druid.paths.indexCache=/tmp/druid/indexCache
# Path on local FS for storage of segment metadata; dir will be created if needed
druid.paths.segmentInfoCache=/tmp/druid/segmentInfoCache

# Setup local storage mode
druid.pusher.local.storageDirectory=/tmp/druid/localStorage
druid.pusher.local=true
```
2. Launch the compute node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 \
-classpath lib/*:config/compute \
com.metamx.druid.http.ComputeMain
```
### Create a File of Records ###
We can use the same records we have been using, in a file called records.json:
```json
{"utcdt": "2010-01-01T01:01:01", "wp": 1000, "gender": "male", "age": 100}
{"utcdt": "2010-01-01T01:01:02", "wp": 2000, "gender": "female", "age": 50}
@@ -327,44 +364,47 @@ We can use the same records we have been using, in a file called records.json:
Now it's time to run the Hadoop [Batch-ingestion](Batch-ingestion.html) job, HadoopDruidIndexer, which will fill a historical [Compute](Compute.html) node with data. First we'll need to configure the job.
1. Create a config called batchConfig.json similar to:
```json
{
  "dataSource": "druidtest",
  "timestampColumn": "utcdt",
  "timestampFormat": "iso",
  "dataSpec": {
    "format": "json",
    "dimensions": ["gender", "age"]
  },
  "granularitySpec": {
    "type": "uniform",
    "intervals": ["2010-01-01T01/PT1H"],
    "gran": "hour"
  },
  "pathSpec": {
    "type": "static",
    "paths": "/Users/rjurney/Software/druid/records.json"
  },
  "rollupSpec": {
    "aggs": [
      {"type": "count", "name": "impressions"},
      {"type": "doubleSum", "name": "wp", "fieldName": "wp"}
    ],
    "rollupGranularity": "minute"
  },
  "workingPath": "/tmp/working_path",
  "segmentOutputPath": "/tmp/segments",
  "leaveIntermediate": "false",
  "partitionsSpec": {
    "targetPartitionSize": 5000000
  },
  "updaterJobSpec": {
    "type": "db",
    "connectURI": "jdbc:mysql://localhost:3306/druid",
    "user": "druid",
    "password": "diurd",
    "segmentTable": "prod_segments"
  }
}
```
2. Now run the job, with the config pointing at batchConfig.json:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 \
-Ddruid.realtime.specFile=realtime.spec \
-classpath lib/* com.metamx.druid.indexer.HadoopDruidIndexerMain batchConfig.json
```
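If the job succeeds, segments and their metadata end up in the locations configured in batchConfig.json. A quick check, using the paths and table name from the config above (adjust them if you changed the config):
```bash
# Segment files land under the configured segmentOutputPath
ls -R /tmp/segments
# The indexer also registers each segment in the MySQL segment table
mysql -u druid -pdiurd druid -e "SELECT * FROM prod_segments;"
```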
You can now move on to [Querying Your Data](Querying-Your-Data.html)!