fix layout

Xavier Léauté 2013-09-26 17:38:11 -07:00
parent a3e16335ef
commit b68c9fee83
1 changed file with 277 additions and 237 deletions


@@ -7,6 +7,7 @@ Druid can ingest data in three ways: via Kafka and a realtime node, via the inde
## Create Config Directories ##
Each type of node needs its own config file and directory, so create them as subdirectories under the druid directory.
```bash
mkdir config
mkdir config/realtime
@@ -24,147 +25,171 @@ mkdir config/broker
Instructions for booting a ZooKeeper and then a Kafka cluster are available [here](http://kafka.apache.org/07/quickstart.html).
1. Download Apache Kafka 0.7.2 from [http://kafka.apache.org/downloads.html](http://kafka.apache.org/downloads.html)
```bash
wget http://apache.spinellicreations.com/incubator/kafka/kafka-0.7.2-incubating/kafka-0.7.2-incubating-src.tgz
tar -xvzf kafka-0.7.2-incubating-src.tgz
cd kafka-0.7.2-incubating-src
```
2. Build Kafka
```bash
./sbt update
./sbt package
```
3. Boot Kafka
```bash
cat config/zookeeper.properties
bin/zookeeper-server-start.sh config/zookeeper.properties
# in a new console
bin/kafka-server-start.sh config/server.properties
```
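Both ZooKeeper and the Kafka broker stay attached to their consoles. Before moving on, you can check that the broker is accepting connections (a generic sketch, assuming the default broker port 9092 from config/server.properties and that nc is installed):
```bash
# Exits 0 and prints the message once the Kafka broker is listening
nc -z localhost 9092 && echo "Kafka broker is up"
```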
4. Launch the console producer (so you can type in JSON Kafka messages in a bit)
```bash
bin/kafka-console-producer.sh --zookeeper localhost:2181 --topic druidtest
```
### Launching a Realtime Node ###
1. Create a valid configuration file called config/realtime/runtime.properties, similar to the following:
```
druid.host=0.0.0.0:8080
druid.port=8080

com.metamx.emitter.logging=true

druid.processing.formatString=processing_%s
druid.processing.numThreads=1
druid.processing.buffer.sizeBytes=10000000

#emitting, opaque marker
druid.service=example

druid.request.logging.dir=/tmp/example/log
druid.realtime.specFile=realtime.spec
com.metamx.emitter.logging=true
com.metamx.emitter.logging.level=debug

# below are dummy values when operating a realtime only node
druid.processing.numThreads=3

com.metamx.aws.accessKey=dummy_access_key
com.metamx.aws.secretKey=dummy_secret_key
druid.pusher.s3.bucket=dummy_s3_bucket

druid.zk.service.host=localhost
druid.server.maxSize=300000000000
druid.zk.paths.base=/druid
druid.database.segmentTable=prod_segments
druid.database.user=user
druid.database.password=diurd
druid.database.connectURI=
druid.host=127.0.0.1:8080
```
2. Create a valid realtime configuration file called realtime.spec, similar to the following:
```json
[{
  "schema" : {
    "dataSource" : "druidtest",
    "aggregators" : [
      {"type" : "count", "name" : "impressions"},
      {"type" : "doubleSum", "name" : "wp", "fieldName" : "wp"}
    ],
    "indexGranularity" : "minute",
    "shardSpec" : { "type" : "none" }
  },
  "config" : {
    "maxRowsInMemory" : 500000,
    "intermediatePersistPeriod" : "PT10m"
  },
  "firehose" : {
    "type" : "kafka-0.7.2",
    "consumerProps" : {
      "zk.connect" : "localhost:2181",
      "zk.connectiontimeout.ms" : "15000",
      "zk.sessiontimeout.ms" : "15000",
      "zk.synctime.ms" : "5000",
      "groupid" : "topic-pixel-local",
      "fetch.size" : "1048586",
      "autooffset.reset" : "largest",
      "autocommit.enable" : "false"
    },
    "feed" : "druidtest",
    "parser" : {
      "timestampSpec" : { "column" : "utcdt", "format" : "iso" },
      "data" : { "format" : "json" },
      "dimensionExclusions" : ["wp"]
    }
  },
  "plumber" : {
    "type" : "realtime",
    "windowPeriod" : "PT10m",
    "segmentGranularity" : "hour",
    "basePersistDirectory" : "/tmp/realtime/basePersist",
    "rejectionPolicy" : {"type" : "messageTime"}
  }
}]
```
3. Launch the realtime node
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 \
-Ddruid.realtime.specFile=config/realtime/realtime.spec \
-classpath lib/*:config/realtime com.metamx.druid.realtime.RealtimeMain
```
4. Paste data into the Kafka console producer
```json
{"utcdt": "2010-01-01T01:01:01", "wp": 1000, "gender": "male", "age": 100}
{"utcdt": "2010-01-01T01:01:02", "wp": 2000, "gender": "female", "age": 50}
{"utcdt": "2010-01-01T01:01:03", "wp": 3000, "gender": "male", "age": 20}
{"utcdt": "2010-01-01T01:01:04", "wp": 4000, "gender": "female", "age": 30}
{"utcdt": "2010-01-01T01:01:05", "wp": 5000, "gender": "male", "age": 40}
```
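If you would rather not paste by hand, the console producer also reads newline-delimited messages from stdin, so you can pipe a file instead. A minimal sketch, assuming you saved the five records above to a hypothetical file named events.json:
```bash
# Send every line of events.json as one Kafka message to the druidtest topic
bin/kafka-console-producer.sh --zookeeper localhost:2181 --topic druidtest < events.json
```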
5. Watch the events as they are ingested by Druid's realtime node
```bash
...
2013-06-17 21:41:55,569 INFO [Global--0] com.metamx.emitter.core.LoggingEmitter - Event [{"feed":"metrics","timestamp":"2013-06-17T21:41:55.569Z","service":"example","host":"127.0.0.1","metric":"events/processed","value":5,"user2":"druidtest"}]
...
```
6. In a new console, edit a file called query.body:
```json
{
  "queryType": "groupBy",
  "dataSource": "druidtest",
  "granularity": "all",
  "dimensions": [],
  "aggregations": [
    { "type": "count", "name": "rows" },
    {"type": "longSum", "name": "imps", "fieldName": "impressions"},
    {"type": "doubleSum", "name": "wp", "fieldName": "wp"}
  ],
  "intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
7. Submit the query via curl
```bash
curl -X POST "http://localhost:8080/druid/v2/?pretty" \
-H 'content-type: application/json' -d @query.body
```
8. View Result!
```json
[ {
  "timestamp" : "2010-01-01T01:01:00.000Z",
  "result" : {
    "imps" : 20,
    "wp" : 60000.0,
    "rows" : 5
  }
} ]
```
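As a quick way to see that dimensions were picked up during ingestion, you can group the same interval by one of them. This is a sketch of a variant query, not part of the original walkthrough; it assumes the gender field from the sample events was auto-discovered as a dimension and reuses the aggregators from query.body:
```json
{
  "queryType": "groupBy",
  "dataSource": "druidtest",
  "granularity": "all",
  "dimensions": ["gender"],
  "aggregations": [
    { "type": "count", "name": "rows" },
    {"type": "longSum", "name": "imps", "fieldName": "impressions"},
    {"type": "doubleSum", "name": "wp", "fieldName": "wp"}
  ],
  "intervals": ["2010-01-01T00:00/2020-01-01T00"]
}
```
Submit it with the same curl command as query.body; you should get one result row per gender value.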
Now you're ready for [Querying Your Data](Querying-Your-Data.html)!
## Loading Data with the HadoopDruidIndexer ##
@@ -177,13 +202,16 @@ The setup for a single node, 'standalone' Hadoop cluster is available at [http:/
1. If you don't already have it, download MySQL Community Server here: [http://dev.mysql.com/downloads/mysql/](http://dev.mysql.com/downloads/mysql/)
2. Install MySQL
3. Create a druid user and database
```bash
mysql -u root
```
```sql
GRANT ALL ON druid.* TO 'druid'@'localhost' IDENTIFIED BY 'diurd';
CREATE database druid;
```
The [Master](Master.html) node will create the tables it needs based on its configuration.
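You can verify the grant immediately, and re-run the same command after the Master has started to see the tables it created. A quick sanity check, assuming the credentials from the SQL above and the table names used later in this tutorial:
```bash
# Once the Master is running, the tables named in its runtime.properties
# (prod_segments, rules, config) should show up here
mysql -u druid -pdiurd druid -e "SHOW TABLES;"
```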
### Make sure you have ZooKeeper Running ###
@@ -206,114 +234,123 @@ cd ..
```
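Before launching any Druid nodes, it is worth confirming that ZooKeeper is actually answering. This is a generic ZooKeeper health check, not a Druid command; it assumes the default client port 2181 and that nc is installed:
```bash
# A healthy ZooKeeper answers the four-letter "ruok" command with "imok"
echo ruok | nc localhost 2181
```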
### Launch a Master Node ###
If you've already set up a realtime node, be aware that although you can run multiple node types on one physical computer, you must assign them unique ports. Having used 8080 for the [Realtime](Realtime.html) node, we use 8081 for the [Master](Master.html).
1. Set up a configuration file called config/master/runtime.properties similar to the following:
```bash
druid.host=0.0.0.0:8081
druid.port=8081

com.metamx.emitter.logging=true

druid.processing.formatString=processing_%s
druid.processing.numThreads=1
druid.processing.buffer.sizeBytes=10000000

#emitting, opaque marker
druid.service=example

druid.master.startDelay=PT60s
druid.request.logging.dir=/tmp/example/log
druid.realtime.specFile=realtime.spec
com.metamx.emitter.logging=true
com.metamx.emitter.logging.level=debug

# below are dummy values when operating a realtime only node
druid.processing.numThreads=3

com.metamx.aws.accessKey=dummy_access_key
com.metamx.aws.secretKey=dummy_secret_key
druid.pusher.s3.bucket=dummy_s3_bucket

druid.zk.service.host=localhost
druid.server.maxSize=300000000000
druid.zk.paths.base=/druid
druid.database.segmentTable=prod_segments
druid.database.user=druid
druid.database.password=diurd
druid.database.connectURI=jdbc:mysql://localhost:3306/druid
druid.zk.paths.discoveryPath=/druid/discoveryPath
druid.database.ruleTable=rules
druid.database.configTable=config

# Path on local FS for storage of segments; dir will be created if needed
druid.paths.indexCache=/tmp/druid/indexCache
# Path on local FS for storage of segment metadata; dir will be created if needed
druid.paths.segmentInfoCache=/tmp/druid/segmentInfoCache
```
2. Launch the [Master](Master.html) node
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 \
-classpath lib/*:config/master \
com.metamx.druid.http.MasterMain
```
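To confirm the Master process is up before moving on, a plain port check is enough (a generic sketch, not a Druid-specific endpoint; it assumes nc is installed):
```bash
nc -z localhost 8081 && echo "Master is listening on 8081"
```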
### Launch a Compute/Historical Node ###
1. Create a configuration file in config/compute/runtime.properties similar to:
```bash
druid.host=0.0.0.0:8082
druid.port=8082

com.metamx.emitter.logging=true

druid.processing.formatString=processing_%s
druid.processing.numThreads=1
druid.processing.buffer.sizeBytes=10000000

#emitting, opaque marker
druid.service=example

druid.request.logging.dir=/tmp/example/log
druid.realtime.specFile=realtime.spec
com.metamx.emitter.logging=true
com.metamx.emitter.logging.level=debug

# below are dummy values when operating a realtime only node
druid.processing.numThreads=3

com.metamx.aws.accessKey=dummy_access_key
com.metamx.aws.secretKey=dummy_secret_key
druid.pusher.s3.bucket=dummy_s3_bucket

druid.zk.service.host=localhost
druid.server.maxSize=300000000000
druid.zk.paths.base=/druid
druid.database.segmentTable=prod_segments
druid.database.user=druid
druid.database.password=diurd
druid.database.connectURI=jdbc:mysql://localhost:3306/druid
druid.zk.paths.discoveryPath=/druid/discoveryPath
druid.database.ruleTable=rules
druid.database.configTable=config

# Path on local FS for storage of segments; dir will be created if needed
druid.paths.indexCache=/tmp/druid/indexCache
# Path on local FS for storage of segment metadata; dir will be created if needed
druid.paths.segmentInfoCache=/tmp/druid/segmentInfoCache

# Setup local storage mode
druid.pusher.local.storageDirectory=/tmp/druid/localStorage
druid.pusher.local=true
```
2. Launch the compute node:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 \
-classpath lib/*:config/compute \
com.metamx.druid.http.ComputeMain
```
### Create a File of Records ###
We can use the same records we have been using, in a file called records.json:
```json
{"utcdt": "2010-01-01T01:01:01", "wp": 1000, "gender": "male", "age": 100}
{"utcdt": "2010-01-01T01:01:02", "wp": 2000, "gender": "female", "age": 50}
@@ -327,44 +364,47 @@ We can use the same records we have been using, in a file called records.json:
Now it's time to run the Hadoop [Batch-ingestion](Batch-ingestion.html) job, HadoopDruidIndexer, which will fill a historical [Compute](Compute.html) node with data. First we'll need to configure the job.
1. Create a config called batchConfig.json similar to:
```json
{
  "dataSource": "druidtest",
  "timestampColumn": "utcdt",
  "timestampFormat": "iso",
  "dataSpec": {
    "format": "json",
    "dimensions": ["gender", "age"]
  },
  "granularitySpec": {
    "type": "uniform",
    "intervals": ["2010-01-01T01/PT1H"],
    "gran": "hour"
  },
  "pathSpec": {
    "type": "static",
    "paths": "/Users/rjurney/Software/druid/records.json"
  },
  "rollupSpec": {
    "aggs": [
      {"type": "count", "name": "impressions"},
      {"type": "doubleSum", "name": "wp", "fieldName": "wp"}
    ],
    "rollupGranularity": "minute"
  },
  "workingPath": "/tmp/working_path",
  "segmentOutputPath": "/tmp/segments",
  "leaveIntermediate": "false",
  "partitionsSpec": {
    "targetPartitionSize": 5000000
  },
  "updaterJobSpec": {
    "type": "db",
    "connectURI": "jdbc:mysql://localhost:3306/druid",
    "user": "druid",
    "password": "diurd",
    "segmentTable": "prod_segments"
  }
}
```
2. Now run the job, with the config pointing at batchConfig.json:
```bash
java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 \
-Ddruid.realtime.specFile=realtime.spec \
-classpath lib/* com.metamx.druid.indexer.HadoopDruidIndexerMain batchConfig.json
```
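If the job succeeds, segments and their metadata end up in the locations configured in batchConfig.json. A quick check, using the paths and table name from the config above (adjust them if you changed the config):
```bash
# Segment files land under the configured segmentOutputPath
ls -R /tmp/segments
# The indexer also registers each segment in the MySQL segment table
mysql -u druid -pdiurd druid -e "SELECT * FROM prod_segments;"
```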
You can now move on to [Querying Your Data](Querying-Your-Data.html)!