mirror of https://github.com/apache/druid.git

Merge pull request #1131 from druid-io/fix-tutorials
Go through and fix mistakes in tutorials and docs

commit 683fa1dc65
@@ -253,6 +253,10 @@ The configuration options are:
 |partitionDimension|the dimension to partition on. Leave blank to select a dimension automatically.|no|
 |assumeGrouped|assume input data has already been grouped on time and dimensions. Ingestion will run faster, but can choose suboptimal partitions if the assumption is violated.|no|
 
+### Remote Hadoop Cluster
+
+If you have a remote Hadoop cluster, make sure to include the folder holding your configuration `*.xml` files in the classpath of the indexer.
+
 Batch Ingestion Using the Indexing Service
 ------------------------------------------
 
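For context, the two table rows above describe fields of the Hadoop batch ingestion partitionsSpec. A minimal sketch of how they fit together is shown below; the partition type and all values are illustrative, not taken from this patch:

```json
"partitionsSpec" : {
  "type" : "dimension",
  "targetPartitionSize" : 5000000,
  "partitionDimension" : "page",
  "assumeGrouped" : false
}
```
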
@@ -364,6 +368,10 @@ If the task succeeds, you should see in the logs of the indexing service:
 2013-10-16 16:38:31,945 INFO [pool-6-thread-1] io.druid.indexing.overlord.exec.TaskConsumer - Task SUCCESS: HadoopIndexTask...
 ```
 
+### Remote Hadoop Cluster
+
+If you have a remote Hadoop cluster, make sure to include the folder holding your configuration `*.xml` files in the classpath of the middle manager.
+
 Having Problems?
 ----------------
 Getting data into Druid can definitely be difficult for first time users. Please don't hesitate to ask questions in our IRC channel or on our [google groups page](https://groups.google.com/forum/#!forum/druid-development).
@@ -11,6 +11,8 @@ To override the default Hadoop version, both the Hadoop Index Task and the stand
 
 The Hadoop Index Task takes this parameter has part of the task JSON and the standalone Hadoop indexer takes this parameter as a command line argument.
 
+If you are still having problems, include all relevant hadoop jars at the beginning of the classpath of your indexing or historical nodes.
+
 Working with Hadoop 1.x and older
 ---------------------------------
 
@@ -74,7 +74,7 @@ The Index Task is a simpler variation of the Index Hadoop task that is designed
 "type" : "index",
 "targetPartitionSize" : -1,
 "rowFlushBoundary" : 0,
-"numShards": 2
+"numShards": 1
 }
 }
 }
@@ -105,9 +105,9 @@ The tuningConfig is optional and default parameters will be used if no tuningCon
 |property|description|default|required?|
 |--------|-----------|-------|---------|
 |type|The task type, this should always be "index".|None.|yes|
-|targetPartitionSize|Used in sharding. Determines how many rows are in each segment.|5000000|no|
+|targetPartitionSize|Used in sharding. Determines how many rows are in each segment. Set this to -1 to use numShards instead for sharding.|5000000|no|
 |rowFlushBoundary|Used in determining when intermediate persist should occur to disk.|500000|no|
-|numShards|You can skip the intermediate persist step if you specify the number of shards you want and set targetPartitionSize=-1.|null|no|
+|numShards|Directly specify the number of shards to create. You can skip the intermediate persist step if you specify the number of shards you want and set targetPartitionSize=-1.|null|no|
 
 ### Index Hadoop Task
 
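To make the relationship between targetPartitionSize and numShards concrete, a tuningConfig that shards by an explicit shard count would look roughly like this (a sketch; the rowFlushBoundary and shard count values are illustrative):

```json
"tuningConfig" : {
  "type" : "index",
  "targetPartitionSize" : -1,
  "rowFlushBoundary" : 500000,
  "numShards" : 2
}
```
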
@@ -116,14 +116,14 @@ The Hadoop Index Task is used to index larger data sets that require the paralle
 ```
 {
 "type" : "index_hadoop",
-"config": <Hadoop index config>
+"spec": <Hadoop index spec>
 }
 ```
 
 |property|description|required?|
 |--------|-----------|---------|
 |type|The task type, this should always be "index_hadoop".|yes|
-|config|A Hadoop Index Config. See [Batch Ingestion](Batch-ingestion.html)|yes|
+|spec|A Hadoop Index Spec. See [Batch Ingestion](Batch-ingestion.html)|yes|
 |hadoopCoordinates|The Maven \<groupId\>:\<artifactId\>:\<version\> of Hadoop to use. The default is "org.apache.hadoop:hadoop-client:2.3.0".|no|
 
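Putting the renamed field together with the optional hadoopCoordinates property from the table, a Hadoop index task submission looks roughly like the sketch below; the spec placeholder is the doc's own, and the coordinates value is simply the documented default:

```
{
  "type" : "index_hadoop",
  "spec" : <Hadoop index spec>,
  "hadoopCoordinates" : "org.apache.hadoop:hadoop-client:2.3.0"
}
```
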
@@ -131,7 +131,7 @@ The Hadoop Index Config submitted as part of an Hadoop Index Task is identical t
 
 #### Using your own Hadoop distribution
 
-Druid is compiled against Apache hadoop-client 2.3.0. However, if you happen to use a different flavor of hadoop that is API compatible with hadoop-client 2.3.0, you should only have to change the hadoopCoordinates property to point to the maven artifact used by your distribution.
+Druid is compiled against Apache hadoop-client 2.3.0. However, if you happen to use a different flavor of hadoop that is API compatible with hadoop-client 2.3.0, you should only have to change the hadoopCoordinates property to point to the maven artifact used by your distribution. For non-API compatible versions, please see [here](Other-Hadoop.html).
 
 #### Resolving dependency conflicts running HadoopIndexTask
 
@@ -10,7 +10,7 @@ About the data
 
 The data source we'll be working with is Wikipedia edits. Each time an edit is made in Wikipedia, an event gets pushed to an IRC channel associated with the language of the Wikipedia page. We scrape IRC channels for several different languages and load this data into Druid.
 
-Each event has a timestamp indicating the time of the edit (in UTC time), a list of dimensions indicating various metadata about the event (such as information about the user editing the page and where the user resides), and a list of metrics associated with the event (such as the number of characters added and deleted).
+Each event has a timestamp indicating the time of the edit (in UTC time), a list of dimensions indicating various metadata about the event (such as information about the user editing the page and where the user is a bot), and a list of metrics associated with the event (such as the number of characters added and deleted).
 
 Specifically. the data schema looks like so:
 
@@ -40,8 +40,6 @@ Metrics (things to aggregate over):
 "deleted"
 ```
 
-These metrics track the number of characters added, deleted, and changed.
-
 Setting Up
 ----------
 
@@ -83,12 +81,14 @@ Select "wikipedia".
 Note that the first time you start the example, it may take some extra time due to its fetching various dependencies. Once the node starts up you will see a bunch of logs about setting up properties and connecting to the data source. If everything was successful, you should see messages of the form shown below.
 
 ```
-2013-09-04 19:33:11,922 INFO [main] org.eclipse.jetty.server.AbstractConnector - Started SelectChannelConnector@0.0.0.0:8083
-2013-09-04 19:33:11,946 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - irc connection to server [irc.wikimedia.org] established
-2013-09-04 19:33:11,946 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #en.wikipedia
-2013-09-04 19:33:11,946 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #fr.wikipedia
-2013-09-04 19:33:11,946 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #de.wikipedia
-2013-09-04 19:33:11,946 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #ja.wikipedia
+2015-02-17T21:46:36,804 INFO [main] org.eclipse.jetty.server.ServerConnector - Started ServerConnector@79b6cf95{HTTP/1.1}{0.0.0.0:8083}
+2015-02-17T21:46:36,804 INFO [main] org.eclipse.jetty.server.Server - Started @9580ms
+2015-02-17T21:46:36,862 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - irc connection to server [irc.wikimedia.org] established
+2015-02-17T21:46:36,862 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #en.wikipedia
+2015-02-17T21:46:36,863 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #fr.wikipedia
+2015-02-17T21:46:36,863 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #de.wikipedia
+2015-02-17T21:46:36,863 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #ja.wikipedia
+2015-02-17T21:46:37,009 INFO [ServerInventoryView-0] io.druid.client.BatchServerInventoryView - Inventory Initialized
 ```
 
 The Druid real time-node ingests events in an in-memory buffer. Periodically, these events will be persisted to disk. If you are interested in the details of our real-time architecture and why we persist indexes to disk, we suggest you read our [White Paper](http://static.druid.io/docs/druid.pdf).
@@ -110,8 +110,6 @@ Select "wikipedia" once again. This script issues [TimeBoundary](TimeBoundaryQue
 
 The **timeBoundary** query is one of the simplest queries you can make in Druid. It gives you the boundaries of the ingested data.
 
-We are **filtering** via the `namespace` dimension, to only look at edits on `articles`. Our **aggregations** are what we are calculating: a count of the number of data rows, and a count of the number of edits that have occurred.
-
 The result looks something like this (when it's prettified):
 
 ```json
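For reference, the timeBoundary query the tutorial script issues against the wikipedia datasource is about as small as a Druid query gets; its shape is roughly:

```json
{
  "queryType" : "timeBoundary",
  "dataSource" : "wikipedia"
}
```
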
@@ -150,7 +148,7 @@ We are going to make a slightly more complicated query, the [TimeseriesQuery](Ti
 }
 ```
 
-Our query has now expanded to include a time interval, [Granularities](Granularities.html), and [Aggregations](Aggregations.html). What the query is doing is aggregating a set metrics over a span of time, and the results are put into a single bucket.
+Our query has now expanded to include a time interval, [Granularities](Granularities.html), and [Aggregations](Aggregations.html). What the query is doing is aggregating a set of metrics over a span of time, and the results are grouped into a single time bucket.
 To issue the query and get some results, run the following in your command line:
 
 ```
@@ -170,7 +168,7 @@ If you issue the query again, you should notice your results updating.
 
 Right now all the results you are getting back are being aggregated into a single timestamp bucket. What if we wanted to see our aggregations on a per minute basis?
 
-We can change granularity our the results to minute. To specify different granularities to bucket our results, we change our schema like so:
+We can change granularity our the results to minute. To specify different granularities to bucket our results, we change our query like so:
 
 ```json
 {
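The hunk cuts the tutorial's example off at the opening brace; a minute-granularity timeseries query in the spirit of that example would look roughly like this (the interval, aggregator names, and field names are illustrative, modeled on the tutorial's wikipedia schema):

```json
{
  "queryType" : "timeseries",
  "dataSource" : "wikipedia",
  "intervals" : [ "2010-01-01/2020-01-01" ],
  "granularity" : "minute",
  "aggregations" : [
    { "type" : "count", "name" : "rows" },
    { "type" : "longSum", "name" : "edit_count", "fieldName" : "count" }
  ]
}
```
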
@@ -234,7 +232,7 @@ and put the following in there:
 }
 ```
 
-Note that our query now includes [Filters](Filters.html).
+Note that our query now includes [Filters](Filters.html). Filters are like `WHERE` clauses in SQL and help narrow down the data that needs to be scanned.
 
 If you issue the query:
 
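As an illustration of the filter the revised sentence refers to, a selector filter embedded in a query looks roughly like this (the dimension and value are illustrative):

```json
"filter" : {
  "type" : "selector",
  "dimension" : "namespace",
  "value" : "article"
}
```
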
@@ -258,14 +256,16 @@ You should see an answer to our question. As an example, some results are shown
 ]
 ```
 
-Feel free to tweak other query parameters to answer other questions you may have about the data.
+Feel free to tweak other query parameters to answer other questions you may have about the data. Druid also includes more complex query types such as [groupBy queries](GroupByQuery.html).
 
 Next Steps
 ----------
 
-Want to know even more information about the Druid Cluster? Check out [The Druid Cluster](Tutorial%3A-The-Druid-Cluster.html).
+This tutorial only covered the basic operations of a single Druid node. For production, you'll likely need a full Druid cluster. Check out our next tutorial [The Druid Cluster](Tutorial%3A-The-Druid-Cluster.html) to learn more.
 
-Druid is even more fun if you load your own data into it! To learn how to load your data, see [Loading Your Data](Tutorial%3A-Loading-Your-Data-Part-1.html).
+To learn more about loading streaming data, see [Loading Streaming Data](Tutorial%3A-Loading-Streaming-Data.html).
+
+To learn more about loading batch data, see [Loading Batch Data](Tutorial%3A-Loading-Batch-Data.html).
 
 Additional Information
 ----------------------
 
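Since the revised text points readers at groupBy queries, here is a minimal sketch of one over the same datasource (the dimension and aggregator are illustrative):

```json
{
  "queryType" : "groupBy",
  "dataSource" : "wikipedia",
  "granularity" : "all",
  "intervals" : [ "2010-01-01/2020-01-01" ],
  "dimensions" : [ "page" ],
  "aggregations" : [
    { "type" : "longSum", "name" : "edit_count", "fieldName" : "count" }
  ]
}
```
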
@@ -4,7 +4,7 @@ layout: doc_page
 
 # Tutorial: Loading Batch Data
 
-In this tutorial, we will learn about batch ingestion (as opposed to real-time ingestion) and how to create segments using the final piece of the Druid Cluster, the [indexing service](Indexing-Service.html). The indexing service is a standalone service that accepts [tasks](Tasks.html) in the form of POST requests. The output of most tasks are segments.
+In this tutorial, we will learn about batch ingestion (as opposed to real-time ingestion) and how to create segments using the final piece of the Druid Cluster, the [indexing service](Indexing-Service.html). The indexing service is a standalone service that accepts [tasks](Tasks.html) in the form of POST requests. The output of most tasks are segments. The indexing service can be used as a single service for both real-time/streaming and batch ingestion.
 
 The Data
 --------
 
@@ -38,11 +38,8 @@ Metrics (things to aggregate over):
 
 Batch Ingestion
 ---------------
-Druid is designed for large data volumes, and most real-world data sets require batch indexing be done through a Hadoop job.
 
-For this tutorial, we used [Hadoop 2.3.0](https://archive.apache.org/dist/hadoop/core/hadoop-2.3.0/). There are many pages on the Internet showing how to set up a single-node (standalone) Hadoop cluster, which is all that's needed for this example.
-
-For the purposes of this tutorial, we are going to use our very small and simple Wikipedia data set. This data can directly be ingested via other means as shown in the previous [tutorial](Tutorial%3A-Loading-Your-Data-Part-1.html), but we are going to use Hadoop here for demonstration purposes.
+For the purposes of this tutorial, we are going to use our very small and simple Wikipedia data set. This data can directly be ingested via other means as shown in the previous [tutorial](Tutorial%3A-Loading-Your-Data-Part-1.html).
 
 Our data is located at:
 
@@ -164,7 +161,7 @@ Open up the file to see the following:
 }
 ```
 
-Okay, so what is happening here? The "type" field indicates the type of task we plan to run. In this case, it is a simple "index" task. The "granularitySpec" indicates that we are building a daily segment for 2013-08-31 to 2013-09-01. Next, the "aggregators" indicate which fields in our data set we plan to build metric columns for. The "fieldName" corresponds to the metric name in the raw data. The "name" corresponds to what our metric column is actually going to be called in the segment. Finally, we have a local "firehose" that is going to read data from disk. We tell the firehose where our data is located and the types of files we are looking to ingest. In our case, we only have a single data file.
+Okay, so what is happening here? The "type" field indicates the type of task we plan to run. In this case, it is a simple "index" task. The "parseSpec" indicates how we plan to figure out what the timestamp and dimension columns are. The "granularitySpec" indicates that we are building a daily segment for 2013-08-31 to 2013-09-01 and the minimum queryGranularity will be millisecond (NONE). Next, the "metricsSpec" indicate which fields in our data set we plan to build metric columns for. The "fieldName" corresponds to the metric name in the raw data. The "name" corresponds to what our metric column is actually going to be called in the segment. Finally, we have a local "firehose" that is going to read data from disk. We tell the firehose where our data is located and the types of files we are looking to ingest. In our case, we only have a single data file.
 
 Let's send our task to the indexing service now:
 
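To tie the pieces named in that paragraph together, a heavily abridged index task of this era has roughly the following shape; the dimension names, metric names, interval, and file names below are illustrative and follow the surrounding hunks rather than the real examples/indexing/wikipedia_index_task.json, which may differ in detail:

```json
{
  "type" : "index",
  "spec" : {
    "dataSchema" : {
      "dataSource" : "wikipedia",
      "parser" : {
        "type" : "string",
        "parseSpec" : {
          "format" : "json",
          "timestampSpec" : { "column" : "timestamp", "format" : "iso" },
          "dimensionsSpec" : { "dimensions" : [ "page", "language", "user", "namespace" ] }
        }
      },
      "metricsSpec" : [
        { "type" : "count", "name" : "count" },
        { "type" : "doubleSum", "name" : "added", "fieldName" : "added" },
        { "type" : "doubleSum", "name" : "deleted", "fieldName" : "deleted" }
      ],
      "granularitySpec" : {
        "type" : "uniform",
        "segmentGranularity" : "DAY",
        "queryGranularity" : "NONE",
        "intervals" : [ "2013-08-31/2013-09-01" ]
      }
    },
    "ioConfig" : {
      "type" : "index",
      "firehose" : {
        "type" : "local",
        "baseDir" : "examples/indexing/",
        "filter" : "wikipedia_data.json"
      }
    }
  }
}
```
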
@@ -175,9 +172,8 @@ curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedi
 Issuing the request should return a task ID like so:
 
 ```bash
-$ curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedia_index_task.json localhost:8087/druid/indexer/v1/task
+curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedia_index_task.json localhost:8087/druid/indexer/v1/task
 {"task":"index_wikipedia_2013-10-09T21:30:32.802Z"}
-$
 ```
 
 In your indexing service logs, you should see the following:
 
@@ -197,7 +193,7 @@ After a few seconds, the task should complete and you should see in the indexing
 2013-10-09 21:41:45,765 INFO [pool-6-thread-1] io.druid.indexing.overlord.exec.TaskConsumer - Received SUCCESS status for task: IndexGeneratorTask{id=index_wikipedia_2013-10-09T21:41:41.147Z_generator_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_0, type=index_generator, dataSource=wikipedia, interval=Optional.of(2013-08-31T00:00:00.000Z/2013-09-01T00:00:00.000Z)}
 ```
 
-Congratulations! The segment has completed building. Once a segment is built, a segment metadata entry is created in your MySQL table. The coordinator compares what is in the segment metadata table with what is in the cluster. A new entry in the metadata table will cause the coordinator to load the new segment in a minute or so.
+Congratulations! The segment has completed building. Once a segment is built, a segment metadata entry is created in your metadata storage table. The coordinator compares what is in the segment metadata table with what is in the cluster. A new entry in the metadata table will cause the coordinator to load the new segment in a minute or so.
 
 You should see the following logs on the coordinator:
 
@@ -249,6 +245,10 @@ Most common data ingestion problems are around timestamp formats and other malfo
 
 #### Hadoop Index Task
 
+Druid is designed for large data volumes, and most real-world data sets require batch indexing be done through a Hadoop job.
+
+For this tutorial, we used [Hadoop 2.3.0](https://archive.apache.org/dist/hadoop/core/hadoop-2.3.0/). There are many pages on the Internet showing how to set up a single-node (standalone) Hadoop cluster, which is all that's needed for this example.
+
 Before indexing the data, make sure you have a valid Hadoop cluster running. To build our Druid segment, we are going to submit a [Hadoop index task](Tasks.html) to the indexing service. The grammar for the Hadoop index task is very similar to the index task of the last tutorial. The tutorial Hadoop index task should be located at:
 
 ```
|
@ -26,9 +26,9 @@ You can also [Build From Source](Build-from-source.html).
|
||||||
|
|
||||||
## External Dependencies
|
## External Dependencies
|
||||||
|
|
||||||
Druid requires 3 external dependencies. A "deep" storage that acts as a backup data repository, a relational database such as MySQL to hold configuration and metadata information, and [Apache Zookeeper](http://zookeeper.apache.org/) for coordination among different pieces of the cluster.
|
Druid requires 3 external dependencies. A "deep storage" that acts as a backup data repository, a "metadata storage" such as MySQL to hold configuration and metadata information, and [Apache Zookeeper](http://zookeeper.apache.org/) for coordination among different pieces of the cluster.
|
||||||
|
|
||||||
For deep storage, we will use local disk in this tutorial.
|
For deep storage, we will use local disk in this tutorial, but for production, HDFS and S3 are popular options. For the metadata storage, we'll be using MySQL, but other options such as PostgreSQL are also supported.
|
||||||
|
|
||||||
#### Set up Metadata storage
|
#### Set up Metadata storage
|
||||||
|
|
||||||
|
@@ -112,22 +112,22 @@ In the directory, there should be a `common.runtime.properties` file with the fo
 
 ```
 # Extensions
-druid.extensions.coordinates=["io.druid.extensions:druid-examples","io.druid.extensions:druid-kafka-seven","io.druid.extensions:mysql-metadata-storage"]
+druid.extensions.coordinates=["io.druid.extensions:druid-examples","io.druid.extensions:druid-kafka-eight","io.druid.extensions:mysql-metadata-storage"]
 
 # Zookeeper
 druid.zk.service.host=localhost
 
-# Metadata Storage
+# Metadata Storage (mysql)
 druid.metadata.storage.type=mysql
 druid.metadata.storage.connector.connectURI=jdbc\:mysql\://localhost\:3306/druid
 druid.metadata.storage.connector.user=druid
 druid.metadata.storage.connector.password=diurd
 
-# Deep storage
+# Deep storage (local filesystem for examples - don't use this in production)
 druid.storage.type=local
 druid.storage.storage.storageDirectory=/tmp/druid/localStorage
 
-# Cache (we use a simple 10mb heap-based local cache on the broker)
+# Query Cache (we use a simple 10mb heap-based local cache on the broker)
 druid.cache.type=local
 druid.cache.sizeInBytes=10000000
 
@@ -172,6 +172,8 @@ To start the coordinator node:
 java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/_common:config/coordinator io.druid.cli.Main server coordinator
 ```
 
+Note: we will be running a single historical node in these examples, so you may see some warnings about not being able to replicate segments. These can be safely ignored, but in production, you should always replicate segments across multiple historical nodes.
+
 #### Start a Historical Node
 
 Historical nodes are the workhorses of a cluster and are in charge of loading historical segments and making them available for queries. Realtime nodes hand off segments to historical nodes.
@@ -321,7 +323,7 @@ This query may produce no results if the realtime node hasn't run long enough to
 curl -X POST 'http://localhost:8083/druid/v2/?pretty' -H 'content-type: application/json' -d@examples/wikipedia/query.body
 ```
 
-The realtime query results will reflect the data that was recently indexed from wikipedia, and not handed off to the historical node yet. Once the historical node acknowledges it has loaded the segment, the realtime node will stop querying the segment.
+The realtime query results will reflect the data that was recently indexed from wikipedia, and not handed off to the historical node yet. Once the historical node acknowledges it has loaded the segment, the realtime node will drop the segment.
 
 Querying the historical and realtime node directly is useful for understanding how the segment handling is working, but if you just want to run a query for all the data (realtime and historical), then send the query to the broker at port 8080 (which is what we did in the first example). The broker will send the query to the historical and realtime nodes and merge the results.
 
@@ -330,4 +332,4 @@ For more information on querying, see this [link](Querying.html).
 Next Steps
 ----------
 If you are interested in how data flows through the different Druid components, check out the [Druid data flow architecture](Design.html). Now that you have an understanding of what the Druid cluster looks like, why not load some of your own data?
-Check out the next [tutorial](Tutorial%3A-Loading-Your-Data-Part-1.html) section for more info!
+Check out the next [tutorial](Tutorial%3A-Loading-Streaming-Data.html) section for more info!
@@ -4,7 +4,7 @@ layout: doc_page
 
 # About Druid
 
-Druid is an open-source analytics data store designed for real-time exploratory queries on large-scale data sets (100’s of Billions entries, 100’s TB data). Druid provides for cost-effective and always-on realtime data ingestion and arbitrary data exploration.
+Druid is an open-source analytics data store designed for real-time exploratory queries on large-scale data sets (trillions of events, petabytes of data). Druid provides cost-effective and always-on realtime data ingestion and arbitrary data exploration.
 
 - Try out Druid with our Getting Started [Tutorial](./Tutorial%3A-A-First-Look-at-Druid.html)
 - Learn more by reading the [White Paper](http://static.druid.io/docs/druid.pdf)
 
@@ -63,7 +63,7 @@
 "type": "hadoop",
 "inputSpec": {
 "type": "static",
-"paths": "/myPath/druid-services-0.7.0-rc3/examples/indexing/wikipedia_data.json"
+"paths": "examples/indexing/wikipedia_data.json"
 }
 },
 "tuningConfig": {
@@ -63,7 +63,7 @@
 "type": "index",
 "firehose": {
 "type": "local",
-"baseDir": "/MyPath/druid-services-0.7.0-rc3/examples/indexing/",
+"baseDir": "examples/indexing/",
 "filter": "wikipedia_data.json"
 }
 },
@@ -49,7 +49,7 @@ if [ -e ${EXAMPLE_LOC}/before.sh ]; then
 fi
 
 # start process
-JAVA_ARGS="-Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8"
+JAVA_ARGS="-Xmx512m -Duser.timezone=UTC -Dfile.encoding=UTF-8"
 JAVA_ARGS="${JAVA_ARGS} -Ddruid.realtime.specFile=${SPEC_FILE}"
 
 
@@ -21,17 +21,19 @@ druid.extensions.coordinates=["io.druid.extensions:druid-examples","io.druid.ext
 # Zookeeper
 druid.zk.service.host=localhost
 
-# Metadata Storage
+# Metadata Storage (mysql)
 druid.metadata.storage.type=mysql
 druid.metadata.storage.connector.connectURI=jdbc\:mysql\://localhost\:3306/druid
 druid.metadata.storage.connector.user=druid
 druid.metadata.storage.connector.password=diurd
 
-# Deep storage
+# Deep storage (local filesystem for examples - don't use this in production)
 druid.storage.type=local
 druid.storage.storage.storageDirectory=/tmp/druid/localStorage
 
-# Cache can be defined here if we have a distributed cache
+# Query Cache (we use a simple 10mb heap-based local cache on the broker)
+druid.cache.type=local
+druid.cache.sizeInBytes=10000000
 
 # Indexing service discovery
 druid.selectors.indexing.serviceName=overlord
 
@@ -19,9 +19,7 @@ druid.host=localhost
 druid.port=8080
 druid.service=broker
 
-# We use a simple 10mb heap-based local cache
-druid.cache.type=local
-druid.cache.sizeInBytes=10000000
+# We enable using the local query cache here
 druid.broker.cache.useCache=true
 druid.broker.cache.populateCache=true
 
 
@@ -16,7 +16,7 @@
 #
 
 druid.host=localhost
-druid.port=8080
+druid.port=8087
 druid.service=overlord
 
 # Run the overlord in local mode with a single peon to execute tasks
 
@@ -97,16 +97,25 @@ class WikipediaIrcDecoder implements IrcDecoder
     } else {
       try {
         String tmpDir = System.getProperty("java.io.tmpdir");
 
         geoDb = new File(tmpDir, this.getClass().getCanonicalName() + ".GeoLite2-City.mmdb");
 
         if (!geoDb.exists()) {
-          log.info("Downloading geo ip database to [%s]", geoDb);
+          log.info("Downloading geo ip database to [%s]. This may take a few minutes.", geoDb);
+
+          File tmpFile = File.createTempFile("druid", "geo");
 
           FileUtils.copyInputStreamToFile(
               new GZIPInputStream(
                   new URL("http://geolite.maxmind.com/download/geoip/database/GeoLite2-City.mmdb.gz").openStream()
               ),
-              geoDb
+              tmpFile
           );
+          if (!tmpFile.renameTo(geoDb)) {
+            throw new RuntimeException("Unable to move geo file!");
+          }
+        } else {
+          log.info("Using geo ip database at [%s].", geoDb);
         }
       }
       catch (IOException e) {