New quickstart and tutorials (#6126)

* New quickstart and tutorials

* PR comments

* Fix tranquility
This commit is contained in:
Jonathan Wei 2018-08-09 13:37:52 -07:00 committed by David Lim
parent 2b0f03acb9
commit 24f2e8ba26
96 changed files with 6298 additions and 1097 deletions

NOTICE
View File

@ -82,4 +82,12 @@ This product contains code adapted from Apache Hadoop
* LICENSE:
* https://github.com/apache/hadoop/blob/trunk/LICENSE.txt (Apache License, Version 2.0)
* HOMEPAGE:
* http://hadoop.apache.org/
This product contains modified versions of the Dockerfile and related configuration files from SequenceIQ's Hadoop Docker image:
* LICENSE:
* https://github.com/sequenceiq/hadoop-docker/blob/master/LICENSE (Apache License, Version 2.0)
* HOMEPAGE:
* https://github.com/sequenceiq/hadoop-docker/
* COMMIT TAG:
* update this when this patch is committed

View File

@ -52,70 +52,99 @@
</includes>
<outputDirectory>quickstart</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/quickstart/tutorial</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>quickstart/tutorial</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/quickstart/tutorial/conf</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>quickstart/tutorial/conf</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/quickstart/tutorial/conf/druid</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>quickstart/tutorial/conf/druid</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/quickstart/tutorial/conf/druid/_common</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>quickstart/tutorial/conf/druid/_common/</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/quickstart/tutorial/conf/druid/broker</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>quickstart/tutorial/conf/druid/broker</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/quickstart/tutorial/conf/druid/coordinator</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>quickstart/tutorial/conf/druid/coordinator</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/quickstart/tutorial/conf/druid/historical</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>quickstart/tutorial/conf/druid/historical</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/quickstart/tutorial/conf/druid/overlord</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>quickstart/tutorial/conf/druid/overlord</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/quickstart/tutorial/conf/druid/middleManager</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>quickstart/tutorial/conf/druid/middleManager</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/quickstart/tutorial/conf/tranquility</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>quickstart/tutorial/conf/tranquility</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/quickstart/tutorial/conf/zk</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>quickstart/tutorial/conf/zk</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/quickstart/tutorial/hadoop</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>quickstart/tutorial/hadoop</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/quickstart/tutorial/hadoop/docker</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>quickstart/tutorial/hadoop/docker</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/conf-quickstart</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>conf-quickstart</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/conf-quickstart/druid</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>conf-quickstart/druid</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/conf-quickstart/druid/_common</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>conf-quickstart/druid/_common/</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/conf-quickstart/druid/broker</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>conf-quickstart/druid/broker</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/conf-quickstart/druid/coordinator</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>conf-quickstart/druid/coordinator</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/conf-quickstart/druid/historical</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>conf-quickstart/druid/historical</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/conf-quickstart/druid/overlord</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>conf-quickstart/druid/overlord</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/conf-quickstart/druid/middleManager</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>conf-quickstart/druid/middleManager</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/conf-quickstart/tranquility</directory>
<includes>
<include>*</include>
</includes>
<outputDirectory>conf-quickstart/tranquility</outputDirectory>
</fileSet>
<fileSet>
<directory>../examples/quickstart/protobuf</directory>
<includes>

View File

@ -4,11 +4,19 @@ layout: toc
## Getting Started
* [Concepts](/docs/VERSION/design/)
* [Quickstart](/docs/VERSION/tutorials/quickstart.html)
* [Loading Data](/docs/VERSION/tutorials/ingestion.html)
* [Loading from Files](/docs/VERSION/tutorials/tutorial-batch.html)
* [Loading from Streams](/docs/VERSION/tutorials/tutorial-streams.html)
* [Loading from Kafka](/docs/VERSION/tutorials/tutorial-kafka.html)
* [Quickstart](/docs/VERSION/tutorials/index.html)
* [Tutorial: Loading a file](/docs/VERSION/tutorials/tutorial-batch.html)
* [Tutorial: Loading stream data from Kafka](/docs/VERSION/tutorials/tutorial-kafka.html)
* [Tutorial: Loading a file using Hadoop](/docs/VERSION/tutorials/tutorial-batch-hadoop.html)
* [Tutorial: Loading stream data using HTTP push](/docs/VERSION/tutorials/tutorial-tranquility.html)
* [Tutorial: Querying data](/docs/VERSION/tutorials/tutorial-query.html)
* [Further tutorials](/docs/VERSION/tutorials/advanced.html)
* [Tutorial: Rollup](/docs/VERSION/tutorials/rollup.html)
* [Tutorial: Configuring retention](/docs/VERSION/tutorials/tutorial-retention.html)
* [Tutorial: Updating existing data](/docs/VERSION/tutorials/tutorial-update-data.html)
* [Tutorial: Compacting segments](/docs/VERSION/tutorials/tutorial-compaction.html)
* [Tutorial: Deleting data](/docs/VERSION/tutorials/tutorial-delete-data.html)
* [Tutorial: Writing your own ingestion specs](/docs/VERSION/tutorials/tutorial-ingestion-spec.html)
* [Clustering](/docs/VERSION/tutorials/cluster.html)
## Data Ingestion

Nine binary image files added (tutorial screenshots, 28 KiB to 220 KiB each); contents not shown.

View File

@ -0,0 +1,168 @@
---
layout: doc_page
---
# Druid Quickstart
In this quickstart, we will download Druid and set it up on a single machine. The cluster will be ready to load data
after completing this initial setup.
Before beginning the quickstart, it is helpful to read the [general Druid overview](../design/index.html) and the
[ingestion overview](../ingestion/index.html), as the tutorials will refer to concepts discussed on those pages.
## Prerequisites
You will need:
* Java 8
* Linux, Mac OS X, or other Unix-like OS (Windows is not supported)
* 8G of RAM
* 2 vCPUs
On Mac OS X, you can use [Oracle's JDK
8](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html) to install
Java.
On Linux, your OS package manager should be able to install Java for you. If your Ubuntu-
based OS does not have a recent enough version of Java, WebUpd8 offers [packages for those
OSes](http://www.webupd8.org/2012/09/install-oracle-java-8-in-ubuntu-via-ppa.html).
## Getting started
To install Druid, run the following commands in your terminal:
```bash
curl -O http://static.druid.io/artifacts/releases/druid-#{DRUIDVERSION}-bin.tar.gz
tar -xzf druid-#{DRUIDVERSION}-bin.tar.gz
cd druid-#{DRUIDVERSION}
```
In the package, you should find:
* `LICENSE` - the license files.
* `bin/` - scripts useful for this quickstart.
* `conf/*` - template configurations for a clustered setup.
* `extensions/*` - all Druid extensions.
* `hadoop-dependencies/*` - Druid Hadoop dependencies.
* `lib/*` - all included software packages for core Druid.
* `quickstart/*` - configuration files, sample data, and other files for the quickstart tutorials.
## Download Zookeeper
Druid currently has a dependency on [Apache ZooKeeper](http://zookeeper.apache.org/) for distributed coordination. You'll
need to download and run Zookeeper.
In the package root, run the following commands:
```bash
curl https://archive.apache.org/dist/zookeeper/zookeeper-3.4.11/zookeeper-3.4.11.tar.gz -o zookeeper-3.4.11.tar.gz
tar -xzf zookeeper-3.4.11.tar.gz
mv zookeeper-3.4.11 zk
```
The startup scripts for the tutorial will expect the contents of the Zookeeper tarball to be located at `zk` under the druid-#{DRUIDVERSION} package root.
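As a quick sanity check (not part of the official setup; assuming the commands above were run from the package root), you can confirm that the expected ZooKeeper files are in place:

```bash
# The tutorial run scripts expect the ZooKeeper scripts and sample config under zk/
ls zk/bin/zkServer.sh zk/conf/zoo_sample.cfg
```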
## Start up Druid services
From the druid-#{DRUIDVERSION} package root, run the following command:
```bash
bin/supervise -c quickstart/tutorial/conf/tutorial-cluster.conf
```
This will bring up instances of Zookeeper and the Druid services, all running on the local machine, e.g.:
```
bin/supervise -c quickstart/tutorial/conf/tutorial-cluster.conf
[Thu Jul 26 12:16:23 2018] Running command[zk], logging to[/stage/druid-#{DRUIDVERSION}/var/sv/zk.log]: bin/run-zk quickstart/tutorial/conf
[Thu Jul 26 12:16:23 2018] Running command[coordinator], logging to[/stage/druid-#{DRUIDVERSION}/var/sv/coordinator.log]: bin/run-druid coordinator quickstart/tutorial/conf
[Thu Jul 26 12:16:23 2018] Running command[broker], logging to[/stage/druid-#{DRUIDVERSION}/var/sv/broker.log]: bin/run-druid broker quickstart/tutorial/conf
[Thu Jul 26 12:16:23 2018] Running command[historical], logging to[/stage/druid-#{DRUIDVERSION}/var/sv/historical.log]: bin/run-druid historical quickstart/tutorial/conf
[Thu Jul 26 12:16:23 2018] Running command[overlord], logging to[/stage/druid-#{DRUIDVERSION}/var/sv/overlord.log]: bin/run-druid overlord quickstart/tutorial/conf
[Thu Jul 26 12:16:23 2018] Running command[middleManager], logging to[/stage/druid-#{DRUIDVERSION}/var/sv/middleManager.log]: bin/run-druid middleManager quickstart/tutorial/conf
```
All persistent state such as the cluster metadata store and segments for the services will be kept in the `var` directory under the druid-#{DRUIDVERSION} package root. Logs for the services are located at `var/sv`.
Later on, if you'd like to stop the services, CTRL-C to exit the `bin/supervise` script, which will terminate the Druid processes.
If you want a clean start after stopping the services, delete the `var` directory and run the `bin/supervise` script again.
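For example, a clean restart might look like this (a sketch; make sure the services have been stopped first):

```bash
# After stopping the cluster with CTRL-C in the bin/supervise terminal:
rm -rf var
bin/supervise -c quickstart/tutorial/conf/tutorial-cluster.conf
```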
Once every service has started, you are now ready to load data.
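One way to confirm that a service is up (a sketch, not part of the quickstart scripts) is to query its HTTP status endpoint, for example the Coordinator on port 8081:

```bash
# Returns basic version and module information if the Coordinator is running
curl http://localhost:8081/status
```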
## Loading Data
### Tutorial Dataset
For the following data loading tutorials, we have included a sample data file containing Wikipedia page edit events that occurred on 2015-09-12.
This sample data is located at `quickstart/wikiticker-2015-09-12-sampled.json.gz` under the Druid package root. The page edit events are stored as JSON objects in a text file.
The sample data has the following columns, and an example event is shown below:
* added
* channel
* cityName
* comment
* countryIsoCode
* countryName
* deleted
* delta
* isAnonymous
* isMinor
* isNew
* isRobot
* isUnpatrolled
* metroCode
* namespace
* page
* regionIsoCode
* regionName
* user
```
{
  "timestamp":"2015-09-12T20:03:45.018Z",
  "channel":"#en.wikipedia",
  "namespace":"Main",
  "page":"Spider-Man's powers and equipment",
  "user":"foobar",
  "comment":"/* Artificial web-shooters */",
  "cityName":"New York",
  "regionName":"New York",
  "regionIsoCode":"NY",
  "countryName":"United States",
  "countryIsoCode":"US",
  "isAnonymous":false,
  "isNew":false,
  "isMinor":false,
  "isRobot":false,
  "isUnpatrolled":false,
  "added":99,
  "delta":99,
  "deleted":0
}
```
The following tutorials demonstrate various methods of loading data into Druid, including both batch and streaming use cases.
### [Tutorial: Loading a file](./tutorial-batch.html)
This tutorial demonstrates how to perform a batch file load, using Druid's native batch ingestion.
### [Tutorial: Loading stream data from Kafka](./tutorial-kafka.html)
This tutorial demonstrates how to load streaming data from a Kafka topic.
### [Tutorial: Loading a file using Hadoop](./tutorial-batch-hadoop.html)
This tutorial demonstrates how to perform a batch file load, using a remote Hadoop cluster.
### [Tutorial: Loading data using Tranquility](./tutorial-tranquility.html)
This tutorial demonstrates how to load streaming data by pushing events to Druid using the Tranquility service.
### [Tutorial: Writing your own ingestion spec](./tutorial-ingestion-spec.html)
This tutorial demonstrates how to write a new ingestion spec and use it to load data.

View File

@ -1,42 +0,0 @@
---
layout: doc_page
---
# Loading Data
## Choosing an ingestion method
Druid supports streaming (real-time) and file-based (batch) ingestion methods. The most
popular configurations are:
- [Files](../ingestion/batch-ingestion.html) - Load data from HDFS, S3, local files, or any supported Hadoop
filesystem in batches. We recommend this method if your dataset is already in flat files.
- [Stream push](../ingestion/stream-ingestion.html#stream-push) - Push a data stream into Druid in real-time
using [Tranquility](http://github.com/druid-io/tranquility), a client library for sending streams
to Druid. We recommend this method if your dataset originates in a streaming system like Kafka,
Storm, Spark Streaming, or your own system.
- [Stream pull](../ingestion/stream-ingestion.html#stream-pull) - Pull a data stream directly from an external
data source into Druid using Realtime Nodes.
## Getting started
The easiest ways to get started with loading your own data are the three included tutorials.
- [Files-based tutorial](tutorial-batch.html) showing you how to load files from your local disk.
- [Streams-based tutorial](tutorial-streams.html) showing you how to push data over HTTP.
- [Kafka-based tutorial](tutorial-kafka.html) showing you how to load data from Kafka.
## Hybrid batch/streaming
You can combine batch and streaming methods in a hybrid batch/streaming architecture. In a hybrid architecture,
you use a streaming method to do initial ingestion, and then periodically re-ingest older data in batch mode
(typically every few hours, or nightly). When Druid re-ingests data for a time range, the new data automatically
replaces the data from the earlier ingestion.
All streaming ingestion methods currently supported by Druid do introduce the possibility of dropped or duplicated
messages in certain failure scenarios, and batch re-ingestion eliminates this potential source of error for
historical data.
Batch re-ingestion also gives you the option to re-ingest your data if you needed to revise it for any reason.

View File

@ -1,243 +0,0 @@
---
layout: doc_page
---
# Druid Quickstart
In this quickstart, we will download Druid, set it up on a single machine, load some data, and query the data.
## Prerequisites
You will need:
* Java 8
* Linux, Mac OS X, or other Unix-like OS (Windows is not supported)
* 8G of RAM
* 2 vCPUs
On Mac OS X, you can use [Oracle's JDK
8](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html) to install
Java.
On Linux, your OS package manager should be able to help for Java. If your Ubuntu-
based OS does not have a recent enough version of Java, WebUpd8 offers [packages for those
OSes](http://www.webupd8.org/2012/09/install-oracle-java-8-in-ubuntu-via-ppa.html).
## Getting started
To install Druid, issue the following commands in your terminal:
```bash
curl -O http://static.druid.io/artifacts/releases/druid-#{DRUIDVERSION}-bin.tar.gz
tar -xzf druid-#{DRUIDVERSION}-bin.tar.gz
cd druid-#{DRUIDVERSION}
```
In the package, you should find:
* `LICENSE` - the license files.
* `bin/` - scripts useful for this quickstart.
* `conf/*` - template configurations for a clustered setup.
* `conf-quickstart/*` - configurations for this quickstart.
* `extensions/*` - all Druid extensions.
* `hadoop-dependencies/*` - Druid Hadoop dependencies.
* `lib/*` - all included software packages for core Druid.
* `quickstart/*` - files useful for this quickstart.
## Start up Zookeeper
Druid currently has a dependency on [Apache ZooKeeper](http://zookeeper.apache.org/) for distributed coordination. You'll
need to download and run Zookeeper.
```bash
curl http://www.gtlib.gatech.edu/pub/apache/zookeeper/zookeeper-3.4.11/zookeeper-3.4.11.tar.gz -o zookeeper-3.4.11.tar.gz
tar -xzf zookeeper-3.4.11.tar.gz
cd zookeeper-3.4.11
cp conf/zoo_sample.cfg conf/zoo.cfg
./bin/zkServer.sh start
```
## Start up Druid services
With Zookeeper running, return to the druid-#{DRUIDVERSION} directory. In that directory, issue the command:
```bash
bin/init
```
This will setup up some directories for you. Next, you can start up the Druid processes in different terminal windows.
This tutorial runs every Druid process on the same system. In a large distributed production cluster,
many of these Druid processes can still be co-located together.
```bash
java `cat conf-quickstart/druid/historical/jvm.config | xargs` -cp "conf-quickstart/druid/_common:conf-quickstart/druid/historical:lib/*" io.druid.cli.Main server historical
java `cat conf-quickstart/druid/broker/jvm.config | xargs` -cp "conf-quickstart/druid/_common:conf-quickstart/druid/broker:lib/*" io.druid.cli.Main server broker
java `cat conf-quickstart/druid/coordinator/jvm.config | xargs` -cp "conf-quickstart/druid/_common:conf-quickstart/druid/coordinator:lib/*" io.druid.cli.Main server coordinator
java `cat conf-quickstart/druid/overlord/jvm.config | xargs` -cp "conf-quickstart/druid/_common:conf-quickstart/druid/overlord:lib/*" io.druid.cli.Main server overlord
java `cat conf-quickstart/druid/middleManager/jvm.config | xargs` -cp "conf-quickstart/druid/_common:conf-quickstart/druid/middleManager:lib/*" io.druid.cli.Main server middleManager
```
You should see a log message printed out for each service that starts up.
Later on, if you'd like to stop the services, CTRL-C to exit from the running java processes. If you
want a clean start after stopping the services, delete the `var` directory and run the `init` script again.
Once every service has started, you are now ready to load data.
## Load batch data
We've included a sample of Wikipedia edits from September 12, 2015 to get you started.
<div class="note info">
This section shows you how to load data in batches, but you can skip ahead to learn how to <a href="quickstart.html#load-streaming-data">load
streams in real-time</a>. Druid's streaming ingestion can load data
with virtually no delay between events occurring and being available for queries.
</div>
The [dimensions](https://en.wikipedia.org/wiki/Dimension_%28data_warehouse%29) (attributes you can
filter and split on) in the Wikipedia dataset, other than time, are:
* channel
* cityName
* comment
* countryIsoCode
* countryName
* isAnonymous
* isMinor
* isNew
* isRobot
* isUnpatrolled
* metroCode
* namespace
* page
* regionIsoCode
* regionName
* user
The [measures](https://en.wikipedia.org/wiki/Measure_%28data_warehouse%29), or *metrics* as they are known in Druid (values you can aggregate)
in the Wikipedia dataset are:
* count
* added
* deleted
* delta
* user_unique
To load this data into Druid, you can submit an *ingestion task* pointing to the file. We've included
a task that loads the `wikiticker-2015-09-12-sampled.json` file included in the archive. To submit
this task, POST it to Druid in a new terminal window from the druid-#{DRUIDVERSION} directory:
```bash
curl -X 'POST' -H 'Content-Type:application/json' -d @quickstart/wikiticker-index.json localhost:8090/druid/indexer/v1/task
```
Which will print the ID of the task if the submission was successful:
```bash
{"task":"index_hadoop_wikipedia_2013-10-09T21:30:32.802Z"}
```
To view the status of your ingestion task, go to your overlord console:
[http://localhost:8090/console.html](http://localhost:8090/console.html). You can refresh the console periodically, and after
the task is successful, you should see a "SUCCESS" status for the task.
After your ingestion task finishes, the data will be loaded by historical nodes and available for
querying within a minute or two. You can monitor the progress of loading your data in the
coordinator console, by checking whether there is a datasource "wikiticker" with a blue circle
indicating "fully available": [http://localhost:8081/#/](http://localhost:8081/#/).
Once the data is fully available, you can immediately query it&mdash; to see how, skip to the [Query
data](#query-data) section below. Or, continue to the [Load your own data](#load-your-own-data)
section if you'd like to load a different dataset.
## Load streaming data
To load streaming data, we are going to push events into Druid
over a simple HTTP API. To do this we will use [Tranquility], a high level data producer
library for Druid.
To download Tranquility, issue the following commands in your terminal:
```bash
curl -O http://static.druid.io/tranquility/releases/tranquility-distribution-0.8.0.tgz
tar -xzf tranquility-distribution-0.8.0.tgz
cd tranquility-distribution-0.8.0
```
We've included a configuration file in `conf-quickstart/tranquility/server.json` as part of the Druid distribution
for a *metrics* datasource. We're going to start the Tranquility server process, which can be used to push events
directly to Druid.
``` bash
bin/tranquility server -configFile <path_to_druid_distro>/conf-quickstart/tranquility/server.json
```
<div class="note info">
This section shows you how to load data using Tranquility Server, but Druid also supports a wide
variety of <a href="../ingestion/stream-ingestion.html#stream-push">other streaming ingestion options</a>, including from
popular streaming systems like Kafka, Storm, Samza, and Spark Streaming.
</div>
The [dimensions](https://en.wikipedia.org/wiki/Dimension_%28data_warehouse%29) (attributes you can
filter and split on) for this datasource are flexible. It's configured for *schemaless dimensions*,
meaning it will accept any field in your JSON input as a dimension.
The metrics (also called
[measures](https://en.wikipedia.org/wiki/Measure_%28data_warehouse%29); values
you can aggregate) in this datasource are:
* count
* value_sum (derived from `value` in the input)
* value_min (derived from `value` in the input)
* value_max (derived from `value` in the input)
We've included a script that can generate some random sample metrics to load into this datasource.
To use it, simply run in your Druid distribution repository:
```bash
bin/generate-example-metrics | curl -XPOST -H'Content-Type: application/json' --data-binary @- http://localhost:8200/v1/post/metrics
```
Which will print something like:
```
{"result":{"received":25,"sent":25}}
```
This indicates that the HTTP server received 25 events from you, and sent 25 to Druid. Note that
this may take a few seconds to finish the first time you run it, as Druid resources must be
allocated to the ingestion task. Subsequent POSTs should complete quickly.
Once the data is sent to Druid, you can immediately [query it](#query-data).
## Query data
### Direct Druid queries
Druid supports a rich [family of JSON-based
queries](../querying/querying.html). We've included an example topN query
in `quickstart/wikiticker-top-pages.json` that will find the most-edited articles in this dataset:
```bash
curl -L -H'Content-Type: application/json' -XPOST --data-binary @quickstart/wikiticker-top-pages.json http://localhost:8082/druid/v2/?pretty
```
## Visualizing data
Druid is ideal for power user-facing analytic applications. There are a number of different open source applications to
visualize and explore data in Druid. We recommend trying [Pivot](https://github.com/implydata/pivot),
[Superset](https://github.com/airbnb/superset), or [Metabase](https://github.com/metabase/metabase) to start
visualizing the data you just ingested.
If you installed Pivot for example, you should be able to view your data in your browser at [localhost:9090](http://localhost:9090/).
### SQL and other query libraries
There are many more query tools for Druid than we've included here, including SQL
engines, and libraries for various languages like Python and Ruby. Please see [the list of
libraries](../development/libraries.html) for more information.
## Clustered setup
This quickstart sets you up with all services running on a single machine. The next step is to [load
your own data](ingestion.html). Or, you can skip ahead to [running a distributed cluster](cluster.html).

View File

@ -0,0 +1,232 @@
---
layout: doc_page
---
# Tutorial: Load batch data using Hadoop
This tutorial shows you how to load data files into Druid using a remote Hadoop cluster.
For this tutorial, we'll assume that you've already completed the previous [batch ingestion tutorial](tutorial-batch.html) using Druid's native batch ingestion system.
## Install Docker
This tutorial requires [Docker](https://docs.docker.com/install/) to be installed on the tutorial machine.
Once the Docker install is complete, please proceed to the next steps in the tutorial.
## Build the Hadoop docker image
For this tutorial, we've provided a Dockerfile for a Hadoop 2.8.3 cluster, which we'll use to run the batch indexing task.
This Dockerfile and related files are located at `quickstart/tutorial/hadoop/docker`.
From the druid-${DRUIDVERSION} package root, run the following commands to build a Docker image named "druid-hadoop-demo" with version tag "2.8.3":
```
cd quickstart/tutorial/hadoop/docker
docker build -t druid-hadoop-demo:2.8.3 .
```
This will start building the Hadoop image. Once the image build is done, you should see the message `Successfully tagged druid-hadoop-demo:2.8.3` printed to the console.
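Optionally (a sketch, not part of the tutorial steps), you can confirm that the image was built:

```
# Lists the druid-hadoop-demo image if the build succeeded
docker images druid-hadoop-demo
```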
## Set up the Hadoop docker cluster
### Create temporary shared directory
We'll need a shared folder between the host and the Hadoop container for transferring some files.
Let's create some folders under `/tmp`; we will use these later when starting the Hadoop container:
```
mkdir -p /tmp/shared
mkdir -p /tmp/shared/hadoop_xml
```
### Configure /etc/hosts
On the host machine, add the following entry to `/etc/hosts`:
```
127.0.0.1 druid-hadoop-demo
```
### Start the Hadoop container
Once the `/tmp/shared` folder has been created and the `/etc/hosts` entry has been added, run the following command to start the Hadoop container.
```
docker run -it -h druid-hadoop-demo -p 50010:50010 -p 50020:50020 -p 50075:50075 -p 50090:50090 -p 8020:8020 -p 10020:10020 -p 19888:19888 -p 8030:8030 -p 8031:8031 -p 8032:8032 -p 8033:8033 -p 8040:8040 -p 8042:8042 -p 8088:8088 -p 8443:8443 -p 2049:2049 -p 9000:9000 -p 49707:49707 -p 2122:2122 -p 34455:34455 -v /tmp/shared:/shared druid-hadoop-demo:2.8.3 /etc/bootstrap.sh -bash
```
Once the container is started, your terminal will attach to a bash shell running inside the container:
```
Starting sshd: [ OK ]
18/07/26 17:27:15 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Starting namenodes on [druid-hadoop-demo]
druid-hadoop-demo: starting namenode, logging to /usr/local/hadoop/logs/hadoop-root-namenode-druid-hadoop-demo.out
localhost: starting datanode, logging to /usr/local/hadoop/logs/hadoop-root-datanode-druid-hadoop-demo.out
Starting secondary namenodes [0.0.0.0]
0.0.0.0: starting secondarynamenode, logging to /usr/local/hadoop/logs/hadoop-root-secondarynamenode-druid-hadoop-demo.out
18/07/26 17:27:31 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
starting yarn daemons
starting resourcemanager, logging to /usr/local/hadoop/logs/yarn--resourcemanager-druid-hadoop-demo.out
localhost: starting nodemanager, logging to /usr/local/hadoop/logs/yarn-root-nodemanager-druid-hadoop-demo.out
starting historyserver, logging to /usr/local/hadoop/logs/mapred--historyserver-druid-hadoop-demo.out
bash-4.1#
```
The `Unable to load native-hadoop library for your platform... using builtin-java classes where applicable` warning messages can be safely ignored.
### Copy input data to the Hadoop container
From the druid-${DRUIDVERSION} package root on the host, copy the `quickstart/wikiticker-2015-09-12-sampled.json.gz` sample data to the shared folder:
```
cp quickstart/wikiticker-2015-09-12-sampled.json.gz /tmp/shared/wikiticker-2015-09-12-sampled.json.gz
```
### Set up HDFS directories
In the Hadoop container's shell, run the following commands to set up the HDFS directories needed by this tutorial and copy the input data to HDFS:
```
cd /usr/local/hadoop/bin
./hadoop fs -mkdir /druid
./hadoop fs -mkdir /druid/segments
./hadoop fs -mkdir /quickstart
./hadoop fs -chmod 777 /druid
./hadoop fs -chmod 777 /druid/segments
./hadoop fs -chmod 777 /quickstart
./hadoop fs -chmod -R 777 /tmp
./hadoop fs -chmod -R 777 /user
./hadoop fs -put /shared/wikiticker-2015-09-12-sampled.json.gz /quickstart/wikiticker-2015-09-12-sampled.json.gz
```
If you encounter namenode errors when running these commands, the Hadoop container may not be finished initializing. When this occurs, wait a couple of minutes and retry the commands.
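As an optional check (a sketch, run from the Hadoop container's shell), you can verify that the directories and the input file are visible in HDFS:

```
cd /usr/local/hadoop/bin
./hadoop fs -ls /quickstart
```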
## Configure Druid to use Hadoop
Some additional steps are needed to configure the Druid cluster for Hadoop batch indexing.
### Copy Hadoop configuration to Druid classpath
From the Hadoop container's shell, run the following command to copy the Hadoop .xml configuration files to the shared folder:
```
cp /usr/local/hadoop/etc/hadoop/*.xml /shared/hadoop_xml
```
From the host machine, run the following, where {PATH_TO_DRUID} is replaced by the path to the Druid package.
```
mkdir -p {PATH_TO_DRUID}/quickstart/tutorial/conf/druid/_common/hadoop-xml
cp /tmp/shared/hadoop_xml/*.xml {PATH_TO_DRUID}/quickstart/tutorial/conf/druid/_common/hadoop-xml/
```
### Update Druid segment and log storage
In your favorite text editor, open `quickstart/tutorial/conf/druid/_common/common.runtime.properties`, and make the following edits:
#### Disable local deep storage and enable HDFS deep storage
```
#
# Deep storage
#
# For local disk (only viable in a cluster if this is a network mount):
#druid.storage.type=local
#druid.storage.storageDirectory=var/druid/segments
# For HDFS:
druid.storage.type=hdfs
druid.storage.storageDirectory=/druid/segments
```
#### Disable local log storage and enable HDFS log storage
```
#
# Indexing service logs
#
# For local disk (only viable in a cluster if this is a network mount):
#druid.indexer.logs.type=file
#druid.indexer.logs.directory=var/druid/indexing-logs
# For HDFS:
druid.indexer.logs.type=hdfs
druid.indexer.logs.directory=/druid/indexing-logs
```
### Restart Druid cluster
Once the Hadoop .xml files have been copied to the Druid cluster and the segment/log storage configuration has been updated to use HDFS, the Druid cluster needs to be restarted for the new configurations to take effect.
If the cluster is still running, CTRL-C to terminate the `bin/supervise` script, and re-run it to bring the Druid services back up.
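For example (a sketch, assuming you are using the supervise configuration from the quickstart):

```
bin/supervise -c quickstart/tutorial/conf/tutorial-cluster.conf
```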
## Load batch data
We've included a sample of Wikipedia edits from September 12, 2015 to get you started.
To load this data into Druid, you can submit an *ingestion task* pointing to the file. We've included
a task that loads the `wikiticker-2015-09-12-sampled.json.gz` file included in the archive.
Let's submit the `wikipedia-index-hadoop.json` task:
```
bin/post-index-task --file quickstart/tutorial/wikipedia-index-hadoop.json
```
## Querying your data
After the data load is complete, please follow the [query tutorial](../tutorial/tutorial-query.html) to run some example queries on the newly loaded data.
## Cleanup
This tutorial is only meant to be used together with the [query tutorial](../tutorial/tutorial-query.html).
If you wish to go through any of the other tutorials, you will need to:
* Shut down the cluster and reset the cluster state by removing the contents of the `var` directory under the druid package.
* Revert the deep storage and task storage config back to local types in `quickstart/tutorial/conf/druid/_common/common.runtime.properties`
* Restart the cluster
This is necessary because the other ingestion tutorials will write to the same "wikipedia" datasource, and later tutorials expect the cluster to use local deep storage.
Example reverted config:
```
#
# Deep storage
#
# For local disk (only viable in a cluster if this is a network mount):
druid.storage.type=local
druid.storage.storageDirectory=var/druid/segments
# For HDFS:
#druid.storage.type=hdfs
#druid.storage.storageDirectory=/druid/segments
#
# Indexing service logs
#
# For local disk (only viable in a cluster if this is a network mount):
druid.indexer.logs.type=file
druid.indexer.logs.directory=var/druid/indexing-logs
# For HDFS:
#druid.indexer.logs.type=hdfs
#druid.indexer.logs.directory=/druid/indexing-logs
```
## Further reading
For more information on loading batch data with Hadoop, please see [the Hadoop batch ingestion documentation](../ingestion/hadoop.html).

View File

@ -2,137 +2,157 @@
layout: doc_page
---
# Tutorial: Load your own batch data
# Tutorial: Loading a file
## Getting started
This tutorial shows you how to load your own data files into Druid.
This tutorial demonstrates how to perform a batch file load, using Druid's native batch ingestion.
For this tutorial, we'll assume you've already downloaded Druid as described in
the [single-machine quickstart](quickstart.html) and have it running on your local machine. You
the [single-machine quickstart](index.html) and have it running on your local machine. You
don't need to have loaded any data yet.
Once that's complete, you can load your own dataset by writing a custom ingestion spec.
## Preparing the data and the ingestion task spec
## Writing an ingestion spec
A data load is initiated by submitting an *ingestion task* spec to the Druid overlord. For this tutorial, we'll be loading the sample Wikipedia page edits data.
When loading files into Druid, you will use Druid's [batch loading](../ingestion/batch-ingestion.html) process.
There's an example batch ingestion spec in `quickstart/wikiticker-index.json` that you can modify
for your own needs.
The Druid package includes the following sample native batch ingestion task spec at `quickstart/wikipedia-index.json`, shown here for convenience,
which has been configured to read the `quickstart/wikiticker-2015-09-12-sampled.json.gz` input file:
The most important questions are:
* What should the dataset be called? This is the "dataSource" field of the "dataSchema".
* Where is the dataset located? The file paths belong in the "paths" of the "inputSpec". If you
want to load multiple files, you can provide them as a comma-separated string.
* Which field should be treated as a timestamp? This belongs in the "column" of the "timestampSpec".
* Which fields should be treated as dimensions? This belongs in the "dimensions" of the "dimensionsSpec".
* Which fields should be treated as metrics? This belongs in the "metricsSpec".
* What time ranges (intervals) are being loaded? This belongs in the "intervals" of the "granularitySpec".
If your data does not have a natural sense of time, you can tag each row with the current time.
You can also tag all rows with a fixed timestamp, like "2000-01-01T00:00:00.000Z".
Let's use this pageviews dataset as an example. Druid supports TSV, CSV, and JSON out of the box.
Note that nested JSON objects are not supported, so if you do use JSON, you should provide a file
containing flattened objects.
```json
{"time": "2015-09-01T00:00:00Z", "url": "/foo/bar", "user": "alice", "latencyMs": 32}
{"time": "2015-09-01T01:00:00Z", "url": "/", "user": "bob", "latencyMs": 11}
{"time": "2015-09-01T01:30:00Z", "url": "/foo/bar", "user": "bob", "latencyMs": 45}
```
Make sure the file has no newline at the end. If you save this to a file called "pageviews.json", then for this dataset:
* Let's call the dataset "pageviews".
* The data is located in "pageviews.json".
* The timestamp is the "time" field.
* Good choices for dimensions are the string fields "url" and "user".
* Good choices for metrics are a count of pageviews, and the sum of "latencyMs". Collecting that
sum when we load the data will allow us to compute an average at query time as well.
* The data covers the time range 2015-09-01 (inclusive) through 2015-09-02 (exclusive).
You can copy the existing `quickstart/wikiticker-index.json` indexing task to a new file:
```bash
cp quickstart/wikiticker-index.json my-index-task.json
```
And modify it by altering these sections:
```json
"dataSource": "pageviews"
```
```json
"inputSpec": {
"type": "static",
"paths": "pageviews.json"
{
  "type" : "index",
  "spec" : {
    "dataSchema" : {
      "dataSource" : "wikipedia",
      "parser" : {
        "type" : "string",
        "parseSpec" : {
          "format" : "json",
          "dimensionsSpec" : {
            "dimensions" : [
              "channel",
              "cityName",
              "comment",
              "countryIsoCode",
              "countryName",
              "isAnonymous",
              "isMinor",
              "isNew",
              "isRobot",
              "isUnpatrolled",
              "metroCode",
              "namespace",
              "page",
              "regionIsoCode",
              "regionName",
              "user",
              { "name": "added", "type": "long" },
              { "name": "deleted", "type": "long" },
              { "name": "delta", "type": "long" }
            ]
          },
          "timestampSpec": {
            "column": "time",
            "format": "iso"
          }
        }
      },
      "metricsSpec" : [],
      "granularitySpec" : {
        "type" : "uniform",
        "segmentGranularity" : "day",
        "queryGranularity" : "none",
        "intervals" : ["2015-09-12/2015-09-13"],
        "rollup" : false
      }
    },
    "ioConfig" : {
      "type" : "index",
      "firehose" : {
        "type" : "local",
        "baseDir" : "quickstart/",
        "filter" : "wikiticker-2015-09-12-sampled.json.gz"
      },
      "appendToExisting" : false
    },
    "tuningConfig" : {
      "type" : "index",
      "targetPartitionSize" : 5000000,
      "maxRowsInMemory" : 25000,
      "forceExtendableShardSpecs" : true
    }
  }
}
```
```json
"timestampSpec": {
"format": "auto",
"column": "time"
}
This spec will create a datasource named "wikipedia".
## Load batch data
We've included a sample of Wikipedia edits from September 12, 2015 to get you started.
To load this data into Druid, you can submit an *ingestion task* pointing to the file. We've included
a task that loads the `wikiticker-2015-09-12-sampled.json.gz` file included in the archive.
For convenience, the Druid package includes a batch ingestion helper script at `bin/post-index-task`.
This script will POST an ingestion task to the Druid overlord and poll Druid until the data is available for querying.
Run the following command from Druid package root:
```
bin/post-index-task --file quickstart/tutorial/wikipedia-index.json
```
```json
"dimensionsSpec": {
"dimensions": ["url", "user"]
}
You should see output like the following:
```
```json
"metricsSpec": [
{"name": "views", "type": "count"},
{"name": "latencyMs", "type": "doubleSum", "fieldName": "latencyMs"}
]
Beginning indexing data for wikipedia
Task started: index_wikipedia_2018-07-27T06:37:44.323Z
Task log: http://localhost:8090/druid/indexer/v1/task/index_wikipedia_2018-07-27T06:37:44.323Z/log
Task status: http://localhost:8090/druid/indexer/v1/task/index_wikipedia_2018-07-27T06:37:44.323Z/status
Task index_wikipedia_2018-07-27T06:37:44.323Z still running...
Task index_wikipedia_2018-07-27T06:37:44.323Z still running...
Task finished with status: SUCCESS
Completed indexing data for wikipedia. Now loading indexed data onto the cluster...
wikipedia loading complete! You may now query your data
```
```json
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "day",
"queryGranularity": "none",
"intervals": ["2015-09-01/2015-09-02"]
}
```
## Running the task
To actually run this task, first make sure that the indexing task can read *pageviews.json*:
- If you're running locally (no configuration for connecting to Hadoop; this is the default) then
place it in the root of the Druid distribution.
- If you configured Druid to connect to a Hadoop cluster, upload
the pageviews.json file to HDFS. You may need to adjust the `paths` in the ingestion spec.
To kick off the indexing process, POST your indexing task to the Druid Overlord. In a standard Druid
install, the URL is `http://OVERLORD_IP:8090/druid/indexer/v1/task`.
```bash
curl -X 'POST' -H 'Content-Type:application/json' -d @my-index-task.json OVERLORD_IP:8090/druid/indexer/v1/task
```
If you're running everything on a single machine, you can use localhost:
```bash
curl -X 'POST' -H 'Content-Type:application/json' -d @my-index-task.json localhost:8090/druid/indexer/v1/task
```
If anything goes wrong with this task (e.g. it finishes with status FAILED), you can troubleshoot
by visiting the "Task log" on the [overlord console](http://localhost:8090/console.html).
## Querying your data
Your data should become fully available within a minute or two. You can monitor this process on
your Coordinator console at [http://localhost:8081/#/](http://localhost:8081/#/).
Once the data is loaded, please follow the [query tutorial](../tutorial/tutorial-query.html) to run some example queries on the newly loaded data.
Once your data is fully available, you can query it using any of the
[supported query methods](../querying/querying.html).
## Cleanup
If you wish to go through any of the other ingestion tutorials, you will need to shut down the cluster and reset the cluster state by removing the contents of the `var` directory under the druid package, as the other tutorials will write to the same "wikipedia" datasource.
## Extra: Loading data without the script
Let's briefly discuss how we would've submitted the ingestion task without using the script. You do not need to run these commands.
To submit the task, POST it to Druid in a new terminal window from the druid-#{DRUIDVERSION} directory:
```bash
curl -X 'POST' -H 'Content-Type:application/json' -d @quickstart/tutorial/wikipedia-index.json http://localhost:8090/druid/indexer/v1/task
```
Which will print the ID of the task if the submission was successful:
```bash
{"task":"index_wikipedia_2018-06-09T21:30:32.802Z"}
```
To view the status of the ingestion task, go to the overlord console:
[http://localhost:8090/console.html](http://localhost:8090/console.html). You can refresh the console periodically, and after
the task is successful, you should see a "SUCCESS" status for the task.
After the ingestion task finishes, the data will be loaded by historical nodes and available for
querying within a minute or two. You can monitor the progress of loading the data in the
coordinator console, by checking whether there is a datasource "wikipedia" with a blue circle
indicating "fully available": [http://localhost:8081/#/](http://localhost:8081/#/).
![Coordinator console](../tutorials/img/tutorial-batch-01.png "Wikipedia 100% loaded")
## Further reading

View File

@ -0,0 +1,106 @@
---
layout: doc_page
---
# Tutorial: Compacting segments
This tutorial demonstrates how to compact existing segments into fewer but larger segments.
Because there is some per-segment memory and processing overhead, it can sometimes be beneficial to reduce the total number of segments.
For this tutorial, we'll assume you've already downloaded Druid as described in
the [single-machine quickstart](index.html) and have it running on your local machine.
It will also be helpful to have finished [Tutorial: Loading a file](/docs/VERSION/tutorials/tutorial-batch.html) and [Tutorial: Querying data](/docs/VERSION/tutorials/tutorial-query.html).
## Load the initial data
For this tutorial, we'll be using the Wikipedia edits sample data, with an ingestion task spec that will create a separate segment for each hour in the input data.
The ingestion spec can be found at `quickstart/tutorial/compaction-init-index.json`. Let's submit that spec, which will create a datasource called `compaction-tutorial`:
```
bin/post-index-task --file quickstart/tutorial/compaction-init-index.json
```
After the ingestion completes, go to http://localhost:8081/#/datasources/compaction-tutorial in a browser to view information about the new datasource in the Coordinator console.
There will be 24 segments for this datasource, one segment per hour in the input data:
![Original segments](../tutorials/img/tutorial-retention-01.png "Original segments")
Running a COUNT(*) query on this datasource shows that there are 39,244 rows:
```
dsql> select count(*) from "compaction-tutorial";
┌────────┐
│ EXPR$0 │
├────────┤
│ 39244 │
└────────┘
Retrieved 1 row in 1.38s.
```
## Compact the data
Let's now combine these 24 segments into one segment.
We have included a compaction task spec for this tutorial datasource at `quickstart/tutorial/compaction-final-index.json`:
```
{
  "type": "compact",
  "dataSource": "compaction-tutorial",
  "interval": "2015-09-12/2015-09-13",
  "tuningConfig" : {
    "type" : "index",
    "targetPartitionSize" : 5000000,
    "maxRowsInMemory" : 25000,
    "forceExtendableShardSpecs" : true
  }
}
```
This will compact all segments for the interval `2015-09-12/2015-09-13` in the `compaction-tutorial` datasource.
The parameters in the `tuningConfig` control how many segments will be present in the compacted set of segments.
In this tutorial example, only one compacted segment will be created, as the 39,244 rows in the input are fewer than the `targetPartitionSize` of 5,000,000.
Let's submit this task now:
```
bin/post-index-task --file quickstart/tutorial/compaction-final-index.json
```
After the task finishes, refresh the http://localhost:8081/#/datasources/compaction-tutorial page.
The original 24 segments will eventually be marked as "unused" by the Coordinator and removed, with the new compacted segment remaining.
By default, the Druid coordinator will not mark segments as unused until the coordinator process has been up for at least 15 minutes, so you may see the old segment set and the new compacted set at the same time in the coordinator, e.g.:
![Compacted segments intermediate state](../tutorials/img/tutorial-compaction-01.png "Compacted segments intermediate state")
The new compacted segment has a more recent version than the original segments, so even when both sets of segments are shown by the coordinator, queries will only read from the new compacted segment.
Let's try running a COUNT(*) on `compaction-tutorial` again, where the row count should still be 39,244:
```
dsql> select count(*) from "compaction-tutorial";
┌────────┐
│ EXPR$0 │
├────────┤
│ 39244 │
└────────┘
Retrieved 1 row in 1.30s.
```
After the coordinator has been running for at least 15 minutes, the http://localhost:8081/#/datasources/compaction-tutorial page should show there is only 1 segment:
![Compacted segments final state](../tutorials/img/tutorial-compaction-02.png "Compacted segments final state")
## Further reading
[Task documentation](../ingestion/tasks.html)
[Segment optimization](../operations/segment-optimization.html)

View File

@ -0,0 +1,156 @@
---
layout: doc_page
---
# Tutorial: Deleting data
This tutorial demonstrates how to delete existing data.
For this tutorial, we'll assume you've already downloaded Druid as described in
the [single-machine quickstart](index.html) and have it running on your local machine.
Completing [Tutorial: Configuring retention](/docs/VERSION/tutorials/tutorial-retention.html) first is highly recommended, as we will be using retention rules in this tutorial.
## Load initial data
In this tutorial, we will use the Wikipedia edits data, with an indexing spec that creates hourly segments. This spec is located at `quickstart/tutorial/deletion-index.json`, and it creates a datasource called `deletion-tutorial`.
Let's load this initial data:
```
bin/post-index-task --file quickstart/tutorial/deletion-index.json
```
When the load finishes, open http://localhost:8081/#/datasources/deletion-tutorial in a browser.
## How to permanently delete data
Permanent deletion of a Druid segment has two steps:
1. The segment must first be marked as "unused". This occurs when a segment is dropped by retention rules, and when a user manually disables a segment through the Coordinator API. This tutorial will cover both cases.
2. After segments have been marked as "unused", a Kill Task will delete any "unused" segments from Druid's metadata store as well as deep storage.
Let's drop some segments now, first with load rules, then manually.
## Drop some data with load rules
As with the previous retention tutorial, there are currently 24 segments in the `deletion-tutorial` datasource.
Click the `edit rules` button with a pencil icon at the upper left corner of the page.
A rule configuration window will appear. Enter `tutorial` for both the user and changelog comment field.
Now click the `+ Add a rule` button twice.
In the `rule #1` box at the top, click `Load`, `Interval`, enter `2015-09-12T12:00:00.000Z/2015-09-13T00:00:00.000Z` in the interval box, and click `+ _default_tier replicant`.
In the `rule #2` box at the bottom, click `Drop` and `Forever`.
This will cause the first 12 segments of `deletion-tutorial` to be dropped. However, these dropped segments are not removed from deep storage.
You can see that all 24 segments are still present in deep storage by listing the contents of `druid-{DRUIDVERSION}/var/druid/segments/deletion-tutorial`:
```
$ ls -l1 var/druid/segments/deletion-tutorial/
2015-09-12T00:00:00.000Z_2015-09-12T01:00:00.000Z
2015-09-12T01:00:00.000Z_2015-09-12T02:00:00.000Z
2015-09-12T02:00:00.000Z_2015-09-12T03:00:00.000Z
2015-09-12T03:00:00.000Z_2015-09-12T04:00:00.000Z
2015-09-12T04:00:00.000Z_2015-09-12T05:00:00.000Z
2015-09-12T05:00:00.000Z_2015-09-12T06:00:00.000Z
2015-09-12T06:00:00.000Z_2015-09-12T07:00:00.000Z
2015-09-12T07:00:00.000Z_2015-09-12T08:00:00.000Z
2015-09-12T08:00:00.000Z_2015-09-12T09:00:00.000Z
2015-09-12T09:00:00.000Z_2015-09-12T10:00:00.000Z
2015-09-12T10:00:00.000Z_2015-09-12T11:00:00.000Z
2015-09-12T11:00:00.000Z_2015-09-12T12:00:00.000Z
2015-09-12T12:00:00.000Z_2015-09-12T13:00:00.000Z
2015-09-12T13:00:00.000Z_2015-09-12T14:00:00.000Z
2015-09-12T14:00:00.000Z_2015-09-12T15:00:00.000Z
2015-09-12T15:00:00.000Z_2015-09-12T16:00:00.000Z
2015-09-12T16:00:00.000Z_2015-09-12T17:00:00.000Z
2015-09-12T17:00:00.000Z_2015-09-12T18:00:00.000Z
2015-09-12T18:00:00.000Z_2015-09-12T19:00:00.000Z
2015-09-12T19:00:00.000Z_2015-09-12T20:00:00.000Z
2015-09-12T20:00:00.000Z_2015-09-12T21:00:00.000Z
2015-09-12T21:00:00.000Z_2015-09-12T22:00:00.000Z
2015-09-12T22:00:00.000Z_2015-09-12T23:00:00.000Z
2015-09-12T23:00:00.000Z_2015-09-13T00:00:00.000Z
```
## Manually disable a segment
Let's manually disable a segment now. This will mark a segment as "unused", but not remove it from deep storage.
On http://localhost:8081/#/datasources/deletion-tutorial, click one of the remaining segments on the left for full details about the segment:
![Segments](../tutorials/img/tutorial-deletion-01.png "Segments")
The top of the info box shows the full segment ID, e.g. `deletion-tutorial_2016-06-27T14:00:00.000Z_2016-06-27T15:00:00.000Z_2018-07-27T22:57:00.110Z` for the segment of hour 14.
Let's disable the hour 14 segment by sending the following DELETE request to the coordinator, where {SEGMENT-ID} is the full segment ID shown in the info box:
```
curl -XDELETE http://localhost:8081/druid/coordinator/v1/datasources/deletion-tutorial/segments/{SEGMENT-ID}
```
After that command completes, you should see that the segment for hour 14 has been disabled:
![Segments 2](../tutorials/img/tutorial-deletion-02.png "Segments 2")
Note that the hour 14 segment is still in deep storage:
```
$ ls -l1 var/druid/segments/deletion-tutorial/
2015-09-12T00:00:00.000Z_2015-09-12T01:00:00.000Z
2015-09-12T01:00:00.000Z_2015-09-12T02:00:00.000Z
2015-09-12T02:00:00.000Z_2015-09-12T03:00:00.000Z
2015-09-12T03:00:00.000Z_2015-09-12T04:00:00.000Z
2015-09-12T04:00:00.000Z_2015-09-12T05:00:00.000Z
2015-09-12T05:00:00.000Z_2015-09-12T06:00:00.000Z
2015-09-12T06:00:00.000Z_2015-09-12T07:00:00.000Z
2015-09-12T07:00:00.000Z_2015-09-12T08:00:00.000Z
2015-09-12T08:00:00.000Z_2015-09-12T09:00:00.000Z
2015-09-12T09:00:00.000Z_2015-09-12T10:00:00.000Z
2015-09-12T10:00:00.000Z_2015-09-12T11:00:00.000Z
2015-09-12T11:00:00.000Z_2015-09-12T12:00:00.000Z
2015-09-12T12:00:00.000Z_2015-09-12T13:00:00.000Z
2015-09-12T13:00:00.000Z_2015-09-12T14:00:00.000Z
2015-09-12T14:00:00.000Z_2015-09-12T15:00:00.000Z
2015-09-12T15:00:00.000Z_2015-09-12T16:00:00.000Z
2015-09-12T16:00:00.000Z_2015-09-12T17:00:00.000Z
2015-09-12T17:00:00.000Z_2015-09-12T18:00:00.000Z
2015-09-12T18:00:00.000Z_2015-09-12T19:00:00.000Z
2015-09-12T19:00:00.000Z_2015-09-12T20:00:00.000Z
2015-09-12T20:00:00.000Z_2015-09-12T21:00:00.000Z
2015-09-12T21:00:00.000Z_2015-09-12T22:00:00.000Z
2015-09-12T22:00:00.000Z_2015-09-12T23:00:00.000Z
2015-09-12T23:00:00.000Z_2015-09-13T00:00:00.000Z
```
## Run a kill task
Now that we have disabled some segments, we can submit a Kill Task, which will delete the disabled segments from metadata and deep storage.
A Kill Task spec has been provided at `quickstart/tutorial/deletion-kill.json`. Submit this task to the Overlord with the following command:
```
curl -X 'POST' -H 'Content-Type:application/json' -d @quickstart/tutorial/deletion-kill.json http://localhost:8090/druid/indexer/v1/task
```
After this task completes, you can see that the disabled segments have now been removed from deep storage:
```
$ ls -l1 var/druid/segments/deletion-tutorial/
2015-09-12T12:00:00.000Z_2015-09-12T13:00:00.000Z
2015-09-12T13:00:00.000Z_2015-09-12T14:00:00.000Z
2015-09-12T15:00:00.000Z_2015-09-12T16:00:00.000Z
2015-09-12T16:00:00.000Z_2015-09-12T17:00:00.000Z
2015-09-12T17:00:00.000Z_2015-09-12T18:00:00.000Z
2015-09-12T18:00:00.000Z_2015-09-12T19:00:00.000Z
2015-09-12T19:00:00.000Z_2015-09-12T20:00:00.000Z
2015-09-12T20:00:00.000Z_2015-09-12T21:00:00.000Z
2015-09-12T21:00:00.000Z_2015-09-12T22:00:00.000Z
2015-09-12T22:00:00.000Z_2015-09-12T23:00:00.000Z
2015-09-12T23:00:00.000Z_2015-09-13T00:00:00.000Z
```

View File

@ -0,0 +1,642 @@
---
layout: doc_page
---
# Tutorial: Writing an ingestion spec
This tutorial will guide the reader through the process of defining an ingestion spec, pointing out key considerations and guidelines.
For this tutorial, we'll assume you've already downloaded Druid as described in
the [single-machine quickstart](index.html) and have it running on your local machine.
It will also be helpful to have finished [Tutorial: Loading a file](/docs/VERSION/tutorials/tutorial-batch.html), [Tutorial: Querying data](/docs/VERSION/tutorials/tutorial-query.html), and [Tutorial: Rollup](/docs/VERSION/tutorials/tutorial-rollup.html).
## Example data
Suppose we have the following network flow data:
* `srcIP`: IP address of sender
* `srcPort`: Port of sender
* `dstIP`: IP address of receiver
* `dstPort`: Port of receiver
* `protocol`: IP protocol number
* `packets`: number of packets transmitted
* `bytes`: number of bytes transmitted
* `cost`: the cost of sending the traffic
```
{"ts":"2018-01-01T01:01:35Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2", "srcPort":2000, "dstPort":3000, "protocol": 6, "packets":10, "bytes":1000, "cost": 1.4}
{"ts":"2018-01-01T01:01:51Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2", "srcPort":2000, "dstPort":3000, "protocol": 6, "packets":20, "bytes":2000, "cost": 3.1}
{"ts":"2018-01-01T01:01:59Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2", "srcPort":2000, "dstPort":3000, "protocol": 6, "packets":30, "bytes":3000, "cost": 0.4}
{"ts":"2018-01-01T01:02:14Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2", "srcPort":5000, "dstPort":7000, "protocol": 6, "packets":40, "bytes":4000, "cost": 7.9}
{"ts":"2018-01-01T01:02:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2", "srcPort":5000, "dstPort":7000, "protocol": 6, "packets":50, "bytes":5000, "cost": 10.2}
{"ts":"2018-01-01T01:03:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2", "srcPort":5000, "dstPort":7000, "protocol": 6, "packets":60, "bytes":6000, "cost": 4.3}
{"ts":"2018-01-01T02:33:14Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8", "srcPort":4000, "dstPort":5000, "protocol": 17, "packets":100, "bytes":10000, "cost": 22.4}
{"ts":"2018-01-01T02:33:45Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8", "srcPort":4000, "dstPort":5000, "protocol": 17, "packets":200, "bytes":20000, "cost": 34.5}
{"ts":"2018-01-01T02:35:45Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8", "srcPort":4000, "dstPort":5000, "protocol": 17, "packets":300, "bytes":30000, "cost": 46.3}
```
Save the JSON contents above into a file called `ingestion-tutorial-data.json` in `quickstart/`.
Let's walk through the process of defining an ingestion spec that can load this data.
For this tutorial, we will be using the native batch indexing task. When using other task types, some aspects of the ingestion spec will differ, and this tutorial will point out such areas.
## Defining the schema
The core element of a Druid ingestion spec is the `dataSchema`. The `dataSchema` defines how to parse input data into a set of columns that will be stored in Druid.
Let's start with an empty `dataSchema` and add fields to it as we progress through the tutorial.
Create a new file called `ingestion-tutorial-index.json` in `quickstart/` with the following contents:
```json
"dataSchema" : {}
```
We will be making successive edits to this ingestion spec as we progress through the tutorial.
### Datasource name
The datasource name is specified by the `dataSource` parameter in the `dataSchema`.
```json
"dataSchema" : {
"dataSource" : "ingestion-tutorial",
}
```
Let's call the tutorial datasource `ingestion-tutorial`.
### Choose a parser
A `dataSchema` has a `parser` field, which defines the parser that Druid will use to interpret the input data.
Since our input data is represented as JSON strings, we'll use a `string` parser with `json` format:
```
"dataSchema" : {
"dataSource" : "ingestion-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json"
}
}
}
```
### Time column
The `parser` needs to know how to extract the main timestamp field from the input data. When using a `json` type `parseSpec`, the timestamp is defined in a `timestampSpec`.
The timestamp column in our input data is named "ts", containing ISO 8601 timestamps, so let's add a `timestampSpec` with that information to the `parseSpec`:
```
"dataSchema" : {
"dataSource" : "ingestion-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"timestampSpec" : {
"format" : "iso",
"column" : "ts"
}
}
}
}
```
### Column types
Now that we've defined the time column, let's look at definitions for other columns.
Druid supports the following column types: String, Long, Float, Double. We will see how these are used in the following sections.
Before we move on to how we define our other non-time columns, let's discuss `rollup` first.
### Rollup
When ingesting data, we must consider whether we wish to use rollup or not.
* If rollup is enabled, we will need to separate the input columns into two categories, "dimensions" and "metrics". "Dimensions" are the grouping columns for rollup, while "metrics" are the columns that will be aggregated.
* If rollup is disabled, then all columns are treated as "dimensions" and no pre-aggregation occurs.
For this tutorial, let's enable rollup. This is specified with a `granularitySpec` on the `dataSchema`.
Note that the `granularitySpec` lies outside of the `parser`. We will revisit the `parser` soon when we define our dimensions and metrics.
```
"dataSchema" : {
"dataSource" : "ingestion-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"timestampSpec" : {
"format" : "iso",
"column" : "ts"
}
}
},
"granularitySpec" : {
"rollup" : true
}
}
```
#### Choosing dimensions and metrics
For this example dataset, the following is a sensible split for "dimensions" and "metrics":
* Dimensions: srcIP, srcPort, dstIP, dstPort, protocol
* Metrics: packets, bytes, cost
The dimensions here are a group of properties that identify a unidirectional flow of IP traffic, while the metrics represent facts about the IP traffic flow specified by a dimension grouping.
Let's look at how to define these dimensions and metrics within the ingestion spec.
#### Dimensions
Dimensions are specified with a `dimensionsSpec` inside the `parseSpec`.
```
"dataSchema" : {
"dataSource" : "ingestion-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"timestampSpec" : {
"format" : "iso",
"column" : "ts"
},
"dimensionsSpec" : {
"dimensions": [
"srcIP",
{ "name" : "srcPort", "type" : "long" },
{ "name" : "dstIP", "type" : "string" },
{ "name" : "dstPort", "type" : "long" },
{ "name" : "protocol", "type" : "string" }
]
}
}
},
"granularitySpec" : {
"rollup" : true
}
}
```
Each dimension has a `name` and a `type`, where `type` can be "long", "float", "double", or "string".
Note that `srcIP` is a "string" dimension; for string dimensions, it is enough to specify just a dimension name, since "string" is the default dimension type.
Also note that `protocol` is a numeric value in the input data, but we are ingesting it as a "string" column; Druid will coerce the input longs to strings during ingestion.
##### Strings vs. Numerics
Should a numeric input be ingested as a numeric dimension or as a string dimension?
Numeric dimensions have the following pros/cons relative to String dimensions:
* Pros: Numeric representation can result in smaller column sizes on disk and lower processing overhead when reading values from the column
* Cons: Numeric dimensions do not have indices, so filtering on them will often be slower than filtering on an equivalent String dimension (which has bitmap indices)
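As a concrete illustration, the same port column can be declared either way inside the `dimensionsSpec` `dimensions` list; the object form below makes it a long column, while the bare string form makes it a (bitmap-indexed) string column. This tutorial's spec uses the long form.
```
{ "name" : "srcPort", "type" : "long" }
"srcPort"
```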
#### Metrics
Metrics are specified with a `metricsSpec` inside the `dataSchema`:
```json
"dataSchema" : {
"dataSource" : "ingestion-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"timestampSpec" : {
"format" : "iso",
"column" : "ts"
},
"dimensionsSpec" : {
"dimensions": [
"srcIP",
{ "name" : "srcPort", "type" : "long" },
{ "name" : "dstIP", "type" : "string" },
{ "name" : "dstPort", "type" : "long" },
{ "name" : "protocol", "type" : "string" }
]
}
}
},
"metricsSpec" : [
{ "type" : "count", "name" : "count" },
{ "type" : "longSum", "name" : "packets", "fieldName" : "packets" },
{ "type" : "longSum", "name" : "bytes", "fieldName" : "bytes" },
{ "type" : "doubleSum", "name" : "cost", "fieldName" : "cost" }
],
"granularitySpec" : {
"rollup" : true
}
}
```
When defining a metric, it is necessary to specify what type of aggregation should be performed on that column during rollup.
Here we have defined long sum aggregations on the two long metric columns, `packets` and `bytes`, and a double sum aggregation for the `cost` column.
Note that the `metricsSpec` is on a different nesting level than `dimensionsSpec` or `parseSpec`; it belongs on the same nesting level as `parser` within the `dataSchema`.
Note that we have also defined a `count` aggregator. The count aggregator will track how many rows in the original input data contributed to a "rolled up" row in the final ingested data.
### No rollup
If we were not using rollup, all columns would be specified in the `dimensionsSpec`, e.g.:
```
"dimensionsSpec" : {
"dimensions": [
"srcIP",
{ "name" : "srcPort", "type" : "long" },
{ "name" : "dstIP", "type" : "string" },
{ "name" : "dstPort", "type" : "long" },
{ "name" : "protocol", "type" : "string" },
{ "name" : "packets", "type" : "long" },
{ "name" : "bytes", "type" : "long" },
{ "name" : "srcPort", "type" : "double" }
]
},
```
### Define granularities
At this point, we are done defining the `parser` and `metricsSpec` within the `dataSchema` and we are almost done writing the ingestion spec.
There are some additional properties we need to set in the `granularitySpec`:
* Type of granularitySpec: `uniform` and `arbitrary` are the two supported types. For this tutorial, we will use a `uniform` granularity spec, where all segments have uniform interval sizes (for example, all segments cover an hour's worth of data).
* The segment granularity: what size of time interval should a single segment contain data for? e.g., `DAY`, `WEEK`
* The bucketing granularity of the timestamps in the time column (referred to as `queryGranularity`)
#### Segment granularity
Segment granularity is configured by the `segmentGranularity` property in the `granularitySpec`. For this tutorial, we'll create hourly segments:
```
"dataSchema" : {
"dataSource" : "ingestion-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"timestampSpec" : {
"format" : "iso",
"column" : "ts"
},
"dimensionsSpec" : {
"dimensions": [
"srcIP",
{ "name" : "srcPort", "type" : "long" },
{ "name" : "dstIP", "type" : "string" },
{ "name" : "dstPort", "type" : "long" },
{ "name" : "protocol", "type" : "string" }
]
}
}
},
"metricsSpec" : [
{ "type" : "count", "name" : "count" },
{ "type" : "longSum", "name" : "packets", "fieldName" : "packets" },
{ "type" : "longSum", "name" : "bytes", "fieldName" : "bytes" },
{ "type" : "doubleSum", "name" : "cost", "fieldName" : "cost" }
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "HOUR",
"rollup" : true
}
}
```
Our input data has events from two separate hours, so this task will generate two segments.
#### Query granularity
The query granularity is configured by the `queryGranularity` property in the `granularitySpec`. For this tutorial, let's use minute granularity:
```
"dataSchema" : {
"dataSource" : "ingestion-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"timestampSpec" : {
"format" : "iso",
"column" : "ts"
},
"dimensionsSpec" : {
"dimensions": [
"srcIP",
{ "name" : "srcPort", "type" : "long" },
{ "name" : "dstIP", "type" : "string" },
{ "name" : "dstPort", "type" : "long" },
{ "name" : "protocol", "type" : "string" }
]
}
}
},
"metricsSpec" : [
{ "type" : "count", "name" : "count" },
{ "type" : "longSum", "name" : "packets", "fieldName" : "packets" },
{ "type" : "longSum", "name" : "bytes", "fieldName" : "bytes" },
{ "type" : "doubleSum", "name" : "cost", "fieldName" : "cost" }
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "HOUR",
"queryGranularity" : "MINUTE"
"rollup" : true
}
}
```
To see the effect of the query granularity, let's look at this row from the raw input data:
```
{"ts":"2018-01-01T01:03:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2", "srcPort":5000, "dstPort":7000, "protocol": 6, "packets":60, "bytes":6000, "cost": 4.3}
```
When this row is ingested with minute queryGranularity, Druid will floor the row's timestamp to minute buckets:
```
{"ts":"2018-01-01T01:03:00Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2", "srcPort":5000, "dstPort":7000, "protocol": 6, "packets":60, "bytes":6000, "cost": 4.3}
```
#### Define an interval (batch only)
For batch tasks, it is necessary to define a time interval. Input rows with timestamps outside of the time interval will not be ingested.
The interval is also specified in the `granularitySpec`:
```
"dataSchema" : {
"dataSource" : "ingestion-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"timestampSpec" : {
"format" : "iso",
"column" : "ts"
},
"dimensionsSpec" : {
"dimensions": [
"srcIP",
{ "name" : "srcPort", "type" : "long" },
{ "name" : "dstIP", "type" : "string" },
{ "name" : "dstPort", "type" : "long" },
{ "name" : "protocol", "type" : "string" }
]
}
}
},
"metricsSpec" : [
{ "type" : "count", "name" : "count" },
{ "type" : "longSum", "name" : "packets", "fieldName" : "packets" },
{ "type" : "longSum", "name" : "bytes", "fieldName" : "bytes" },
{ "type" : "doubleSum", "name" : "cost", "fieldName" : "cost" }
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "HOUR",
"queryGranularity" : "MINUTE",
"intervals" : ["2018-01-01/2018-01-02"],
"rollup" : true
}
}
```
## Define the task type
We've now finished defining our `dataSchema`. The remaining steps are to place the `dataSchema` we created into an ingestion task spec, and specify the input source.
The `dataSchema` is shared across all task types, but each task type has its own specification format. For this tutorial, we will use the native batch ingestion task:
```
{
"type" : "index",
"spec" : {
"dataSchema" : {
"dataSource" : "ingestion-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"timestampSpec" : {
"format" : "iso",
"column" : "ts"
},
"dimensionsSpec" : {
"dimensions": [
"srcIP",
{ "name" : "srcPort", "type" : "long" },
{ "name" : "dstIP", "type" : "string" },
{ "name" : "dstPort", "type" : "long" },
{ "name" : "protocol", "type" : "string" }
]
}
}
},
"metricsSpec" : [
{ "type" : "count", "name" : "count" },
{ "type" : "longSum", "name" : "packets", "fieldName" : "packets" },
{ "type" : "longSum", "name" : "bytes", "fieldName" : "bytes" },
{ "type" : "doubleSum", "name" : "cost", "fieldName" : "cost" }
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "HOUR",
"queryGranularity" : "MINUTE",
"intervals" : ["2018-01-01/2018-01-02"],
"rollup" : true
}
}
}
}
```
## Define the input source
Now let's define our input source, which is specified in an `ioConfig` object. Each task type has its own type of `ioConfig`. The native batch task uses "firehoses" to read input data, so let's configure a "local" firehose to read the example netflow data we saved earlier:
```
"ioConfig" : {
"type" : "index",
"firehose" : {
"type" : "local",
"baseDir" : "quickstart/",
"filter" : "ingestion-tutorial-data.json"
}
}
```
The `ioConfig` goes inside the top-level `spec`, alongside the `dataSchema`. With it added, the spec so far looks like this:
```
{
"type" : "index",
"spec" : {
"dataSchema" : {
"dataSource" : "ingestion-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"timestampSpec" : {
"format" : "iso",
"column" : "ts"
},
"dimensionsSpec" : {
"dimensions": [
"srcIP",
{ "name" : "srcPort", "type" : "long" },
{ "name" : "dstIP", "type" : "string" },
{ "name" : "dstPort", "type" : "long" },
{ "name" : "protocol", "type" : "string" }
]
}
}
},
"metricsSpec" : [
{ "type" : "count", "name" : "count" },
{ "type" : "longSum", "name" : "packets", "fieldName" : "packets" },
{ "type" : "longSum", "name" : "bytes", "fieldName" : "bytes" },
{ "type" : "doubleSum", "name" : "cost", "fieldName" : "cost" }
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "HOUR",
"queryGranularity" : "MINUTE",
"intervals" : ["2018-01-01/2018-01-02"],
"rollup" : true
}
},
"ioConfig" : {
"type" : "index",
"firehose" : {
"type" : "local",
"baseDir" : "quickstart/",
"filter" : "ingestion-tutorial-data.json"
}
}
}
}
```
## Additional tuning
Each ingestion task has a `tuningConfig` section that allows users to tune various ingestion parameters.
As an example, let's add a `tuningConfig` that sets a target segment size for the native batch ingestion task:
```
"tuningConfig" : {
"type" : "index",
"targetPartitionSize" : 5000000
}
```
Note that each ingestion task has its own type of `tuningConfig`.
## Final spec
We've finished defining the ingestion spec; it should now look like the following:
```
{
"type" : "index",
"spec" : {
"dataSchema" : {
"dataSource" : "ingestion-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"timestampSpec" : {
"format" : "iso",
"column" : "ts"
},
"dimensionsSpec" : {
"dimensions": [
"srcIP",
{ "name" : "srcPort", "type" : "long" },
{ "name" : "dstIP", "type" : "string" },
{ "name" : "dstPort", "type" : "long" },
{ "name" : "protocol", "type" : "string" }
]
}
}
},
"metricsSpec" : [
{ "type" : "count", "name" : "count" },
{ "type" : "longSum", "name" : "packets", "fieldName" : "packets" },
{ "type" : "longSum", "name" : "bytes", "fieldName" : "bytes" },
{ "type" : "doubleSum", "name" : "cost", "fieldName" : "cost" }
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "HOUR",
"queryGranularity" : "MINUTE",
"intervals" : ["2018-01-01/2018-01-02"],
"rollup" : true
}
},
"ioConfig" : {
"type" : "index",
"firehose" : {
"type" : "local",
"baseDir" : "quickstart/",
"filter" : "ingestion-tutorial-data.json"
}
},
"tuningConfig" : {
"type" : "index",
"targetPartitionSize" : 5000000
}
}
}
```
## Submit the task and query the data
From the druid-${DRUIDVERSION} package root, run the following command:
```
bin/post-index-task --file quickstart/ingestion-tutorial-index.json
```
After the script completes, we will query the data.
Let's run `bin/dsql` and issue a `select * from "ingestion-tutorial";` query to see what data was ingested.
```
$ bin/dsql
Welcome to dsql, the command-line client for Druid SQL.
Type "\h" for help.
dsql> select * from "ingestion-tutorial";
┌──────────────────────────┬───────┬──────┬───────┬─────────┬─────────┬─────────┬──────────┬─────────┬─────────┐
│ __time │ bytes │ cost │ count │ dstIP │ dstPort │ packets │ protocol │ srcIP │ srcPort │
├──────────────────────────┼───────┼──────┼───────┼─────────┼─────────┼─────────┼──────────┼─────────┼─────────┤
│ 2018-01-01T01:01:00.000Z │ 6000 │ 4.9 │ 3 │ 2.2.2.2 │ 3000 │ 60 │ 6 │ 1.1.1.1 │ 2000 │
│ 2018-01-01T01:02:00.000Z │ 9000 │ 18.1 │ 2 │ 2.2.2.2 │ 7000 │ 90 │ 6 │ 1.1.1.1 │ 5000 │
│ 2018-01-01T01:03:00.000Z │ 6000 │ 4.3 │ 1 │ 2.2.2.2 │ 7000 │ 60 │ 6 │ 1.1.1.1 │ 5000 │
│ 2018-01-01T02:33:00.000Z │ 30000 │ 56.9 │ 2 │ 8.8.8.8 │ 5000 │ 300 │ 17 │ 7.7.7.7 │ 4000 │
│ 2018-01-01T02:35:00.000Z │ 30000 │ 46.3 │ 1 │ 8.8.8.8 │ 5000 │ 300 │ 17 │ 7.7.7.7 │ 4000 │
└──────────────────────────┴───────┴──────┴───────┴─────────┴─────────┴─────────┴──────────┴─────────┴─────────┘
Retrieved 5 rows in 0.12s.
dsql>
```
@ -2,33 +2,26 @@
layout: doc_page
---
# Tutorial: Load streaming data from Kafka
## Getting started
This tutorial demonstrates how to load data from a Kafka stream, using the Druid Kafka indexing service.
For this tutorial, we'll assume you've already downloaded Druid as described in
the [single-machine quickstart](index.html) and have it running on your local machine. You
don't need to have loaded any data yet.
## Download and start Kafka
[Apache Kafka](http://kafka.apache.org/) is a high throughput message bus that works well with
Druid. For this tutorial, we will use Kafka 0.10.2.0. To download Kafka, issue the following
commands in your terminal:
```bash
curl -O https://archive.apache.org/dist/kafka/0.10.2.0/kafka_2.11-0.10.2.0.tgz
tar -xzf kafka_2.11-0.10.2.0.tgz
cd kafka_2.11-0.10.2.0
```
Start a Kafka broker by running the following command in a new terminal:
@ -37,149 +30,56 @@ Start a Kafka broker by running the following command in a new terminal:
```bash
./bin/kafka-server-start.sh config/server.properties
```
Run this command to create a Kafka topic called *wikipedia*, to which we'll send data:
```bash
./bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic wikipedia
```
## Enable Druid Kafka ingestion
We will use Druid's Kafka indexing service to ingest messages from our newly created *wikipedia* topic. To start the
service, we will need to submit a supervisor spec to the Druid overlord by running the following from the Druid package root:
```bash
curl -XPOST -H'Content-Type: application/json' -d @quickstart/tutorial/wikipedia-kafka-supervisor.json http://localhost:8090/druid/indexer/v1/supervisor
```
If the supervisor was successfully created, you will get a response containing the ID of the supervisor; in our case we should see `{"id":"wikipedia-kafka"}`.
For more details about what's going on here, check out the
[Druid Kafka indexing service documentation](http://druid.io/docs/{{druidVersion}}/development/extensions-core/kafka-ingestion.html).
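The bundled spec is not reproduced here, but at a high level a Kafka supervisor spec pairs a `dataSchema` (of the same shape used in the batch tutorials) with a Kafka-specific `ioConfig` and `tuningConfig`. The sketch below is illustrative only; its dimension list, task counts, and durations are assumptions rather than the contents of `wikipedia-kafka-supervisor.json`:
```json
{
  "type" : "kafka",
  "dataSchema" : {
    "dataSource" : "wikipedia",
    "parser" : {
      "type" : "string",
      "parseSpec" : {
        "format" : "json",
        "timestampSpec" : { "column" : "time", "format" : "auto" },
        "dimensionsSpec" : { "dimensions" : ["channel", "page", "user"] }
      }
    },
    "metricsSpec" : [],
    "granularitySpec" : { "type" : "uniform", "segmentGranularity" : "HOUR", "queryGranularity" : "NONE", "rollup" : false }
  },
  "tuningConfig" : { "type" : "kafka" },
  "ioConfig" : {
    "topic" : "wikipedia",
    "consumerProperties" : { "bootstrap.servers" : "localhost:9092" },
    "taskCount" : 1,
    "replicas" : 1,
    "taskDuration" : "PT10M"
  }
}
```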
## Load data
Let's launch a console producer for our topic and send some data!
In your Druid directory, run the following command:
```bash
cd quickstart
gunzip -k wikipedia-2015-09-12-sampled.json.gz
```
In your Kafka directory, run the following command, where {PATH_TO_DRUID} is replaced by the path to the Druid directory:
```bash
export KAFKA_OPTS="-Dfile.encoding=UTF-8"
./bin/kafka-console-producer.sh --broker-list localhost:9092 --topic wikipedia < {PATH_TO_DRUID}/quickstart/wikipedia-2015-09-12-sampled.json
```
The previous command posted sample events to the *wikipedia* Kafka topic which were then ingested into Druid by the Kafka indexing service. You're now ready to run some queries!
## Querying your data
After data is sent to the Kafka stream, it is immediately available for querying.
Please follow the [query tutorial](../tutorial/tutorial-query.html) to run some example queries on the newly loaded data.
## Cleanup
If you wish to go through any of the other ingestion tutorials, you will need to shut down the cluster and reset the cluster state by removing the contents of the `var` directory under the druid package, as the other tutorials will write to the same "wikipedia" datasource.
## Further reading
For more information on loading data from Kafka streams, please see the [Druid Kafka indexing service documentation](http://druid.io/docs/{{druidVersion}}/development/extensions-core/kafka-ingestion.html).
@ -0,0 +1,280 @@
---
layout: doc_page
---
# Tutorial: Querying data
This tutorial will demonstrate how to query data in Druid, with examples for Druid's native query format and Druid SQL.
The tutorial assumes that you've already completed one of the 4 ingestion tutorials, as we will be querying the sample Wikipedia edits data.
* [Tutorial: Loading a file](/docs/VERSION/tutorials/tutorial-batch.html)
* [Tutorial: Loading stream data from Kafka](/docs/VERSION/tutorials/tutorial-kafka.html)
* [Tutorial: Loading a file using Hadoop](/docs/VERSION/tutorials/tutorial-batch-hadoop.html)
* [Tutorial: Loading stream data using Tranquility](/docs/VERSION/tutorials/tutorial-tranquility.html)
## Native JSON queries
Druid's native query format is expressed in JSON. We have included a sample native TopN query under `quickstart/tutorial/wikipedia-top-pages.json`:
```json
{
"queryType" : "topN",
"dataSource" : "wikipedia",
"intervals" : ["2015-09-12/2015-09-13"],
"granularity" : "all",
"dimension" : "page",
"metric" : "count",
"threshold" : 10,
"aggregations" : [
{
"type" : "count",
"name" : "count"
}
]
}
```
This query retrieves the 10 Wikipedia pages with the most page edits on 2015-09-12.
Let's submit this query to the Druid broker:
```bash
curl -X 'POST' -H 'Content-Type:application/json' -d @quickstart/tutorial/wikipedia-top-pages.json http://localhost:8082/druid/v2?pretty
```
You should see the following query results:
```json
[ {
"timestamp" : "2015-09-12T00:46:58.771Z",
"result" : [ {
"count" : 33,
"page" : "Wikipedia:Vandalismusmeldung"
}, {
"count" : 28,
"page" : "User:Cyde/List of candidates for speedy deletion/Subpage"
}, {
"count" : 27,
"page" : "Jeremy Corbyn"
}, {
"count" : 21,
"page" : "Wikipedia:Administrators' noticeboard/Incidents"
}, {
"count" : 20,
"page" : "Flavia Pennetta"
}, {
"count" : 18,
"page" : "Total Drama Presents: The Ridonculous Race"
}, {
"count" : 18,
"page" : "User talk:Dudeperson176123"
}, {
"count" : 18,
"page" : "Wikipédia:Le Bistro/12 septembre 2015"
}, {
"count" : 17,
"page" : "Wikipedia:In the news/Candidates"
}, {
"count" : 17,
"page" : "Wikipedia:Requests for page protection"
} ]
} ]
```
## Druid SQL queries
Druid also supports a dialect of SQL for querying. Let's run a SQL query that is equivalent to the native JSON query shown above:
```
SELECT page, COUNT(*) AS Edits FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY page ORDER BY Edits DESC LIMIT 10;
```
Druid SQL queries are submitted as JSON over HTTP to the Broker's SQL endpoint at `/druid/v2/sql`.
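The POST body is a JSON object whose `query` field holds the SQL string; the bundled file used in the next section presumably follows this shape:
```json
{
  "query" : "SELECT page, COUNT(*) AS Edits FROM wikipedia WHERE \"__time\" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY page ORDER BY Edits DESC LIMIT 10"
}
```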
### TopN query example
The tutorial package includes an example file containing the SQL query shown above at `quickstart/tutorial/wikipedia-top-pages-sql.json`. Let's submit that query to the Druid broker:
```bash
curl -X 'POST' -H 'Content-Type:application/json' -d @quickstart/tutorial/wikipedia-top-pages-sql.json http://localhost:8082/druid/v2/sql
```
The following results should be returned:
```
[
{
"page": "Wikipedia:Vandalismusmeldung",
"Edits": 33
},
{
"page": "User:Cyde/List of candidates for speedy deletion/Subpage",
"Edits": 28
},
{
"page": "Jeremy Corbyn",
"Edits": 27
},
{
"page": "Wikipedia:Administrators' noticeboard/Incidents",
"Edits": 21
},
{
"page": "Flavia Pennetta",
"Edits": 20
},
{
"page": "Total Drama Presents: The Ridonculous Race",
"Edits": 18
},
{
"page": "User talk:Dudeperson176123",
"Edits": 18
},
{
"page": "Wikipédia:Le Bistro/12 septembre 2015",
"Edits": 18
},
{
"page": "Wikipedia:In the news/Candidates",
"Edits": 17
},
{
"page": "Wikipedia:Requests for page protection",
"Edits": 17
}
]
```
### dsql client
For convenience, the Druid package includes a SQL command-line client, located at `bin/dsql` from the Druid package root.
Let's now run `bin/dsql`; you should see the following prompt:
```
Welcome to dsql, the command-line client for Druid SQL.
Type "\h" for help.
dsql>
```
To submit the query, paste it to the `dsql` prompt and press enter:
```
dsql> SELECT page, COUNT(*) AS Edits FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY page ORDER BY Edits DESC LIMIT 10;
┌──────────────────────────────────────────────────────────┬───────┐
│ page │ Edits │
├──────────────────────────────────────────────────────────┼───────┤
│ Wikipedia:Vandalismusmeldung │ 33 │
│ User:Cyde/List of candidates for speedy deletion/Subpage │ 28 │
│ Jeremy Corbyn │ 27 │
│ Wikipedia:Administrators' noticeboard/Incidents │ 21 │
│ Flavia Pennetta │ 20 │
│ Total Drama Presents: The Ridonculous Race │ 18 │
│ User talk:Dudeperson176123 │ 18 │
│ Wikipédia:Le Bistro/12 septembre 2015 │ 18 │
│ Wikipedia:In the news/Candidates │ 17 │
│ Wikipedia:Requests for page protection │ 17 │
└──────────────────────────────────────────────────────────┴───────┘
Retrieved 10 rows in 0.06s.
```
### Additional Druid SQL queries
#### Timeseries
`SELECT FLOOR(__time to HOUR) AS HourTime, SUM(deleted) AS LinesDeleted FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY FLOOR(__time to HOUR);`
```
dsql> SELECT FLOOR(__time to HOUR) AS HourTime, SUM(deleted) AS LinesDeleted FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY FLOOR(__time to HOUR);
┌──────────────────────────┬──────────────┐
│ HourTime │ LinesDeleted │
├──────────────────────────┼──────────────┤
│ 2015-09-12T00:00:00.000Z │ 1761 │
│ 2015-09-12T01:00:00.000Z │ 16208 │
│ 2015-09-12T02:00:00.000Z │ 14543 │
│ 2015-09-12T03:00:00.000Z │ 13101 │
│ 2015-09-12T04:00:00.000Z │ 12040 │
│ 2015-09-12T05:00:00.000Z │ 6399 │
│ 2015-09-12T06:00:00.000Z │ 9036 │
│ 2015-09-12T07:00:00.000Z │ 11409 │
│ 2015-09-12T08:00:00.000Z │ 11616 │
│ 2015-09-12T09:00:00.000Z │ 17509 │
│ 2015-09-12T10:00:00.000Z │ 19406 │
│ 2015-09-12T11:00:00.000Z │ 16284 │
│ 2015-09-12T12:00:00.000Z │ 18672 │
│ 2015-09-12T13:00:00.000Z │ 30520 │
│ 2015-09-12T14:00:00.000Z │ 18025 │
│ 2015-09-12T15:00:00.000Z │ 26399 │
│ 2015-09-12T16:00:00.000Z │ 24759 │
│ 2015-09-12T17:00:00.000Z │ 19634 │
│ 2015-09-12T18:00:00.000Z │ 17345 │
│ 2015-09-12T19:00:00.000Z │ 19305 │
│ 2015-09-12T20:00:00.000Z │ 22265 │
│ 2015-09-12T21:00:00.000Z │ 16394 │
│ 2015-09-12T22:00:00.000Z │ 16379 │
│ 2015-09-12T23:00:00.000Z │ 15289 │
└──────────────────────────┴──────────────┘
Retrieved 24 rows in 0.08s.
```
#### GroupBy
`SELECT channel, SUM(added) FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY channel ORDER BY SUM(added) DESC LIMIT 5;`
```
dsql> SELECT channel, SUM(added) FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY channel ORDER BY SUM(added) DESC LIMIT 5;
┌───────────────┬─────────┐
│ channel │ EXPR$1 │
├───────────────┼─────────┤
│ #en.wikipedia │ 3045299 │
│ #it.wikipedia │  711011 │
│ #fr.wikipedia │  642555 │
│ #ru.wikipedia │  640698 │
│ #es.wikipedia │  634670 │
└───────────────┴─────────┘
Retrieved 5 rows in 0.05s.
```
#### Scan
`SELECT user, page FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 02:00:00' AND TIMESTAMP '2015-09-12 03:00:00' LIMIT 5;`
```
dsql> SELECT user, page FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 02:00:00' AND TIMESTAMP '2015-09-12 03:00:00' LIMIT 5;
┌────────────────────────┬────────────────────────────────────────────────────────┐
│ user │ page │
├────────────────────────┼────────────────────────────────────────────────────────┤
│ Thiago89 │ Campeonato Mundial de Voleibol Femenino Sub-20 de 2015 │
│ 91.34.200.249 │ Friede von Schönbrunn │
│ TuHan-Bot │ Trĩ vàng │
│ Lowercase sigmabot III │ User talk:ErrantX │
│ BattyBot │ Hans W. Jung │
└────────────────────────┴────────────────────────────────────────────────────────┘
Retrieved 5 rows in 0.04s.
```
#### EXPLAIN PLAN FOR
By prepending `EXPLAIN PLAN FOR ` to a Druid SQL query, it is possible to see what native Druid queries a SQL query will plan into.
Using the TopN query above as an example:
`EXPLAIN PLAN FOR SELECT page, COUNT(*) AS Edits FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY page ORDER BY Edits DESC LIMIT 10;`
```
dsql> EXPLAIN PLAN FOR SELECT page, COUNT(*) AS Edits FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY page ORDER BY Edits DESC LIMIT 10;
┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ PLAN │
├─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ DruidQueryRel(query=[{"queryType":"topN","dataSource":{"type":"table","name":"wikipedia"},"virtualColumns":[],"dimension":{"type":"default","dimension":"page","outputName":"d0","outputType":"STRING"},"metric":{"type":"numeric","metric":"a0"},"threshold":10,"intervals":{"type":"intervals","intervals":["2015-09-12T00:00:00.000Z/2015-09-13T00:00:00.001Z"]},"filter":null,"granularity":{"type":"all"},"aggregations":[{"type":"count","name":"a0"}],"postAggregations":[],"context":{},"descending":false}], signature=[{d0:STRING, a0:LONG}]) │
└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
Retrieved 1 row in 0.03s.
```
## Further reading
The [Queries documentation](/docs/VERSION/querying/querying.html) has more information on Druid's native JSON queries.
The [Druid SQL documentation](/docs/VERSION/querying/sql.html) has more information on using Druid SQL queries.
@ -0,0 +1,92 @@
---
layout: doc_page
---
# Tutorial: Configuring data retention
This tutorial demonstrates how to configure retention rules on a datasource to set the time intervals of data that will be retained or dropped.
For this tutorial, we'll assume you've already downloaded Druid as described in
the [single-machine quickstart](index.html) and have it running on your local machine.
It will also be helpful to have finished [Tutorial: Loading a file](/docs/VERSION/tutorials/tutorial-batch.html) and [Tutorial: Querying data](/docs/VERSION/tutorials/tutorial-query.html).
## Load the example data
For this tutorial, we'll be using the Wikipedia edits sample data, with an ingestion task spec that will create a separate segment for each hour in the input data.
The ingestion spec can be found at `quickstart/tutorial/retention-index.json`. Let's submit that spec, which will create a datasource called `retention-tutorial`:
```
bin/post-index-task --file quickstart/tutorial/retention-index.json
```
After the ingestion completes, go to http://localhost:8081 in a browser to access the Coordinator console.
In the Coordinator console, go to the `datasources` tab at the top of the page.
This tab shows the available datasources and a summary of the retention rules for each datasource:
![Summary](../tutorials/img/tutorial-retention-00.png "Summary")
Currently there are no rules set for the `retention-tutorial` datasource. Note that there are default rules, currently set to `load Forever 2 in _default_tier`.
This means that all data will be loaded regardless of timestamp, and each segment will be replicated to two nodes in the default tier.
In this tutorial, we will ignore the tiering and redundancy concepts for now.
Let's click the `retention-tutorial` datasource on the left.
The next page (http://localhost:8081/#/datasources/retention-tutorial) provides information about what segments a datasource contains. On the left, the page shows that there are 24 segments, each one containing data for a specific hour of 2015-09-12:
![Original segments](../tutorials/img/tutorial-retention-01.png "Original segments")
## Set retention rules
Suppose we want to drop data for the first 12 hours of 2015-09-12 and keep data for the later 12 hours of 2015-09-12.
Click the `edit rules` button with a pencil icon at the upper left corner of the page.
A rule configuration window will appear. Enter `tutorial` for both the user and changelog comment field.
Now click the `+ Add a rule` button twice.
In the `rule #1` box at the top, click `Load`, `Interval`, enter `2015-09-12T12:00:00.000Z/2015-09-13T00:00:00.000Z` in the interval box, and click `+ _default_tier replicant`.
In the `rule #2` box at the bottom, click `Drop` and `Forever`.
The rules should look like this:
![Set rules](../tutorials/img/tutorial-retention-02.png "Set rules")
Now click `Save all rules`, wait for a few seconds, and refresh the page.
The segments for the first 12 hours of 2015-09-12 are now gone:
![New segments](../tutorials/img/tutorial-retention-03.png "New segments")
The resulting retention rule chain is the following:
```
loadByInterval 2015-09-12T12/2015-09-13 (12 hours)
dropForever
loadForever (default rule)
```
The rule chain is evaluated from top to bottom, with the default rule chain always added at the bottom.
The tutorial rule chain we just created loads data if it is within the specified 12 hour interval.
If data is not within the 12 hour interval, the rule chain evaluates `dropForever` next, which will drop any data.
The `dropForever` terminates the rule chain, effectively overriding the default `loadForever` rule, which will never be reached in this rule chain.
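For reference, the same rule chain can also be written as a JSON rule list. This is a sketch based on the rules configured above; the exact payload (and the Coordinator rules endpoint you would POST it to) is an assumption here rather than something this tutorial exercises:
```json
[
  {
    "type" : "loadByInterval",
    "interval" : "2015-09-12T12:00:00.000Z/2015-09-13T00:00:00.000Z",
    "tieredReplicants" : { "_default_tier" : 1 }
  },
  { "type" : "dropForever" }
]
```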
Note that in this tutorial we defined a load rule on a specific interval.
If instead you want to retain data based on how old it is (e.g., retain data that ranges from 3 months in the past to the present time), you would define a Period load rule instead.
## Further reading
* [Load rules](/docs/VERSION/operations/rule-configuration.html)
@ -0,0 +1,180 @@
---
layout: doc_page
---
# Tutorial: Roll-up
Druid can summarize raw data at ingestion time using a process we refer to as "roll-up". Roll-up is a first-level aggregation operation over a selected set of columns that reduces the size of stored segments.
This tutorial will demonstrate the effects of roll-up on an example dataset.
For this tutorial, we'll assume you've already downloaded Druid as described in
the [single-machine quickstart](index.html) and have it running on your local machine.
It will also be helpful to have finished [Tutorial: Loading a file](/docs/VERSION/tutorials/tutorial-batch.html) and [Tutorial: Querying data](/docs/VERSION/tutorials/tutorial-query.html).
## Example data
For this tutorial, we'll use a small sample of network flow event data, representing packet and byte counts for traffic from a source to a destination IP address that occurred within a particular second.
```
{"timestamp":"2018-01-01T01:01:35Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":20,"bytes":9024}
{"timestamp":"2018-01-01T01:01:51Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":255,"bytes":21133}
{"timestamp":"2018-01-01T01:01:59Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":11,"bytes":5780}
{"timestamp":"2018-01-01T01:02:14Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":38,"bytes":6289}
{"timestamp":"2018-01-01T01:02:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":377,"bytes":359971}
{"timestamp":"2018-01-01T01:03:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":49,"bytes":10204}
{"timestamp":"2018-01-02T21:33:14Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8","packets":38,"bytes":6289}
{"timestamp":"2018-01-02T21:33:45Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8","packets":123,"bytes":93999}
{"timestamp":"2018-01-02T21:35:45Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8","packets":12,"bytes":2818}
```
A file containing this sample input data is located at `quickstart/tutorial/rollup-data.json`.
We'll ingest this data using the following ingestion task spec, located at `quickstart/tutorial/rollup-index.json`.
```
{
"type" : "index",
"spec" : {
"dataSchema" : {
"dataSource" : "rollup-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
"srcIP",
"dstIP"
]
},
"timestampSpec": {
"column": "timestamp",
"format": "iso"
}
}
},
"metricsSpec" : [
{ "type" : "count", "name" : "count" },
{ "type" : "longSum", "name" : "packets", "fieldName" : "packets" },
{ "type" : "longSum", "name" : "bytes", "fieldName" : "bytes" }
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "week",
"queryGranularity" : "minute",
"intervals" : ["2018-01-01/2018-01-03"],
"rollup" : true
}
},
"ioConfig" : {
"type" : "index",
"firehose" : {
"type" : "local",
"baseDir" : "quickstart/tutorial",
"filter" : "rollup-data.json"
},
"appendToExisting" : false
},
"tuningConfig" : {
"type" : "index",
"targetPartitionSize" : 5000000,
"maxRowsInMemory" : 25000,
"forceExtendableShardSpecs" : true
}
}
}
```
Roll-up has been enabled by setting `"rollup" : true` in the `granularitySpec`.
Note that we have `srcIP` and `dstIP` defined as dimensions, a longSum metric is defined for the `packets` and `bytes` columns, and the `queryGranularity` has been defined as `minute`.
We will see how these definitions are used after we load this data.
## Load the example data
From the druid-${DRUIDVERSION} package root, run the following command:
```
bin/post-index-task --file quickstart/tutorial/rollup-index.json
```
After the script completes, we will query the data.
## Query the example data
Let's run `bin/dsql` and issue a `select * from "rollup-tutorial";` query to see what data was ingested.
```
$ bin/dsql
Welcome to dsql, the command-line client for Druid SQL.
Type "\h" for help.
dsql> select * from "rollup-tutorial";
┌──────────────────────────┬────────┬───────┬─────────┬─────────┬─────────┐
│ __time │ bytes │ count │ dstIP │ packets │ srcIP │
├──────────────────────────┼────────┼───────┼─────────┼─────────┼─────────┤
│ 2018-01-01T01:01:00.000Z │ 35937 │ 3 │ 2.2.2.2 │ 286 │ 1.1.1.1 │
│ 2018-01-01T01:02:00.000Z │ 366260 │ 2 │ 2.2.2.2 │ 415 │ 1.1.1.1 │
│ 2018-01-01T01:03:00.000Z │ 10204 │ 1 │ 2.2.2.2 │ 49 │ 1.1.1.1 │
│ 2018-01-02T21:33:00.000Z │ 100288 │ 2 │ 8.8.8.8 │ 161 │ 7.7.7.7 │
│ 2018-01-02T21:35:00.000Z │ 2818 │ 1 │ 8.8.8.8 │ 12 │ 7.7.7.7 │
└──────────────────────────┴────────┴───────┴─────────┴─────────┴─────────┘
Retrieved 5 rows in 1.18s.
dsql>
```
Let's look at the three events in the original input data that occurred during `2018-01-01T01:01`:
```
{"timestamp":"2018-01-01T01:01:35Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":20,"bytes":9024}
{"timestamp":"2018-01-01T01:01:51Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":255,"bytes":21133}
{"timestamp":"2018-01-01T01:01:59Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":11,"bytes":5780}
```
These three rows have been "rolled up" into the following row:
```
┌──────────────────────────┬────────┬───────┬─────────┬─────────┬─────────┐
│ __time │ bytes │ count │ dstIP │ packets │ srcIP │
├──────────────────────────┼────────┼───────┼─────────┼─────────┼─────────┤
│ 2018-01-01T01:01:00.000Z │ 35937 │ 3 │ 2.2.2.2 │ 286 │ 1.1.1.1 │
└──────────────────────────┴────────┴───────┴─────────┴─────────┴─────────┘
```
The input rows have been grouped by the timestamp and dimension columns `{timestamp, srcIP, dstIP}` with sum aggregations on the metric columns `packets` and `bytes`.
Before the grouping occurs, the timestamps of the original input data are bucketed/floored by minute, due to the `"queryGranularity":"minute"` setting in the ingestion spec.
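The aggregated values in that rolled-up row come directly from summing the three raw rows in the minute bucket:
```
bytes:   9024 + 21133 + 5780 = 35937
packets: 20 + 255 + 11       = 286
count:   3 input rows
```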
Likewise, these two events that occurred during `2018-01-01T01:02` have been rolled up:
```
{"timestamp":"2018-01-01T01:02:14Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":38,"bytes":6289}
{"timestamp":"2018-01-01T01:02:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":377,"bytes":359971}
```
```
┌──────────────────────────┬────────┬───────┬─────────┬─────────┬─────────┐
│ __time │ bytes │ count │ dstIP │ packets │ srcIP │
├──────────────────────────┼────────┼───────┼─────────┼─────────┼─────────┤
│ 2018-01-01T01:02:00.000Z │ 366260 │ 2 │ 2.2.2.2 │ 415 │ 1.1.1.1 │
└──────────────────────────┴────────┴───────┴─────────┴─────────┴─────────┘
```
For the last event recording traffic between 1.1.1.1 and 2.2.2.2, no roll-up took place, because this was the only event that occurred during `2018-01-01T01:03`:
```
{"timestamp":"2018-01-01T01:03:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":49,"bytes":10204}
```
```
┌──────────────────────────┬────────┬───────┬─────────┬─────────┬─────────┐
│ __time │ bytes │ count │ dstIP │ packets │ srcIP │
├──────────────────────────┼────────┼───────┼─────────┼─────────┼─────────┤
│ 2018-01-01T01:03:00.000Z │ 10204 │ 1 │ 2.2.2.2 │ 49 │ 1.1.1.1 │
└──────────────────────────┴────────┴───────┴─────────┴─────────┴─────────┘
```
Note that the `count` metric shows how many rows in the original input data contributed to the final "rolled up" row.
@ -1,134 +0,0 @@
---
layout: doc_page
---
# Tutorial: Load your own streaming data
## Getting started
This tutorial shows you how to load your own streams into Druid.
For this tutorial, we'll assume you've already downloaded Druid and Tranquility as described in
the [single-machine quickstart](quickstart.html) and have it running on your local machine. You
don't need to have loaded any data yet.
Once that's complete, you can load your own dataset by writing a custom ingestion spec.
## Writing an ingestion spec
When loading streams into Druid, we recommend using the [stream push](../ingestion/stream-push.html)
process. In this tutorial we'll be using [Tranquility Server](../ingestion/stream-ingestion.html#server) to push
data into Druid over HTTP.
<div class="note info">
This tutorial will show you how to push streams to Druid using HTTP, but Druid additionally supports
a wide variety of batch and streaming loading methods. See the <a href="../ingestion/batch-ingestion.html">Loading files</a>
and <a href="../ingestion/stream-ingestion.html">Loading streams</a> pages for more information about other options,
including from Hadoop, Kafka, Storm, Samza, Spark Streaming, and your own JVM apps.
</div>
You can prepare for loading a new dataset over HTTP by writing a custom Tranquility Server
configuration. The bundled configuration is in `conf-quickstart/tranquility/server.json`, which
you can modify for your own needs.
The most important questions are:
* What should the dataset be called? This is the "dataSource" field of the "dataSchema".
* Which field should be treated as a timestamp? This belongs in the "column" field of the "timestampSpec".
* Which fields should be treated as dimensions? This belongs in the "dimensions" field of the "dimensionsSpec".
* Which fields should be treated as measures? This belongs in the "metricsSpec" field.
Let's use a small JSON pageviews dataset as an example, with records like:
```json
{"time": "2000-01-01T00:00:00Z", "url": "/foo/bar", "user": "alice", "latencyMs": 32}
```
So the answers to the questions above are:
* Let's call the dataset "pageviews".
* The timestamp is the "time" field.
* Good choices for dimensions are the string fields "url" and "user".
* Good choices for measures are a count of pageviews, and the sum of "latencyMs". Collecting that
sum when we load the data will allow us to compute an average at query time as well.
Now, edit the existing `conf-quickstart/tranquility/server.json` file by altering these
sections:
1. Change the key `"metrics"` under `"dataSources"` to `"pageviews"`
2. Alter these sections under the new `"pageviews"` key:
```json
"dataSource": "pageviews"
```
```json
"timestampSpec": {
"format": "auto",
"column": "time"
}
```
```json
"dimensionsSpec": {
"dimensions": ["url", "user"]
}
```
```json
"metricsSpec": [
{"name": "views", "type": "count"},
{"name": "latencyMs", "type": "doubleSum", "fieldName": "latencyMs"}
]
```
## Restarting the server
Restart the server to pick up the new configuration file by stopping Tranquility (CTRL-C) and starting it up again.
## Sending data
Let's send some data! We'll start with these three records:
```json
{"time": "2000-01-01T00:00:00Z", "url": "/foo/bar", "user": "alice", "latencyMs": 32}
{"time": "2000-01-01T00:00:00Z", "url": "/", "user": "bob", "latencyMs": 11}
{"time": "2000-01-01T00:00:00Z", "url": "/foo/bar", "user": "bob", "latencyMs": 45}
```
Druid streaming ingestion requires relatively current messages (relative to a slack time controlled by the
[windowPeriod](../ingestion/stream-push.html#segmentgranularity-and-windowperiod) value), so you should
replace `2000-01-01T00:00:00Z` in these messages with the current time in ISO8601 format. You can
get this by running:
```bash
python -c 'import datetime; print(datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"))'
```
Update the timestamps in the JSON above, and save it to a file named `pageviews.json`. Then send
it to Druid by running:
```bash
curl -XPOST -H'Content-Type: application/json' --data-binary @pageviews.json http://localhost:8200/v1/post/pageviews
```
This will print something like:
```
{"result":{"received":3,"sent":3}}
```
This indicates that the HTTP server received 3 events from you, and sent 3 to Druid. Note that
this may take a few seconds to finish the first time you run it, as Druid resources must be
allocated to the ingestion task. Subsequent POSTs should complete quickly.
If you see `"sent":0` this likely means that your timestamps are not recent enough. Try adjusting
your timestamps and re-sending your data.
## Querying your data
After sending data, you can immediately query it using any of the
[supported query methods](../querying/querying.html).
## Further reading
To read more about loading streams, see our [streaming ingestion documentation](../ingestion/stream-ingestion.html).
@ -0,0 +1,84 @@
---
layout: doc_page
---
# Tutorial: Load streaming data with HTTP push
## Getting started
This tutorial shows you how to load streaming data into Druid using HTTP push via Tranquility Server.
[Tranquility Server](https://github.com/druid-io/tranquility/blob/master/docs/server.md) allows a stream of data to be pushed into Druid using HTTP POSTs.
For this tutorial, we'll assume you've already downloaded Druid as described in
the [single-machine quickstart](quickstart.html) and have it running on your local machine. You
don't need to have loaded any data yet.
## Download Tranquility
In the Druid package root, run the following commands:
```
curl http://static.druid.io/tranquility/releases/tranquility-distribution-0.8.2.tgz -o tranquility-distribution-0.8.2.tgz
tar -xzf tranquility-distribution-0.8.2.tgz
mv tranquility-distribution-0.8.2 tranquility
```
The startup scripts for the tutorial will expect the contents of the Tranquility tarball to be located at `tranquility` under the druid-#{DRUIDVERSION} package root.
## Enable Tranquility Server
- In your `quickstart/tutorial/conf/tutorial-cluster.conf`, uncomment the `tranquility-server` line.
- Stop your *bin/supervise* command (CTRL-C) and then restart it by again running `bin/supervise -c quickstart/tutorial/conf/tutorial-cluster.conf`.
As part of the output of *supervise* you should see something like:
```
Running command[tranquility-server], logging to[/stage/druid-{DRUIDVERSION}/var/sv/tranquility-server.log]: tranquility/bin/tranquility server -configFile quickstart/tutorial/conf/tranquility/server.json -Ddruid.extensions.loadList=[]
```
You can check the log file in `var/sv/tranquility-server.log` to confirm that the server is starting up properly.
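The server configuration itself is not reproduced here. Broadly, a Tranquility Server config maps each datasource name to a Druid `dataSchema` plus Tranquility-specific properties; the outline below is a rough sketch with assumed field values, not the contents of `quickstart/tutorial/conf/tranquility/server.json`:
```json
{
  "dataSources" : {
    "wikipedia" : {
      "spec" : {
        "dataSchema" : {
          "dataSource" : "wikipedia",
          "parser" : {
            "type" : "string",
            "parseSpec" : {
              "format" : "json",
              "timestampSpec" : { "column" : "time", "format" : "auto" },
              "dimensionsSpec" : { "dimensions" : ["channel", "page", "user"] }
            }
          },
          "metricsSpec" : [ { "type" : "count", "name" : "count" } ],
          "granularitySpec" : { "type" : "uniform", "segmentGranularity" : "HOUR", "queryGranularity" : "NONE" }
        },
        "tuningConfig" : { "type" : "realtime", "windowPeriod" : "PT10M" }
      },
      "properties" : { "task.partitions" : "1", "task.replicants" : "1" }
    }
  },
  "properties" : {
    "zookeeper.connect" : "localhost:2181",
    "http.port" : "8200"
  }
}
```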
## Send data
Let's send the sample Wikipedia edits data to Tranquility:
```
gunzip -k quickstart/wikiticker-2015-09-12-sampled.json.gz
curl -XPOST -H'Content-Type: application/json' --data-binary @quickstart/wikiticker-2015-09-12-sampled.json http://localhost:8200/v1/post/wikipedia
```
Which will print something like:
```
{"result":{"received":39244,"sent":39244}}
```
This indicates that the HTTP server received 39,244 events from you, and sent 39,244 to Druid. This
command may generate a "connection refused" error if you run it too quickly after enabling Tranquility
Server, which means the server has not yet started up. It should start up within a few seconds. The command
may also take a few seconds to finish the first time you run it, during which time Druid resources are being
allocated to the ingestion task. Subsequent POSTs will complete quickly once this is done.
Once the data is sent to Druid, you can immediately query it.
If you see a `sent` count of 0, retry the send command until the `sent` count also shows 39244:
```
{"result":{"received":39244,"sent":0}}
```
## Querying your data
Please follow the [query tutorial](../tutorial/tutorial-query.html) to run some example queries on the newly loaded data.
## Cleanup
If you wish to go through any of the other ingestion tutorials, you will need to shut down the cluster and reset the cluster state by removing the contents of the `var` directory under the druid package, as the other tutorials will write to the same "wikipedia" datasource.
When cleaning up after running this Tranquility tutorial, it is also necessary to comment out the `tranquility-server` line in `quickstart/tutorial/conf/tutorial-cluster.conf` again before restarting the cluster.
## Further reading
For more information on Tranquility, please see [the Tranquility documentation](https://github.com/druid-io/tranquility).
@ -0,0 +1,138 @@
---
layout: doc_page
---
# Tutorial: Transforming input data
This tutorial will demonstrate how to use transform specs to filter and transform input data during ingestion.
For this tutorial, we'll assume you've already downloaded Druid as described in
the [single-machine quickstart](index.html) and have it running on your local machine.
It will also be helpful to have finished [Tutorial: Loading a file](/docs/VERSION/tutorials/tutorial-batch.html) and [Tutorial: Querying data](/docs/VERSION/tutorials/tutorial-query.html).
## Sample data
We've included sample data for this tutorial at `quickstart/tutorial/transform-data.json`, reproduced here for convenience:
```
{"timestamp":"2018-01-01T07:01:35Z","animal":"octopus", "location":1, "number":100}
{"timestamp":"2018-01-01T05:01:35Z","animal":"mongoose", "location":2,"number":200}
{"timestamp":"2018-01-01T06:01:35Z","animal":"snake", "location":3, "number":300}
{"timestamp":"2018-01-01T01:01:35Z","animal":"lion", "location":4, "number":300}
```
## Load data with transform specs
We will ingest the sample data using the following spec, which demonstrates the use of transform specs:
```
{
"type" : "index",
"spec" : {
"dataSchema" : {
"dataSource" : "transform-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
"animal",
{ "name": "location", "type": "long" }
]
},
"timestampSpec": {
"column": "timestamp",
"format": "iso"
}
}
},
"metricsSpec" : [
{ "type" : "count", "name" : "count" },
{ "type" : "longSum", "name" : "number", "fieldName" : "number" },
{ "type" : "longSum", "name" : "triple-number", "fieldName" : "triple-number" }
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "week",
"queryGranularity" : "minute",
"intervals" : ["2018-01-01/2018-01-03"],
"rollup" : true
},
"transformSpec": {
"transforms": [
{
"type": "expression",
"name": "animal",
"expression": "concat('super-', animal)"
},
{
"type": "expression",
"name": "triple-number",
"expression": "number * 3"
}
],
"filter": {
"type":"or",
"fields": [
{ "type": "selector", "dimension": "animal", "value": "super-mongoose" },
{ "type": "selector", "dimension": "triple-number", "value": "300" },
{ "type": "selector", "dimension": "location", "value": "3" }
]
}
}
},
"ioConfig" : {
"type" : "index",
"firehose" : {
"type" : "local",
"baseDir" : "quickstart/tutorial",
"filter" : "transform-data.json"
},
"appendToExisting" : false
},
"tuningConfig" : {
"type" : "index",
"targetPartitionSize" : 5000000,
"maxRowsInMemory" : 25000,
"forceExtendableShardSpecs" : true
}
}
}
```
In the transform spec, we have two expression transforms:
* `animal`: prepends "super-" to the values in the `animal` column. Because this transform is named `animal`, it overrides the existing `animal` column with the transformed values.
* `triple-number`: multiplies the `number` column by 3. This will create a new `triple-number` column. Note that we are ingesting both the original and the transformed column.
Additionally, we have an OR filter with three clauses:
* transformed `animal` values that match "super-mongoose"
* `triple-number` values that match 300
* `location` values that match 3
This filter selects the first 3 rows, and it will exclude the final "lion" row in the input data. Note that the filter is applied after the transformation.
Let's submit this task now; the spec has been included in the distribution at `quickstart/tutorial/transform-index.json`:
```
bin/post-index-task --file quickstart/tutorial/transform-index.json
```
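The `bin/post-index-task` script is a convenience wrapper that POSTs the spec to the Overlord's task API and then polls until the task completes and the data is loaded. If you prefer, you can submit the spec directly; a rough equivalent, assuming the Overlord is running on the default port 8090 used in this tutorial, is:
```
curl -X POST -H 'Content-Type: application/json' \
  --data-binary @quickstart/tutorial/transform-index.json \
  http://localhost:8090/druid/indexer/v1/task
```
Submitting directly returns a task ID immediately; you would then need to watch the task status yourself rather than letting the script wait for completion.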
## Query the transformed data
Let's run `bin/dsql` and issue a `select * from "transform-tutorial";` query to see what was ingested:
```
dsql> select * from "transform-tutorial";
┌──────────────────────────┬────────────────┬───────┬──────────┬────────┬───────────────┐
│ __time │ animal │ count │ location │ number │ triple-number │
├──────────────────────────┼────────────────┼───────┼──────────┼────────┼───────────────┤
│ 2018-01-01T05:01:00.000Z │ super-mongoose │ 1 │ 2 │ 200 │ 600 │
│ 2018-01-01T06:01:00.000Z │ super-snake │ 1 │ 3 │ 300 │ 900 │
│ 2018-01-01T07:01:00.000Z │ super-octopus │ 1 │ 1 │ 100 │ 300 │
└──────────────────────────┴────────────────┴───────┴──────────┴────────┴───────────────┘
Retrieved 3 rows in 0.03s.
```
The "lion" row has been discarded, the `animal` column has been transformed, and we have both the original `number` column and the new `triple-number` column.
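The same query can also be issued over HTTP against the SQL endpoint that `dsql` itself uses; for example, assuming the Broker is on the default port 8082:
```
curl -X POST -H 'Content-Type: application/json' \
  -d '{"query":"SELECT * FROM \"transform-tutorial\""}' \
  http://localhost:8082/druid/v2/sql/
```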


@ -0,0 +1,150 @@
---
layout: doc_page
---
# Tutorial: Updating existing data
This tutorial demonstrates how to update existing data, showing both overwrites and appends.
For this tutorial, we'll assume you've already downloaded Druid as described in
the [single-machine quickstart](index.html) and have it running on your local machine.
It will also be helpful to have finished [Tutorial: Loading a file](/docs/VERSION/tutorials/tutorial-batch.html), [Tutorial: Querying data](/docs/VERSION/tutorials/tutorial-query.html), and [Tutorial: Rollup](/docs/VERSION/tutorials/tutorial-rollup.html).
## Overwrite
This section of the tutorial will cover how to overwrite an existing interval of data.
### Load initial data
Let's load an initial data set which we will overwrite and append to.
The spec we'll use for this tutorial is located at `quickstart/tutorial/updates-init-index.json`. This spec creates a datasource called `updates-tutorial` from the `quickstart/tutorial/updates-data.json` input file.
Let's submit that task:
```
bin/post-index-task --file quickstart/tutorial/updates-init-index.json
```
We have three initial rows containing an "animal" dimension and "number" metric:
```
dsql> select * from "updates-tutorial";
┌──────────────────────────┬──────────┬───────┬────────┐
│ __time │ animal │ count │ number │
├──────────────────────────┼──────────┼───────┼────────┤
│ 2018-01-01T01:01:00.000Z │ tiger │ 1 │ 100 │
│ 2018-01-01T03:01:00.000Z │ aardvark │ 1 │ 42 │
│ 2018-01-01T03:01:00.000Z │ giraffe │ 1 │ 14124 │
└──────────────────────────┴──────────┴───────┴────────┘
Retrieved 3 rows in 1.42s.
```
### Overwrite the initial data
To overwrite this data, we can submit another task for the same interval, but with different input data.
The `quickstart/tutorial/updates-overwrite-index.json` spec will perform an overwrite on the `updates-tutorial` datasource.
Note that this task reads input from `quickstart/tutorial/updates-data2.json`, and `appendToExisting` is set to `false` (indicating this is an overwrite).
Let's submit that task:
```
bin/post-index-task --file quickstart/tutorial/updates-overwrite-index.json
```
When Druid finishes loading the new segment from this overwrite task, the "tiger" row now has the value "lion", the "aardvark" row has a different number, and the "giraffe" row has been replaced. It may take a couple of minutes for the changes to take effect:
```
dsql> select * from "updates-tutorial";
┌──────────────────────────┬──────────┬───────┬────────┐
│ __time │ animal │ count │ number │
├──────────────────────────┼──────────┼───────┼────────┤
│ 2018-01-01T01:01:00.000Z │ lion │ 1 │ 100 │
│ 2018-01-01T03:01:00.000Z │ aardvark │ 1 │ 9999 │
│ 2018-01-01T04:01:00.000Z │ bear │ 1 │ 111 │
└──────────────────────────┴──────────┴───────┴────────┘
Retrieved 3 rows in 0.02s.
```
## Combine old data with new data and overwrite
Let's try appending some new data to the `updates-tutorial` datasource now. We will add the data from `quickstart/tutorial/updates-data3.json`.
The `quickstart/tutorial/updates-append-index.json` task spec has been configured to read from the existing `updates-tutorial` datasource and the `quickstart/tutorial/updates-data3.json` file. The task will combine data from the two input sources, and then overwrite the original data with the new combined data.
Let's submit that task:
```
bin/post-index-task --file quickstart/tutorial/updates-append-index.json
```
When Druid finishes loading the new segment from this overwrite task, the new rows will have been added to the datasource. Note that roll-up occurred for the "lion" row:
```
dsql> select * from "updates-tutorial";
┌──────────────────────────┬──────────┬───────┬────────┐
│ __time │ animal │ count │ number │
├──────────────────────────┼──────────┼───────┼────────┤
│ 2018-01-01T01:01:00.000Z │ lion │ 2 │ 400 │
│ 2018-01-01T03:01:00.000Z │ aardvark │ 1 │ 9999 │
│ 2018-01-01T04:01:00.000Z │ bear │ 1 │ 111 │
│ 2018-01-01T05:01:00.000Z │ mongoose │ 1 │ 737 │
│ 2018-01-01T06:01:00.000Z │ snake │ 1 │ 1234 │
│ 2018-01-01T07:01:00.000Z │ octopus │ 1 │ 115 │
└──────────────────────────┴──────────┴───────┴────────┘
Retrieved 6 rows in 0.02s.
```
## Append to the data
Let's try another way of appending data.
The `quickstart/tutorial/updates-append-index2.json` task spec reads input from `quickstart/tutorial/updates-data4.json` and will append its data to the `updates-tutorial` datasource. Note that `appendToExisting` is set to `true` in this spec.
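The relevant part of that spec is its `ioConfig`; roughly, it follows the same shape as the ioConfig shown in the other tutorial specs (a sketch, the actual file may differ in details):
```
"ioConfig" : {
  "type" : "index",
  "firehose" : {
    "type" : "local",
    "baseDir" : "quickstart/tutorial",
    "filter" : "updates-data4.json"
  },
  "appendToExisting" : true
}
```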
Let's submit that task:
```
bin/post-index-task --file quickstart/tutorial/updates-append-index2.json
```
When the new data is loaded, we can see two additional rows after "octopus". Note that the new "bear" row with number 222 has not been rolled up with the existing bear-111 row, because the new data is held in a separate segment.
```
dsql> select * from "updates-tutorial";
┌──────────────────────────┬──────────┬───────┬────────┐
│ __time │ animal │ count │ number │
├──────────────────────────┼──────────┼───────┼────────┤
│ 2018-01-01T01:01:00.000Z │ lion │ 2 │ 400 │
│ 2018-01-01T03:01:00.000Z │ aardvark │ 1 │ 9999 │
│ 2018-01-01T04:01:00.000Z │ bear │ 1 │ 111 │
│ 2018-01-01T05:01:00.000Z │ mongoose │ 1 │ 737 │
│ 2018-01-01T06:01:00.000Z │ snake │ 1 │ 1234 │
│ 2018-01-01T07:01:00.000Z │ octopus │ 1 │ 115 │
│ 2018-01-01T04:01:00.000Z │ bear │ 1 │ 222 │
│ 2018-01-01T09:01:00.000Z │ falcon │ 1 │ 1241 │
└──────────────────────────┴──────────┴───────┴────────┘
Retrieved 8 rows in 0.02s.
```
If we run a GroupBy query instead of a `select *`, we can see that the "bear" rows will group together at query time:
```
dsql> select __time, animal, SUM("count"), SUM("number") from "updates-tutorial" group by __time, animal;
┌──────────────────────────┬──────────┬────────┬────────┐
│ __time │ animal │ EXPR$2 │ EXPR$3 │
├──────────────────────────┼──────────┼────────┼────────┤
│ 2018-01-01T01:01:00.000Z │ lion │ 2 │ 400 │
│ 2018-01-01T03:01:00.000Z │ aardvark │ 1 │ 9999 │
│ 2018-01-01T04:01:00.000Z │ bear │ 2 │ 333 │
│ 2018-01-01T05:01:00.000Z │ mongoose │ 1 │ 737 │
│ 2018-01-01T06:01:00.000Z │ snake │ 1 │ 1234 │
│ 2018-01-01T07:01:00.000Z │ octopus │ 1 │ 115 │
│ 2018-01-01T09:01:00.000Z │ falcon │ 1 │ 1241 │
└──────────────────────────┴──────────┴────────┴────────┘
Retrieved 7 rows in 0.23s.
```


@ -1,4 +1,5 @@
#
#!/bin/bash -eu
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@ -15,10 +16,21 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
druid.service=druid/coordinator
druid.port=8081
PWD="$(pwd)"
WHEREAMI="$(dirname "$0")"
WHEREAMI="$(cd "$WHEREAMI" && pwd)"
druid.coordinator.startDelay=PT10S
druid.coordinator.period=PT5S
RLWRAP=""
if [ -x "$(command -v rlwrap)" ]
then
RLWRAP="rlwrap -C dsql"
fi
if [ -x "$(command -v python2)" ]
then
exec $RLWRAP python2 "$WHEREAMI/dsql-main" "$@"
else
exec $RLWRAP "$WHEREAMI/dsql-main" "$@"
fi

453
examples/bin/dsql-main Executable file

@ -0,0 +1,453 @@
#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
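# Simple command-line SQL client for Druid: POSTs queries to the SQL endpoint
# (default http://localhost:8082/druid/v2/sql/) and streams the results back in
# table, CSV, TSV, or JSON form, either for a single --execute query or from an
# interactive prompt.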
from __future__ import print_function
import argparse
import base64
import collections
import csv
import errno
import json
import numbers
import re
import ssl
import sys
import time
import unicodedata
import urllib2
class DruidSqlException(Exception):
def write_to(self, f):
f.write('\x1b[31m')
f.write(self.message if self.message else "Query failed")
f.write('\x1b[0m')
f.write('\n')
f.flush()
def do_query(url, sql, context, timeout, user, password, ignore_ssl_verification, ca_file, ca_path):
json_decoder = json.JSONDecoder(object_pairs_hook=collections.OrderedDict)
try:
sql_json = json.dumps({'query' : sql, 'context' : context})
# SSL stuff
ssl_context = None;
if (ignore_ssl_verification or ca_file != None or ca_path != None):
ssl_context = ssl.create_default_context()
if (ignore_ssl_verification):
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE
else:
ssl_context.load_verify_locations(cafile=ca_file, capath=ca_path)
req = urllib2.Request(url, sql_json, {'Content-Type' : 'application/json'})
if timeout <= 0:
timeout = None
if (user and password):
basicAuthEncoding = base64.b64encode('%s:%s' % (user, password))
req.add_header("Authorization", "Basic %s" % basicAuthEncoding)
response = urllib2.urlopen(req, None, timeout, context=ssl_context)
first_chunk = True
eof = False
buf = ''
while not eof or len(buf) > 0:
while True:
try:
# Remove starting ','
buf = buf.lstrip(',')
obj, sz = json_decoder.raw_decode(buf)
yield obj
buf = buf[sz:]
except ValueError as e:
# Maybe invalid JSON, maybe partial object; it's hard to tell with this library.
if eof and buf.rstrip() == ']':
# Stream done and all objects read.
buf = ''
break
elif eof or len(buf) > 256 * 1024:
# If we read more than 256KB or if it's eof then report the parse error.
raise
else:
# Stop reading objects, get more from the stream instead.
break
# Read more from the http stream
if not eof:
chunk = response.read(8192)
if chunk:
buf = buf + chunk
if first_chunk:
# Remove starting '['
buf = buf.lstrip('[')
else:
# Stream done. Keep reading objects out of buf though.
eof = True
except urllib2.URLError as e:
raise_friendly_error(e)
def raise_friendly_error(e):
if isinstance(e, urllib2.HTTPError):
text = e.read().strip()
error_obj = {}
try:
error_obj = dict(json.loads(text))
except:
pass
if e.code == 500 and 'errorMessage' in error_obj:
error_text = ''
if error_obj['error'] != 'Unknown exception':
error_text = error_text + error_obj['error'] + ': '
if error_obj['errorClass']:
error_text = error_text + str(error_obj['errorClass']) + ': '
error_text = error_text + str(error_obj['errorMessage'])
if error_obj['host']:
error_text = error_text + ' (' + str(error_obj['host']) + ')'
raise DruidSqlException(error_text)
else:
raise DruidSqlException("HTTP Error {0}: {1}\n{2}".format(e.code, e.reason, text))
else:
raise DruidSqlException(str(e))
def to_utf8(value):
if value is None:
return ""
elif isinstance(value, unicode):
return value.encode("utf-8")
else:
return str(value)
def to_tsv(values, delimiter):
return delimiter.join(to_utf8(v).replace(delimiter, '') for v in values)
def print_csv(rows, header):
csv_writer = csv.writer(sys.stdout)
first = True
for row in rows:
if first and header:
csv_writer.writerow(list(to_utf8(k) for k in row.keys()))
first = False
values = []
for key, value in row.iteritems():
values.append(to_utf8(value))
csv_writer.writerow(values)
def print_tsv(rows, header, tsv_delimiter):
first = True
for row in rows:
if first and header:
print(to_tsv(row.keys(), tsv_delimiter))
first = False
values = []
for key, value in row.iteritems():
values.append(value)
print(to_tsv(values, tsv_delimiter))
def print_json(rows):
for row in rows:
print(json.dumps(row))
def table_to_printable_value(value):
# Unicode string, trimmed with control characters removed
if value is None:
return u"NULL"
else:
return to_utf8(value).strip().decode('utf-8').translate(dict.fromkeys(range(32)))
def table_compute_string_width(v):
normalized = unicodedata.normalize('NFC', v)
width = 0
for c in normalized:
ccategory = unicodedata.category(c)
cwidth = unicodedata.east_asian_width(c)
if ccategory == 'Cf':
# Formatting control, zero width
pass
elif cwidth == 'F' or cwidth == 'W':
# Double-wide character, prints in two columns
width = width + 2
else:
# All other characters
width = width + 1
return width
def table_compute_column_widths(row_buffer):
widths = None
for values in row_buffer:
values_widths = [table_compute_string_width(v) for v in values]
if not widths:
widths = values_widths
else:
i = 0
for v in values:
widths[i] = max(widths[i], values_widths[i])
i = i + 1
return widths
def table_print_row(values, column_widths, column_types):
vertical_line = u'\u2502'.encode('utf-8')
for i in xrange(0, len(values)):
padding = ' ' * max(0, column_widths[i] - table_compute_string_width(values[i]))
if column_types and column_types[i] == 'n':
print(vertical_line + ' ' + padding + values[i].encode('utf-8') + ' ', end="")
else:
print(vertical_line + ' ' + values[i].encode('utf-8') + padding + ' ', end="")
print(vertical_line)
def table_print_header(values, column_widths):
# Line 1
left_corner = u'\u250C'.encode('utf-8')
horizontal_line = u'\u2500'.encode('utf-8')
top_tee = u'\u252C'.encode('utf-8')
right_corner = u'\u2510'.encode('utf-8')
print(left_corner, end="")
for i in xrange(0, len(column_widths)):
print(horizontal_line * max(0, column_widths[i] + 2), end="")
if i + 1 < len(column_widths):
print(top_tee, end="")
print(right_corner)
# Line 2
table_print_row(values, column_widths, None)
# Line 3
left_tee = u'\u251C'.encode('utf-8')
cross = u'\u253C'.encode('utf-8')
right_tee = u'\u2524'.encode('utf-8')
print(left_tee, end="")
for i in xrange(0, len(column_widths)):
print(horizontal_line * max(0, column_widths[i] + 2), end="")
if i + 1 < len(column_widths):
print(cross, end="")
print(right_tee)
def table_print_bottom(column_widths):
left_corner = u'\u2514'.encode('utf-8')
right_corner = u'\u2518'.encode('utf-8')
bottom_tee = u'\u2534'.encode('utf-8')
horizontal_line = u'\u2500'.encode('utf-8')
print(left_corner, end="")
for i in xrange(0, len(column_widths)):
print(horizontal_line * max(0, column_widths[i] + 2), end="")
if i + 1 < len(column_widths):
print(bottom_tee, end="")
print(right_corner)
def table_print_row_buffer(row_buffer, column_widths, column_types):
first = True
for values in row_buffer:
if first:
table_print_header(values, column_widths)
first = False
else:
table_print_row(values, column_widths, column_types)
def print_table(rows):
start = time.time()
nrows = 0
first = True
# Buffer some rows before printing.
rows_to_buffer = 500
row_buffer = []
column_types = []
column_widths = None
for row in rows:
nrows = nrows + 1
if first:
row_buffer.append([table_to_printable_value(k) for k in row.keys()])
for k in row.keys():
if isinstance(row[k], numbers.Number):
column_types.append('n')
else:
column_types.append('s')
first = False
values = [table_to_printable_value(v) for k, v in row.iteritems()]
if rows_to_buffer > 0:
row_buffer.append(values)
rows_to_buffer = rows_to_buffer - 1
else:
if row_buffer:
column_widths = table_compute_column_widths(row_buffer)
table_print_row_buffer(row_buffer, column_widths, column_types)
del row_buffer[:]
table_print_row(values, column_widths, column_types)
if row_buffer:
column_widths = table_compute_column_widths(row_buffer)
table_print_row_buffer(row_buffer, column_widths, column_types)
if column_widths:
table_print_bottom(column_widths)
print("Retrieved {0:,d} row{1:s} in {2:.2f}s.".format(nrows, 's' if nrows != 1 else '', time.time() - start))
print("")
def display_query(url, sql, context, args):
rows = do_query(url, sql, context, args.timeout, args.user, args.password, args.ignore_ssl_verification, args.cafile, args.capath)
if args.format == 'csv':
print_csv(rows, args.header)
elif args.format == 'tsv':
print_tsv(rows, args.header, args.tsv_delimiter)
elif args.format == 'json':
print_json(rows)
elif args.format == 'table':
print_table(rows)
def sql_escape(s):
if s is None:
return "''"
elif isinstance(s, unicode):
ustr = s
else:
ustr = str(s).decode('utf-8')
escaped = [u"U&'"]
for c in ustr:
ccategory = unicodedata.category(c)
if ccategory.startswith('L') or ccategory.startswith('N') or c == ' ':
escaped.append(c)
else:
escaped.append(u'\\')
escaped.append('%04x' % ord(c))
escaped.append("'")
return ''.join(escaped)
def main():
parser = argparse.ArgumentParser(description='Druid SQL command-line client.')
parser.add_argument('--host', '-H', type=str, default='http://localhost:8082/', help='Broker host or url')
parser.add_argument('--timeout', type=int, default=0, help='Timeout in seconds, 0 for no timeout')
parser.add_argument('--format', type=str, default='table', choices=('csv', 'tsv', 'json', 'table'), help='Result format')
parser.add_argument('--header', action='store_true', help='Include header row for formats "csv" and "tsv"')
parser.add_argument('--tsv-delimiter', type=str, default='\t', help='Delimiter for format "tsv"')
parser.add_argument('--context-option', '-c', type=str, action='append', help='Set context option for this connection')
parser.add_argument('--execute', '-e', type=str, help='Execute single SQL query')
parser.add_argument('--user', '-u', type=str, help='Username for HTTP basic auth')
parser.add_argument('--password', '-p', type=str, help='Password for HTTP basic auth')
parser.add_argument('--ignore-ssl-verification', '-k', action='store_true', default=False, help='Skip verification of SSL certificates.')
parser.add_argument('--cafile', type=str, help='Path to SSL CA file for validating server certificates. See load_verify_locations() in https://docs.python.org/2/library/ssl.html#ssl.SSLContext.')
parser.add_argument('--capath', type=str, help='SSL CA path for validating server certificates. See load_verify_locations() in https://docs.python.org/2/library/ssl.html#ssl.SSLContext.')
args = parser.parse_args()
# Build broker URL
url = args.host.rstrip('/') + '/druid/v2/sql/'
if not url.startswith('http:') and not url.startswith('https:'):
url = 'http://' + url
# Build context
context = {}
if args.context_option:
for opt in args.context_option:
kv = opt.split("=", 1)
if len(kv) != 2:
raise ValueError('Invalid context option, should be key=value: ' + opt)
if re.match(r"^\d+$", kv[1]):
context[kv[0]] = long(kv[1])
else:
context[kv[0]] = kv[1]
if args.execute:
display_query(url, args.execute, context, args)
else:
# interactive mode
print("Welcome to dsql, the command-line client for Druid SQL.")
print("Type \"\h\" for help.")
while True:
sql = ''
while not sql.endswith(';'):
prompt = "dsql> " if sql == '' else 'more> '
try:
more_sql = raw_input(prompt)
except EOFError:
sys.stdout.write('\n')
sys.exit(1)
if sql == '' and more_sql.startswith('\\'):
# backslash command
dmatch = re.match(r'^\\d(S?)(\+?)(\s+.*?|)\s*$', more_sql)
if dmatch:
include_system = dmatch.group(1)
extra_info = dmatch.group(2)
arg = dmatch.group(3).strip()
if arg:
sql = "SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = " + sql_escape(arg)
if not include_system:
sql = sql + " AND TABLE_SCHEMA = 'druid'"
# break to execute sql
break
else:
sql = "SELECT TABLE_SCHEMA, TABLE_NAME FROM INFORMATION_SCHEMA.TABLES";
if not include_system:
sql = sql + " WHERE TABLE_SCHEMA = 'druid'"
# break to execute sql
break
hmatch = re.match(r'^\\h\s*$', more_sql)
if hmatch:
print("Commands:")
print(" \d show tables")
print(" \dS show tables, including system tables")
print(" \d table_name describe table")
print(" \h show this help")
print(" \q exit this program")
print("Or enter a SQL query ending with a semicolon (;).")
continue
qmatch = re.match(r'^\\q\s*$', more_sql)
if qmatch:
sys.exit(0)
print("No such command: " + more_sql)
else:
sql = (sql + ' ' + more_sql).strip()
try:
display_query(url, sql.rstrip(';'), context, args)
except DruidSqlException as e:
e.write_to(sys.stdout)
except KeyboardInterrupt:
sys.stdout.write("Query interrupted\n")
sys.stdout.flush()
try:
main()
except DruidSqlException as e:
e.write_to(sys.stderr)
sys.exit(1)
except KeyboardInterrupt:
sys.exit(1)
except IOError as e:
if e.errno == errno.EPIPE:
sys.exit(1)
else:
raise


@ -1,4 +1,5 @@
#
#!/bin/bash -eu
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@ -15,12 +16,14 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
druid.service=druid/overlord
druid.port=8090
PWD="$(pwd)"
WHEREAMI="$(dirname "$0")"
WHEREAMI="$(cd "$WHEREAMI" && pwd)"
druid.indexer.queue.startDelay=PT5S
druid.indexer.runner.type=remote
druid.indexer.storage.type=metadata
if [ -x "$(command -v python2)" ]
then
exec python2 "$WHEREAMI/post-index-task-main" "$@"
else
exec "$WHEREAMI/post-index-task-main" "$@"
fi

176
examples/bin/post-index-task-main Executable file

@ -0,0 +1,176 @@
#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
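# Submits an indexing task spec to the Overlord (/druid/indexer/v1/task), waits
# for the task to reach SUCCESS or FAILED, then polls the Coordinator's load
# status endpoint until the resulting datasource is fully loaded.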
import argparse
import base64
import json
import re
import sys
import time
import urllib2
import urlparse
def read_task_file(args):
with open(args.file, 'r') as f:
contents = f.read()
# We don't use the parsed data, but we want to throw early if it's invalid
try:
json.loads(contents)
except Exception, e:
sys.stderr.write('Invalid JSON in task file "{0}": {1}\n'.format(args.file, repr(e)))
sys.exit(1)
return contents
def add_basic_auth_header(args, req):
if (args.user is not None):
basic_auth_encoded = base64.b64encode('%s:%s' % (args.user, args.password))
req.add_header("Authorization", "Basic %s" % basic_auth_encoded)
# Keep trying until timeout_at, maybe die then
def post_task(args, task_json, timeout_at):
try:
url = args.url.rstrip("/") + "/druid/indexer/v1/task"
req = urllib2.Request(url, task_json, {'Content-Type' : 'application/json'})
add_basic_auth_header(args, req)
timeleft = timeout_at - time.time()
response_timeout = min(max(timeleft, 5), 10)
response = urllib2.urlopen(req, None, response_timeout)
return response.read().rstrip()
except urllib2.URLError as e:
if isinstance(e, urllib2.HTTPError) and e.code >= 400 and e.code <= 500:
# 4xx (problem with the request) or 500 (something wrong on the server)
raise_friendly_error(e)
elif time.time() >= timeout_at:
# No further retries
raise_friendly_error(e)
elif isinstance(e, urllib2.HTTPError) and e.code in [301, 302, 303, 305, 307] and \
e.info().getheader("Location") is not None:
# Set the new location in args.url so it can be used by await_task_completion and re-issue the request
location = urlparse.urlparse(e.info().getheader("Location"))
args.url = "{0}://{1}".format(location.scheme, location.netloc)
sys.stderr.write("Redirect response received, setting url to [{0}]\n".format(args.url))
return post_task(args, task_json, timeout_at)
else:
# If at first you don't succeed, try, try again!
sleep_time = 5
if not args.quiet:
extra = ''
if hasattr(e, 'read'):
extra = e.read().rstrip()
sys.stderr.write("Waiting up to {0}s for indexing service [{1}] to become available. [Got: {2} {3}]".format(max(sleep_time, int(timeout_at - time.time())), args.url, str(e), extra).rstrip())
sys.stderr.write("\n")
time.sleep(sleep_time)
return post_task(args, task_json, timeout_at)
# Keep trying until timeout_at, maybe die then
def await_task_completion(args, task_id, timeout_at):
while True:
url = args.url.rstrip("/") + "/druid/indexer/v1/task/{0}/status".format(task_id)
req = urllib2.Request(url)
add_basic_auth_header(args, req)
timeleft = timeout_at - time.time()
response_timeout = min(max(timeleft, 5), 10)
response = urllib2.urlopen(req, None, response_timeout)
response_obj = json.loads(response.read())
response_status_code = response_obj["status"]["statusCode"]
if response_status_code in ['SUCCESS', 'FAILED']:
return response_status_code
else:
if time.time() < timeout_at:
if not args.quiet:
sys.stderr.write("Task {0} still running...\n".format(task_id))
timeleft = timeout_at - time.time()
time.sleep(min(5, timeleft))
else:
raise Exception("Task {0} did not finish in time!".format(task_id))
def raise_friendly_error(e):
if isinstance(e, urllib2.HTTPError):
text = e.read().strip()
reresult = re.search(r'<pre>(.*?)</pre>', text, re.DOTALL)
if reresult:
text = reresult.group(1).strip()
raise Exception("HTTP Error {0}: {1}, check overlord log for more details.\n{2}".format(e.code, e.reason, text))
raise e
def await_load_completion(args, datasource, timeout_at):
while True:
url = args.coordinator_url.rstrip("/") + "/druid/coordinator/v1/loadstatus"
req = urllib2.Request(url)
add_basic_auth_header(args, req)
timeleft = timeout_at - time.time()
response_timeout = min(max(timeleft, 5), 10)
response = urllib2.urlopen(req, None, response_timeout)
response_obj = json.loads(response.read())
load_status = response_obj.get(datasource, 0.0)
if load_status >= 100.0:
sys.stderr.write("{0} loading complete! You may now query your data\n".format(datasource))
return
else:
if time.time() < timeout_at:
if not args.quiet:
sys.stderr.write("{0} is {1}% finished loading...\n".format(datasource, load_status))
timeleft = timeout_at - time.time()
time.sleep(min(5, timeleft))
else:
raise Exception("{0} was not loaded in time!".format(datasource))
def main():
parser = argparse.ArgumentParser(description='Post Druid indexing tasks.')
parser.add_argument('--url', '-u', metavar='url', type=str, default='http://localhost:8090/', help='Druid Overlord url')
parser.add_argument('--coordinator-url', type=str, default='http://localhost:8081/', help='Druid Coordinator url')
parser.add_argument('--file', '-f', type=str, required=True, help='Query JSON file')
parser.add_argument('--submit-timeout', type=int, default=120, help='Timeout (in seconds) for submitting tasks')
parser.add_argument('--complete-timeout', type=int, default=14400, help='Timeout (in seconds) for completing tasks')
parser.add_argument('--load-timeout', type=int, default=14400, help='Timeout (in seconds) for waiting for tasks to load')
parser.add_argument('--quiet', '-q', action='store_true', help='Suppress retryable errors')
parser.add_argument('--user', type=str, default=None, help='Basic auth username')
parser.add_argument('--password', type=str, default=None, help='Basic auth password')
args = parser.parse_args()
submit_timeout_at = time.time() + args.submit_timeout
complete_timeout_at = time.time() + args.complete_timeout
task_contents = read_task_file(args)
task_json = json.loads(task_contents)
if task_json['type'] == "compact":
datasource = task_json['dataSource']
else:
datasource = json.loads(task_contents)["spec"]["dataSchema"]["dataSource"]
sys.stderr.write("Beginning indexing data for {0}\n".format(datasource))
task_id = json.loads(post_task(args, task_contents, submit_timeout_at))["task"]
sys.stderr.write('\033[1m' + "Task started: " + '\033[0m' + "{0}\n".format(task_id))
sys.stderr.write('\033[1m' + "Task log: " + '\033[0m' + "{0}/druid/indexer/v1/task/{1}/log\n".format(args.url.rstrip("/"),task_id))
sys.stderr.write('\033[1m' + "Task status: " + '\033[0m' + "{0}/druid/indexer/v1/task/{1}/status\n".format(args.url.rstrip("/"),task_id))
task_status = await_task_completion(args, task_id, complete_timeout_at)
sys.stderr.write("Task finished with status: {0}\n".format(task_status))
if task_status != 'SUCCESS':
sys.exit(1)
sys.stderr.write("Completed indexing data for {0}. Now loading indexed data onto the cluster...\n".format(datasource))
load_timeout_at = time.time() + args.load_timeout
await_load_completion(args, datasource, load_timeout_at)
try:
main()
except KeyboardInterrupt:
sys.exit(1)

43
examples/bin/run-druid Executable file

@ -0,0 +1,43 @@
#!/bin/bash -eu
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
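# Starts a single Druid service in the foreground.
# Usage: run-druid <service> [conf-dir]
# Reads <conf-dir>/druid/<service>/jvm.config for JVM flags and
# <conf-dir>/druid/<service>/main.config for program arguments, then execs java
# with the service config, _common config, and the distribution's lib/ on the classpath.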
if [ "$#" -gt 2 ]
then
echo "usage: $0 <service> [conf-dir]" >&2
exit 1
fi
PWD="$(pwd)"
WHEREAMI="$(dirname "$0")"
WHATAMI="$1"
if [ "$#" -eq 1 ]
then
CONFDIR="$WHEREAMI/../conf"
else
CONFDIR="$2"
fi
CONFDIR="$(cd "$CONFDIR" && pwd)/druid"
WHEREAMI="$(cd "$WHEREAMI" && pwd)"
cd "$WHEREAMI/.."
exec java `cat "$CONFDIR"/"$WHATAMI"/jvm.config | xargs` \
-cp "$CONFDIR"/"$WHATAMI":"$CONFDIR"/_common:"$CONFDIR"/_common/hadoop-xml:"$WHEREAMI/../lib/*" \
`cat "$CONFDIR"/$WHATAMI/main.config | xargs`

43
examples/bin/run-zk Executable file

@ -0,0 +1,43 @@
#!/bin/bash -eu
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
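# Starts a ZooKeeper server in the foreground.
# Usage: run-zk [conf-dir]
# Reads <conf-dir>/zk/jvm.config for JVM flags and uses <conf-dir>/zk/zoo.cfg as
# the ZooKeeper configuration, with the distribution's zk jars on the classpath.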
if [ "$#" -gt 1 ]
then
echo "usage: $0 [conf-dir]" >&2
exit 1
fi
PWD="$(pwd)"
WHEREAMI="$(dirname "$0")"
if [ "$#" -lt 1 ] || [ "x$1" = "x" ]
then
CONFDIR="$WHEREAMI"/../conf
else
CONFDIR="$1"
fi
CONFDIR="$(cd "$CONFDIR" && pwd)/zk"
WHEREAMI="$(cd "$WHEREAMI" && pwd)"
cd "$WHEREAMI/.."
exec java `cat "$CONFDIR"/jvm.config | xargs` \
-cp "$WHEREAMI/../zk/lib/*:$WHEREAMI/../zk/*:$CONFDIR" \
org.apache.zookeeper.server.quorum.QuorumPeerMain \
"$CONFDIR"/zoo.cfg

70
examples/bin/service Executable file

@ -0,0 +1,70 @@
#!/usr/bin/env perl
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
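# Control client for the 'supervise' script: sends a restart ('k <service>') or
# shutdown ('d') command over the control fifo in <var dir>/sv, or tails a
# service's log file.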
use strict;
use warnings;
use Cwd qw/realpath/;
use Fcntl;
use File::Basename;
use FindBin;
use Getopt::Long qw/:config require_order gnu_compat/;
sub usage
{
die "usage: $0 (--restart <service> | --tail <service> | --down) [-d <var dir>]\n";
}
# Parse arguments
my %opt = (
'vardir' => realpath("$FindBin::Bin/../var"),
);
usage() unless GetOptions(\%opt, 'command=s', 'restart=s', 'down', 'tail=s', 'vardir|d=s');
my $svdir = "$opt{vardir}/sv";
my $command;
if ($opt{command}) {
usage() if $command;
$command = $opt{command};
}
if ($opt{down}) {
usage() if $command;
$command = 'd';
}
if ($opt{restart}) {
usage() if $command;
$command = "k $opt{restart}";
}
if ($opt{tail}) {
usage() if $command;
exec "tail", "-f", "$svdir/$opt{tail}.log"
or die "exec failed: $!\n";
}
usage() unless $command;
my $fifofile = "$svdir/.ctrl";
sysopen my $fifofh, $fifofile, O_WRONLY or die "Can't open control fifo, perhaps supervise is not running: $!\n";
print $fifofh "$command\n";
close $fifofh;

380
examples/bin/supervise Executable file

@ -0,0 +1,380 @@
#!/usr/bin/env perl
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
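# Minimal process supervisor used by the quickstart: reads a config file listing
# commands to run, starts them, restarts them if they exit, logs their output
# under <var dir>/sv, and accepts restart/shutdown requests over a control fifo
# (written by the companion 'service' script).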
use strict;
use warnings;
use Cwd qw/realpath/;
use POSIX qw/:sys_wait_h mkfifo setsid/;
use Fcntl qw/:DEFAULT :flock/;
use Getopt::Long qw/:config require_order gnu_compat/;
use FindBin;
use File::Spec;
use File::Copy;
sub logdie($)
{
my ($msg) = @_;
chomp $msg;
die "[" . (scalar localtime()) . "] $msg\n";
}
sub logit($)
{
my ($msg) = @_;
chomp $msg;
warn "[" . (scalar localtime()) . "] $msg\n";
}
sub usage
{
die "usage: $0 -c <conf file> [-d <var dir>] [-t <kill timeout>] [--svlogd <optional conf file>]\n";
}
sub read_config_file
{
my ($config_file) = @_;
open my $config_fh, "<", $config_file
or die "open $config_file: $!";
my @commands;
my @verify;
my $kill_timeout;
while (my $line = <$config_fh>) {
chomp $line;
next if $line =~ /^(\s*\#.*|\s*)$/;
if ($line =~ /^(:verify|:kill-timeout|(?:\!p[0-9]+\s+)?[^:]\S+)\s+(.+)$/) {
my $name = $1;
my $order = 50;
my $command = $2;
if ($name =~ /^(?:\!p([0-9]+)\s+)(.*)$/) {
$order = $1;
$name = $2;
}
if ($name eq ':verify') {
push @verify, $command;
} elsif ($name eq ':kill-timeout') {
$kill_timeout = int($command);
} else {
die "Duplicate command: $line\n" if grep { $_->{name} eq $name } @commands;
push @commands, {
name => $name,
command => $command,
order => $order, # Stop order for this command
pid => 0, # Current pid, or 0 if not running
down => 0, # Time the proc should be down until
killed => 0, # Signal we sent to this process
restarting => 0, # True if this command is currently restarting
};
}
} else {
die "Syntax error: $line\n";
}
}
close $config_fh;
return { commands => \@commands, verify => \@verify, 'kill-timeout' => $kill_timeout };
}
sub stringify_exit_status
{
my ($status) = @_;
my $string;
my $signal = $status & 127;
my $cored = $status & 128;
my $code = $status >> 8;
if ($signal) {
$string = "signal = $signal";
} else {
$string = "exited = $code";
}
if ($cored) {
$string = $string . ", dumped core";
}
return $string;
}
sub open_control_fifo
{
my ($svdir) = @_;
my $fifofile = "$svdir/.ctrl";
if (-e $fifofile) {
unlink $fifofile or die "Cannot remove fifo: $fifofile\n";
}
mkfifo($fifofile, 0700) or die "Cannot create fifo: $fifofile\n";
sysopen my $fifofh, $fifofile, O_NONBLOCK | O_RDWR or die "Cannot open fifo for reading: $fifofile\n";
return $fifofh;
}
sub pretty
{
my ($text, $color) = @_;
if (-t STDERR) {
if ($color eq 'bold') {
return "\x1b[1m$text\x1b[0m";
} elsif ($color eq 'red') {
return "\x1b[31m\x1b[1m$text\x1b[0m";
} else {
return $text;
}
} else {
return $text;
}
}
my @commands;
# If nonzero we should be exiting. -1 means exit without signal, >0 means exit with signal
my $killed = 0;
# If >0 then kill -9 all procs at this time
my $killkill = 0;
# Current proc order we're stopping. Ignored unless $killed is nonzero
my $stopping = 100;
# We'll do our own reaping
$SIG{CHLD} = sub {};
# Redirect stderr to stdout
open STDERR, ">&STDOUT" or die;
# Parse arguments
my %opt = (
'chdir' => realpath("$FindBin::Bin/.."),
'vardir' => realpath("$FindBin::Bin/../var"),
'kill-timeout' => 360,
);
usage() unless GetOptions(
\%opt,
'conf|c=s',
'vardir|d=s',
'kill-timeout|t=i',
'chdir=s',
'svlogd:s'
);
usage() unless $opt{'conf'} && $opt{'vardir'};
# Read config file
my $config = read_config_file($opt{'conf'});
@commands = @{$config->{commands}};
if (!@commands) {
die "Nothing to run.\n";
}
# Potentially override --kill-timeout
if (defined $config->{'kill-timeout'}) {
$opt{'kill-timeout'} = $config->{'kill-timeout'};
}
# Remember where vardir, svdir are after chdiring
my $vardir = File::Spec->rel2abs($opt{vardir});
my $svdir = "$vardir/sv";
# chdir to the root of the distribution (or whereever)
chdir($opt{chdir}) or die "chdir[$opt{chdir}] failed: $!\n";
# Create vardir with tmp/
if (! -e "$vardir/tmp") {
system("mkdir -p \Q$vardir\E/tmp") == 0 or die "mkdir $vardir/tmp failed: $!\n";
}
# Create svdir
if (! -e $svdir) {
system("mkdir -p \Q$svdir\E") == 0 or die "mkdir $svdir failed: $!\n";
}
# Lock svdir and keep it locked until we exit
my $lockfile = "$svdir/.lock";
open my $lockfh, ">", $lockfile or die "Cannot write to svdir, please check permissions: $svdir\n";
flock($lockfh, LOCK_EX | LOCK_NB) or die "Cannot lock svdir, maybe another 'supervise' is running: $svdir\n";
# Create control fifo in svdir
my $fifofh = open_control_fifo($svdir);
# Run verification commands
for my $verify_cmd (@{$config->{verify}}) {
system($verify_cmd) == 0 or exit 1;
}
# Catch killy signals and do an orderly shutdown
$SIG{HUP} = sub { if (!$killed) { $killed = 1; $killkill = time + $opt{'kill-timeout'}; } };
$SIG{INT} = sub { if (!$killed) { $killed = 2; $killkill = time + $opt{'kill-timeout'}; } };
$SIG{TERM} = sub { if (!$killed) { $killed = 15; $killkill = time + $opt{'kill-timeout'}; } };
# Build up control fifo command over multiple sysreads, potentially
my $fifobuffer = '';
while (1) {
# Spawn new procs
if (!$killed) {
for my $command (grep { !$_->{pid} } @commands) {
if ($command->{down} < time) {
my $logfile = sprintf("%s%s", "$svdir/$command->{name}", defined $opt{'svlogd'} ? "" : ".log");
logit "Running command[" . pretty($command->{name}, 'bold') . "], logging to[$logfile]: $command->{command}";
if (my $pid = fork) {
$command->{pid} = $pid;
$command->{logfile} = $logfile;
} else {
setsid;
if (defined $opt{'svlogd'}) {
if (! -e $logfile) {
system("mkdir -p \Q$logfile\E") == 0 or logdie "mkdir $logfile failed: $!\n";
}
if ($opt{'svlogd'}) {
copy($opt{'svlogd'}, "$logfile/config") or logdie "Failed copying $opt{'svlogd'} to $logfile/config: $!";
} else {
open my $configfh, ">", "$logfile/config" or logdie "Cannot write svlogd config, please check permissions: $logfile/config\n";
print $configfh "s100000000\nn10\nN5\nt604800";
close $configfh;
}
open STDOUT, "|svlogd $logfile" or logdie "pipe to svlogd $logfile failed: $!\n";
} else {
open STDOUT, ">>", $logfile or logdie "open $logfile failed: $!\n";
}
open STDERR, ">&STDOUT" or logdie "redirecting stderr failed: $!\n";
exec('sh', '-c', "exec $command->{command}") or logdie "exec [$command->{command}] failed: $!";
}
}
}
}
# Reap dead procs
my $pid;
while (($pid = waitpid(-1, WNOHANG)) > 0) {
my $status = $?;
my ($command) = (grep { $_->{pid} eq $pid } @commands);
if ($command) {
$command->{pid} = 0;
$command->{down} = time + 2;
logit "Command[" . pretty($command->{name}, 'bold') . "] exited (pid = $pid, " . stringify_exit_status($status) . ")";
if ($status && !$killed && !$command->{restarting}) {
# Unexpected exit
logit "Command[" . pretty($command->{name}, 'bold') . "] " . pretty("failed", "red") . ", see logfile for more details: $command->{logfile}";
}
$command->{restarting} = 0;
} else {
logit "ERR: Reaped unknown command (pid = $pid, " . stringify_exit_status($status) . ")";
}
}
# Kill procs, maybe
if ($killed) {
my $should_killkill = time > $killkill;
# Update stopping position, maybe
if ($should_killkill) {
$stopping = 0;
} else {
my $maxorder = 0;
for my $command (grep { $_->{pid} } @commands) {
if ($command->{order} > $maxorder) {
$maxorder = $command->{order};
}
}
if ($maxorder < $stopping) {
$stopping = $maxorder;
}
}
for my $command (grep { $_->{pid} && $_->{order} >= $stopping } @commands) {
my $want_signal;
if ($command->{killed} == 9 || $should_killkill) {
$want_signal = 9;
} else {
$want_signal = 15;
}
if ($command->{killed} != $want_signal) {
if ($want_signal != 9) {
my $kt = $opt{'kill-timeout'};
logit "Sending signal[$want_signal] to command[" . pretty($command->{name}, 'bold') . "] (timeout ${kt}s).";
} else {
logit "Sending signal[$want_signal] to command[" . pretty($command->{name}, 'bold') . "].";
}
kill $want_signal, $command->{pid} or logit "WARN: Could not signal pid: $command->{pid}";
$command->{killed} = $want_signal;
}
}
}
# Kill ourselves, maybe
if ($killed && ! grep { $_->{pid} } @commands) {
logit "Exiting.";
$SIG{HUP} = $SIG{INT} = $SIG{TERM} = 'DEFAULT';
if ($killed > 0) {
kill $killed, $$;
exit 1;
} else {
# Normal exit
exit 0;
}
}
# Be controlled, maybe
my $fifostr = "";
if (sysread $fifofh, $fifostr, 4096) {
$fifobuffer .= $fifostr;
while ($fifobuffer =~ /^([^\n]*)\n(.*)/s) {
my $fifocmd = $1;
$fifobuffer = $2;
if ($fifocmd =~ /^k (.+)$/ && !$killed) {
my $name = $1;
my ($command) = grep { $_->{name} eq $name && $_->{pid} } @commands;
if ($command) {
logit "Restarting command[" . pretty($name, "bold") . "].";
if (kill TERM => $command->{pid}) {
$command->{restarting} = 1;
} else {
logit "WARN: Could not signal pid: $command->{pid}"
}
} else {
logit "Asked to restart unknown command[" . pretty($name, "bold") . "], ignoring.";
}
} elsif ($fifocmd eq 'd') {
# -1 means exit without signal
$killed = -1;
$killkill = time + $opt{'kill-timeout'}
} else {
logit "Received unknown control command, ignoring.";
}
}
}
sleep 1;
}
exit 0;


@ -1,4 +1,5 @@
#
#!/usr/bin/env perl
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@ -15,21 +16,17 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
druid.service=druid/broker
druid.port=8082
use strict;
use warnings;
use Socket;
# HTTP server threads
druid.broker.http.numConnections=5
druid.server.http.numThreads=9
my @ports = (1527, 2181, 8081, 8082, 8083, 8090, 8091, 8200, 9095);
# Processing threads and buffers
druid.processing.buffer.sizeBytes=256000000
druid.processing.numThreads=2
# Query cache (we use a small local cache)
druid.broker.cache.useCache=true
druid.broker.cache.populateCache=true
druid.cache.type=local
druid.cache.sizeInBytes=10000000
my $tcp = getprotobyname("tcp");
for my $port (@ports) {
socket(my $sock, PF_INET, SOCK_STREAM, $tcp) or die "socket: $!";
setsockopt($sock, SOL_SOCKET, SO_REUSEADDR, pack("l", 1)) or die "setsockopt: $!";
bind($sock, sockaddr_in($port, INADDR_ANY)) or die "Cannot start up because port[$port] is already in use.\n";
close $sock;
}


@ -1,4 +1,5 @@
#
#!/usr/bin/env perl
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@ -15,18 +16,18 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
druid.service=druid/historical
druid.port=8083
use strict;
use warnings;
# HTTP server threads
druid.server.http.numThreads=9
my $java_version = qx[java -version 2>&1];
if ($?) {
die "Please install Java 8 or better!\n";
}
# Processing threads and buffers
druid.processing.buffer.sizeBytes=256000000
druid.processing.numThreads=2
# If we know it won't work, die. Otherwise hope for the best.
if ($java_version =~ /java version \"((\d+)\.(\d+).*?)\"/ && ($2 < 1 || $3 < 8)) {
die "Please upgrade to Java 8 or better! Your current version is: $1\n";
}
# Segment storage
druid.segmentCache.locations=[{"path":"var/druid/segment-cache","maxSize":300000000000}]
druid.server.maxSize=300000000000
exit 0;


@ -1,33 +0,0 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->
<Configuration status="WARN">
<Appenders>
<Console name="Console" target="SYSTEM_OUT">
<PatternLayout pattern="%d{ISO8601} %p [%t] %c - %m%n"/>
</Console>
</Appenders>
<Loggers>
<Root level="info">
<AppenderRef ref="Console"/>
</Root>
</Loggers>
</Configuration>


@ -1,38 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
druid.service=druid/middleManager
druid.port=8091
# Number of tasks per middleManager
druid.worker.capacity=3
# Task launch parameters
druid.indexer.runner.javaOpts=-server -Xmx2g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
druid.indexer.task.baseTaskDir=var/druid/task
# HTTP server threads
druid.server.http.numThreads=9
# Processing threads and buffers on Peons
druid.indexer.fork.property.druid.processing.buffer.sizeBytes=256000000
druid.indexer.fork.property.druid.processing.numThreads=2
# Hadoop indexing
druid.indexer.task.hadoopWorkingPath=var/druid/hadoop-tmp


@ -1,76 +0,0 @@
{
"dataSources" : {
"metrics-kafka" : {
"spec" : {
"dataSchema" : {
"dataSource" : "metrics-kafka",
"parser" : {
"type" : "string",
"parseSpec" : {
"timestampSpec" : {
"column" : "timestamp",
"format" : "auto"
},
"dimensionsSpec" : {
"dimensions" : [],
"dimensionExclusions" : [
"timestamp",
"value"
]
},
"format" : "json"
}
},
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "hour",
"queryGranularity" : "none"
},
"metricsSpec" : [
{
"type" : "count",
"name" : "count"
},
{
"name" : "value_sum",
"type" : "doubleSum",
"fieldName" : "value"
},
{
"fieldName" : "value",
"name" : "value_min",
"type" : "doubleMin"
},
{
"type" : "doubleMax",
"name" : "value_max",
"fieldName" : "value"
}
]
},
"ioConfig" : {
"type" : "realtime"
},
"tuningConfig" : {
"type" : "realtime",
"intermediatePersistPeriod" : "PT10M",
"windowPeriod" : "PT10M"
}
},
"properties" : {
"task.partitions" : "1",
"task.replicants" : "1",
"topicPattern" : "metrics"
}
}
},
"properties" : {
"zookeeper.connect" : "localhost",
"druid.discovery.curator.path" : "/druid/discovery",
"druid.selectors.indexing.serviceName" : "druid/overlord",
"commit.periodMillis" : "15000",
"consumer.numThreads" : "2",
"kafka.zookeeper.connect" : "localhost",
"kafka.group.id" : "tranquility-kafka"
}
}


@ -1,73 +0,0 @@
{
"dataSources" : {
"metrics" : {
"spec" : {
"dataSchema" : {
"dataSource" : "metrics",
"parser" : {
"type" : "string",
"parseSpec" : {
"timestampSpec" : {
"column" : "timestamp",
"format" : "auto"
},
"dimensionsSpec" : {
"dimensions" : [],
"dimensionExclusions" : [
"timestamp",
"value"
]
},
"format" : "json"
}
},
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "hour",
"queryGranularity" : "none"
},
"metricsSpec" : [
{
"type" : "count",
"name" : "count"
},
{
"name" : "value_sum",
"type" : "doubleSum",
"fieldName" : "value"
},
{
"fieldName" : "value",
"name" : "value_min",
"type" : "doubleMin"
},
{
"type" : "doubleMax",
"name" : "value_max",
"fieldName" : "value"
}
]
},
"ioConfig" : {
"type" : "realtime"
},
"tuningConfig" : {
"type" : "realtime",
"intermediatePersistPeriod" : "PT10M",
"windowPeriod" : "PT10M"
}
},
"properties" : {
"task.partitions" : "1",
"task.replicants" : "1"
}
}
},
"properties" : {
"zookeeper.connect" : "localhost",
"druid.discovery.curator.path" : "/druid/discovery",
"druid.selectors.indexing.serviceName" : "druid/overlord",
"http.port" : "8200",
"http.threads" : "9"
}
}


@ -0,0 +1,11 @@
{
"type": "compact",
"dataSource": "compaction-tutorial",
"interval": "2015-09-12/2015-09-13",
"tuningConfig" : {
"type" : "index",
"targetPartitionSize" : 5000000,
"maxRowsInMemory" : 25000,
"forceExtendableShardSpecs" : true
}
}


@ -0,0 +1,64 @@
{
"type" : "index",
"spec" : {
"dataSchema" : {
"dataSource" : "compaction-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
"channel",
"cityName",
"comment",
"countryIsoCode",
"countryName",
"isAnonymous",
"isMinor",
"isNew",
"isRobot",
"isUnpatrolled",
"metroCode",
"namespace",
"page",
"regionIsoCode",
"regionName",
"user",
{ "name": "added", "type": "long" },
{ "name": "deleted", "type": "long" },
{ "name": "delta", "type": "long" }
]
},
"timestampSpec": {
"column": "time",
"format": "iso"
}
}
},
"metricsSpec" : [],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "hour",
"queryGranularity" : "none",
"intervals" : ["2015-09-12/2015-09-13"],
"rollup" : false
}
},
"ioConfig" : {
"type" : "index",
"firehose" : {
"type" : "local",
"baseDir" : "quickstart/",
"filter" : "wikiticker-2015-09-12-sampled.json.gz"
},
"appendToExisting" : false
},
"tuningConfig" : {
"type" : "index",
"targetPartitionSize" : 5000000,
"maxRowsInMemory" : 25000,
"forceExtendableShardSpecs" : true
}
}
}


@ -1,18 +1,18 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# Licensed to Metamarkets Group Inc. (Metamarkets) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# regarding copyright ownership. Metamarkets licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
@ -24,12 +24,18 @@
# If you specify `druid.extensions.loadList=[]`, Druid won't load any extension from the file system.
# If you don't specify `druid.extensions.loadList`, Druid will load all the extensions under the root extension directory.
# More info: http://druid.io/docs/latest/operations/including-extensions.html
druid.extensions.loadList=[]
druid.extensions.loadList=["druid-hdfs-storage", "druid-kafka-indexing-service"]
# If you have a different version of Hadoop, place your Hadoop client jar files in your hadoop-dependencies directory
# and uncomment the line below to point to your directory.
#druid.extensions.hadoopDependenciesDir=/my/dir/hadoop-dependencies
#
# Hostname
#
druid.host=localhost
#
# Logging
#
@ -126,3 +132,9 @@ druid.indexing.doubleStorage=double
# Security
#
druid.server.hiddenProperties=["druid.s3.accessKey","druid.s3.secretKey","druid.metadata.storage.connector.password"]
#
# SQL
#
druid.sql.enable=true

View File

@ -0,0 +1,33 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
~ Licensed to Metamarkets Group Inc. (Metamarkets) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. Metamarkets licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->
<Configuration status="WARN">
<Appenders>
<Console name="Console" target="SYSTEM_OUT">
<PatternLayout pattern="%d{ISO8601} %p [%t] %c - %m%n"/>
</Console>
</Appenders>
<Loggers>
<Root level="info">
<AppenderRef ref="Console"/>
</Root>
</Loggers>
</Configuration>

View File

@ -0,0 +1 @@
io.druid.cli.Main server broker

View File

@ -0,0 +1,16 @@
druid.service=druid/broker
druid.port=8082
# HTTP server threads
druid.broker.http.numConnections=5
druid.server.http.numThreads=9
# Processing threads and buffers
druid.processing.buffer.sizeBytes=256000000
druid.processing.numThreads=2
# Query cache (we use a small local cache)
druid.broker.cache.useCache=true
druid.broker.cache.populateCache=true
druid.cache.type=local
druid.cache.sizeInBytes=10000000

View File

@ -0,0 +1 @@
io.druid.cli.Main server coordinator

View File

@ -0,0 +1,5 @@
druid.service=druid/coordinator
druid.port=8081
druid.coordinator.startDelay=PT10S
druid.coordinator.period=PT5S

View File

@ -0,0 +1 @@
io.druid.cli.Main server historical

View File

@ -0,0 +1,13 @@
druid.service=druid/historical
druid.port=8083
# HTTP server threads
druid.server.http.numThreads=9
# Processing threads and buffers
druid.processing.buffer.sizeBytes=256000000
druid.processing.numThreads=2
# Segment storage
druid.segmentCache.locations=[{"path":"var/druid/segment-cache","maxSize":300000000000}]
druid.server.maxSize=300000000000

View File

@ -0,0 +1 @@
io.druid.cli.Main server middleManager

View File

@ -0,0 +1,19 @@
druid.service=druid/middleManager
druid.port=8091
# Number of tasks per middleManager
druid.worker.capacity=3
# Task launch parameters
druid.indexer.runner.javaOpts=-server -Xmx2g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
druid.indexer.task.baseTaskDir=var/druid/task
# HTTP server threads
druid.server.http.numThreads=9
# Processing threads and buffers on Peons
druid.indexer.fork.property.druid.processing.buffer.sizeBytes=256000000
druid.indexer.fork.property.druid.processing.numThreads=2
# Hadoop indexing
druid.indexer.task.hadoopWorkingPath=var/druid/hadoop-tmp

View File

@ -0,0 +1 @@
io.druid.cli.Main server overlord

View File

@ -0,0 +1,7 @@
druid.service=druid/overlord
druid.port=8090
druid.indexer.queue.startDelay=PT5S
druid.indexer.runner.type=remote
druid.indexer.storage.type=metadata

View File

@ -0,0 +1,84 @@
{
"dataSources" : {
"wikipedia" : {
"spec" : {
"dataSchema" : {
"dataSource" : "wikipedia",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
"channel",
"cityName",
"comment",
"countryIsoCode",
"countryName",
"isAnonymous",
"isMinor",
"isNew",
"isRobot",
"isUnpatrolled",
"metroCode",
"namespace",
"page",
"regionIsoCode",
"regionName",
"user"
]
},
"timestampSpec": {
"column": "time",
"format": "iso"
}
}
},
"metricsSpec" : [
{
"name" : "added",
"type" : "longSum",
"fieldName" : "added"
},
{
"name" : "deleted",
"type" : "longSum",
"fieldName" : "deleted"
},
{
"name" : "delta",
"type" : "longSum",
"fieldName" : "delta"
}
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "day",
"queryGranularity" : "none",
"intervals" : ["2015-09-12/2015-09-13"],
"rollup" : false
}
},
"ioConfig" : {
"type" : "realtime"
},
"tuningConfig" : {
"type" : "realtime",
"intermediatePersistPeriod" : "PT10M",
"windowPeriod" : "P3650D"
}
},
"properties" : {
"task.partitions" : "1",
"task.replicants" : "1"
}
}
},
"properties" : {
"zookeeper.connect" : "localhost",
"druid.discovery.curator.path" : "/druid/discovery",
"druid.selectors.indexing.serviceName" : "druid/overlord",
"http.port" : "8200",
"http.threads" : "9"
}
}
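With Tranquility Server running against this config, events for the wikipedia dataSource can be pushed over HTTP on the configured port (8200). A minimal sketch, assuming a local file of JSON events whose rows carry the "time" column named in the timestampSpec above (the file name is a placeholder):
curl -X POST -H 'Content-Type: application/json' --data-binary @wikipedia-events.json http://localhost:8200/v1/post/wikipedia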

View File

@ -0,0 +1,13 @@
:verify bin/verify-java
:verify bin/verify-default-ports
:kill-timeout 10
!p10 zk bin/run-zk quickstart/tutorial/conf
coordinator bin/run-druid coordinator quickstart/tutorial/conf
broker bin/run-druid broker quickstart/tutorial/conf
historical bin/run-druid historical quickstart/tutorial/conf
!p80 overlord bin/run-druid overlord quickstart/tutorial/conf
!p90 middleManager bin/run-druid middleManager quickstart/tutorial/conf
# Uncomment to use Tranquility Server
#!p95 tranquility-server tranquility/bin/tranquility server -configFile quickstart/tutorial/conf/tranquility/wikipedia-server.json -Ddruid.extensions.loadList=[]
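This supervise config brings up ZooKeeper and the five Druid services with the quickstart/tutorial configuration. A minimal sketch of launching it from the distribution root; the config's actual path is not shown in this diff, so the path below is illustrative:
bin/supervise -c quickstart/tutorial/conf/tutorial-cluster.conf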

View File

@ -0,0 +1,4 @@
-server
-Xms128m
-Xmx128m
-Duser.timezone=UTC

View File

@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
<appender name="console" class="org.apache.log4j.ConsoleAppender">
<param name="Target" value="System.out"/>
<layout class="org.apache.log4j.PatternLayout">
<param name="ConversionPattern" value="%d{ISO8601} %p [%t] %c - %m%n"/>
</layout>
</appender>
<root>
<priority value ="info" />
<appender-ref ref="console" />
</root>
</log4j:configuration>

View File

@ -0,0 +1,16 @@
#
# Server
#
tickTime=2000
dataDir=var/zk
clientPort=2181
initLimit=5
syncLimit=2
#
# Autopurge
#
autopurge.snapRetainCount=5
autopurge.purgeInterval=1

View File

@ -0,0 +1,64 @@
{
"type" : "index",
"spec" : {
"dataSchema" : {
"dataSource" : "deletion-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
"channel",
"cityName",
"comment",
"countryIsoCode",
"countryName",
"isAnonymous",
"isMinor",
"isNew",
"isRobot",
"isUnpatrolled",
"metroCode",
"namespace",
"page",
"regionIsoCode",
"regionName",
"user",
{ "name": "added", "type": "long" },
{ "name": "deleted", "type": "long" },
{ "name": "delta", "type": "long" }
]
},
"timestampSpec": {
"column": "time",
"format": "iso"
}
}
},
"metricsSpec" : [],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "hour",
"queryGranularity" : "none",
"intervals" : ["2015-09-12/2015-09-13"],
"rollup" : false
}
},
"ioConfig" : {
"type" : "index",
"firehose" : {
"type" : "local",
"baseDir" : "quickstart/",
"filter" : "wikiticker-2015-09-12-sampled.json.gz"
},
"appendToExisting" : false
},
"tuningConfig" : {
"type" : "index",
"targetPartitionSize" : 5000000,
"maxRowsInMemory" : 25000,
"forceExtendableShardSpecs" : true
}
}
}

View File

@ -0,0 +1,5 @@
{
"type": "kill",
"dataSource": "deletion-tutorial",
"interval" : "2015-09-12/2015-09-13"
}
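A kill task only deletes segments that are already marked unused, so the datasource (or the relevant segments) has to be disabled through the Coordinator first. A hedged sketch using the tutorial ports (8081 and 8090) and a placeholder file name for the kill spec:
# mark all deletion-tutorial segments unused
curl -X DELETE http://localhost:8081/druid/coordinator/v1/datasources/deletion-tutorial
# then permanently remove the unused segments in the interval
curl -X POST -H 'Content-Type: application/json' -d @deletion-kill.json http://localhost:8090/druid/indexer/v1/task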

View File

@ -0,0 +1,107 @@
# Creates a pseudo-distributed Hadoop 2.8.3 environment with Java 8
#
# Modified from the SequenceIQ Dockerfiles at https://github.com/sequenceiq/hadoop-docker
#
# docker build -t druid-hadoop-demo:2.8.3 .
FROM sequenceiq/pam:centos-6.5
MAINTAINER SequenceIQ
USER root
# install dev tools
RUN yum clean all \
&& rpm --rebuilddb \
&& yum install -y curl which tar sudo openssh-server openssh-clients rsync yum-plugin-ovl \
&& yum clean all \
&& yum update -y libselinux \
&& yum clean all
# libselinux is updated above to work around https://github.com/sequenceiq/hadoop-docker/issues/14
# passwordless ssh
RUN ssh-keygen -q -N "" -t dsa -f /etc/ssh/ssh_host_dsa_key
RUN ssh-keygen -q -N "" -t rsa -f /etc/ssh/ssh_host_rsa_key
RUN ssh-keygen -q -N "" -t rsa -f /root/.ssh/id_rsa
RUN cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys
# zulu java 8
RUN rpm --import http://repos.azulsystems.com/RPM-GPG-KEY-azulsystems
RUN rpm --rebuilddb
RUN sudo curl -o /etc/yum.repos.d/zulu.repo http://repos.azulsystems.com/rhel/zulu.repo
RUN yum install -y zulu-8
ENV JAVA_HOME /usr/lib/jvm/zulu-8
ENV PATH $PATH:$JAVA_HOME/bin
# hadoop
RUN curl -s https://archive.apache.org/dist/hadoop/core/hadoop-2.8.3/hadoop-2.8.3.tar.gz | tar -xz -C /usr/local/
RUN cd /usr/local && ln -s ./hadoop-2.8.3 hadoop
ENV HADOOP_PREFIX /usr/local/hadoop
ENV HADOOP_COMMON_HOME /usr/local/hadoop
ENV HADOOP_HDFS_HOME /usr/local/hadoop
ENV HADOOP_MAPRED_HOME /usr/local/hadoop
ENV HADOOP_YARN_HOME /usr/local/hadoop
ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop
ENV YARN_CONF_DIR $HADOOP_PREFIX/etc/hadoop
RUN sed -i '/^export JAVA_HOME/ s:.*:export JAVA_HOME=/usr/lib/jvm/zulu-8\nexport HADOOP_PREFIX=/usr/local/hadoop\nexport HADOOP_HOME=/usr/local/hadoop\n:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
RUN sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop/:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
RUN mkdir $HADOOP_PREFIX/input
RUN cp $HADOOP_PREFIX/etc/hadoop/*.xml $HADOOP_PREFIX/input
# pseudo distributed
ADD core-site.xml.template $HADOOP_PREFIX/etc/hadoop/core-site.xml.template
RUN sed s/HOSTNAME/localhost/ /usr/local/hadoop/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml
ADD hdfs-site.xml $HADOOP_PREFIX/etc/hadoop/hdfs-site.xml
ADD mapred-site.xml $HADOOP_PREFIX/etc/hadoop/mapred-site.xml
ADD yarn-site.xml $HADOOP_PREFIX/etc/hadoop/yarn-site.xml
RUN $HADOOP_PREFIX/bin/hdfs namenode -format
ADD ssh_config /root/.ssh/config
RUN chmod 600 /root/.ssh/config
RUN chown root:root /root/.ssh/config
# # installing supervisord
# RUN yum install -y python-setuptools
# RUN easy_install pip
# RUN curl https://bitbucket.org/pypa/setuptools/raw/bootstrap/ez_setup.py -o - | python
# RUN pip install supervisor
#
# ADD supervisord.conf /etc/supervisord.conf
ADD bootstrap.sh /etc/bootstrap.sh
RUN chown root:root /etc/bootstrap.sh
RUN chmod 700 /etc/bootstrap.sh
ENV BOOTSTRAP /etc/bootstrap.sh
# working around a docker.io build error
RUN ls -la /usr/local/hadoop/etc/hadoop/*-env.sh
RUN chmod +x /usr/local/hadoop/etc/hadoop/*-env.sh
RUN ls -la /usr/local/hadoop/etc/hadoop/*-env.sh
# Copy additional .jars to classpath
RUN cp /usr/local/hadoop/share/hadoop/tools/lib/*.jar /usr/local/hadoop/share/hadoop/common/lib/
# fix the 254 error code
RUN sed -i "/^[^#]*UsePAM/ s/.*/#&/" /etc/ssh/sshd_config
RUN echo "UsePAM no" >> /etc/ssh/sshd_config
RUN echo "Port 2122" >> /etc/ssh/sshd_config
RUN service sshd start && $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh && $HADOOP_PREFIX/sbin/start-dfs.sh && $HADOOP_PREFIX/bin/hdfs dfs -mkdir -p /user/root
RUN service sshd start && $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh && $HADOOP_PREFIX/sbin/start-dfs.sh && $HADOOP_PREFIX/bin/hdfs dfs -put $HADOOP_PREFIX/etc/hadoop/ input
CMD ["/etc/bootstrap.sh", "-d"]
# Hdfs ports
EXPOSE 50010 50020 50070 50075 50090 8020 9000
# Mapred ports
EXPOSE 10020 19888
# Yarn ports
EXPOSE 8030 8031 8032 8033 8040 8042 8088
# Other ports
EXPOSE 49707 2122
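Building the image follows the comment at the top of the Dockerfile. The run flags below are an assumption (they are not part of this change): the container needs the exposed HDFS/YARN ports published and the hostname druid-hadoop-demo so that the jobProperties in the Hadoop tutorial spec can resolve it.
docker build -t druid-hadoop-demo:2.8.3 .
docker run -it -h druid-hadoop-demo --name druid-hadoop-demo \
  -p 2122:2122 -p 8020:8020 -p 9000:9000 \
  -p 50010:50010 -p 50020:50020 -p 50070:50070 -p 50075:50075 -p 50090:50090 \
  -p 8030:8030 -p 8031:8031 -p 8032:8032 -p 8033:8033 -p 8040:8040 -p 8042:8042 -p 8088:8088 \
  -p 10020:10020 -p 19888:19888 \
  druid-hadoop-demo:2.8.3 /etc/bootstrap.sh -bash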

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,27 @@
#!/bin/bash
: ${HADOOP_PREFIX:=/usr/local/hadoop}
$HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
rm /tmp/*.pid
# install additional libraries, if any (resource URLs are passed comma-separated in the ACP environment variable)
cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -
# altering the core-site configuration
sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml
service sshd start
$HADOOP_PREFIX/sbin/start-dfs.sh
$HADOOP_PREFIX/sbin/start-yarn.sh
$HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh start historyserver
if [[ $1 == "-d" ]]; then
while true; do sleep 1000; done
fi
if [[ $1 == "-bash" ]]; then
/bin/bash
fi

View File

@ -0,0 +1,6 @@
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://HOSTNAME:9000</value>
</property>
</configuration>

View File

@ -0,0 +1,14 @@
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs.client.use.datanode.hostname</name>
<value>true</value>
</property>
<property>
<name>dfs.datanode.use.datanode.hostname</name>
<value>true</value>
</property>
</configuration>

View File

@ -0,0 +1,6 @@
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>

View File

@ -0,0 +1,5 @@
Host *
UserKnownHostsFile /dev/null
StrictHostKeyChecking no
LogLevel quiet
Port 2122

View File

@ -0,0 +1,47 @@
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.application.classpath</name>
<value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*</value>
</property>
<property>
<description>
Number of seconds after an application finishes before the nodemanager's
DeletionService will delete the application's localized file directory
and log directory.
To diagnose Yarn application problems, set this property's value large
enough (for example, to 600 = 10 minutes) to permit examination of these
directories. After changing the property's value, you must restart the
nodemanager in order for it to have an effect.
The roots of Yarn applications' work directories are configurable with
the yarn.nodemanager.local-dirs property, and the roots of the Yarn
applications' log directories are configurable with the
yarn.nodemanager.log-dirs property.
</description>
<name>yarn.nodemanager.delete.debug-delay-sec</name>
<value>600</value>
</property>
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>900000</value>
</property>
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
</configuration>

View File

@ -0,0 +1,64 @@
{
"type" : "index",
"spec" : {
"dataSchema" : {
"dataSource" : "retention-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
"channel",
"cityName",
"comment",
"countryIsoCode",
"countryName",
"isAnonymous",
"isMinor",
"isNew",
"isRobot",
"isUnpatrolled",
"metroCode",
"namespace",
"page",
"regionIsoCode",
"regionName",
"user",
{ "name": "added", "type": "long" },
{ "name": "deleted", "type": "long" },
{ "name": "delta", "type": "long" }
]
},
"timestampSpec": {
"column": "time",
"format": "iso"
}
}
},
"metricsSpec" : [],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "hour",
"queryGranularity" : "none",
"intervals" : ["2015-09-12/2015-09-13"],
"rollup" : false
}
},
"ioConfig" : {
"type" : "index",
"firehose" : {
"type" : "local",
"baseDir" : "quickstart/",
"filter" : "wikiticker-2015-09-12-sampled.json.gz"
},
"appendToExisting" : false
},
"tuningConfig" : {
"type" : "index",
"targetPartitionSize" : 5000000,
"maxRowsInMemory" : 25000,
"forceExtendableShardSpecs" : true
}
}
}
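After this datasource is loaded, the retention tutorial changes what the Coordinator keeps by setting load/drop rules. The rule payload below is only an illustration (loadByInterval and dropForever are standard rule types; the exact rules used by the tutorial are not part of this file):
curl -X POST -H 'Content-Type: application/json' \
  -d '[{"type":"loadByInterval","interval":"2015-09-12T12:00:00.000Z/2015-09-13T00:00:00.000Z","tieredReplicants":{"_default_tier":1}},{"type":"dropForever"}]' \
  http://localhost:8081/druid/coordinator/v1/rules/retention-tutorial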

View File

@ -0,0 +1,9 @@
{"timestamp":"2018-01-01T01:01:35Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":20,"bytes":9024}
{"timestamp":"2018-01-01T01:01:51Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":255,"bytes":21133}
{"timestamp":"2018-01-01T01:01:59Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":11,"bytes":5780}
{"timestamp":"2018-01-01T01:02:14Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":38,"bytes":6289}
{"timestamp":"2018-01-01T01:02:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":377,"bytes":359971}
{"timestamp":"2018-01-01T01:03:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":49,"bytes":10204}
{"timestamp":"2018-01-02T21:33:14Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8","packets":38,"bytes":6289}
{"timestamp":"2018-01-02T21:33:45Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8","packets":123,"bytes":93999}
{"timestamp":"2018-01-02T21:35:45Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8","packets":12,"bytes":2818}

View File

@ -0,0 +1,51 @@
{
"type" : "index",
"spec" : {
"dataSchema" : {
"dataSource" : "rollup-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
"srcIP",
"dstIP"
]
},
"timestampSpec": {
"column": "timestamp",
"format": "iso"
}
}
},
"metricsSpec" : [
{ "type" : "count", "name" : "count" },
{ "type" : "longSum", "name" : "packets", "fieldName" : "packets" },
{ "type" : "longSum", "name" : "bytes", "fieldName" : "bytes" }
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "week",
"queryGranularity" : "minute",
"intervals" : ["2018-01-01/2018-01-03"],
"rollup" : true
}
},
"ioConfig" : {
"type" : "index",
"firehose" : {
"type" : "local",
"baseDir" : "quickstart/tutorial",
"filter" : "rollup-data.json"
},
"appendToExisting" : false
},
"tuningConfig" : {
"type" : "index",
"targetPartitionSize" : 5000000,
"maxRowsInMemory" : 25000,
"forceExtendableShardSpecs" : true
}
}
}
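Because queryGranularity is "minute" and rollup is enabled, input rows that share a minute and the same srcIP/dstIP collapse into one stored row; for example, the three 1.1.1.1 to 2.2.2.2 events at 01:01 in rollup-data.json above become a single row with count 3, packets 286, and bytes 35937. A quick way to check, assuming druid.sql.enable=true (set earlier in this change) and the Broker on port 8082:
curl -X POST -H 'Content-Type: application/json' \
  -d '{"query":"SELECT * FROM \"rollup-tutorial\""}' \
  http://localhost:8082/druid/v2/sql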

View File

@ -0,0 +1,4 @@
{"timestamp":"2018-01-01T07:01:35Z","animal":"octopus", "location":1, "number":100}
{"timestamp":"2018-01-01T05:01:35Z","animal":"mongoose", "location":2,"number":200}
{"timestamp":"2018-01-01T06:01:35Z","animal":"snake", "location":3, "number":300}
{"timestamp":"2018-01-01T01:01:35Z","animal":"lion", "location":4, "number":300}

View File

@ -0,0 +1,73 @@
{
"type" : "index",
"spec" : {
"dataSchema" : {
"dataSource" : "transform-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
"animal",
{ "name": "location", "type": "long" }
]
},
"timestampSpec": {
"column": "timestamp",
"format": "iso"
}
}
},
"metricsSpec" : [
{ "type" : "count", "name" : "count" },
{ "type" : "longSum", "name" : "number", "fieldName" : "number" },
{ "type" : "longSum", "name" : "triple-number", "fieldName" : "triple-number" }
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "week",
"queryGranularity" : "minute",
"intervals" : ["2018-01-01/2018-01-03"],
"rollup" : true
},
"transformSpec": {
"transforms": [
{
"type": "expression",
"name": "animal",
"expression": "concat('super-', animal)"
},
{
"type": "expression",
"name": "triple-number",
"expression": "number * 3"
}
],
"filter": {
"type":"or",
"fields": [
{ "type": "selector", "dimension": "animal", "value": "super-mongoose" },
{ "type": "selector", "dimension": "triple-number", "value": "300" },
{ "type": "selector", "dimension": "location", "value": "3" }
]
}
}
},
"ioConfig" : {
"type" : "index",
"firehose" : {
"type" : "local",
"baseDir" : "quickstart/tutorial",
"filter" : "transform-data.json"
},
"appendToExisting" : false
},
"tuningConfig" : {
"type" : "index",
"targetPartitionSize" : 5000000,
"maxRowsInMemory" : 25000,
"forceExtendableShardSpecs" : true
}
}
}
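The transforms are applied before the filter, so the filter sees the rewritten values: from transform-data.json above, super-mongoose matches the animal selector, super-octopus matches triple-number = 300 (100 * 3), super-snake matches location = 3, and the lion row matches nothing and is dropped. Checking the result with the Broker's SQL endpoint (same assumptions as the rollup check above):
curl -X POST -H 'Content-Type: application/json' \
  -d '{"query":"SELECT * FROM \"transform-tutorial\""}' \
  http://localhost:8082/druid/v2/sql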

View File

@ -0,0 +1,59 @@
{
"type" : "index",
"spec" : {
"dataSchema" : {
"dataSource" : "updates-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
"animal"
]
},
"timestampSpec": {
"column": "timestamp",
"format": "iso"
}
}
},
"metricsSpec" : [
{ "type" : "count", "name" : "count" },
{ "type" : "longSum", "name" : "number", "fieldName" : "number" }
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "week",
"queryGranularity" : "minute",
"intervals" : ["2018-01-01/2018-01-03"],
"rollup" : true
}
},
"ioConfig" : {
"type" : "index",
"firehose" : {
"type": "combining",
"delegates": [
{
"type" : "ingestSegment",
"dataSource" : "updates-tutorial",
"interval" : "2018-01-01/2018-01-03"
},
{
"type" : "local",
"baseDir" : "quickstart/tutorial",
"filter" : "updates-data3.json"
}
]
},
"appendToExisting" : false
},
"tuningConfig" : {
"type" : "index",
"targetPartitionSize" : 5000000,
"maxRowsInMemory" : 25000,
"forceExtendableShardSpecs" : true
}
}
}

View File

@ -0,0 +1,49 @@
{
"type" : "index",
"spec" : {
"dataSchema" : {
"dataSource" : "updates-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
"animal"
]
},
"timestampSpec": {
"column": "timestamp",
"format": "iso"
}
}
},
"metricsSpec" : [
{ "type" : "count", "name" : "count" },
{ "type" : "longSum", "name" : "number", "fieldName" : "number" }
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "week",
"queryGranularity" : "minute",
"intervals" : ["2018-01-01/2018-01-03"],
"rollup" : true
}
},
"ioConfig" : {
"type" : "index",
"firehose" : {
"type" : "local",
"baseDir" : "quickstart/tutorial",
"filter" : "updates-data4.json"
},
"appendToExisting" : true
},
"tuningConfig" : {
"type" : "index",
"targetPartitionSize" : 5000000,
"maxRowsInMemory" : 25000,
"forceExtendableShardSpecs" : true
}
}
}

View File

@ -0,0 +1,3 @@
{"timestamp":"2018-01-01T01:01:35Z","animal":"tiger", "number":100}
{"timestamp":"2018-01-01T03:01:35Z","animal":"aardvark", "number":42}
{"timestamp":"2018-01-01T03:01:35Z","animal":"giraffe", "number":14124}

View File

@ -0,0 +1,3 @@
{"timestamp":"2018-01-01T01:01:35Z","animal":"lion", "number":100}
{"timestamp":"2018-01-01T03:01:35Z","animal":"aardvark", "number":9999}
{"timestamp":"2018-01-01T04:01:35Z","animal":"bear", "number":111}

View File

@ -0,0 +1,4 @@
{"timestamp":"2018-01-01T07:01:35Z","animal":"octopus", "number":115}
{"timestamp":"2018-01-01T05:01:35Z","animal":"mongoose", "number":737}
{"timestamp":"2018-01-01T06:01:35Z","animal":"snake", "number":1234}
{"timestamp":"2018-01-01T01:01:35Z","animal":"lion", "number":300}

View File

@ -0,0 +1,2 @@
{"timestamp":"2018-01-01T04:01:35Z","animal":"bear", "number":222}
{"timestamp":"2018-01-01T09:01:35Z","animal":"falcon", "number":1241}

View File

@ -0,0 +1,49 @@
{
"type" : "index",
"spec" : {
"dataSchema" : {
"dataSource" : "updates-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
"animal"
]
},
"timestampSpec": {
"column": "timestamp",
"format": "iso"
}
}
},
"metricsSpec" : [
{ "type" : "count", "name" : "count" },
{ "type" : "longSum", "name" : "number", "fieldName" : "number" }
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "week",
"queryGranularity" : "minute",
"intervals" : ["2018-01-01/2018-01-03"],
"rollup" : true
}
},
"ioConfig" : {
"type" : "index",
"firehose" : {
"type" : "local",
"baseDir" : "quickstart/tutorial",
"filter" : "updates-data.json"
},
"appendToExisting" : false
},
"tuningConfig" : {
"type" : "index",
"targetPartitionSize" : 5000000,
"maxRowsInMemory" : 25000,
"forceExtendableShardSpecs" : true
}
}
}

View File

@ -0,0 +1,49 @@
{
"type" : "index",
"spec" : {
"dataSchema" : {
"dataSource" : "updates-tutorial",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
"animal"
]
},
"timestampSpec": {
"column": "timestamp",
"format": "iso"
}
}
},
"metricsSpec" : [
{ "type" : "count", "name" : "count" },
{ "type" : "longSum", "name" : "number", "fieldName" : "number" }
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "week",
"queryGranularity" : "minute",
"intervals" : ["2018-01-01/2018-01-03"],
"rollup" : true
}
},
"ioConfig" : {
"type" : "index",
"firehose" : {
"type" : "local",
"baseDir" : "quickstart/tutorial",
"filter" : "updates-data2.json"
},
"appendToExisting" : false
},
"tuningConfig" : {
"type" : "index",
"targetPartitionSize" : 5000000,
"maxRowsInMemory" : 25000,
"forceExtendableShardSpecs" : true
}
}
}

View File

@ -0,0 +1,79 @@
{
"type" : "index_hadoop",
"spec" : {
"dataSchema" : {
"dataSource" : "wikipedia",
"parser" : {
"type" : "hadoopyString",
"parseSpec" : {
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
"channel",
"cityName",
"comment",
"countryIsoCode",
"countryName",
"isAnonymous",
"isMinor",
"isNew",
"isRobot",
"isUnpatrolled",
"metroCode",
"namespace",
"page",
"regionIsoCode",
"regionName",
"user",
{ "name": "added", "type": "long" },
{ "name": "deleted", "type": "long" },
{ "name": "delta", "type": "long" }
]
},
"timestampSpec" : {
"format" : "auto",
"column" : "time"
}
}
},
"metricsSpec" : [],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "day",
"queryGranularity" : "none",
"intervals" : ["2015-09-12/2015-09-13"],
"rollup" : false
}
},
"ioConfig" : {
"type" : "hadoop",
"inputSpec" : {
"type" : "static",
"paths" : "/quickstart/wikiticker-2015-09-12-sampled.json.gz"
}
},
"tuningConfig" : {
"type" : "hadoop",
"partitionsSpec" : {
"type" : "hashed",
"targetPartitionSize" : 5000000
},
"forceExtendableShardSpecs" : true,
"jobProperties" : {
"fs.default.name" : "hdfs://druid-hadoop-demo:9000",
"fs.defaultFS" : "hdfs://druid-hadoop-demo:9000",
"dfs.datanode.address" : "druid-hadoop-demo",
"dfs.client.use.datanode.hostname" : "true",
"dfs.datanode.use.datanode.hostname" : "true",
"yarn.resourcemanager.hostname" : "druid-hadoop-demo",
"yarn.nodemanager.vmem-check-enabled" : "false",
"mapreduce.map.java.opts" : "-Duser.timezone=UTC -Dfile.encoding=UTF-8",
"mapreduce.job.user.classpath.first" : "true",
"mapreduce.reduce.java.opts" : "-Duser.timezone=UTC -Dfile.encoding=UTF-8",
"mapreduce.map.memory.mb" : 1024,
"mapreduce.reduce.memory.mb" : 1024
}
}
},
"hadoopDependencyCoordinates": ["org.apache.hadoop:hadoop-client:2.8.3"]
}
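This spec reads its input from HDFS inside the Hadoop container (/quickstart/wikiticker-2015-09-12-sampled.json.gz), so the file has to be copied into HDFS before the task is submitted. A rough sketch, run inside the druid-hadoop-demo container; the local source path /shared is an assumption:
/usr/local/hadoop/bin/hdfs dfs -mkdir -p /quickstart
/usr/local/hadoop/bin/hdfs dfs -put /shared/wikiticker-2015-09-12-sampled.json.gz /quickstart/wikiticker-2015-09-12-sampled.json.gz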

View File

@ -0,0 +1,64 @@
{
"type" : "index",
"spec" : {
"dataSchema" : {
"dataSource" : "wikipedia",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
"channel",
"cityName",
"comment",
"countryIsoCode",
"countryName",
"isAnonymous",
"isMinor",
"isNew",
"isRobot",
"isUnpatrolled",
"metroCode",
"namespace",
"page",
"regionIsoCode",
"regionName",
"user",
{ "name": "added", "type": "long" },
{ "name": "deleted", "type": "long" },
{ "name": "delta", "type": "long" }
]
},
"timestampSpec": {
"column": "time",
"format": "iso"
}
}
},
"metricsSpec" : [],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "day",
"queryGranularity" : "none",
"intervals" : ["2015-09-12/2015-09-13"],
"rollup" : false
}
},
"ioConfig" : {
"type" : "index",
"firehose" : {
"type" : "local",
"baseDir" : "quickstart/",
"filter" : "wikiticker-2015-09-12-sampled.json.gz"
},
"appendToExisting" : false
},
"tuningConfig" : {
"type" : "index",
"targetPartitionSize" : 5000000,
"maxRowsInMemory" : 25000,
"forceExtendableShardSpecs" : true
}
}
}

View File

@ -0,0 +1,59 @@
{
"type": "kafka",
"dataSchema": {
"dataSource": "wikipedia",
"parser": {
"type": "string",
"parseSpec": {
"format": "json",
"timestampSpec": {
"column": "time",
"format": "auto"
},
"dimensionsSpec": {
"dimensions": [
"channel",
"cityName",
"comment",
"countryIsoCode",
"countryName",
"isAnonymous",
"isMinor",
"isNew",
"isRobot",
"isUnpatrolled",
"metroCode",
"namespace",
"page",
"regionIsoCode",
"regionName",
"user",
{ "name": "added", "type": "long" },
{ "name": "deleted", "type": "long" },
{ "name": "delta", "type": "long" }
]
}
}
},
"metricsSpec" : [],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "DAY",
"queryGranularity": "NONE",
"rollup": false
}
},
"tuningConfig": {
"type": "kafka",
"reportParseExceptions": false
},
"ioConfig": {
"topic": "wikipedia",
"replicas": 2,
"taskDuration": "PT10M",
"completionTimeout": "PT20M",
"consumerProperties": {
"bootstrap.servers": "localhost:9092"
}
}
}
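Unlike batch task specs, a supervisor spec goes to the Overlord's supervisor endpoint, and the data itself is written to the Kafka topic. A minimal sketch, with the spec file name as a placeholder and a Kafka broker at localhost:9092 (matching consumerProperties above):
curl -X POST -H 'Content-Type: application/json' -d @wikipedia-kafka-supervisor.json http://localhost:8090/druid/indexer/v1/supervisor
# then publish JSON events to the topic, e.g. with Kafka's console producer (event file name is a placeholder)
./bin/kafka-console-producer.sh --broker-list localhost:9092 --topic wikipedia < wikipedia-events.json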

View File

@ -0,0 +1,3 @@
{
"query":"SELECT page, COUNT(*) AS Edits FROM wikipedia WHERE \"__time\" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY page ORDER BY Edits DESC LIMIT 10"
}
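This file wraps a SQL statement in the JSON form expected by the Broker's SQL HTTP endpoint. A minimal sketch of running it (placeholder file name, Broker on port 8082):
curl -X POST -H 'Content-Type: application/json' -d @wikipedia-top-pages-sql.json http://localhost:8082/druid/v2/sql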

View File

@ -1,16 +1,15 @@
{
"queryType" : "topN",
"dataSource" : "wikiticker",
"dataSource" : "wikipedia",
"intervals" : ["2015-09-12/2015-09-13"],
"granularity" : "all",
"dimension" : "page",
"metric" : "edits",
"threshold" : 25,
"metric" : "count",
"threshold" : 10,
"aggregations" : [
{
"type" : "longSum",
"name" : "edits",
"fieldName" : "count"
"type" : "count",
"name" : "count"
}
]
}
}
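Native JSON queries like this topN are POSTed to the Broker's native query endpoint rather than the SQL endpoint. A minimal sketch (placeholder file name):
curl -X POST -H 'Content-Type: application/json' -d @wikipedia-top-pages.json 'http://localhost:8082/druid/v2?pretty'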

View File

@ -1,85 +0,0 @@
{
"type" : "index_hadoop",
"spec" : {
"ioConfig" : {
"type" : "hadoop",
"inputSpec" : {
"type" : "static",
"paths" : "quickstart/wikiticker-2015-09-12-sampled.json.gz"
}
},
"dataSchema" : {
"dataSource" : "wikiticker",
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "day",
"queryGranularity" : "none",
"intervals" : ["2015-09-12/2015-09-13"]
},
"parser" : {
"type" : "hadoopyString",
"parseSpec" : {
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
"channel",
"cityName",
"comment",
"countryIsoCode",
"countryName",
"isAnonymous",
"isMinor",
"isNew",
"isRobot",
"isUnpatrolled",
"metroCode",
"namespace",
"page",
"regionIsoCode",
"regionName",
"user"
]
},
"timestampSpec" : {
"format" : "auto",
"column" : "time"
}
}
},
"metricsSpec" : [
{
"name" : "count",
"type" : "count"
},
{
"name" : "added",
"type" : "longSum",
"fieldName" : "added"
},
{
"name" : "deleted",
"type" : "longSum",
"fieldName" : "deleted"
},
{
"name" : "delta",
"type" : "longSum",
"fieldName" : "delta"
},
{
"name" : "user_unique",
"type" : "hyperUnique",
"fieldName" : "user"
}
]
},
"tuningConfig" : {
"type" : "hadoop",
"partitionsSpec" : {
"type" : "hashed",
"targetPartitionSize" : 5000000
},
"jobProperties" : {}
}
}
}