New quickstart and tutorials (#6126)
* New quickstart and tutorials * PR comments * Fix tranquility
10
NOTICE
|
@ -82,4 +82,12 @@ This product contains code adapted from Apache Hadoop
|
|||
* LICENSE:
|
||||
* https://github.com/apache/hadoop/blob/trunk/LICENSE.txt (Apache License, Version 2.0)
|
||||
* HOMEPAGE:
|
||||
* http://hadoop.apache.org/
|
||||
* http://hadoop.apache.org/
|
||||
|
||||
This product contains modified versions of the Dockerfile and related configuration files from SequenceIQ's Hadoop Docker image:
|
||||
* LICENSE:
|
||||
* https://github.com/sequenceiq/hadoop-docker/blob/master/LICENSE (Apache License, Version 2.0)
|
||||
* HOMEPAGE:
|
||||
* https://github.com/sequenceiq/hadoop-docker/
|
||||
* COMMIT TAG:
|
||||
* update this when this patch is committed
|
|
@ -52,70 +52,99 @@
|
|||
</includes>
|
||||
<outputDirectory>quickstart</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/quickstart/tutorial</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>quickstart/tutorial</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/quickstart/tutorial/conf</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>quickstart/tutorial/conf</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/quickstart/tutorial/conf/druid</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>quickstart/tutorial/conf/druid</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/quickstart/tutorial/conf/druid/_common</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>quickstart/tutorial/conf/druid/_common/</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/quickstart/tutorial/conf/druid/broker</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>quickstart/tutorial/conf/druid/broker</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/quickstart/tutorial/conf/druid/coordinator</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>quickstart/tutorial/conf/druid/coordinator</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/quickstart/tutorial/conf/druid/historical</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>quickstart/tutorial/conf/druid/historical</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/quickstart/tutorial/conf/druid/overlord</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>quickstart/tutorial/conf/druid/overlord</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/quickstart/tutorial/conf/druid/middleManager</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>quickstart/tutorial/conf/druid/middleManager</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/quickstart/tutorial/conf/tranquility</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>quickstart/tutorial/conf/tranquility</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/quickstart/tutorial/conf/zk</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>quickstart/tutorial/conf/zk</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/quickstart/tutorial/hadoop</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>quickstart/tutorial/hadoop</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/quickstart/tutorial/hadoop/docker</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>quickstart/tutorial/hadoop/docker</outputDirectory>
|
||||
</fileSet>
|
||||
|
||||
|
||||
<fileSet>
|
||||
<directory>../examples/conf-quickstart</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>conf-quickstart</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/conf-quickstart/druid</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>conf-quickstart/druid</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/conf-quickstart/druid/_common</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>conf-quickstart/druid/_common/</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/conf-quickstart/druid/broker</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>conf-quickstart/druid/broker</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/conf-quickstart/druid/coordinator</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>conf-quickstart/druid/coordinator</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/conf-quickstart/druid/historical</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>conf-quickstart/druid/historical</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/conf-quickstart/druid/overlord</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>conf-quickstart/druid/overlord</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/conf-quickstart/druid/middleManager</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>conf-quickstart/druid/middleManager</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/conf-quickstart/tranquility</directory>
|
||||
<includes>
|
||||
<include>*</include>
|
||||
</includes>
|
||||
<outputDirectory>conf-quickstart/tranquility</outputDirectory>
|
||||
</fileSet>
|
||||
<fileSet>
|
||||
<directory>../examples/quickstart/protobuf</directory>
|
||||
<includes>
|
||||
|
|
|
@ -4,11 +4,19 @@ layout: toc
|
|||
|
||||
## Getting Started
|
||||
* [Concepts](/docs/VERSION/design/)
|
||||
* [Quickstart](/docs/VERSION/tutorials/quickstart.html)
|
||||
* [Loading Data](/docs/VERSION/tutorials/ingestion.html)
|
||||
* [Loading from Files](/docs/VERSION/tutorials/tutorial-batch.html)
|
||||
* [Loading from Streams](/docs/VERSION/tutorials/tutorial-streams.html)
|
||||
* [Loading from Kafka](/docs/VERSION/tutorials/tutorial-kafka.html)
|
||||
* [Quickstart](/docs/VERSION/tutorials/index.html)
|
||||
* [Tutorial: Loading a file](/docs/VERSION/tutorials/tutorial-batch.html)
|
||||
* [Tutorial: Loading stream data from Kafka](/docs/VERSION/tutorials/tutorial-kafka.html)
|
||||
* [Tutorial: Loading a file using Hadoop](/docs/VERSION/tutorials/tutorial-batch-hadoop.html)
|
||||
* [Tutorial: Loading stream data using HTTP push](/docs/VERSION/tutorials/tutorial-tranquility.html)
|
||||
* [Tutorial: Querying data](/docs/VERSION/tutorials/tutorial-query.html)
|
||||
* [Further tutorials](/docs/VERSION/tutorials/advanced.html)
|
||||
* [Tutorial: Rollup](/docs/VERSION/tutorials/tutorial-rollup.html)
|
||||
* [Tutorial: Configuring retention](/docs/VERSION/tutorials/tutorial-retention.html)
|
||||
* [Tutorial: Updating existing data](/docs/VERSION/tutorials/tutorial-update-data.html)
|
||||
* [Tutorial: Compacting segments](/docs/VERSION/tutorials/tutorial-compaction.html)
|
||||
* [Tutorial: Deleting data](/docs/VERSION/tutorials/tutorial-delete-data.html)
|
||||
* [Tutorial: Writing your own ingestion specs](/docs/VERSION/tutorials/tutorial-ingestion-spec.html)
|
||||
* [Clustering](/docs/VERSION/tutorials/cluster.html)
|
||||
|
||||
## Data Ingestion
|
||||
|
|
After Width: | Height: | Size: 88 KiB |
After Width: | Height: | Size: 220 KiB |
After Width: | Height: | Size: 28 KiB |
After Width: | Height: | Size: 108 KiB |
After Width: | Height: | Size: 127 KiB |
After Width: | Height: | Size: 135 KiB |
After Width: | Height: | Size: 214 KiB |
After Width: | Height: | Size: 76 KiB |
After Width: | Height: | Size: 135 KiB |
|
@ -0,0 +1,168 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
|
||||
# Druid Quickstart
|
||||
|
||||
In this quickstart, we will download Druid and set it up on a single machine. The cluster will be ready to load data
|
||||
after completing this initial setup.
|
||||
|
||||
Before beginning the quickstart, it is helpful to read the [general Druid overview](../design/index.html) and the
|
||||
[ingestion overview](../ingestion/index.html), as the tutorials will refer to concepts discussed on those pages.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
You will need:
|
||||
|
||||
* Java 8
|
||||
* Linux, Mac OS X, or other Unix-like OS (Windows is not supported)
|
||||
* 8G of RAM
|
||||
* 2 vCPUs
|
||||
|
||||
On Mac OS X, you can use [Oracle's JDK
|
||||
8](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html) to install
|
||||
Java.
|
||||
|
||||
On Linux, your OS package manager should be able to help you install Java. If your Ubuntu-
|
||||
based OS does not have a recent enough version of Java, WebUpd8 offers [packages for those
|
||||
OSes](http://www.webupd8.org/2012/09/install-oracle-java-8-in-ubuntu-via-ppa.html).
|
||||
|
||||
## Getting started
|
||||
|
||||
To install Druid, run the following commands in your terminal:
|
||||
|
||||
```bash
|
||||
curl -O http://static.druid.io/artifacts/releases/druid-#{DRUIDVERSION}-bin.tar.gz
|
||||
tar -xzf druid-#{DRUIDVERSION}-bin.tar.gz
|
||||
cd druid-#{DRUIDVERSION}
|
||||
```
|
||||
|
||||
In the package, you should find:
|
||||
|
||||
* `LICENSE` - the license files.
|
||||
* `bin/` - scripts useful for this quickstart.
|
||||
* `conf/*` - template configurations for a clustered setup.
|
||||
* `extensions/*` - all Druid extensions.
|
||||
* `hadoop-dependencies/*` - Druid Hadoop dependencies.
|
||||
* `lib/*` - all included software packages for core Druid.
|
||||
* `quickstart/*` - configuration files, sample data, and other files for the quickstart tutorials
|
||||
|
||||
## Download Zookeeper
|
||||
|
||||
Druid currently has a dependency on [Apache ZooKeeper](http://zookeeper.apache.org/) for distributed coordination. You'll
|
||||
need to download and run Zookeeper.
|
||||
|
||||
In the package root, run the following commands:
|
||||
|
||||
```bash
|
||||
curl https://archive.apache.org/dist/zookeeper/zookeeper-3.4.11/zookeeper-3.4.11.tar.gz -o zookeeper-3.4.11.tar.gz
|
||||
tar -xzf zookeeper-3.4.11.tar.gz
|
||||
mv zookeeper-3.4.11 zk
|
||||
```
|
||||
|
||||
The startup scripts for the tutorial will expect the contents of the Zookeeper tarball to be located at `zk` under the druid-#{DRUIDVERSION} package root.
|
||||
|
||||
## Start up Druid services
|
||||
|
||||
From the druid-#{DRUIDVERSION} package root, run the following command:
|
||||
|
||||
```bash
|
||||
bin/supervise -c quickstart/tutorial/conf/tutorial-cluster.conf
|
||||
```
|
||||
|
||||
This will bring up instances of Zookeeper and the Druid services, all running on the local machine, e.g.:
|
||||
|
||||
```
|
||||
bin/supervise -c quickstart/tutorial/conf/tutorial-cluster.conf
|
||||
[Thu Jul 26 12:16:23 2018] Running command[zk], logging to[/stage/druid-#{DRUIDVERSION}/var/sv/zk.log]: bin/run-zk quickstart/tutorial/conf
|
||||
[Thu Jul 26 12:16:23 2018] Running command[coordinator], logging to[/stage/druid-#{DRUIDVERSION}/var/sv/coordinator.log]: bin/run-druid coordinator quickstart/tutorial/conf
|
||||
[Thu Jul 26 12:16:23 2018] Running command[broker], logging to[/stage/druid-#{DRUIDVERSION}/var/sv/broker.log]: bin/run-druid broker quickstart/tutorial/conf
|
||||
[Thu Jul 26 12:16:23 2018] Running command[historical], logging to[/stage/druid-#{DRUIDVERSION}/var/sv/historical.log]: bin/run-druid historical quickstart/tutorial/conf
|
||||
[Thu Jul 26 12:16:23 2018] Running command[overlord], logging to[/stage/druid-#{DRUIDVERSION}/var/sv/overlord.log]: bin/run-druid overlord quickstart/tutorial/conf
|
||||
[Thu Jul 26 12:16:23 2018] Running command[middleManager], logging to[/stage/druid-#{DRUIDVERSION}/var/sv/middleManager.log]: bin/run-druid middleManager quickstart/tutorial/conf
|
||||
|
||||
```
|
||||
|
||||
All persistent state such as the cluster metadata store and segments for the services will be kept in the `var` directory under the druid-#{DRUIDVERSION} package root. Logs for the services are located at `var/sv`.
|
||||
|
||||
Later on, if you'd like to stop the services, CTRL-C to exit the `bin/supervise` script, which will terminate the Druid processes.
|
||||
|
||||
If you want a clean start after stopping the services, delete the `var` directory and run the `bin/supervise` script again.
|
||||
|
||||
Once every service has started, you are now ready to load data.
|
||||
|
||||
## Loading Data
|
||||
|
||||
### Tutorial Dataset
|
||||
|
||||
For the following data loading tutorials, we have included a sample data file containing Wikipedia page edit events that occurred on 2015-09-12.
|
||||
|
||||
This sample data is located at `quickstart/wikiticker-2015-09-12-sampled.json.gz` from the Druid package root. The page edit events are stored as JSON objects in a text file.
|
||||
|
||||
The sample data has the following columns, and an example event is shown below:
|
||||
|
||||
* added
|
||||
* channel
|
||||
* cityName
|
||||
* comment
|
||||
* countryIsoCode
|
||||
* countryName
|
||||
* deleted
|
||||
* delta
|
||||
* isAnonymous
|
||||
* isMinor
|
||||
* isNew
|
||||
* isRobot
|
||||
* isUnpatrolled
|
||||
* metroCode
|
||||
* namespace
|
||||
* page
|
||||
* regionIsoCode
|
||||
* regionName
|
||||
* user
|
||||
|
||||
```
|
||||
{
|
||||
"timestamp":"2015-09-12T20:03:45.018Z",
|
||||
"channel":"#en.wikipedia",
|
||||
"namespace":"Main",
|
||||
"page":"Spider-Man's powers and equipment",
|
||||
"user":"foobar",
|
||||
"comment":"/* Artificial web-shooters */",
|
||||
"cityName":"New York",
|
||||
"regionName":"New York",
|
||||
"regionIsoCode":"NY",
|
||||
"countryName":"United States",
|
||||
"countryIsoCode":"US",
|
||||
"isAnonymous":false,
|
||||
"isNew":false,
|
||||
"isMinor":false,
|
||||
"isRobot":false,
|
||||
"isUnpatrolled":false,
|
||||
"added":99,
|
||||
"delta":99,
|
||||
"deleted":0
|
||||
}
|
||||
```
|
||||
|
||||
The following tutorials demonstrate various methods of loading data into Druid, including both batch and streaming use cases.
|
||||
|
||||
### [Tutorial: Loading a file](./tutorial-batch.html)
|
||||
|
||||
This tutorial demonstrates how to perform a batch file load, using Druid's native batch ingestion.
|
||||
|
||||
### [Tutorial: Loading stream data from Kafka](./tutorial-kafka.html)
|
||||
|
||||
This tutorial demonstrates how to load streaming data from a Kafka topic.
|
||||
|
||||
### [Tutorial: Loading a file using Hadoop](./tutorial-batch-hadoop.html)
|
||||
|
||||
This tutorial demonstrates how to perform a batch file load, using a remote Hadoop cluster.
|
||||
|
||||
### [Tutorial: Loading data using Tranquility](./tutorial-tranquility.html)
|
||||
|
||||
This tutorial demonstrates how to load streaming data by pushing events to Druid using the Tranquility service.
|
||||
|
||||
### [Tutorial: Writing your own ingestion spec](./tutorial-ingestion-spec.html)
|
||||
|
||||
This tutorial demonstrates how to write a new ingestion spec and use it to load data.
|
|
@ -1,42 +0,0 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
|
||||
# Loading Data
|
||||
|
||||
## Choosing an ingestion method
|
||||
|
||||
Druid supports streaming (real-time) and file-based (batch) ingestion methods. The most
|
||||
popular configurations are:
|
||||
|
||||
- [Files](../ingestion/batch-ingestion.html) - Load data from HDFS, S3, local files, or any supported Hadoop
|
||||
filesystem in batches. We recommend this method if your dataset is already in flat files.
|
||||
|
||||
- [Stream push](../ingestion/stream-ingestion.html#stream-push) - Push a data stream into Druid in real-time
|
||||
using [Tranquility](http://github.com/druid-io/tranquility), a client library for sending streams
|
||||
to Druid. We recommend this method if your dataset originates in a streaming system like Kafka,
|
||||
Storm, Spark Streaming, or your own system.
|
||||
|
||||
- [Stream pull](../ingestion/stream-ingestion.html#stream-pull) - Pull a data stream directly from an external
|
||||
data source into Druid using Realtime Nodes.
|
||||
|
||||
## Getting started
|
||||
|
||||
The easiest ways to get started with loading your own data are the three included tutorials.
|
||||
|
||||
- [Files-based tutorial](tutorial-batch.html) showing you how to load files from your local disk.
|
||||
- [Streams-based tutorial](tutorial-streams.html) showing you how to push data over HTTP.
|
||||
- [Kafka-based tutorial](tutorial-kafka.html) showing you how to load data from Kafka.
|
||||
|
||||
## Hybrid batch/streaming
|
||||
|
||||
You can combine batch and streaming methods in a hybrid batch/streaming architecture. In a hybrid architecture,
|
||||
you use a streaming method to do initial ingestion, and then periodically re-ingest older data in batch mode
|
||||
(typically every few hours, or nightly). When Druid re-ingests data for a time range, the new data automatically
|
||||
replaces the data from the earlier ingestion.
|
||||
|
||||
All streaming ingestion methods currently supported by Druid do introduce the possibility of dropped or duplicated
|
||||
messages in certain failure scenarios, and batch re-ingestion eliminates this potential source of error for
|
||||
historical data.
|
||||
|
||||
Batch re-ingestion also gives you the option to re-ingest your data if you needed to revise it for any reason.
|
|
@ -1,243 +0,0 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
|
||||
# Druid Quickstart
|
||||
|
||||
In this quickstart, we will download Druid, set it up on a single machine, load some data, and query the data.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
You will need:
|
||||
|
||||
* Java 8
|
||||
* Linux, Mac OS X, or other Unix-like OS (Windows is not supported)
|
||||
* 8G of RAM
|
||||
* 2 vCPUs
|
||||
|
||||
On Mac OS X, you can use [Oracle's JDK
|
||||
8](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html) to install
|
||||
Java.
|
||||
|
||||
On Linux, your OS package manager should be able to help for Java. If your Ubuntu-
|
||||
based OS does not have a recent enough version of Java, WebUpd8 offers [packages for those
|
||||
OSes](http://www.webupd8.org/2012/09/install-oracle-java-8-in-ubuntu-via-ppa.html).
|
||||
|
||||
## Getting started
|
||||
|
||||
To install Druid, issue the following commands in your terminal:
|
||||
|
||||
```bash
|
||||
curl -O http://static.druid.io/artifacts/releases/druid-#{DRUIDVERSION}-bin.tar.gz
|
||||
tar -xzf druid-#{DRUIDVERSION}-bin.tar.gz
|
||||
cd druid-#{DRUIDVERSION}
|
||||
```
|
||||
|
||||
In the package, you should find:
|
||||
|
||||
* `LICENSE` - the license files.
|
||||
* `bin/` - scripts useful for this quickstart.
|
||||
* `conf/*` - template configurations for a clustered setup.
|
||||
* `conf-quickstart/*` - configurations for this quickstart.
|
||||
* `extensions/*` - all Druid extensions.
|
||||
* `hadoop-dependencies/*` - Druid Hadoop dependencies.
|
||||
* `lib/*` - all included software packages for core Druid.
|
||||
* `quickstart/*` - files useful for this quickstart.
|
||||
|
||||
## Start up Zookeeper
|
||||
|
||||
Druid currently has a dependency on [Apache ZooKeeper](http://zookeeper.apache.org/) for distributed coordination. You'll
|
||||
need to download and run Zookeeper.
|
||||
|
||||
```bash
|
||||
curl http://www.gtlib.gatech.edu/pub/apache/zookeeper/zookeeper-3.4.11/zookeeper-3.4.11.tar.gz -o zookeeper-3.4.11.tar.gz
|
||||
tar -xzf zookeeper-3.4.11.tar.gz
|
||||
cd zookeeper-3.4.11
|
||||
cp conf/zoo_sample.cfg conf/zoo.cfg
|
||||
./bin/zkServer.sh start
|
||||
```
|
||||
|
||||
## Start up Druid services
|
||||
|
||||
With Zookeeper running, return to the druid-#{DRUIDVERSION} directory. In that directory, issue the command:
|
||||
|
||||
```bash
|
||||
bin/init
|
||||
```
|
||||
|
||||
This will set up some directories for you. Next, you can start up the Druid processes in different terminal windows.
|
||||
This tutorial runs every Druid process on the same system. In a large distributed production cluster,
|
||||
many of these Druid processes can still be co-located together.
|
||||
|
||||
```bash
|
||||
java `cat conf-quickstart/druid/historical/jvm.config | xargs` -cp "conf-quickstart/druid/_common:conf-quickstart/druid/historical:lib/*" io.druid.cli.Main server historical
|
||||
java `cat conf-quickstart/druid/broker/jvm.config | xargs` -cp "conf-quickstart/druid/_common:conf-quickstart/druid/broker:lib/*" io.druid.cli.Main server broker
|
||||
java `cat conf-quickstart/druid/coordinator/jvm.config | xargs` -cp "conf-quickstart/druid/_common:conf-quickstart/druid/coordinator:lib/*" io.druid.cli.Main server coordinator
|
||||
java `cat conf-quickstart/druid/overlord/jvm.config | xargs` -cp "conf-quickstart/druid/_common:conf-quickstart/druid/overlord:lib/*" io.druid.cli.Main server overlord
|
||||
java `cat conf-quickstart/druid/middleManager/jvm.config | xargs` -cp "conf-quickstart/druid/_common:conf-quickstart/druid/middleManager:lib/*" io.druid.cli.Main server middleManager
|
||||
```
|
||||
|
||||
You should see a log message printed out for each service that starts up.
|
||||
|
||||
Later on, if you'd like to stop the services, CTRL-C to exit from the running java processes. If you
|
||||
want a clean start after stopping the services, delete the `var` directory and run the `init` script again.
|
||||
|
||||
Once every service has started, you are now ready to load data.
|
||||
|
||||
## Load batch data
|
||||
|
||||
We've included a sample of Wikipedia edits from September 12, 2015 to get you started.
|
||||
|
||||
<div class="note info">
|
||||
This section shows you how to load data in batches, but you can skip ahead to learn how to <a href="quickstart.html#load-streaming-data">load
|
||||
streams in real-time</a>. Druid's streaming ingestion can load data
|
||||
with virtually no delay between events occurring and being available for queries.
|
||||
</div>
|
||||
|
||||
The [dimensions](https://en.wikipedia.org/wiki/Dimension_%28data_warehouse%29) (attributes you can
|
||||
filter and split on) in the Wikipedia dataset, other than time, are:
|
||||
|
||||
* channel
|
||||
* cityName
|
||||
* comment
|
||||
* countryIsoCode
|
||||
* countryName
|
||||
* isAnonymous
|
||||
* isMinor
|
||||
* isNew
|
||||
* isRobot
|
||||
* isUnpatrolled
|
||||
* metroCode
|
||||
* namespace
|
||||
* page
|
||||
* regionIsoCode
|
||||
* regionName
|
||||
* user
|
||||
|
||||
The [measures](https://en.wikipedia.org/wiki/Measure_%28data_warehouse%29), or *metrics* as they are known in Druid (values you can aggregate)
|
||||
in the Wikipedia dataset are:
|
||||
|
||||
* count
|
||||
* added
|
||||
* deleted
|
||||
* delta
|
||||
* user_unique
|
||||
|
||||
To load this data into Druid, you can submit an *ingestion task* pointing to the file. We've included
|
||||
a task that loads the `wikiticker-2015-09-12-sampled.json` file included in the archive. To submit
|
||||
this task, POST it to Druid in a new terminal window from the druid-#{DRUIDVERSION} directory:
|
||||
|
||||
```bash
|
||||
curl -X 'POST' -H 'Content-Type:application/json' -d @quickstart/wikiticker-index.json localhost:8090/druid/indexer/v1/task
|
||||
```
|
||||
|
||||
Which will print the ID of the task if the submission was successful:
|
||||
|
||||
```bash
|
||||
{"task":"index_hadoop_wikipedia_2013-10-09T21:30:32.802Z"}
|
||||
```
|
||||
|
||||
To view the status of your ingestion task, go to your overlord console:
|
||||
[http://localhost:8090/console.html](http://localhost:8090/console.html). You can refresh the console periodically, and after
|
||||
the task is successful, you should see a "SUCCESS" status for the task.
|
||||
|
||||
After your ingestion task finishes, the data will be loaded by historical nodes and available for
|
||||
querying within a minute or two. You can monitor the progress of loading your data in the
|
||||
coordinator console, by checking whether there is a datasource "wikiticker" with a blue circle
|
||||
indicating "fully available": [http://localhost:8081/#/](http://localhost:8081/#/).
|
||||
|
||||
Once the data is fully available, you can immediately query it— to see how, skip to the [Query
|
||||
data](#query-data) section below. Or, continue to the [Load your own data](#load-your-own-data)
|
||||
section if you'd like to load a different dataset.
|
||||
|
||||
## Load streaming data
|
||||
|
||||
To load streaming data, we are going to push events into Druid
|
||||
over a simple HTTP API. To do this we will use [Tranquility], a high level data producer
|
||||
library for Druid.
|
||||
|
||||
To download Tranquility, issue the following commands in your terminal:
|
||||
|
||||
```bash
|
||||
curl -O http://static.druid.io/tranquility/releases/tranquility-distribution-0.8.0.tgz
|
||||
tar -xzf tranquility-distribution-0.8.0.tgz
|
||||
cd tranquility-distribution-0.8.0
|
||||
```
|
||||
|
||||
We've included a configuration file in `conf-quickstart/tranquility/server.json` as part of the Druid distribution
|
||||
for a *metrics* datasource. We're going to start the Tranquility server process, which can be used to push events
|
||||
directly to Druid.
|
||||
|
||||
``` bash
|
||||
bin/tranquility server -configFile <path_to_druid_distro>/conf-quickstart/tranquility/server.json
|
||||
```
|
||||
|
||||
<div class="note info">
|
||||
This section shows you how to load data using Tranquility Server, but Druid also supports a wide
|
||||
variety of <a href="../ingestion/stream-ingestion.html#stream-push">other streaming ingestion options</a>, including from
|
||||
popular streaming systems like Kafka, Storm, Samza, and Spark Streaming.
|
||||
</div>
|
||||
|
||||
The [dimensions](https://en.wikipedia.org/wiki/Dimension_%28data_warehouse%29) (attributes you can
|
||||
filter and split on) for this datasource are flexible. It's configured for *schemaless dimensions*,
|
||||
meaning it will accept any field in your JSON input as a dimension.
|
||||
|
||||
The metrics (also called
|
||||
[measures](https://en.wikipedia.org/wiki/Measure_%28data_warehouse%29); values
|
||||
you can aggregate) in this datasource are:
|
||||
|
||||
* count
|
||||
* value_sum (derived from `value` in the input)
|
||||
* value_min (derived from `value` in the input)
|
||||
* value_max (derived from `value` in the input)
|
||||
|
||||
We've included a script that can generate some random sample metrics to load into this datasource.
|
||||
To use it, simply run in your Druid distribution repository:
|
||||
|
||||
```bash
|
||||
bin/generate-example-metrics | curl -XPOST -H'Content-Type: application/json' --data-binary @- http://localhost:8200/v1/post/metrics
|
||||
```
|
||||
|
||||
Which will print something like:
|
||||
|
||||
```
|
||||
{"result":{"received":25,"sent":25}}
|
||||
```
|
||||
|
||||
This indicates that the HTTP server received 25 events from you, and sent 25 to Druid. Note that
|
||||
this may take a few seconds to finish the first time you run it, as Druid resources must be
|
||||
allocated to the ingestion task. Subsequent POSTs should complete quickly.
|
||||
|
||||
Once the data is sent to Druid, you can immediately [query it](#query-data).
|
||||
|
||||
## Query data
|
||||
|
||||
### Direct Druid queries
|
||||
|
||||
Druid supports a rich [family of JSON-based
|
||||
queries](../querying/querying.html). We've included an example topN query
|
||||
in `quickstart/wikiticker-top-pages.json` that will find the most-edited articles in this dataset:
|
||||
|
||||
```bash
|
||||
curl -L -H'Content-Type: application/json' -XPOST --data-binary @quickstart/wikiticker-top-pages.json http://localhost:8082/druid/v2/?pretty
|
||||
```
|
||||
|
||||
## Visualizing data
|
||||
|
||||
Druid is ideal for powering user-facing analytic applications. There are a number of different open source applications to
|
||||
visualize and explore data in Druid. We recommend trying [Pivot](https://github.com/implydata/pivot),
|
||||
[Superset](https://github.com/airbnb/superset), or [Metabase](https://github.com/metabase/metabase) to start
|
||||
visualizing the data you just ingested.
|
||||
|
||||
If you installed Pivot for example, you should be able to view your data in your browser at [localhost:9090](http://localhost:9090/).
|
||||
|
||||
### SQL and other query libraries
|
||||
|
||||
There are many more query tools for Druid than we've included here, including SQL
|
||||
engines, and libraries for various languages like Python and Ruby. Please see [the list of
|
||||
libraries](../development/libraries.html) for more information.
|
||||
|
||||
## Clustered setup
|
||||
|
||||
This quickstart sets you up with all services running on a single machine. The next step is to [load
|
||||
your own data](ingestion.html). Or, you can skip ahead to [running a distributed cluster](cluster.html).
|
|
@ -0,0 +1,232 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
|
||||
# Tutorial: Load batch data using Hadoop
|
||||
|
||||
This tutorial shows you how to load data files into Druid using a remote Hadoop cluster.
|
||||
|
||||
For this tutorial, we'll assume that you've already completed the previous [batch ingestion tutorial](tutorial-batch.html) using Druid's native batch ingestion system.
|
||||
|
||||
## Install Docker
|
||||
|
||||
This tutorial requires [Docker](https://docs.docker.com/install/) to be installed on the tutorial machine.
|
||||
|
||||
Once the Docker install is complete, please proceed to the next steps in the tutorial.
|
||||
|
||||
## Build the Hadoop docker image
|
||||
|
||||
For this tutorial, we've provided a Dockerfile for a Hadoop 2.8.3 cluster, which we'll use to run the batch indexing task.
|
||||
|
||||
This Dockerfile and related files are located at `quickstart/tutorial/hadoop/docker`.
|
||||
|
||||
From the druid-${DRUIDVERSION} package root, run the following commands to build a Docker image named "druid-hadoop-demo" with version tag "2.8.3":
|
||||
|
||||
```
|
||||
cd quickstart/tutorial/hadoop/docker
|
||||
docker build -t druid-hadoop-demo:2.8.3 .
|
||||
```
|
||||
|
||||
This will start building the Hadoop image. Once the image build is done, you should see the message `Successfully tagged druid-hadoop-demo:2.8.3` printed to the console.
|
||||
|
||||
## Setup the Hadoop docker cluster
|
||||
|
||||
### Create temporary shared directory
|
||||
|
||||
We'll need a shared folder between the host and the Hadoop container for transferring some files.
|
||||
|
||||
Let's create some folders under `/tmp`; we will use these later when starting the Hadoop container:
|
||||
|
||||
```
|
||||
mkdir -p /tmp/shared
|
||||
mkdir -p /tmp/shared/hadoop_xml
|
||||
```
|
||||
|
||||
### Configure /etc/hosts
|
||||
|
||||
On the host machine, add the following entry to `/etc/hosts`:
|
||||
|
||||
```
|
||||
127.0.0.1 druid-hadoop-demo
|
||||
```
|
||||
|
||||
### Start the Hadoop container
|
||||
|
||||
Once the `/tmp/shared` folder has been created and the `/etc/hosts` entry has been added, run the following command to start the Hadoop container.
|
||||
|
||||
```
|
||||
docker run -it -h druid-hadoop-demo -p 50010:50010 -p 50020:50020 -p 50075:50075 -p 50090:50090 -p 8020:8020 -p 10020:10020 -p 19888:19888 -p 8030:8030 -p 8031:8031 -p 8032:8032 -p 8033:8033 -p 8040:8040 -p 8042:8042 -p 8088:8088 -p 8443:8443 -p 2049:2049 -p 9000:9000 -p 49707:49707 -p 2122:2122 -p 34455:34455 -v /tmp/shared:/shared druid-hadoop-demo:2.8.3 /etc/bootstrap.sh -bash
|
||||
```
|
||||
|
||||
Once the container is started, your terminal will attach to a bash shell running inside the container:
|
||||
|
||||
```
|
||||
Starting sshd: [ OK ]
|
||||
18/07/26 17:27:15 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
|
||||
Starting namenodes on [druid-hadoop-demo]
|
||||
druid-hadoop-demo: starting namenode, logging to /usr/local/hadoop/logs/hadoop-root-namenode-druid-hadoop-demo.out
|
||||
localhost: starting datanode, logging to /usr/local/hadoop/logs/hadoop-root-datanode-druid-hadoop-demo.out
|
||||
Starting secondary namenodes [0.0.0.0]
|
||||
0.0.0.0: starting secondarynamenode, logging to /usr/local/hadoop/logs/hadoop-root-secondarynamenode-druid-hadoop-demo.out
|
||||
18/07/26 17:27:31 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
|
||||
starting yarn daemons
|
||||
starting resourcemanager, logging to /usr/local/hadoop/logs/yarn--resourcemanager-druid-hadoop-demo.out
|
||||
localhost: starting nodemanager, logging to /usr/local/hadoop/logs/yarn-root-nodemanager-druid-hadoop-demo.out
|
||||
starting historyserver, logging to /usr/local/hadoop/logs/mapred--historyserver-druid-hadoop-demo.out
|
||||
bash-4.1#
|
||||
```
|
||||
|
||||
The `Unable to load native-hadoop library for your platform... using builtin-java classes where applicable` warning messages can be safely ignored.
|
||||
|
||||
### Copy input data to the Hadoop container
|
||||
|
||||
From the druid-${DRUIDVERSION} package root on the host, copy the `quickstart/wikiticker-2015-09-12-sampled.json.gz` sample data to the shared folder:
|
||||
|
||||
```
|
||||
cp quickstart/wikiticker-2015-09-12-sampled.json.gz /tmp/shared/wikiticker-2015-09-12-sampled.json.gz
|
||||
```
|
||||
|
||||
### Setup HDFS directories
|
||||
|
||||
In the Hadoop container's shell, run the following commands to set up the HDFS directories needed by this tutorial and copy the input data to HDFS.
|
||||
|
||||
```
|
||||
cd /usr/local/hadoop/bin
|
||||
./hadoop fs -mkdir /druid
|
||||
./hadoop fs -mkdir /druid/segments
|
||||
./hadoop fs -mkdir /quickstart
|
||||
./hadoop fs -chmod 777 /druid
|
||||
./hadoop fs -chmod 777 /druid/segments
|
||||
./hadoop fs -chmod 777 /quickstart
|
||||
./hadoop fs -chmod -R 777 /tmp
|
||||
./hadoop fs -chmod -R 777 /user
|
||||
./hadoop fs -put /shared/wikiticker-2015-09-12-sampled.json.gz /quickstart/wikiticker-2015-09-12-sampled.json.gz
|
||||
```
|
||||
|
||||
If you encounter namenode errors when running these commands, the Hadoop container may not be finished initializing. When this occurs, wait a couple of minutes and retry the commands.
|
||||
|
||||
## Configure Druid to use Hadoop
|
||||
|
||||
Some additional steps are needed to configure the Druid cluster for Hadoop batch indexing.
|
||||
|
||||
### Copy Hadoop configuration to Druid classpath
|
||||
|
||||
From the Hadoop container's shell, run the following command to copy the Hadoop .xml configuration files to the shared folder:
|
||||
|
||||
```
|
||||
cp /usr/local/hadoop/etc/hadoop/*.xml /shared/hadoop_xml
|
||||
```
|
||||
|
||||
From the host machine, run the following, where {PATH_TO_DRUID} is replaced by the path to the Druid package.
|
||||
|
||||
```
|
||||
mkdir -p {PATH_TO_DRUID}/quickstart/tutorial/conf/druid/_common/hadoop-xml
|
||||
cp /tmp/shared/hadoop_xml/*.xml {PATH_TO_DRUID}/quickstart/tutorial/conf/druid/_common/hadoop-xml/
|
||||
```
|
||||
|
||||
### Update Druid segment and log storage
|
||||
|
||||
In your favorite text editor, open `quickstart/tutorial/conf/druid/_common/common.runtime.properties`, and make the following edits:
|
||||
|
||||
#### Disable local deep storage and enable HDFS deep storage
|
||||
|
||||
```
|
||||
#
|
||||
# Deep storage
|
||||
#
|
||||
|
||||
# For local disk (only viable in a cluster if this is a network mount):
|
||||
#druid.storage.type=local
|
||||
#druid.storage.storageDirectory=var/druid/segments
|
||||
|
||||
# For HDFS:
|
||||
druid.storage.type=hdfs
|
||||
druid.storage.storageDirectory=/druid/segments
|
||||
```
|
||||
|
||||
|
||||
#### Disable local log storage and enable HDFS log storage
|
||||
|
||||
```
|
||||
#
|
||||
# Indexing service logs
|
||||
#
|
||||
|
||||
# For local disk (only viable in a cluster if this is a network mount):
|
||||
#druid.indexer.logs.type=file
|
||||
#druid.indexer.logs.directory=var/druid/indexing-logs
|
||||
|
||||
# For HDFS:
|
||||
druid.indexer.logs.type=hdfs
|
||||
druid.indexer.logs.directory=/druid/indexing-logs
|
||||
|
||||
```
|
||||
|
||||
### Restart Druid cluster
|
||||
|
||||
Once the Hadoop .xml files have been copied to the Druid cluster and the segment/log storage configuration has been updated to use HDFS, the Druid cluster needs to be restarted for the new configurations to take effect.
|
||||
|
||||
If the cluster is still running, press CTRL-C to terminate the `bin/supervise` script, and re-run it to bring the Druid services back up.
|
||||
|
||||
## Load batch data
|
||||
|
||||
We've included a sample of Wikipedia edits from September 12, 2015 to get you started.
|
||||
|
||||
To load this data into Druid, you can submit an *ingestion task* pointing to the file. We've included
|
||||
a task that loads the `wikiticker-2015-09-12-sampled.json.gz` file included in the archive.
|
||||
|
||||
Let's submit the `wikipedia-index-hadoop.json` task:
|
||||
|
||||
```
|
||||
bin/post-index-task --file quickstart/tutorial/wikipedia-index-hadoop.json
|
||||
```
|
||||
|
||||
## Querying your data
|
||||
|
||||
After the data load is complete, please follow the [query tutorial](../tutorials/tutorial-query.html) to run some example queries on the newly loaded data.
|
||||
|
||||
## Cleanup
|
||||
|
||||
This tutorial is only meant to be used together with the [query tutorial](../tutorials/tutorial-query.html).
|
||||
|
||||
If you wish to go through any of the other tutorials, you will need to:
|
||||
* Shut down the cluster and reset the cluster state by removing the contents of the `var` directory under the druid package.
|
||||
* Revert the deep storage and indexing log storage config back to local types in `quickstart/tutorial/conf/druid/_common/common.runtime.properties`
|
||||
* Restart the cluster
|
||||
|
||||
This is necessary because the other ingestion tutorials will write to the same "wikipedia" datasource, and later tutorials expect the cluster to use local deep storage.
|
||||
|
||||
Example reverted config:
|
||||
|
||||
```
|
||||
#
|
||||
# Deep storage
|
||||
#
|
||||
|
||||
# For local disk (only viable in a cluster if this is a network mount):
|
||||
druid.storage.type=local
|
||||
druid.storage.storageDirectory=var/druid/segments
|
||||
|
||||
# For HDFS:
|
||||
#druid.storage.type=hdfs
|
||||
#druid.storage.storageDirectory=/druid/segments
|
||||
|
||||
#
|
||||
# Indexing service logs
|
||||
#
|
||||
|
||||
# For local disk (only viable in a cluster if this is a network mount):
|
||||
druid.indexer.logs.type=file
|
||||
druid.indexer.logs.directory=var/druid/indexing-logs
|
||||
|
||||
# For HDFS:
|
||||
#druid.indexer.logs.type=hdfs
|
||||
#druid.indexer.logs.directory=/druid/indexing-logs
|
||||
|
||||
```
|
||||
|
||||
|
||||
## Further reading
|
||||
|
||||
For more information on loading batch data with Hadoop, please see [the Hadoop batch ingestion documentation](../ingestion/hadoop.html).
|
||||
|
|
@ -2,137 +2,157 @@
|
|||
layout: doc_page
|
||||
---
|
||||
|
||||
# Tutorial: Load your own batch data
|
||||
# Tutorial: Loading a file
|
||||
|
||||
## Getting started
|
||||
|
||||
This tutorial shows you how to load your own data files into Druid.
|
||||
This tutorial demonstrates how to perform a batch file load, using Druid's native batch ingestion.
|
||||
|
||||
For this tutorial, we'll assume you've already downloaded Druid as described in
|
||||
the [single-machine quickstart](quickstart.html) and have it running on your local machine. You
|
||||
the [single-machine quickstart](index.html) and have it running on your local machine. You
|
||||
don't need to have loaded any data yet.
|
||||
|
||||
Once that's complete, you can load your own dataset by writing a custom ingestion spec.
|
||||
## Preparing the data and the ingestion task spec
|
||||
|
||||
## Writing an ingestion spec
|
||||
A data load is initiated by submitting an *ingestion task* spec to the Druid overlord. For this tutorial, we'll be loading the sample Wikipedia page edits data.
|
||||
|
||||
When loading files into Druid, you will use Druid's [batch loading](../ingestion/batch-ingestion.html) process.
|
||||
There's an example batch ingestion spec in `quickstart/wikiticker-index.json` that you can modify
|
||||
for your own needs.
|
||||
The Druid package includes the following sample native batch ingestion task spec at `quickstart/tutorial/wikipedia-index.json`, shown here for convenience,
|
||||
which has been configured to read the `quickstart/wikiticker-2015-09-12-sampled.json.gz` input file:
|
||||
|
||||
The most important questions are:
|
||||
|
||||
* What should the dataset be called? This is the "dataSource" field of the "dataSchema".
|
||||
* Where is the dataset located? The file paths belong in the "paths" of the "inputSpec". If you
|
||||
want to load multiple files, you can provide them as a comma-separated string.
|
||||
* Which field should be treated as a timestamp? This belongs in the "column" of the "timestampSpec".
|
||||
* Which fields should be treated as dimensions? This belongs in the "dimensions" of the "dimensionsSpec".
|
||||
* Which fields should be treated as metrics? This belongs in the "metricsSpec".
|
||||
* What time ranges (intervals) are being loaded? This belongs in the "intervals" of the "granularitySpec".
|
||||
|
||||
If your data does not have a natural sense of time, you can tag each row with the current time.
|
||||
You can also tag all rows with a fixed timestamp, like "2000-01-01T00:00:00.000Z".
|
||||
|
||||
Let's use this pageviews dataset as an example. Druid supports TSV, CSV, and JSON out of the box.
|
||||
Note that nested JSON objects are not supported, so if you do use JSON, you should provide a file
|
||||
containing flattened objects.
|
||||
|
||||
```json
|
||||
{"time": "2015-09-01T00:00:00Z", "url": "/foo/bar", "user": "alice", "latencyMs": 32}
|
||||
{"time": "2015-09-01T01:00:00Z", "url": "/", "user": "bob", "latencyMs": 11}
|
||||
{"time": "2015-09-01T01:30:00Z", "url": "/foo/bar", "user": "bob", "latencyMs": 45}
|
||||
```
|
||||
|
||||
Make sure the file has no newline at the end. If you save this to a file called "pageviews.json", then for this dataset:
|
||||
|
||||
* Let's call the dataset "pageviews".
|
||||
* The data is located in "pageviews.json".
|
||||
* The timestamp is the "time" field.
|
||||
* Good choices for dimensions are the string fields "url" and "user".
|
||||
* Good choices for metrics are a count of pageviews, and the sum of "latencyMs". Collecting that
|
||||
sum when we load the data will allow us to compute an average at query time as well.
|
||||
* The data covers the time range 2015-09-01 (inclusive) through 2015-09-02 (exclusive).
|
||||
|
||||
You can copy the existing `quickstart/wikiticker-index.json` indexing task to a new file:
|
||||
|
||||
```bash
|
||||
cp quickstart/wikiticker-index.json my-index-task.json
|
||||
```
|
||||
|
||||
And modify it by altering these sections:
|
||||
|
||||
```json
|
||||
"dataSource": "pageviews"
|
||||
```
|
||||
|
||||
```json
|
||||
"inputSpec": {
|
||||
"type": "static",
|
||||
"paths": "pageviews.json"
|
||||
{
|
||||
"type" : "index",
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "wikipedia",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"dimensionsSpec" : {
|
||||
"dimensions" : [
|
||||
"channel",
|
||||
"cityName",
|
||||
"comment",
|
||||
"countryIsoCode",
|
||||
"countryName",
|
||||
"isAnonymous",
|
||||
"isMinor",
|
||||
"isNew",
|
||||
"isRobot",
|
||||
"isUnpatrolled",
|
||||
"metroCode",
|
||||
"namespace",
|
||||
"page",
|
||||
"regionIsoCode",
|
||||
"regionName",
|
||||
"user",
|
||||
{ "name": "added", "type": "long" },
|
||||
{ "name": "deleted", "type": "long" },
|
||||
{ "name": "delta", "type": "long" }
|
||||
]
|
||||
},
|
||||
"timestampSpec": {
|
||||
"column": "time",
|
||||
"format": "iso"
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "day",
|
||||
"queryGranularity" : "none",
|
||||
"intervals" : ["2015-09-12/2015-09-13"],
|
||||
"rollup" : false
|
||||
}
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "index",
|
||||
"firehose" : {
|
||||
"type" : "local",
|
||||
"baseDir" : "quickstart/",
|
||||
"filter" : "wikiticker-2015-09-12-sampled.json.gz"
|
||||
},
|
||||
"appendToExisting" : false
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "index",
|
||||
"targetPartitionSize" : 5000000,
|
||||
"maxRowsInMemory" : 25000,
|
||||
"forceExtendableShardSpecs" : true
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
```json
|
||||
"timestampSpec": {
|
||||
"format": "auto",
|
||||
"column": "time"
|
||||
}
|
||||
This spec will create a datasource named "wikipedia".
|
||||
|
||||
## Load batch data
|
||||
|
||||
We've included a sample of Wikipedia edits from September 12, 2015 to get you started.
|
||||
|
||||
To load this data into Druid, you can submit an *ingestion task* pointing to the file. We've included
|
||||
a task that loads the `wikiticker-2015-09-12-sampled.json.gz` file included in the archive.
|
||||
|
||||
For convenience, the Druid package includes a batch ingestion helper script at `bin/post-index-task`.
|
||||
|
||||
This script will POST an ingestion task to the Druid overlord and poll Druid until the data is available for querying.
|
||||
|
||||
Run the following command from Druid package root:
|
||||
|
||||
```
|
||||
bin/post-index-task --file quickstart/tutorial/wikipedia-index.json
|
||||
```
|
||||
|
||||
```json
|
||||
"dimensionsSpec": {
|
||||
"dimensions": ["url", "user"]
|
||||
}
|
||||
You should see output like the following:
|
||||
|
||||
```
|
||||
|
||||
```json
|
||||
"metricsSpec": [
|
||||
{"name": "views", "type": "count"},
|
||||
{"name": "latencyMs", "type": "doubleSum", "fieldName": "latencyMs"}
|
||||
]
|
||||
Beginning indexing data for wikipedia
|
||||
Task started: index_wikipedia_2018-07-27T06:37:44.323Z
|
||||
Task log: http://localhost:8090/druid/indexer/v1/task/index_wikipedia_2018-07-27T06:37:44.323Z/log
|
||||
Task status: http://localhost:8090/druid/indexer/v1/task/index_wikipedia_2018-07-27T06:37:44.323Z/status
|
||||
Task index_wikipedia_2018-07-27T06:37:44.323Z still running...
|
||||
Task index_wikipedia_2018-07-27T06:37:44.323Z still running...
|
||||
Task finished with status: SUCCESS
|
||||
Completed indexing data for wikipedia. Now loading indexed data onto the cluster...
|
||||
wikipedia loading complete! You may now query your data
|
||||
```
|
||||
|
||||
```json
|
||||
"granularitySpec": {
|
||||
"type": "uniform",
|
||||
"segmentGranularity": "day",
|
||||
"queryGranularity": "none",
|
||||
"intervals": ["2015-09-01/2015-09-02"]
|
||||
}
|
||||
```
|
||||
|
||||
## Running the task
|
||||
|
||||
To actually run this task, first make sure that the indexing task can read *pageviews.json*:
|
||||
|
||||
- If you're running locally (no configuration for connecting to Hadoop; this is the default) then
|
||||
place it in the root of the Druid distribution.
|
||||
- If you configured Druid to connect to a Hadoop cluster, upload
|
||||
the pageviews.json file to HDFS. You may need to adjust the `paths` in the ingestion spec.
|
||||
|
||||
To kick off the indexing process, POST your indexing task to the Druid Overlord. In a standard Druid
|
||||
install, the URL is `http://OVERLORD_IP:8090/druid/indexer/v1/task`.
|
||||
|
||||
```bash
|
||||
curl -X 'POST' -H 'Content-Type:application/json' -d @my-index-task.json OVERLORD_IP:8090/druid/indexer/v1/task
|
||||
```
|
||||
|
||||
If you're running everything on a single machine, you can use localhost:
|
||||
|
||||
```bash
|
||||
curl -X 'POST' -H 'Content-Type:application/json' -d @my-index-task.json localhost:8090/druid/indexer/v1/task
|
||||
```
|
||||
|
||||
If anything goes wrong with this task (e.g. it finishes with status FAILED), you can troubleshoot
|
||||
by visiting the "Task log" on the [overlord console](http://localhost:8090/console.html).
|
||||
|
||||
## Querying your data
|
||||
|
||||
Your data should become fully available within a minute or two. You can monitor this process on
|
||||
your Coordinator console at [http://localhost:8081/#/](http://localhost:8081/#/).
|
||||
Once the data is loaded, please follow the [query tutorial](../tutorials/tutorial-query.html) to run some example queries on the newly loaded data.
|
||||
|
||||
Once your data is fully available, you can query it using any of the
|
||||
[supported query methods](../querying/querying.html).
|
||||
## Cleanup
|
||||
|
||||
If you wish to go through any of the other ingestion tutorials, you will need to shut down the cluster and reset the cluster state by removing the contents of the `var` directory under the druid package, as the other tutorials will write to the same "wikipedia" datasource.
|
||||
|
||||
## Extra: Loading data without the script
|
||||
|
||||
Let's briefly discuss how we would've submitted the ingestion task without using the script. You do not need to run these commands.
|
||||
|
||||
To submit the task, POST it to Druid in a new terminal window from the druid-#{DRUIDVERSION} directory:
|
||||
|
||||
```bash
|
||||
curl -X 'POST' -H 'Content-Type:application/json' -d @quickstart/tutorial/wikipedia-index.json http://localhost:8090/druid/indexer/v1/task
|
||||
```
|
||||
|
||||
Which will print the ID of the task if the submission was successful:
|
||||
|
||||
```bash
|
||||
{"task":"index_wikipedia_2018-06-09T21:30:32.802Z"}
|
||||
```
|
||||
|
||||
To view the status of the ingestion task, go to the overlord console:
|
||||
[http://localhost:8090/console.html](http://localhost:8090/console.html). You can refresh the console periodically, and after
|
||||
the task is successful, you should see a "SUCCESS" status for the task.
|
||||
|
||||
After the ingestion task finishes, the data will be loaded by historical nodes and available for
|
||||
querying within a minute or two. You can monitor the progress of loading the data in the
|
||||
coordinator console, by checking whether there is a datasource "wikipedia" with a blue circle
|
||||
indicating "fully available": [http://localhost:8081/#/](http://localhost:8081/#/).
|
||||
|
||||
![Coordinator console](../tutorials/img/tutorial-batch-01.png "Wikipedia 100% loaded")
|
||||
|
||||
## Further reading
|
||||
|
||||
|
|
|
@ -0,0 +1,106 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
|
||||
# Tutorial: Compacting segments
|
||||
|
||||
This tutorial demonstrates how to compact existing segments into fewer but larger segments.
|
||||
|
||||
Because there is some per-segment memory and processing overhead, it can sometimes be beneficial to reduce the total number of segments.
|
||||
|
||||
For this tutorial, we'll assume you've already downloaded Druid as described in
|
||||
the [single-machine quickstart](index.html) and have it running on your local machine.
|
||||
|
||||
It will also be helpful to have finished [Tutorial: Loading a file](/docs/VERSION/tutorials/tutorial-batch.html) and [Tutorial: Querying data](/docs/VERSION/tutorials/tutorial-query.html).
|
||||
|
||||
## Load the initial data
|
||||
|
||||
For this tutorial, we'll be using the Wikipedia edits sample data, with an ingestion task spec that will create a separate segment for each hour in the input data.
|
||||
|
||||
The ingestion spec can be found at `quickstart/tutorial/compaction-init-index.json`. Let's submit that spec, which will create a datasource called `compaction-tutorial`:
|
||||
|
||||
```
|
||||
bin/post-index-task --file quickstart/tutorial/compaction-init-index.json
|
||||
```
|
||||
|
||||
After the ingestion completes, go to http://localhost:8081/#/datasources/compaction-tutorial in a browser to view information about the new datasource in the Coordinator console.
|
||||
|
||||
There will be 24 segments for this datasource, one segment per hour in the input data:
|
||||
|
||||
![Original segments](../tutorials/img/tutorial-retention-01.png "Original segments")
|
||||
|
||||
Running a COUNT(*) query on this datasource shows that there are 39,244 rows:
|
||||
|
||||
```
|
||||
dsql> select count(*) from "compaction-tutorial";
|
||||
┌────────┐
|
||||
│ EXPR$0 │
|
||||
├────────┤
|
||||
│ 39244 │
|
||||
└────────┘
|
||||
Retrieved 1 row in 1.38s.
|
||||
```
|
||||
|
||||
## Compact the data
|
||||
|
||||
Let's now combine these 24 segments into one segment.
|
||||
|
||||
We have included a compaction task spec for this tutorial datasource at `quickstart/tutorial/compaction-final-index.json`:
|
||||
|
||||
```
|
||||
{
|
||||
"type": "compact",
|
||||
"dataSource": "compaction-tutorial",
|
||||
"interval": "2015-09-12/2015-09-13",
|
||||
"tuningConfig" : {
|
||||
"type" : "index",
|
||||
"targetPartitionSize" : 5000000,
|
||||
"maxRowsInMemory" : 25000,
|
||||
"forceExtendableShardSpecs" : true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
This will compact all segments for the interval `2015-09-12/2015-09-13` in the `compaction-tutorial` datasource.
|
||||
|
||||
The parameters in the `tuningConfig` control how many segments will be present in the compacted set of segments.
|
||||
|
||||
In this tutorial example, only one compacted segment will be created, as the 39244 rows in the input are fewer than the 5000000 `targetPartitionSize`.
|
||||
|
||||
Let's submit this task now:
|
||||
|
||||
```
|
||||
bin/post-index-task --file quickstart/tutorial/compaction-final-index.json
|
||||
```
|
||||
|
||||
After the task finishes, refresh the http://localhost:8081/#/datasources/compaction-tutorial page.
|
||||
|
||||
The original 24 segments will eventually be marked as "unused" by the Coordinator and removed, with the new compacted segment remaining.
|
||||
|
||||
By default, the Druid coordinator will not mark segments as unused until the coordinator process has been up for at least 15 minutes, so you may see the old segment set and the new compacted set at the same time in the coordinator, e.g.:
|
||||
|
||||
![Compacted segments intermediate state](../tutorials/img/tutorial-compaction-01.png "Compacted segments intermediate state")
|
||||
|
||||
The new compacted segment has a more recent version than the original segments, so even when both sets of segments are shown by the coordinator, queries will only read from the new compacted segment.
|
||||
|
||||
Let's try running a COUNT(*) on `compaction-tutorial` again, where the row count should still be 39,244:
|
||||
|
||||
```
|
||||
dsql> select count(*) from "compaction-tutorial";
|
||||
┌────────┐
|
||||
│ EXPR$0 │
|
||||
├────────┤
|
||||
│ 39244 │
|
||||
└────────┘
|
||||
Retrieved 1 row in 1.30s.
|
||||
```
|
||||
|
||||
After the coordinator has been running for at least 15 minutes, the http://localhost:8081/#/datasources/compaction-tutorial page should show there is only 1 segment:
|
||||
|
||||
![Compacted segments final state](../tutorials/img/tutorial-compaction-02.png "Compacted segments final state")
|
||||
|
||||
## Further reading
|
||||
|
||||
[Task documentation](../ingestion/tasks.html)
|
||||
|
||||
[Segment optimization](../operations/segment-optimization.html)
|
|
@ -0,0 +1,156 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
|
||||
# Tutorial: Deleting data
|
||||
|
||||
This tutorial demonstrates how to delete existing data.
|
||||
|
||||
For this tutorial, we'll assume you've already downloaded Druid as described in
|
||||
the [single-machine quickstart](index.html) and have it running on your local machine.
|
||||
|
||||
Completing [Tutorial: Configuring retention](/docs/VERSION/tutorials/tutorial-retention.html) first is highly recommended, as we will be using retention rules in this tutorial.
|
||||
|
||||
## Load initial data
|
||||
|
||||
In this tutorial, we will use the Wikipedia edits data, with an indexing spec that creates hourly segments. This spec is located at `quickstart/tutorial/deletion-index.json`, and it creates a datasource called `deletion-tutorial`.
|
||||
|
||||
Let's load this initial data:
|
||||
|
||||
```
|
||||
bin/post-index-task --file quickstart/tutorial/deletion-index.json
|
||||
```
|
||||
|
||||
When the load finishes, open http://localhost:8081/#/datasources/deletion-tutorial in a browser.
|
||||
|
||||
## How to permanently delete data
|
||||
|
||||
Permanent deletion of a Druid segment has two steps:
|
||||
|
||||
1. The segment must first be marked as "unused". This occurs when a segment is dropped by retention rules, and when a user manually disables a segment through the Coordinator API. This tutorial will cover both cases.
|
||||
2. After segments have been marked as "unused", a Kill Task will delete any "unused" segments from Druid's metadata store as well as deep storage.
|
||||
|
||||
Let's drop some segments now, first with load rules, then manually.
|
||||
|
||||
## Drop some data with load rules
|
||||
|
||||
As with the previous retention tutorial, there are currently 24 segments in the `deletion-tutorial` datasource.
|
||||
|
||||
Click the `edit rules` button with a pencil icon at the upper left corner of the page.
|
||||
|
||||
A rule configuration window will appear. Enter `tutorial` for both the user and changelog comment fields.
|
||||
|
||||
Now click the `+ Add a rule` button twice.
|
||||
|
||||
In the `rule #1` box at the top, click `Load`, `Interval`, enter `2015-09-12T12:00:00.000Z/2015-09-13T00:00:00.000Z` in the interval box, and click `+ _default_tier replicant`.
|
||||
|
||||
In the `rule #2` box at the bottom, click `Drop` and `Forever`.
|
||||
|
||||
This will cause the first 12 segments of `deletion-tutorial` to be dropped. However, these dropped segments are not removed from deep storage.
|
||||
|
||||
You can see that all 24 segments are still present in deep storage by listing the contents of `druid-{DRUIDVERSION}/var/druid/segments/deletion-tutorial`:
|
||||
|
||||
```
|
||||
$ ls -l1 var/druid/segments/deletion-tutorial/
|
||||
2015-09-12T00:00:00.000Z_2015-09-12T01:00:00.000Z
|
||||
2015-09-12T01:00:00.000Z_2015-09-12T02:00:00.000Z
|
||||
2015-09-12T02:00:00.000Z_2015-09-12T03:00:00.000Z
|
||||
2015-09-12T03:00:00.000Z_2015-09-12T04:00:00.000Z
|
||||
2015-09-12T04:00:00.000Z_2015-09-12T05:00:00.000Z
|
||||
2015-09-12T05:00:00.000Z_2015-09-12T06:00:00.000Z
|
||||
2015-09-12T06:00:00.000Z_2015-09-12T07:00:00.000Z
|
||||
2015-09-12T07:00:00.000Z_2015-09-12T08:00:00.000Z
|
||||
2015-09-12T08:00:00.000Z_2015-09-12T09:00:00.000Z
|
||||
2015-09-12T09:00:00.000Z_2015-09-12T10:00:00.000Z
|
||||
2015-09-12T10:00:00.000Z_2015-09-12T11:00:00.000Z
|
||||
2015-09-12T11:00:00.000Z_2015-09-12T12:00:00.000Z
|
||||
2015-09-12T12:00:00.000Z_2015-09-12T13:00:00.000Z
|
||||
2015-09-12T13:00:00.000Z_2015-09-12T14:00:00.000Z
|
||||
2015-09-12T14:00:00.000Z_2015-09-12T15:00:00.000Z
|
||||
2015-09-12T15:00:00.000Z_2015-09-12T16:00:00.000Z
|
||||
2015-09-12T16:00:00.000Z_2015-09-12T17:00:00.000Z
|
||||
2015-09-12T17:00:00.000Z_2015-09-12T18:00:00.000Z
|
||||
2015-09-12T18:00:00.000Z_2015-09-12T19:00:00.000Z
|
||||
2015-09-12T19:00:00.000Z_2015-09-12T20:00:00.000Z
|
||||
2015-09-12T20:00:00.000Z_2015-09-12T21:00:00.000Z
|
||||
2015-09-12T21:00:00.000Z_2015-09-12T22:00:00.000Z
|
||||
2015-09-12T22:00:00.000Z_2015-09-12T23:00:00.000Z
|
||||
2015-09-12T23:00:00.000Z_2015-09-13T00:00:00.000Z
|
||||
```
|
||||
|
||||
## Manually disable a segment
|
||||
|
||||
Let's manually disable a segment now. This will mark a segment as "unused", but not remove it from deep storage.
|
||||
|
||||
On http://localhost:8081/#/datasources/deletion-tutorial, click one of the remaining segments on the left for full details about the segment:
|
||||
|
||||
![Segments](../tutorials/img/tutorial-deletion-01.png "Segments")
|
||||
|
||||
The top of the info box shows the full segment ID, e.g. `deletion-tutorial_2015-09-12T14:00:00.000Z_2015-09-12T15:00:00.000Z_2018-07-27T22:57:00.110Z` for the segment of hour 14.
|
||||
|
||||
Let's disable the hour 14 segment by sending the following DELETE request to the coordinator, where {SEGMENT-ID} is the full segment ID shown in the info box:
|
||||
|
||||
```
|
||||
curl -XDELETE http://localhost:8081/druid/coordinator/v1/datasources/deletion-tutorial/segments/{SEGMENT-ID}
|
||||
```
|
||||
|
||||
After that command completes, you should see that the segment for hour 14 has been disabled:
|
||||
|
||||
![Segments 2](../tutorials/img/tutorial-deletion-02.png "Segments 2")
|
||||
|
||||
Note that the hour 14 segment is still in deep storage:
|
||||
|
||||
```
|
||||
$ ls -l1 var/druid/segments/deletion-tutorial/
|
||||
2015-09-12T00:00:00.000Z_2015-09-12T01:00:00.000Z
|
||||
2015-09-12T01:00:00.000Z_2015-09-12T02:00:00.000Z
|
||||
2015-09-12T02:00:00.000Z_2015-09-12T03:00:00.000Z
|
||||
2015-09-12T03:00:00.000Z_2015-09-12T04:00:00.000Z
|
||||
2015-09-12T04:00:00.000Z_2015-09-12T05:00:00.000Z
|
||||
2015-09-12T05:00:00.000Z_2015-09-12T06:00:00.000Z
|
||||
2015-09-12T06:00:00.000Z_2015-09-12T07:00:00.000Z
|
||||
2015-09-12T07:00:00.000Z_2015-09-12T08:00:00.000Z
|
||||
2015-09-12T08:00:00.000Z_2015-09-12T09:00:00.000Z
|
||||
2015-09-12T09:00:00.000Z_2015-09-12T10:00:00.000Z
|
||||
2015-09-12T10:00:00.000Z_2015-09-12T11:00:00.000Z
|
||||
2015-09-12T11:00:00.000Z_2015-09-12T12:00:00.000Z
|
||||
2015-09-12T12:00:00.000Z_2015-09-12T13:00:00.000Z
|
||||
2015-09-12T13:00:00.000Z_2015-09-12T14:00:00.000Z
|
||||
2015-09-12T14:00:00.000Z_2015-09-12T15:00:00.000Z
|
||||
2015-09-12T15:00:00.000Z_2015-09-12T16:00:00.000Z
|
||||
2015-09-12T16:00:00.000Z_2015-09-12T17:00:00.000Z
|
||||
2015-09-12T17:00:00.000Z_2015-09-12T18:00:00.000Z
|
||||
2015-09-12T18:00:00.000Z_2015-09-12T19:00:00.000Z
|
||||
2015-09-12T19:00:00.000Z_2015-09-12T20:00:00.000Z
|
||||
2015-09-12T20:00:00.000Z_2015-09-12T21:00:00.000Z
|
||||
2015-09-12T21:00:00.000Z_2015-09-12T22:00:00.000Z
|
||||
2015-09-12T22:00:00.000Z_2015-09-12T23:00:00.000Z
|
||||
2015-09-12T23:00:00.000Z_2015-09-13T00:00:00.000Z
|
||||
```
|
||||
|
||||
## Run a kill task
|
||||
|
||||
Now that we have disabled some segments, we can submit a Kill Task, which will delete the disabled segments from metadata and deep storage.
|
||||
|
||||
A Kill Task spec has been provided at `quickstart/tutorial/deletion-kill.json`. Submit this task to the Overlord with the following command:
|
||||
|
||||
```
|
||||
curl -X 'POST' -H 'Content-Type:application/json' -d @quickstart/tutorial/deletion-kill.json http://localhost:8090/druid/indexer/v1/task
|
||||
```
|
||||
|
||||
After this task completes, you can see that the disabled segments have now been removed from deep storage:
|
||||
|
||||
```
|
||||
$ ls -l1 var/druid/segments/deletion-tutorial/
|
||||
2015-09-12T12:00:00.000Z_2015-09-12T13:00:00.000Z
|
||||
2015-09-12T13:00:00.000Z_2015-09-12T14:00:00.000Z
|
||||
2015-09-12T15:00:00.000Z_2015-09-12T16:00:00.000Z
|
||||
2015-09-12T16:00:00.000Z_2015-09-12T17:00:00.000Z
|
||||
2015-09-12T17:00:00.000Z_2015-09-12T18:00:00.000Z
|
||||
2015-09-12T18:00:00.000Z_2015-09-12T19:00:00.000Z
|
||||
2015-09-12T19:00:00.000Z_2015-09-12T20:00:00.000Z
|
||||
2015-09-12T20:00:00.000Z_2015-09-12T21:00:00.000Z
|
||||
2015-09-12T21:00:00.000Z_2015-09-12T22:00:00.000Z
|
||||
2015-09-12T22:00:00.000Z_2015-09-12T23:00:00.000Z
|
||||
2015-09-12T23:00:00.000Z_2015-09-13T00:00:00.000Z
|
||||
```
|
|
@ -0,0 +1,642 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
|
||||
# Tutorial: Writing an ingestion spec
|
||||
|
||||
This tutorial will guide the reader through the process of defining an ingestion spec, pointing out key considerations and guidelines.
|
||||
|
||||
For this tutorial, we'll assume you've already downloaded Druid as described in
|
||||
the [single-machine quickstart](index.html) and have it running on your local machine.
|
||||
|
||||
It will also be helpful to have finished [Tutorial: Loading a file](/docs/VERSION/tutorials/tutorial-batch.html), [Tutorial: Querying data](/docs/VERSION/tutorials/tutorial-query.html), and [Tutorial: Rollup](/docs/VERSION/tutorials/tutorial-rollup.html).
|
||||
|
||||
## Example data
|
||||
|
||||
Suppose we have the following network flow data:
|
||||
|
||||
* `srcIP`: IP address of sender
|
||||
* `srcPort`: Port of sender
|
||||
* `dstIP`: IP address of receiver
|
||||
* `dstPort`: Port of receiver
|
||||
* `protocol`: IP protocol number
|
||||
* `packets`: number of packets transmitted
|
||||
* `bytes`: number of bytes transmitted
|
||||
* `cost`: the cost of sending the traffic
|
||||
|
||||
```
|
||||
{"ts":"2018-01-01T01:01:35Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2", "srcPort":2000, "dstPort":3000, "protocol": 6, "packets":10, "bytes":1000, "cost": 1.4}
|
||||
{"ts":"2018-01-01T01:01:51Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2", "srcPort":2000, "dstPort":3000, "protocol": 6, "packets":20, "bytes":2000, "cost": 3.1}
|
||||
{"ts":"2018-01-01T01:01:59Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2", "srcPort":2000, "dstPort":3000, "protocol": 6, "packets":30, "bytes":3000, "cost": 0.4}
|
||||
{"ts":"2018-01-01T01:02:14Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2", "srcPort":5000, "dstPort":7000, "protocol": 6, "packets":40, "bytes":4000, "cost": 7.9}
|
||||
{"ts":"2018-01-01T01:02:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2", "srcPort":5000, "dstPort":7000, "protocol": 6, "packets":50, "bytes":5000, "cost": 10.2}
|
||||
{"ts":"2018-01-01T01:03:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2", "srcPort":5000, "dstPort":7000, "protocol": 6, "packets":60, "bytes":6000, "cost": 4.3}
|
||||
{"ts":"2018-01-01T02:33:14Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8", "srcPort":4000, "dstPort":5000, "protocol": 17, "packets":100, "bytes":10000, "cost": 22.4}
|
||||
{"ts":"2018-01-01T02:33:45Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8", "srcPort":4000, "dstPort":5000, "protocol": 17, "packets":200, "bytes":20000, "cost": 34.5}
|
||||
{"ts":"2018-01-01T02:35:45Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8", "srcPort":4000, "dstPort":5000, "protocol": 17, "packets":300, "bytes":30000, "cost": 46.3}
|
||||
```
|
||||
|
||||
Save the JSON contents above into a file called `ingestion-tutorial-data.json` in `quickstart/`.
|
||||
|
||||
Let's walk through the process of defining an ingestion spec that can load this data.
|
||||
|
||||
For this tutorial, we will be using the native batch indexing task. When using other task types, some aspects of the ingestion spec will differ, and this tutorial will point out such areas.
|
||||
|
||||
## Defining the schema
|
||||
|
||||
The core element of a Druid ingestion spec is the `dataSchema`. The `dataSchema` defines how to parse input data into a set of columns that will be stored in Druid.
|
||||
|
||||
Let's start with an empty `dataSchema` and add fields to it as we progress through the tutorial.
|
||||
|
||||
Create a new file called `ingestion-tutorial-index.json` in `quickstart/` with the following contents:
|
||||
|
||||
```json
|
||||
"dataSchema" : {}
|
||||
```
|
||||
|
||||
We will be making successive edits to this ingestion spec as we progress through the tutorial.
|
||||
|
||||
### Datasource name
|
||||
|
||||
The datasource name is specified by the `dataSource` parameter in the `dataSchema`.
|
||||
|
||||
```json
|
||||
"dataSchema" : {
|
||||
"dataSource" : "ingestion-tutorial"
|
||||
}
|
||||
```
|
||||
|
||||
Let's call the tutorial datasource `ingestion-tutorial`.
|
||||
|
||||
### Choose a parser
|
||||
|
||||
A `dataSchema` has a `parser` field, which defines the parser that Druid will use to interpret the input data.
|
||||
|
||||
Since our input data is represented as JSON strings, we'll use a `string` parser with `json` format:
|
||||
|
||||
```
|
||||
"dataSchema" : {
|
||||
"dataSource" : "ingestion-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Time column
|
||||
|
||||
The `parser` needs to know how to extract the main timestamp field from the input data. When using a `json` type `parseSpec`, the timestamp is defined in a `timestampSpec`.
|
||||
|
||||
The timestamp column in our input data is named "ts", containing ISO 8601 timestamps, so let's add a `timestampSpec` with that information to the `parseSpec`:
|
||||
|
||||
```
|
||||
"dataSchema" : {
|
||||
"dataSource" : "ingestion-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"timestampSpec" : {
|
||||
"format" : "iso",
|
||||
"column" : "ts"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Column types
|
||||
|
||||
Now that we've defined the time column, let's look at definitions for other columns.
|
||||
|
||||
Druid supports the following column types: String, Long, Float, Double. We will see how these are used in the following sections.
|
||||
|
||||
Before we move on to how we define our other non-time columns, let's discuss `rollup` first.
|
||||
|
||||
### Rollup
|
||||
|
||||
When ingesting data, we must consider whether we wish to use rollup or not.
|
||||
|
||||
* If rollup is enabled, we will need to separate the input columns into two categories, "dimensions" and "metrics". "Dimensions" are the grouping columns for rollup, while "metrics" are the columns that will be aggregated.
|
||||
|
||||
* If rollup is disabled, then all columns are treated as "dimensions" and no pre-aggregation occurs.
|
||||
|
||||
For this tutorial, let's enable rollup. This is specified with a `granularitySpec` on the `dataSchema`.
|
||||
|
||||
Note that the `granularitySpec` lies outside of the `parser`. We will revisit the `parser` soon when we define our dimensions and metrics.
|
||||
|
||||
```
|
||||
"dataSchema" : {
|
||||
"dataSource" : "ingestion-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"timestampSpec" : {
|
||||
"format" : "iso",
|
||||
"column" : "ts"
|
||||
}
|
||||
}
|
||||
},
|
||||
"granularitySpec" : {
|
||||
"rollup" : true
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
#### Choosing dimensions and metrics
|
||||
|
||||
For this example dataset, the following is a sensible split for "dimensions" and "metrics":
|
||||
|
||||
* Dimensions: srcIP, srcPort, dstIP, dstPort, protocol
|
||||
* Metrics: packets, bytes, cost
|
||||
|
||||
The dimensions here are a group of properties that identify a unidirectional flow of IP traffic, while the metrics represent facts about the IP traffic flow specified by a dimension grouping.
|
||||
|
||||
Let's look at how to define these dimensions and metrics within the ingestion spec.
|
||||
|
||||
#### Dimensions
|
||||
|
||||
Dimensions are specified with a `dimensionsSpec` inside the `parseSpec`.
|
||||
|
||||
```
|
||||
"dataSchema" : {
|
||||
"dataSource" : "ingestion-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"timestampSpec" : {
|
||||
"format" : "iso",
|
||||
"column" : "ts"
|
||||
},
|
||||
"dimensionsSpec" : {
|
||||
"dimensions": [
|
||||
"srcIP",
|
||||
{ "name" : "srcPort", "type" : "long" },
|
||||
{ "name" : "dstIP", "type" : "string" },
|
||||
{ "name" : "dstPort", "type" : "long" },
|
||||
{ "name" : "protocol", "type" : "string" }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"granularitySpec" : {
|
||||
"rollup" : true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Each dimension has a `name` and a `type`, where `type` can be "long", "float", "double", or "string".
|
||||
|
||||
Note that `srcIP` is a "string" dimension; for string dimensions, it is enough to specify just a dimension name, since "string" is the default dimension type.
|
||||
|
||||
Also note that `protocol` is a numeric value in the input data, but we are ingesting it as a "string" column; Druid will coerce the input longs to strings during ingestion.
|
||||
|
||||
##### Strings vs. Numerics
|
||||
|
||||
Should a numeric input be ingested as a numeric dimension or as a string dimension?
|
||||
|
||||
Numeric dimensions have the following pros/cons relative to String dimensions:
|
||||
* Pros: Numeric representation can result in smaller column sizes on disk and lower processing overhead when reading values from the column
|
||||
* Cons: Numeric dimensions do not have indices, so filtering on them will often be slower than filtering on an equivalent String dimension (which has bitmap indices)
|
||||
|
||||
#### Metrics
|
||||
|
||||
Metrics are specified with a `metricsSpec` inside the `dataSchema`:
|
||||
|
||||
```json
|
||||
"dataSchema" : {
|
||||
"dataSource" : "ingestion-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"timestampSpec" : {
|
||||
"format" : "iso",
|
||||
"column" : "ts"
|
||||
},
|
||||
"dimensionsSpec" : {
|
||||
"dimensions": [
|
||||
"srcIP",
|
||||
{ "name" : "srcPort", "type" : "long" },
|
||||
{ "name" : "dstIP", "type" : "string" },
|
||||
{ "name" : "dstPort", "type" : "long" },
|
||||
{ "name" : "protocol", "type" : "string" }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{ "type" : "count", "name" : "count" },
|
||||
{ "type" : "longSum", "name" : "packets", "fieldName" : "packets" },
|
||||
{ "type" : "longSum", "name" : "bytes", "fieldName" : "bytes" },
|
||||
{ "type" : "doubleSum", "name" : "cost", "fieldName" : "cost" }
|
||||
],
|
||||
"granularitySpec" : {
|
||||
"rollup" : true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
When defining a metric, it is necessary to specify what type of aggregation should be performed on that column during rollup.
|
||||
|
||||
Here we have defined long sum aggregations on the two long metric columns, `packets` and `bytes`, and a double sum aggregation for the `cost` column.
|
||||
|
||||
Note that the `metricsSpec` is on a different nesting level than `dimensionsSpec` or `parseSpec`; it belongs on the same nesting level as `parser` within the `dataSchema`.
|
||||
|
||||
Note that we have also defined a `count` aggregator. The count aggregator will track how many rows in the original input data contributed to a "rolled up" row in the final ingested data.
|
||||
|
||||
### No rollup
|
||||
|
||||
If we were not using rollup, all columns would be specified in the `dimensionsSpec`, e.g.:
|
||||
|
||||
```
|
||||
"dimensionsSpec" : {
|
||||
"dimensions": [
|
||||
"srcIP",
|
||||
{ "name" : "srcPort", "type" : "long" },
|
||||
{ "name" : "dstIP", "type" : "string" },
|
||||
{ "name" : "dstPort", "type" : "long" },
|
||||
{ "name" : "protocol", "type" : "string" },
|
||||
{ "name" : "packets", "type" : "long" },
|
||||
{ "name" : "bytes", "type" : "long" },
|
||||
{ "name" : "cost", "type" : "double" }
|
||||
]
|
||||
},
|
||||
```
|
||||
|
||||
|
||||
### Define granularities
|
||||
|
||||
At this point, we are done defining the `parser` and `metricsSpec` within the `dataSchema` and we are almost done writing the ingestion spec.
|
||||
|
||||
There are some additional properties we need to set in the `granularitySpec`:
|
||||
* Type of granularitySpec: `uniform` and `arbitrary` are the two supported types. For this tutorial, we will use a `uniform` granularity spec, where all segments have uniform interval sizes (for example, all segments cover an hour's worth of data).
|
||||
* The segment granularity: what size of time interval should a single segment contain data for? e.g., `DAY`, `WEEK`
|
||||
* The bucketing granularity of the timestamps in the time column (referred to as `queryGranularity`)
|
||||
|
||||
#### Segment granularity
|
||||
|
||||
Segment granularity is configured by the `segmentGranularity` property in the `granularitySpec`. For this tutorial, we'll create hourly segments:
|
||||
|
||||
```
|
||||
"dataSchema" : {
|
||||
"dataSource" : "ingestion-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"timestampSpec" : {
|
||||
"format" : "iso",
|
||||
"column" : "ts"
|
||||
},
|
||||
"dimensionsSpec" : {
|
||||
"dimensions": [
|
||||
"srcIP",
|
||||
{ "name" : "srcPort", "type" : "long" },
|
||||
{ "name" : "dstIP", "type" : "string" },
|
||||
{ "name" : "dstPort", "type" : "long" },
|
||||
{ "name" : "protocol", "type" : "string" }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{ "type" : "count", "name" : "count" },
|
||||
{ "type" : "longSum", "name" : "packets", "fieldName" : "packets" },
|
||||
{ "type" : "longSum", "name" : "bytes", "fieldName" : "bytes" },
|
||||
{ "type" : "doubleSum", "name" : "cost", "fieldName" : "cost" }
|
||||
],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "HOUR",
|
||||
"rollup" : true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Our input data has events from two separate hours, so this task will generate two segments.
|
||||
|
||||
#### Query granularity
|
||||
|
||||
The query granularity is configured by the `queryGranularity` property in the `granularitySpec`. For this tutorial, let's use minute granularity:
|
||||
|
||||
```
|
||||
"dataSchema" : {
|
||||
"dataSource" : "ingestion-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"timestampSpec" : {
|
||||
"format" : "iso",
|
||||
"column" : "ts"
|
||||
},
|
||||
"dimensionsSpec" : {
|
||||
"dimensions": [
|
||||
"srcIP",
|
||||
{ "name" : "srcPort", "type" : "long" },
|
||||
{ "name" : "dstIP", "type" : "string" },
|
||||
{ "name" : "dstPort", "type" : "long" },
|
||||
{ "name" : "protocol", "type" : "string" }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{ "type" : "count", "name" : "count" },
|
||||
{ "type" : "longSum", "name" : "packets", "fieldName" : "packets" },
|
||||
{ "type" : "longSum", "name" : "bytes", "fieldName" : "bytes" },
|
||||
{ "type" : "doubleSum", "name" : "cost", "fieldName" : "cost" }
|
||||
],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "HOUR",
|
||||
"queryGranularity" : "MINUTE",
|
||||
"rollup" : true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
To see the effect of the query granularity, let's look at this row from the raw input data:
|
||||
|
||||
```
|
||||
{"ts":"2018-01-01T01:03:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2", "srcPort":5000, "dstPort":7000, "protocol": 6, "packets":60, "bytes":6000, "cost": 4.3}
|
||||
```
|
||||
|
||||
When this row is ingested with minute queryGranularity, Druid will floor the row's timestamp to minute buckets:
|
||||
|
||||
```
|
||||
{"ts":"2018-01-01T01:03:00Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2", "srcPort":5000, "dstPort":7000, "protocol": 6, "packets":60, "bytes":6000, "cost": 4.3}
|
||||
```
|
||||
|
||||
#### Define an interval (batch only)
|
||||
|
||||
For batch tasks, it is necessary to define a time interval. Input rows with timestamps outside of the time interval will not be ingested.
|
||||
|
||||
The interval is also specified in the `granularitySpec`:
|
||||
|
||||
```
|
||||
"dataSchema" : {
|
||||
"dataSource" : "ingestion-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"timestampSpec" : {
|
||||
"format" : "iso",
|
||||
"column" : "ts"
|
||||
},
|
||||
"dimensionsSpec" : {
|
||||
"dimensions": [
|
||||
"srcIP",
|
||||
{ "name" : "srcPort", "type" : "long" },
|
||||
{ "name" : "dstIP", "type" : "string" },
|
||||
{ "name" : "dstPort", "type" : "long" },
|
||||
{ "name" : "protocol", "type" : "string" }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{ "type" : "count", "name" : "count" },
|
||||
{ "type" : "longSum", "name" : "packets", "fieldName" : "packets" },
|
||||
{ "type" : "longSum", "name" : "bytes", "fieldName" : "bytes" },
|
||||
{ "type" : "doubleSum", "name" : "cost", "fieldName" : "cost" }
|
||||
],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "HOUR",
|
||||
"queryGranularity" : "MINUTE",
|
||||
"intervals" : ["2018-01-01/2018-01-02"],
|
||||
"rollup" : true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Define the task type
|
||||
|
||||
We've now finished defining our `dataSchema`. The remaining steps are to place the `dataSchema` we created into an ingestion task spec, and specify the input source.
|
||||
|
||||
The `dataSchema` is shared across all task types, but each task type has its own specification format. For this tutorial, we will use the native batch ingestion task:
|
||||
|
||||
```
|
||||
{
|
||||
"type" : "index",
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "ingestion-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"timestampSpec" : {
|
||||
"format" : "iso",
|
||||
"column" : "ts"
|
||||
},
|
||||
"dimensionsSpec" : {
|
||||
"dimensions": [
|
||||
"srcIP",
|
||||
{ "name" : "srcPort", "type" : "long" },
|
||||
{ "name" : "dstIP", "type" : "string" },
|
||||
{ "name" : "dstPort", "type" : "long" },
|
||||
{ "name" : "protocol", "type" : "string" }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{ "type" : "count", "name" : "count" },
|
||||
{ "type" : "longSum", "name" : "packets", "fieldName" : "packets" },
|
||||
{ "type" : "longSum", "name" : "bytes", "fieldName" : "bytes" },
|
||||
{ "type" : "doubleSum", "name" : "cost", "fieldName" : "cost" }
|
||||
],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "HOUR",
|
||||
"queryGranularity" : "MINUTE",
|
||||
"intervals" : ["2018-01-01/2018-01-02"],
|
||||
"rollup" : true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Define the input source
|
||||
|
||||
Now let's define our input source, which is specified in an `ioConfig` object. Each task type has its own type of `ioConfig`. The native batch task uses "firehoses" to read input data, so let's configure a "local" firehose to read the example netflow data we saved earlier:
|
||||
|
||||
|
||||
```
|
||||
"ioConfig" : {
|
||||
"type" : "index",
|
||||
"firehose" : {
|
||||
"type" : "local",
|
||||
"baseDir" : "quickstart/",
|
||||
"filter" : "ingestion-tutorial-data.json"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
```
|
||||
{
|
||||
"type" : "index",
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "ingestion-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"timestampSpec" : {
|
||||
"format" : "iso",
|
||||
"column" : "ts"
|
||||
},
|
||||
"dimensionsSpec" : {
|
||||
"dimensions": [
|
||||
"srcIP",
|
||||
{ "name" : "srcPort", "type" : "long" },
|
||||
{ "name" : "dstIP", "type" : "string" },
|
||||
{ "name" : "dstPort", "type" : "long" },
|
||||
{ "name" : "protocol", "type" : "string" }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{ "type" : "count", "name" : "count" },
|
||||
{ "type" : "longSum", "name" : "packets", "fieldName" : "packets" },
|
||||
{ "type" : "longSum", "name" : "bytes", "fieldName" : "bytes" },
|
||||
{ "type" : "doubleSum", "name" : "cost", "fieldName" : "cost" }
|
||||
],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "HOUR",
|
||||
"queryGranularity" : "MINUTE",
|
||||
"intervals" : ["2018-01-01/2018-01-02"],
|
||||
"rollup" : true
|
||||
}
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "index",
|
||||
"firehose" : {
|
||||
"type" : "local",
|
||||
"baseDir" : "quickstart/",
|
||||
"filter" : "ingestion-tutorial-data.json"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Additional tuning
|
||||
|
||||
Each ingestion task has a `tuningConfig` section that allows users to tune various ingestion parameters.
|
||||
|
||||
As an example, let's add a `tuningConfig` that sets a target segment size for the native batch ingestion task:
|
||||
|
||||
```
|
||||
"tuningConfig" : {
|
||||
"type" : "index",
|
||||
"targetPartitionSize" : 5000000
|
||||
}
|
||||
```
|
||||
|
||||
Note that each ingestion task has its own type of `tuningConfig`.
|
||||
|
||||
## Final spec
|
||||
|
||||
We've finished defining the ingestion spec; it should now look like the following:
|
||||
|
||||
```
|
||||
{
|
||||
"type" : "index",
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "ingestion-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"timestampSpec" : {
|
||||
"format" : "iso",
|
||||
"column" : "ts"
|
||||
},
|
||||
"dimensionsSpec" : {
|
||||
"dimensions": [
|
||||
"srcIP",
|
||||
{ "name" : "srcPort", "type" : "long" },
|
||||
{ "name" : "dstIP", "type" : "string" },
|
||||
{ "name" : "dstPort", "type" : "long" },
|
||||
{ "name" : "protocol", "type" : "string" }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{ "type" : "count", "name" : "count" },
|
||||
{ "type" : "longSum", "name" : "packets", "fieldName" : "packets" },
|
||||
{ "type" : "longSum", "name" : "bytes", "fieldName" : "bytes" },
|
||||
{ "type" : "doubleSum", "name" : "cost", "fieldName" : "cost" }
|
||||
],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "HOUR",
|
||||
"queryGranularity" : "MINUTE",
|
||||
"intervals" : ["2018-01-01/2018-01-02"],
|
||||
"rollup" : true
|
||||
}
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "index",
|
||||
"firehose" : {
|
||||
"type" : "local",
|
||||
"baseDir" : "quickstart/",
|
||||
"filter" : "ingestion-tutorial-data.json"
|
||||
}
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "index",
|
||||
"targetPartitionSize" : 5000000
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Submit the task and query the data
|
||||
|
||||
From the druid-#{DRUIDVERSION} package root, run the following command:
|
||||
|
||||
```
|
||||
bin/post-index-task --file quickstart/ingestion-tutorial-index.json
|
||||
```
|
||||
|
||||
After the script completes, we will query the data.
|
||||
|
||||
Let's run `bin/dsql` and issue a `select * from "ingestion-tutorial";` query to see what data was ingested.
|
||||
|
||||
```
|
||||
$ bin/dsql
|
||||
Welcome to dsql, the command-line client for Druid SQL.
|
||||
Type "\h" for help.
|
||||
dsql> select * from "ingestion-tutorial";
|
||||
|
||||
┌──────────────────────────┬───────┬──────┬───────┬─────────┬─────────┬─────────┬──────────┬─────────┬─────────┐
|
||||
│ __time │ bytes │ cost │ count │ dstIP │ dstPort │ packets │ protocol │ srcIP │ srcPort │
|
||||
├──────────────────────────┼───────┼──────┼───────┼─────────┼─────────┼─────────┼──────────┼─────────┼─────────┤
|
||||
│ 2018-01-01T01:01:00.000Z │ 6000 │ 4.9 │ 3 │ 2.2.2.2 │ 3000 │ 60 │ 6 │ 1.1.1.1 │ 2000 │
|
||||
│ 2018-01-01T01:02:00.000Z │ 9000 │ 18.1 │ 2 │ 2.2.2.2 │ 7000 │ 90 │ 6 │ 1.1.1.1 │ 5000 │
|
||||
│ 2018-01-01T01:03:00.000Z │ 6000 │ 4.3 │ 1 │ 2.2.2.2 │ 7000 │ 60 │ 6 │ 1.1.1.1 │ 5000 │
|
||||
│ 2018-01-01T02:33:00.000Z │ 30000 │ 56.9 │ 2 │ 8.8.8.8 │ 5000 │ 300 │ 17 │ 7.7.7.7 │ 4000 │
|
||||
│ 2018-01-01T02:35:00.000Z │ 30000 │ 46.3 │ 1 │ 8.8.8.8 │ 5000 │ 300 │ 17 │ 7.7.7.7 │ 4000 │
|
||||
└──────────────────────────┴───────┴──────┴───────┴─────────┴─────────┴─────────┴──────────┴─────────┴─────────┘
|
||||
Retrieved 5 rows in 0.12s.
|
||||
|
||||
dsql>
|
||||
```
|
|
@ -2,33 +2,26 @@
|
|||
layout: doc_page
|
||||
---
|
||||
|
||||
# Tutorial: Load from Kafka
|
||||
# Tutorial: Load streaming data from Kafka
|
||||
|
||||
## Getting started
|
||||
|
||||
This tutorial shows you how to load data from Kafka into Druid.
|
||||
This tutorial demonstrates how to load data from a Kafka stream, using the Druid Kafka indexing service.
|
||||
|
||||
For this tutorial, we'll assume you've already downloaded Druid and Tranquility as described in
|
||||
the [single-machine quickstart](quickstart.html) and have it running on your local machine. You
|
||||
For this tutorial, we'll assume you've already downloaded Druid as described in
|
||||
the [single-machine quickstart](index.html) and have it running on your local machine. You
|
||||
don't need to have loaded any data yet.
|
||||
|
||||
<div class="note info">
|
||||
This tutorial will show you how to load data from Kafka into Druid, but Druid additionally supports
|
||||
a wide variety of batch and streaming loading methods. See the <a href="../ingestion/batch-ingestion.html">Loading files</a>
|
||||
and <a href="../ingestion/stream-ingestion.html">Loading streams</a> pages for more information about other options,
|
||||
including from Hadoop, HTTP, Storm, Samza, Spark Streaming, and your own JVM apps.
|
||||
</div>
|
||||
|
||||
## Start Kafka
|
||||
## Download and start Kafka
|
||||
|
||||
[Apache Kafka](http://kafka.apache.org/) is a high throughput message bus that works well with
|
||||
Druid. For this tutorial, we will use Kafka 0.9.0.0. To download Kafka, issue the following
|
||||
Druid. For this tutorial, we will use Kafka 0.10.2.0. To download Kafka, issue the following
|
||||
commands in your terminal:
|
||||
|
||||
```bash
|
||||
curl -O http://www.us.apache.org/dist/kafka/0.9.0.0/kafka_2.11-0.9.0.0.tgz
|
||||
tar -xzf kafka_2.11-0.9.0.0.tgz
|
||||
cd kafka_2.11-0.9.0.0
|
||||
curl -O https://archive.apache.org/dist/kafka/0.10.2.0/kafka_2.11-0.10.2.0.tgz
|
||||
tar -xzf kafka_2.11-0.10.2.0.tgz
|
||||
cd kafka_2.11-0.10.2.0
|
||||
```
|
||||
|
||||
Start a Kafka broker by running the following command in a new terminal:
|
||||
|
@ -37,149 +30,56 @@ Start a Kafka broker by running the following command in a new terminal:
|
|||
./bin/kafka-server-start.sh config/server.properties
|
||||
```
|
||||
|
||||
Run this command to create a Kafka topic called *metrics*, to which we'll send data:
|
||||
Run this command to create a Kafka topic called *wikipedia*, to which we'll send data:
|
||||
|
||||
```bash
|
||||
./bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic metrics
|
||||
./bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic wikipedia
|
||||
```
|
||||
|
||||
## Send example data
|
||||
## Enable Druid Kafka ingestion
|
||||
|
||||
We will use Druid's Kafka indexing service to ingest messages from our newly created *wikipedia* topic. To start the
|
||||
service, we will need to submit a supervisor spec to the Druid overlord by running the following from the Druid package root:
|
||||
|
||||
```bash
|
||||
curl -XPOST -H'Content-Type: application/json' -d @quickstart/tutorial/wikipedia-kafka-supervisor.json http://localhost:8090/druid/indexer/v1/supervisor
|
||||
```
|
||||
|
||||
If the supervisor was successfully created, you will get a response containing the ID of the supervisor; in our case we should see `{"id":"wikipedia-kafka"}`.
|
||||
|
||||
For more details about what's going on here, check out the
|
||||
[Druid Kafka indexing service documentation](/docs/VERSION/development/extensions-core/kafka-ingestion.html).
|
||||
|
||||
## Load data
|
||||
|
||||
Let's launch a console producer for our topic and send some data!
|
||||
|
||||
In your Druid directory, generate some metrics by running:
|
||||
In your Druid directory, run the following command:
|
||||
|
||||
```bash
|
||||
bin/generate-example-metrics
|
||||
```
|
||||
cd quickstart
|
||||
gunzip -k wikipedia-2015-09-12-sampled.json.gz
|
||||
```
|
||||
|
||||
In your Kafka directory, run:
|
||||
In your Kafka directory, run the following command, where {PATH_TO_DRUID} is replaced by the path to the Druid directory:
|
||||
|
||||
```bash
|
||||
./bin/kafka-console-producer.sh --broker-list localhost:9092 --topic metrics
|
||||
export KAFKA_OPTS="-Dfile.encoding=UTF-8"
|
||||
./bin/kafka-console-producer.sh --broker-list localhost:9092 --topic wikipedia < {PATH_TO_DRUID}/quickstart/wikipedia-2015-09-12-sampled.json
|
||||
```
|
||||
|
||||
The *kafka-console-producer* command is now awaiting input. Copy the generated example metrics,
|
||||
paste them into the *kafka-console-producer* terminal, and press enter. If you like, you can also
|
||||
paste more messages into the producer, or you can press CTRL-D to exit the console producer.
|
||||
|
||||
You can immediately query this data, or you can skip ahead to the
|
||||
[Loading your own data](#loading-your-own-data) section if you'd like to load your own dataset.
|
||||
The previous command posted sample events to the *wikipedia* Kafka topic which were then ingested into Druid by the Kafka indexing service. You're now ready to run some queries!
|
||||
|
||||
## Querying your data
|
||||
|
||||
After sending data, you can immediately query it using any of the
|
||||
[supported query methods](../querying/querying.html).
|
||||
After data is sent to the Kafka stream, it is immediately available for querying.
|
||||
|
||||
## Loading your own data
|
||||
Please follow the [query tutorial](../tutorials/tutorial-query.html) to run some example queries on the newly loaded data.
|
||||
|
||||
So far, you've loaded data into Druid from Kafka using an ingestion spec that we've included in the
|
||||
distribution. Each ingestion spec is designed to work with a particular dataset. You load your own
|
||||
data types into Imply by writing a custom ingestion spec.
|
||||
## Cleanup
|
||||
|
||||
You can write a custom ingestion spec by starting from the bundled configuration in
|
||||
`conf-quickstart/tranquility/kafka.json` and modifying it for your own needs.
|
||||
|
||||
The most important questions are:
|
||||
|
||||
* What should the dataset be called? This is the "dataSource" field of the "dataSchema".
|
||||
* Which field should be treated as a timestamp? This belongs in the "column" of the "timestampSpec".
|
||||
* Which fields should be treated as dimensions? This belongs in the "dimensions" of the "dimensionsSpec".
|
||||
* Which fields should be treated as measures? This belongs in the "metricsSpec".
|
||||
|
||||
Let's use a small JSON pageviews dataset in the topic *pageviews* as an example, with records like:
|
||||
|
||||
```json
|
||||
{"time": "2000-01-01T00:00:00Z", "url": "/foo/bar", "user": "alice", "latencyMs": 32}
|
||||
```
|
||||
|
||||
First, create the topic:
|
||||
|
||||
```bash
|
||||
./bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic pageviews
|
||||
```
|
||||
|
||||
Next, edit `conf-quickstart/tranquility/kafka.json`:
|
||||
|
||||
* Let's call the dataset "pageviews-kafka".
|
||||
* The timestamp is the "time" field.
|
||||
* Good choices for dimensions are the string fields "url" and "user".
|
||||
* Good choices for measures are a count of pageviews, and the sum of "latencyMs". Collecting that
|
||||
sum when we load the data will allow us to compute an average at query time as well.
|
||||
|
||||
You can edit the existing `conf-quickstart/tranquility/kafka.json` file by altering these
|
||||
sections:
|
||||
|
||||
1. Change the key `"metrics-kafka"` under `"dataSources"` to `"pageviews-kafka"`
|
||||
2. Alter these sections under the new `"pageviews-kafka"` key:
|
||||
```json
|
||||
"dataSource": "pageviews-kafka"
|
||||
```
|
||||
|
||||
```json
|
||||
"timestampSpec": {
|
||||
"format": "auto",
|
||||
"column": "time"
|
||||
}
|
||||
```
|
||||
|
||||
```json
|
||||
"dimensionsSpec": {
|
||||
"dimensions": ["url", "user"]
|
||||
}
|
||||
```
|
||||
|
||||
```json
|
||||
"metricsSpec": [
|
||||
{"name": "views", "type": "count"},
|
||||
{"name": "latencyMs", "type": "doubleSum", "fieldName": "latencyMs"}
|
||||
]
|
||||
```
|
||||
|
||||
```json
|
||||
"properties" : {
|
||||
"task.partitions" : "1",
|
||||
"task.replicants" : "1",
|
||||
"topicPattern" : "pageviews"
|
||||
}
|
||||
```
|
||||
|
||||
Next, start Druid Kafka ingestion:
|
||||
|
||||
```bash
|
||||
bin/tranquility kafka -configFile ../druid-#{DRUIDVERSION}/conf-quickstart/tranquility/kafka.json
|
||||
```
|
||||
|
||||
- If your Tranquility server or Kafka is already running, stop it (CTRL-C) and
|
||||
start it up again.
|
||||
|
||||
Finally, send some data to the Kafka topic. Let's start with these messages:
|
||||
|
||||
```json
|
||||
{"time": "2000-01-01T00:00:00Z", "url": "/foo/bar", "user": "alice", "latencyMs": 32}
|
||||
{"time": "2000-01-01T00:00:00Z", "url": "/", "user": "bob", "latencyMs": 11}
|
||||
{"time": "2000-01-01T00:00:00Z", "url": "/foo/bar", "user": "bob", "latencyMs": 45}
|
||||
```
|
||||
|
||||
Druid streaming ingestion requires relatively current messages (relative to a slack time controlled by the
|
||||
[windowPeriod](../ingestion/stream-ingestion.html#segmentgranularity-and-windowperiod) value), so you should
|
||||
replace `2000-01-01T00:00:00Z` in these messages with the current time in ISO8601 format. You can
|
||||
get this by running:
|
||||
|
||||
```bash
|
||||
python -c 'import datetime; print(datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"))'
|
||||
```
|
||||
|
||||
Update the timestamps in the JSON above, then copy and paste these messages into this console
|
||||
producer and press enter:
|
||||
|
||||
```bash
|
||||
./bin/kafka-console-producer.sh --broker-list localhost:9092 --topic pageviews
|
||||
```
|
||||
|
||||
That's it, your data should now be in Druid. You can immediately query it using any of the
|
||||
[supported query methods](../querying/querying.html).
|
||||
If you wish to go through any of the other ingestion tutorials, you will need to shut down the cluster and reset the cluster state by removing the contents of the `var` directory under the druid package, as the other tutorials will write to the same "wikipedia" datasource.
|
||||
|
||||
## Further reading
|
||||
|
||||
To read more about loading streams, see our [streaming ingestion documentation](../ingestion/stream-ingestion.html).
|
||||
For more information on loading data from Kafka streams, please see the [Druid Kafka indexing service documentation](/docs/VERSION/development/extensions-core/kafka-ingestion.html).
|
||||
|
|
|
@ -0,0 +1,280 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
|
||||
# Tutorial: Querying data
|
||||
|
||||
This tutorial will demonstrate how to query data in Druid, with examples for Druid's native query format and Druid SQL.
|
||||
|
||||
The tutorial assumes that you've already completed one of the 4 ingestion tutorials, as we will be querying the sample Wikipedia edits data.
|
||||
|
||||
* [Tutorial: Loading a file](/docs/VERSION/tutorials/tutorial-batch.html)
|
||||
* [Tutorial: Loading stream data from Kafka](/docs/VERSION/tutorials/tutorial-kafka.html)
|
||||
* [Tutorial: Loading a file using Hadoop](/docs/VERSION/tutorials/tutorial-batch-hadoop.html)
|
||||
* [Tutorial: Loading stream data using Tranquility](/docs/VERSION/tutorials/tutorial-tranquility.html)
|
||||
|
||||
## Native JSON queries
|
||||
|
||||
Druid's native query format is expressed in JSON. We have included a sample native TopN query under `quickstart/tutorial/wikipedia-top-pages.json`:
|
||||
|
||||
```json
|
||||
{
|
||||
"queryType" : "topN",
|
||||
"dataSource" : "wikipedia",
|
||||
"intervals" : ["2015-09-12/2015-09-13"],
|
||||
"granularity" : "all",
|
||||
"dimension" : "page",
|
||||
"metric" : "count",
|
||||
"threshold" : 10,
|
||||
"aggregations" : [
|
||||
{
|
||||
"type" : "count",
|
||||
"name" : "count"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
This query retrieves the 10 Wikipedia pages with the most page edits on 2015-09-12.
|
||||
|
||||
Let's submit this query to the Druid broker:
|
||||
|
||||
```bash
|
||||
curl -X 'POST' -H 'Content-Type:application/json' -d @quickstart/tutorial/wikipedia-top-pages.json http://localhost:8082/druid/v2?pretty
|
||||
```
|
||||
|
||||
You should see the following query results:
|
||||
|
||||
```json
|
||||
[ {
|
||||
"timestamp" : "2015-09-12T00:46:58.771Z",
|
||||
"result" : [ {
|
||||
"count" : 33,
|
||||
"page" : "Wikipedia:Vandalismusmeldung"
|
||||
}, {
|
||||
"count" : 28,
|
||||
"page" : "User:Cyde/List of candidates for speedy deletion/Subpage"
|
||||
}, {
|
||||
"count" : 27,
|
||||
"page" : "Jeremy Corbyn"
|
||||
}, {
|
||||
"count" : 21,
|
||||
"page" : "Wikipedia:Administrators' noticeboard/Incidents"
|
||||
}, {
|
||||
"count" : 20,
|
||||
"page" : "Flavia Pennetta"
|
||||
}, {
|
||||
"count" : 18,
|
||||
"page" : "Total Drama Presents: The Ridonculous Race"
|
||||
}, {
|
||||
"count" : 18,
|
||||
"page" : "User talk:Dudeperson176123"
|
||||
}, {
|
||||
"count" : 18,
|
||||
"page" : "Wikipédia:Le Bistro/12 septembre 2015"
|
||||
}, {
|
||||
"count" : 17,
|
||||
"page" : "Wikipedia:In the news/Candidates"
|
||||
}, {
|
||||
"count" : 17,
|
||||
"page" : "Wikipedia:Requests for page protection"
|
||||
} ]
|
||||
} ]
|
||||
```
|
||||
|
||||
## Druid SQL queries
|
||||
|
||||
Druid also supports a dialect of SQL for querying. Let's run a SQL query that is equivalent to the native JSON query shown above:
|
||||
|
||||
```
|
||||
SELECT page, COUNT(*) AS Edits FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY page ORDER BY Edits DESC LIMIT 10;
|
||||
```
|
||||
|
||||
The SQL queries are submitted as JSON over HTTP.
|
||||
|
||||
### TopN query example
|
||||
|
||||
The tutorial package includes an example file that contains the SQL query shown above at `quickstart/tutorial/wikipedia-top-pages-sql.json`. Let's submit that query to the Druid broker:
|
||||
|
||||
```bash
|
||||
curl -X 'POST' -H 'Content-Type:application/json' -d @quickstart/tutorial/wikipedia-top-pages-sql.json http://localhost:8082/druid/v2/sql
|
||||
```
|
||||
|
||||
The following results should be returned:
|
||||
|
||||
```
|
||||
[
|
||||
{
|
||||
"page": "Wikipedia:Vandalismusmeldung",
|
||||
"Edits": 33
|
||||
},
|
||||
{
|
||||
"page": "User:Cyde/List of candidates for speedy deletion/Subpage",
|
||||
"Edits": 28
|
||||
},
|
||||
{
|
||||
"page": "Jeremy Corbyn",
|
||||
"Edits": 27
|
||||
},
|
||||
{
|
||||
"page": "Wikipedia:Administrators' noticeboard/Incidents",
|
||||
"Edits": 21
|
||||
},
|
||||
{
|
||||
"page": "Flavia Pennetta",
|
||||
"Edits": 20
|
||||
},
|
||||
{
|
||||
"page": "Total Drama Presents: The Ridonculous Race",
|
||||
"Edits": 18
|
||||
},
|
||||
{
|
||||
"page": "User talk:Dudeperson176123",
|
||||
"Edits": 18
|
||||
},
|
||||
{
|
||||
"page": "Wikipédia:Le Bistro/12 septembre 2015",
|
||||
"Edits": 18
|
||||
},
|
||||
{
|
||||
"page": "Wikipedia:In the news/Candidates",
|
||||
"Edits": 17
|
||||
},
|
||||
{
|
||||
"page": "Wikipedia:Requests for page protection",
|
||||
"Edits": 17
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
### dsql client
|
||||
|
||||
For convenience, the Druid package includes a SQL command-line client, located at `bin/dsql` from the Druid package root.
|
||||
|
||||
Let's now run `bin/dsql`; you should see the following prompt:
|
||||
|
||||
```
|
||||
Welcome to dsql, the command-line client for Druid SQL.
|
||||
Type "\h" for help.
|
||||
dsql>
|
||||
```
|
||||
|
||||
To submit the query, paste it into the `dsql` prompt and press enter:
|
||||
|
||||
```
|
||||
dsql> SELECT page, COUNT(*) AS Edits FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY page ORDER BY Edits DESC LIMIT 10;
|
||||
┌──────────────────────────────────────────────────────────┬───────┐
|
||||
│ page │ Edits │
|
||||
├──────────────────────────────────────────────────────────┼───────┤
|
||||
│ Wikipedia:Vandalismusmeldung │ 33 │
|
||||
│ User:Cyde/List of candidates for speedy deletion/Subpage │ 28 │
|
||||
│ Jeremy Corbyn │ 27 │
|
||||
│ Wikipedia:Administrators' noticeboard/Incidents │ 21 │
|
||||
│ Flavia Pennetta │ 20 │
|
||||
│ Total Drama Presents: The Ridonculous Race │ 18 │
|
||||
│ User talk:Dudeperson176123 │ 18 │
|
||||
│ Wikipédia:Le Bistro/12 septembre 2015 │ 18 │
|
||||
│ Wikipedia:In the news/Candidates │ 17 │
|
||||
│ Wikipedia:Requests for page protection │ 17 │
|
||||
└──────────────────────────────────────────────────────────┴───────┘
|
||||
Retrieved 10 rows in 0.06s.
|
||||
```
|
||||
|
||||
### Additional Druid SQL queries
|
||||
|
||||
#### Timeseries
|
||||
|
||||
`SELECT FLOOR(__time to HOUR) AS HourTime, SUM(deleted) AS LinesDeleted FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY FLOOR(__time to HOUR);`
|
||||
|
||||
```
|
||||
dsql> SELECT FLOOR(__time to HOUR) AS HourTime, SUM(deleted) AS LinesDeleted FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY FLOOR(__time to HOUR);
|
||||
┌──────────────────────────┬──────────────┐
|
||||
│ HourTime │ LinesDeleted │
|
||||
├──────────────────────────┼──────────────┤
|
||||
│ 2015-09-12T00:00:00.000Z │ 1761 │
|
||||
│ 2015-09-12T01:00:00.000Z │ 16208 │
|
||||
│ 2015-09-12T02:00:00.000Z │ 14543 │
|
||||
│ 2015-09-12T03:00:00.000Z │ 13101 │
|
||||
│ 2015-09-12T04:00:00.000Z │ 12040 │
|
||||
│ 2015-09-12T05:00:00.000Z │ 6399 │
|
||||
│ 2015-09-12T06:00:00.000Z │ 9036 │
|
||||
│ 2015-09-12T07:00:00.000Z │ 11409 │
|
||||
│ 2015-09-12T08:00:00.000Z │ 11616 │
|
||||
│ 2015-09-12T09:00:00.000Z │ 17509 │
|
||||
│ 2015-09-12T10:00:00.000Z │ 19406 │
|
||||
│ 2015-09-12T11:00:00.000Z │ 16284 │
|
||||
│ 2015-09-12T12:00:00.000Z │ 18672 │
|
||||
│ 2015-09-12T13:00:00.000Z │ 30520 │
|
||||
│ 2015-09-12T14:00:00.000Z │ 18025 │
|
||||
│ 2015-09-12T15:00:00.000Z │ 26399 │
|
||||
│ 2015-09-12T16:00:00.000Z │ 24759 │
|
||||
│ 2015-09-12T17:00:00.000Z │ 19634 │
|
||||
│ 2015-09-12T18:00:00.000Z │ 17345 │
|
||||
│ 2015-09-12T19:00:00.000Z │ 19305 │
|
||||
│ 2015-09-12T20:00:00.000Z │ 22265 │
|
||||
│ 2015-09-12T21:00:00.000Z │ 16394 │
|
||||
│ 2015-09-12T22:00:00.000Z │ 16379 │
|
||||
│ 2015-09-12T23:00:00.000Z │ 15289 │
|
||||
└──────────────────────────┴──────────────┘
|
||||
Retrieved 24 rows in 0.08s.
|
||||
```
|
||||
|
||||
#### GroupBy
|
||||
|
||||
`SELECT channel, SUM(added) FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY channel ORDER BY SUM(added) DESC LIMIT 5;`
|
||||
|
||||
```
|
||||
dsql> SELECT channel, SUM(added) FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY channel ORDER BY SUM(added) DESC LIMIT 5;
|
||||
┌───────────────┬─────────┐
|
||||
│ channel │ EXPR$1 │
|
||||
├───────────────┼─────────┤
|
||||
│ #en.wikipedia │ 3045299 │
|
||||
│ #it.wikipedia │ 711011 │
|
||||
│ #fr.wikipedia │ 642555 │
|
||||
│ #ru.wikipedia │ 640698 │
|
||||
│ #es.wikipedia │ 634670 │
|
||||
└───────────────┴─────────┘
|
||||
Retrieved 5 rows in 0.05s.
|
||||
```
|
||||
|
||||
#### Scan
|
||||
|
||||
`SELECT user, page FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 02:00:00' AND TIMESTAMP '2015-09-12 03:00:00' LIMIT 5;`
|
||||
|
||||
```
|
||||
dsql> SELECT user, page FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 02:00:00' AND TIMESTAMP '2015-09-12 03:00:00' LIMIT 5;
|
||||
┌────────────────────────┬────────────────────────────────────────────────────────┐
|
||||
│ user │ page │
|
||||
├────────────────────────┼────────────────────────────────────────────────────────┤
|
||||
│ Thiago89 │ Campeonato Mundial de Voleibol Femenino Sub-20 de 2015 │
|
||||
│ 91.34.200.249 │ Friede von Schönbrunn │
|
||||
│ TuHan-Bot │ Trĩ vàng │
|
||||
│ Lowercase sigmabot III │ User talk:ErrantX │
|
||||
│ BattyBot │ Hans W. Jung │
|
||||
└────────────────────────┴────────────────────────────────────────────────────────┘
|
||||
Retrieved 5 rows in 0.04s.
|
||||
```
|
||||
|
||||
#### EXPLAIN PLAN FOR
|
||||
|
||||
By prepending `EXPLAIN PLAN FOR ` to a Druid SQL query, it is possible to see what native Druid queries a SQL query will plan into.
|
||||
|
||||
Using the TopN query above as an example:
|
||||
|
||||
`EXPLAIN PLAN FOR SELECT page, COUNT(*) AS Edits FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY page ORDER BY Edits DESC LIMIT 10;`
|
||||
|
||||
```
|
||||
dsql> EXPLAIN PLAN FOR SELECT page, COUNT(*) AS Edits FROM wikipedia WHERE "__time" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY page ORDER BY Edits DESC LIMIT 10;
|
||||
┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ PLAN │
|
||||
├─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ DruidQueryRel(query=[{"queryType":"topN","dataSource":{"type":"table","name":"wikipedia"},"virtualColumns":[],"dimension":{"type":"default","dimension":"page","outputName":"d0","outputType":"STRING"},"metric":{"type":"numeric","metric":"a0"},"threshold":10,"intervals":{"type":"intervals","intervals":["2015-09-12T00:00:00.000Z/2015-09-13T00:00:00.001Z"]},"filter":null,"granularity":{"type":"all"},"aggregations":[{"type":"count","name":"a0"}],"postAggregations":[],"context":{},"descending":false}], signature=[{d0:STRING, a0:LONG}]) │
|
||||
└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
|
||||
Retrieved 1 row in 0.03s.
|
||||
```
|
||||
|
||||
## Further reading
|
||||
|
||||
The [Queries documentation](/docs/VERSION/querying/querying.html) has more information on Druid's native JSON queries.
|
||||
|
||||
The [Druid SQL documentation](/docs/VERSION/querying/sql.html) has more information on using Druid SQL queries.
|
|
@ -0,0 +1,92 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
|
||||
# Tutorial: Configuring data retention
|
||||
|
||||
This tutorial demonstrates how to configure retention rules on a datasource to set the time intervals of data that will be retained or dropped.
|
||||
|
||||
For this tutorial, we'll assume you've already downloaded Druid as described in
|
||||
the [single-machine quickstart](index.html) and have it running on your local machine.
|
||||
|
||||
It will also be helpful to have finished [Tutorial: Loading a file](/docs/VERSION/tutorials/tutorial-batch.html) and [Tutorial: Querying data](/docs/VERSION/tutorials/tutorial-query.html).
|
||||
|
||||
## Load the example data
|
||||
|
||||
For this tutorial, we'll be using the Wikipedia edits sample data, with an ingestion task spec that will create a separate segment for each hour in the input data.
|
||||
|
||||
The ingestion spec can be found at `quickstart/tutorial/retention-index.json`. Let's submit that spec, which will create a datasource called `retention-tutorial`:
|
||||
|
||||
```
|
||||
bin/post-index-task --file quickstart/tutorial/retention-index.json
|
||||
```
|
||||
|
||||
After the ingestion completes, go to http://localhost:8081 in a browser to access the Coordinator console.
|
||||
|
||||
In the Coordinator console, go to the `datasources` tab at the top of the page.
|
||||
|
||||
This tab shows the available datasources and a summary of the retention rules for each datasource:
|
||||
|
||||
![Summary](../tutorials/img/tutorial-retention-00.png "Summary")
|
||||
|
||||
Currently there are no rules set for the `retention-tutorial` datasource. Note that there are default rules, currently set to `load Forever 2 in _default_tier`.
|
||||
|
||||
This means that all data will be loaded regardless of timestamp, and each segment will be replicated to two nodes in the default tier.
|
||||
|
||||
In this tutorial, we will ignore the tiering and redundancy concepts for now.
|
||||
|
||||
Let's click the `retention-tutorial` datasource on the left.
|
||||
|
||||
The next page (http://localhost:8081/#/datasources/retention-tutorial) provides information about what segments a datasource contains. On the left, the page shows that there are 24 segments, each one containing data for a specific hour of 2015-09-12:
|
||||
|
||||
![Original segments](../tutorials/img/tutorial-retention-01.png "Original segments")
|
||||
|
||||
## Set retention rules
|
||||
|
||||
Suppose we want to drop data for the first 12 hours of 2015-09-12 and keep data for the later 12 hours of 2015-09-12.
|
||||
|
||||
Click the `edit rules` button with a pencil icon at the upper left corner of the page.
|
||||
|
||||
A rule configuration window will appear. Enter `tutorial` for both the user and changelog comment fields.
|
||||
|
||||
Now click the `+ Add a rule` button twice.
|
||||
|
||||
In the `rule #1` box at the top, click `Load`, `Interval`, enter `2015-09-12T12:00:00.000Z/2015-09-13T00:00:00.000Z` in the interval box, and click `+ _default_tier replicant`.
|
||||
|
||||
In the `rule #2` box at the bottom, click `Drop` and `Forever`.
|
||||
|
||||
The rules should look like this:
|
||||
|
||||
![Set rules](../tutorials/img/tutorial-retention-02.png "Set rules")
|
||||
|
||||
Now click `Save all rules`, wait for a few seconds, and refresh the page.
|
||||
|
||||
The segments for the first 12 hours of 2015-09-12 are now gone:
|
||||
|
||||
![New segments](../tutorials/img/tutorial-retention-03.png "New segments")
|
||||
|
||||
The resulting retention rule chain is the following:
|
||||
|
||||
```
|
||||
loadByInterval 2015-09-12T12/2015-09-13 (12 hours)
|
||||
|
||||
dropForever
|
||||
|
||||
loadForever (default rule)
|
||||
```
|
||||
|
||||
The rule chain is evaluated from top to bottom, with the default rule chain always added at the bottom.
|
||||
|
||||
The tutorial rule chain we just created loads data if it is within the specified 12 hour interval.
|
||||
|
||||
If data is not within the 12 hour interval, the rule chain evaluates `dropForever` next, which will drop any data.
|
||||
|
||||
The `dropForever` rule terminates the rule chain, effectively overriding the default `loadForever` rule, which will never be reached in this rule chain.
|
||||
|
||||
Note that in this tutorial we defined a load rule on a specific interval.
|
||||
|
||||
If instead you want to retain data based on how old it is (e.g., retain data that ranges from 3 months in the past to the present time), you would define a Period load rule instead.
|
||||
|
||||
## Further reading
|
||||
|
||||
* [Load rules](/docs/VERSION/operations/rule-configuration.html)
|
|
@ -0,0 +1,180 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
|
||||
# Tutorial: Roll-up
|
||||
|
||||
Druid can summarize raw data at ingestion time using a process we refer to as "roll-up". Roll-up is a first-level aggregation operation over a selected set of columns that reduces the size of stored segments.
|
||||
|
||||
This tutorial will demonstrate the effects of roll-up on an example dataset.
|
||||
|
||||
For this tutorial, we'll assume you've already downloaded Druid as described in
|
||||
the [single-machine quickstart](index.html) and have it running on your local machine.
|
||||
|
||||
It will also be helpful to have finished [Tutorial: Loading a file](/docs/VERSION/tutorials/tutorial-batch.html) and [Tutorial: Querying data](/docs/VERSION/tutorials/tutorial-query.html).
|
||||
|
||||
## Example data
|
||||
|
||||
For this tutorial, we'll use a small sample of network flow event data, representing packet and byte counts for traffic from a source to a destination IP address that occurred within a particular second.
|
||||
|
||||
```
|
||||
{"timestamp":"2018-01-01T01:01:35Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":20,"bytes":9024}
|
||||
{"timestamp":"2018-01-01T01:01:51Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":255,"bytes":21133}
|
||||
{"timestamp":"2018-01-01T01:01:59Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":11,"bytes":5780}
|
||||
{"timestamp":"2018-01-01T01:02:14Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":38,"bytes":6289}
|
||||
{"timestamp":"2018-01-01T01:02:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":377,"bytes":359971}
|
||||
{"timestamp":"2018-01-01T01:03:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":49,"bytes":10204}
|
||||
{"timestamp":"2018-01-02T21:33:14Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8","packets":38,"bytes":6289}
|
||||
{"timestamp":"2018-01-02T21:33:45Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8","packets":123,"bytes":93999}
|
||||
{"timestamp":"2018-01-02T21:35:45Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8","packets":12,"bytes":2818}
|
||||
```
|
||||
|
||||
A file containing this sample input data is located at `quickstart/tutorial/rollup-data.json`.
|
||||
|
||||
We'll ingest this data using the following ingestion task spec, located at `quickstart/tutorial/rollup-index.json`.
|
||||
|
||||
```
|
||||
{
|
||||
"type" : "index",
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "rollup-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"dimensionsSpec" : {
|
||||
"dimensions" : [
|
||||
"srcIP",
|
||||
"dstIP"
|
||||
]
|
||||
},
|
||||
"timestampSpec": {
|
||||
"column": "timestamp",
|
||||
"format": "iso"
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{ "type" : "count", "name" : "count" },
|
||||
{ "type" : "longSum", "name" : "packets", "fieldName" : "packets" },
|
||||
{ "type" : "longSum", "name" : "bytes", "fieldName" : "bytes" }
|
||||
],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "week",
|
||||
"queryGranularity" : "minute",
|
||||
"intervals" : ["2018-01-01/2018-01-03"],
|
||||
"rollup" : true
|
||||
}
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "index",
|
||||
"firehose" : {
|
||||
"type" : "local",
|
||||
"baseDir" : "quickstart/tutorial",
|
||||
"filter" : "rollup-data.json"
|
||||
},
|
||||
"appendToExisting" : false
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "index",
|
||||
"targetPartitionSize" : 5000000,
|
||||
"maxRowsInMemory" : 25000,
|
||||
"forceExtendableShardSpecs" : true
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Roll-up has been enabled by setting `"rollup" : true` in the `granularitySpec`.
|
||||
|
||||
Note that we have `srcIP` and `dstIP` defined as dimensions, a longSum metric is defined for the `packets` and `bytes` columns, and the `queryGranularity` has been defined as `minute`.
|
||||
|
||||
We will see how these definitions are used after we load this data.
|
||||
|
||||
## Load the example data
|
||||
|
||||
From the druid-#{DRUIDVERSION} package root, run the following command:
|
||||
|
||||
```
|
||||
bin/post-index-task --file quickstart/tutorial/rollup-index.json
|
||||
```
|
||||
|
||||
After the script completes, we will query the data.
|
||||
|
||||
## Query the example data
|
||||
|
||||
Let's run `bin/dsql` and issue a `select * from "rollup-tutorial";` query to see what data was ingested.
|
||||
|
||||
```
|
||||
$ bin/dsql
|
||||
Welcome to dsql, the command-line client for Druid SQL.
|
||||
Type "\h" for help.
|
||||
dsql> select * from "rollup-tutorial";
|
||||
┌──────────────────────────┬────────┬───────┬─────────┬─────────┬─────────┐
|
||||
│ __time │ bytes │ count │ dstIP │ packets │ srcIP │
|
||||
├──────────────────────────┼────────┼───────┼─────────┼─────────┼─────────┤
|
||||
│ 2018-01-01T01:01:00.000Z │ 35937 │ 3 │ 2.2.2.2 │ 286 │ 1.1.1.1 │
|
||||
│ 2018-01-01T01:02:00.000Z │ 366260 │ 2 │ 2.2.2.2 │ 415 │ 1.1.1.1 │
|
||||
│ 2018-01-01T01:03:00.000Z │ 10204 │ 1 │ 2.2.2.2 │ 49 │ 1.1.1.1 │
|
||||
│ 2018-01-02T21:33:00.000Z │ 100288 │ 2 │ 8.8.8.8 │ 161 │ 7.7.7.7 │
|
||||
│ 2018-01-02T21:35:00.000Z │ 2818 │ 1 │ 8.8.8.8 │ 12 │ 7.7.7.7 │
|
||||
└──────────────────────────┴────────┴───────┴─────────┴─────────┴─────────┘
|
||||
Retrieved 5 rows in 1.18s.
|
||||
|
||||
dsql>
|
||||
```
|
||||
|
||||
Let's look at the three events in the original input data that occurred during `2018-01-01T01:01`:
|
||||
|
||||
```
|
||||
{"timestamp":"2018-01-01T01:01:35Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":20,"bytes":9024}
|
||||
{"timestamp":"2018-01-01T01:01:51Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":255,"bytes":21133}
|
||||
{"timestamp":"2018-01-01T01:01:59Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":11,"bytes":5780}
|
||||
```
|
||||
|
||||
These three rows have been "rolled up" into the following row:
|
||||
|
||||
```
|
||||
┌──────────────────────────┬────────┬───────┬─────────┬─────────┬─────────┐
|
||||
│ __time │ bytes │ count │ dstIP │ packets │ srcIP │
|
||||
├──────────────────────────┼────────┼───────┼─────────┼─────────┼─────────┤
|
||||
│ 2018-01-01T01:01:00.000Z │ 35937 │ 3 │ 2.2.2.2 │ 286 │ 1.1.1.1 │
|
||||
└──────────────────────────┴────────┴───────┴─────────┴─────────┴─────────┘
|
||||
```
|
||||
|
||||
The input rows have been grouped by the timestamp and dimension columns `{timestamp, srcIP, dstIP}` with sum aggregations on the metric columns `packets` and `bytes`.
|
||||
|
||||
Before the grouping occurs, the timestamps of the original input data are bucketed/floored by minute, due to the `"queryGranularity":"minute"` setting in the ingestion spec.
|
||||
|
||||
Likewise, these two events that occurred during `2018-01-01T01:02` have been rolled up:
|
||||
|
||||
```
|
||||
{"timestamp":"2018-01-01T01:02:14Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":38,"bytes":6289}
|
||||
{"timestamp":"2018-01-01T01:02:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":377,"bytes":359971}
|
||||
```
|
||||
|
||||
```
|
||||
┌──────────────────────────┬────────┬───────┬─────────┬─────────┬─────────┐
|
||||
│ __time │ bytes │ count │ dstIP │ packets │ srcIP │
|
||||
├──────────────────────────┼────────┼───────┼─────────┼─────────┼─────────┤
|
||||
│ 2018-01-01T01:02:00.000Z │ 366260 │ 2 │ 2.2.2.2 │ 415 │ 1.1.1.1 │
|
||||
└──────────────────────────┴────────┴───────┴─────────┴─────────┴─────────┘
|
||||
```
|
||||
|
||||
For the last event recording traffic between 1.1.1.1 and 2.2.2.2, no roll-up took place, because this was the only event that occurred during `2018-01-01T01:03`:
|
||||
|
||||
```
|
||||
{"timestamp":"2018-01-01T01:03:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":49,"bytes":10204}
|
||||
```
|
||||
|
||||
```
|
||||
┌──────────────────────────┬────────┬───────┬─────────┬─────────┬─────────┐
|
||||
│ __time │ bytes │ count │ dstIP │ packets │ srcIP │
|
||||
├──────────────────────────┼────────┼───────┼─────────┼─────────┼─────────┤
|
||||
│ 2018-01-01T01:03:00.000Z │ 10204 │ 1 │ 2.2.2.2 │ 49 │ 1.1.1.1 │
|
||||
└──────────────────────────┴────────┴───────┴─────────┴─────────┴─────────┘
|
||||
```
|
||||
|
||||
Note that the `count` metric shows how many rows in the original input data contributed to the final "rolled up" row.
|
|
@ -1,134 +0,0 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
|
||||
# Tutorial: Load your own streaming data
|
||||
|
||||
## Getting started
|
||||
|
||||
This tutorial shows you how to load your own streams into Druid.
|
||||
|
||||
For this tutorial, we'll assume you've already downloaded Druid and Tranquility as described in
|
||||
the [single-machine quickstart](quickstart.html) and have it running on your local machine. You
|
||||
don't need to have loaded any data yet.
|
||||
|
||||
Once that's complete, you can load your own dataset by writing a custom ingestion spec.
|
||||
|
||||
## Writing an ingestion spec
|
||||
|
||||
When loading streams into Druid, we recommend using the [stream push](../ingestion/stream-push.html)
|
||||
process. In this tutorial we'll be using [Tranquility Server](../ingestion/stream-ingestion.html#server) to push
|
||||
data into Druid over HTTP.
|
||||
|
||||
<div class="note info">
|
||||
This tutorial will show you how to push streams to Druid using HTTP, but Druid additionally supports
|
||||
a wide variety of batch and streaming loading methods. See the <a href="../ingestion/batch-ingestion.html">Loading files</a>
|
||||
and <a href="../ingestion/stream-ingestion.html">Loading streams</a> pages for more information about other options,
|
||||
including from Hadoop, Kafka, Storm, Samza, Spark Streaming, and your own JVM apps.
|
||||
</div>
|
||||
|
||||
You can prepare for loading a new dataset over HTTP by writing a custom Tranquility Server
|
||||
configuration. The bundled configuration is in `conf-quickstart/tranquility/server.json`, which
|
||||
you can modify for your own needs.
|
||||
|
||||
The most important questions are:
|
||||
|
||||
* What should the dataset be called? This is the "dataSource" field of the "dataSchema".
|
||||
* Which field should be treated as a timestamp? This belongs in the "column" field of the "timestampSpec".
|
||||
* Which fields should be treated as dimensions? This belongs in the "dimensions" field of the "dimensionsSpec".
|
||||
* Which fields should be treated as measures? This belongs in the "metricsSpec" field.
|
||||
|
||||
Let's use a small JSON pageviews dataset as an example, with records like:
|
||||
|
||||
```json
|
||||
{"time": "2000-01-01T00:00:00Z", "url": "/foo/bar", "user": "alice", "latencyMs": 32}
|
||||
```
|
||||
|
||||
So the answers to the questions above are:
|
||||
|
||||
* Let's call the dataset "pageviews".
|
||||
* The timestamp is the "time" field.
|
||||
* Good choices for dimensions are the string fields "url" and "user".
|
||||
* Good choices for measures are a count of pageviews, and the sum of "latencyMs". Collecting that
|
||||
sum when we load the data will allow us to compute an average at query time as well.
|
||||
|
||||
Now, edit the existing `conf-quickstart/tranquility/server.json` file by altering these
|
||||
sections:
|
||||
|
||||
1. Change the key `"metrics"` under `"dataSources"` to `"pageviews"`
|
||||
2. Alter these sections under the new `"pageviews"` key:
|
||||
```json
|
||||
"dataSource": "pageviews"
|
||||
```
|
||||
|
||||
```json
|
||||
"timestampSpec": {
|
||||
"format": "auto",
|
||||
"column": "time"
|
||||
}
|
||||
```
|
||||
|
||||
```json
|
||||
"dimensionsSpec": {
|
||||
"dimensions": ["url", "user"]
|
||||
}
|
||||
```
|
||||
|
||||
```json
|
||||
"metricsSpec": [
|
||||
{"name": "views", "type": "count"},
|
||||
{"name": "latencyMs", "type": "doubleSum", "fieldName": "latencyMs"}
|
||||
]
|
||||
```
|
||||
|
||||
## Restarting the server
|
||||
|
||||
Restart the server to pick up the new configuration file by stopping Tranquility (CTRL-C) and starting it up again.
|
||||
|
||||
## Sending data
|
||||
|
||||
Let's send some data! We'll start with these three records:
|
||||
|
||||
```json
|
||||
{"time": "2000-01-01T00:00:00Z", "url": "/foo/bar", "user": "alice", "latencyMs": 32}
|
||||
{"time": "2000-01-01T00:00:00Z", "url": "/", "user": "bob", "latencyMs": 11}
|
||||
{"time": "2000-01-01T00:00:00Z", "url": "/foo/bar", "user": "bob", "latencyMs": 45}
|
||||
```
|
||||
|
||||
Druid streaming ingestion requires relatively current messages (relative to a slack time controlled by the
|
||||
[windowPeriod](../ingestion/stream-push.html#segmentgranularity-and-windowperiod) value), so you should
|
||||
replace `2000-01-01T00:00:00Z` in these messages with the current time in ISO8601 format. You can
|
||||
get this by running:
|
||||
|
||||
```bash
|
||||
python -c 'import datetime; print(datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"))'
|
||||
```
|
||||
|
||||
Update the timestamps in the JSON above, and save it to a file named `pageviews.json`. Then send
|
||||
it to Druid by running:
|
||||
|
||||
```bash
|
||||
curl -XPOST -H'Content-Type: application/json' --data-binary @pageviews.json http://localhost:8200/v1/post/pageviews
|
||||
```
|
||||
|
||||
This will print something like:
|
||||
|
||||
```
|
||||
{"result":{"received":3,"sent":3}}
|
||||
```
|
||||
|
||||
This indicates that the HTTP server received 3 events from you, and sent 3 to Druid. Note that
|
||||
this may take a few seconds to finish the first time you run it, as Druid resources must be
|
||||
allocated to the ingestion task. Subsequent POSTs should complete quickly.
|
||||
|
||||
If you see `"sent":0` this likely means that your timestamps are not recent enough. Try adjusting
|
||||
your timestamps and re-sending your data.
|
||||
|
||||
## Querying your data
|
||||
|
||||
After sending data, you can immediately query it using any of the
|
||||
[supported query methods](../querying/querying.html).
|
||||
|
||||
## Further reading
|
||||
|
||||
To read more about loading streams, see our [streaming ingestion documentation](../ingestion/stream-ingestion.html).
|
|
@ -0,0 +1,84 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
|
||||
# Tutorial: Load streaming data with HTTP push
|
||||
|
||||
## Getting started
|
||||
|
||||
This tutorial shows you how to load streaming data into Druid using HTTP push via Tranquility Server.
|
||||
|
||||
[Tranquility Server](https://github.com/druid-io/tranquility/blob/master/docs/server.md) allows a stream of data to be pushed into Druid using HTTP POSTs.
|
||||
|
||||
For this tutorial, we'll assume you've already downloaded Druid as described in
|
||||
the [single-machine quickstart](quickstart.html) and have it running on your local machine. You
|
||||
don't need to have loaded any data yet.
|
||||
|
||||
## Download Tranquility
|
||||
|
||||
In the Druid package root, run the following commands:
|
||||
|
||||
```
|
||||
curl http://static.druid.io/tranquility/releases/tranquility-distribution-0.8.2.tgz -o tranquility-distribution-0.8.2.tgz
|
||||
tar -xzf tranquility-distribution-0.8.2.tgz
|
||||
mv tranquility-distribution-0.8.2 tranquility
|
||||
```
|
||||
|
||||
The startup scripts for the tutorial will expect the contents of the Tranquility tarball to be located at `tranquility` under the druid-#{DRUIDVERSION} package root.
|
||||
|
||||
## Enable Tranquility Server
|
||||
|
||||
- In your `quickstart/tutorial/conf/tutorial-cluster.conf`, uncomment the `tranquility-server` line.
|
||||
- Stop your *bin/supervise* command (CTRL-C) and then restart it by again running `bin/supervise -c quickstart/tutorial/conf/tutorial-cluster.conf`.
|
||||
|
||||
As part of the output of *supervise* you should see something like:
|
||||
|
||||
```
|
||||
Running command[tranquility-server], logging to[/stage/druid-#{DRUIDVERSION}/var/sv/tranquility-server.log]: tranquility/bin/tranquility server -configFile quickstart/tutorial/conf/tranquility/server.json -Ddruid.extensions.loadList=[]
|
||||
```
|
||||
|
||||
You can check the log file in `var/sv/tranquility-server.log` to confirm that the server is starting up properly.
|
||||
|
||||
## Send data
|
||||
|
||||
Let's send the sample Wikipedia edits data to Tranquility:
|
||||
|
||||
```
|
||||
gunzip -k quickstart/wikiticker-2015-09-12-sampled.json.gz
|
||||
curl -XPOST -H'Content-Type: application/json' --data-binary @quickstart/wikiticker-2015-09-12-sampled.json http://localhost:8200/v1/post/wikipedia
|
||||
```
|
||||
|
||||
This will print something like:
|
||||
|
||||
```
|
||||
{"result":{"received":39244,"sent":39244}}
|
||||
```
|
||||
|
||||
This indicates that the HTTP server received 39,244 events from you, and sent 39,244 to Druid. This
|
||||
command may generate a "connection refused" error if you run it too quickly after enabling Tranquility
|
||||
Server, which means the server has not yet started up. It should start up within a few seconds. The command
|
||||
may also take a few seconds to finish the first time you run it, during which time Druid resources are being
|
||||
allocated to the ingestion task. Subsequent POSTs will complete quickly once this is done.
|
||||
|
||||
Once the data is sent to Druid, you can immediately query it.
|
||||
|
||||
If you see a `sent` count of 0, retry the send command until the `sent` count also shows 39244:
|
||||
|
||||
```
|
||||
{"result":{"received":39244,"sent":0}}
|
||||
```
|
||||
|
||||
## Querying your data
|
||||
|
||||
Please follow the [query tutorial](../tutorials/tutorial-query.html) to run some example queries on the newly loaded data.
|
||||
|
||||
## Cleanup
|
||||
|
||||
If you wish to go through any of the other ingestion tutorials, you will need to shut down the cluster and reset the cluster state by removing the contents of the `var` directory under the druid package, as the other tutorials will write to the same "wikipedia" datasource.
|
||||
|
||||
When cleaning up after running this Tranquility tutorial, you must also comment out the `tranquility-server` line in `quickstart/tutorial/conf/tutorial-cluster.conf` again before restarting the cluster.
|
||||
|
||||
|
||||
## Further reading
|
||||
|
||||
For more information on Tranquility, please see [the Tranquility documentation](https://github.com/druid-io/tranquility).
|
|
@ -0,0 +1,138 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
|
||||
# Tutorial: Transforming input data
|
||||
|
||||
This tutorial will demonstrate how to use transform specs to filter and transform input data during ingestion.
|
||||
|
||||
For this tutorial, we'll assume you've already downloaded Druid as described in
|
||||
the [single-machine quickstart](index.html) and have it running on your local machine.
|
||||
|
||||
It will also be helpful to have finished [Tutorial: Loading a file](/docs/VERSION/tutorials/tutorial-batch.html) and [Tutorial: Querying data](/docs/VERSION/tutorials/tutorial-query.html).
|
||||
|
||||
## Sample data
|
||||
|
||||
We've included sample data for this tutorial at `quickstart/tutorial/transform-data.json`, reproduced here for convenience:
|
||||
|
||||
```
|
||||
{"timestamp":"2018-01-01T07:01:35Z","animal":"octopus", "location":1, "number":100}
|
||||
{"timestamp":"2018-01-01T05:01:35Z","animal":"mongoose", "location":2,"number":200}
|
||||
{"timestamp":"2018-01-01T06:01:35Z","animal":"snake", "location":3, "number":300}
|
||||
{"timestamp":"2018-01-01T01:01:35Z","animal":"lion", "location":4, "number":300}
|
||||
```
|
||||
|
||||
## Load data with transform specs
|
||||
|
||||
We will ingest the sample data using the following spec, which demonstrates the use of transform specs:
|
||||
|
||||
```
|
||||
{
|
||||
"type" : "index",
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "transform-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"dimensionsSpec" : {
|
||||
"dimensions" : [
|
||||
"animal",
|
||||
{ "name": "location", "type": "long" }
|
||||
]
|
||||
},
|
||||
"timestampSpec": {
|
||||
"column": "timestamp",
|
||||
"format": "iso"
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{ "type" : "count", "name" : "count" },
|
||||
{ "type" : "longSum", "name" : "number", "fieldName" : "number" },
|
||||
{ "type" : "longSum", "name" : "triple-number", "fieldName" : "triple-number" }
|
||||
],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "week",
|
||||
"queryGranularity" : "minute",
|
||||
"intervals" : ["2018-01-01/2018-01-03"],
|
||||
"rollup" : true
|
||||
},
|
||||
"transformSpec": {
|
||||
"transforms": [
|
||||
{
|
||||
"type": "expression",
|
||||
"name": "animal",
|
||||
"expression": "concat('super-', animal)"
|
||||
},
|
||||
{
|
||||
"type": "expression",
|
||||
"name": "triple-number",
|
||||
"expression": "number * 3"
|
||||
}
|
||||
],
|
||||
"filter": {
|
||||
"type":"or",
|
||||
"fields": [
|
||||
{ "type": "selector", "dimension": "animal", "value": "super-mongoose" },
|
||||
{ "type": "selector", "dimension": "triple-number", "value": "300" },
|
||||
{ "type": "selector", "dimension": "location", "value": "3" }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "index",
|
||||
"firehose" : {
|
||||
"type" : "local",
|
||||
"baseDir" : "quickstart/tutorial",
|
||||
"filter" : "transform-data.json"
|
||||
},
|
||||
"appendToExisting" : false
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "index",
|
||||
"targetPartitionSize" : 5000000,
|
||||
"maxRowsInMemory" : 25000,
|
||||
"forceExtendableShardSpecs" : true
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
In the transform spec, we have two expression transforms:
|
||||
* `animal`: prepends "super-" to the values in the `animal` column. This will override the `animal` column with the transformed version, since the transform's name is `animal`.
|
||||
* `triple-number`: multiplies the `number` column by 3. This will create a new `triple-number` column. Note that we are ingesting both the original and the transformed column.
|
||||
|
||||
Additionally, we have an OR filter with three clauses:
|
||||
* `animal` values that match "super-mongoose"
|
||||
* `triple-number` values that match 300
|
||||
* `location` values that match 3
|
||||
|
||||
This filter selects the first 3 rows, and it will exclude the final "lion" row in the input data. Note that the filter is applied after the transformation.
|
||||
|
||||
Let's submit this task now, which has been included at `quickstart/tutorial/transform-index.json`:
|
||||
|
||||
```
|
||||
bin/post-index-task --file quickstart/tutorial/transform-index.json
|
||||
```
|
||||
|
||||
## Query the transformed data
|
||||
|
||||
Let's run `bin/dsql` and issue a `select * from "transform-tutorial";` query to see what was ingested:
|
||||
|
||||
```
|
||||
dsql> select * from "transform-tutorial";
|
||||
┌──────────────────────────┬────────────────┬───────┬──────────┬────────┬───────────────┐
|
||||
│ __time │ animal │ count │ location │ number │ triple-number │
|
||||
├──────────────────────────┼────────────────┼───────┼──────────┼────────┼───────────────┤
|
||||
│ 2018-01-01T05:01:00.000Z │ super-mongoose │ 1 │ 2 │ 200 │ 600 │
|
||||
│ 2018-01-01T06:01:00.000Z │ super-snake │ 1 │ 3 │ 300 │ 900 │
|
||||
│ 2018-01-01T07:01:00.000Z │ super-octopus │ 1 │ 1 │ 100 │ 300 │
|
||||
└──────────────────────────┴────────────────┴───────┴──────────┴────────┴───────────────┘
|
||||
Retrieved 3 rows in 0.03s.
|
||||
```
|
||||
|
||||
The "lion" row has been discarded, the `animal` column has been transformed, and we have both the original and transformed `number` column.
|
|
@ -0,0 +1,150 @@
|
|||
---
|
||||
layout: doc_page
|
||||
---
|
||||
|
||||
# Tutorial: Updating existing data
|
||||
|
||||
This tutorial demonstrates how to update existing data, showing both overwrites and appends.
|
||||
|
||||
For this tutorial, we'll assume you've already downloaded Druid as described in
|
||||
the [single-machine quickstart](index.html) and have it running on your local machine.
|
||||
|
||||
It will also be helpful to have finished [Tutorial: Loading a file](/docs/VERSION/tutorials/tutorial-batch.html), [Tutorial: Querying data](/docs/VERSION/tutorials/tutorial-query.html), and [Tutorial: Rollup](/docs/VERSION/tutorials/tutorial-rollup.html).
|
||||
|
||||
## Overwrite
|
||||
|
||||
This section of the tutorial will cover how to overwrite an existing interval of data.
|
||||
|
||||
### Load initial data
|
||||
|
||||
Let's load an initial data set which we will overwrite and append to.
|
||||
|
||||
The spec we'll use for this tutorial is located at `quickstart/tutorial/updates-init-index.json`. This spec creates a datasource called `updates-tutorial` from the `quickstart/tutorial/updates-data.json` input file.
|
||||
|
||||
Let's submit that task:
|
||||
|
||||
```
|
||||
bin/post-index-task --file quickstart/tutorial/updates-init-index.json
|
||||
```
|
||||
|
||||
We have three initial rows containing an "animal" dimension and "number" metric:
|
||||
|
||||
```
|
||||
dsql> select * from "updates-tutorial";
|
||||
┌──────────────────────────┬──────────┬───────┬────────┐
|
||||
│ __time │ animal │ count │ number │
|
||||
├──────────────────────────┼──────────┼───────┼────────┤
|
||||
│ 2018-01-01T01:01:00.000Z │ tiger │ 1 │ 100 │
|
||||
│ 2018-01-01T03:01:00.000Z │ aardvark │ 1 │ 42 │
|
||||
│ 2018-01-01T03:01:00.000Z │ giraffe │ 1 │ 14124 │
|
||||
└──────────────────────────┴──────────┴───────┴────────┘
|
||||
Retrieved 3 rows in 1.42s.
|
||||
```
|
||||
|
||||
### Overwrite the initial data
|
||||
|
||||
To overwrite this data, we can submit another task for the same interval, but with different input data.
|
||||
|
||||
The `quickstart/tutorial/updates-overwrite-index.json` spec will perform an overwrite on the `updates-tutorial` datasource.
|
||||
|
||||
Note that this task reads input from `quickstart/tutorial/updates-data2.json`, and `appendToExisting` is set to `false` (indicating this is an overwrite).
|
||||
|
||||
Let's submit that task:
|
||||
|
||||
```
|
||||
bin/post-index-task --file quickstart/tutorial/updates-overwrite-index.json
|
||||
```
|
||||
|
||||
When Druid finishes loading the new segment from this overwrite task, the "tiger" row now has the value "lion", the "aardvark" row has a different number, and the "giraffe" row has been replaced. It may take a couple of minutes for the changes to take effect:
|
||||
|
||||
```
|
||||
dsql> select * from "updates-tutorial";
|
||||
┌──────────────────────────┬──────────┬───────┬────────┐
|
||||
│ __time │ animal │ count │ number │
|
||||
├──────────────────────────┼──────────┼───────┼────────┤
|
||||
│ 2018-01-01T01:01:00.000Z │ lion │ 1 │ 100 │
|
||||
│ 2018-01-01T03:01:00.000Z │ aardvark │ 1 │ 9999 │
|
||||
│ 2018-01-01T04:01:00.000Z │ bear │ 1 │ 111 │
|
||||
└──────────────────────────┴──────────┴───────┴────────┘
|
||||
Retrieved 3 rows in 0.02s.
|
||||
```
|
||||
|
||||
## Combine old data with new data and overwrite
|
||||
|
||||
Let's try appending some new data to the `updates-tutorial` datasource now. We will add the data from `quickstart/tutorial/updates-data3.json`.
|
||||
|
||||
The `quickstart/tutorial/updates-append-index.json` task spec has been configured to read from the existing `updates-tutorial` datasource and the `quickstart/tutorial/updates-data3.json` file. The task will combine data from the two input sources, and then overwrite the original data with the new combined data.
|
||||
|
||||
Let's submit that task:
|
||||
|
||||
```
|
||||
bin/post-index-task --file quickstart/tutorial/updates-append-index.json
|
||||
```
|
||||
|
||||
When Druid finishes loading the new segment from this overwrite task, the new rows will have been added to the datasource. Note that roll-up occurred for the "lion" row:
|
||||
|
||||
```
|
||||
dsql> select * from "updates-tutorial";
|
||||
┌──────────────────────────┬──────────┬───────┬────────┐
|
||||
│ __time │ animal │ count │ number │
|
||||
├──────────────────────────┼──────────┼───────┼────────┤
|
||||
│ 2018-01-01T01:01:00.000Z │ lion │ 2 │ 400 │
|
||||
│ 2018-01-01T03:01:00.000Z │ aardvark │ 1 │ 9999 │
|
||||
│ 2018-01-01T04:01:00.000Z │ bear │ 1 │ 111 │
|
||||
│ 2018-01-01T05:01:00.000Z │ mongoose │ 1 │ 737 │
|
||||
│ 2018-01-01T06:01:00.000Z │ snake │ 1 │ 1234 │
|
||||
│ 2018-01-01T07:01:00.000Z │ octopus │ 1 │ 115 │
|
||||
└──────────────────────────┴──────────┴───────┴────────┘
|
||||
Retrieved 6 rows in 0.02s.
|
||||
```
|
||||
|
||||
## Append to the data
|
||||
|
||||
Let's try another way of appending data.
|
||||
|
||||
The `quickstart/tutorial/updates-append-index2.json` task spec reads input from `quickstart/tutorial/updates-data4.json` and will append its data to the `updates-tutorial` datasource. Note that `appendToExisting` is set to `true` in this spec.
|
||||
|
||||
Let's submit that task:
|
||||
|
||||
```
|
||||
bin/post-index-task --file quickstart/tutorial/updates-append-index2.json
|
||||
```
|
||||
|
||||
When the new data is loaded, we can see two additional rows after "octopus". Note that the new "bear" row with number 222 has not been rolled up with the existing bear-111 row, because the new data is held in a separate segment.
|
||||
|
||||
```
|
||||
dsql> select * from "updates-tutorial";
|
||||
┌──────────────────────────┬──────────┬───────┬────────┐
|
||||
│ __time │ animal │ count │ number │
|
||||
├──────────────────────────┼──────────┼───────┼────────┤
|
||||
│ 2018-01-01T01:01:00.000Z │ lion │ 2 │ 400 │
|
||||
│ 2018-01-01T03:01:00.000Z │ aardvark │ 1 │ 9999 │
|
||||
│ 2018-01-01T04:01:00.000Z │ bear │ 1 │ 111 │
|
||||
│ 2018-01-01T05:01:00.000Z │ mongoose │ 1 │ 737 │
|
||||
│ 2018-01-01T06:01:00.000Z │ snake │ 1 │ 1234 │
|
||||
│ 2018-01-01T07:01:00.000Z │ octopus │ 1 │ 115 │
|
||||
│ 2018-01-01T04:01:00.000Z │ bear │ 1 │ 222 │
|
||||
│ 2018-01-01T09:01:00.000Z │ falcon │ 1 │ 1241 │
|
||||
└──────────────────────────┴──────────┴───────┴────────┘
|
||||
Retrieved 8 rows in 0.02s.
|
||||
|
||||
```
|
||||
|
||||
If we run a GroupBy query instead of a `select *`, we can see that the "bear" rows will group together at query time:
|
||||
|
||||
```
|
||||
dsql> select __time, animal, SUM("count"), SUM("number") from "updates-tutorial" group by __time, animal;
|
||||
┌──────────────────────────┬──────────┬────────┬────────┐
|
||||
│ __time │ animal │ EXPR$2 │ EXPR$3 │
|
||||
├──────────────────────────┼──────────┼────────┼────────┤
|
||||
│ 2018-01-01T01:01:00.000Z │ lion │ 2 │ 400 │
|
||||
│ 2018-01-01T03:01:00.000Z │ aardvark │ 1 │ 9999 │
|
||||
│ 2018-01-01T04:01:00.000Z │ bear │ 2 │ 333 │
|
||||
│ 2018-01-01T05:01:00.000Z │ mongoose │ 1 │ 737 │
|
||||
│ 2018-01-01T06:01:00.000Z │ snake │ 1 │ 1234 │
|
||||
│ 2018-01-01T07:01:00.000Z │ octopus │ 1 │ 115 │
|
||||
│ 2018-01-01T09:01:00.000Z │ falcon │ 1 │ 1241 │
|
||||
└──────────────────────────┴──────────┴────────┴────────┘
|
||||
Retrieved 7 rows in 0.23s.
|
||||
```
|
||||
|
24
examples/conf-quickstart/druid/coordinator/runtime.properties → examples/bin/dsql
Normal file → Executable file
|
@ -1,4 +1,5 @@
|
|||
#
|
||||
#!/bin/bash -eu
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
|
@ -15,10 +16,21 @@
|
|||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
#
|
||||
|
||||
druid.service=druid/coordinator
|
||||
druid.port=8081
|
||||
PWD="$(pwd)"
|
||||
WHEREAMI="$(dirname "$0")"
|
||||
WHEREAMI="$(cd "$WHEREAMI" && pwd)"
|
||||
|
||||
druid.coordinator.startDelay=PT10S
|
||||
druid.coordinator.period=PT5S
|
||||
# Wrap the client in rlwrap (line editing, with "dsql" as the command name
# passed to -C) when an executable rlwrap is on the PATH; otherwise run bare.
if [ -x "$(command -v rlwrap)" ]
then
  RLWRAP="rlwrap -C dsql"
else
  RLWRAP=""
fi

# Run dsql-main under an explicit python2 interpreter when one is available.
# $RLWRAP is deliberately unquoted so it word-splits (or vanishes when empty).
if [ -x "$(command -v python2)" ]
then
  exec $RLWRAP python2 "$WHEREAMI/dsql-main" "$@"
fi

# No python2 on the PATH: execute dsql-main directly and let it pick its interpreter.
exec $RLWRAP "$WHEREAMI/dsql-main" "$@"
|
|
@ -0,0 +1,453 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import collections
|
||||
import csv
|
||||
import errno
|
||||
import json
|
||||
import numbers
|
||||
import re
|
||||
import ssl
|
||||
import sys
|
||||
import time
|
||||
import unicodedata
|
||||
import urllib2
|
||||
|
||||
class DruidSqlException(Exception):
  """Failure of a Druid SQL request, printable as a one-line red message."""

  def write_to(self, f):
    """Write this error to the file-like object `f` and flush it.

    The text is wrapped in ANSI escapes (red, then reset) and followed by a
    newline. "Query failed" is written when the exception has no message.
    """
    # BaseException.message is deprecated since Python 2.6 and does not exist
    # on Python 3. This reproduces its Python 2 value: the sole constructor
    # argument, or '' when there were zero or several arguments.
    message = self.args[0] if len(self.args) == 1 else ''

    f.write('\x1b[31m')
    f.write(message if message else "Query failed")
    f.write('\x1b[0m')
    f.write('\n')
    f.flush()
|
||||
|
||||
def do_query(url, sql, context, timeout, user, password, ignore_ssl_verification, ca_file, ca_path):
  """Generator that POSTs a SQL query to `url` and yields result rows as they stream in.

  Nothing is sent until the first row is requested. The response body is read
  in 8 KB chunks; it is expected to be a JSON array, and every complete
  element decoded from it is yielded as soon as it is available. Objects are
  decoded into OrderedDicts so keys keep the order of the response.

  url: SQL endpoint to POST to.
  sql, context: sent as the 'query' and 'context' fields of the JSON body.
  timeout: seconds to wait; a value <= 0 disables the timeout.
  user, password: sent as an HTTP basic auth header when both are truthy.
  ignore_ssl_verification: disable hostname and certificate checks.
  ca_file, ca_path: CA locations handed to SSLContext.load_verify_locations().

  Raises DruidSqlException (through raise_friendly_error) for urllib2.URLError.
  A ValueError from the JSON decoder propagates when the stream has ended, or
  more than 256 KB is buffered, and the buffer still cannot be decoded.
  """
  json_decoder = json.JSONDecoder(object_pairs_hook=collections.OrderedDict)
  try:
    sql_json = json.dumps({'query' : sql, 'context' : context})

    # Only build a custom SSL context when one of the SSL options was given;
    # otherwise pass None and let urlopen use its default behaviour.
    ssl_context = None
    if ignore_ssl_verification or ca_file is not None or ca_path is not None:
      ssl_context = ssl.create_default_context()
      if ignore_ssl_verification:
        ssl_context.check_hostname = False
        ssl_context.verify_mode = ssl.CERT_NONE
      else:
        ssl_context.load_verify_locations(cafile=ca_file, capath=ca_path)

    req = urllib2.Request(url, sql_json, {'Content-Type' : 'application/json'})
    if timeout <= 0:
      timeout = None

    if user and password:
      basicAuthEncoding = base64.b64encode('%s:%s' % (user, password))
      req.add_header("Authorization", "Basic %s" % basicAuthEncoding)

    response = urllib2.urlopen(req, None, timeout, context=ssl_context)

    first_chunk = True
    eof = False
    buf = ''

    # Keep going while the stream is open or undecoded text remains buffered.
    while not eof or len(buf) > 0:
      # Drain every complete object currently sitting in buf.
      while True:
        try:
          # Remove the ',' separating this element from the previous one.
          buf = buf.lstrip(',')
          obj, sz = json_decoder.raw_decode(buf)
          yield obj
          buf = buf[sz:]
        except ValueError as e:
          # Maybe invalid JSON, maybe partial object; it's hard to tell with this library.
          if eof and buf.rstrip() == ']':
            # Stream done and all objects read; only the closing bracket is left.
            buf = ''
            break
          elif eof or len(buf) > 256 * 1024:
            # If we read more than 256KB or if it's eof then report the parse error.
            raise
          else:
            # Stop reading objects, get more from the stream instead.
            break

      # Read more from the http stream
      if not eof:
        chunk = response.read(8192)
        if chunk:
          buf = buf + chunk
          if first_chunk:
            # Remove the '[' that opens the result array. The flag is cleared
            # so later chunks are appended untouched; previously it was never
            # reset and this strip ran again on every chunk.
            buf = buf.lstrip('[')
            first_chunk = False
        else:
          # Stream done. Keep reading objects out of buf though.
          eof = True

  except urllib2.URLError as e:
    raise_friendly_error(e)
|
||||
|
||||
def raise_friendly_error(e):
  """Re-raise a urllib2 error as a DruidSqlException with a readable message.

  For an HTTPError the response body is read. When the status is 500 and the
  body is a JSON object containing 'errorMessage', the message is assembled
  from its 'error', 'errorClass', 'errorMessage' and 'host' fields. Any other
  HTTPError reports the status code, reason and raw body. Non-HTTP errors are
  reported as str(e).

  This function always raises DruidSqlException and never returns.
  """
  if isinstance(e, urllib2.HTTPError):
    text = e.read().strip()
    error_obj = {}
    try:
      error_obj = dict(json.loads(text))
    except (ValueError, TypeError):
      # Body is not JSON, or is JSON that is not an object: fall through to
      # the generic "HTTP Error" message, which includes the raw text.
      pass

    if e.code == 500 and 'errorMessage' in error_obj:
      # 'error', 'errorClass' and 'host' are read with .get() so a body that
      # only carries 'errorMessage' still yields a friendly message instead
      # of a KeyError.
      error_text = ''
      error = error_obj.get('error')
      if error and error != 'Unknown exception':
        error_text = error_text + error + ': '
      if error_obj.get('errorClass'):
        error_text = error_text + str(error_obj['errorClass']) + ': '
      error_text = error_text + str(error_obj['errorMessage'])
      if error_obj.get('host'):
        error_text = error_text + ' (' + str(error_obj['host']) + ')'
      raise DruidSqlException(error_text)
    else:
      raise DruidSqlException("HTTP Error {0}: {1}\n{2}".format(e.code, e.reason, text))
  else:
    raise DruidSqlException(str(e))
|
||||
|
||||
def to_utf8(value):
  """Return `value` as a byte string: '' for None, UTF-8 bytes for unicode, str(value) otherwise."""
  if value is None:
    return ""
  if isinstance(value, unicode):
    return value.encode("utf-8")
  return str(value)
|
||||
|
||||
def to_tsv(values, delimiter):
  """Join `values` into a single line separated by `delimiter`.

  Each value is converted with to_utf8, and any occurrence of the delimiter
  inside a value is deleted so it cannot split the column.
  """
  cells = [to_utf8(item).replace(delimiter, '') for item in values]
  return delimiter.join(cells)
|
||||
|
||||
def print_csv(rows, header):
  """Write `rows` (an iterable of mappings) to stdout as CSV.

  When `header` is truthy, the keys of the first row are written as a header
  line before that row's values.
  """
  writer = csv.writer(sys.stdout)
  header_pending = header
  for row in rows:
    if header_pending:
      writer.writerow([to_utf8(name) for name in row.keys()])
      header_pending = False

    writer.writerow([to_utf8(cell) for cell in row.values()])
|
||||
|
||||
def print_tsv(rows, header, tsv_delimiter):
  """Print `rows` (an iterable of mappings) to stdout, one delimited line per row.

  When `header` is truthy, the keys of the first row are printed as a header
  line before that row's values.
  """
  header_pending = header
  for row in rows:
    if header_pending:
      print(to_tsv(row.keys(), tsv_delimiter))
      header_pending = False

    cells = [cell for cell in row.values()]
    print(to_tsv(cells, tsv_delimiter))
|
||||
|
||||
def print_json(rows):
  """Print every row as its own JSON document, one per line."""
  # A generator keeps this streaming: each row is printed as it arrives.
  for document in (json.dumps(row) for row in rows):
    print(document)
|
||||
|
||||
def table_to_printable_value(value):
  """Return the unicode text shown in a table cell for `value`.

  None is shown as NULL. Anything else is converted with to_utf8, stripped of
  surrounding whitespace, decoded as UTF-8, and has code points 0-31 removed.
  """
  if value is None:
    return u"NULL"

  # Mapping each code point 0-31 to None makes translate() delete it.
  control_chars = dict.fromkeys(range(32))
  text = to_utf8(value).strip().decode('utf-8')
  return text.translate(control_chars)
|
||||
|
||||
def table_compute_string_width(v):
  """Return the number of columns the unicode string `v` is counted as occupying.

  The string is NFC-normalized first. Characters of category 'Cf' (format
  controls) count as 0, East Asian full-width and wide characters count as 2,
  and every other character counts as 1.
  """
  def columns(ch):
    if unicodedata.category(ch) == 'Cf':
      # Formatting control, zero width
      return 0
    if unicodedata.east_asian_width(ch) in ('F', 'W'):
      # Double-wide character, prints in two columns
      return 2
    # All other characters
    return 1

  return sum(columns(ch) for ch in unicodedata.normalize('NFC', v))
|
||||
|
||||
def table_compute_column_widths(row_buffer):
  """Return a list holding, per column, the widest cell found in `row_buffer`.

  `row_buffer` is a sequence of rows, each a sequence of unicode cell values.
  Returns None when `row_buffer` is empty.
  """
  widths = None
  for values in row_buffer:
    row_widths = [table_compute_string_width(cell) for cell in values]
    if not widths:
      # First row seen: its widths are the starting maxima.
      widths = row_widths
      continue

    for column, width in enumerate(row_widths):
      widths[column] = max(widths[column], width)
  return widths
|
||||
|
||||
def table_print_row(values, column_widths, column_types):
  """Print one table row: cells padded to `column_widths`, separated by vertical bars.

  values: unicode cell values for the row.
  column_widths: target width of each column.
  column_types: per-column type codes, or None. Columns marked 'n' are
    right-aligned; all others (and every column when this is None or empty)
    are left-aligned.
  """
  bar = u'\u2502'.encode('utf-8')
  for column, value in enumerate(values):
    pad = ' ' * max(0, column_widths[column] - table_compute_string_width(value))
    cell = value.encode('utf-8')
    if column_types and column_types[column] == 'n':
      body = pad + cell
    else:
      body = cell + pad
    print(bar + ' ' + body + ' ', end="")
  print(bar)
|
||||
|
||||
def table_print_header(values, column_widths):
  """Print the three-line table header: top border, column names, separator rule.

  values: unicode column names.
  column_widths: width of each column; each border segment is two wider to
    cover the space printed on either side of a cell.
  """
  horizontal_line = u'\u2500'.encode('utf-8')

  def print_rule(left, joint, right):
    # One horizontal segment per column, joined by `joint`, capped by the corners.
    segments = [horizontal_line * max(0, width + 2) for width in column_widths]
    print(left + joint.join(segments) + right)

  # Line 1
  print_rule(u'\u250C'.encode('utf-8'), u'\u252C'.encode('utf-8'), u'\u2510'.encode('utf-8'))

  # Line 2
  table_print_row(values, column_widths, None)

  # Line 3
  print_rule(u'\u251C'.encode('utf-8'), u'\u253C'.encode('utf-8'), u'\u2524'.encode('utf-8'))
|
||||
|
||||
def table_print_bottom(column_widths):
  """Print the bottom border of the table, one segment per entry in `column_widths`."""
  horizontal_line = u'\u2500'.encode('utf-8')
  bottom_tee = u'\u2534'.encode('utf-8')

  # Each segment is two wider than its column to cover the cell's side spaces.
  segments = [horizontal_line * max(0, width + 2) for width in column_widths]
  print(u'\u2514'.encode('utf-8') + bottom_tee.join(segments) + u'\u2518'.encode('utf-8'))
|
||||
|
||||
def table_print_row_buffer(row_buffer, column_widths, column_types):
  """Print buffered rows, treating the first entry as the header and the rest as data rows."""
  for position, values in enumerate(row_buffer):
    if position == 0:
      table_print_header(values, column_widths)
    else:
      table_print_row(values, column_widths, column_types)
|
||||
|
||||
def print_table(rows):
  """Print `rows` (an iterable of mappings) as a box-drawn table, then a summary line.

  Up to 500 rows are buffered so column widths can be computed from them.
  Once the buffer is full it is printed, and later rows are printed one at a
  time using those same widths. Column names and alignment types are taken
  from the first row. The summary reports the row count and elapsed time.
  """
  started_at = time.time()
  nrows = 0

  # Buffer some rows before printing.
  buffer_slots = 500
  row_buffer = []
  column_types = []
  column_widths = None

  for row in rows:
    nrows += 1

    if nrows == 1:
      # The first row supplies the header cells and decides, per column,
      # whether values are numeric ('n') or not ('s').
      row_buffer.append([table_to_printable_value(name) for name in row.keys()])
      column_types = ['n' if isinstance(row[name], numbers.Number) else 's' for name in row.keys()]

    values = [table_to_printable_value(cell) for cell in row.values()]

    if buffer_slots > 0:
      row_buffer.append(values)
      buffer_slots -= 1
      continue

    # Buffer is full: flush it once, then stream this and all later rows.
    if row_buffer:
      column_widths = table_compute_column_widths(row_buffer)
      table_print_row_buffer(row_buffer, column_widths, column_types)
      del row_buffer[:]
    table_print_row(values, column_widths, column_types)

  # Fewer rows than the buffer holds: nothing has been printed yet.
  if row_buffer:
    column_widths = table_compute_column_widths(row_buffer)
    table_print_row_buffer(row_buffer, column_widths, column_types)

  if column_widths:
    table_print_bottom(column_widths)

  plural = '' if nrows == 1 else 's'
  print("Retrieved {0:,d} row{1:s} in {2:.2f}s.".format(nrows, plural, time.time() - started_at))
  print("")
|
||||
|
||||
def display_query(url, sql, context, args):
  """Run `sql` against `url` and print the rows in the format chosen by `args.format`.

  Connection settings (timeout, credentials, SSL options) and output options
  (header, tsv_delimiter) are read from `args`. A format other than 'csv',
  'tsv', 'json' or 'table' prints nothing.
  """
  rows = do_query(url, sql, context, args.timeout, args.user, args.password, args.ignore_ssl_verification, args.cafile, args.capath)

  # Thunks keep the option lookups lazy, so only the chosen printer touches args.
  printers = {
    'csv': lambda: print_csv(rows, args.header),
    'tsv': lambda: print_tsv(rows, args.header, args.tsv_delimiter),
    'json': lambda: print_json(rows),
    'table': lambda: print_table(rows),
  }

  printer = printers.get(args.format)
  if printer is not None:
    printer()
|
||||
|
||||
def sql_escape(s):
  """Return `s` as a SQL Unicode string literal of the form U&'...'.

  Letters, numbers and spaces are copied as-is. Every other character is
  written as a backslash followed by four hex digits. None becomes the empty
  literal ''. Values that are not unicode are converted with str() and
  decoded as UTF-8 first.
  """
  if s is None:
    return "''"
  elif isinstance(s, type(u'')):
    # type(u'') is the unicode text type, without naming `unicode` directly.
    ustr = s
  else:
    ustr = str(s).decode('utf-8')

  escaped = [u"U&'"]

  for c in ustr:
    ccategory = unicodedata.category(c)
    if ccategory.startswith('L') or ccategory.startswith('N') or c == ' ':
      escaped.append(c)
    else:
      codepoint = ord(c)
      if codepoint > 0xFFFF:
        # The \XXXX escape holds exactly four hex digits, so a code point
        # beyond the BMP is written as its UTF-16 surrogate pair. Formatting
        # it directly produced five or more digits, which is a different
        # (wrong) escape followed by stray characters.
        codepoint -= 0x10000
        units = (0xD800 + (codepoint >> 10), 0xDC00 + (codepoint & 0x3FF))
      else:
        units = (codepoint,)
      for unit in units:
        escaped.append(u'\\%04x' % unit)

  escaped.append(u"'")
  return u''.join(escaped)
|
||||
|
||||
def main():
  """Command-line entry point for the dsql client.

  Parses options, builds the broker SQL endpoint URL and the query context,
  then either runs the single statement given with --execute or starts an
  interactive prompt. In interactive mode, input is accumulated across lines
  until it ends with a semicolon; a line starting with a backslash on an
  empty buffer is treated as a client command (\\d, \\dS, \\h, \\q).

  Raises ValueError for a --context-option that is not of the form key=value.
  Exits with status 1 on end-of-input at the prompt and 0 on \\q.
  """
  parser = argparse.ArgumentParser(description='Druid SQL command-line client.')
  parser.add_argument('--host', '-H', type=str, default='http://localhost:8082/', help='Broker host or url')
  parser.add_argument('--timeout', type=int, default=0, help='Timeout in seconds, 0 for no timeout')
  parser.add_argument('--format', type=str, default='table', choices=('csv', 'tsv', 'json', 'table'), help='Result format')
  parser.add_argument('--header', action='store_true', help='Include header row for formats "csv" and "tsv"')
  parser.add_argument('--tsv-delimiter', type=str, default='\t', help='Delimiter for format "tsv"')
  parser.add_argument('--context-option', '-c', type=str, action='append', help='Set context option for this connection')
  parser.add_argument('--execute', '-e', type=str, help='Execute single SQL query')
  parser.add_argument('--user', '-u', type=str, help='Username for HTTP basic auth')
  parser.add_argument('--password', '-p', type=str, help='Password for HTTP basic auth')
  parser.add_argument('--ignore-ssl-verification', '-k', action='store_true', default=False, help='Skip verification of SSL certificates.')
  parser.add_argument('--cafile', type=str, help='Path to SSL CA file for validating server certificates. See load_verify_locations() in https://docs.python.org/2/library/ssl.html#ssl.SSLContext.')
  parser.add_argument('--capath', type=str, help='SSL CA path for validating server certificates. See load_verify_locations() in https://docs.python.org/2/library/ssl.html#ssl.SSLContext.')
  args = parser.parse_args()

  # Build broker URL; a bare host[:port] is assumed to be plain http.
  url = args.host.rstrip('/') + '/druid/v2/sql/'
  if not url.startswith('http:') and not url.startswith('https:'):
    url = 'http://' + url

  # Build context; purely numeric values are sent as integers, the rest as strings.
  context = {}
  if args.context_option:
    for opt in args.context_option:
      kv = opt.split("=", 1)
      if len(kv) != 2:
        raise ValueError('Invalid context option, should be key=value: ' + opt)
      if re.match(r"^\d+$", kv[1]):
        context[kv[0]] = long(kv[1])
      else:
        context[kv[0]] = kv[1]

  if args.execute:
    display_query(url, args.execute, context, args)
    return

  # interactive mode
  print("Welcome to dsql, the command-line client for Druid SQL.")
  print("Type \"\\h\" for help.")

  while True:
    sql = ''
    while not sql.endswith(';'):
      prompt = "dsql> " if sql == '' else 'more> '
      try:
        more_sql = raw_input(prompt)
      except EOFError:
        sys.stdout.write('\n')
        sys.exit(1)

      if sql != '' or not more_sql.startswith('\\'):
        # Ordinary SQL text: keep accumulating until a trailing semicolon.
        sql = (sql + ' ' + more_sql).strip()
        continue

      # backslash command
      dmatch = re.match(r'^\\d(S?)(\+?)(\s+.*?|)\s*$', more_sql)
      if dmatch:
        include_system = dmatch.group(1)
        arg = dmatch.group(3).strip()
        if arg:
          # \d <table>: describe the columns of one table
          sql = "SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = " + sql_escape(arg)
          if not include_system:
            sql = sql + " AND TABLE_SCHEMA = 'druid'"
        else:
          # \d with no argument: list tables
          sql = "SELECT TABLE_SCHEMA, TABLE_NAME FROM INFORMATION_SCHEMA.TABLES"
          if not include_system:
            sql = sql + " WHERE TABLE_SCHEMA = 'druid'"
        # break to execute sql
        break

      if re.match(r'^\\h\s*$', more_sql):
        print("Commands:")
        print(" \\d show tables")
        print(" \\dS show tables, including system tables")
        print(" \\d table_name describe table")
        print(" \\h show this help")
        print(" \\q exit this program")
        print("Or enter a SQL query ending with a semicolon (;).")
        continue

      if re.match(r'^\\q\s*$', more_sql):
        sys.exit(0)

      print("No such command: " + more_sql)

    try:
      display_query(url, sql.rstrip(';'), context, args)
    except DruidSqlException as e:
      # Report the failure and return to the prompt instead of exiting.
      e.write_to(sys.stdout)
    except KeyboardInterrupt:
      sys.stdout.write("Query interrupted\n")
      sys.stdout.flush()
|
||||
|
||||
# Script entry point: run main() and turn the expected failure modes into a
# plain exit status of 1 instead of a traceback.
try:
  main()
except DruidSqlException as sql_error:
  sql_error.write_to(sys.stderr)
  sys.exit(1)
except KeyboardInterrupt:
  sys.exit(1)
except IOError as io_error:
  # A closed output pipe (EPIPE) is a quiet exit; any other I/O error propagates.
  if io_error.errno != errno.EPIPE:
    raise
  sys.exit(1)
|
19
examples/conf-quickstart/druid/overlord/runtime.properties → examples/bin/post-index-task
Normal file → Executable file
|
@ -1,4 +1,5 @@
|
|||
#
|
||||
#!/bin/bash -eu
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
|
@ -15,12 +16,14 @@
|
|||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
#
|
||||
|
||||
druid.service=druid/overlord
|
||||
druid.port=8090
|
||||
PWD="$(pwd)"
|
||||
WHEREAMI="$(dirname "$0")"
|
||||
WHEREAMI="$(cd "$WHEREAMI" && pwd)"
|
||||
|
||||
druid.indexer.queue.startDelay=PT5S
|
||||
|
||||
druid.indexer.runner.type=remote
|
||||
druid.indexer.storage.type=metadata
|
||||
if [ -x "$(command -v python2)" ]
|
||||
then
|
||||
exec python2 "$WHEREAMI/post-index-task-main" "$@"
|
||||
else
|
||||
exec "$WHEREAMI/post-index-task-main" "$@"
|
||||
fi
|
|
@ -0,0 +1,176 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import urllib2
|
||||
import urlparse
|
||||
|
||||
def read_task_file(args):
  """Read the task spec named by args.file and return its raw text.

  The text is parsed as JSON only to validate it; the parsed value is
  discarded and the original string is returned unchanged. If the contents
  are not valid JSON, a message is written to stderr and the process exits
  with status 1.
  """
  with open(args.file, 'r') as f:
    contents = f.read()

  # We don't use the parsed data, but we want to fail early if it's invalid
  try:
    json.loads(contents)
  except Exception as e:
    sys.stderr.write('Invalid JSON in task file "{0}": {1}\n'.format(args.file, repr(e)))
    sys.exit(1)

  return contents
|
||||
|
||||
def add_basic_auth_header(args, req):
  """Attach an HTTP Basic Authorization header to req when a user was given.

  args -- namespace with .user and .password (either may be None)
  req  -- request object exposing add_header(name, value)

  Does nothing when args.user is None. A missing password is sent as an
  empty password rather than the literal text "None".
  """
  if args.user is None:
    return

  password = args.password if args.password is not None else ''
  basic_auth_encoded = base64.b64encode('%s:%s' % (args.user, password))
  req.add_header("Authorization", "Basic %s" % basic_auth_encoded)
|
||||
|
||||
# Keep trying until timeout_at, maybe die then
def post_task(args, task_json, timeout_at):
  """POST the task spec to the indexing service and return the response body.

  args       -- namespace with .url, .quiet and the basic-auth fields; .url is
                rewritten in place when the service answers with a redirect
  task_json  -- request body, sent with Content-Type application/json
  timeout_at -- absolute time (time.time() scale) after which failures are
                no longer retried

  HTTP status codes 400..500 and any failure at or after timeout_at are handed
  to raise_friendly_error. A redirect carrying a Location header updates
  args.url and the request is re-issued immediately. Every other URLError is
  retried after a 5 second pause. Retries run in a loop, so the number of
  attempts is limited by timeout_at rather than by the interpreter's
  recursion limit.
  """
  sleep_time = 5

  while True:
    try:
      url = args.url.rstrip("/") + "/druid/indexer/v1/task"
      req = urllib2.Request(url, task_json, {'Content-Type' : 'application/json'})
      add_basic_auth_header(args, req)
      timeleft = timeout_at - time.time()
      # Per-request timeout is clamped to the range 5..10 seconds.
      response_timeout = min(max(timeleft, 5), 10)
      response = urllib2.urlopen(req, None, response_timeout)
      try:
        return response.read().rstrip()
      finally:
        response.close()
    except urllib2.URLError as e:
      if isinstance(e, urllib2.HTTPError) and e.code >= 400 and e.code <= 500:
        # 4xx (problem with the request) or 500 (something wrong on the server)
        raise_friendly_error(e)
      elif time.time() >= timeout_at:
        # No further retries
        raise_friendly_error(e)
      elif isinstance(e, urllib2.HTTPError) and e.code in [301, 302, 303, 305, 307] and \
          e.info().getheader("Location") is not None:
        # Set the new location in args.url so it can be used by await_task_completion and re-issue the request
        location = urlparse.urlparse(e.info().getheader("Location"))
        args.url = "{0}://{1}".format(location.scheme, location.netloc)
        sys.stderr.write("Redirect response received, setting url to [{0}]\n".format(args.url))
      else:
        # If at first you don't succeed, try, try again!
        if not args.quiet:
          extra = ''
          if hasattr(e, 'read'):
            extra = e.read().rstrip()
          sys.stderr.write("Waiting up to {0}s for indexing service [{1}] to become available. [Got: {2} {3}]".format(max(sleep_time, int(timeout_at - time.time())), args.url, str(e), extra).rstrip())
          sys.stderr.write("\n")
        time.sleep(sleep_time)
|
||||
|
||||
# Keep trying until timeout_at, maybe die then
def await_task_completion(args, task_id, timeout_at):
  """Poll the task status endpoint until the task reaches a terminal state.

  Returns the status code string, 'SUCCESS' or 'FAILED'. While the task is in
  any other state the status is re-fetched roughly every 5 seconds. Raises
  Exception once timeout_at (time.time() scale) has passed without a terminal
  state. Errors raised by urllib2.urlopen are not caught here.
  """
  while True:
    url = args.url.rstrip("/") + "/druid/indexer/v1/task/{0}/status".format(task_id)
    req = urllib2.Request(url)
    add_basic_auth_header(args, req)
    timeleft = timeout_at - time.time()
    # Per-request timeout is clamped to the range 5..10 seconds.
    response_timeout = min(max(timeleft, 5), 10)
    response = urllib2.urlopen(req, None, response_timeout)
    try:
      response_obj = json.loads(response.read())
    finally:
      response.close()
    response_status_code = response_obj["status"]["statusCode"]

    if response_status_code in ['SUCCESS', 'FAILED']:
      return response_status_code

    if time.time() >= timeout_at:
      raise Exception("Task {0} did not finish in time!".format(task_id))

    if not args.quiet:
      sys.stderr.write("Task {0} still running...\n".format(task_id))
    timeleft = timeout_at - time.time()
    # The deadline may pass between the check above and this line; never
    # hand time.sleep a negative duration.
    time.sleep(max(0, min(5, timeleft)))
|
||||
|
||||
def raise_friendly_error(e):
|
||||
if isinstance(e, urllib2.HTTPError):
|
||||
text = e.read().strip()
|
||||
reresult = re.search(r'<pre>(.*?)</pre>', text, re.DOTALL)
|
||||
if reresult:
|
||||
text = reresult.group(1).strip()
|
||||
raise Exception("HTTP Error {0}: {1}, check overlord log for more details.\n{2}".format(e.code, e.reason, text))
|
||||
raise e
|
||||
|
||||
def await_load_completion(args, datasource, timeout_at):
  """Poll the coordinator's loadstatus endpoint until datasource is fully loaded.

  The response is read as a JSON object keyed by datasource name; a missing
  entry counts as 0.0. Returns (after writing a completion message to stderr)
  once the value reaches 100.0. Raises Exception once timeout_at (time.time()
  scale) has passed first. Errors raised by urllib2.urlopen are not caught
  here.
  """
  while True:
    url = args.coordinator_url.rstrip("/") + "/druid/coordinator/v1/loadstatus"
    req = urllib2.Request(url)
    add_basic_auth_header(args, req)
    timeleft = timeout_at - time.time()
    # Per-request timeout is clamped to the range 5..10 seconds.
    response_timeout = min(max(timeleft, 5), 10)
    response = urllib2.urlopen(req, None, response_timeout)
    try:
      response_obj = json.loads(response.read())
    finally:
      response.close()
    load_status = response_obj.get(datasource, 0.0)

    if load_status >= 100.0:
      sys.stderr.write("{0} loading complete! You may now query your data\n".format(datasource))
      return

    if time.time() >= timeout_at:
      raise Exception("{0} was not loaded in time!".format(datasource))

    if not args.quiet:
      sys.stderr.write("{0} is {1}% finished loading...\n".format(datasource, load_status))
    timeleft = timeout_at - time.time()
    # The deadline may pass between the check above and this line; never
    # hand time.sleep a negative duration.
    time.sleep(max(0, min(5, timeleft)))
|
||||
|
||||
def main():
  """Submit an indexing task, wait for it to finish, then wait for the data to load.

  Steps: parse options, read and validate the task file, work out the target
  datasource (top-level "dataSource" for "compact" tasks, otherwise
  spec.dataSchema.dataSource), post the task, poll its status, and finally
  poll the coordinator until the datasource is fully loaded. Progress is
  reported on stderr. Exits with status 1 when the task ends in any state
  other than SUCCESS.
  """
  parser = argparse.ArgumentParser(description='Post Druid indexing tasks.')
  parser.add_argument('--url', '-u', metavar='url', type=str, default='http://localhost:8090/', help='Druid Overlord url')
  parser.add_argument('--coordinator-url', type=str, default='http://localhost:8081/', help='Druid Coordinator url')
  parser.add_argument('--file', '-f', type=str, required=True, help='Task JSON file')
  parser.add_argument('--submit-timeout', type=int, default=120, help='Timeout (in seconds) for submitting tasks')
  parser.add_argument('--complete-timeout', type=int, default=14400, help='Timeout (in seconds) for completing tasks')
  parser.add_argument('--load-timeout', type=int, default=14400, help='Timeout (in seconds) for waiting for tasks to load')
  parser.add_argument('--quiet', '-q', action='store_true', help='Suppress retryable errors')
  parser.add_argument('--user', type=str, default=None, help='Basic auth username')
  parser.add_argument('--password', type=str, default=None, help='Basic auth password')
  args = parser.parse_args()

  # Both deadlines are measured from startup, not from task submission.
  submit_timeout_at = time.time() + args.submit_timeout
  complete_timeout_at = time.time() + args.complete_timeout

  task_contents = read_task_file(args)
  task_json = json.loads(task_contents)
  if task_json['type'] == "compact":
    datasource = task_json['dataSource']
  else:
    datasource = task_json["spec"]["dataSchema"]["dataSource"]
  sys.stderr.write("Beginning indexing data for {0}\n".format(datasource))

  task_id = json.loads(post_task(args, task_contents, submit_timeout_at))["task"]

  # \033[1m ... \033[0m renders the labels in bold on ANSI terminals.
  sys.stderr.write('\033[1m' + "Task started: " + '\033[0m' + "{0}\n".format(task_id))
  sys.stderr.write('\033[1m' + "Task log: " + '\033[0m' + "{0}/druid/indexer/v1/task/{1}/log\n".format(args.url.rstrip("/"),task_id))
  sys.stderr.write('\033[1m' + "Task status: " + '\033[0m' + "{0}/druid/indexer/v1/task/{1}/status\n".format(args.url.rstrip("/"),task_id))

  task_status = await_task_completion(args, task_id, complete_timeout_at)
  sys.stderr.write("Task finished with status: {0}\n".format(task_status))
  if task_status != 'SUCCESS':
    sys.exit(1)

  sys.stderr.write("Completed indexing data for {0}. Now loading indexed data onto the cluster...\n".format(datasource))
  # The load deadline starts only once indexing has finished.
  load_timeout_at = time.time() + args.load_timeout
  await_load_completion(args, datasource, load_timeout_at)
|
||||
|
||||
# Script entry point: Ctrl-C ends the run with exit status 1 and no traceback.
try:
  main()
except KeyboardInterrupt:
  sys.exit(1)
|
|
@ -0,0 +1,43 @@
|
|||
#!/bin/bash -eu
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
# Usage: run-druid <service> [conf-dir]
#
# Starts the named service with `java`, taking JVM flags from
# <conf-dir>/druid/<service>/jvm.config and the main class/arguments from
# <conf-dir>/druid/<service>/main.config. conf-dir defaults to ../conf
# relative to this script.

# The service name is mandatory. Checking the lower bound here prints the
# usage line instead of letting `set -u` abort on an unbound $1 below.
if [ "$#" -lt 1 ] || [ "$#" -gt 2 ]
then
  echo "usage: $0 <service> [conf-dir]" >&2
  exit 1
fi

PWD="$(pwd)"
WHEREAMI="$(dirname "$0")"
WHATAMI="$1"

if [ "$#" -eq 1 ]
then
  CONFDIR="$WHEREAMI/../conf"
else
  CONFDIR="$2"
fi

# Resolve both directories to absolute paths before changing directory.
CONFDIR="$(cd "$CONFDIR" && pwd)/druid"
WHEREAMI="$(cd "$WHEREAMI" && pwd)"

cd "$WHEREAMI/.."

# The two config files are flattened onto one line by xargs and deliberately
# left unquoted so that each entry becomes a separate java argument.
exec java $(cat "$CONFDIR/$WHATAMI/jvm.config" | xargs) \
  -cp "$CONFDIR/$WHATAMI:$CONFDIR/_common:$CONFDIR/_common/hadoop-xml:$WHEREAMI/../lib/*" \
  $(cat "$CONFDIR/$WHATAMI/main.config" | xargs)
|
|
@ -0,0 +1,43 @@
|
|||
#!/bin/bash -eu
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
# Usage: run-zk [conf-dir]
#
# Starts ZooKeeper's QuorumPeerMain with JVM flags from <conf-dir>/zk/jvm.config
# and the server configuration <conf-dir>/zk/zoo.cfg. conf-dir defaults to
# ../conf relative to this script when omitted or empty.

if (( $# > 1 ))
then
  echo "usage: $0 [conf-dir]" >&2
  exit 1
fi

PWD="$(pwd)"
WHEREAMI="$(dirname "$0")"

# ${1:-} keeps `set -u` happy when no argument was passed.
CONFDIR="${1:-}"
if [ -z "$CONFDIR" ]
then
  CONFDIR="$WHEREAMI/../conf"
fi

# Resolve both directories to absolute paths before changing directory.
CONFDIR="$(cd "$CONFDIR" && pwd)/zk"
WHEREAMI="$(cd "$WHEREAMI" && pwd)"

cd "$WHEREAMI/.."

# jvm.config is flattened by xargs and left unquoted so each flag is its own argument.
exec java $(cat "$CONFDIR/jvm.config" | xargs) \
  -cp "$WHEREAMI/../zk/lib/*:$WHEREAMI/../zk/*:$CONFDIR" \
  org.apache.zookeeper.server.quorum.QuorumPeerMain \
  "$CONFDIR/zoo.cfg"
|
|
@ -0,0 +1,70 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
use Cwd qw/realpath/;
|
||||
use Fcntl;
|
||||
use File::Basename;
|
||||
use FindBin;
|
||||
use Getopt::Long qw/:config require_order gnu_compat/;
|
||||
|
||||
# Print the command-line synopsis and terminate the script via die.
sub usage
{
  my $synopsis = "usage: $0 (--restart <service> | --tail <service> | --down) [-d <var dir>]\n";
  die $synopsis;
}
|
||||
|
||||
# Parse arguments. The var directory defaults to ../var next to this script.
my %opt = (
  vardir => realpath("$FindBin::Bin/../var"),
);

GetOptions(\%opt, 'command=s', 'restart=s', 'down', 'tail=s', 'vardir|d=s')
  or usage();

my $svdir = "$opt{vardir}/sv";

# Exactly one action must be requested; none or several is a usage error.
my @requested = grep { $opt{$_} } qw/command down restart tail/;
usage() unless @requested == 1;
my ($action) = @requested;

# --tail never talks to the control fifo: it hands over to tail(1) on the
# service's log file.
if ($action eq 'tail') {
  exec("tail", "-f", "$svdir/$opt{tail}.log")
    or die "exec failed: $!\n";
}

# Translate the chosen action into the text sent over the control fifo:
# a raw command string, 'd' for --down, or 'k <service>' for --restart.
my $command = $action eq 'command' ? $opt{command}
            : $action eq 'down'    ? 'd'
            :                        "k $opt{restart}";

# Send the command, one line, through the control fifo in the sv directory.
my $fifofile = "$svdir/.ctrl";
sysopen(my $fifofh, $fifofile, O_WRONLY)
  or die "Can't open control fifo, perhaps supervise is not running: $!\n";
print $fifofh "$command\n";
close $fifofh;
|
|
@ -0,0 +1,380 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use Cwd qw/realpath/;
|
||||
use POSIX qw/:sys_wait_h mkfifo setsid/;
|
||||
use Fcntl qw/:DEFAULT :flock/;
|
||||
use Getopt::Long qw/:config require_order gnu_compat/;
|
||||
use FindBin;
|
||||
use File::Spec;
|
||||
use File::Copy;
|
||||
|
||||
# Die with the message prefixed by a "[local time]" stamp; a single trailing
# newline on the message is normalised.
sub logdie($)
{
  my $msg = shift;
  chomp $msg;
  my $stamp = localtime();
  die "[$stamp] $msg\n";
}
|
||||
|
||||
# Emit the message through warn, prefixed by a "[local time]" stamp; a single
# trailing newline on the message is normalised.
sub logit($)
{
  my $msg = shift;
  chomp $msg;
  my $stamp = localtime();
  warn "[$stamp] $msg\n";
}
|
||||
|
||||
# Print the command-line synopsis and terminate the script via die.
sub usage
{
  my $synopsis = "usage: $0 -c <conf file> [-d <var dir>] [-t <kill timeout>] [--svlogd <optional conf file>]\n";
  die $synopsis;
}
|
||||
|
||||
# Parse a supervise configuration file.
#
# Blank lines and lines starting with '#' are skipped. Every other line is
# "<name> <command>", where <name> is one of:
#   :verify         - <command> is appended to the verification command list
#   :kill-timeout   - <command> is taken as an integer timeout
#   [!pN ]<service> - a service to run; the optional !pN prefix sets its
#                     order (default 50)
#
# Returns a hashref with keys 'commands' (arrayref of service records),
# 'verify' (arrayref of strings) and 'kill-timeout' (integer, or undef when
# the file does not set it). Dies on an unreadable file, a malformed line or
# a repeated service name.
sub read_config_file
{
  my ($config_file) = @_;

  open(my $config_fh, "<", $config_file)
    or die "open $config_file: $!";

  my (@commands, @verify, $kill_timeout);
  my %seen_name;

  while (my $line = <$config_fh>) {
    chomp $line;
    next if $line =~ /^(\s*\#.*|\s*)$/;

    my ($name, $command) =
      $line =~ /^(:verify|:kill-timeout|(?:\!p[0-9]+\s+)?[^:]\S+)\s+(.+)$/
      or die "Syntax error: $line\n";

    # Peel off an optional "!pN " order prefix from the name.
    my $order = 50;
    if (my ($prio, $bare_name) = $name =~ /^(?:\!p([0-9]+)\s+)(.*)$/) {
      ($order, $name) = ($prio, $bare_name);
    }

    if ($name eq ':verify') {
      push @verify, $command;
      next;
    }

    if ($name eq ':kill-timeout') {
      $kill_timeout = int($command);
      next;
    }

    die "Duplicate command: $line\n" if $seen_name{$name}++;

    push @commands, {
      name       => $name,
      command    => $command,
      order      => $order,  # Stop order for this command
      pid        => 0,       # Current pid, or 0 if not running
      down       => 0,       # Time the proc should be down until
      killed     => 0,       # Signal we sent to this process
      restarting => 0,       # True if this command is currently restarting
    };
  }

  close $config_fh;

  return {
    commands       => \@commands,
    verify         => \@verify,
    'kill-timeout' => $kill_timeout,
  };
}
|
||||
|
||||
# Describe a wait status (the value of $? after waitpid) as text:
# "signal = N" when the low seven bits are non-zero, otherwise "exited = N"
# using the high byte, with ", dumped core" appended when bit 0x80 is set.
sub stringify_exit_status
{
  my $wait_status = shift;

  my $signal_number = $wait_status & 127;
  my $description = $signal_number
    ? "signal = $signal_number"
    : "exited = " . ($wait_status >> 8);

  $description .= ", dumped core" if $wait_status & 128;

  return $description;
}
|
||||
|
||||
# (Re)create the control fifo "<svdir>/.ctrl" and return a filehandle on it.
# An existing file at that path is removed first. The fifo is created with
# mode 0700 and opened read/write and non-blocking. Dies if any step fails.
sub open_control_fifo
{
  my $svdir = shift;
  my $fifofile = "$svdir/.ctrl";

  if (-e $fifofile && !unlink($fifofile)) {
    die "Cannot remove fifo: $fifofile\n";
  }

  mkfifo($fifofile, 0700)
    or die "Cannot create fifo: $fifofile\n";

  sysopen(my $fifofh, $fifofile, O_NONBLOCK | O_RDWR)
    or die "Cannot open fifo for reading: $fifofile\n";

  return $fifofh;
}
|
||||
|
||||
# Wrap $text in ANSI escape codes for the named style ('bold' or 'red') when
# STDERR is a terminal. For any other style name, or when STDERR is not a
# terminal, the text is returned unchanged.
sub pretty
{
  my ($text, $color) = @_;

  return $text unless -t STDERR;

  return "\x1b[1m$text\x1b[0m"         if $color eq 'bold';
  return "\x1b[31m\x1b[1m$text\x1b[0m" if $color eq 'red';
  return $text;
}
|
||||
|
||||
# Service records read from the config file (see read_config_file).
my @commands;

# If nonzero we should be exiting. -1 means exit without signal, >0 means exit with signal
my $killed = 0;

# If >0 then kill -9 all procs at this time
my $killkill = 0;

# Current proc order we're stopping. Ignored unless $killed is nonzero
my $stopping = 100;

# We'll do our own reaping
$SIG{CHLD} = sub {};

# Redirect stderr to stdout
open(STDERR, ">&STDOUT") or die;

# Parse arguments
my %opt = (
  'chdir'        => realpath("$FindBin::Bin/.."),
  'vardir'       => realpath("$FindBin::Bin/../var"),
  'kill-timeout' => 360,
);

GetOptions(
  \%opt,
  'conf|c=s',
  'vardir|d=s',
  'kill-timeout|t=i',
  'chdir=s',
  'svlogd:s'
) or usage();

usage() unless $opt{'conf'} && $opt{'vardir'};

# Read config file
my $config = read_config_file($opt{'conf'});
@commands = @{$config->{commands}};

die "Nothing to run.\n" unless @commands;

# A :kill-timeout line in the config file takes precedence over --kill-timeout
$opt{'kill-timeout'} = $config->{'kill-timeout'}
  if defined $config->{'kill-timeout'};

# Remember where vardir, svdir are after chdiring
my $vardir = File::Spec->rel2abs($opt{vardir});
my $svdir = "$vardir/sv";

# chdir to the root of the distribution (or wherever --chdir points)
chdir($opt{chdir}) or die "chdir[$opt{chdir}] failed: $!\n";

# Create vardir with tmp/
unless (-e "$vardir/tmp") {
  system("mkdir -p \Q$vardir\E/tmp") == 0 or die "mkdir $vardir/tmp failed: $!\n";
}

# Create svdir
unless (-e $svdir) {
  system("mkdir -p \Q$svdir\E") == 0 or die "mkdir $svdir failed: $!\n";
}

# Lock svdir and keep it locked until we exit
my $lockfile = "$svdir/.lock";
open(my $lockfh, ">", $lockfile)
  or die "Cannot write to svdir, please check permissions: $svdir\n";
flock($lockfh, LOCK_EX | LOCK_NB)
  or die "Cannot lock svdir, maybe another 'supervise' is running: $svdir\n";

# Create control fifo in svdir
my $fifofh = open_control_fifo($svdir);

# Run verification commands; the first one that fails ends the run with status 1
for my $verify_cmd (@{$config->{verify}}) {
  system($verify_cmd) == 0 or exit 1;
}

# Catch killy signals and do an orderly shutdown. The first signal received
# records its number in $killed and arms the kill -9 deadline; later ones
# are ignored.
my %shutdown_signal = (HUP => 1, INT => 2, TERM => 15);
for my $signame (sort keys %shutdown_signal) {
  my $signum = $shutdown_signal{$signame};
  $SIG{$signame} = sub {
    return if $killed;
    $killed = $signum;
    $killkill = time + $opt{'kill-timeout'};
  };
}

# Build up control fifo command over multiple sysreads, potentially
my $fifobuffer = '';
|
||||
|
||||
# Supervisor main loop. Once a second it:
#   1. starts every service that is not running and whose back-off has expired,
#   2. reaps exited children and schedules their restart,
#   3. during shutdown, signals services in descending stop order,
#   4. exits once shutdown is requested and no service is left running,
#   5. processes commands received over the control fifo.
while (1) {
  # Spawn new procs
  if (!$killed) {
    for my $command (grep { !$_->{pid} } @commands) {
      if ($command->{down} < time) {
        # With --svlogd the log target is a directory, otherwise a .log file.
        my $logfile = sprintf("%s%s", "$svdir/$command->{name}", defined $opt{'svlogd'} ? "" : ".log");

        logit "Running command[" . pretty($command->{name}, 'bold') . "], logging to[$logfile]: $command->{command}";

        my $pid = fork;
        if (!defined $pid) {
          # fork failed. We are still the supervisor here, so we must not fall
          # into the child branch below; back off and retry on a later pass.
          logit "ERR: fork failed for command[" . pretty($command->{name}, 'bold') . "]: $!";
          $command->{down} = time + 2;
        } elsif ($pid) {
          # Parent: remember the child.
          $command->{pid} = $pid;
          $command->{logfile} = $logfile;
        } else {
          # Child: detach into its own session, wire up logging, then exec.
          setsid;

          if (defined $opt{'svlogd'}) {
            if (! -e $logfile) {
              system("mkdir -p \Q$logfile\E") == 0 or logdie "mkdir $logfile failed: $!\n";
            }

            if ($opt{'svlogd'}) {
              # --svlogd <file>: use the supplied svlogd configuration
              copy($opt{'svlogd'}, "$logfile/config") or logdie "Failed copying $opt{'svlogd'} to $logfile/config: $!";
            } else {
              # --svlogd without a value: write a built-in configuration
              open my $configfh, ">", "$logfile/config" or logdie "Cannot write svlogd config, please check permissions: $logfile/config\n";
              print $configfh "s100000000\nn10\nN5\nt604800";
              close $configfh;
            }

            open STDOUT, "|svlogd $logfile" or logdie "pipe to svlogd $logfile failed: $!\n";
          } else {
            open STDOUT, ">>", $logfile or logdie "open $logfile failed: $!\n";
          }

          open STDERR, ">&STDOUT" or logdie "redirecting stderr failed: $!\n";
          exec('sh', '-c', "exec $command->{command}") or logdie "exec [$command->{command}] failed: $!";
        }
      }
    }
  }

  # Reap dead procs
  my $pid;
  while (($pid = waitpid(-1, WNOHANG)) > 0) {
    my $status = $?;
    my ($command) = (grep { $_->{pid} eq $pid } @commands);
    if ($command) {
      $command->{pid} = 0;
      # Keep the service down for a couple of seconds before restarting it.
      $command->{down} = time + 2;
      logit "Command[" . pretty($command->{name}, 'bold') . "] exited (pid = $pid, " . stringify_exit_status($status) . ")";
      if ($status && !$killed && !$command->{restarting}) {
        # Unexpected exit
        logit "Command[" . pretty($command->{name}, 'bold') . "] " . pretty("failed", "red") . ", see logfile for more details: $command->{logfile}";
      }
      $command->{restarting} = 0;
    } else {
      logit "ERR: Reaped unknown command (pid = $pid, " . stringify_exit_status($status) . ")";
    }
  }

  # Kill procs, maybe
  if ($killed) {
    my $should_killkill = time > $killkill;

    # Update stopping position, maybe
    if ($should_killkill) {
      # Deadline passed: every remaining service becomes eligible.
      $stopping = 0;
    } else {
      # Move down to the highest order that still has a running service.
      my $maxorder = 0;
      for my $command (grep { $_->{pid} } @commands) {
        if ($command->{order} > $maxorder) {
          $maxorder = $command->{order};
        }
      }

      if ($maxorder < $stopping) {
        $stopping = $maxorder;
      }
    }

    for my $command (grep { $_->{pid} && $_->{order} >= $stopping } @commands) {
      my $want_signal;
      if ($command->{killed} == 9 || $should_killkill) {
        $want_signal = 9;
      } else {
        $want_signal = 15;
      }

      # Only signal when this differs from what was already sent.
      if ($command->{killed} != $want_signal) {
        if ($want_signal != 9) {
          my $kt = $opt{'kill-timeout'};
          logit "Sending signal[$want_signal] to command[" . pretty($command->{name}, 'bold') . "] (timeout ${kt}s).";
        } else {
          logit "Sending signal[$want_signal] to command[" . pretty($command->{name}, 'bold') . "].";
        }
        kill $want_signal, $command->{pid} or logit "WARN: Could not signal pid: $command->{pid}";
        $command->{killed} = $want_signal;
      }
    }
  }

  # Kill ourselves, maybe
  if ($killed && ! grep { $_->{pid} } @commands) {
    logit "Exiting.";
    $SIG{HUP} = $SIG{INT} = $SIG{TERM} = 'DEFAULT';
    if ($killed > 0) {
      # Re-deliver the original signal now that default handlers are restored.
      kill $killed, $$;
      exit 1;
    } else {
      # Normal exit
      exit 0;
    }
  }

  # Be controlled, maybe
  my $fifostr = "";
  if (sysread $fifofh, $fifostr, 4096) {
    $fifobuffer .= $fifostr;

    # Consume complete lines; a partial trailing line stays buffered.
    while ($fifobuffer =~ /^([^\n]*)\n(.*)/s) {
      my $fifocmd = $1;
      $fifobuffer = $2;
      if ($fifocmd =~ /^k (.+)$/ && !$killed) {
        # "k <name>": restart one running service by sending it TERM
        my $name = $1;
        my ($command) = grep { $_->{name} eq $name && $_->{pid} } @commands;
        if ($command) {
          logit "Restarting command[" . pretty($name, "bold") . "].";
          if (kill TERM => $command->{pid}) {
            $command->{restarting} = 1;
          } else {
            logit "WARN: Could not signal pid: $command->{pid}";
          }
        } else {
          logit "Asked to restart unknown command[" . pretty($name, "bold") . "], ignoring.";
        }
      } elsif ($fifocmd eq 'd') {
        # -1 means exit without signal
        $killed = -1;
        $killkill = time + $opt{'kill-timeout'};
      } else {
        logit "Received unknown control command, ignoring.";
      }
    }
  }

  sleep 1;
}
|
||||
|
||||
exit 0;
|
29
examples/conf-quickstart/druid/broker/runtime.properties → examples/bin/verify-default-ports
Normal file → Executable file
|
@ -1,4 +1,5 @@
|
|||
#
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
|
@ -15,21 +16,17 @@
|
|||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
#
|
||||
|
||||
druid.service=druid/broker
|
||||
druid.port=8082
|
||||
use strict;
|
||||
use warnings;
|
||||
use Socket;
|
||||
|
||||
# HTTP server threads
|
||||
druid.broker.http.numConnections=5
|
||||
druid.server.http.numThreads=9
|
||||
my @ports = (1527, 2181, 8081, 8082, 8083, 8090, 8091, 8200, 9095);
|
||||
|
||||
# Processing threads and buffers
|
||||
druid.processing.buffer.sizeBytes=256000000
|
||||
druid.processing.numThreads=2
|
||||
|
||||
# Query cache (we use a small local cache)
|
||||
druid.broker.cache.useCache=true
|
||||
druid.broker.cache.populateCache=true
|
||||
druid.cache.type=local
|
||||
druid.cache.sizeInBytes=10000000
|
||||
# Probe each default port by binding a throwaway TCP socket to it on all
# interfaces. The first port that cannot be bound aborts the script.
my $proto = getprotobyname("tcp");
my $enable = pack("l", 1);

foreach my $port (@ports) {
  socket(my $probe, PF_INET, SOCK_STREAM, $proto)
    or die "socket: $!";
  setsockopt($probe, SOL_SOCKET, SO_REUSEADDR, $enable)
    or die "setsockopt: $!";

  my $any_addr = sockaddr_in($port, INADDR_ANY);
  bind($probe, $any_addr)
    or die "Cannot start up because port[$port] is already in use.\n";

  # The socket only exists to test the bind; release it right away.
  close $probe;
}
|
25
examples/conf-quickstart/druid/historical/runtime.properties → examples/bin/verify-java
Normal file → Executable file
|
@ -1,4 +1,5 @@
|
|||
#
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
|
@ -15,18 +16,18 @@
|
|||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
#
|
||||
|
||||
druid.service=druid/historical
|
||||
druid.port=8083
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
# HTTP server threads
|
||||
druid.server.http.numThreads=9
|
||||
# Run `java -version`, folding stderr into the captured output so the banner
# text is available for the version check further down.
my $java_version = qx[java -version 2>&1];

# A non-zero child status means the command could not be run successfully.
die "Please install Java 8 or better!\n" if $?;
|
||||
|
||||
# Processing threads and buffers
|
||||
druid.processing.buffer.sizeBytes=256000000
|
||||
druid.processing.numThreads=2
|
||||
# If we know it won't work, die. Otherwise hope for the best.
#
# Legacy JVMs report "1.<major>.<...>" (e.g. "1.7.0_80"), while Java 9 and
# later report "<major>.<minor>.<...>" (e.g. "9.0.1"). The second component
# therefore only identifies the release when the first component is exactly 1;
# comparing it unconditionally would wrongly reject banners such as "9.0.1"
# or "10.0.2". Banners that do not match the pattern at all are let through.
if ($java_version =~ /java version \"((\d+)\.(\d+).*?)\"/ && ($2 < 1 || ($2 == 1 && $3 < 8))) {
  die "Please upgrade to Java 8 or better! Your current version is: $1\n";
}
|
||||
|
||||
# Segment storage
|
||||
druid.segmentCache.locations=[{"path":"var/druid/segment-cache","maxSize":300000000000}]
|
||||
druid.server.maxSize=300000000000
|
||||
exit 0;
|
|
@ -1,33 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!--
|
||||
~ Licensed to the Apache Software Foundation (ASF) under one
|
||||
~ or more contributor license agreements. See the NOTICE file
|
||||
~ distributed with this work for additional information
|
||||
~ regarding copyright ownership. The ASF licenses this file
|
||||
~ to you under the Apache License, Version 2.0 (the
|
||||
~ "License"); you may not use this file except in compliance
|
||||
~ with the License. You may obtain a copy of the License at
|
||||
~
|
||||
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||
~
|
||||
~ Unless required by applicable law or agreed to in writing,
|
||||
~ software distributed under the License is distributed on an
|
||||
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
~ KIND, either express or implied. See the License for the
|
||||
~ specific language governing permissions and limitations
|
||||
~ under the License.
|
||||
-->
|
||||
|
||||
<Configuration status="WARN">
|
||||
<Appenders>
|
||||
<Console name="Console" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{ISO8601} %p [%t] %c - %m%n"/>
|
||||
</Console>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Root level="info">
|
||||
<AppenderRef ref="Console"/>
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
||||
|
|
@ -1,38 +0,0 @@
|
|||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
#
|
||||
|
||||
druid.service=druid/middleManager
|
||||
druid.port=8091
|
||||
|
||||
# Number of tasks per middleManager
|
||||
druid.worker.capacity=3
|
||||
|
||||
# Task launch parameters
|
||||
druid.indexer.runner.javaOpts=-server -Xmx2g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
|
||||
druid.indexer.task.baseTaskDir=var/druid/task
|
||||
|
||||
# HTTP server threads
|
||||
druid.server.http.numThreads=9
|
||||
|
||||
# Processing threads and buffers on Peons
|
||||
druid.indexer.fork.property.druid.processing.buffer.sizeBytes=256000000
|
||||
druid.indexer.fork.property.druid.processing.numThreads=2
|
||||
|
||||
# Hadoop indexing
|
||||
druid.indexer.task.hadoopWorkingPath=var/druid/hadoop-tmp
|
|
@ -1,76 +0,0 @@
|
|||
{
|
||||
"dataSources" : {
|
||||
"metrics-kafka" : {
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "metrics-kafka",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"timestampSpec" : {
|
||||
"column" : "timestamp",
|
||||
"format" : "auto"
|
||||
},
|
||||
"dimensionsSpec" : {
|
||||
"dimensions" : [],
|
||||
"dimensionExclusions" : [
|
||||
"timestamp",
|
||||
"value"
|
||||
]
|
||||
},
|
||||
"format" : "json"
|
||||
}
|
||||
},
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "hour",
|
||||
"queryGranularity" : "none"
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{
|
||||
"type" : "count",
|
||||
"name" : "count"
|
||||
},
|
||||
{
|
||||
"name" : "value_sum",
|
||||
"type" : "doubleSum",
|
||||
"fieldName" : "value"
|
||||
},
|
||||
{
|
||||
"fieldName" : "value",
|
||||
"name" : "value_min",
|
||||
"type" : "doubleMin"
|
||||
},
|
||||
{
|
||||
"type" : "doubleMax",
|
||||
"name" : "value_max",
|
||||
"fieldName" : "value"
|
||||
}
|
||||
]
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "realtime"
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "realtime",
|
||||
"intermediatePersistPeriod" : "PT10M",
|
||||
"windowPeriod" : "PT10M"
|
||||
}
|
||||
},
|
||||
"properties" : {
|
||||
"task.partitions" : "1",
|
||||
"task.replicants" : "1",
|
||||
"topicPattern" : "metrics"
|
||||
}
|
||||
}
|
||||
},
|
||||
"properties" : {
|
||||
"zookeeper.connect" : "localhost",
|
||||
"druid.discovery.curator.path" : "/druid/discovery",
|
||||
"druid.selectors.indexing.serviceName" : "druid/overlord",
|
||||
"commit.periodMillis" : "15000",
|
||||
"consumer.numThreads" : "2",
|
||||
"kafka.zookeeper.connect" : "localhost",
|
||||
"kafka.group.id" : "tranquility-kafka"
|
||||
}
|
||||
}
|
|
@ -1,73 +0,0 @@
|
|||
{
|
||||
"dataSources" : {
|
||||
"metrics" : {
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "metrics",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"timestampSpec" : {
|
||||
"column" : "timestamp",
|
||||
"format" : "auto"
|
||||
},
|
||||
"dimensionsSpec" : {
|
||||
"dimensions" : [],
|
||||
"dimensionExclusions" : [
|
||||
"timestamp",
|
||||
"value"
|
||||
]
|
||||
},
|
||||
"format" : "json"
|
||||
}
|
||||
},
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "hour",
|
||||
"queryGranularity" : "none"
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{
|
||||
"type" : "count",
|
||||
"name" : "count"
|
||||
},
|
||||
{
|
||||
"name" : "value_sum",
|
||||
"type" : "doubleSum",
|
||||
"fieldName" : "value"
|
||||
},
|
||||
{
|
||||
"fieldName" : "value",
|
||||
"name" : "value_min",
|
||||
"type" : "doubleMin"
|
||||
},
|
||||
{
|
||||
"type" : "doubleMax",
|
||||
"name" : "value_max",
|
||||
"fieldName" : "value"
|
||||
}
|
||||
]
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "realtime"
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "realtime",
|
||||
"intermediatePersistPeriod" : "PT10M",
|
||||
"windowPeriod" : "PT10M"
|
||||
}
|
||||
},
|
||||
"properties" : {
|
||||
"task.partitions" : "1",
|
||||
"task.replicants" : "1"
|
||||
}
|
||||
}
|
||||
},
|
||||
"properties" : {
|
||||
"zookeeper.connect" : "localhost",
|
||||
"druid.discovery.curator.path" : "/druid/discovery",
|
||||
"druid.selectors.indexing.serviceName" : "druid/overlord",
|
||||
"http.port" : "8200",
|
||||
"http.threads" : "9"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
{
|
||||
"type": "compact",
|
||||
"dataSource": "compaction-tutorial",
|
||||
"interval": "2015-09-12/2015-09-13",
|
||||
"tuningConfig" : {
|
||||
"type" : "index",
|
||||
"targetPartitionSize" : 5000000,
|
||||
"maxRowsInMemory" : 25000,
|
||||
"forceExtendableShardSpecs" : true
|
||||
}
|
||||
}
|
|
@ -0,0 +1,64 @@
|
|||
{
|
||||
"type" : "index",
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "compaction-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"dimensionsSpec" : {
|
||||
"dimensions" : [
|
||||
"channel",
|
||||
"cityName",
|
||||
"comment",
|
||||
"countryIsoCode",
|
||||
"countryName",
|
||||
"isAnonymous",
|
||||
"isMinor",
|
||||
"isNew",
|
||||
"isRobot",
|
||||
"isUnpatrolled",
|
||||
"metroCode",
|
||||
"namespace",
|
||||
"page",
|
||||
"regionIsoCode",
|
||||
"regionName",
|
||||
"user",
|
||||
{ "name": "added", "type": "long" },
|
||||
{ "name": "deleted", "type": "long" },
|
||||
{ "name": "delta", "type": "long" }
|
||||
]
|
||||
},
|
||||
"timestampSpec": {
|
||||
"column": "time",
|
||||
"format": "iso"
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "hour",
|
||||
"queryGranularity" : "none",
|
||||
"intervals" : ["2015-09-12/2015-09-13"],
|
||||
"rollup" : false
|
||||
}
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "index",
|
||||
"firehose" : {
|
||||
"type" : "local",
|
||||
"baseDir" : "quickstart/",
|
||||
"filter" : "wikiticker-2015-09-12-sampled.json.gz"
|
||||
},
|
||||
"appendToExisting" : false
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "index",
|
||||
"targetPartitionSize" : 5000000,
|
||||
"maxRowsInMemory" : 25000,
|
||||
"forceExtendableShardSpecs" : true
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,18 +1,18 @@
|
|||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# Licensed to Metamarkets Group Inc. (Metamarkets) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# regarding copyright ownership. Metamarkets licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
#
|
||||
|
@ -24,12 +24,18 @@
|
|||
# If you specify `druid.extensions.loadList=[]`, Druid won't load any extension from file system.
|
||||
# If you don't specify `druid.extensions.loadList`, Druid will load all the extensions under root extension directory.
|
||||
# More info: http://druid.io/docs/latest/operations/including-extensions.html
|
||||
druid.extensions.loadList=[]
|
||||
druid.extensions.loadList=["druid-hdfs-storage", "druid-kafka-indexing-service"]
|
||||
|
||||
# If you have a different version of Hadoop, place your Hadoop client jar files in your hadoop-dependencies directory
|
||||
# and uncomment the line below to point to your directory.
|
||||
#druid.extensions.hadoopDependenciesDir=/my/dir/hadoop-dependencies
|
||||
|
||||
|
||||
#
|
||||
# Hostname
|
||||
#
|
||||
druid.host=localhost
|
||||
|
||||
#
|
||||
# Logging
|
||||
#
|
||||
|
@ -126,3 +132,9 @@ druid.indexing.doubleStorage=double
|
|||
# Security
|
||||
#
|
||||
druid.server.hiddenProperties=["druid.s3.accessKey","druid.s3.secretKey","druid.metadata.storage.connector.password"]
|
||||
|
||||
|
||||
#
|
||||
# SQL
|
||||
#
|
||||
druid.sql.enable=true
|
|
@ -0,0 +1,33 @@
|
|||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!--
|
||||
~ Licensed to Metamarkets Group Inc. (Metamarkets) under one
|
||||
~ or more contributor license agreements. See the NOTICE file
|
||||
~ distributed with this work for additional information
|
||||
~ regarding copyright ownership. Metamarkets licenses this file
|
||||
~ to you under the Apache License, Version 2.0 (the
|
||||
~ "License"); you may not use this file except in compliance
|
||||
~ with the License. You may obtain a copy of the License at
|
||||
~
|
||||
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||
~
|
||||
~ Unless required by applicable law or agreed to in writing,
|
||||
~ software distributed under the License is distributed on an
|
||||
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
~ KIND, either express or implied. See the License for the
|
||||
~ specific language governing permissions and limitations
|
||||
~ under the License.
|
||||
-->
|
||||
|
||||
<Configuration status="WARN">
|
||||
<Appenders>
|
||||
<Console name="Console" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{ISO8601} %p [%t] %c - %m%n"/>
|
||||
</Console>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Root level="info">
|
||||
<AppenderRef ref="Console"/>
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
||||
|
|
@ -0,0 +1 @@
|
|||
io.druid.cli.Main server broker
|
|
@ -0,0 +1,16 @@
|
|||
druid.service=druid/broker
|
||||
druid.port=8082
|
||||
|
||||
# HTTP server threads
|
||||
druid.broker.http.numConnections=5
|
||||
druid.server.http.numThreads=9
|
||||
|
||||
# Processing threads and buffers
|
||||
druid.processing.buffer.sizeBytes=256000000
|
||||
druid.processing.numThreads=2
|
||||
|
||||
# Query cache (we use a small local cache)
|
||||
druid.broker.cache.useCache=true
|
||||
druid.broker.cache.populateCache=true
|
||||
druid.cache.type=local
|
||||
druid.cache.sizeInBytes=10000000
|
|
@ -0,0 +1 @@
|
|||
io.druid.cli.Main server coordinator
|
|
@ -0,0 +1,5 @@
|
|||
druid.service=druid/coordinator
|
||||
druid.port=8081
|
||||
|
||||
druid.coordinator.startDelay=PT10S
|
||||
druid.coordinator.period=PT5S
|
|
@ -0,0 +1 @@
|
|||
io.druid.cli.Main server historical
|
|
@ -0,0 +1,13 @@
|
|||
druid.service=druid/historical
|
||||
druid.port=8083
|
||||
|
||||
# HTTP server threads
|
||||
druid.server.http.numThreads=9
|
||||
|
||||
# Processing threads and buffers
|
||||
druid.processing.buffer.sizeBytes=256000000
|
||||
druid.processing.numThreads=2
|
||||
|
||||
# Segment storage
|
||||
druid.segmentCache.locations=[{"path":"var/druid/segment-cache","maxSize":300000000000}]
|
||||
druid.server.maxSize=300000000000
|
|
@ -0,0 +1 @@
|
|||
io.druid.cli.Main server middleManager
|
|
@ -0,0 +1,19 @@
|
|||
druid.service=druid/middleManager
|
||||
druid.port=8091
|
||||
|
||||
# Number of tasks per middleManager
|
||||
druid.worker.capacity=3
|
||||
|
||||
# Task launch parameters
|
||||
druid.indexer.runner.javaOpts=-server -Xmx2g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
|
||||
druid.indexer.task.baseTaskDir=var/druid/task
|
||||
|
||||
# HTTP server threads
|
||||
druid.server.http.numThreads=9
|
||||
|
||||
# Processing threads and buffers on Peons
|
||||
druid.indexer.fork.property.druid.processing.buffer.sizeBytes=256000000
|
||||
druid.indexer.fork.property.druid.processing.numThreads=2
|
||||
|
||||
# Hadoop indexing
|
||||
druid.indexer.task.hadoopWorkingPath=var/druid/hadoop-tmp
|
|
@ -0,0 +1 @@
|
|||
io.druid.cli.Main server overlord
|
|
@ -0,0 +1,7 @@
|
|||
druid.service=druid/overlord
|
||||
druid.port=8090
|
||||
|
||||
druid.indexer.queue.startDelay=PT5S
|
||||
|
||||
druid.indexer.runner.type=remote
|
||||
druid.indexer.storage.type=metadata
|
|
@ -0,0 +1,84 @@
|
|||
{
|
||||
"dataSources" : {
|
||||
"wikipedia" : {
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "wikipedia",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"dimensionsSpec" : {
|
||||
"dimensions" : [
|
||||
"channel",
|
||||
"cityName",
|
||||
"comment",
|
||||
"countryIsoCode",
|
||||
"countryName",
|
||||
"isAnonymous",
|
||||
"isMinor",
|
||||
"isNew",
|
||||
"isRobot",
|
||||
"isUnpatrolled",
|
||||
"metroCode",
|
||||
"namespace",
|
||||
"page",
|
||||
"regionIsoCode",
|
||||
"regionName",
|
||||
"user"
|
||||
]
|
||||
},
|
||||
"timestampSpec": {
|
||||
"column": "time",
|
||||
"format": "iso"
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{
|
||||
"name" : "added",
|
||||
"type" : "longSum",
|
||||
"fieldName" : "added"
|
||||
},
|
||||
{
|
||||
"name" : "deleted",
|
||||
"type" : "longSum",
|
||||
"fieldName" : "deleted"
|
||||
},
|
||||
{
|
||||
"name" : "delta",
|
||||
"type" : "longSum",
|
||||
"fieldName" : "delta"
|
||||
}
|
||||
],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "day",
|
||||
"queryGranularity" : "none",
|
||||
"intervals" : ["2015-09-12/2015-09-13"],
|
||||
"rollup" : false
|
||||
}
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "realtime"
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "realtime",
|
||||
"intermediatePersistPeriod" : "PT10M",
|
||||
"windowPeriod" : "P3650D"
|
||||
}
|
||||
},
|
||||
"properties" : {
|
||||
"task.partitions" : "1",
|
||||
"task.replicants" : "1"
|
||||
}
|
||||
}
|
||||
},
|
||||
"properties" : {
|
||||
"zookeeper.connect" : "localhost",
|
||||
"druid.discovery.curator.path" : "/druid/discovery",
|
||||
"druid.selectors.indexing.serviceName" : "druid/overlord",
|
||||
"http.port" : "8200",
|
||||
"http.threads" : "9"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
:verify bin/verify-java
|
||||
:verify bin/verify-default-ports
|
||||
:kill-timeout 10
|
||||
|
||||
!p10 zk bin/run-zk quickstart/tutorial/conf
|
||||
coordinator bin/run-druid coordinator quickstart/tutorial/conf
|
||||
broker bin/run-druid broker quickstart/tutorial/conf
|
||||
historical bin/run-druid historical quickstart/tutorial/conf
|
||||
!p80 overlord bin/run-druid overlord quickstart/tutorial/conf
|
||||
!p90 middleManager bin/run-druid middleManager quickstart/tutorial/conf
|
||||
|
||||
# Uncomment to use Tranquility Server
|
||||
#!p95 tranquility-server tranquility/bin/tranquility server -configFile quickstart/tutorial/conf/tranquility/wikipedia-server.json -Ddruid.extensions.loadList=[]
|
|
@ -0,0 +1,4 @@
|
|||
-server
|
||||
-Xms128m
|
||||
-Xmx128m
|
||||
-Duser.timezone=UTC
|
|
@ -0,0 +1,17 @@
|
|||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
|
||||
|
||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
|
||||
<appender name="console" class="org.apache.log4j.ConsoleAppender">
|
||||
<param name="Target" value="System.out"/>
|
||||
<layout class="org.apache.log4j.PatternLayout">
|
||||
<param name="ConversionPattern" value="%d{ISO8601} %p [%t] %c - %m%n"/>
|
||||
</layout>
|
||||
</appender>
|
||||
|
||||
<root>
|
||||
<priority value ="info" />
|
||||
<appender-ref ref="console" />
|
||||
</root>
|
||||
|
||||
</log4j:configuration>
|
|
@ -0,0 +1,16 @@
|
|||
#
|
||||
# Server
|
||||
#
|
||||
|
||||
tickTime=2000
|
||||
dataDir=var/zk
|
||||
clientPort=2181
|
||||
initLimit=5
|
||||
syncLimit=2
|
||||
|
||||
#
|
||||
# Autopurge
|
||||
#
|
||||
|
||||
autopurge.snapRetainCount=5
|
||||
autopurge.purgeInterval=1
|
|
@ -0,0 +1,64 @@
|
|||
{
|
||||
"type" : "index",
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "deletion-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"dimensionsSpec" : {
|
||||
"dimensions" : [
|
||||
"channel",
|
||||
"cityName",
|
||||
"comment",
|
||||
"countryIsoCode",
|
||||
"countryName",
|
||||
"isAnonymous",
|
||||
"isMinor",
|
||||
"isNew",
|
||||
"isRobot",
|
||||
"isUnpatrolled",
|
||||
"metroCode",
|
||||
"namespace",
|
||||
"page",
|
||||
"regionIsoCode",
|
||||
"regionName",
|
||||
"user",
|
||||
{ "name": "added", "type": "long" },
|
||||
{ "name": "deleted", "type": "long" },
|
||||
{ "name": "delta", "type": "long" }
|
||||
]
|
||||
},
|
||||
"timestampSpec": {
|
||||
"column": "time",
|
||||
"format": "iso"
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "hour",
|
||||
"queryGranularity" : "none",
|
||||
"intervals" : ["2015-09-12/2015-09-13"],
|
||||
"rollup" : false
|
||||
}
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "index",
|
||||
"firehose" : {
|
||||
"type" : "local",
|
||||
"baseDir" : "quickstart/",
|
||||
"filter" : "wikiticker-2015-09-12-sampled.json.gz"
|
||||
},
|
||||
"appendToExisting" : false
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "index",
|
||||
"targetPartitionSize" : 5000000,
|
||||
"maxRowsInMemory" : 25000,
|
||||
"forceExtendableShardSpecs" : true
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
{
|
||||
"type": "kill",
|
||||
"dataSource": "deletion-tutorial",
|
||||
"interval" : "2015-09-12/2015-09-13"
|
||||
}
|
|
@ -0,0 +1,107 @@
|
|||
# Creates a pseudo-distributed Hadoop 2.8.3 image with Java 8
|
||||
#
|
||||
# Modified from the SequenceIQ Dockerfiles at https://github.com/sequenceiq/hadoop-docker
|
||||
#
|
||||
# docker build -t druid-hadoop-demo:2.8.3 .
|
||||
|
||||
FROM sequenceiq/pam:centos-6.5
|
||||
MAINTAINER SequenceIQ
|
||||
|
||||
USER root
|
||||
|
||||
# install dev tools
|
||||
RUN yum clean all \
|
||||
&& rpm --rebuilddb \
|
||||
&& yum install -y curl which tar sudo openssh-server openssh-clients rsync yum-plugin-ovl\
|
||||
&& yum clean all \
|
||||
&& yum update -y libselinux \
|
||||
&& yum clean all
|
||||
# update libselinux. see https://github.com/sequenceiq/hadoop-docker/issues/14
|
||||
|
||||
# passwordless ssh
|
||||
RUN ssh-keygen -q -N "" -t dsa -f /etc/ssh/ssh_host_dsa_key
|
||||
RUN ssh-keygen -q -N "" -t rsa -f /etc/ssh/ssh_host_rsa_key
|
||||
RUN ssh-keygen -q -N "" -t rsa -f /root/.ssh/id_rsa
|
||||
RUN cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys
|
||||
|
||||
# zulu java 8
|
||||
RUN rpm --import http://repos.azulsystems.com/RPM-GPG-KEY-azulsystems
|
||||
RUN rpm --rebuilddb
|
||||
RUN sudo curl -o /etc/yum.repos.d/zulu.repo http://repos.azulsystems.com/rhel/zulu.repo
|
||||
RUN yum install -y zulu-8
|
||||
|
||||
ENV JAVA_HOME /usr/lib/jvm/zulu-8
|
||||
ENV PATH $PATH:$JAVA_HOME/bin
|
||||
|
||||
# hadoop
|
||||
RUN curl -s https://archive.apache.org/dist/hadoop/core/hadoop-2.8.3/hadoop-2.8.3.tar.gz | tar -xz -C /usr/local/
|
||||
RUN cd /usr/local && ln -s ./hadoop-2.8.3 hadoop
|
||||
|
||||
ENV HADOOP_PREFIX /usr/local/hadoop
|
||||
ENV HADOOP_COMMON_HOME /usr/local/hadoop
|
||||
ENV HADOOP_HDFS_HOME /usr/local/hadoop
|
||||
ENV HADOOP_MAPRED_HOME /usr/local/hadoop
|
||||
ENV HADOOP_YARN_HOME /usr/local/hadoop
|
||||
ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop
|
||||
ENV YARN_CONF_DIR $HADOOP_PREFIX/etc/hadoop
|
||||
|
||||
RUN sed -i '/^export JAVA_HOME/ s:.*:export JAVA_HOME=/usr/lib/jvm/zulu-8\nexport HADOOP_PREFIX=/usr/local/hadoop\nexport HADOOP_HOME=/usr/local/hadoop\n:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
|
||||
RUN sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop/:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
|
||||
|
||||
RUN mkdir $HADOOP_PREFIX/input
|
||||
RUN cp $HADOOP_PREFIX/etc/hadoop/*.xml $HADOOP_PREFIX/input
|
||||
|
||||
# pseudo distributed
|
||||
ADD core-site.xml.template $HADOOP_PREFIX/etc/hadoop/core-site.xml.template
|
||||
RUN sed s/HOSTNAME/localhost/ /usr/local/hadoop/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml
|
||||
ADD hdfs-site.xml $HADOOP_PREFIX/etc/hadoop/hdfs-site.xml
|
||||
|
||||
ADD mapred-site.xml $HADOOP_PREFIX/etc/hadoop/mapred-site.xml
|
||||
ADD yarn-site.xml $HADOOP_PREFIX/etc/hadoop/yarn-site.xml
|
||||
|
||||
RUN $HADOOP_PREFIX/bin/hdfs namenode -format
|
||||
|
||||
ADD ssh_config /root/.ssh/config
|
||||
RUN chmod 600 /root/.ssh/config
|
||||
RUN chown root:root /root/.ssh/config
|
||||
|
||||
# # installing supervisord
|
||||
# RUN yum install -y python-setuptools
|
||||
# RUN easy_install pip
|
||||
# RUN curl https://bitbucket.org/pypa/setuptools/raw/bootstrap/ez_setup.py -o - | python
|
||||
# RUN pip install supervisor
|
||||
#
|
||||
# ADD supervisord.conf /etc/supervisord.conf
|
||||
|
||||
ADD bootstrap.sh /etc/bootstrap.sh
|
||||
RUN chown root:root /etc/bootstrap.sh
|
||||
RUN chmod 700 /etc/bootstrap.sh
|
||||
|
||||
ENV BOOTSTRAP /etc/bootstrap.sh
|
||||
|
||||
# working around docker.io build error
|
||||
RUN ls -la /usr/local/hadoop/etc/hadoop/*-env.sh
|
||||
RUN chmod +x /usr/local/hadoop/etc/hadoop/*-env.sh
|
||||
RUN ls -la /usr/local/hadoop/etc/hadoop/*-env.sh
|
||||
|
||||
# Copy additional .jars to classpath
|
||||
RUN cp /usr/local/hadoop/share/hadoop/tools/lib/*.jar /usr/local/hadoop/share/hadoop/common/lib/
|
||||
|
||||
# fix the 254 error code
|
||||
RUN sed -i "/^[^#]*UsePAM/ s/.*/#&/" /etc/ssh/sshd_config
|
||||
RUN echo "UsePAM no" >> /etc/ssh/sshd_config
|
||||
RUN echo "Port 2122" >> /etc/ssh/sshd_config
|
||||
|
||||
RUN service sshd start && $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh && $HADOOP_PREFIX/sbin/start-dfs.sh && $HADOOP_PREFIX/bin/hdfs dfs -mkdir -p /user/root
|
||||
RUN service sshd start && $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh && $HADOOP_PREFIX/sbin/start-dfs.sh && $HADOOP_PREFIX/bin/hdfs dfs -put $HADOOP_PREFIX/etc/hadoop/ input
|
||||
|
||||
CMD ["/etc/bootstrap.sh", "-d"]
|
||||
|
||||
# Hdfs ports
|
||||
EXPOSE 50010 50020 50070 50075 50090 8020 9000
|
||||
# Mapred ports
|
||||
EXPOSE 10020 19888
|
||||
# Yarn ports
|
||||
EXPOSE 8030 8031 8032 8033 8040 8042 8088
|
||||
# Other ports
|
||||
EXPOSE 49707 2122
|
|
@ -0,0 +1,27 @@
|
|||
#!/bin/bash
#
# Container entry point: prepares the Hadoop configuration, starts sshd, HDFS,
# YARN and the MapReduce job history server, then optionally keeps the
# container alive.
#
# Usage: bootstrap.sh [-d | -bash]
#   -d     block forever after the services have been started
#   -bash  drop into an interactive bash shell after the services have started

# Default the Hadoop install location when the environment does not provide one.
: "${HADOOP_PREFIX:=/usr/local/hadoop}"

# NOTE(review): hadoop-env.sh is executed rather than sourced, so variables it
# exports do not reach this shell -- confirm this is intended.
"$HADOOP_PREFIX/etc/hadoop/hadoop-env.sh"

# Clear stale pid files; -f keeps this quiet when there are none to remove.
rm -f /tmp/*.pid

# installing libraries if any - (resource urls added comma separated to the ACP system variable)
cd "$HADOOP_PREFIX/share/hadoop/common"
# ACP is deliberately left unquoted: commas become spaces and the result is
# word-split into one URL per iteration.
for cp in ${ACP//,/ }; do
  echo == "$cp"
  curl -LO "$cp"
done
cd -

# altering the core-site configuration
sed "s/HOSTNAME/$HOSTNAME/" /usr/local/hadoop/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml

service sshd start
"$HADOOP_PREFIX/sbin/start-dfs.sh"
"$HADOOP_PREFIX/sbin/start-yarn.sh"
"$HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh" start historyserver

# Keep the container in the foreground when asked to.
if [[ $1 == "-d" ]]; then
  while true; do sleep 1000; done
fi

if [[ $1 == "-bash" ]]; then
  /bin/bash
fi
|
|
@ -0,0 +1,6 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>fs.defaultFS</name>
|
||||
<value>hdfs://HOSTNAME:9000</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,14 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>dfs.replication</name>
|
||||
<value>1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>dfs.client.use.datanode.hostname</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>dfs.datanode.use.datanode.hostname</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,6 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.framework.name</name>
|
||||
<value>yarn</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,5 @@
|
|||
Host *
|
||||
UserKnownHostsFile /dev/null
|
||||
StrictHostKeyChecking no
|
||||
LogLevel quiet
|
||||
Port 2122
|
|
@ -0,0 +1,47 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>yarn.nodemanager.aux-services</name>
|
||||
<value>mapreduce_shuffle</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>yarn.application.classpath</name>
|
||||
<value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>
|
||||
Number of seconds after an application finishes before the nodemanager's
|
||||
DeletionService will delete the application's localized file directory
|
||||
and log directory.
|
||||
|
||||
To diagnose Yarn application problems, set this property's value large
|
||||
enough (for example, to 600 = 10 minutes) to permit examination of these
|
||||
directories. After changing the property's value, you must restart the
|
||||
nodemanager in order for it to have an effect.
|
||||
|
||||
The roots of Yarn applications' work directories are configurable with
|
||||
the yarn.nodemanager.local-dirs property (see below), and the roots
|
||||
of the Yarn applications' log directories are configurable with the
|
||||
yarn.nodemanager.log-dirs property (see also below).
|
||||
</description>
|
||||
<name>yarn.nodemanager.delete.debug-delay-sec</name>
|
||||
<value>600</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>yarn.log-aggregation-enable</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>yarn.log-aggregation.retain-seconds</name>
|
||||
<value>900000</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>yarn.nodemanager.vmem-check-enabled</name>
|
||||
<value>false</value>
|
||||
</property>
|
||||
|
||||
</configuration>
|
|
@ -0,0 +1,64 @@
|
|||
{
|
||||
"type" : "index",
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "retention-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"dimensionsSpec" : {
|
||||
"dimensions" : [
|
||||
"channel",
|
||||
"cityName",
|
||||
"comment",
|
||||
"countryIsoCode",
|
||||
"countryName",
|
||||
"isAnonymous",
|
||||
"isMinor",
|
||||
"isNew",
|
||||
"isRobot",
|
||||
"isUnpatrolled",
|
||||
"metroCode",
|
||||
"namespace",
|
||||
"page",
|
||||
"regionIsoCode",
|
||||
"regionName",
|
||||
"user",
|
||||
{ "name": "added", "type": "long" },
|
||||
{ "name": "deleted", "type": "long" },
|
||||
{ "name": "delta", "type": "long" }
|
||||
]
|
||||
},
|
||||
"timestampSpec": {
|
||||
"column": "time",
|
||||
"format": "iso"
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "hour",
|
||||
"queryGranularity" : "none",
|
||||
"intervals" : ["2015-09-12/2015-09-13"],
|
||||
"rollup" : false
|
||||
}
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "index",
|
||||
"firehose" : {
|
||||
"type" : "local",
|
||||
"baseDir" : "quickstart/",
|
||||
"filter" : "wikiticker-2015-09-12-sampled.json.gz"
|
||||
},
|
||||
"appendToExisting" : false
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "index",
|
||||
"targetPartitionSize" : 5000000,
|
||||
"maxRowsInMemory" : 25000,
|
||||
"forceExtendableShardSpecs" : true
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,9 @@
|
|||
{"timestamp":"2018-01-01T01:01:35Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":20,"bytes":9024}
|
||||
{"timestamp":"2018-01-01T01:01:51Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":255,"bytes":21133}
|
||||
{"timestamp":"2018-01-01T01:01:59Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":11,"bytes":5780}
|
||||
{"timestamp":"2018-01-01T01:02:14Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":38,"bytes":6289}
|
||||
{"timestamp":"2018-01-01T01:02:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":377,"bytes":359971}
|
||||
{"timestamp":"2018-01-01T01:03:29Z","srcIP":"1.1.1.1", "dstIP":"2.2.2.2","packets":49,"bytes":10204}
|
||||
{"timestamp":"2018-01-02T21:33:14Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8","packets":38,"bytes":6289}
|
||||
{"timestamp":"2018-01-02T21:33:45Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8","packets":123,"bytes":93999}
|
||||
{"timestamp":"2018-01-02T21:35:45Z","srcIP":"7.7.7.7", "dstIP":"8.8.8.8","packets":12,"bytes":2818}
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"type" : "index",
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "rollup-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"dimensionsSpec" : {
|
||||
"dimensions" : [
|
||||
"srcIP",
|
||||
"dstIP"
|
||||
]
|
||||
},
|
||||
"timestampSpec": {
|
||||
"column": "timestamp",
|
||||
"format": "iso"
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{ "type" : "count", "name" : "count" },
|
||||
{ "type" : "longSum", "name" : "packets", "fieldName" : "packets" },
|
||||
{ "type" : "longSum", "name" : "bytes", "fieldName" : "bytes" }
|
||||
],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "week",
|
||||
"queryGranularity" : "minute",
|
||||
"intervals" : ["2018-01-01/2018-01-03"],
|
||||
"rollup" : true
|
||||
}
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "index",
|
||||
"firehose" : {
|
||||
"type" : "local",
|
||||
"baseDir" : "quickstart/tutorial",
|
||||
"filter" : "rollup-data.json"
|
||||
},
|
||||
"appendToExisting" : false
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "index",
|
||||
"targetPartitionSize" : 5000000,
|
||||
"maxRowsInMemory" : 25000,
|
||||
"forceExtendableShardSpecs" : true
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,4 @@
|
|||
{"timestamp":"2018-01-01T07:01:35Z","animal":"octopus", "location":1, "number":100}
|
||||
{"timestamp":"2018-01-01T05:01:35Z","animal":"mongoose", "location":2,"number":200}
|
||||
{"timestamp":"2018-01-01T06:01:35Z","animal":"snake", "location":3, "number":300}
|
||||
{"timestamp":"2018-01-01T01:01:35Z","animal":"lion", "location":4, "number":300}
|
|
@ -0,0 +1,73 @@
|
|||
{
|
||||
"type" : "index",
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "transform-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"dimensionsSpec" : {
|
||||
"dimensions" : [
|
||||
"animal",
|
||||
{ "name": "location", "type": "long" }
|
||||
]
|
||||
},
|
||||
"timestampSpec": {
|
||||
"column": "timestamp",
|
||||
"format": "iso"
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{ "type" : "count", "name" : "count" },
|
||||
{ "type" : "longSum", "name" : "number", "fieldName" : "number" },
|
||||
{ "type" : "longSum", "name" : "triple-number", "fieldName" : "triple-number" }
|
||||
],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "week",
|
||||
"queryGranularity" : "minute",
|
||||
"intervals" : ["2018-01-01/2018-01-03"],
|
||||
"rollup" : true
|
||||
},
|
||||
"transformSpec": {
|
||||
"transforms": [
|
||||
{
|
||||
"type": "expression",
|
||||
"name": "animal",
|
||||
"expression": "concat('super-', animal)"
|
||||
},
|
||||
{
|
||||
"type": "expression",
|
||||
"name": "triple-number",
|
||||
"expression": "number * 3"
|
||||
}
|
||||
],
|
||||
"filter": {
|
||||
"type":"or",
|
||||
"fields": [
|
||||
{ "type": "selector", "dimension": "animal", "value": "super-mongoose" },
|
||||
{ "type": "selector", "dimension": "triple-number", "value": "300" },
|
||||
{ "type": "selector", "dimension": "location", "value": "3" }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "index",
|
||||
"firehose" : {
|
||||
"type" : "local",
|
||||
"baseDir" : "quickstart/tutorial",
|
||||
"filter" : "transform-data.json"
|
||||
},
|
||||
"appendToExisting" : false
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "index",
|
||||
"targetPartitionSize" : 5000000,
|
||||
"maxRowsInMemory" : 25000,
|
||||
"forceExtendableShardSpecs" : true
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
{
|
||||
"type" : "index",
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "updates-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"dimensionsSpec" : {
|
||||
"dimensions" : [
|
||||
"animal"
|
||||
]
|
||||
},
|
||||
"timestampSpec": {
|
||||
"column": "timestamp",
|
||||
"format": "iso"
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{ "type" : "count", "name" : "count" },
|
||||
{ "type" : "longSum", "name" : "number", "fieldName" : "number" }
|
||||
],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "week",
|
||||
"queryGranularity" : "minute",
|
||||
"intervals" : ["2018-01-01/2018-01-03"],
|
||||
"rollup" : true
|
||||
}
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "index",
|
||||
"firehose" : {
|
||||
"type": "combining",
|
||||
"delegates": [
|
||||
{
|
||||
"type" : "ingestSegment",
|
||||
"dataSource" : "updates-tutorial",
|
||||
"interval" : "2018-01-01/2018-01-03"
|
||||
},
|
||||
{
|
||||
"type" : "local",
|
||||
"baseDir" : "quickstart/tutorial",
|
||||
"filter" : "updates-data3.json"
|
||||
}
|
||||
]
|
||||
},
|
||||
"appendToExisting" : false
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "index",
|
||||
"targetPartitionSize" : 5000000,
|
||||
"maxRowsInMemory" : 25000,
|
||||
"forceExtendableShardSpecs" : true
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
{
|
||||
"type" : "index",
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "updates-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"dimensionsSpec" : {
|
||||
"dimensions" : [
|
||||
"animal"
|
||||
]
|
||||
},
|
||||
"timestampSpec": {
|
||||
"column": "timestamp",
|
||||
"format": "iso"
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{ "type" : "count", "name" : "count" },
|
||||
{ "type" : "longSum", "name" : "number", "fieldName" : "number" }
|
||||
],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "week",
|
||||
"queryGranularity" : "minute",
|
||||
"intervals" : ["2018-01-01/2018-01-03"],
|
||||
"rollup" : true
|
||||
}
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "index",
|
||||
"firehose" : {
|
||||
"type" : "local",
|
||||
"baseDir" : "quickstart/tutorial",
|
||||
"filter" : "updates-data4.json"
|
||||
},
|
||||
"appendToExisting" : true
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "index",
|
||||
"targetPartitionSize" : 5000000,
|
||||
"maxRowsInMemory" : 25000,
|
||||
"forceExtendableShardSpecs" : true
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,3 @@
|
|||
{"timestamp":"2018-01-01T01:01:35Z","animal":"tiger", "number":100}
|
||||
{"timestamp":"2018-01-01T03:01:35Z","animal":"aardvark", "number":42}
|
||||
{"timestamp":"2018-01-01T03:01:35Z","animal":"giraffe", "number":14124}
|
|
@ -0,0 +1,3 @@
|
|||
{"timestamp":"2018-01-01T01:01:35Z","animal":"lion", "number":100}
|
||||
{"timestamp":"2018-01-01T03:01:35Z","animal":"aardvark", "number":9999}
|
||||
{"timestamp":"2018-01-01T04:01:35Z","animal":"bear", "number":111}
|
|
@ -0,0 +1,4 @@
|
|||
{"timestamp":"2018-01-01T07:01:35Z","animal":"octopus", "number":115}
|
||||
{"timestamp":"2018-01-01T05:01:35Z","animal":"mongoose", "number":737}
|
||||
{"timestamp":"2018-01-01T06:01:35Z","animal":"snake", "number":1234}
|
||||
{"timestamp":"2018-01-01T01:01:35Z","animal":"lion", "number":300}
|
|
@ -0,0 +1,2 @@
|
|||
{"timestamp":"2018-01-01T04:01:35Z","animal":"bear", "number":222}
|
||||
{"timestamp":"2018-01-01T09:01:35Z","animal":"falcon", "number":1241}
|
|
@ -0,0 +1,49 @@
|
|||
{
|
||||
"type" : "index",
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "updates-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"dimensionsSpec" : {
|
||||
"dimensions" : [
|
||||
"animal"
|
||||
]
|
||||
},
|
||||
"timestampSpec": {
|
||||
"column": "timestamp",
|
||||
"format": "iso"
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{ "type" : "count", "name" : "count" },
|
||||
{ "type" : "longSum", "name" : "number", "fieldName" : "number" }
|
||||
],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "week",
|
||||
"queryGranularity" : "minute",
|
||||
"intervals" : ["2018-01-01/2018-01-03"],
|
||||
"rollup" : true
|
||||
}
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "index",
|
||||
"firehose" : {
|
||||
"type" : "local",
|
||||
"baseDir" : "quickstart/tutorial",
|
||||
"filter" : "updates-data.json"
|
||||
},
|
||||
"appendToExisting" : false
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "index",
|
||||
"targetPartitionSize" : 5000000,
|
||||
"maxRowsInMemory" : 25000,
|
||||
"forceExtendableShardSpecs" : true
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
{
|
||||
"type" : "index",
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "updates-tutorial",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"dimensionsSpec" : {
|
||||
"dimensions" : [
|
||||
"animal"
|
||||
]
|
||||
},
|
||||
"timestampSpec": {
|
||||
"column": "timestamp",
|
||||
"format": "iso"
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{ "type" : "count", "name" : "count" },
|
||||
{ "type" : "longSum", "name" : "number", "fieldName" : "number" }
|
||||
],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "week",
|
||||
"queryGranularity" : "minute",
|
||||
"intervals" : ["2018-01-01/2018-01-03"],
|
||||
"rollup" : true
|
||||
}
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "index",
|
||||
"firehose" : {
|
||||
"type" : "local",
|
||||
"baseDir" : "quickstart/tutorial",
|
||||
"filter" : "updates-data2.json"
|
||||
},
|
||||
"appendToExisting" : false
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "index",
|
||||
"targetPartitionSize" : 5000000,
|
||||
"maxRowsInMemory" : 25000,
|
||||
"forceExtendableShardSpecs" : true
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,79 @@
|
|||
{
|
||||
"type" : "index_hadoop",
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "wikipedia",
|
||||
"parser" : {
|
||||
"type" : "hadoopyString",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"dimensionsSpec" : {
|
||||
"dimensions" : [
|
||||
"channel",
|
||||
"cityName",
|
||||
"comment",
|
||||
"countryIsoCode",
|
||||
"countryName",
|
||||
"isAnonymous",
|
||||
"isMinor",
|
||||
"isNew",
|
||||
"isRobot",
|
||||
"isUnpatrolled",
|
||||
"metroCode",
|
||||
"namespace",
|
||||
"page",
|
||||
"regionIsoCode",
|
||||
"regionName",
|
||||
"user",
|
||||
{ "name": "added", "type": "long" },
|
||||
{ "name": "deleted", "type": "long" },
|
||||
{ "name": "delta", "type": "long" }
|
||||
]
|
||||
},
|
||||
"timestampSpec" : {
|
||||
"format" : "auto",
|
||||
"column" : "time"
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "day",
|
||||
"queryGranularity" : "none",
|
||||
"intervals" : ["2015-09-12/2015-09-13"],
|
||||
"rollup" : false
|
||||
}
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "hadoop",
|
||||
"inputSpec" : {
|
||||
"type" : "static",
|
||||
"paths" : "/quickstart/wikiticker-2015-09-12-sampled.json.gz"
|
||||
}
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "hadoop",
|
||||
"partitionsSpec" : {
|
||||
"type" : "hashed",
|
||||
"targetPartitionSize" : 5000000
|
||||
},
|
||||
"forceExtendableShardSpecs" : true,
|
||||
"jobProperties" : {
|
||||
"fs.default.name" : "hdfs://druid-hadoop-demo:9000",
|
||||
"fs.defaultFS" : "hdfs://druid-hadoop-demo:9000",
|
||||
"dfs.datanode.address" : "druid-hadoop-demo",
|
||||
"dfs.client.use.datanode.hostname" : "true",
|
||||
"dfs.datanode.use.datanode.hostname" : "true",
|
||||
"yarn.resourcemanager.hostname" : "druid-hadoop-demo",
|
||||
"yarn.nodemanager.vmem-check-enabled" : "false",
|
||||
"mapreduce.map.java.opts" : "-Duser.timezone=UTC -Dfile.encoding=UTF-8",
|
||||
"mapreduce.job.user.classpath.first" : "true",
|
||||
"mapreduce.reduce.java.opts" : "-Duser.timezone=UTC -Dfile.encoding=UTF-8",
|
||||
"mapreduce.map.memory.mb" : 1024,
|
||||
"mapreduce.reduce.memory.mb" : 1024
|
||||
}
|
||||
}
|
||||
},
|
||||
"hadoopDependencyCoordinates": ["org.apache.hadoop:hadoop-client:2.8.3"]
|
||||
}
|
|
@ -0,0 +1,64 @@
|
|||
{
|
||||
"type" : "index",
|
||||
"spec" : {
|
||||
"dataSchema" : {
|
||||
"dataSource" : "wikipedia",
|
||||
"parser" : {
|
||||
"type" : "string",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"dimensionsSpec" : {
|
||||
"dimensions" : [
|
||||
"channel",
|
||||
"cityName",
|
||||
"comment",
|
||||
"countryIsoCode",
|
||||
"countryName",
|
||||
"isAnonymous",
|
||||
"isMinor",
|
||||
"isNew",
|
||||
"isRobot",
|
||||
"isUnpatrolled",
|
||||
"metroCode",
|
||||
"namespace",
|
||||
"page",
|
||||
"regionIsoCode",
|
||||
"regionName",
|
||||
"user",
|
||||
{ "name": "added", "type": "long" },
|
||||
{ "name": "deleted", "type": "long" },
|
||||
{ "name": "delta", "type": "long" }
|
||||
]
|
||||
},
|
||||
"timestampSpec": {
|
||||
"column": "time",
|
||||
"format": "iso"
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [],
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "day",
|
||||
"queryGranularity" : "none",
|
||||
"intervals" : ["2015-09-12/2015-09-13"],
|
||||
"rollup" : false
|
||||
}
|
||||
},
|
||||
"ioConfig" : {
|
||||
"type" : "index",
|
||||
"firehose" : {
|
||||
"type" : "local",
|
||||
"baseDir" : "quickstart/",
|
||||
"filter" : "wikiticker-2015-09-12-sampled.json.gz"
|
||||
},
|
||||
"appendToExisting" : false
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "index",
|
||||
"targetPartitionSize" : 5000000,
|
||||
"maxRowsInMemory" : 25000,
|
||||
"forceExtendableShardSpecs" : true
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
{
|
||||
"type": "kafka",
|
||||
"dataSchema": {
|
||||
"dataSource": "wikipedia",
|
||||
"parser": {
|
||||
"type": "string",
|
||||
"parseSpec": {
|
||||
"format": "json",
|
||||
"timestampSpec": {
|
||||
"column": "time",
|
||||
"format": "auto"
|
||||
},
|
||||
"dimensionsSpec": {
|
||||
"dimensions": [
|
||||
"channel",
|
||||
"cityName",
|
||||
"comment",
|
||||
"countryIsoCode",
|
||||
"countryName",
|
||||
"isAnonymous",
|
||||
"isMinor",
|
||||
"isNew",
|
||||
"isRobot",
|
||||
"isUnpatrolled",
|
||||
"metroCode",
|
||||
"namespace",
|
||||
"page",
|
||||
"regionIsoCode",
|
||||
"regionName",
|
||||
"user",
|
||||
{ "name": "added", "type": "long" },
|
||||
{ "name": "deleted", "type": "long" },
|
||||
{ "name": "delta", "type": "long" }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [],
|
||||
"granularitySpec": {
|
||||
"type": "uniform",
|
||||
"segmentGranularity": "DAY",
|
||||
"queryGranularity": "NONE",
|
||||
"rollup": false
|
||||
}
|
||||
},
|
||||
"tuningConfig": {
|
||||
"type": "kafka",
|
||||
"reportParseExceptions": false
|
||||
},
|
||||
"ioConfig": {
|
||||
"topic": "wikipedia",
|
||||
"replicas": 2,
|
||||
"taskDuration": "PT10M",
|
||||
"completionTimeout": "PT20M",
|
||||
"consumerProperties": {
|
||||
"bootstrap.servers": "localhost:9092"
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,3 @@
|
|||
{
|
||||
"query":"SELECT page, COUNT(*) AS Edits FROM wikipedia WHERE \"__time\" BETWEEN TIMESTAMP '2015-09-12 00:00:00' AND TIMESTAMP '2015-09-13 00:00:00' GROUP BY page ORDER BY Edits DESC LIMIT 10"
|
||||
}
|
|
@ -1,16 +1,15 @@
|
|||
{
|
||||
"queryType" : "topN",
|
||||
"dataSource" : "wikiticker",
|
||||
"dataSource" : "wikipedia",
|
||||
"intervals" : ["2015-09-12/2015-09-13"],
|
||||
"granularity" : "all",
|
||||
"dimension" : "page",
|
||||
"metric" : "edits",
|
||||
"threshold" : 25,
|
||||
"metric" : "count",
|
||||
"threshold" : 10,
|
||||
"aggregations" : [
|
||||
{
|
||||
"type" : "longSum",
|
||||
"name" : "edits",
|
||||
"fieldName" : "count"
|
||||
"type" : "count",
|
||||
"name" : "count"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
|
@ -1,85 +0,0 @@
|
|||
{
|
||||
"type" : "index_hadoop",
|
||||
"spec" : {
|
||||
"ioConfig" : {
|
||||
"type" : "hadoop",
|
||||
"inputSpec" : {
|
||||
"type" : "static",
|
||||
"paths" : "quickstart/wikiticker-2015-09-12-sampled.json.gz"
|
||||
}
|
||||
},
|
||||
"dataSchema" : {
|
||||
"dataSource" : "wikiticker",
|
||||
"granularitySpec" : {
|
||||
"type" : "uniform",
|
||||
"segmentGranularity" : "day",
|
||||
"queryGranularity" : "none",
|
||||
"intervals" : ["2015-09-12/2015-09-13"]
|
||||
},
|
||||
"parser" : {
|
||||
"type" : "hadoopyString",
|
||||
"parseSpec" : {
|
||||
"format" : "json",
|
||||
"dimensionsSpec" : {
|
||||
"dimensions" : [
|
||||
"channel",
|
||||
"cityName",
|
||||
"comment",
|
||||
"countryIsoCode",
|
||||
"countryName",
|
||||
"isAnonymous",
|
||||
"isMinor",
|
||||
"isNew",
|
||||
"isRobot",
|
||||
"isUnpatrolled",
|
||||
"metroCode",
|
||||
"namespace",
|
||||
"page",
|
||||
"regionIsoCode",
|
||||
"regionName",
|
||||
"user"
|
||||
]
|
||||
},
|
||||
"timestampSpec" : {
|
||||
"format" : "auto",
|
||||
"column" : "time"
|
||||
}
|
||||
}
|
||||
},
|
||||
"metricsSpec" : [
|
||||
{
|
||||
"name" : "count",
|
||||
"type" : "count"
|
||||
},
|
||||
{
|
||||
"name" : "added",
|
||||
"type" : "longSum",
|
||||
"fieldName" : "added"
|
||||
},
|
||||
{
|
||||
"name" : "deleted",
|
||||
"type" : "longSum",
|
||||
"fieldName" : "deleted"
|
||||
},
|
||||
{
|
||||
"name" : "delta",
|
||||
"type" : "longSum",
|
||||
"fieldName" : "delta"
|
||||
},
|
||||
{
|
||||
"name" : "user_unique",
|
||||
"type" : "hyperUnique",
|
||||
"fieldName" : "user"
|
||||
}
|
||||
]
|
||||
},
|
||||
"tuningConfig" : {
|
||||
"type" : "hadoop",
|
||||
"partitionsSpec" : {
|
||||
"type" : "hashed",
|
||||
"targetPartitionSize" : 5000000
|
||||
},
|
||||
"jobProperties" : {}
|
||||
}
|
||||
}
|
||||
}
|