From 4ec4b8e024513744c5fbd23a012a29a5070af108 Mon Sep 17 00:00:00 2001 From: fjy Date: Tue, 8 Oct 2013 16:34:58 -0700 Subject: [PATCH 01/30] rewrite indexing service docs --- docs/content/Indexing-Service.md | 270 ++++++++------ docs/content/Tasks.md | 180 +++++++--- .../Tutorial:-Loading-Your-Data-Part-1.md | 335 ++++++++++++++++++ ... => Tutorial:-Loading-Your-Data-Part-2.md} | 0 docs/content/Tutorial:-The-Druid-Cluster.md | 2 +- .../indexing/overlord/ForkingTaskRunner.java | 4 +- .../overlord/ForkingTaskRunnerFactory.java | 6 +- .../indexing/overlord/RemoteTaskRunner.java | 2 +- .../config/EC2AutoScalingStrategyConfig.java | 37 -- .../config/ForkingTaskRunnerConfig.java | 9 - .../config/RemoteTaskRunnerConfig.java | 6 +- .../indexing/worker/config/WorkerConfig.java | 4 +- .../executor/ExecutorLifecycleFactory.java | 44 --- .../overlord/TestRemoteTaskRunnerConfig.java | 2 +- .../src/main/java/io/druid/cli/CliPeon.java | 2 +- .../druid/cli/convert/ConvertProperties.java | 2 +- 16 files changed, 652 insertions(+), 253 deletions(-) create mode 100644 docs/content/Tutorial:-Loading-Your-Data-Part-1.md rename docs/content/{Loading-Your-Data.md => Tutorial:-Loading-Your-Data-Part-2.md} (100%) delete mode 100644 indexing-service/src/main/java/io/druid/indexing/overlord/config/EC2AutoScalingStrategyConfig.java delete mode 100644 indexing-service/src/main/java/io/druid/indexing/worker/executor/ExecutorLifecycleFactory.java diff --git a/docs/content/Indexing-Service.md b/docs/content/Indexing-Service.md index 271c4bb09c6..8f733a0f83c 100644 --- a/docs/content/Indexing-Service.md +++ b/docs/content/Indexing-Service.md @@ -1,69 +1,147 @@ --- layout: doc_page --- -Disclaimer: We are still in the process of finalizing the indexing service and these configs are prone to change at any time. We will announce when we feel the indexing service and the configurations described are stable. +The indexing service is a highly-available, distributed service that runs indexing related tasks. Indexing service [tasks](Tasks.html) create (and sometimes destroy) Druid [segments](Segments.html). The indexing service has a master/slave like architecture. -The indexing service is a distributed task/job queue. It accepts requests in the form of [Tasks](Tasks.html) and executes those tasks across a set of worker nodes. Worker capacity can be automatically adjusted based on the number of tasks pending in the system. The indexing service is highly available, has built in retry logic, and can backup per task logs in deep storage. +The indexing service is composed of three main components: a peon component that can run a single task, a middle manager component that manages peons, and an overlord component that manages task distribution to middle managers. +Overlords and middle managers may run on the same node or across multiple nodes while middle managers and peons always run on the same node. -The indexing service is composed of two main components, a coordinator node that manages task distribution and worker capacity, and worker nodes that execute tasks in separate JVMs. +Most Basic Getting Started Configuration +---------------------------------------- +Run: +``` +io.druid.cli.Main server overlord +``` + +With the following JVM configuration: + +``` +-server +-Xmx2g +-Duser.timezone=UTC +-Dfile.encoding=UTF-8 + +-Ddruid.host=localhost +-Ddruid.port=8080 +-Ddruid.service=overlord + +-Ddruid.zk.service.host=localhost + +-Ddruid.db.connector.connectURI=jdbc:mysql://localhost:3306/druid +-Ddruid.db.connector.user=driud +-Ddruid.db.connector.password=diurd + +-Ddruid.selectors.indexing.serviceName=indexer +-Ddruid.indexer.runner.javaOpts="-server -Xmx1g" +-Ddruid.indexer.runner.startPort=8081 +-Ddruid.indexer.fork.property.druid.computation.buffer.size=268435456 +``` + +You can now submit simple indexing tasks to the indexing service. + + -Indexer Coordinator Node ------------------------- + -Tasks can be submitted via POST requests to: +Overlord Node +------------- + +The overlord node is responsible for accepting tasks, coordinating task distribution, creating locks around tasks, and returning statuses to callers. + +#### Usage + +Tasks are submitted to the overlord node in the form of JSON objects. Tasks can be submitted via POST requests to: ``` -http://:/druid/indexer/v1/task +http://:/druid/indexer/v1/task ``` Tasks can cancelled via POST requests to: ``` -http://:/druid/indexer/v1/task/{taskId}/shutdown +http://:/druid/indexer/v1/task/{taskId}/shutdown ``` -Issuing the cancel request once sends a graceful shutdown request. Graceful shutdowns may not stop a task right away, but instead issue a safe stop command at a point deemed least impactful to the system. Issuing the cancel request twice in succession will kill –9 the task. +Issuing the cancel request will kill –9 the task. Task statuses can be retrieved via GET requests to: ``` -http://:/druid/indexer/v1/task/{taskId}/status +http://:/druid/indexer/v1/task/{taskId}/status ``` Task segments can be retrieved via GET requests to: ``` -http://:/druid/indexer/v1/task/{taskId}/segments +http://:/druid/indexer/v1/task/{taskId}/segments ``` -When a task is submitted, the coordinator creates a lock over the data source and interval of the task. The coordinator also stores the task in a MySQL database table. The database table is read at startup time to bootstrap any tasks that may have been submitted to the coordinator but may not yet have been executed. +#### Console -The coordinator also exposes a simple UI to show what tasks are currently running on what nodes at +The overlord console can be used to view pending tasks, running tasks, available workers, and recent worker creation and termination. The console can be accessed at: ``` -http://:/static/console.html +http://:8080/console.html ``` -#### Task Execution - -The coordinator retrieves worker setup metadata from the Druid [MySQL](MySQL.html) config table. This metadata contains information about the version of workers to create, the maximum and minimum number of workers in the cluster at one time, and additional information required to automatically create workers. - -Tasks are assigned to workers by creating entries under specific /tasks paths associated with a worker, similar to how the Druid coordinator node assigns segments to historical nodes. See [Worker Configuration](Indexing-Service#configuration-1). Once a worker picks up a task, it deletes the task entry and announces a task status under a /status path associated with the worker. Tasks are submitted to a worker until the worker hits capacity. If all workers in a cluster are at capacity, the indexer coordinator node automatically creates new worker resources. - #### Autoscaling -The Autoscaling mechanisms currently in place are tightly coupled with our deployment infrastructure but the framework should be in place for other implementations. We are highly open to new implementations or extensions of the existing mechanisms. In our own deployments, worker nodes are Amazon AWS EC2 nodes and they are provisioned to register themselves in a [galaxy](https://github.com/ning/galaxy) environment. +The Autoscaling mechanisms currently in place are tightly coupled with our deployment infrastructure but the framework should be in place for other implementations. We are highly open to new implementations or extensions of the existing mechanisms. In our own deployments, middle manager nodes are Amazon AWS EC2 nodes and they are provisioned to register themselves in a [galaxy](https://github.com/ning/galaxy) environment. -The Coordinator node controls the number of workers in the cluster according to a worker setup spec that is submitted via a POST request to the indexer at: +If autoscaling is enabled, new middle managers may be added when a task has been in pending state for too long. Middle managers may be terminated if they have not run any tasks for a period of time. + +#### JVM Configuration + +The overlord module requires the following basic configs to run in remote mode: + +|Property|Description|Default| +|--------|-----------|-------| +|`druid.indexer.runner.type`|Choices "local" or "remote". Indicates whether tasks should be run locally or in a distributed environment.|local| +|`druid.indexer.storage.type`|Choices are "local" or "db". Indicates whether incoming tasks should be stored locally (in heap) or in a database. Storing incoming tasks in a database allows for tasks to be bootstrapped if the overlord should fail.|local| + +The following configs only apply if the overlord is running in remote mode: + +|Property|Description|Default| +|--------|-----------|-------| +|`druid.indexer.runner.taskAssignmentTimeout`|How long to wait after a task as been assigned to a middle manager before throwing an error.|PT5M| +|`druid.indexer.runner.minWorkerVersion`|The minimum middle manager version to send tasks to. |none| +|`druid.indexer.runner.compressZnodes`|Indicates whether or not the overlord should expect middle managers to compress Znodes.|false| +|`druid.indexer.runner.maxZnodeBytes`|The maximum size Znode in bytes that can be created in Zookeeper.|524288| + +There are additional configs for autoscaling (if it is enabled): + +|Property|Description|Default| +|--------|-----------|-------| +|`druid.indexer.autoscale.doAutoscale`|If set to "true" autoscaling will be enabled.|false| +|`druid.indexer.autoscale.provisionPeriod`|How often to check whether or not new middle managers should be added.|PT1M| +|`druid.indexer.autoscale.terminatePeriod`|How often to check when middle managers should be removed.|PT1H| +|`druid.indexer.autoscale.originTime`|The starting reference timestamp that the terminate period increments upon.|2012-01-01T00:55:00.000Z| +|`druid.indexer.autoscale.strategy`|Choices are "noop" or "ec2". Sets the strategy to run when autoscaling is required.|noop| +|`druid.indexer.autoscale.workerIdleTimeout`|How long can a worker be idle (not a run task) before it can be considered for termination.|PT10M| +|`druid.indexer.autoscale.maxScalingDuration`|How long the overlord will wait around for a middle manager to show up before giving up.|PT15M| +|`druid.indexer.autoscale.numEventsToTrack`|The number of autoscaling related events (node creation and termination) to track.|10| +|`druid.indexer.autoscale.pendingTaskTimeout`|How long a task can be in "pending" state before the overlord tries to scale up.|PT30S| +|`druid.indexer.autoscale.workerVersion`|If set, will only create nodes of set version during autoscaling. Overrides dynamic configuration. |null| +|`druid.indexer.autoscale.workerPort`|The port that middle managers will run on.|8080| + +#### Dynamic Configuration + +Overlord dynamic configuration is mainly for autoscaling. The overlord reads a worker setup spec as a JSON object from the Druid [MySQL](MySQL.html) config table. This object contains information about the version of middle managers to create, the maximum and minimum number of middle managers in the cluster at one time, and additional information required to automatically create middle managers. + +The JSON object can be submitted to the overlord via a POST request at: ``` http://:/druid/indexer/v1/worker/setup @@ -94,7 +172,7 @@ A sample worker setup spec is shown below: } ``` -Issuing a GET request at the same URL will return the current worker setup spec that is currently in place. The worker setup spec list above is just a sample and it is possible to write worker setup specs for other deployment environments. A description of the worker setup spec is shown below. +Issuing a GET request at the same URL will return the current worker setup spec that is currently in place. The worker setup spec list above is just a sample and it is possible to extend the code base for other deployment environments. A description of the worker setup spec is shown below. |Property|Description|Default| |--------|-----------|-------| @@ -104,109 +182,85 @@ Issuing a GET request at the same URL will return the current worker setup spec |`nodeData`|A JSON object that contains metadata about new nodes to create.|none| |`userData`|A JSON object that contains metadata about how the node should register itself on startup. This data is sent with node creation requests.|none| -For more information about configuring Auto-scaling, see [Auto-Scaling Configuration](https://github.com/metamx/druid/wiki/Indexing-Service#auto-scaling-configuration). +#### Running + +``` +io.druid.cli.Main server overlord +``` + +Note: When running the overlord in local mode, all middle manager and peon configurations must be provided as well. + +MiddleManager Node +------------------ + +The middle manager node is a worker node that executes submitted tasks. Middle Managers forward tasks to peons that run in separate JVMs. Each peon is capable of running only one task at a time, however, a middle manager may have multiple peons. + +#### JVM Configuration + +Middle managers pass their configurations down to their child peons. The middle manager module requires the following configs: + +|Property|Description|Default| +|--------|-----------|-------| +|`druid.worker.ip`|The IP of the worker.|localhost| +|`druid.worker.version`|Version identifier for the middle manager.|0| +|`druid.worker.capacity`|Maximum number of tasks the middle manager can accept.|Number of available processors - 1| +|`druid.indexer.runner.compressZnodes`|Indicates whether or not the middle managers should compress Znodes.|false| +|`druid.indexer.runner.maxZnodeBytes`|The maximum size Znode in bytes that can be created in Zookeeper.|524288| +|`druid.indexer.runner.taskDir`|Temporary intermediate directory used during task execution.|/tmp/persistent| +|`druid.indexer.runner.javaCommand`|Command required to execute java.|java| +|`druid.indexer.runner.javaOpts`|-X Java options to run the peon in its own JVM.|""| +|`druid.indexer.runner.classpath`|Java classpath for the peon.|System.getProperty("java.class.path")| +|`druid.indexer.runner.startPort`|The port that peons begin running on.|8080| +|`druid.indexer.runner.allowedPrefixes`|Whitelist of prefixes for configs that can be passed down to child peons.|"com.metamx", "druid", "io.druid", "user.timezone","file.encoding"| #### Running -Indexer Coordinator nodes can be run using the `com.metamx.druid.indexing.coordinator.http.IndexerCoordinatorMain` class. - -#### Configuration - -Indexer Coordinator nodes require [basic service configuration](https://github.com/metamx/druid/wiki/Configuration#basic-service-configuration). In addition, there are several extra configurations that are required. - ``` --Ddruid.zk.paths.indexer.announcementsPath=/druid/indexer/announcements --Ddruid.zk.paths.indexer.leaderLatchPath=/druid/indexer/leaderLatchPath --Ddruid.zk.paths.indexer.statusPath=/druid/indexer/status --Ddruid.zk.paths.indexer.tasksPath=/druid/demo/indexer/tasks - --Ddruid.indexer.runner=remote --Ddruid.indexer.taskDir=/mnt/persistent/task/ --Ddruid.indexer.configTable=sample_config --Ddruid.indexer.workerSetupConfigName=worker_setup --Ddruid.indexer.strategy=ec2 --Ddruid.indexer.hadoopWorkingPath=/tmp/druid-indexing --Ddruid.indexer.logs.s3bucket=some_bucket --Ddruid.indexer.logs.s3prefix=some_prefix +io.druid.cli.Main server middleManager ``` -The indexing service requires some additional Zookeeper configs. +Peons +----- +Peons run a single task in a single JVM. Peons are a part of middle managers and should rarely (if ever) be run on their own. + +#### JVM Configuration +Although peons inherit the configurations of their parent middle managers, explicit child peon configs can be set by prefixing them with: + +``` +druid.indexer.fork.property +``` + +Additional peon configs include: |Property|Description|Default| |--------|-----------|-------| -|`druid.zk.paths.indexer.announcementsPath`|The base path where workers announce themselves.|none| -|`druid.zk.paths.indexer.leaderLatchPath`|The base that coordinator nodes use to determine a leader.|none| -|`druid.zk.paths.indexer.statusPath`|The base path where workers announce task statuses.|none| -|`druid.zk.paths.indexer.tasksPath`|The base path where the coordinator assigns new tasks.|none| +|`druid.peon.mode`|Choices are "local" and "remote". Setting this to local means you intend to run the peon as a standalone node (Not recommended).|remote| +|`druid.indexer.baseDir`|Base temporary working directory.|/tmp| +|`druid.indexer.baseTaskDir`|Base temporary working directory for tasks.|/tmp/persistent/tasks| +|`druid.indexer.hadoopWorkingPath`|Temporary working directory for Hadoop tasks.|/tmp/druid-indexing| +|`druid.indexer.defaultRowFlushBoundary`|Highest row count before persisting to disk. Used for indexing generating tasks.|50000| +|`druid.indexer.task.chathandler.type`|Choices are "noop" and "announce". Certain tasks will use service discovery to announce an HTTP endpoint that events can be posted to.|noop| -There’s several additional configs that are required to run tasks. +If the peon is running in remote mode, there must be an overlord up and running. Running peons in remote mode require the following configurations: |Property|Description|Default| |--------|-----------|-------| -|`druid.indexer.runner`|Indicates whether tasks should be run locally or in a distributed environment. "local" or "remote".|local| -|`druid.indexer.taskDir`|Intermediate temporary directory that tasks may use.|none| -|`druid.indexer.configTable`|The MySQL config table where misc configs live.|none| -|`druid.indexer.strategy`|The autoscaling strategy to use.|noop| -|`druid.indexer.hadoopWorkingPath`|Intermediate temporary hadoop working directory that certain index tasks may use.|none| -|`druid.indexer.logs.s3bucket`|S3 bucket to store logs.|none| -|`druid.indexer.logs.s3prefix`|S3 key prefix to store logs.|none| - -#### Console - -The indexer console can be used to view pending tasks, running tasks, available workers, and recent worker creation and termination. The console can be accessed at: - -``` -http://:8080/static/console.html -``` - -Worker Node ------------ - -The worker node executes submitted tasks. Workers run tasks in separate JVMs. +|`druid.peon.taskActionClient.retry.minWait`|The minimum retry time to communicate with overlord.|PT1M| +|`druid.peon.taskActionClient.retry.maxWait`|The maximum retry time to communicate with overlord.|PT10M| +|`druid.peon.taskActionClient.retry.maxRetryCount`|The maximum number of retries to communicate with overlord.|10| #### Running -Worker nodes can be run using the `com.metamx.druid.indexing.worker.http.WorkerMain` class. Worker nodes can automatically be created by the Indexer Coordinator as part of autoscaling. - -#### Configuration - -Worker nodes require [basic service configuration](https://github.com/metamx/druid/wiki/Configuration#basic-service-configuration). In addition, there are several extra configurations that are required. +The peon should very rarely ever be run independent of the middle manager. ``` --Ddruid.worker.version=0 --Ddruid.worker.capacity=3 - --Ddruid.indexer.threads=3 --Ddruid.indexer.taskDir=/mnt/persistent/task/ --Ddruid.indexer.hadoopWorkingPath=/tmp/druid-indexing - --Ddruid.worker.coordinatorService=druid:sample_cluster:indexer - --Ddruid.indexer.fork.hostpattern=:%d --Ddruid.indexer.fork.startport=8080 --Ddruid.indexer.fork.main=com.metamx.druid.indexing.worker.executor.ExecutorMain --Ddruid.indexer.fork.opts="-server -Xmx1g -Xms1g -XX:NewSize=256m -XX:MaxNewSize=256m" --Ddruid.indexer.fork.property.druid.service=druid/sample_cluster/executor - -# These configs are the same configs you would set for basic service configuration, just with a different prefix --Ddruid.indexer.fork.property.druid.monitoring.monitorSystem=false --Ddruid.indexer.fork.property.druid.computation.buffer.size=268435456 --Ddruid.indexer.fork.property.druid.indexer.taskDir=/mnt/persistent/task/ --Ddruid.indexer.fork.property.druid.processing.formatString=processing-%s --Ddruid.indexer.fork.property.druid.processing.numThreads=1 --Ddruid.indexer.fork.property.druid.server.maxSize=0 --Ddruid.indexer.fork.property.druid.request.logging.dir=request_logs/ +io.druid.cli.Main internal peon ``` -Many of the configurations for workers are similar to those for basic service configuration":https://github.com/metamx/druid/wiki/Configuration\#basic-service-configuration, but with a different config prefix. Below we describe the unique worker configs. +The task file contains the task JSON object. +The status file indicates where the task status will be output. -|Property|Description|Default| -|--------|-----------|-------| -|`druid.worker.version`|Version identifier for the worker.|0| -|`druid.worker.capacity`|Maximum number of tasks the worker can accept.|1| -|`druid.indexer.threads`|Number of processing threads per worker.|1| -|`druid.worker.coordinatorService`|Name of the indexer coordinator used for service discovery.|none| -|`druid.indexer.fork.hostpattern`|The format of the host name.|none| -|`druid.indexer.fork.startport`|Port in which child JVM starts from.|none| -|`druid.indexer.fork.opts`|JVM options for child JVMs.|none| +Tasks +----- +See [Tasks](Tasks.html). diff --git a/docs/content/Tasks.md b/docs/content/Tasks.md index 9f70e6995d7..b4443a3617a 100644 --- a/docs/content/Tasks.md +++ b/docs/content/Tasks.md @@ -1,71 +1,165 @@ --- layout: doc_page --- -Tasks are run on workers and always operate on a single datasource. Once an indexer coordinator node accepts a task, a lock is created for the datasource and interval specified in the task. Tasks do not need to explicitly release locks, they are released upon task completion. Tasks may potentially release locks early if they desire. Tasks ids are unique by naming them using UUIDs or the timestamp in which the task was created. Tasks are also part of a "task group", which is a set of tasks that can share interval locks. +Tasks are run on middle managers and always operate on a single data source. There are several different types of tasks. -Append Task ------------ +Segment Creation Tasks +---------------------- + +#### Index Task + +The Index Task is a simpler variation of the Index Hadoop task that is designed to be used for smaller data sets. The task executes within the indexing service and does not require an external Hadoop setup to use. The grammar of the index task is as follows: + +``` +{ + "type" : "index", + "dataSource" : "example", + "granularitySpec" : { + "type" : "uniform", + "gran" : "DAY", + "intervals" : [ "2010/2020" ] + }, + "aggregators" : [ { + "type" : "count", + "name" : "count" + }, { + "type" : "doubleSum", + "name" : "value", + "fieldName" : "value" + } ], + "firehose" : { + "type" : "local", + "baseDir" : "/tmp/data/json", + "filter" : "sample_data.json", + "parser" : { + "timestampSpec" : { + "column" : "timestamp" + }, + "data" : { + "format" : "json", + "dimensions" : [ "dim1", "dim2", "dim3" ] + } + } + } +} +``` + +|property|description|required?| +|--------|-----------|---------| +|type|The task type, this should always be "index".|yes| +|id|The task ID.|no| +|granularitySpec|See [granularitySpec](Tasks.html#Granularity-Spec)|yes| +|spatialDimensions|Dimensions to build spatial indexes over. See [Spatial-Indexing](Spatial-Indexing.html)|no| +|aggregators|The metrics to aggregate in the data set. For more info, see [Aggregations](Aggregations.html)|yes| +|indexGranularity|The rollup granularity for timestamps.|no| +|targetPartitionSize|Used in sharding. Determines how many rows are in each segment.|no| +|firehose|The input source of data. For more info, see [Firehose](Firehose.html)|yes| +|rowFlushBoundary|Used in determining when intermediate persist should occur to disk.|no| + +#### Index Hadoop Task + +The Hadoop Index Task is used to index larger data sets that require the parallelization and processing power of a Hadoop cluster. + +``` +{ + "type" : "index_hadoop", + "config": +} +``` + +|property|description|required?| +|--------|-----------|---------| +|type|The task type, this should always be "index_hadoop".|yes| +|config|See [Batch Ingestion](Batch-ingestion.html)|yes| + +#### Index Realtime Task + +Segment Merging Tasks +--------------------- + +#### Append Task Append tasks append a list of segments together into a single segment (one after the other). The grammar is: - { - "id": , - "dataSource": , - "segments": - } +``` +{ + "id": , + "dataSource": , + "segments": +} +``` -Merge Task ----------- +#### Merge Task Merge tasks merge a list of segments together. Any common timestamps are merged. The grammar is: - { - "id": , - "dataSource": , - "segments": - } +``` +{ + "id": , + "dataSource": , + "segments": +} +``` -Delete Task ------------ +Segment Destroying Tasks +------------------------ + +#### Delete Task Delete tasks create empty segments with no data. The grammar is: - { - "id": , - "dataSource": , - "segments": - } +``` +{ + "id": , + "dataSource": , + "segments": +} +``` -Kill Task ---------- +#### Kill Task Kill tasks delete all information about a segment and removes it from deep storage. Killable segments must be disabled (used==0) in the Druid segment table. The available grammar is: - { - "id": , - "dataSource": , - "segments": - } +``` +{ + "id": , + "dataSource": , + "segments": +} +``` -Index Task ----------- +Misc. Tasks +----------- -Index Partitions Task ---------------------- +#### Version Converter Task -Index Generator Task --------------------- +These tasks convert segments from an existing older index version to the latest index version. The available grammar is: -Index Hadoop Task ------------------ +``` +{ + "id": , + "groupId" : , + "dataSource": , + "interval" : , + "segment": +} +``` -Index Realtime Task -------------------- +#### Noop Task -Version Converter Task ----------------------- +These tasks start, sleep for a time and are used only for testing. The available grammar is: -Version Converter SubTask -------------------------- +``` +{ + "id": , + "interval" : , + "runTime" : , + "firehose": +} +``` + +Locking +------- +Once an overlord node accepts a task, a lock is created for the data source and interval specified in the task. Tasks do not need to explicitly release locks, they are released upon task completion. Tasks may potentially release locks early if they desire. Tasks ids are unique by naming them using UUIDs or the timestamp in which the task was created. Tasks are also part of a "task group", which is a set of tasks that can share interval locks. diff --git a/docs/content/Tutorial:-Loading-Your-Data-Part-1.md b/docs/content/Tutorial:-Loading-Your-Data-Part-1.md new file mode 100644 index 00000000000..a18009d7f98 --- /dev/null +++ b/docs/content/Tutorial:-Loading-Your-Data-Part-1.md @@ -0,0 +1,335 @@ +--- +layout: doc_page +--- +In our last [tutorial](Tutorial:-The-Druid-Cluster.html), we setup a complete Druid cluster. We created all the Druid dependencies and loaded some batched data. Druid shards data into self contained chunks known as segments. Segments are the fundamental unit of storage in Druid and all Druid nodes only understand segments. + +In this tutorial, we will learn how to create segments using the final piece of the Druid Cluster, the [indexing service](Indexing-Service.html). The indexing service is a standalone service that accepts [tasks](Tasks.html) in the form of POST requests. Tasks often relate to indexing data, and hence, segments are often the final output of a task. + +About the data +-------------- + +The data source we'll be working with is Wikipedia edits. Each time an edit is made in Wikipedia, an event gets pushed to an IRC channel associated with the language of the Wikipedia page. We scrape IRC channels for several different languages and load this data into Druid. + +Each event has a timestamp indicating the time of the edit (in UTC time), a list of dimensions indicating various metadata about the event (such as information about the user editing the page and where the user resides), and a list of metrics associated with the event (such as the number of characters added and deleted). + +Specifically. the data schema looks like so: + +Dimensions (things to filter on): + +```json +"page" +"language" +"user" +"unpatrolled" +"newPage" +"robot" +"anonymous" +"namespace" +"continent" +"country" +"region" +"city" +``` + +Metrics (things to aggregate over): + +```json +"count" +"added" +"delta" +"deleted" +``` + +These metrics track the number of characters added, deleted, and changed. + +Setting Up +---------- + +There are two ways to setup Druid: download a tarball, or [Build From Source](Build From Source.html). You only need to do one of these. + +### Download a Tarball + +We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.0-bin.tar.gz) +Download this file to a directory of your choosing. + +You can extract the awesomeness within by issuing: + +``` +tar -zxvf druid-services-*-bin.tar.gz +``` + +Not too lost so far right? That's great! If you cd into the directory: + +``` +cd druid-services-0.6.0 +``` + +You should see a bunch of files: + +* run_example_server.sh +* run_example_client.sh +* LICENSE, config, examples, lib directories + +Running Example Scripts +----------------------- + +Let's start doing stuff. You can start a Druid [Realtime](Realtime.html) node by issuing: + +``` +./run_example_server.sh +``` + +Select "wikipedia". + +Once the node starts up you will see a bunch of logs about setting up properties and connecting to the data source. If everything was successful, you should see messages of the form shown below. + +``` +2013-09-04 19:33:11,922 INFO [main] org.eclipse.jetty.server.AbstractConnector - Started SelectChannelConnector@0.0.0.0:8083 +2013-09-04 19:33:11,946 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - irc connection to server [irc.wikimedia.org] established +2013-09-04 19:33:11,946 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #en.wikipedia +2013-09-04 19:33:11,946 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #fr.wikipedia +2013-09-04 19:33:11,946 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #de.wikipedia +2013-09-04 19:33:11,946 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #ja.wikipedia +``` + +The Druid real time-node ingests events in an in-memory buffer. Periodically, these events will be persisted to disk. If you are interested in the details of our real-time architecture and why we persist indexes to disk, I suggest you read our [White Paper](http://static.druid.io/docs/druid.pdf). + +Okay, things are about to get real-time. To query the real-time node you've spun up, you can issue: + +``` +./run_example_client.sh +``` + +Select "wikipedia" once again. This script issues [GroupByQuery](GroupByQuery.html)s to the data we've been ingesting. The query looks like this: + +```json +{ + "queryType":"groupBy", + "dataSource":"wikipedia", + "granularity":"minute", + "dimensions":[ "page" ], + "aggregations":[ + {"type":"count", "name":"rows"}, + {"type":"longSum", "fieldName":"edit_count", "name":"count"} + ], + "filter":{ "type":"selector", "dimension":"namespace", "value":"article" }, + "intervals":[ "2013-06-01T00:00/2020-01-01T00" ] +} +``` + +This is a **groupBy** query, which you may be familiar with from SQL. We are grouping, or aggregating, via the `dimensions` field: `["page"]`. We are **filtering** via the `namespace` dimension, to only look at edits on `articles`. Our **aggregations** are what we are calculating: a count of the number of data rows, and a count of the number of edits that have occurred. + +The result looks something like this: + +```json +[ + { + "version": "v1", + "timestamp": "2013-09-04T21:44:00.000Z", + "event": { "count": 0, "page": "2013\u201314_Brentford_F.C._season", "rows": 1 } + }, + { + "version": "v1", + "timestamp": "2013-09-04T21:44:00.000Z", + "event": { "count": 0, "page": "8e_\u00e9tape_du_Tour_de_France_2013", "rows": 1 } + }, + { + "version": "v1", + "timestamp": "2013-09-04T21:44:00.000Z", + "event": { "count": 0, "page": "Agenda_of_the_Tea_Party_movement", "rows": 1 } + }, +... +``` + +This groupBy query is a bit complicated and we'll return to it later. For the time being, just make sure you are getting some blocks of data back. If you are having problems, make sure you have [curl](http://curl.haxx.se/) installed. Control+C to break out of the client script. + +h2. Querying Druid + +In your favorite editor, create the file: + +``` +time_boundary_query.body +``` + +Druid queries are JSON blobs which are relatively painless to create programmatically, but an absolute pain to write by hand. So anyway, we are going to create a Druid query by hand. Add the following to the file you just created: + +``` +{ + "queryType": "timeBoundary", + "dataSource": "wikipedia" +} +``` + +The [TimeBoundaryQuery](TimeBoundaryQuery.html) is one of the simplest Druid queries. To run the query, you can issue: + +``` +curl -X POST 'http://localhost:8083/druid/v2/?pretty' -H 'content-type: application/json' -d @time_boundary_query.body +``` + +We get something like this JSON back: + +```json +[ { + "timestamp" : "2013-09-04T21:44:00.000Z", + "result" : { + "minTime" : "2013-09-04T21:44:00.000Z", + "maxTime" : "2013-09-04T21:47:00.000Z" + } +} ] +``` + +As you can probably tell, the result is indicating the maximum and minimum timestamps we've seen thus far (summarized to a minutely granularity). Let's explore a bit further. + +Return to your favorite editor and create the file: + +``` +timeseries_query.body +``` + +We are going to make a slightly more complicated query, the [TimeseriesQuery](TimeseriesQuery.html). Copy and paste the following into the file: + +``` +{ + "queryType": "timeseries", + "dataSource": "wikipedia", + "intervals": [ "2010-01-01/2020-01-01" ], + "granularity": "all", + "aggregations": [ + {"type": "longSum", "fieldName": "count", "name": "edit_count"}, + {"type": "doubleSum", "fieldName": "added", "name": "chars_added"} + ] +} +``` + +You are probably wondering, what are these [Granularities](Granularities.html) and [Aggregations](Aggregations.html) things? What the query is doing is aggregating some metrics over some span of time. +To issue the query and get some results, run the following in your command line: + +``` +curl -X POST 'http://localhost:8083/druid/v2/?pretty' -H 'content-type: application/json' -d ````timeseries_query.body +``` + +Once again, you should get a JSON blob of text back with your results, that looks something like this: + +```json +[ { + "timestamp" : "2013-09-04T21:44:00.000Z", + "result" : { "chars_added" : 312670.0, "edit_count" : 733 } +} ] +``` + +If you issue the query again, you should notice your results updating. + +Right now all the results you are getting back are being aggregated into a single timestamp bucket. What if we wanted to see our aggregations on a per minute basis? What field can we change in the query to accomplish this? + +If you loudly exclaimed "we can change granularity to minute", you are absolutely correct! We can specify different granularities to bucket our results, like so: + +``` +{ + "queryType": "timeseries", + "dataSource": "wikipedia", + "intervals": [ "2010-01-01/2020-01-01" ], + "granularity": "minute", + "aggregations": [ + {"type": "longSum", "fieldName": "count", "name": "edit_count"}, + {"type": "doubleSum", "fieldName": "added", "name": "chars_added"} + ] +} +``` + +This gives us something like the following: + +```json +[ + { + "timestamp" : "2013-09-04T21:44:00.000Z", + "result" : { "chars_added" : 30665.0, "edit_count" : 128 } + }, + { + "timestamp" : "2013-09-04T21:45:00.000Z", + "result" : { "chars_added" : 122637.0, "edit_count" : 167 } + }, + { + "timestamp" : "2013-09-04T21:46:00.000Z", + "result" : { "chars_added" : 78938.0, "edit_count" : 159 } + }, +... +] +``` + +Solving a Problem +----------------- + +One of Druid's main powers is to provide answers to problems, so let's pose a problem. What if we wanted to know what the top pages in the US are, ordered by the number of edits over the last few minutes you've been going through this tutorial? To solve this problem, we have to return to the query we introduced at the very beginning of this tutorial, the [GroupByQuery](GroupByQuery.html). It would be nice if we could group by results by dimension value and somehow sort those results... and it turns out we can! + +Let's create the file: + +``` +group_by_query.body +``` + +and put the following in there: + +``` +{ + "queryType": "groupBy", + "dataSource": "wikipedia", + "granularity": "all", + "dimensions": [ "page" ], + "orderBy": { + "type": "default", + "columns": [ { "dimension": "edit_count", "direction": "DESCENDING" } ], + "limit": 10 + }, + "aggregations": [ + {"type": "longSum", "fieldName": "count", "name": "edit_count"} + ], + "filter": { "type": "selector", "dimension": "country", "value": "United States" }, + "intervals": ["2012-10-01T00:00/2020-01-01T00"] +} +``` + +Woah! Our query just got a way more complicated. Now we have these [Filters](Filters.html) things and this [OrderBy](OrderBy.html) thing. Fear not, it turns out the new objects we've introduced to our query can help define the format of our results and provide an answer to our question. + +If you issue the query: + +``` +curl -X POST 'http://localhost:8083/druid/v2/?pretty' -H 'content-type: application/json' -d @group_by_query.body +``` + +You should see an answer to our question. As an example, some results are shown below: + +```json +[ + { + "version" : "v1", + "timestamp" : "2012-10-01T00:00:00.000Z", + "event" : { "page" : "RTC_Transit", "edit_count" : 6 } + }, + { + "version" : "v1", + "timestamp" : "2012-10-01T00:00:00.000Z", + "event" : { "page" : "List_of_Deadly_Women_episodes", "edit_count" : 4 } + }, + { + "version" : "v1", + "timestamp" : "2012-10-01T00:00:00.000Z", + "event" : { "page" : "User_talk:David_Biddulph", "edit_count" : 4 } + }, +... +``` + +Feel free to tweak other query parameters to answer other questions you may have about the data. + +Next Steps +---------- + +What to know even more information about the Druid Cluster? Check out [Tutorial: The Druid Cluster](Tutorial:-The-Druid-Cluster.html) + +Druid is even more fun if you load your own data into it! To learn how to load your data, see [Loading Your Data](Loading-Your-Data.html). + +Additional Information +---------------------- + +This tutorial is merely showcasing a small fraction of what Druid can do. If you are interested in more information about Druid, including setting up a more sophisticated Druid cluster, please read the other links in our wiki. + +And thus concludes our journey! Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our [google groups page](http://www.groups.google.com/forum/#!forum/druid-development). diff --git a/docs/content/Loading-Your-Data.md b/docs/content/Tutorial:-Loading-Your-Data-Part-2.md similarity index 100% rename from docs/content/Loading-Your-Data.md rename to docs/content/Tutorial:-Loading-Your-Data-Part-2.md diff --git a/docs/content/Tutorial:-The-Druid-Cluster.md b/docs/content/Tutorial:-The-Druid-Cluster.md index 780c0abac42..ee3f1106563 100644 --- a/docs/content/Tutorial:-The-Druid-Cluster.md +++ b/docs/content/Tutorial:-The-Druid-Cluster.md @@ -1,7 +1,7 @@ --- layout: doc_page --- -Welcome back! In our first [tutorial](Tutorial:-A-First-Look-at-Druid), we introduced you to the most basic Druid setup: a single realtime node. We streamed in some data and queried it. Realtime nodes collect very recent data and periodically hand that data off to the rest of the Druid cluster. Some questions about the architecture must naturally come to mind. What does the rest of Druid cluster look like? How does Druid load available static data? +Welcome back! In our first [tutorial](Tutorial:-A-First-Look-at-Druid.html), we introduced you to the most basic Druid setup: a single realtime node. We streamed in some data and queried it. Realtime nodes collect very recent data and periodically hand that data off to the rest of the Druid cluster. Some questions about the architecture must naturally come to mind. What does the rest of Druid cluster look like? How does Druid load available static data? This tutorial will hopefully answer these questions! diff --git a/indexing-service/src/main/java/io/druid/indexing/overlord/ForkingTaskRunner.java b/indexing-service/src/main/java/io/druid/indexing/overlord/ForkingTaskRunner.java index 1e69516db95..166c0168395 100644 --- a/indexing-service/src/main/java/io/druid/indexing/overlord/ForkingTaskRunner.java +++ b/indexing-service/src/main/java/io/druid/indexing/overlord/ForkingTaskRunner.java @@ -46,6 +46,7 @@ import io.druid.indexing.common.task.Task; import io.druid.indexing.common.tasklogs.TaskLogPusher; import io.druid.indexing.common.tasklogs.TaskLogStreamer; import io.druid.indexing.overlord.config.ForkingTaskRunnerConfig; +import io.druid.indexing.worker.config.WorkerConfig; import io.druid.server.DruidNode; import org.apache.commons.io.FileUtils; @@ -84,6 +85,7 @@ public class ForkingTaskRunner implements TaskRunner, TaskLogStreamer @Inject public ForkingTaskRunner( ForkingTaskRunnerConfig config, + WorkerConfig workerConfig, Properties props, TaskLogPusher taskLogPusher, ObjectMapper jsonMapper, @@ -96,7 +98,7 @@ public class ForkingTaskRunner implements TaskRunner, TaskLogStreamer this.jsonMapper = jsonMapper; this.node = node; - this.exec = MoreExecutors.listeningDecorator(Executors.newFixedThreadPool(config.maxForks())); + this.exec = MoreExecutors.listeningDecorator(Executors.newFixedThreadPool(workerConfig.getCapacity())); } @Override diff --git a/indexing-service/src/main/java/io/druid/indexing/overlord/ForkingTaskRunnerFactory.java b/indexing-service/src/main/java/io/druid/indexing/overlord/ForkingTaskRunnerFactory.java index 4a090f4fa47..226b142998b 100644 --- a/indexing-service/src/main/java/io/druid/indexing/overlord/ForkingTaskRunnerFactory.java +++ b/indexing-service/src/main/java/io/druid/indexing/overlord/ForkingTaskRunnerFactory.java @@ -24,6 +24,7 @@ import com.google.inject.Inject; import io.druid.guice.annotations.Self; import io.druid.indexing.common.tasklogs.TaskLogPusher; import io.druid.indexing.overlord.config.ForkingTaskRunnerConfig; +import io.druid.indexing.worker.config.WorkerConfig; import io.druid.server.DruidNode; import java.util.Properties; @@ -33,6 +34,7 @@ import java.util.Properties; public class ForkingTaskRunnerFactory implements TaskRunnerFactory { private final ForkingTaskRunnerConfig config; + private final WorkerConfig workerConfig; private final Properties props; private final ObjectMapper jsonMapper; private final TaskLogPusher persistentTaskLogs; @@ -41,12 +43,14 @@ public class ForkingTaskRunnerFactory implements TaskRunnerFactory @Inject public ForkingTaskRunnerFactory( final ForkingTaskRunnerConfig config, + final WorkerConfig workerConfig, final Properties props, final ObjectMapper jsonMapper, final TaskLogPusher persistentTaskLogs, @Self DruidNode node ) { this.config = config; + this.workerConfig = workerConfig; this.props = props; this.jsonMapper = jsonMapper; this.persistentTaskLogs = persistentTaskLogs; @@ -56,6 +60,6 @@ public class ForkingTaskRunnerFactory implements TaskRunnerFactory @Override public TaskRunner build() { - return new ForkingTaskRunner(config, props, persistentTaskLogs, jsonMapper, node); + return new ForkingTaskRunner(config, workerConfig, props, persistentTaskLogs, jsonMapper, node); } } diff --git a/indexing-service/src/main/java/io/druid/indexing/overlord/RemoteTaskRunner.java b/indexing-service/src/main/java/io/druid/indexing/overlord/RemoteTaskRunner.java index fb212a164d9..d4dfb8e0d95 100644 --- a/indexing-service/src/main/java/io/druid/indexing/overlord/RemoteTaskRunner.java +++ b/indexing-service/src/main/java/io/druid/indexing/overlord/RemoteTaskRunner.java @@ -748,7 +748,7 @@ public class RemoteTaskRunner implements TaskRunner, TaskLogStreamer ); sortedWorkers.addAll(zkWorkers.values()); final String configMinWorkerVer = workerSetupData.get().getMinVersion(); - final String minWorkerVer = configMinWorkerVer == null ? config.getWorkerVersion() : configMinWorkerVer; + final String minWorkerVer = configMinWorkerVer == null ? config.getMinWorkerVersion() : configMinWorkerVer; for (ZkWorker zkWorker : sortedWorkers) { if (zkWorker.canRunTask(task) && zkWorker.isValidVersion(minWorkerVer)) { diff --git a/indexing-service/src/main/java/io/druid/indexing/overlord/config/EC2AutoScalingStrategyConfig.java b/indexing-service/src/main/java/io/druid/indexing/overlord/config/EC2AutoScalingStrategyConfig.java deleted file mode 100644 index ddd52bf4ded..00000000000 --- a/indexing-service/src/main/java/io/druid/indexing/overlord/config/EC2AutoScalingStrategyConfig.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Druid - a distributed column store. - * Copyright (C) 2012, 2013 Metamarkets Group Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - -package io.druid.indexing.overlord.config; - -import org.skife.config.Config; -import org.skife.config.Default; -import org.skife.config.DefaultNull; - -/** - */ -public abstract class EC2AutoScalingStrategyConfig -{ - @Config("druid.indexer.worker.port") - @Default("8080") - public abstract String getWorkerPort(); - - @Config("druid.indexer.worker.version") - @DefaultNull - public abstract String getWorkerVersion(); -} diff --git a/indexing-service/src/main/java/io/druid/indexing/overlord/config/ForkingTaskRunnerConfig.java b/indexing-service/src/main/java/io/druid/indexing/overlord/config/ForkingTaskRunnerConfig.java index 6f941874076..e2f30e4235b 100644 --- a/indexing-service/src/main/java/io/druid/indexing/overlord/config/ForkingTaskRunnerConfig.java +++ b/indexing-service/src/main/java/io/druid/indexing/overlord/config/ForkingTaskRunnerConfig.java @@ -29,10 +29,6 @@ import java.util.List; public class ForkingTaskRunnerConfig { - @JsonProperty - @Min(1) - private int maxForks = 1; - @JsonProperty @NotNull private String taskDir = "/tmp/persistent"; @@ -69,11 +65,6 @@ public class ForkingTaskRunnerConfig "file.encoding" ); - public int maxForks() - { - return maxForks; - } - public String getTaskDir() { return taskDir; diff --git a/indexing-service/src/main/java/io/druid/indexing/overlord/config/RemoteTaskRunnerConfig.java b/indexing-service/src/main/java/io/druid/indexing/overlord/config/RemoteTaskRunnerConfig.java index 79c9c2b9186..dddb1671c51 100644 --- a/indexing-service/src/main/java/io/druid/indexing/overlord/config/RemoteTaskRunnerConfig.java +++ b/indexing-service/src/main/java/io/druid/indexing/overlord/config/RemoteTaskRunnerConfig.java @@ -37,7 +37,7 @@ public class RemoteTaskRunnerConfig private boolean compressZnodes = false; @JsonProperty - private String workerVersion = null; + private String minWorkerVersion = null; @JsonProperty @Min(10 * 1024) @@ -53,9 +53,9 @@ public class RemoteTaskRunnerConfig return compressZnodes; } - public String getWorkerVersion() + public String getMinWorkerVersion() { - return workerVersion; + return minWorkerVersion; } public long getMaxZnodeBytes() diff --git a/indexing-service/src/main/java/io/druid/indexing/worker/config/WorkerConfig.java b/indexing-service/src/main/java/io/druid/indexing/worker/config/WorkerConfig.java index 81f41d43393..d510df4c3ee 100644 --- a/indexing-service/src/main/java/io/druid/indexing/worker/config/WorkerConfig.java +++ b/indexing-service/src/main/java/io/druid/indexing/worker/config/WorkerConfig.java @@ -30,11 +30,11 @@ public class WorkerConfig { @JsonProperty @NotNull - private String ip = null; + private String ip = "localhost"; @JsonProperty @NotNull - private String version = null; + private String version = "0"; @JsonProperty @Min(1) diff --git a/indexing-service/src/main/java/io/druid/indexing/worker/executor/ExecutorLifecycleFactory.java b/indexing-service/src/main/java/io/druid/indexing/worker/executor/ExecutorLifecycleFactory.java deleted file mode 100644 index 8ad0455ec5a..00000000000 --- a/indexing-service/src/main/java/io/druid/indexing/worker/executor/ExecutorLifecycleFactory.java +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Druid - a distributed column store. - * Copyright (C) 2012, 2013 Metamarkets Group Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - -package io.druid.indexing.worker.executor; - -import com.fasterxml.jackson.databind.ObjectMapper; -import io.druid.indexing.overlord.TaskRunner; - -import java.io.File; - -public class ExecutorLifecycleFactory -{ - private final File taskFile; - private final File statusFile; - - public ExecutorLifecycleFactory(File taskFile, File statusFile) - { - this.taskFile = taskFile; - this.statusFile = statusFile; - } - - public ExecutorLifecycle build(TaskRunner taskRunner, ObjectMapper jsonMapper) - { - return new ExecutorLifecycle( - new ExecutorLifecycleConfig().setTaskFile(taskFile).setStatusFile(statusFile), taskRunner, jsonMapper - ); - } -} diff --git a/indexing-service/src/test/java/io/druid/indexing/overlord/TestRemoteTaskRunnerConfig.java b/indexing-service/src/test/java/io/druid/indexing/overlord/TestRemoteTaskRunnerConfig.java index aa714a30b25..dc6b4d59389 100644 --- a/indexing-service/src/test/java/io/druid/indexing/overlord/TestRemoteTaskRunnerConfig.java +++ b/indexing-service/src/test/java/io/druid/indexing/overlord/TestRemoteTaskRunnerConfig.java @@ -45,7 +45,7 @@ public class TestRemoteTaskRunnerConfig extends RemoteTaskRunnerConfig } @Override - public String getWorkerVersion() + public String getMinWorkerVersion() { return ""; } diff --git a/services/src/main/java/io/druid/cli/CliPeon.java b/services/src/main/java/io/druid/cli/CliPeon.java index b874ec69a7b..4525ca1889e 100644 --- a/services/src/main/java/io/druid/cli/CliPeon.java +++ b/services/src/main/java/io/druid/cli/CliPeon.java @@ -118,7 +118,7 @@ public class CliPeon extends GuiceRunnable binder.bind(TaskToolboxFactory.class).in(LazySingleton.class); JsonConfigProvider.bind(binder, "druid.indexer.task", TaskConfig.class); - JsonConfigProvider.bind(binder, "druid.worker.taskActionClient.retry", RetryPolicyConfig.class); + JsonConfigProvider.bind(binder, "druid.peon.taskActionClient.retry", RetryPolicyConfig.class); configureTaskActionClient(binder); diff --git a/services/src/main/java/io/druid/cli/convert/ConvertProperties.java b/services/src/main/java/io/druid/cli/convert/ConvertProperties.java index 613db784e2a..d5a36554c89 100644 --- a/services/src/main/java/io/druid/cli/convert/ConvertProperties.java +++ b/services/src/main/java/io/druid/cli/convert/ConvertProperties.java @@ -85,7 +85,7 @@ public class ConvertProperties implements Runnable new Rename("druid.indexer.fork.startport", "druid.indexer.runner.startPort"), new Rename("druid.indexer.properties.prefixes", "druid.indexer.runner.allowedPrefixes"), new Rename("druid.indexer.taskAssignmentTimeoutDuration", "druid.indexer.runner.taskAssignmentTimeout"), - new Rename("druid.indexer.worker.version", "druid.indexer.runner.workerVersion"), + new Rename("druid.indexer.worker.version", "druid.indexer.runner.minWorkerVersion"), new Rename("druid.zk.maxNumBytes", "druid.indexer.runner.maxZnodeBytes"), new Rename("druid.indexer.provisionResources.duration", "druid.indexer.autoscale.provisionPeriod"), new Rename("druid.indexer.terminateResources.duration", "druid.indexer.autoscale.terminatePeriod"), From a9a723bd11dd49226affeb3a687e37a973d77d18 Mon Sep 17 00:00:00 2001 From: fjy Date: Wed, 9 Oct 2013 15:42:39 -0700 Subject: [PATCH 02/30] clean up poms, add a new loading your own data tutorial, add new validation, clean up logs --- cassandra-storage/pom.xml | 2 +- common/pom.xml | 2 +- docs/content/Indexing-Service.md | 4 +- docs/content/Tasks.md | 97 ++++- .../Tutorial:-Loading-Your-Data-Part-1.md | 394 +++++++----------- .../bin/examples/indexing/wikipedia_data.json | 5 + .../indexing/wikipedia_index_task.json | 39 ++ examples/config/overlord/runtime.properties | 14 + examples/pom.xml | 11 +- hdfs-storage/pom.xml | 2 +- indexing-hadoop/pom.xml | 4 +- indexing-service/pom.xml | 13 +- processing/pom.xml | 4 +- s3-extensions/pom.xml | 2 +- server/pom.xml | 6 +- services/pom.xml | 19 +- services/src/assembly/assembly.xml | 7 + services/src/main/java/io/druid/cli/Main.java | 3 +- .../cli/validate/DruidJsonValidator.java | 78 ++++ 19 files changed, 422 insertions(+), 284 deletions(-) create mode 100644 examples/bin/examples/indexing/wikipedia_data.json create mode 100644 examples/bin/examples/indexing/wikipedia_index_task.json create mode 100644 examples/config/overlord/runtime.properties create mode 100644 services/src/main/java/io/druid/cli/validate/DruidJsonValidator.java diff --git a/cassandra-storage/pom.xml b/cassandra-storage/pom.xml index 76af3436ca2..be57860cfa1 100644 --- a/cassandra-storage/pom.xml +++ b/cassandra-storage/pom.xml @@ -20,7 +20,7 @@ 4.0.0 - com.metamx.druid + io.druid druid-cassandra-storage druid-cassandra-storage druid-cassandra-storage diff --git a/common/pom.xml b/common/pom.xml index af29a79f989..5f6d62b77e4 100644 --- a/common/pom.xml +++ b/common/pom.xml @@ -20,7 +20,7 @@ 4.0.0 - com.metamx.druid + io.druid druid-common druid-common druid-common diff --git a/docs/content/Indexing-Service.md b/docs/content/Indexing-Service.md index 8f733a0f83c..350c475af96 100644 --- a/docs/content/Indexing-Service.md +++ b/docs/content/Indexing-Service.md @@ -29,10 +29,10 @@ With the following JVM configuration: -Ddruid.zk.service.host=localhost -Ddruid.db.connector.connectURI=jdbc:mysql://localhost:3306/druid --Ddruid.db.connector.user=driud +-Ddruid.db.connector.user=druid -Ddruid.db.connector.password=diurd --Ddruid.selectors.indexing.serviceName=indexer +-Ddruid.selectors.indexing.serviceName=overlord -Ddruid.indexer.runner.javaOpts="-server -Xmx1g" -Ddruid.indexer.runner.startPort=8081 -Ddruid.indexer.fork.property.druid.computation.buffer.size=268435456 diff --git a/docs/content/Tasks.md b/docs/content/Tasks.md index b4443a3617a..0510cc17a14 100644 --- a/docs/content/Tasks.md +++ b/docs/content/Tasks.md @@ -74,7 +74,102 @@ The Hadoop Index Task is used to index larger data sets that require the paralle |type|The task type, this should always be "index_hadoop".|yes| |config|See [Batch Ingestion](Batch-ingestion.html)|yes| -#### Index Realtime Task +#### Realtime Index Task + +The indexing service can also run real-time tasks. These tasks effectively transform a middle manager into a real-time node. We introduced real-time tasks as a way to programmatically add new real-time data sources without needing to manually add nodes. The grammar for the real-time task is as follows: + +``` +{ + "type" : "index_realtime", + "id": "example, + "resource": { + "availabilityGroup" : "someGroup", + "requiredCapacity" : 1 + }, + "schema": { + "dataSource": "dataSourceName", + "aggregators": [ + { + "type": "count", + "name": "events" + }, + { + "type": "doubleSum", + "name": "outColumn", + "fieldName": "inColumn" + } + ], + "indexGranularity": "minute", + "shardSpec": { + "type": "none" + } + }, + "firehose": { + "type": "kafka-0.7.2", + "consumerProps": { + "zk.connect": "zk_connect_string", + "zk.connectiontimeout.ms": "15000", + "zk.sessiontimeout.ms": "15000", + "zk.synctime.ms": "5000", + "groupid": "consumer-group", + "fetch.size": "1048586", + "autooffset.reset": "largest", + "autocommit.enable": "false" + }, + "feed": "your_kafka_topic", + "parser": { + "timestampSpec": { + "column": "timestamp", + "format": "iso" + }, + "data": { + "format": "json" + }, + "dimensionExclusions": [ + "value" + ] + } + }, + "fireDepartmentConfig": { + "maxRowsInMemory": 500000, + "intermediatePersistPeriod": "PT10m" + }, + "windowPeriod": "PT10m", + "segmentGranularity": "hour", + "rejectionPolicy": { + "type": "messageTime" + } +} +``` + +Id: +The ID of the task. Not required. + +Resource: +A JSON object used for high availability purposes. Not required. + +|Field|Type|Description|Required| +|-----|----|-----------|--------| +|availabilityGroup|String|An uniqueness identifier for the task. Tasks with the same availability group will always run on different middle managers. Used mainly for replication. |yes| +|requiredCapacity|Integer|How much middle manager capacity this task will take.|yes| + +Schema: +See [Schema](Realtime.html#Schema). + +Fire Department Config: +See [Config](Realtime.html#Config). + +Firehose: +See [Firehose](Firehose.html). + +Window Period: +See [Realtime](Realtime.html). + +Segment Granularity: +See [Realtime](Realtime.html). + +Rejection Policy: +See [Realtime](Realtime.html). Segment Merging Tasks --------------------- diff --git a/docs/content/Tutorial:-Loading-Your-Data-Part-1.md b/docs/content/Tutorial:-Loading-Your-Data-Part-1.md index a18009d7f98..684e0ad29c1 100644 --- a/docs/content/Tutorial:-Loading-Your-Data-Part-1.md +++ b/docs/content/Tutorial:-Loading-Your-Data-Part-1.md @@ -1,18 +1,14 @@ --- layout: doc_page --- -In our last [tutorial](Tutorial:-The-Druid-Cluster.html), we setup a complete Druid cluster. We created all the Druid dependencies and loaded some batched data. Druid shards data into self contained chunks known as segments. Segments are the fundamental unit of storage in Druid and all Druid nodes only understand segments. +In our last [tutorial](Tutorial:-The-Druid-Cluster.html), we setup a complete Druid cluster. We created all the Druid dependencies and loaded some batched data. Druid shards data into self-contained chunks known as [segments](Segments.html). Segments are the fundamental unit of storage in Druid and all Druid nodes only understand segments. -In this tutorial, we will learn how to create segments using the final piece of the Druid Cluster, the [indexing service](Indexing-Service.html). The indexing service is a standalone service that accepts [tasks](Tasks.html) in the form of POST requests. Tasks often relate to indexing data, and hence, segments are often the final output of a task. +In this tutorial, we will learn about batch ingestion (as opposed to real-time ingestion) and how to create segments using the final piece of the Druid Cluster, the [indexing service](Indexing-Service.html). The indexing service is a standalone service that accepts [tasks](Tasks.html) in the form of POST requests. The output of most tasks are segments. About the data -------------- -The data source we'll be working with is Wikipedia edits. Each time an edit is made in Wikipedia, an event gets pushed to an IRC channel associated with the language of the Wikipedia page. We scrape IRC channels for several different languages and load this data into Druid. - -Each event has a timestamp indicating the time of the edit (in UTC time), a list of dimensions indicating various metadata about the event (such as information about the user editing the page and where the user resides), and a list of metrics associated with the event (such as the number of characters added and deleted). - -Specifically. the data schema looks like so: +The data source we'll be working with is Wikipedia edits once again. The data schema is the same as the previous tutorials: Dimensions (things to filter on): @@ -39,297 +35,215 @@ Metrics (things to aggregate over): "delta" "deleted" ``` - -These metrics track the number of characters added, deleted, and changed. - Setting Up ---------- -There are two ways to setup Druid: download a tarball, or [Build From Source](Build From Source.html). You only need to do one of these. +At this point, you should already have Druid downloaded and are comfortable with running a Druid cluster locally. If you are not, stop here and familiarize yourself with the first two tutorials. -### Download a Tarball +Let's start from our usual starting point in the tarball directory. -We've built a tarball that contains everything you'll need. You'll find it [here](http://static.druid.io/artifacts/releases/druid-services-0.6.0-bin.tar.gz) -Download this file to a directory of your choosing. - -You can extract the awesomeness within by issuing: +Segments require data, so before we can build a Druid segment, we are going to need some raw data. Make sure that the following file exists: ``` -tar -zxvf druid-services-*-bin.tar.gz +examples/indexing/wikipedia_data.json ``` -Not too lost so far right? That's great! If you cd into the directory: +Open the file and make sure the following events exist: ``` -cd druid-services-0.6.0 +{"timestamp": "2013-08-31T01:02:33Z", "page": "Gypsy Danger", "language" : "en", "user" : "nuclear", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143} +{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Dingo Land", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330} +{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Vodka Land", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111} +{"timestamp": "2013-08-31T11:58:39Z", "page": "Crimson Typhoon", "language" : "zh", "user" : "triplets", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"China", "region":"Shanxi", "city":"Taiyuan", "added": 905, "deleted": 5, "delta": 900} +{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "cancer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9} ``` -You should see a bunch of files: +There are five data points spread across the day of 2013-08-31. Talk about big data right? Thankfully, we don't need a ton of data to introduce how batch ingestion works. -* run_example_server.sh -* run_example_client.sh -* LICENSE, config, examples, lib directories +In order to ingest and query this data, we are going to need to run a historical node, a coordinator node, and an indexing service to run the batch ingestion. -Running Example Scripts ------------------------ +#### Starting a Local Indexing Service -Let's start doing stuff. You can start a Druid [Realtime](Realtime.html) node by issuing: +The simplest indexing service we can start up is to run an [overlord](Indexing-Service.html) node in local mode. You can do so by issuing: -``` -./run_example_server.sh +```bash +java -Xmx2g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/overlord io.druid.cli.Main server overlord ``` -Select "wikipedia". - -Once the node starts up you will see a bunch of logs about setting up properties and connecting to the data source. If everything was successful, you should see messages of the form shown below. - +The overlord configurations should already exist in: ``` -2013-09-04 19:33:11,922 INFO [main] org.eclipse.jetty.server.AbstractConnector - Started SelectChannelConnector@0.0.0.0:8083 -2013-09-04 19:33:11,946 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - irc connection to server [irc.wikimedia.org] established -2013-09-04 19:33:11,946 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #en.wikipedia -2013-09-04 19:33:11,946 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #fr.wikipedia -2013-09-04 19:33:11,946 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #de.wikipedia -2013-09-04 19:33:11,946 INFO [ApiDaemon] io.druid.segment.realtime.firehose.IrcFirehoseFactory - Joining channel #ja.wikipedia +config/overlord/runtime.properties ``` -The Druid real time-node ingests events in an in-memory buffer. Periodically, these events will be persisted to disk. If you are interested in the details of our real-time architecture and why we persist indexes to disk, I suggest you read our [White Paper](http://static.druid.io/docs/druid.pdf). - -Okay, things are about to get real-time. To query the real-time node you've spun up, you can issue: - +The configurations for the overlord node are as follows: ``` -./run_example_client.sh +druid.host=localhost +druid.port=8087 +druid.service=overlord + +druid.zk.service.host=localhost + +druid.db.connector.connectURI=jdbc:mysql://localhost:3306/druid +druid.db.connector.user=druid +druid.db.connector.password=diurd + +druid.selectors.indexing.serviceName=overlord +druid.indexer.runner.javaOpts="-server -Xmx1g" +druid.indexer.runner.startPort=8088 +druid.indexer.fork.property.druid.computation.buffer.size=268435456 ``` -Select "wikipedia" once again. This script issues [GroupByQuery](GroupByQuery.html)s to the data we've been ingesting. The query looks like this: +If you are interested in reading more about these configurations, see [here](Indexing-Service.html). -```json +When the overlord node is ready for tasks, you should see a message like the following: +``` +013-10-09 21:30:32,817 INFO [Thread-14] io.druid.indexing.overlord.TaskQueue - Waiting for work... +``` + +#### Starting Other Nodes + +Just in case you forgot how, let's start up the other nodes we require: + +Coordinator node: + +```bash +java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/coordinator io.druid.cli.Main server coordinator +``` + +Historical node: +```bash +java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath lib/*:config/historical io.druid.cli.Main server historical +``` + +Note: Historical, real-time and broker nodes share the same query interface. Hence, we do not explicitly need a broker node for this tutorial. All queries can go against the historical node directly. + +Once all the nodes are up and running, we are ready to index some data. + +Indexing the Data +----------------- + +To index the data and build a Druid segment, we are going to need to submit a task to the indexing service. This task should already exist: + +``` +examples/indexing/index_task.json +``` + +Open up the file to see the following: + +``` { - "queryType":"groupBy", - "dataSource":"wikipedia", - "granularity":"minute", - "dimensions":[ "page" ], - "aggregations":[ - {"type":"count", "name":"rows"}, - {"type":"longSum", "fieldName":"edit_count", "name":"count"} - ], - "filter":{ "type":"selector", "dimension":"namespace", "value":"article" }, - "intervals":[ "2013-06-01T00:00/2020-01-01T00" ] + "type" : "index", + "dataSource" : "wikipedia", + "granularitySpec" : { + "type" : "uniform", + "gran" : "DAY", + "intervals" : [ "2013-08-31/2013-09-01" ] + }, + "aggregators" : [{ + "type" : "count", + "name" : "edit_count" + }, { + "type" : "doubleSum", + "name" : "added", + "fieldName" : "added" + }, { + "type" : "doubleSum", + "name" : "deleted", + "fieldName" : "deleted" + }, { + "type" : "doubleSum", + "name" : "delta", + "fieldName" : "delta" + }], + "firehose" : { + "type" : "local", + "baseDir" : "examples/indexing", + "filter" : "wikipedia_data.json", + "parser" : { + "timestampSpec" : { + "column" : "timestamp" + }, + "data" : { + "format" : "json", + "dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"] + } + } + } } ``` -This is a **groupBy** query, which you may be familiar with from SQL. We are grouping, or aggregating, via the `dimensions` field: `["page"]`. We are **filtering** via the `namespace` dimension, to only look at edits on `articles`. Our **aggregations** are what we are calculating: a count of the number of data rows, and a count of the number of edits that have occurred. +Okay, so what is happening here? The "type" field indicates the type of task we plan to run. In this case, it is a simple "index" task. The "granularitySpec" indicates that we are building a daily segment for 2013-08-31 to 2013-09-01. Next, the "aggregators" indicate which fields in our data set we plan to build metric columns for. The "fieldName" corresponds to the metric name in the raw data. The "name" corresponds to what our metric column is actually going to be called in the segment. Finally, we have a local "firehose" that is going to read data from disk. We tell the firehose where our data is located and the types of files we are looking to ingest. In our case, we only have a single data file. -The result looks something like this: +Let's send our task to the indexing service now: -```json -[ - { - "version": "v1", - "timestamp": "2013-09-04T21:44:00.000Z", - "event": { "count": 0, "page": "2013\u201314_Brentford_F.C._season", "rows": 1 } - }, - { - "version": "v1", - "timestamp": "2013-09-04T21:44:00.000Z", - "event": { "count": 0, "page": "8e_\u00e9tape_du_Tour_de_France_2013", "rows": 1 } - }, - { - "version": "v1", - "timestamp": "2013-09-04T21:44:00.000Z", - "event": { "count": 0, "page": "Agenda_of_the_Tea_Party_movement", "rows": 1 } - }, +``` +curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedia_index_task.json localhost:8087/druid/indexer/v1/task +``` + +Issuing the request should return a task ID like so: + +``` +fjy$ curl -X 'POST' -H 'Content-Type:application/json' -d @examples/indexing/wikipedia_index_task.json localhost:8087/druid/indexer/v1/task +{"task":"index_wikipedia_2013-10-09T21:30:32.802Z"} +fjy$ +``` + +In your indexing service logs, you should see the following: + +```` +2013-10-09 21:41:41,150 INFO [qtp300448720-21] io.druid.indexing.overlord.HeapMemoryTaskStorage - Inserting task index_wikipedia_2013-10-09T21:41:41.147Z with status: TaskStatus{id=index_wikipedia_2013-10-09T21:41:41.147Z, status=RUNNING, duration=-1} +2013-10-09 21:41:41,151 INFO [qtp300448720-21] io.druid.indexing.overlord.TaskLockbox - Created new TaskLockPosse: TaskLockPosse{taskLock=TaskLock{groupId=index_wikipedia_2013-10-09T21:41:41.147Z, dataSource=wikipedia, interval=2013-08-31T00:00:00.000Z/2013-09-01T00:00:00.000Z, version=2013-10-09T21:41:41.151Z}, taskIds=[]} ... -``` +013-10-09 21:41:41,215 INFO [pool-6-thread-1] io.druid.indexing.overlord.ForkingTaskRunner - Logging task index_wikipedia_2013-10-09T21:41:41.147Z_generator_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_0 output to: /tmp/persistent/index_wikipedia_2013-10-09T21:41:41.147Z_generator_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_0/b5099fdb-d6b0-4b81-9053-b2af70336a7e/log +2013-10-09 21:41:45,017 INFO [qtp300448720-22] io.druid.indexing.common.actions.LocalTaskActionClient - Performing action for task[index_wikipedia_2013-10-09T21:41:41.147Z_generator_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_0]: LockListAction{} -This groupBy query is a bit complicated and we'll return to it later. For the time being, just make sure you are getting some blocks of data back. If you are having problems, make sure you have [curl](http://curl.haxx.se/) installed. Control+C to break out of the client script. +```` -h2. Querying Druid - -In your favorite editor, create the file: +After a few seconds, the task should complete and you should see in the indexing service logs: ``` -time_boundary_query.body +2013-10-09 21:41:45,765 INFO [pool-6-thread-1] io.druid.indexing.overlord.exec.TaskConsumer - Received SUCCESS status for task: IndexGeneratorTask{id=index_wikipedia_2013-10-09T21:41:41.147Z_generator_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_0, type=index_generator, dataSource=wikipedia, interval=Optional.of(2013-08-31T00:00:00.000Z/2013-09-01T00:00:00.000Z)} ``` -Druid queries are JSON blobs which are relatively painless to create programmatically, but an absolute pain to write by hand. So anyway, we are going to create a Druid query by hand. Add the following to the file you just created: +Congratulations! The segment has completed building. Once a segment is built, a segment metadata entry is created in your MySQL table. The coordinator compares what is in the segment metadata table with what is in the cluster. A new entry in the metadata table will cause the coordinator to load the new segment in a minute or so. + +You should see the following logs on the coordinator: ``` -{ - "queryType": "timeBoundary", - "dataSource": "wikipedia" -} +2013-10-09 21:41:54,368 INFO [Coordinator-Exec--0] io.druid.server.coordinator.DruidCoordinatorLogger - [_default_tier] : Assigned 1 segments among 1 servers +2013-10-09 21:41:54,369 INFO [Coordinator-Exec--0] io.druid.server.coordinator.DruidCoordinatorLogger - Load Queues: +2013-10-09 21:41:54,369 INFO [Coordinator-Exec--0] io.druid.server.coordinator.DruidCoordinatorLogger - Server[localhost:8081, historical, _default_tier] has 1 left to load, 0 left to drop, 4,477 bytes queued, 4,477 bytes served. ``` -The [TimeBoundaryQuery](TimeBoundaryQuery.html) is one of the simplest Druid queries. To run the query, you can issue: +These logs indicate that the coordinator has assigned our new segment to the historical node to download and serve. If you look at the historical node logs, you should see: ``` -curl -X POST 'http://localhost:8083/druid/v2/?pretty' -H 'content-type: application/json' -d @time_boundary_query.body +2013-10-09 21:41:54,369 INFO [ZkCoordinator-0] io.druid.server.coordination.ZkCoordinator - Loading segment wikipedia_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_2013-10-09T21:41:41.151Z +2013-10-09 21:41:54,369 INFO [ZkCoordinator-0] io.druid.segment.loading.LocalDataSegmentPuller - Unzipping local file[/tmp/druid/localStorage/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0/index.zip] to [/tmp/druid/indexCache/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0] +2013-10-09 21:41:54,370 INFO [ZkCoordinator-0] io.druid.utils.CompressionUtils - Unzipping file[/tmp/druid/localStorage/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0/index.zip] to [/tmp/druid/indexCache/wikipedia/2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z/2013-10-09T21:41:41.151Z/0] +2013-10-09 21:41:54,380 INFO [ZkCoordinator-0] io.druid.server.coordination.SingleDataSegmentAnnouncer - Announcing segment[wikipedia_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_2013-10-09T21:41:41.151Z] to path[/druid/servedSegments/localhost:8081/wikipedia_2013-08-31T00:00:00.000Z_2013-09-01T00:00:00.000Z_2013-10-09T21:41:41.151Z] ``` -We get something like this JSON back: +Once the segment is announced the segment is queryable. Now you should be able to query the data. -```json +Issuing a [TimeBoundaryQuery](TimeBoundaryQuery.html) should yield: + +``` [ { - "timestamp" : "2013-09-04T21:44:00.000Z", + "timestamp" : "2013-08-31T01:02:33.000Z", "result" : { - "minTime" : "2013-09-04T21:44:00.000Z", - "maxTime" : "2013-09-04T21:47:00.000Z" + "minTime" : "2013-08-31T01:02:33.000Z", + "maxTime" : "2013-08-31T12:41:27.000Z" } } ] ``` -As you can probably tell, the result is indicating the maximum and minimum timestamps we've seen thus far (summarized to a minutely granularity). Let's explore a bit further. - -Return to your favorite editor and create the file: - -``` -timeseries_query.body -``` - -We are going to make a slightly more complicated query, the [TimeseriesQuery](TimeseriesQuery.html). Copy and paste the following into the file: - -``` -{ - "queryType": "timeseries", - "dataSource": "wikipedia", - "intervals": [ "2010-01-01/2020-01-01" ], - "granularity": "all", - "aggregations": [ - {"type": "longSum", "fieldName": "count", "name": "edit_count"}, - {"type": "doubleSum", "fieldName": "added", "name": "chars_added"} - ] -} -``` - -You are probably wondering, what are these [Granularities](Granularities.html) and [Aggregations](Aggregations.html) things? What the query is doing is aggregating some metrics over some span of time. -To issue the query and get some results, run the following in your command line: - -``` -curl -X POST 'http://localhost:8083/druid/v2/?pretty' -H 'content-type: application/json' -d ````timeseries_query.body -``` - -Once again, you should get a JSON blob of text back with your results, that looks something like this: - -```json -[ { - "timestamp" : "2013-09-04T21:44:00.000Z", - "result" : { "chars_added" : 312670.0, "edit_count" : 733 } -} ] -``` - -If you issue the query again, you should notice your results updating. - -Right now all the results you are getting back are being aggregated into a single timestamp bucket. What if we wanted to see our aggregations on a per minute basis? What field can we change in the query to accomplish this? - -If you loudly exclaimed "we can change granularity to minute", you are absolutely correct! We can specify different granularities to bucket our results, like so: - -``` -{ - "queryType": "timeseries", - "dataSource": "wikipedia", - "intervals": [ "2010-01-01/2020-01-01" ], - "granularity": "minute", - "aggregations": [ - {"type": "longSum", "fieldName": "count", "name": "edit_count"}, - {"type": "doubleSum", "fieldName": "added", "name": "chars_added"} - ] -} -``` - -This gives us something like the following: - -```json -[ - { - "timestamp" : "2013-09-04T21:44:00.000Z", - "result" : { "chars_added" : 30665.0, "edit_count" : 128 } - }, - { - "timestamp" : "2013-09-04T21:45:00.000Z", - "result" : { "chars_added" : 122637.0, "edit_count" : 167 } - }, - { - "timestamp" : "2013-09-04T21:46:00.000Z", - "result" : { "chars_added" : 78938.0, "edit_count" : 159 } - }, -... -] -``` - -Solving a Problem ------------------ - -One of Druid's main powers is to provide answers to problems, so let's pose a problem. What if we wanted to know what the top pages in the US are, ordered by the number of edits over the last few minutes you've been going through this tutorial? To solve this problem, we have to return to the query we introduced at the very beginning of this tutorial, the [GroupByQuery](GroupByQuery.html). It would be nice if we could group by results by dimension value and somehow sort those results... and it turns out we can! - -Let's create the file: - -``` -group_by_query.body -``` - -and put the following in there: - -``` -{ - "queryType": "groupBy", - "dataSource": "wikipedia", - "granularity": "all", - "dimensions": [ "page" ], - "orderBy": { - "type": "default", - "columns": [ { "dimension": "edit_count", "direction": "DESCENDING" } ], - "limit": 10 - }, - "aggregations": [ - {"type": "longSum", "fieldName": "count", "name": "edit_count"} - ], - "filter": { "type": "selector", "dimension": "country", "value": "United States" }, - "intervals": ["2012-10-01T00:00/2020-01-01T00"] -} -``` - -Woah! Our query just got a way more complicated. Now we have these [Filters](Filters.html) things and this [OrderBy](OrderBy.html) thing. Fear not, it turns out the new objects we've introduced to our query can help define the format of our results and provide an answer to our question. - -If you issue the query: - -``` -curl -X POST 'http://localhost:8083/druid/v2/?pretty' -H 'content-type: application/json' -d @group_by_query.body -``` - -You should see an answer to our question. As an example, some results are shown below: - -```json -[ - { - "version" : "v1", - "timestamp" : "2012-10-01T00:00:00.000Z", - "event" : { "page" : "RTC_Transit", "edit_count" : 6 } - }, - { - "version" : "v1", - "timestamp" : "2012-10-01T00:00:00.000Z", - "event" : { "page" : "List_of_Deadly_Women_episodes", "edit_count" : 4 } - }, - { - "version" : "v1", - "timestamp" : "2012-10-01T00:00:00.000Z", - "event" : { "page" : "User_talk:David_Biddulph", "edit_count" : 4 } - }, -... -``` - -Feel free to tweak other query parameters to answer other questions you may have about the data. - Next Steps ---------- -What to know even more information about the Druid Cluster? Check out [Tutorial: The Druid Cluster](Tutorial:-The-Druid-Cluster.html) - -Druid is even more fun if you load your own data into it! To learn how to load your data, see [Loading Your Data](Loading-Your-Data.html). +This tutorial covered ingesting a small batch data set and loading it into Druid. In [Loading Your Data Part 2](Tutorial-Loading-Your-Data-Part-2.html), we will cover how to ingest data using Hadoop for larger data sets. Additional Information ---------------------- -This tutorial is merely showcasing a small fraction of what Druid can do. If you are interested in more information about Druid, including setting up a more sophisticated Druid cluster, please read the other links in our wiki. - -And thus concludes our journey! Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our [google groups page](http://www.groups.google.com/forum/#!forum/druid-development). +Getting data into Druid can definitely be difficult for first time users. Please don't hesitate to ask questions in our IRC channel or on our [google groups page](http://www.groups.google.com/forum/#!forum/druid-development). diff --git a/examples/bin/examples/indexing/wikipedia_data.json b/examples/bin/examples/indexing/wikipedia_data.json new file mode 100644 index 00000000000..5a30c4a68bc --- /dev/null +++ b/examples/bin/examples/indexing/wikipedia_data.json @@ -0,0 +1,5 @@ +{"timestamp": "2013-08-31T01:02:33Z", "page": "Gypsy Danger", "language" : "en", "user" : "nuclear", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143} +{"timestamp": "2013-08-31T03:32:45Z", "page": "Striker Eureka", "language" : "en", "user" : "speed", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Australia", "country":"Australia", "region":"Dingo Land", "city":"Syndey", "added": 459, "deleted": 129, "delta": 330} +{"timestamp": "2013-08-31T07:11:21Z", "page": "Cherno Alpha", "language" : "ru", "user" : "masterYi", "unpatrolled" : "false", "newPage" : "true", "robot": "true", "anonymous": "false", "namespace":"article", "continent":"Asia", "country":"Russia", "region":"Vodka Land", "city":"Moscow", "added": 123, "deleted": 12, "delta": 111} +{"timestamp": "2013-08-31T11:58:39Z", "page": "Crimson Typhoon", "language" : "zh", "user" : "triplets", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"China", "region":"Shanxi", "city":"Taiyuan", "added": 905, "deleted": 5, "delta": 900} +{"timestamp": "2013-08-31T12:41:27Z", "page": "Coyote Tango", "language" : "ja", "user" : "cancer", "unpatrolled" : "true", "newPage" : "false", "robot": "true", "anonymous": "false", "namespace":"wikipedia", "continent":"Asia", "country":"Japan", "region":"Kanto", "city":"Tokyo", "added": 1, "deleted": 10, "delta": -9} \ No newline at end of file diff --git a/examples/bin/examples/indexing/wikipedia_index_task.json b/examples/bin/examples/indexing/wikipedia_index_task.json new file mode 100644 index 00000000000..10b346187a9 --- /dev/null +++ b/examples/bin/examples/indexing/wikipedia_index_task.json @@ -0,0 +1,39 @@ +{ + "type" : "index", + "dataSource" : "wikipedia", + "granularitySpec" : { + "type" : "uniform", + "gran" : "DAY", + "intervals" : [ "2013-08-31/2013-09-01" ] + }, + "aggregators" : [{ + "type" : "count", + "name" : "edit_count" + }, { + "type" : "doubleSum", + "name" : "added", + "fieldName" : "added" + }, { + "type" : "doubleSum", + "name" : "deleted", + "fieldName" : "deleted" + }, { + "type" : "doubleSum", + "name" : "delta", + "fieldName" : "delta" + }], + "firehose" : { + "type" : "local", + "baseDir" : "examples/indexing/", + "filter" : "wikipedia_data.json", + "parser" : { + "timestampSpec" : { + "column" : "timestamp" + }, + "data" : { + "format" : "json", + "dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"] + } + } + } +} \ No newline at end of file diff --git a/examples/config/overlord/runtime.properties b/examples/config/overlord/runtime.properties new file mode 100644 index 00000000000..c9c4478d4c4 --- /dev/null +++ b/examples/config/overlord/runtime.properties @@ -0,0 +1,14 @@ +druid.host=localhost +druid.port=8087 +druid.service=overlord + +druid.zk.service.host=localhost + +druid.db.connector.connectURI=jdbc:mysql://localhost:3306/druid +druid.db.connector.user=druid +druid.db.connector.password=diurd + +druid.selectors.indexing.serviceName=overlord +druid.indexer.runner.javaOpts="-server -Xmx1g" +druid.indexer.runner.startPort=8088 +druid.indexer.fork.property.druid.computation.buffer.size=268435456 diff --git a/examples/pom.xml b/examples/pom.xml index 4891972dbf4..fc4543487ce 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -20,7 +20,7 @@ 4.0.0 - com.metamx.druid + io.druid druid-examples druid-examples druid-examples @@ -33,17 +33,12 @@ - com.metamx.druid - druid-realtime - ${project.parent.version} - - - com.metamx.druid + io.druid druid-server ${project.parent.version} - com.metamx.druid + io.druid druid-common ${project.parent.version} diff --git a/hdfs-storage/pom.xml b/hdfs-storage/pom.xml index fb555601dc9..a18a4c787ac 100644 --- a/hdfs-storage/pom.xml +++ b/hdfs-storage/pom.xml @@ -20,7 +20,7 @@ 4.0.0 - com.metamx.druid + io.druid druid-hdfs-storage druid-hdfs-storage druid-hdfs-storage diff --git a/indexing-hadoop/pom.xml b/indexing-hadoop/pom.xml index d09b34b13b0..5dc514a28f2 100644 --- a/indexing-hadoop/pom.xml +++ b/indexing-hadoop/pom.xml @@ -20,7 +20,7 @@ 4.0.0 - com.metamx.druid + io.druid druid-indexing-hadoop druid-indexing-hadoop Druid Indexing Hadoop @@ -33,7 +33,7 @@ - com.metamx.druid + io.druid druid-server ${project.parent.version} diff --git a/indexing-service/pom.xml b/indexing-service/pom.xml index 45c2fb0aa7d..dd82f8a6fea 100644 --- a/indexing-service/pom.xml +++ b/indexing-service/pom.xml @@ -20,7 +20,7 @@ 4.0.0 - com.metamx.druid + io.druid druid-indexing-service druid-indexing-service druid-indexing-service @@ -33,25 +33,20 @@ - com.metamx.druid + io.druid druid-common ${project.parent.version} - com.metamx.druid + io.druid druid-server ${project.parent.version} - com.metamx.druid + io.druid druid-indexing-hadoop ${project.parent.version} - - com.metamx.druid - druid-realtime - ${project.parent.version} - com.metamx diff --git a/processing/pom.xml b/processing/pom.xml index 405c8997078..12553ab6e3a 100644 --- a/processing/pom.xml +++ b/processing/pom.xml @@ -20,7 +20,7 @@ 4.0.0 - com.metamx.druid + io.druid druid-processing druid-processing A module that is everything required to understands Druid Segments @@ -33,7 +33,7 @@ - com.metamx.druid + io.druid druid-common ${project.parent.version} diff --git a/s3-extensions/pom.xml b/s3-extensions/pom.xml index 4450d2a33bb..df00390971e 100644 --- a/s3-extensions/pom.xml +++ b/s3-extensions/pom.xml @@ -20,7 +20,7 @@ 4.0.0 - com.metamx.druid + io.druid druid-s3-extensions druid-s3-extensions druid-s3-extensions diff --git a/server/pom.xml b/server/pom.xml index 8a9f0d3e40d..54332a61faf 100644 --- a/server/pom.xml +++ b/server/pom.xml @@ -21,7 +21,7 @@ 4.0.0 - com.metamx.druid + io.druid druid-server druid-server Druid Server @@ -34,7 +34,7 @@ - com.metamx.druid + io.druid druid-processing ${project.parent.version} @@ -221,7 +221,7 @@ test - com.metamx.druid + io.druid druid-processing ${project.parent.version} test-jar diff --git a/services/pom.xml b/services/pom.xml index 0f202976a75..bebf3db933d 100644 --- a/services/pom.xml +++ b/services/pom.xml @@ -20,7 +20,7 @@ 4.0.0 - com.metamx.druid + io.druid druid-services druid-services druid-services @@ -33,37 +33,32 @@ - com.metamx.druid + io.druid druid-examples ${project.parent.version} - com.metamx.druid + io.druid druid-indexing-hadoop ${project.parent.version} - com.metamx.druid + io.druid druid-indexing-service ${project.parent.version} - com.metamx.druid - druid-realtime - ${project.parent.version} - - - com.metamx.druid + io.druid druid-server ${project.parent.version} - com.metamx.druid + io.druid druid-examples ${project.parent.version} - com.metamx.druid + io.druid druid-indexing-service ${project.parent.version} diff --git a/services/src/assembly/assembly.xml b/services/src/assembly/assembly.xml index 7ba9ac842c0..3dfb6a68f7e 100644 --- a/services/src/assembly/assembly.xml +++ b/services/src/assembly/assembly.xml @@ -42,6 +42,13 @@ config/historical + + ../examples/config/overlord + + * + + config/overlord + ../examples/bin diff --git a/services/src/main/java/io/druid/cli/Main.java b/services/src/main/java/io/druid/cli/Main.java index 750226ca871..7e529860517 100644 --- a/services/src/main/java/io/druid/cli/Main.java +++ b/services/src/main/java/io/druid/cli/Main.java @@ -24,6 +24,7 @@ import io.airlift.command.Cli; import io.airlift.command.Help; import io.airlift.command.ParseException; import io.druid.cli.convert.ConvertProperties; +import io.druid.cli.validate.DruidJsonValidator; import io.druid.initialization.Initialization; import io.druid.server.initialization.ExtensionsConfig; @@ -58,7 +59,7 @@ public class Main builder.withGroup("tools") .withDescription("Various tools for working with Druid") .withDefaultCommand(Help.class) - .withCommands(ConvertProperties.class); + .withCommands(ConvertProperties.class, DruidJsonValidator.class); builder.withGroup("index") .withDescription("Run indexing for druid") diff --git a/services/src/main/java/io/druid/cli/validate/DruidJsonValidator.java b/services/src/main/java/io/druid/cli/validate/DruidJsonValidator.java new file mode 100644 index 00000000000..6fcfd2806a1 --- /dev/null +++ b/services/src/main/java/io/druid/cli/validate/DruidJsonValidator.java @@ -0,0 +1,78 @@ +/* + * Druid - a distributed column store. + * Copyright (C) 2012, 2013 Metamarkets Group Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +package io.druid.cli.validate; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.api.client.repackaged.com.google.common.base.Throwables; +import com.metamx.common.UOE; +import io.airlift.command.Arguments; +import io.airlift.command.Command; +import io.airlift.command.Option; +import io.druid.indexer.HadoopDruidIndexerConfig; +import io.druid.indexing.common.task.Task; +import io.druid.jackson.DefaultObjectMapper; +import io.druid.query.Query; +import io.druid.segment.realtime.Schema; + +import java.io.File; + +/** + */ +@Command( + name = "validator", + description = "Validates that a given Druid JSON object is correctly formatted" +) +public class DruidJsonValidator implements Runnable +{ + @Option(name = "-f", title = "file", description = "file to validate", required = true) + public String jsonFile; + + @Option(name = "-t", title = "type", description = "the type of schema to validate", required = true) + public String type; + + @Override + public void run() + { + File file = new File(jsonFile); + if (!file.exists()) { + System.out.printf("File[%s] does not exist.%n", file); + } + + final ObjectMapper jsonMapper = new DefaultObjectMapper(); + + try { + if (type.equalsIgnoreCase("query")) { + jsonMapper.readValue(file, Query.class); + } else if (type.equalsIgnoreCase("hadoopConfig")) { + jsonMapper.readValue(file, HadoopDruidIndexerConfig.class); + } else if (type.equalsIgnoreCase("task")) { + jsonMapper.readValue(file, Task.class); + } else if (type.equalsIgnoreCase("realtimeSchema")) { + jsonMapper.readValue(file, Schema.class); + } else { + throw new UOE("Unknown type[%s]", type); + } + } + catch (Exception e) { + System.out.println("INVALID JSON!"); + throw Throwables.propagate(e); + } + } +} From 560981acbf61cbd1354814a6890c50a16c1b8bd9 Mon Sep 17 00:00:00 2001 From: fjy Date: Thu, 10 Oct 2013 15:05:01 -0700 Subject: [PATCH 03/30] a ton of fixes to docs --- .../io/druid/common/config/ConfigManager.java | 40 +- docs/_includes/page_footer.html | 5 +- docs/content/Booting-a-production-cluster.md | 2 +- docs/content/Cluster-setup.md | 105 ++++ docs/content/Home.md | 3 +- docs/content/Indexing-Service.md | 4 +- docs/content/Querying-your-data.md | 293 ----------- docs/content/Tasks.md | 24 +- .../Tutorial:-A-First-Look-at-Druid.md | 23 +- docs/content/Tutorial:-All-About-Queries.md | 197 +++++++ .../Tutorial:-Loading-Your-Data-Part-1.md | 46 +- .../Tutorial:-Loading-Your-Data-Part-2.md | 494 ++++++++---------- docs/content/Tutorial:-The-Druid-Cluster.md | 57 +- docs/content/Tutorial:-Webstream.md | 10 +- docs/content/Twitter-Tutorial.textile | 2 +- docs/content/contents.textile | 23 +- docs/content/toc.textile | 12 +- .../examples/indexing/wikipedia.spec} | 49 +- .../bin/examples/indexing/wikipedia_data.json | 4 +- .../indexing/wikipedia_hadoop_config.json | 49 ++ .../indexing/wikipedia_index_hadoop_task.json | 41 ++ .../indexing/wikipedia_index_task.json | 2 +- examples/bin/examples/wikipedia/query.body | 4 +- examples/config/realtime/runtime.properties | 2 - 24 files changed, 767 insertions(+), 724 deletions(-) create mode 100644 docs/content/Cluster-setup.md delete mode 100644 docs/content/Querying-your-data.md create mode 100644 docs/content/Tutorial:-All-About-Queries.md rename examples/{config/realtime/realtime.spec => bin/examples/indexing/wikipedia.spec} (52%) create mode 100644 examples/bin/examples/indexing/wikipedia_hadoop_config.json create mode 100644 examples/bin/examples/indexing/wikipedia_index_hadoop_task.json diff --git a/common/src/main/java/io/druid/common/config/ConfigManager.java b/common/src/main/java/io/druid/common/config/ConfigManager.java index 0ec4cd33181..172c82a6e1c 100644 --- a/common/src/main/java/io/druid/common/config/ConfigManager.java +++ b/common/src/main/java/io/druid/common/config/ConfigManager.java @@ -23,6 +23,7 @@ import com.google.common.base.Supplier; import com.google.common.base.Throwables; import com.google.common.collect.Maps; import com.google.inject.Inject; +import com.metamx.common.ISE; import com.metamx.common.concurrent.ScheduledExecutors; import com.metamx.common.lifecycle.LifecycleStart; import com.metamx.common.lifecycle.LifecycleStop; @@ -38,6 +39,7 @@ import org.skife.jdbi.v2.tweak.ResultSetMapper; import java.sql.ResultSet; import java.sql.SQLException; import java.util.Arrays; +import java.util.List; import java.util.Map; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentMap; @@ -76,7 +78,7 @@ public class ConfigManager final String configTable = dbTables.get().getConfigTable(); this.selectStatement = String.format("SELECT payload FROM %s WHERE name = :name", configTable); - insertStatement = String.format( + this.insertStatement = String.format( "INSERT INTO %s (name, payload) VALUES (:name, :payload) ON DUPLICATE KEY UPDATE payload = :payload", configTable ); @@ -186,19 +188,29 @@ public class ConfigManager @Override public byte[] withHandle(Handle handle) throws Exception { - return handle.createQuery(selectStatement) - .bind("name", key) - .map( - new ResultSetMapper() - { - @Override - public byte[] map(int index, ResultSet r, StatementContext ctx) throws SQLException - { - return r.getBytes("payload"); - } - } - ) - .first(); + List matched = handle.createQuery(selectStatement) + .bind("name", key) + .map( + new ResultSetMapper() + { + @Override + public byte[] map(int index, ResultSet r, StatementContext ctx) + throws SQLException + { + return r.getBytes("payload"); + } + } + ).list(); + + if (matched.isEmpty()) { + return null; + } + + if (matched.size() > 1) { + throw new ISE("Error! More than one matching entry[%d] found for [%s]?!", matched.size(), key); + } + + return matched.get(0); } } ); diff --git a/docs/_includes/page_footer.html b/docs/_includes/page_footer.html index e8fe3e85aa6..1890df00f8b 100644 --- a/docs/_includes/page_footer.html +++ b/docs/_includes/page_footer.html @@ -11,9 +11,6 @@ info@druid.io
- Metamarkets - 625 2nd Street, Suite #230
- San Francisco, CA 94017
@@ -25,7 +22,7 @@
  • DRUID
  • What is Druid?
  • Downloads
  • -
  • Documentation
  • +
  • Documentation