mirror of https://github.com/apache/druid.git

fix task docs

This commit is contained in:
parent b71fff0db4
commit 74a4fddcfe
@@ -16,48 +16,98 @@ The Index Task is a simpler variation of the Index Hadoop task that is designed

Removed (the old spec format):

```json
{
  "type" : "index",
  "dataSource" : "example",
  "granularitySpec" : {
    "type" : "uniform",
    "gran" : "DAY",
    "intervals" : [ "2010/2020" ]
  },
  "aggregators" : [ {
    "type" : "count",
    "name" : "count"
  }, {
    "type" : "doubleSum",
    "name" : "value",
    "fieldName" : "value"
  } ],
  "firehose" : {
    "type" : "local",
    "baseDir" : "/tmp/data/json",
    "filter" : "sample_data.json",
    "parser" : {
      "timestampSpec" : {
        "column" : "timestamp"
      },
      "data" : {
        "format" : "json",
        "dimensions" : [ "dim1", "dim2", "dim3" ]
      }
    }
  }
}
```

Added (the new spec format):

```json
{
  "type" : "index",
  "spec" : {
    "dataSchema" : {
      "dataSource" : "wikipedia",
      "parser" : {
        "type" : "string",
        "parseSpec" : {
          "format" : "json",
          "timestampSpec" : {
            "column" : "timestamp",
            "format" : "auto"
          },
          "dimensionsSpec" : {
            "dimensions": ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"],
            "dimensionExclusions" : [],
            "spatialDimensions" : []
          }
        }
      },
      "metricsSpec" : [
        {
          "type" : "count",
          "name" : "count"
        },
        {
          "type" : "doubleSum",
          "name" : "added",
          "fieldName" : "added"
        },
        {
          "type" : "doubleSum",
          "name" : "deleted",
          "fieldName" : "deleted"
        },
        {
          "type" : "doubleSum",
          "name" : "delta",
          "fieldName" : "delta"
        }
      ],
      "granularitySpec" : {
        "type" : "uniform",
        "segmentGranularity" : "DAY",
        "queryGranularity" : "NONE",
        "intervals" : [ "2013-08-31/2013-09-01" ]
      }
    },
    "ioConfig" : {
      "type" : "index",
      "firehose" : {
        "type" : "local",
        "baseDir" : "examples/indexing/",
        "filter" : "wikipedia_data.json"
      }
    },
    "tuningConfig" : {
      "type" : "index",
      "targetPartitionSize" : -1,
      "rowFlushBoundary" : 0,
      "numShards": 2
    }
  }
}
```

#### Task Properties

|property|description|required?|
|--------|-----------|---------|
|type|The task type; this should always be "index".|yes|
|id|The task ID. If this is not explicitly specified, Druid generates the task ID using the name of the task file and a date-time stamp.|no|
|granularitySpec|Specifies the segment chunks that the task will process. `type` is always "uniform"; `gran` sets the granularity of the chunks ("DAY" means all segments containing timestamps in the same day), while `intervals` sets the interval that the chunks will cover (see the sketch after this table).|yes|
|spatialDimensions|Dimensions to build spatial indexes over. See [Geographic Queries](GeographicQueries.html).|no|
|aggregators|The metrics to aggregate in the data set. For more info, see [Aggregations](Aggregations.html).|yes|
|indexGranularity|The rollup granularity for timestamps. See [Realtime Ingestion](Realtime-ingestion.html) for more information.|no|
|targetPartitionSize|Used in sharding; determines how many rows are in each segment.|no|
|firehose|The input source of data. For more info, see [Firehose](Firehose.html).|yes|
|rowFlushBoundary|Used to determine when intermediate persists to disk should occur.|no|
|spec|The ingestion spec. See below for more details.|yes|
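As a sketch of the `granularitySpec` row above, using the values from the old-format example: `gran` plus `intervals` describe day-granularity chunks covering 2010 through 2020.

```json
{
  "type" : "uniform",
  "gran" : "DAY",
  "intervals" : [ "2010/2020" ]
}
```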

#### DataSchema

This field is required. See [Ingestion](Ingestion.html).
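As a minimal sketch, condensed from the full example above (the single-entry dimension and metric lists are shortened purely for illustration), a dataSchema bundles the parser, the metrics, and the granularity:

```json
{
  "dataSource" : "wikipedia",
  "parser" : {
    "type" : "string",
    "parseSpec" : {
      "format" : "json",
      "timestampSpec" : { "column" : "timestamp", "format" : "auto" },
      "dimensionsSpec" : { "dimensions" : ["page"], "dimensionExclusions" : [], "spatialDimensions" : [] }
    }
  },
  "metricsSpec" : [ { "type" : "count", "name" : "count" } ],
  "granularitySpec" : {
    "type" : "uniform",
    "segmentGranularity" : "DAY",
    "queryGranularity" : "NONE",
    "intervals" : [ "2013-08-31/2013-09-01" ]
  }
}
```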

#### IOConfig

This field is required. You can specify a type of [Firehose](Firehose.html) here.
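For example, the ioConfig object from the spec above, shown on its own; the `local` firehose reads files under `baseDir` whose names match `filter`:

```json
{
  "type" : "index",
  "firehose" : {
    "type" : "local",
    "baseDir" : "examples/indexing/",
    "filter" : "wikipedia_data.json"
  }
}
```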

#### TuningConfig

The tuningConfig is optional, and default parameters will be used if no tuningConfig is specified. See below for more details.

|property|description|default|required?|
|--------|-----------|-------|---------|
|type|The task type; this should always be "index".|None|yes|
|targetPartitionSize|Used in sharding; determines how many rows are in each segment.|5000000|no|
|rowFlushBoundary|Used to determine when intermediate persists to disk should occur.|500000|no|
|numShards|You can skip the intermediate persist step if you specify the number of shards you want and set targetPartitionSize=-1 (see the sketch after this table).|null|no|
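As a sketch of the mode described in the `numShards` row (the shard count of 4 is a hypothetical value): fixing `numShards` while setting `targetPartitionSize` to -1 skips the intermediate persist step:

```json
{
  "type" : "index",
  "targetPartitionSize" : -1,
  "rowFlushBoundary" : 500000,
  "numShards" : 4
}
```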
### Index Hadoop Task

@@ -93,66 +143,109 @@ If you are having trouble with any extensions in HadoopIndexTask, it may be the

### Realtime Index Task

The indexing service can also run real-time tasks. These tasks effectively transform a middle manager into a real-time node. We introduced real-time tasks as a way to programmatically add new real-time data sources without needing to manually add nodes. We recommend that you use the [tranquility](https://github.com/metamx/tranquility) library to manage the generation of real-time index tasks programmatically. The grammar for the real-time task is as follows:

Removed (the old spec format):

```json
{
  "type" : "index_realtime",
  "id" : "example",
  "resource" : {
    "availabilityGroup" : "someGroup",
    "requiredCapacity" : 1
  },
  "schema" : {
    "dataSource" : "dataSourceName",
    "aggregators" : [
      {
        "type" : "count",
        "name" : "events"
      },
      {
        "type" : "doubleSum",
        "name" : "outColumn",
        "fieldName" : "inColumn"
      }
    ],
    "indexGranularity" : "minute",
    "shardSpec" : {
      "type" : "none"
    }
  },
  "firehose" : {
    "type" : "kafka-0.7.2",
    "consumerProps" : {
      "zk.connect" : "zk_connect_string",
      "zk.connectiontimeout.ms" : "15000",
      "zk.sessiontimeout.ms" : "15000",
      "zk.synctime.ms" : "5000",
      "groupid" : "consumer-group",
      "fetch.size" : "1048586",
      "autooffset.reset" : "largest",
      "autocommit.enable" : "false"
    },
    "feed" : "your_kafka_topic",
    "parser" : {
      "timestampSpec" : {
        "column" : "timestamp",
        "format" : "iso"
      },
      "data" : {
        "format" : "json"
      },
      "dimensionExclusions" : [
        "value"
      ]
    }
  },
  "fireDepartmentConfig" : {
    "maxRowsInMemory" : 500000,
    "intermediatePersistPeriod" : "PT10m"
  },
  "windowPeriod" : "PT10m",
  "segmentGranularity" : "hour"
}
```

Added (the new spec format):

```json
{
  "type": "index_realtime",
  "id": "example",
  "resource": {
    "availabilityGroup": "someGroup",
    "requiredCapacity": 1
  },
  "spec": {
    "dataSchema": {
      "dataSource": "wikipedia",
      "parser": {
        "type": "string",
        "parseSpec": {
          "format": "json",
          "timestampSpec": {
            "column": "timestamp",
            "format": "iso"
          },
          "dimensionsSpec": {
            "dimensions": [
              "page",
              "language",
              "user",
              "unpatrolled",
              "newPage",
              "robot",
              "anonymous",
              "namespace",
              "continent",
              "country",
              "region",
              "city"
            ],
            "dimensionExclusions": [],
            "spatialDimensions": []
          }
        }
      },
      "metricsSpec": [
        {
          "type": "count",
          "name": "count"
        },
        {
          "type": "doubleSum",
          "name": "added",
          "fieldName": "added"
        },
        {
          "type": "doubleSum",
          "name": "deleted",
          "fieldName": "deleted"
        },
        {
          "type": "doubleSum",
          "name": "delta",
          "fieldName": "delta"
        }
      ],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "DAY",
        "queryGranularity": "NONE"
      }
    },
    "ioConfig": {
      "type": "realtime",
      "firehose": {
        "type": "kafka-0.7.2",
        "consumerProps": {
          "zk.connect": "zk_connect_string",
          "zk.connectiontimeout.ms": "15000",
          "zk.sessiontimeout.ms": "15000",
          "zk.synctime.ms": "5000",
          "groupid": "consumer-group",
          "fetch.size": "1048586",
          "autooffset.reset": "largest",
          "autocommit.enable": "false"
        },
        "feed": "your_kafka_topic"
      },
      "plumber": {
        "type": "realtime"
      }
    },
    "tuningConfig": {
      "type": "realtime",
      "maxRowsInMemory": 500000,
      "intermediatePersistPeriod": "PT10m",
      "windowPeriod": "PT10m",
      "basePersistDirectory": "/tmp/realtime/basePersist",
      "rejectionPolicy": {
        "type": "serverTime"
      }
    }
  }
}
```
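The `resource` block above controls worker placement. As a hedged sketch (the task IDs are hypothetical and the specs are elided): replicas of a real-time task can share an `availabilityGroup`, which, as we understand it, keeps them from being assigned to the same middle manager, while `requiredCapacity` is the number of worker slots each task takes. The tasks would be submitted individually; they are shown side by side here only for comparison.

```json
[
  {
    "type": "index_realtime",
    "id": "example_replica_1",
    "resource": { "availabilityGroup": "someGroup", "requiredCapacity": 1 }
  },
  {
    "type": "index_realtime",
    "id": "example_replica_2",
    "resource": { "availabilityGroup": "someGroup", "requiredCapacity": 1 }
  }
]
```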

@@ -101,7 +101,7 @@ Open up the file to see the following:

Here `"schema" : {` becomes `"spec" : {`:

```json
{
  "type" : "index",
  "spec" : {
    "dataSchema" : {
      "dataSource" : "wikipedia",
      "parser" : {
```

@@ -260,7 +260,7 @@ Examining the contents of the file, you should find:

Here too, `"schema" : {` becomes `"spec" : {`:

```json
{
  "type" : "index_hadoop",
  "spec" : {
    "dataSchema" : {
      "dataSource" : "wikipedia",
      "parser" : {
```