diff --git a/docs/development/extensions-core/avro.md b/docs/development/extensions-core/avro.md index 006044d912d..8befbbe2243 100644 --- a/docs/development/extensions-core/avro.md +++ b/docs/development/extensions-core/avro.md @@ -24,206 +24,7 @@ title: "Apache Avro" This Apache Druid extension enables Druid to ingest and understand the Apache Avro data format. Make sure to [include](../../development/extensions.md#loading-extensions) `druid-avro-extensions` as an extension. -### Avro Stream Parser - -This is for streaming/realtime ingestion. - -| Field | Type | Description | Required | -|-------|------|-------------|----------| -| type | String | This should say `avro_stream`. | no | -| avroBytesDecoder | JSON Object | Specifies how to decode bytes to Avro record. | yes | -| parseSpec | JSON Object | Specifies the timestamp and dimensions of the data. Should be an "avro" parseSpec. | yes | - -An Avro parseSpec can contain a [`flattenSpec`](../../ingestion/index.md#flattenspec) using either the "root" or "path" -field types, which can be used to read nested Avro records. The "jq" field type is not currently supported for Avro. - -For example, using Avro stream parser with schema repo Avro bytes decoder: - -```json -"parser" : { - "type" : "avro_stream", - "avroBytesDecoder" : { - "type" : "schema_repo", - "subjectAndIdConverter" : { - "type" : "avro_1124", - "topic" : "${YOUR_TOPIC}" - }, - "schemaRepository" : { - "type" : "avro_1124_rest_client", - "url" : "${YOUR_SCHEMA_REPO_END_POINT}", - } - }, - "parseSpec" : { - "format": "avro", - "timestampSpec": , - "dimensionsSpec": , - "flattenSpec": - } -} -``` - -#### Avro Bytes Decoder - -If `type` is not included, the avroBytesDecoder defaults to `schema_repo`. - -##### Inline Schema Based Avro Bytes Decoder - -> The "schema_inline" decoder reads Avro records using a fixed schema and does not support schema migration. If you -> may need to migrate schemas in the future, consider one of the other decoders, all of which use a message header that -> allows the parser to identify the proper Avro schema for reading records. - -This decoder can be used if all the input events can be read using the same schema. In that case schema can be specified in the input task JSON itself as described below. - -``` -... -"avroBytesDecoder": { - "type": "schema_inline", - "schema": { - //your schema goes here, for example - "namespace": "org.apache.druid.data", - "name": "User", - "type": "record", - "fields": [ - { "name": "FullName", "type": "string" }, - { "name": "Country", "type": "string" } - ] - } -} -... -``` - -##### Multiple Inline Schemas Based Avro Bytes Decoder - -This decoder can be used if different input events can have different read schema. In that case schema can be specified in the input task JSON itself as described below. - -``` -... -"avroBytesDecoder": { - "type": "multiple_schemas_inline", - "schemas": { - //your id -> schema map goes here, for example - "1": { - "namespace": "org.apache.druid.data", - "name": "User", - "type": "record", - "fields": [ - { "name": "FullName", "type": "string" }, - { "name": "Country", "type": "string" } - ] - }, - "2": { - "namespace": "org.apache.druid.otherdata", - "name": "UserIdentity", - "type": "record", - "fields": [ - { "name": "Name", "type": "string" }, - { "name": "Location", "type": "string" } - ] - }, - ... - ... - } -} -... -``` - -Note that it is essentially a map of integer schema ID to avro schema object. This parser assumes that record has following format. 
- first 1 byte is version and must always be 1. - next 4 bytes are integer schema ID serialized using big-endian byte order. - remaining bytes contain serialized avro message. - -##### SchemaRepo Based Avro Bytes Decoder - -This Avro bytes decoder first extract `subject` and `id` from input message bytes, then use them to lookup the Avro schema with which to decode Avro record from bytes. Details can be found in [schema repo](https://github.com/schema-repo/schema-repo) and [AVRO-1124](https://issues.apache.org/jira/browse/AVRO-1124). You will need an http service like schema repo to hold the avro schema. Towards schema registration on the message producer side, you can refer to `org.apache.druid.data.input.AvroStreamInputRowParserTest#testParse()`. - -| Field | Type | Description | Required | -|-------|------|-------------|----------| -| type | String | This should say `schema_repo`. | no | -| subjectAndIdConverter | JSON Object | Specifies the how to extract subject and id from message bytes. | yes | -| schemaRepository | JSON Object | Specifies the how to lookup Avro schema from subject and id. | yes | - -###### Avro-1124 Subject And Id Converter - -This section describes the format of the `subjectAndIdConverter` object for the `schema_repo` Avro bytes decoder. - -| Field | Type | Description | Required | -|-------|------|-------------|----------| -| type | String | This should say `avro_1124`. | no | -| topic | String | Specifies the topic of your Kafka stream. | yes | - - -###### Avro-1124 Schema Repository - -This section describes the format of the `schemaRepository` object for the `schema_repo` Avro bytes decoder. - -| Field | Type | Description | Required | -|-------|------|-------------|----------| -| type | String | This should say `avro_1124_rest_client`. | no | -| url | String | Specifies the endpoint url of your Avro-1124 schema repository. | yes | - -##### Confluent Schema Registry-based Avro Bytes Decoder - -This Avro bytes decoder first extract unique `id` from input message bytes, then use them it lookup in the Schema Registry for the related schema, with which to decode Avro record from bytes. -Details can be found in Schema Registry [documentation](http://docs.confluent.io/current/schema-registry/docs/) and [repository](https://github.com/confluentinc/schema-registry). - -| Field | Type | Description | Required | -|-------|------|-------------|----------| -| type | String | This should say `schema_registry`. | no | -| url | String | Specifies the url endpoint of the Schema Registry. | yes | -| capacity | Integer | Specifies the max size of the cache (default == Integer.MAX_VALUE). | no | - -```json -... -"avroBytesDecoder" : { - "type" : "schema_registry", - "url" : -} -... -``` - -### Avro Hadoop Parser - -This is for batch ingestion using the `HadoopDruidIndexer`. The `inputFormat` of `inputSpec` in `ioConfig` must be set to `"org.apache.druid.data.input.avro.AvroValueInputFormat"`. You may want to set Avro reader's schema in `jobProperties` in `tuningConfig`, e.g.: `"avro.schema.input.value.path": "/path/to/your/schema.avsc"` or `"avro.schema.input.value": "your_schema_JSON_object"`, if reader's schema is not set, the schema in Avro object container file will be used, see [Avro specification](http://avro.apache.org/docs/1.7.7/spec.html#Schema+Resolution). Make sure to include "org.apache.druid.extensions:druid-avro-extensions" as an extension. 
- -| Field | Type | Description | Required | -|-------|------|-------------|----------| -| type | String | This should say `avro_hadoop`. | no | -| parseSpec | JSON Object | Specifies the timestamp and dimensions of the data. Should be an "avro" parseSpec. | yes | - -An Avro parseSpec can contain a [`flattenSpec`](../../ingestion/index.md#flattenspec) using either the "root" or "path" -field types, which can be used to read nested Avro records. The "jq" field type is not currently supported for Avro. - -For example, using Avro Hadoop parser with custom reader's schema file: - -```json -{ - "type" : "index_hadoop", - "spec" : { - "dataSchema" : { - "dataSource" : "", - "parser" : { - "type" : "avro_hadoop", - "parseSpec" : { - "format": "avro", - "timestampSpec": , - "dimensionsSpec": , - "flattenSpec": - } - } - }, - "ioConfig" : { - "type" : "hadoop", - "inputSpec" : { - "type" : "static", - "inputFormat": "org.apache.druid.data.input.avro.AvroValueInputFormat", - "paths" : "" - } - }, - "tuningConfig" : { - "jobProperties" : { - "avro.schema.input.value.path" : "/path/to/my/schema.avsc" - } - } - } -} -``` +The `druid-avro-extensions` provides two Avro Parsers for stream ingestion and Hadoop batch ingestion. +See [Avro Hadoop Parser](../../ingestion/data-formats.md#avro-hadoop-parser) +and [Avro Stream Parser](../../ingestion/data-formats.md#avro-stream-parser) +for details. diff --git a/docs/development/extensions-core/google.md b/docs/development/extensions-core/google.md index 0a28e755169..49a4c4cb775 100644 --- a/docs/development/extensions-core/google.md +++ b/docs/development/extensions-core/google.md @@ -37,127 +37,8 @@ Deep storage can be written to Google Cloud Storage either via this extension or |`druid.google.bucket`||GCS bucket name.|Must be set.| |`druid.google.prefix`||GCS prefix.|No-prefix| +## Reading data from Google Cloud Storage - - -## Google cloud storage batch ingestion input source - -This extension also provides an input source for Druid native batch ingestion to support reading objects directly from Google Cloud Storage. Objects can be specified as list of Google Cloud Storage URI strings. The Google Cloud Storage input source is splittable and can be used by [native parallel index tasks](../../ingestion/native-batch.md#parallel-task), where each worker task of `index_parallel` will read a single object. - -```json -... - "ioConfig": { - "type": "index_parallel", - "inputSource": { - "type": "google", - "uris": ["gs://foo/bar/file.json", "gs://bar/foo/file2.json"] - }, - "inputFormat": { - "type": "json" - }, - ... - }, -... -``` - -```json -... - "ioConfig": { - "type": "index_parallel", - "inputSource": { - "type": "google", - "prefixes": ["gs://foo/bar", "gs://bar/foo"] - }, - "inputFormat": { - "type": "json" - }, - ... - }, -... -``` - - -```json -... - "ioConfig": { - "type": "index_parallel", - "inputSource": { - "type": "google", - "objects": [ - { "bucket": "foo", "path": "bar/file1.json"}, - { "bucket": "bar", "path": "foo/file2.json"} - ] - }, - "inputFormat": { - "type": "json" - }, - ... - }, -... 
-``` - -|property|description|default|required?| -|--------|-----------|-------|---------| -|type|This should be `google`.|N/A|yes| -|uris|JSON array of URIs where Google Cloud Storage objects to be ingested are located.|N/A|`uris` or `prefixes` or `objects` must be set| -|prefixes|JSON array of URI prefixes for the locations of Google Cloud Storage objects to be ingested.|N/A|`uris` or `prefixes` or `objects` must be set| -|objects|JSON array of Google Cloud Storage objects to be ingested.|N/A|`uris` or `prefixes` or `objects` must be set| - - -Google Cloud Storage object: - -|property|description|default|required?| -|--------|-----------|-------|---------| -|bucket|Name of the Google Cloud Storage bucket|N/A|yes| -|path|The path where data is located.|N/A|yes| - -## Firehose - - - -#### StaticGoogleBlobStoreFirehose - -This firehose ingests events, similar to the StaticS3Firehose, but from an Google Cloud Store. - -As with the S3 blobstore, it is assumed to be gzipped if the extension ends in .gz - -This firehose is _splittable_ and can be used by [native parallel index tasks](../../ingestion/native-batch.md#parallel-task). -Since each split represents an object in this firehose, each worker task of `index_parallel` will read an object. - -Sample spec: - -```json -"firehose" : { - "type" : "static-google-blobstore", - "blobs": [ - { - "bucket": "foo", - "path": "/path/to/your/file.json" - }, - { - "bucket": "bar", - "path": "/another/path.json" - } - ] -} -``` - -This firehose provides caching and prefetching features. In IndexTask, a firehose can be read twice if intervals or -shardSpecs are not specified, and, in this case, caching can be useful. Prefetching is preferred when direct scan of objects is slow. - -|property|description|default|required?| -|--------|-----------|-------|---------| -|type|This should be `static-google-blobstore`.|N/A|yes| -|blobs|JSON array of Google Blobs.|N/A|yes| -|maxCacheCapacityBytes|Maximum size of the cache space in bytes. 0 means disabling cache. Cached files are not removed until the ingestion task completes.|1073741824|no| -|maxFetchCapacityBytes|Maximum size of the fetch space in bytes. 0 means disabling prefetch. Prefetched files are removed immediately once they are read.|1073741824|no| -|prefetchTriggerBytes|Threshold to trigger prefetching Google Blobs.|maxFetchCapacityBytes / 2|no| -|fetchTimeout|Timeout for fetching a Google Blob.|60000|no| -|maxFetchRetry|Maximum retry for fetching a Google Blob.|3|no| - -Google Blobs: - -|property|description|default|required?| -|--------|-----------|-------|---------| -|bucket|Name of the Google Cloud bucket|N/A|yes| -|path|The path where data is located.|N/A|yes| +The [Google Cloud Storage input source](../../ingestion/native-batch.md#google-cloud-storage-input-source) is supported by the [Parallel task](../../ingestion/native-batch.md#parallel-task) +to read objects directly from Google Cloud Storage. If you use the [Hadoop task](../../ingestion/hadoop.md), +you can read data from Google Cloud Storage by specifying the paths in your [`inputSpec`](../../ingestion/hadoop.md#inputspec). 
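For example, a minimal `ioConfig` for the Parallel task might look like the sketch below. The bucket and object names are placeholders; see the linked input source documentation for the full list of supported properties (`uris`, `prefixes`, or `objects`).

```json
...
  "ioConfig": {
    "type": "index_parallel",
    "inputSource": {
      "type": "google",
      "uris": ["gs://your-bucket/path/to/file.json"]
    },
    "inputFormat": {
      "type": "json"
    },
    ...
  },
...
```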
diff --git a/docs/development/extensions-core/hdfs.md b/docs/development/extensions-core/hdfs.md index 6c2616f625a..711404703b7 100644 --- a/docs/development/extensions-core/hdfs.md +++ b/docs/development/extensions-core/hdfs.md @@ -36,49 +36,134 @@ To use this Apache Druid extension, make sure to [include](../../development/ext |`druid.hadoop.security.kerberos.principal`|`druid@EXAMPLE.COM`| Principal user name |empty| |`druid.hadoop.security.kerberos.keytab`|`/etc/security/keytabs/druid.headlessUser.keytab`|Path to keytab file|empty| -If you are using the Hadoop indexer, set your output directory to be a location on Hadoop and it will work. +Besides the above settings, you also need to include all Hadoop configuration files (such as `core-site.xml`, `hdfs-site.xml`) +in the Druid classpath. One way to do this is copying all those files under `${DRUID_HOME}/conf/_common`. + +If you are using the Hadoop ingestion, set your output directory to be a location on Hadoop and it will work. If you want to eagerly authenticate against a secured hadoop/hdfs cluster you must set `druid.hadoop.security.kerberos.principal` and `druid.hadoop.security.kerberos.keytab`, this is an alternative to the cron job method that runs `kinit` command periodically. -### Configuration for Google Cloud Storage +### Configuration for Cloud Storage -The HDFS extension can also be used for GCS as deep storage. +You can also use the AWS S3 or the Google Cloud Storage as the deep storage via HDFS. + +#### Configuration for AWS S3 + +To use the AWS S3 as the deep storage, you need to configure `druid.storage.storageDirectory` properly. + +|Property|Possible Values|Description|Default| +|--------|---------------|-----------|-------| +|`druid.storage.type`|hdfs| |Must be set.| +|`druid.storage.storageDirectory`|s3a://bucket/example/directory or s3n://bucket/example/directory|Path to the deep storage|Must be set.| + +You also need to include the [Hadoop AWS module](https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html), especially the `hadoop-aws.jar` in the Druid classpath. +Run the below command to install the `hadoop-aws.jar` file under `${DRUID_HOME}/extensions/druid-hdfs-storage` in all nodes. + +```bash +java -classpath "${DRUID_HOME}lib/*" org.apache.druid.cli.Main tools pull-deps -h "org.apache.hadoop:hadoop-aws:${HADOOP_VERSION}"; +cp ${DRUID_HOME}/hadoop-dependencies/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar ${DRUID_HOME}/extensions/druid-hdfs-storage/ +``` + +Finally, you need to add the below properties in the `core-site.xml`. +For more configurations, see the [Hadoop AWS module](https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html). + +```xml + + fs.s3a.impl + org.apache.hadoop.fs.s3a.S3AFileSystem + The implementation class of the S3A Filesystem + + + + fs.AbstractFileSystem.s3a.impl + org.apache.hadoop.fs.s3a.S3A + The implementation class of the S3A AbstractFileSystem. + + + + fs.s3a.access.key + AWS access key ID. Omit for IAM role-based or provider-based authentication. + your access key + + + + fs.s3a.secret.key + AWS secret key. Omit for IAM role-based or provider-based authentication. + your secret key + +``` + +#### Configuration for Google Cloud Storage + +To use the Google Cloud Storage as the deep storage, you need to configure `druid.storage.storageDirectory` properly. 
|Property|Possible Values|Description|Default| |--------|---------------|-----------|-------| |`druid.storage.type`|hdfs||Must be set.| -|`druid.storage.storageDirectory`||gs://bucket/example/directory|Must be set.| +|`druid.storage.storageDirectory`|gs://bucket/example/directory|Path to the deep storage|Must be set.| -All services that need to access GCS need to have the [GCS connector jar](https://cloud.google.com/hadoop/google-cloud-storage-connector#manualinstallation) in their class path. One option is to place this jar in /lib/ and /extensions/druid-hdfs-storage/ +All services that need to access GCS need to have the [GCS connector jar](https://cloud.google.com/dataproc/docs/concepts/connectors/cloud-storage#other_sparkhadoop_clusters) in their class path. +Please read the [install instructions](https://github.com/GoogleCloudPlatform/bigdata-interop/blob/master/gcs/INSTALL.md) +to properly set up the necessary libraries and configurations. +One option is to place this jar in `${DRUID_HOME}/lib/` and `${DRUID_HOME}/extensions/druid-hdfs-storage/`. -Tested with Druid 0.9.0, Hadoop 2.7.2 and gcs-connector jar 1.4.4-hadoop2. +Finally, you need to configure the `core-site.xml` file with the filesystem +and authentication properties needed for GCS. You may want to copy the below +example properties. Please follow the instructions at +[https://github.com/GoogleCloudPlatform/bigdata-interop/blob/master/gcs/INSTALL.md](https://github.com/GoogleCloudPlatform/bigdata-interop/blob/master/gcs/INSTALL.md) +for more details. +For more configurations, [GCS core default](https://github.com/GoogleCloudPlatform/bigdata-interop/blob/master/gcs/conf/gcs-core-default.xml) +and [GCS core template](https://github.com/GoogleCloudPlatform/bdutil/blob/master/conf/hadoop2/gcs-core-template.xml). - +```xml + + fs.gs.impl + com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem + The FileSystem for gs: (GCS) uris. + -## Native batch ingestion + + fs.AbstractFileSystem.gs.impl + com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS + The AbstractFileSystem for gs: uris. + -This firehose ingests events from a predefined list of files from a Hadoop filesystem. -This firehose is _splittable_ and can be used by [native parallel index tasks](../../ingestion/native-batch.md#parallel-task). -Since each split represents an HDFS file, each worker task of `index_parallel` will read an object. + + google.cloud.auth.service.account.enable + true + + Whether to use a service account for GCS authorization. + Setting this property to `false` will disable use of service accounts for + authentication. + + -Sample spec: - -```json -"firehose" : { - "type" : "hdfs", - "paths": "/foo/bar,/foo/baz" -} + + google.cloud.auth.service.account.json.keyfile + /path/to/keyfile + + The JSON key file of the service account used for GCS + access when google.cloud.auth.service.account.enable is true. + + ``` -This firehose provides caching and prefetching features. During native batch indexing, a firehose can be read twice if -`intervals` are not specified, and, in this case, caching can be useful. Prefetching is preferred when direct scanning -of files is slow. +Tested with Druid 0.17.0, Hadoop 2.8.5 and gcs-connector jar 2.0.0-hadoop2. -|Property|Description|Default| -|--------|-----------|-------| -|type|This should be `hdfs`.|none (required)| -|paths|HDFS paths. Can be either a JSON array or comma-separated string of paths. 
Wildcards like `*` are supported in these paths.|none (required)| -|maxCacheCapacityBytes|Maximum size of the cache space in bytes. 0 means disabling cache. Cached files are not removed until the ingestion task completes.|1073741824| -|maxFetchCapacityBytes|Maximum size of the fetch space in bytes. 0 means disabling prefetch. Prefetched files are removed immediately once they are read.|1073741824| -|prefetchTriggerBytes|Threshold to trigger prefetching files.|maxFetchCapacityBytes / 2| -|fetchTimeout|Timeout for fetching each file.|60000| -|maxFetchRetry|Maximum number of retries for fetching each file.|3| +## Reading data from HDFS or Cloud Storage + +### Native batch ingestion + +The [HDFS input source](../../ingestion/native-batch.md#hdfs-input-source) is supported by the [Parallel task](../../ingestion/native-batch.md#parallel-task) +to read files directly from the HDFS Storage. You may be able to read objects from cloud storage +with the HDFS input source, but we highly recommend to use a proper +[Input Source](../../ingestion/native-batch.md#input-sources) instead if possible because +it is simple to set up. For now, only the [S3 input source](../../ingestion/native-batch.md#s3-input-source) +and the [Google Cloud Storage input source](../../ingestion/native-batch.md#google-cloud-storage-input-source) +are supported for cloud storage types, and so you may still want to use the HDFS input source +to read from cloud storage other than those two. + +### Hadoop-based ingestion + +If you use the [Hadoop ingestion](../../ingestion/hadoop.md), you can read data from HDFS +by specifying the paths in your [`inputSpec`](../../ingestion/hadoop.md#inputspec). +See the [Static](../../ingestion/hadoop.md#static) inputSpec for details. diff --git a/docs/development/extensions-core/kafka-ingestion.md b/docs/development/extensions-core/kafka-ingestion.md index de6504212b1..84f9531f944 100644 --- a/docs/development/extensions-core/kafka-ingestion.md +++ b/docs/development/extensions-core/kafka-ingestion.md @@ -60,22 +60,16 @@ A sample supervisor spec is shown below: "type": "kafka", "dataSchema": { "dataSource": "metrics-kafka", - "parser": { - "type": "string", - "parseSpec": { - "format": "json", - "timestampSpec": { - "column": "timestamp", - "format": "auto" - }, - "dimensionsSpec": { - "dimensions": [], - "dimensionExclusions": [ - "timestamp", - "value" - ] - } - } + "timestampSpec": { + "column": "timestamp", + "format": "auto" + }, + "dimensionsSpec": { + "dimensions": [], + "dimensionExclusions": [ + "timestamp", + "value" + ] }, "metricsSpec": [ { @@ -110,6 +104,9 @@ A sample supervisor spec is shown below: }, "ioConfig": { "topic": "metrics", + "inputFormat": { + "type": "json" + }, "consumerProperties": { "bootstrap.servers": "localhost:9092" }, @@ -196,6 +193,7 @@ For Roaring bitmaps: |Field|Type|Description|Required| |-----|----|-----------|--------| |`topic`|String|The Kafka topic to read from. This must be a specific topic as topic patterns are not supported.|yes| +|`inputFormat`|Object|[`inputFormat`](../../ingestion/data-formats.md#input-format) to specify how to parse input data. See [the below section](#specifying-data-format) for details about specifying the input format.|yes| |`consumerProperties`|Map|A map of properties to be passed to the Kafka consumer. This must contain a property `bootstrap.servers` with a list of Kafka brokers in the form: `:,:,...`. 
For SSL connections, the `keystore`, `truststore` and `key` passwords can be provided as a [Password Provider](../../operations/password-provider.md) or String password.|yes| |`pollTimeout`|Long|The length of time to wait for the Kafka consumer to poll records, in milliseconds|no (default == 100)| |`replicas`|Integer|The number of replica sets, where 1 means a single set of tasks (no replication). Replica tasks will always be assigned to different workers to provide resiliency against process failure.|no (default == 1)| @@ -209,6 +207,19 @@ For Roaring bitmaps: |`lateMessageRejectionPeriod`|ISO8601 Period|Configure tasks to reject messages with timestamps earlier than this period before the task was created; for example if this is set to `PT1H` and the supervisor creates a task at *2016-01-01T12:00Z*, messages with timestamps earlier than *2016-01-01T11:00Z* will be dropped. This may help prevent concurrency issues if your data stream has late messages and you have multiple pipelines that need to operate on the same segments (e.g. a realtime and a nightly batch ingestion pipeline). Please note that only one of `lateMessageRejectionPeriod` or `lateMessageRejectionStartDateTime` can be specified.|no (default == none)| |`earlyMessageRejectionPeriod`|ISO8601 Period|Configure tasks to reject messages with timestamps later than this period after the task reached its taskDuration; for example if this is set to `PT1H`, the taskDuration is set to `PT1H` and the supervisor creates a task at *2016-01-01T12:00Z*, messages with timestamps later than *2016-01-01T14:00Z* will be dropped. **Note:** Tasks sometimes run past their task duration, for example, in cases of supervisor failover. Setting earlyMessageRejectionPeriod too low may cause messages to be dropped unexpectedly whenever a task runs past its originally configured task duration.|no (default == none)| +#### Specifying data format + +Kafka indexing service supports both [`inputFormat`](../../ingestion/data-formats.md#input-format) and [`parser`](../../ingestion/data-formats.md#parser) to specify the data format. +The `inputFormat` is a new and recommended way to specify the data format for Kafka indexing service, +but unfortunately, it doesn't support all data formats supported by the legacy `parser`. +(They will be supported in the future.) + +The supported `inputFormat`s include [`csv`](../../ingestion/data-formats.md#csv), +[`delimited`](../../ingestion/data-formats.md#tsv-delimited), and [`json`](../../ingestion/data-formats.md#json). +You can also read [`avro_stream`](../../ingestion/data-formats.md#avro-stream-parser), +[`protobuf`](../../ingestion/data-formats.md#protobuf-parser), +and [`thrift`](../extensions-contrib/thrift.md) formats using `parser`. + ## Operations This section gives descriptions of how some supervisor APIs work specifically in Kafka Indexing Service. 
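As a sketch of how the `inputFormat` field described above can point at another supported format, the following `ioConfig` reads CSV records from the same `metrics` topic used in the sample supervisor spec. The column names are placeholders; see [`csv`](../../ingestion/data-formats.md#csv) for all available options.

```json
"ioConfig": {
  "topic": "metrics",
  "inputFormat": {
    "type": "csv",
    "findColumnsFromHeader": false,
    "columns": ["timestamp", "metric", "value"]
  },
  "consumerProperties": {
    "bootstrap.servers": "localhost:9092"
  },
  "taskCount": 1,
  "replicas": 1,
  "taskDuration": "PT1H"
}
```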
diff --git a/docs/development/extensions-core/kinesis-ingestion.md b/docs/development/extensions-core/kinesis-ingestion.md index dd2a6352ca6..ddec2ddd2a7 100644 --- a/docs/development/extensions-core/kinesis-ingestion.md +++ b/docs/development/extensions-core/kinesis-ingestion.md @@ -52,22 +52,16 @@ A sample supervisor spec is shown below: "type": "kinesis", "dataSchema": { "dataSource": "metrics-kinesis", - "parser": { - "type": "string", - "parseSpec": { - "format": "json", - "timestampSpec": { - "column": "timestamp", - "format": "auto" - }, - "dimensionsSpec": { - "dimensions": [], - "dimensionExclusions": [ - "timestamp", - "value" - ] - } - } + "timestampSpec": { + "column": "timestamp", + "format": "auto" + }, + "dimensionsSpec": { + "dimensions": [], + "dimensionExclusions": [ + "timestamp", + "value" + ] }, "metricsSpec": [ { @@ -102,6 +96,9 @@ A sample supervisor spec is shown below: }, "ioConfig": { "stream": "metrics", + "inputFormat": { + "type": "json" + }, "endpoint": "kinesis.us-east-1.amazonaws.com", "taskCount": 1, "replicas": 1, @@ -195,6 +192,7 @@ For Roaring bitmaps: |Field|Type|Description|Required| |-----|----|-----------|--------| |`stream`|String|The Kinesis stream to read.|yes| +|`inputFormat`|Object|[`inputFormat`](../../ingestion/data-formats.md#input-format) to specify how to parse input data. See [the below section](#specifying-data-format) for details about specifying the input format.|yes| |`endpoint`|String|The AWS Kinesis stream endpoint for a region. You can find a list of endpoints [here](http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region).|no (default == kinesis.us-east-1.amazonaws.com)| |`replicas`|Integer|The number of replica sets, where 1 means a single set of tasks (no replication). Replica tasks will always be assigned to different workers to provide resiliency against process failure.|no (default == 1)| |`taskCount`|Integer|The maximum number of *reading* tasks in a *replica set*. This means that the maximum number of reading tasks will be `taskCount * replicas` and the total number of tasks (*reading* + *publishing*) will be higher than this. See 'Capacity Planning' below for more details. The number of reading tasks will be less than `taskCount` if `taskCount > {numKinesisShards}`.|no (default == 1)| @@ -211,6 +209,19 @@ For Roaring bitmaps: |`awsExternalId`|String|The AWS external id to use for additional permissions.|no| |`deaggregate`|Boolean|Whether to use the de-aggregate function of the KCL. See below for details.|no| +#### Specifying data format + +Kinesis indexing service supports both [`inputFormat`](../../ingestion/data-formats.md#input-format) and [`parser`](../../ingestion/data-formats.md#parser) to specify the data format. +The `inputFormat` is a new and recommended way to specify the data format for Kinesis indexing service, +but unfortunately, it doesn't support all data formats supported by the legacy `parser`. +(They will be supported in the future.) + +The supported `inputFormat`s include [`csv`](../../ingestion/data-formats.md#csv), +[`delimited`](../../ingestion/data-formats.md#tsv-delimited), and [`json`](../../ingestion/data-formats.md#json). +You can also read [`avro_stream`](../../ingestion/data-formats.md#avro-stream-parser), +[`protobuf`](../../ingestion/data-formats.md#protobuf-parser), +and [`thrift`](../extensions-contrib/thrift.md) formats using `parser`. + ## Operations This section gives descriptions of how some supervisor APIs work specifically in Kinesis Indexing Service. 
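As a sketch of how the `inputFormat` field described above can point at another supported format, the following `ioConfig` reads tab-delimited records from the same `metrics` stream used in the sample supervisor spec. The column names are placeholders; see [`tsv`](../../ingestion/data-formats.md#tsv-delimited) for all available options.

```json
"ioConfig": {
  "stream": "metrics",
  "inputFormat": {
    "type": "tsv",
    "findColumnsFromHeader": false,
    "columns": ["timestamp", "metric", "value"]
  },
  "endpoint": "kinesis.us-east-1.amazonaws.com",
  "taskCount": 1,
  "replicas": 1,
  "taskDuration": "PT1H"
}
```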
diff --git a/docs/development/extensions-core/mysql.md b/docs/development/extensions-core/mysql.md index 2ce85ddc5fc..79792a1839d 100644 --- a/docs/development/extensions-core/mysql.md +++ b/docs/development/extensions-core/mysql.md @@ -108,7 +108,7 @@ Copy or symlink this file to `extensions/mysql-metadata-storage` under the distr ### MySQL Firehose -The MySQL extension provides an implementation of an [SqlFirehose](../../ingestion/native-batch.md#firehoses) which can be used to ingest data into Druid from a MySQL database. +The MySQL extension provides an implementation of an [SqlFirehose](../../ingestion/native-batch.md#firehoses-deprecated) which can be used to ingest data into Druid from a MySQL database. ```json { diff --git a/docs/development/extensions-core/orc.md b/docs/development/extensions-core/orc.md index 4510250d1e2..26e79104cdb 100644 --- a/docs/development/extensions-core/orc.md +++ b/docs/development/extensions-core/orc.md @@ -28,239 +28,9 @@ Apache ORC files. To use this extension, make sure to [include](../../development/extensions.md#loading-extensions) `druid-orc-extensions`. -## ORC Hadoop Parser - -The `inputFormat` of `inputSpec` in `ioConfig` must be set to `"org.apache.orc.mapreduce.OrcInputFormat"`. - - -|Field | Type | Description | Required| -|----------|-------------|----------------------------------------------------------------------------------------|---------| -|type | String | This should say `orc` | yes| -|parseSpec | JSON Object | Specifies the timestamp and dimensions of the data (`timeAndDims` and `orc` format) and a `flattenSpec` (`orc` format) | yes| - -The parser supports two `parseSpec` formats: `orc` and `timeAndDims`. - -`orc` supports auto field discovery and flattening, if specified with a [`flattenSpec`](../../ingestion/index.md#flattenspec). -If no `flattenSpec` is specified, `useFieldDiscovery` will be enabled by default. Specifying a `dimensionSpec` is -optional if `useFieldDiscovery` is enabled: if a `dimensionSpec` is supplied, the list of `dimensions` it defines will be -the set of ingested dimensions, if missing the discovered fields will make up the list. - -`timeAndDims` parse spec must specify which fields will be extracted as dimensions through the `dimensionSpec`. - -[All column types](https://orc.apache.org/docs/types.html) are supported, with the exception of `union` types. Columns of - `list` type, if filled with primitives, may be used as a multi-value dimension, or specific elements can be extracted with -`flattenSpec` expressions. Likewise, primitive fields may be extracted from `map` and `struct` types in the same manner. -Auto field discovery will automatically create a string dimension for every (non-timestamp) primitive or `list` of -primitives, as well as any flatten expressions defined in the `flattenSpec`. - -### Hadoop job properties -Like most Hadoop jobs, the best outcomes will add `"mapreduce.job.user.classpath.first": "true"` or -`"mapreduce.job.classloader": "true"` to the `jobProperties` section of `tuningConfig`. Note that it is likely if using -`"mapreduce.job.classloader": "true"` that you will need to set `mapreduce.job.classloader.system.classes` to include -`-org.apache.hadoop.hive.` to instruct Hadoop to load `org.apache.hadoop.hive` classes from the application jars instead -of system jars, e.g. - -```json -... 
- "mapreduce.job.classloader": "true", - "mapreduce.job.classloader.system.classes" : "java., javax.accessibility., javax.activation., javax.activity., javax.annotation., javax.annotation.processing., javax.crypto., javax.imageio., javax.jws., javax.lang.model., -javax.management.j2ee., javax.management., javax.naming., javax.net., javax.print., javax.rmi., javax.script., -javax.security.auth.message., javax.security.auth., javax.security.cert., javax.security.sasl., javax.sound., javax.sql., javax.swing., javax.tools., javax.transaction., -javax.xml.registry., -javax.xml.rpc., javax.xml., org.w3c.dom., org.xml.sax., org.apache.commons.logging., org.apache.log4j., -org.apache.hadoop.hbase., -org.apache.hadoop.hive., org.apache.hadoop., core-default.xml, hdfs-default.xml, mapred-default.xml, yarn-default.xml", -... -``` - -This is due to the `hive-storage-api` dependency of the -`orc-mapreduce` library, which provides some classes under the `org.apache.hadoop.hive` package. If instead using the -setting `"mapreduce.job.user.classpath.first": "true"`, then this will not be an issue. - -### Examples - -#### `orc` parser, `orc` parseSpec, auto field discovery, flatten expressions - -```json -{ - "type": "index_hadoop", - "spec": { - "ioConfig": { - "type": "hadoop", - "inputSpec": { - "type": "static", - "inputFormat": "org.apache.orc.mapreduce.OrcInputFormat", - "paths": "path/to/file.orc" - }, - ... - }, - "dataSchema": { - "dataSource": "example", - "parser": { - "type": "orc", - "parseSpec": { - "format": "orc", - "flattenSpec": { - "useFieldDiscovery": true, - "fields": [ - { - "type": "path", - "name": "nestedDim", - "expr": "$.nestedData.dim1" - }, - { - "type": "path", - "name": "listDimFirstItem", - "expr": "$.listDim[1]" - } - ] - }, - "timestampSpec": { - "column": "timestamp", - "format": "millis" - } - } - }, - ... - }, - "tuningConfig": - } - } -} -``` - -#### `orc` parser, `orc` parseSpec, field discovery with no flattenSpec or dimensionSpec - -```json -{ - "type": "index_hadoop", - "spec": { - "ioConfig": { - "type": "hadoop", - "inputSpec": { - "type": "static", - "inputFormat": "org.apache.orc.mapreduce.OrcInputFormat", - "paths": "path/to/file.orc" - }, - ... - }, - "dataSchema": { - "dataSource": "example", - "parser": { - "type": "orc", - "parseSpec": { - "format": "orc", - "timestampSpec": { - "column": "timestamp", - "format": "millis" - } - } - }, - ... - }, - "tuningConfig": - } - } -} -``` - -#### `orc` parser, `orc` parseSpec, no autodiscovery - -```json -{ - "type": "index_hadoop", - "spec": { - "ioConfig": { - "type": "hadoop", - "inputSpec": { - "type": "static", - "inputFormat": "org.apache.orc.mapreduce.OrcInputFormat", - "paths": "path/to/file.orc" - }, - ... - }, - "dataSchema": { - "dataSource": "example", - "parser": { - "type": "orc", - "parseSpec": { - "format": "orc", - "flattenSpec": { - "useFieldDiscovery": false, - "fields": [ - { - "type": "path", - "name": "nestedDim", - "expr": "$.nestedData.dim1" - }, - { - "type": "path", - "name": "listDimFirstItem", - "expr": "$.listDim[1]" - } - ] - }, - "timestampSpec": { - "column": "timestamp", - "format": "millis" - }, - "dimensionsSpec": { - "dimensions": [ - "dim1", - "dim3", - "nestedDim", - "listDimFirstItem" - ], - "dimensionExclusions": [], - "spatialDimensions": [] - } - } - }, - ... 
- }, - "tuningConfig": - } - } -} -``` - -#### `orc` parser, `timeAndDims` parseSpec -```json -{ - "type": "index_hadoop", - "spec": { - "ioConfig": { - "type": "hadoop", - "inputSpec": { - "type": "static", - "inputFormat": "org.apache.orc.mapreduce.OrcInputFormat", - "paths": "path/to/file.orc" - }, - ... - }, - "dataSchema": { - "dataSource": "example", - "parser": { - "type": "orc", - "parseSpec": { - "format": "timeAndDims", - "timestampSpec": { - "column": "timestamp", - "format": "auto" - }, - "dimensionsSpec": { - "dimensions": [ - "dim1", - "dim2", - "dim3", - "listDim" - ], - "dimensionExclusions": [], - "spatialDimensions": [] - } - } - }, - ... - }, - "tuningConfig": - } -} - -``` +The `druid-orc-extensions` provides the [ORC input format](../../ingestion/data-formats.md#orc) and the [ORC Hadoop parser](../../ingestion/data-formats.md#orc-hadoop-parser) +for [native batch ingestion](../../ingestion/native-batch.md) and [Hadoop batch ingestion](../../ingestion/hadoop.md), respectively. +Please see corresponding docs for details. ### Migration from 'contrib' extension This extension, first available in version 0.15.0, replaces the previous 'contrib' extension which was available until diff --git a/docs/development/extensions-core/parquet.md b/docs/development/extensions-core/parquet.md index 9fbdb4401f2..614e5dcd232 100644 --- a/docs/development/extensions-core/parquet.md +++ b/docs/development/extensions-core/parquet.md @@ -29,233 +29,8 @@ Apache Parquet files. Note: If using the `parquet-avro` parser for Apache Hadoop based indexing, `druid-parquet-extensions` depends on the `druid-avro-extensions` module, so be sure to [include both](../../development/extensions.md#loading-extensions). -## Parquet and Native Batch -This extension provides a `parquet` input format which can be used with Druid [native batch ingestion](../../ingestion/native-batch.md). - -### Parquet InputFormat -|Field | Type | Description | Required| -|---|---|---|---| -|type| String| This should be set to `parquet` to read Parquet file| yes | -|flattenSpec| JSON Object |Define a [`flattenSpec`](../../ingestion/index.md#flattenspec) to extract nested values from a Parquet file. Note that only 'path' expression are supported ('jq' is unavailable).| no (default will auto-discover 'root' level properties) | -| binaryAsString | Boolean | Specifies if the bytes parquet column which is not logically marked as a string or enum type should be treated as a UTF-8 encoded string. | no (default == false) | - -### Example - -```json - ... - "ioConfig": { - "type": "index_parallel", - "inputSource": { - "type": "local", - "baseDir": "/some/path/to/file/", - "filter": "file.parquet" - }, - "inputFormat": { - "type": "parquet" - "flattenSpec": { - "useFieldDiscovery": true, - "fields": [ - { - "type": "path", - "name": "nested", - "expr": "$.path.to.nested" - } - ] - } - "binaryAsString": false - }, - ... - } - ... 
-``` -## Parquet Hadoop Parser - -For Hadoop, this extension provides two parser implementations for reading Parquet files: - -* `parquet` - using a simple conversion contained within this extension -* `parquet-avro` - conversion to avro records with the `parquet-avro` library and using the `druid-avro-extensions` - module to parse the avro data - -Selection of conversion method is controlled by parser type, and the correct hadoop input format must also be set in -the `ioConfig`: - -* `org.apache.druid.data.input.parquet.DruidParquetInputFormat` for `parquet` -* `org.apache.druid.data.input.parquet.DruidParquetAvroInputFormat` for `parquet-avro` - - -Both parse options support auto field discovery and flattening if provided with a -[`flattenSpec`](../../ingestion/index.md#flattenspec) with `parquet` or `avro` as the format. Parquet nested list and map -[logical types](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md) _should_ operate correctly with -JSON path expressions for all supported types. `parquet-avro` sets a hadoop job property -`parquet.avro.add-list-element-records` to `false` (which normally defaults to `true`), in order to 'unwrap' primitive -list elements into multi-value dimensions. - -The `parquet` parser supports `int96` Parquet values, while `parquet-avro` does not. There may also be some subtle -differences in the behavior of JSON path expression evaluation of `flattenSpec`. - -We suggest using `parquet` over `parquet-avro` to allow ingesting data beyond the schema constraints of Avro conversion. -However, `parquet-avro` was the original basis for this extension, and as such it is a bit more mature. - - -|Field | Type | Description | Required| -|----------|-------------|----------------------------------------------------------------------------------------|---------| -| type | String | Choose `parquet` or `parquet-avro` to determine how Parquet files are parsed | yes | -| parseSpec | JSON Object | Specifies the timestamp and dimensions of the data, and optionally, a flatten spec. Valid parseSpec formats are `timeAndDims`, `parquet`, `avro` (if used with avro conversion). | yes | -| binaryAsString | Boolean | Specifies if the bytes parquet column which is not logically marked as a string or enum type should be treated as a UTF-8 encoded string. | no(default == false) | - -When the time dimension is a [DateType column](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md), a format should not be supplied. When the format is UTF8 (String), either `auto` or a explicitly defined [format](http://www.joda.org/joda-time/apidocs/org/joda/time/format/DateTimeFormat.html) is required. - -### Examples - -#### `parquet` parser, `parquet` parseSpec -```json -{ - "type": "index_hadoop", - "spec": { - "ioConfig": { - "type": "hadoop", - "inputSpec": { - "type": "static", - "inputFormat": "org.apache.druid.data.input.parquet.DruidParquetInputFormat", - "paths": "path/to/file.parquet" - }, - ... - }, - "dataSchema": { - "dataSource": "example", - "parser": { - "type": "parquet", - "parseSpec": { - "format": "parquet", - "flattenSpec": { - "useFieldDiscovery": true, - "fields": [ - { - "type": "path", - "name": "nestedDim", - "expr": "$.nestedData.dim1" - }, - { - "type": "path", - "name": "listDimFirstItem", - "expr": "$.listDim[1]" - } - ] - }, - "timestampSpec": { - "column": "timestamp", - "format": "auto" - }, - "dimensionsSpec": { - "dimensions": [], - "dimensionExclusions": [], - "spatialDimensions": [] - } - } - }, - ... 
- }, - "tuningConfig": - } - } -} -``` - -#### `parquet` parser, `timeAndDims` parseSpec -```json -{ - "type": "index_hadoop", - "spec": { - "ioConfig": { - "type": "hadoop", - "inputSpec": { - "type": "static", - "inputFormat": "org.apache.druid.data.input.parquet.DruidParquetInputFormat", - "paths": "path/to/file.parquet" - }, - ... - }, - "dataSchema": { - "dataSource": "example", - "parser": { - "type": "parquet", - "parseSpec": { - "format": "timeAndDims", - "timestampSpec": { - "column": "timestamp", - "format": "auto" - }, - "dimensionsSpec": { - "dimensions": [ - "dim1", - "dim2", - "dim3", - "listDim" - ], - "dimensionExclusions": [], - "spatialDimensions": [] - } - } - }, - ... - }, - "tuningConfig": - } -} - -``` -#### `parquet-avro` parser, `avro` parseSpec -```json -{ - "type": "index_hadoop", - "spec": { - "ioConfig": { - "type": "hadoop", - "inputSpec": { - "type": "static", - "inputFormat": "org.apache.druid.data.input.parquet.DruidParquetAvroInputFormat", - "paths": "path/to/file.parquet" - }, - ... - }, - "dataSchema": { - "dataSource": "example", - "parser": { - "type": "parquet-avro", - "parseSpec": { - "format": "avro", - "flattenSpec": { - "useFieldDiscovery": true, - "fields": [ - { - "type": "path", - "name": "nestedDim", - "expr": "$.nestedData.dim1" - }, - { - "type": "path", - "name": "listDimFirstItem", - "expr": "$.listDim[1]" - } - ] - }, - "timestampSpec": { - "column": "timestamp", - "format": "auto" - }, - "dimensionsSpec": { - "dimensions": [], - "dimensionExclusions": [], - "spatialDimensions": [] - } - } - }, - ... - }, - "tuningConfig": - } - } -} -``` - -For additional details see [Hadoop ingestion](../../ingestion/hadoop.md) and [general ingestion spec](../../ingestion/index.md) documentation. +The `druid-parquet-extensions` provides the [Parquet input format](../../ingestion/data-formats.md#parquet), the [Parquet Hadoop parser](../../ingestion/data-formats.md#parquet-hadoop-parser), +and the [Parquet Avro Hadoop Parser](../../ingestion/data-formats.md#parquet-avro-hadoop-parser) with `druid-avro-extensions`. +The Parquet input format is available for [native batch ingestion](../../ingestion/native-batch.md) +and the other 2 parsers are for [Hadoop batch ingestion](../../ingestion/hadoop.md). +Please see corresponding docs for details. diff --git a/docs/development/extensions-core/postgresql.md b/docs/development/extensions-core/postgresql.md index 4be3a7678df..1fc2e202ad6 100644 --- a/docs/development/extensions-core/postgresql.md +++ b/docs/development/extensions-core/postgresql.md @@ -87,7 +87,7 @@ In most cases, the configuration options map directly to the [postgres JDBC conn ### PostgreSQL Firehose -The PostgreSQL extension provides an implementation of an [SqlFirehose](../../ingestion/native-batch.md#firehoses) which can be used to ingest data into Druid from a PostgreSQL database. +The PostgreSQL extension provides an implementation of an [SqlFirehose](../../ingestion/native-batch.md#firehoses-deprecated) which can be used to ingest data into Druid from a PostgreSQL database. ```json { diff --git a/docs/development/extensions-core/protobuf.md b/docs/development/extensions-core/protobuf.md index c90e597b63c..c06620366e9 100644 --- a/docs/development/extensions-core/protobuf.md +++ b/docs/development/extensions-core/protobuf.md @@ -25,15 +25,8 @@ title: "Protobuf" This Apache Druid extension enables Druid to ingest and understand the Protobuf data format. 
Make sure to [include](../../development/extensions.md#loading-extensions) `druid-protobuf-extensions` as an extension. -## Protobuf Parser - - -| Field | Type | Description | Required | -|-------|------|-------------|----------| -| type | String | This should say `protobuf`. | no | -| descriptor | String | Protobuf descriptor file name in the classpath or URL. | yes | -| protoMessageType | String | Protobuf message type in the descriptor. Both short name and fully qualified name are accepted. The parser uses the first message type found in the descriptor if not specified. | no | -| parseSpec | JSON Object | Specifies the timestamp and dimensions of the data. The format must be JSON. See [JSON ParseSpec](../../ingestion/index.md) for more configuration options. Please note timeAndDims parseSpec is no longer supported. | yes | +The `druid-protobuf-extensions` provides the [Protobuf Parser](../../ingestion/data-formats.md#protobuf-parser) +for [stream ingestion](../../ingestion/index.md#streaming). See corresponding docs for details. ## Example: Load Protobuf messages from Kafka diff --git a/docs/development/extensions-core/s3.md b/docs/development/extensions-core/s3.md index 93d9a36f6b2..ef3539b4f14 100644 --- a/docs/development/extensions-core/s3.md +++ b/docs/development/extensions-core/s3.md @@ -98,111 +98,8 @@ You can enable [server-side encryption](https://docs.aws.amazon.com/AmazonS3/lat - kms: [Server-side encryption with AWS KMS–Managed Keys](https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingKMSEncryption.html) - custom: [Server-side encryption with Customer-Provided Encryption Keys](https://docs.aws.amazon.com/AmazonS3/latest/dev/ServerSideEncryptionCustomerKeys.html) +## Reading data from S3 - - -## S3 batch ingestion input source - -This extension also provides an input source for Druid native batch ingestion to support reading objects directly from S3. Objects can be specified either via a list of S3 URI strings or a list of S3 location prefixes, which will attempt to list the contents and ingest all objects contained in the locations. The S3 input source is splittable and can be used by [native parallel index tasks](../../ingestion/native-batch.md#parallel-task), where each worker task of `index_parallel` will read a single object. - -Sample spec: - -```json -... - "ioConfig": { - "type": "index_parallel", - "inputSource": { - "type": "s3", - "uris": ["s3://foo/bar/file.json", "s3://bar/foo/file2.json"] - }, - "inputFormat": { - "type": "json" - }, - ... - }, -... -``` - -```json -... - "ioConfig": { - "type": "index_parallel", - "inputSource": { - "type": "s3", - "prefixes": ["s3://foo/bar", "s3://bar/foo"] - }, - "inputFormat": { - "type": "json" - }, - ... - }, -... -``` - - -```json -... - "ioConfig": { - "type": "index_parallel", - "inputSource": { - "type": "s3", - "objects": [ - { "bucket": "foo", "path": "bar/file1.json"}, - { "bucket": "bar", "path": "foo/file2.json"} - ] - }, - "inputFormat": { - "type": "json" - }, - ... - }, -... 
-``` - -|property|description|default|required?| -|--------|-----------|-------|---------| -|type|This should be `s3`.|N/A|yes| -|uris|JSON array of URIs where S3 objects to be ingested are located.|N/A|`uris` or `prefixes` or `objects` must be set| -|prefixes|JSON array of URI prefixes for the locations of S3 objects to be ingested.|N/A|`uris` or `prefixes` or `objects` must be set| -|objects|JSON array of S3 Objects to be ingested.|N/A|`uris` or `prefixes` or `objects` must be set| - - -S3 Object: - -|property|description|default|required?| -|--------|-----------|-------|---------| -|bucket|Name of the S3 bucket|N/A|yes| -|path|The path where data is located.|N/A|yes| - - - -## StaticS3Firehose - -This firehose ingests events from a predefined list of S3 objects. -This firehose is _splittable_ and can be used by [native parallel index tasks](../../ingestion/native-batch.md#parallel-task). -Since each split represents an object in this firehose, each worker task of `index_parallel` will read an object. - -Sample spec: - -```json -"firehose" : { - "type" : "static-s3", - "uris": ["s3://foo/bar/file.gz", "s3://bar/foo/file2.gz"] -} -``` - -This firehose provides caching and prefetching features. In IndexTask, a firehose can be read twice if intervals or -shardSpecs are not specified, and, in this case, caching can be useful. Prefetching is preferred when direct scan of objects is slow. - -|property|description|default|required?| -|--------|-----------|-------|---------| -|type|This should be `static-s3`.|N/A|yes| -|uris|JSON array of URIs where s3 files to be ingested are located.|N/A|`uris` or `prefixes` must be set| -|prefixes|JSON array of URI prefixes for the locations of s3 files to be ingested.|N/A|`uris` or `prefixes` must be set| -|maxCacheCapacityBytes|Maximum size of the cache space in bytes. 0 means disabling cache. Cached files are not removed until the ingestion task completes.|1073741824|no| -|maxFetchCapacityBytes|Maximum size of the fetch space in bytes. 0 means disabling prefetch. Prefetched files are removed immediately once they are read.|1073741824|no| -|prefetchTriggerBytes|Threshold to trigger prefetching s3 objects.|maxFetchCapacityBytes / 2|no| -|fetchTimeout|Timeout for fetching an s3 object.|60000|no| -|maxFetchRetry|Maximum retry for fetching an s3 object.|3|no| - - +The [S3 input source](../../ingestion/native-batch.md#s3-input-source) is supported by the [Parallel task](../../ingestion/native-batch.md#parallel-task) +to read objects directly from S3. If you use the [Hadoop task](../../ingestion/hadoop.md), +you can read data from S3 by specifying the S3 paths in your [`inputSpec`](../../ingestion/hadoop.md#inputspec). 
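For example, a minimal `ioConfig` for the Parallel task might look like the sketch below. The bucket and prefix are placeholders; see the linked input source documentation for the full list of supported properties (`uris`, `prefixes`, or `objects`).

```json
...
  "ioConfig": {
    "type": "index_parallel",
    "inputSource": {
      "type": "s3",
      "prefixes": ["s3://your-bucket/path/to/data"]
    },
    "inputFormat": {
      "type": "json"
    },
    ...
  },
...
```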
diff --git a/docs/development/javascript.md b/docs/development/javascript.md index 0802c93c912..c05acc9e62b 100644 --- a/docs/development/javascript.md +++ b/docs/development/javascript.md @@ -34,7 +34,7 @@ JavaScript can be used to extend Druid in a variety of ways: - [Extraction functions](../querying/dimensionspecs.html#javascript-extraction-function) - [Filters](../querying/filters.html#javascript-filter) - [Post-aggregators](../querying/post-aggregations.html#javascript-post-aggregator) -- [Input parsers](../ingestion/data-formats.html#javascript) +- [Input parsers](../ingestion/data-formats.html#javascript-parsespec) - [Router strategy](../design/router.html#javascript) - [Worker select strategy](../configuration/index.html#javascript-worker-select-strategy) diff --git a/docs/development/modules.md b/docs/development/modules.md index de2bbc1e6fe..7ef5c971920 100644 --- a/docs/development/modules.md +++ b/docs/development/modules.md @@ -31,9 +31,11 @@ Druid's extensions leverage Guice in order to add things at runtime. Basically, 1. Add a new deep storage implementation by extending the `org.apache.druid.segment.loading.DataSegment*` and `org.apache.druid.tasklogs.TaskLog*` classes. -1. Add a new Firehose by extending `org.apache.druid.data.input.FirehoseFactory`. -1. Add a new input parser by extending `org.apache.druid.data.input.impl.InputRowParser`. -1. Add a new string-based input format by extending `org.apache.druid.data.input.impl.ParseSpec`. +1. Add a new input source by extending `org.apache.druid.data.input.InputSource`. +1. Add a new input entity by extending `org.apache.druid.data.input.InputEntity`. +1. Add a new input source reader if necessary by extending `org.apache.druid.data.input.InputSourceReader`. You can use `org.apache.druid.data.input.impl.InputEntityIteratingReader` in most cases. +1. Add a new input format by extending `org.apache.druid.data.input.InputFormat`. +1. Add a new input entity reader by extending `org.apache.druid.data.input.TextReader` for text formats or `org.apache.druid.data.input.IntermediateRowParsingReader` for binary formats. 1. Add Aggregators by extending `org.apache.druid.query.aggregation.AggregatorFactory`, `org.apache.druid.query.aggregation.Aggregator`, and `org.apache.druid.query.aggregation.BufferAggregator`. 1. Add PostAggregators by extending `org.apache.druid.query.aggregation.PostAggregator`. @@ -57,7 +59,7 @@ The DruidModule class is has two methods The `configure(Binder)` method is the same method that a normal Guice module would have. -The `getJacksonModules()` method provides a list of Jackson modules that are used to help initialize the Jackson ObjectMapper instances used by Druid. This is how you add extensions that are instantiated via Jackson (like AggregatorFactory and Firehose objects) to Druid. +The `getJacksonModules()` method provides a list of Jackson modules that are used to help initialize the Jackson ObjectMapper instances used by Druid. This is how you add extensions that are instantiated via Jackson (like AggregatorFactory and InputSource objects) to Druid. ### Registering your Druid Module @@ -148,29 +150,43 @@ To start a segment killing task, you need to access the old Coordinator console After the killing task ends, `index.zip` (`partitionNum_index.zip` for HDFS data storage) file should be deleted from the data storage. -### Adding a new Firehose +### Adding support for a new input source -There is an example of this in the `s3-extensions` module with the StaticS3FirehoseFactory. 
+Adding support for a new input source requires to implement three interfaces, i.e., `InputSource`, `InputEntity`, and `InputSourceReader`. +`InputSource` is to define where the input data is stored. `InputEntity` is to define how data can be read in parallel +in [native parallel indexing](../ingestion/native-batch.md). +`InputSourceReader` defines how to read your new input source and you can simply use the provided `InputEntityIteratingReader` in most cases. -Adding a Firehose is done almost entirely through the Jackson Modules instead of Guice. Specifically, note the implementation +There is an example of this in the `druid-s3-extensions` module with the `S3InputSource` and `S3Entity`. + +Adding an InputSource is done almost entirely through the Jackson Modules instead of Guice. Specifically, note the implementation ``` java @Override public List getJacksonModules() { return ImmutableList.of( - new SimpleModule().registerSubtypes(new NamedType(StaticS3FirehoseFactory.class, "static-s3")) + new SimpleModule().registerSubtypes(new NamedType(S3InputSource.class, "s3")) ); } ``` -This is registering the FirehoseFactory with Jackson's polymorphic serialization/deserialization layer. More concretely, having this will mean that if you specify a `"firehose": { "type": "static-s3", ... }` in your realtime config, then the system will load this FirehoseFactory for your firehose. +This is registering the InputSource with Jackson's polymorphic serialization/deserialization layer. More concretely, having this will mean that if you specify a `"inputSource": { "type": "s3", ... }` in your IO config, then the system will load this InputSource for your `InputSource` implementation. -Note that inside of Druid, we have made the @JacksonInject annotation for Jackson deserialized objects actually use the base Guice injector to resolve the object to be injected. So, if your FirehoseFactory needs access to some object, you can add a @JacksonInject annotation on a setter and it will get set on instantiation. +Note that inside of Druid, we have made the `@JacksonInject` annotation for Jackson deserialized objects actually use the base Guice injector to resolve the object to be injected. So, if your InputSource needs access to some object, you can add a `@JacksonInject` annotation on a setter and it will get set on instantiation. + +### Adding support for a new data format + +Adding support for a new data format requires implementing two interfaces, i.e., `InputFormat` and `InputEntityReader`. +`InputFormat` is to define how your data is formatted. `InputEntityReader` is to define how to parse your data and convert into Druid `InputRow`. + +There is an example in the `druid-orc-extensions` module with the `OrcInputFormat` and `OrcReader`. + +Adding an InputFormat is very similar to adding an InputSource. They operate purely through Jackson and thus should just be additions to the Jackson modules returned by your DruidModule. ### Adding Aggregators -Adding AggregatorFactory objects is very similar to Firehose objects. They operate purely through Jackson and thus should just be additions to the Jackson modules returned by your DruidModule. +Adding AggregatorFactory objects is very similar to InputSource objects. They operate purely through Jackson and thus should just be additions to the Jackson modules returned by your DruidModule. 
### Adding Complex Metrics diff --git a/docs/ingestion/data-formats.md b/docs/ingestion/data-formats.md index b20f54a97e8..fc060a36a36 100644 --- a/docs/ingestion/data-formats.md +++ b/docs/ingestion/data-formats.md @@ -25,7 +25,9 @@ title: "Data formats" Apache Druid can ingest denormalized data in JSON, CSV, or a delimited form such as TSV, or any custom format. While most examples in the documentation use data in JSON format, it is not difficult to configure Druid to ingest any other delimited data. We welcome any contributions to new formats. -For additional data formats, please see our [extensions list](../development/extensions.md). +This page lists all default and core extension data formats supported by Druid. +For additional data formats supported with community extensions, +please see our [community extensions list](../development/extensions.md#community-extensions). ## Formatting the Data @@ -63,44 +65,1035 @@ _TSV (Delimited)_ Note that the CSV and TSV data do not contain column heads. This becomes important when you specify the data for ingesting. +Besides text formats, Druid also supports binary formats such as [Orc](#orc) and [Parquet](#parquet) formats. + ## Custom Formats Druid supports custom data formats and can use the `Regex` parser or the `JavaScript` parsers to parse these formats. Please note that using any of these parsers for parsing data will not be as efficient as writing a native Java parser or using an external stream processor. We welcome contributions of new Parsers. -## Configuration +## Input Format -All forms of Druid ingestion require some form of schema object. The format of the data to be ingested is specified using the`parseSpec` entry in your `dataSchema`. +> The Input Format is a new way to specify the data format of your input data which was introduced in 0.17.0. +Unfortunately, the Input Format doesn't support all data formats or ingestion methods supported by Druid yet. +Especially if you want to use the Hadoop ingestion, you still need to use the [Parser](#parser). +If your data is formatted in some format not listed in this section, please consider using the Parser instead. + +All forms of Druid ingestion require some form of schema object. The format of the data to be ingested is specified using the `inputFormat` entry in your [`ioConfig`](index.md#ioconfig). ### JSON +The `inputFormat` to load data of JSON format. An example is: + ```json - "parseSpec":{ - "format" : "json", - "timestampSpec" : { - "column" : "timestamp" - }, - "dimensionSpec" : { - "dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"] - } - } +"ioConfig": { + "inputFormat": { + "type": "json" + }, + ... +} ``` -If you have nested JSON, [Druid can automatically flatten it for you](index.md#flattenspec). +The JSON `inputFormat` has the following components: + +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| type | String | This should say `json`. | yes | +| flattenSpec | JSON Object | Specifies flattening configuration for nested JSON data. See [`flattenSpec`](#flattenspec) for more info. | no | +| featureSpec | JSON Object | [JSON parser features](https://github.com/FasterXML/jackson-core/wiki/JsonParser-Features) supported by Jackson library. Those features will be applied when parsing the input JSON data. | no | ### CSV +The `inputFormat` to load data of the CSV format. 
An example is: + ```json - "parseSpec": { - "format" : "csv", - "timestampSpec" : { - "column" : "timestamp" - }, +"ioConfig": { + "inputFormat": { + "type": "csv", + "columns" : ["timestamp","page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city","added","deleted","delta"] + }, + ... +} +``` + +The CSV `inputFormat` has the following components: + +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| type | String | This should say `csv`. | yes | +| listDelimiter | String | A custom delimiter for multi-value dimensions. | no (default == ctrl+A) | +| columns | JSON array | Specifies the columns of the data. The columns should be in the same order with the columns of your data. | yes if `findColumnsFromHeader` is false or missing | +| findColumnsFromHeader | Boolean | If this is set, the task will find the column names from the header row. Note that `skipHeaderRows` will be applied before finding column names from the header. For example, if you set `skipHeaderRows` to 2 and `findColumnsFromHeader` to true, the task will skip the first two lines and then extract column information from the third line. `columns` will be ignored if this is set to true. | no (default = false if `columns` is set; otherwise null) | +| skipHeaderRows | Integer | If this is set, the task will skip the first `skipHeaderRows` rows. | no (default = 0) | + +### TSV (Delimited) + +```json +"ioConfig": { + "inputFormat": { + "type": "tsv", "columns" : ["timestamp","page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city","added","deleted","delta"], - "dimensionsSpec" : { - "dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"] + "delimiter":"|" + }, + ... +} +``` + +The `inputFormat` to load data of a delimited format. An example is: + +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| type | String | This should say `tsv`. | yes | +| delimiter | String | A custom delimiter for data values. | no (default == `\t`) | +| listDelimiter | String | A custom delimiter for multi-value dimensions. | no (default == ctrl+A) | +| columns | JSON array | Specifies the columns of the data. The columns should be in the same order with the columns of your data. | yes if `findColumnsFromHeader` is false or missing | +| findColumnsFromHeader | Boolean | If this is set, the task will find the column names from the header row. Note that `skipHeaderRows` will be applied before finding column names from the header. For example, if you set `skipHeaderRows` to 2 and `findColumnsFromHeader` to true, the task will skip the first two lines and then extract column information from the third line. `columns` will be ignored if this is set to true. | no (default = false if `columns` is set; otherwise null) | +| skipHeaderRows | Integer | If this is set, the task will skip the first `skipHeaderRows` rows. | no (default = 0) | + +Be sure to change the `delimiter` to the appropriate delimiter for your data. Like CSV, you must specify the columns and which subset of the columns you want indexed. + +### ORC + +> You need to include the [`druid-orc-extensions`](../development/extensions-core/orc.md) as an extension to use the ORC input format. 
+ +> If you are considering upgrading from earlier than 0.15.0 to 0.15.0 or a higher version, +> please read [Migration from 'contrib' extension](../development/extensions-core/orc.md#migration-from-contrib-extension) carefully. + +The `inputFormat` to load data of the ORC format. An example is: + +```json +"ioConfig": { + "inputFormat": { + "type": "orc", + "flattenSpec": { + "useFieldDiscovery": true, + "fields": [ + { + "type": "path", + "name": "nested", + "expr": "$.path.to.nested" + } + ] + }, + "binaryAsString": false + }, + ... +} +``` + +The ORC `inputFormat` has the following components: + +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| type | String | This should say `orc`. | yes | +| flattenSpec | JSON Object | Specifies flattening configuration for nested ORC data. See [`flattenSpec`](#flattenspec) for more info. | no | +| binaryAsString | Boolean | Specifies if the binary orc column which is not logically marked as a string should be treated as a UTF-8 encoded string. | no (default == false) | + +### Parquet + +> You need to include the [`druid-parquet-extensions`](../development/extensions-core/parquet.md) as an extension to use the Parquet input format. + +The `inputFormat` to load data of the Parquet format. An example is: + +```json +"ioConfig": { + "inputFormat": { + "type": "parquet", + "flattenSpec": { + "useFieldDiscovery": true, + "fields": [ + { + "type": "path", + "name": "nested", + "expr": "$.path.to.nested" + } + ] + }, + "binaryAsString": false + }, + ... +} +``` + +The Parquet `inputFormat` has the following components: + +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| type | String | This should be set to `parquet` to read Parquet files. | yes | +| flattenSpec | JSON Object | Define a [`flattenSpec`](#flattenspec) to extract nested values from a Parquet file. Note that only 'path' expressions are supported ('jq' is unavailable). | no (default will auto-discover 'root' level properties) | +| binaryAsString | Boolean | Specifies if the bytes parquet column which is not logically marked as a string or enum type should be treated as a UTF-8 encoded string. | no (default == false) | + +### FlattenSpec + +The `flattenSpec` is located in `inputFormat` → `flattenSpec` and is responsible for +bridging the gap between potentially nested input data (such as JSON, Avro, etc.) and Druid's flat data model. +An example `flattenSpec` is: + +```json +"flattenSpec": { + "useFieldDiscovery": true, + "fields": [ + { "name": "baz", "type": "root" }, + { "name": "foo_bar", "type": "path", "expr": "$.foo.bar" }, + { "name": "first_food", "type": "jq", "expr": ".thing.food[1]" } + ] +} +``` +> Conceptually, after input data records are read, the `flattenSpec` is applied first before +> any other specs such as [`timestampSpec`](./index.md#timestampspec), [`transformSpec`](./index.md#transformspec), +> [`dimensionsSpec`](./index.md#dimensionsspec), or [`metricsSpec`](./index.md#metricsspec). Keep this in mind when writing +> your ingestion spec. + +Flattening is only supported for [data formats](data-formats.md) that support nesting, including `avro`, `json`, `orc`, +and `parquet`.
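+
+To make this concrete, here is a minimal sketch (reusing the field names from the example `flattenSpec` above) of how the flattened fields can then be referenced from a `dimensionsSpec`:
+
+```json
+"dimensionsSpec": {
+  "dimensions": ["baz", "foo_bar", "first_food"]
+}
+```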
+ +A `flattenSpec` can have the following components: + +| Field | Description | Default | +|-------|-------------|---------| +| useFieldDiscovery | If true, interpret all root-level fields as available fields for usage by [`timestampSpec`](./index.md#timestampspec), [`transformSpec`](./index.md#transformspec), [`dimensionsSpec`](./index.md#dimensionsspec), and [`metricsSpec`](./index.md#metricsspec).

If false, only explicitly specified fields (see `fields`) will be available for use. | `true` | +| fields | Specifies the fields of interest and how they are accessed. [See below for details.](#field-flattening-specifications) | `[]` | + +#### Field flattening specifications + +Each entry in the `fields` list can have the following components: + +| Field | Description | Default | +|-------|-------------|---------| +| type | Options are as follows:<br>• `root`, referring to a field at the root level of the record. Only really useful if `useFieldDiscovery` is false.<br>• `path`, referring to a field using [JsonPath](https://github.com/jayway/JsonPath) notation. Supported by most data formats that offer nesting, including `avro`, `json`, `orc`, and `parquet`.<br>• `jq`, referring to a field using [jackson-jq](https://github.com/eiiches/jackson-jq) notation. Only supported for the `json` format.
| none (required) | +| name | Name of the field after flattening. This name can be referred to by the [`timestampSpec`](./index.md#timestampspec), [`transformSpec`](./index.md#transformspec), [`dimensionsSpec`](./index.md#dimensionsspec), and [`metricsSpec`](./index.md#metricsspec).| none (required) | +| expr | Expression for accessing the field while flattening. For type `path`, this should be [JsonPath](https://github.com/jayway/JsonPath). For type `jq`, this should be [jackson-jq](https://github.com/eiiches/jackson-jq) notation. For other types, this parameter is ignored. | none (required for types `path` and `jq`) | + +#### Notes on flattening + +* For convenience, when defining a root-level field, it is possible to define only the field name, as a string, instead of a JSON object. For example, `{"name": "baz", "type": "root"}` is equivalent to `"baz"`. +* Enabling `useFieldDiscovery` will only automatically detect "simple" fields at the root level that correspond to data types that Druid supports. This includes strings, numbers, and lists of strings or numbers. Other types will not be automatically detected, and must be specified explicitly in the `fields` list. +* Duplicate field `name`s are not allowed. An exception will be thrown. +* If `useFieldDiscovery` is enabled, any discovered field with the same name as one already defined in the `fields` list will be skipped, rather than added twice. +* [http://jsonpath.herokuapp.com/](http://jsonpath.herokuapp.com/) is useful for testing `path`-type expressions. +* jackson-jq supports a subset of the full [jq](https://stedolan.github.io/jq/) syntax. Please refer to the [jackson-jq documentation](https://github.com/eiiches/jackson-jq) for details. + +## Parser + +> The Parser is deprecated for [native batch tasks](./native-batch.md), [Kafka indexing service](../development/extensions-core/kafka-ingestion.md), +and [Kinesis indexing service](../development/extensions-core/kinesis-ingestion.md). +Consider using the [input format](#input-format) instead for these types of ingestion. + +This section lists all default and core extension parsers. +For community extension parsers, please see our [community extensions list](../development/extensions.html#community-extensions). + +### String Parser + +`string` typed parsers operate on text based inputs that can be split into individual records by newlines. +Each line can be further parsed using [`parseSpec`](#parsespec). + +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| type | String | This should say `string` in general, or `hadoopyString` when used in a Hadoop indexing job. | yes | +| parseSpec | JSON Object | Specifies the format, timestamp, and dimensions of the data. | yes | + +### Avro Hadoop Parser + +> You need to include the [`druid-avro-extensions`](../development/extensions-core/avro.md) as an extension to use the Avro Hadoop Parser. + +This parser is for [Hadoop batch ingestion](./hadoop.md). +The `inputFormat` of `inputSpec` in `ioConfig` must be set to `"org.apache.druid.data.input.avro.AvroValueInputFormat"`. +You may want to set Avro reader's schema in `jobProperties` in `tuningConfig`, +e.g.: `"avro.schema.input.value.path": "/path/to/your/schema.avsc"` or +`"avro.schema.input.value": "your_schema_JSON_object"`, +if reader's schema is not set, the schema in Avro object container file will be used, +see [Avro specification](http://avro.apache.org/docs/1.7.7/spec.html#Schema+Resolution). 
+ +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| type | String | This should say `avro_hadoop`. | yes | +| parseSpec | JSON Object | Specifies the timestamp and dimensions of the data. Should be an "avro" parseSpec. | yes | + +An Avro parseSpec can contain a [`flattenSpec`](#flattenspec) using either the "root" or "path" +field types, which can be used to read nested Avro records. The "jq" field type is not currently supported for Avro. + +For example, using Avro Hadoop parser with custom reader's schema file: + +```json +{ + "type" : "index_hadoop", + "spec" : { + "dataSchema" : { + "dataSource" : "", + "parser" : { + "type" : "avro_hadoop", + "parseSpec" : { + "format": "avro", + "timestampSpec": , + "dimensionsSpec": , + "flattenSpec": + } + } + }, + "ioConfig" : { + "type" : "hadoop", + "inputSpec" : { + "type" : "static", + "inputFormat": "org.apache.druid.data.input.avro.AvroValueInputFormat", + "paths" : "" + } + }, + "tuningConfig" : { + "jobProperties" : { + "avro.schema.input.value.path" : "/path/to/my/schema.avsc" + } } } +} +``` + +### ORC Hadoop Parser + +> You need to include the [`druid-orc-extensions`](../development/extensions-core/orc.md) as an extension to use the ORC Hadoop Parser. + +> If you are considering upgrading from earlier than 0.15.0 to 0.15.0 or a higher version, +> please read [Migration from 'contrib' extension](../development/extensions-core/orc.md#migration-from-contrib-extension) carefully. + +This parser is for [Hadoop batch ingestion](./hadoop.md). +The `inputFormat` of `inputSpec` in `ioConfig` must be set to `"org.apache.orc.mapreduce.OrcInputFormat"`. + +|Field | Type | Description | Required| +|----------|-------------|----------------------------------------------------------------------------------------|---------| +|type | String | This should say `orc` | yes| +|parseSpec | JSON Object | Specifies the timestamp and dimensions of the data (`timeAndDims` and `orc` format) and a `flattenSpec` (`orc` format) | yes| + +The parser supports two `parseSpec` formats: `orc` and `timeAndDims`. + +`orc` supports auto field discovery and flattening, if specified with a [`flattenSpec`](#flattenspec). +If no `flattenSpec` is specified, `useFieldDiscovery` will be enabled by default. Specifying a `dimensionSpec` is +optional if `useFieldDiscovery` is enabled: if a `dimensionSpec` is supplied, the list of `dimensions` it defines will be +the set of ingested dimensions, if missing the discovered fields will make up the list. + +`timeAndDims` parse spec must specify which fields will be extracted as dimensions through the `dimensionSpec`. + +[All column types](https://orc.apache.org/docs/types.html) are supported, with the exception of `union` types. Columns of + `list` type, if filled with primitives, may be used as a multi-value dimension, or specific elements can be extracted with +`flattenSpec` expressions. Likewise, primitive fields may be extracted from `map` and `struct` types in the same manner. +Auto field discovery will automatically create a string dimension for every (non-timestamp) primitive or `list` of +primitives, as well as any flatten expressions defined in the `flattenSpec`. + +#### Hadoop job properties +Like most Hadoop jobs, the best outcomes will add `"mapreduce.job.user.classpath.first": "true"` or +`"mapreduce.job.classloader": "true"` to the `jobProperties` section of `tuningConfig`. 
Note that it is likely if using +`"mapreduce.job.classloader": "true"` that you will need to set `mapreduce.job.classloader.system.classes` to include +`-org.apache.hadoop.hive.` to instruct Hadoop to load `org.apache.hadoop.hive` classes from the application jars instead +of system jars, e.g. + +```json +... + "mapreduce.job.classloader": "true", + "mapreduce.job.classloader.system.classes" : "java., javax.accessibility., javax.activation., javax.activity., javax.annotation., javax.annotation.processing., javax.crypto., javax.imageio., javax.jws., javax.lang.model., -javax.management.j2ee., javax.management., javax.naming., javax.net., javax.print., javax.rmi., javax.script., -javax.security.auth.message., javax.security.auth., javax.security.cert., javax.security.sasl., javax.sound., javax.sql., javax.swing., javax.tools., javax.transaction., -javax.xml.registry., -javax.xml.rpc., javax.xml., org.w3c.dom., org.xml.sax., org.apache.commons.logging., org.apache.log4j., -org.apache.hadoop.hbase., -org.apache.hadoop.hive., org.apache.hadoop., core-default.xml, hdfs-default.xml, mapred-default.xml, yarn-default.xml", +... +``` + +This is due to the `hive-storage-api` dependency of the +`orc-mapreduce` library, which provides some classes under the `org.apache.hadoop.hive` package. If instead using the +setting `"mapreduce.job.user.classpath.first": "true"`, then this will not be an issue. + +#### Examples + +##### `orc` parser, `orc` parseSpec, auto field discovery, flatten expressions + +```json +{ + "type": "index_hadoop", + "spec": { + "ioConfig": { + "type": "hadoop", + "inputSpec": { + "type": "static", + "inputFormat": "org.apache.orc.mapreduce.OrcInputFormat", + "paths": "path/to/file.orc" + }, + ... + }, + "dataSchema": { + "dataSource": "example", + "parser": { + "type": "orc", + "parseSpec": { + "format": "orc", + "flattenSpec": { + "useFieldDiscovery": true, + "fields": [ + { + "type": "path", + "name": "nestedDim", + "expr": "$.nestedData.dim1" + }, + { + "type": "path", + "name": "listDimFirstItem", + "expr": "$.listDim[1]" + } + ] + }, + "timestampSpec": { + "column": "timestamp", + "format": "millis" + } + } + }, + ... + }, + "tuningConfig": + } + } +} +``` + +##### `orc` parser, `orc` parseSpec, field discovery with no flattenSpec or dimensionSpec + +```json +{ + "type": "index_hadoop", + "spec": { + "ioConfig": { + "type": "hadoop", + "inputSpec": { + "type": "static", + "inputFormat": "org.apache.orc.mapreduce.OrcInputFormat", + "paths": "path/to/file.orc" + }, + ... + }, + "dataSchema": { + "dataSource": "example", + "parser": { + "type": "orc", + "parseSpec": { + "format": "orc", + "timestampSpec": { + "column": "timestamp", + "format": "millis" + } + } + }, + ... + }, + "tuningConfig": + } + } +} +``` + +##### `orc` parser, `orc` parseSpec, no autodiscovery + +```json +{ + "type": "index_hadoop", + "spec": { + "ioConfig": { + "type": "hadoop", + "inputSpec": { + "type": "static", + "inputFormat": "org.apache.orc.mapreduce.OrcInputFormat", + "paths": "path/to/file.orc" + }, + ... 
+ }, + "dataSchema": { + "dataSource": "example", + "parser": { + "type": "orc", + "parseSpec": { + "format": "orc", + "flattenSpec": { + "useFieldDiscovery": false, + "fields": [ + { + "type": "path", + "name": "nestedDim", + "expr": "$.nestedData.dim1" + }, + { + "type": "path", + "name": "listDimFirstItem", + "expr": "$.listDim[1]" + } + ] + }, + "timestampSpec": { + "column": "timestamp", + "format": "millis" + }, + "dimensionsSpec": { + "dimensions": [ + "dim1", + "dim3", + "nestedDim", + "listDimFirstItem" + ], + "dimensionExclusions": [], + "spatialDimensions": [] + } + } + }, + ... + }, + "tuningConfig": + } + } +} +``` + +##### `orc` parser, `timeAndDims` parseSpec +```json +{ + "type": "index_hadoop", + "spec": { + "ioConfig": { + "type": "hadoop", + "inputSpec": { + "type": "static", + "inputFormat": "org.apache.orc.mapreduce.OrcInputFormat", + "paths": "path/to/file.orc" + }, + ... + }, + "dataSchema": { + "dataSource": "example", + "parser": { + "type": "orc", + "parseSpec": { + "format": "timeAndDims", + "timestampSpec": { + "column": "timestamp", + "format": "auto" + }, + "dimensionsSpec": { + "dimensions": [ + "dim1", + "dim2", + "dim3", + "listDim" + ], + "dimensionExclusions": [], + "spatialDimensions": [] + } + } + }, + ... + }, + "tuningConfig": + } +} + +``` + +### Parquet Hadoop Parser + +> You need to include the [`druid-parquet-extensions`](../development/extensions-core/parquet.md) as an extension to use the Parquet Hadoop Parser. + +The Parquet Hadoop parser is for [Hadoop batch ingestion](./hadoop.md) and parses Parquet files directly. +The `inputFormat` of `inputSpec` in `ioConfig` must be set to `org.apache.druid.data.input.parquet.DruidParquetInputFormat`. + +The Parquet Hadoop Parser supports auto field discovery and flattening if provided with a +[`flattenSpec`](#flattenspec) with the `parquet` `parseSpec`. Parquet nested list and map +[logical types](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md) _should_ operate correctly with +JSON path expressions for all supported types. + +|Field | Type | Description | Required| +|----------|-------------|----------------------------------------------------------------------------------------|---------| +| type | String | This should say `parquet`.| yes | +| parseSpec | JSON Object | Specifies the timestamp and dimensions of the data, and optionally, a flatten spec. Valid parseSpec formats are `timeAndDims` and `parquet` | yes | +| binaryAsString | Boolean | Specifies if the bytes parquet column which is not logically marked as a string or enum type should be treated as a UTF-8 encoded string. | no(default == false) | + +When the time dimension is a [DateType column](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md), +a format should not be supplied. When the format is UTF8 (String), either `auto` or a explicitly defined +[format](http://www.joda.org/joda-time/apidocs/org/joda/time/format/DateTimeFormat.html) is required. + +#### Parquet Hadoop Parser vs Parquet Avro Hadoop Parser + +Both parsers are to read Parquet files, but slightly different. The main +differences are: + +* The Parquet Hadoop Parser uses a simple conversion while the Parquet Avro Hadoop Parser +converts Parquet data into avro records first with the `parquet-avro` library and then +parses avro data using the `druid-avro-extensions` module to ingest into Druid. 
+* The Parquet Hadoop Parser sets a hadoop job property +`parquet.avro.add-list-element-records` to `false` (which normally defaults to `true`), in order to 'unwrap' primitive +list elements into multi-value dimensions. +* The Parquet Hadoop Parser supports `int96` Parquet values, while the Parquet Avro Hadoop Parser does not. +There may also be some subtle differences in the behavior of JSON path expression evaluation of `flattenSpec`. + +Based on those differences, we suggest using the Parquet Hadoop Parser over the Parquet Avro Hadoop Parser +to allow ingesting data beyond the schema constraints of Avro conversion. +However, the Parquet Avro Hadoop Parser was the original basis for supporting the Parquet format, and as such it is a bit more mature. + +#### Examples + +##### `parquet` parser, `parquet` parseSpec +```json +{ + "type": "index_hadoop", + "spec": { + "ioConfig": { + "type": "hadoop", + "inputSpec": { + "type": "static", + "inputFormat": "org.apache.druid.data.input.parquet.DruidParquetInputFormat", + "paths": "path/to/file.parquet" + }, + ... + }, + "dataSchema": { + "dataSource": "example", + "parser": { + "type": "parquet", + "parseSpec": { + "format": "parquet", + "flattenSpec": { + "useFieldDiscovery": true, + "fields": [ + { + "type": "path", + "name": "nestedDim", + "expr": "$.nestedData.dim1" + }, + { + "type": "path", + "name": "listDimFirstItem", + "expr": "$.listDim[1]" + } + ] + }, + "timestampSpec": { + "column": "timestamp", + "format": "auto" + }, + "dimensionsSpec": { + "dimensions": [], + "dimensionExclusions": [], + "spatialDimensions": [] + } + } + }, + ... + }, + "tuningConfig": + } + } +} +``` + +##### `parquet` parser, `timeAndDims` parseSpec +```json +{ + "type": "index_hadoop", + "spec": { + "ioConfig": { + "type": "hadoop", + "inputSpec": { + "type": "static", + "inputFormat": "org.apache.druid.data.input.parquet.DruidParquetInputFormat", + "paths": "path/to/file.parquet" + }, + ... + }, + "dataSchema": { + "dataSource": "example", + "parser": { + "type": "parquet", + "parseSpec": { + "format": "timeAndDims", + "timestampSpec": { + "column": "timestamp", + "format": "auto" + }, + "dimensionsSpec": { + "dimensions": [ + "dim1", + "dim2", + "dim3", + "listDim" + ], + "dimensionExclusions": [], + "spatialDimensions": [] + } + } + }, + ... + }, + "tuningConfig": + } +} + +``` + +### Parquet Avro Hadoop Parser + +> Consider using the [Parquet Hadoop Parser](#parquet-hadoop-parser) over this parser to ingest +Parquet files. See [Parquet Hadoop Parser vs Parquet Avro Hadoop Parser](#parquet-hadoop-parser-vs-parquet-avro-hadoop-parser) +for the differences between those parsers. + +> You need to include both the [`druid-parquet-extensions`](../development/extensions-core/parquet.md) +[`druid-avro-extensions`] as extensions to use the Parquet Avro Hadoop Parser. + +The Parquet Avro Hadoop Parser is for [Hadoop batch ingestion](./hadoop.md). +This parser first converts the Parquet data into Avro records, and then parses them to ingest into Druid. +The `inputFormat` of `inputSpec` in `ioConfig` must be set to `org.apache.druid.data.input.parquet.DruidParquetAvroInputFormat`. + +The Parquet Avro Hadoop Parser supports auto field discovery and flattening if provided with a +[`flattenSpec`](#flattenspec) with the `avro` `parseSpec`. Parquet nested list and map +[logical types](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md) _should_ operate correctly with +JSON path expressions for all supported types. 
This parser sets a hadoop job property +`parquet.avro.add-list-element-records` to `false` (which normally defaults to `true`), in order to 'unwrap' primitive +list elements into multi-value dimensions. + +Note that the `int96` Parquet value type is not supported with this parser. + +|Field | Type | Description | Required| +|----------|-------------|----------------------------------------------------------------------------------------|---------| +| type | String | This should say `parquet-avro`. | yes | +| parseSpec | JSON Object | Specifies the timestamp and dimensions of the data, and optionally, a flatten spec. Should be `avro`. | yes | +| binaryAsString | Boolean | Specifies if the bytes parquet column which is not logically marked as a string or enum type should be treated as a UTF-8 encoded string. | no(default == false) | + +When the time dimension is a [DateType column](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md), +a format should not be supplied. When the format is UTF8 (String), either `auto` or +an explicitly defined [format](http://www.joda.org/joda-time/apidocs/org/joda/time/format/DateTimeFormat.html) is required. + +#### Example + +```json +{ + "type": "index_hadoop", + "spec": { + "ioConfig": { + "type": "hadoop", + "inputSpec": { + "type": "static", + "inputFormat": "org.apache.druid.data.input.parquet.DruidParquetAvroInputFormat", + "paths": "path/to/file.parquet" + }, + ... + }, + "dataSchema": { + "dataSource": "example", + "parser": { + "type": "parquet-avro", + "parseSpec": { + "format": "avro", + "flattenSpec": { + "useFieldDiscovery": true, + "fields": [ + { + "type": "path", + "name": "nestedDim", + "expr": "$.nestedData.dim1" + }, + { + "type": "path", + "name": "listDimFirstItem", + "expr": "$.listDim[1]" + } + ] + }, + "timestampSpec": { + "column": "timestamp", + "format": "auto" + }, + "dimensionsSpec": { + "dimensions": [], + "dimensionExclusions": [], + "spatialDimensions": [] + } + } + }, + ... + }, + "tuningConfig": + } + } +} +``` + +### Avro Stream Parser + +> You need to include the [`druid-avro-extensions`](../development/extensions-core/avro.md) as an extension to use the Avro Stream Parser. + +This parser is for [stream ingestion](./index.md#streaming) and reads Avro data from a stream directly. + +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| type | String | This should say `avro_stream`. | no | +| avroBytesDecoder | JSON Object | Specifies how to decode bytes to Avro record. | yes | +| parseSpec | JSON Object | Specifies the timestamp and dimensions of the data. Should be an "avro" parseSpec. | yes | + +An Avro parseSpec can contain a [`flattenSpec`](#flattenspec) using either the "root" or "path" +field types, which can be used to read nested Avro records. The "jq" field type is not currently supported for Avro. + +For example, using Avro stream parser with schema repo Avro bytes decoder: + +```json +"parser" : { + "type" : "avro_stream", + "avroBytesDecoder" : { + "type" : "schema_repo", + "subjectAndIdConverter" : { + "type" : "avro_1124", + "topic" : "${YOUR_TOPIC}" + }, + "schemaRepository" : { + "type" : "avro_1124_rest_client", + "url" : "${YOUR_SCHEMA_REPO_END_POINT}", + } + }, + "parseSpec" : { + "format": "avro", + "timestampSpec": , + "dimensionsSpec": , + "flattenSpec": + } +} +``` + +#### Avro Bytes Decoder + +If `type` is not included, the avroBytesDecoder defaults to `schema_repo`. 
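+
+For a simpler end-to-end illustration, the same `avro_stream` parser can also be wired to the inline-schema decoder described in the next subsection. The schema, field names, and timestamp column below are illustrative only, not taken from the examples above:
+
+```json
+"parser" : {
+  "type" : "avro_stream",
+  "avroBytesDecoder" : {
+    "type" : "schema_inline",
+    "schema" : {
+      "namespace": "org.apache.druid.data",
+      "name": "User",
+      "type": "record",
+      "fields": [
+        { "name": "timestamp", "type": "long" },
+        { "name": "FullName", "type": "string" },
+        { "name": "Country", "type": "string" }
+      ]
+    }
+  },
+  "parseSpec" : {
+    "format": "avro",
+    "timestampSpec": { "column": "timestamp", "format": "millis" },
+    "dimensionsSpec": { "dimensions": ["FullName", "Country"] }
+  }
+}
+```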
+ +##### Inline Schema Based Avro Bytes Decoder + +> The "schema_inline" decoder reads Avro records using a fixed schema and does not support schema migration. If you +> may need to migrate schemas in the future, consider one of the other decoders, all of which use a message header that +> allows the parser to identify the proper Avro schema for reading records. + +This decoder can be used if all the input events can be read using the same schema. In that case schema can be specified in the input task JSON itself as described below. + +``` +... +"avroBytesDecoder": { + "type": "schema_inline", + "schema": { + //your schema goes here, for example + "namespace": "org.apache.druid.data", + "name": "User", + "type": "record", + "fields": [ + { "name": "FullName", "type": "string" }, + { "name": "Country", "type": "string" } + ] + } +} +... +``` + +##### Multiple Inline Schemas Based Avro Bytes Decoder + +This decoder can be used if different input events can have different read schema. In that case schema can be specified in the input task JSON itself as described below. + +``` +... +"avroBytesDecoder": { + "type": "multiple_schemas_inline", + "schemas": { + //your id -> schema map goes here, for example + "1": { + "namespace": "org.apache.druid.data", + "name": "User", + "type": "record", + "fields": [ + { "name": "FullName", "type": "string" }, + { "name": "Country", "type": "string" } + ] + }, + "2": { + "namespace": "org.apache.druid.otherdata", + "name": "UserIdentity", + "type": "record", + "fields": [ + { "name": "Name", "type": "string" }, + { "name": "Location", "type": "string" } + ] + }, + ... + ... + } +} +... +``` + +Note that it is essentially a map of integer schema ID to avro schema object. This parser assumes that record has following format. + first 1 byte is version and must always be 1. + next 4 bytes are integer schema ID serialized using big-endian byte order. + remaining bytes contain serialized avro message. + +##### SchemaRepo Based Avro Bytes Decoder + +This Avro bytes decoder first extract `subject` and `id` from input message bytes, then use them to lookup the Avro schema with which to decode Avro record from bytes. Details can be found in [schema repo](https://github.com/schema-repo/schema-repo) and [AVRO-1124](https://issues.apache.org/jira/browse/AVRO-1124). You will need an http service like schema repo to hold the avro schema. Towards schema registration on the message producer side, you can refer to `org.apache.druid.data.input.AvroStreamInputRowParserTest#testParse()`. + +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| type | String | This should say `schema_repo`. | no | +| subjectAndIdConverter | JSON Object | Specifies the how to extract subject and id from message bytes. | yes | +| schemaRepository | JSON Object | Specifies the how to lookup Avro schema from subject and id. | yes | + +###### Avro-1124 Subject And Id Converter + +This section describes the format of the `subjectAndIdConverter` object for the `schema_repo` Avro bytes decoder. + +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| type | String | This should say `avro_1124`. | no | +| topic | String | Specifies the topic of your Kafka stream. | yes | + + +###### Avro-1124 Schema Repository + +This section describes the format of the `schemaRepository` object for the `schema_repo` Avro bytes decoder. 
+ +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| type | String | This should say `avro_1124_rest_client`. | no | +| url | String | Specifies the endpoint url of your Avro-1124 schema repository. | yes | + +##### Confluent Schema Registry-based Avro Bytes Decoder + +This Avro bytes decoder first extract unique `id` from input message bytes, then use them it lookup in the Schema Registry for the related schema, with which to decode Avro record from bytes. +Details can be found in Schema Registry [documentation](http://docs.confluent.io/current/schema-registry/docs/) and [repository](https://github.com/confluentinc/schema-registry). + +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| type | String | This should say `schema_registry`. | no | +| url | String | Specifies the url endpoint of the Schema Registry. | yes | +| capacity | Integer | Specifies the max size of the cache (default == Integer.MAX_VALUE). | no | + +```json +... +"avroBytesDecoder" : { + "type" : "schema_registry", + "url" : +} +... +``` + +### Protobuf Parser + +> You need to include the [`druid-protobuf-extensions`](../development/extensions-core/protobuf.md) as an extension to use the Protobuf Parser. + +This parser is for [stream ingestion](./index.md#streaming) and reads Protocol buffer data from a stream directly. + +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| type | String | This should say `protobuf`. | yes | +| descriptor | String | Protobuf descriptor file name in the classpath or URL. | yes | +| protoMessageType | String | Protobuf message type in the descriptor. Both short name and fully qualified name are accepted. The parser uses the first message type found in the descriptor if not specified. | no | +| parseSpec | JSON Object | Specifies the timestamp and dimensions of the data. The format must be JSON. See [JSON ParseSpec](./index.md) for more configuration options. Please note timeAndDims parseSpec is no longer supported. | yes | + +Sample spec: + +```json +"parser": { + "type": "protobuf", + "descriptor": "file:///tmp/metrics.desc", + "protoMessageType": "Metrics", + "parseSpec": { + "format": "json", + "timestampSpec": { + "column": "timestamp", + "format": "auto" + }, + "dimensionsSpec": { + "dimensions": [ + "unit", + "http_method", + "http_code", + "page", + "metricType", + "server" + ], + "dimensionExclusions": [ + "timestamp", + "value" + ] + } + } +} +``` + +See the [extension description](../development/extensions-core/protobuf.md) for +more details and examples. + +## ParseSpec + +> The Parser is deprecated for [native batch tasks](./native-batch.md), [Kafka indexing service](../development/extensions-core/kafka-ingestion.md), +and [Kinesis indexing service](../development/extensions-core/kinesis-ingestion.md). +Consider using the [input format](#input-format) instead for these types of ingestion. + +ParseSpecs serve two purposes: + +- The String Parser use them to determine the format (i.e. JSON, CSV, TSV) of incoming rows. +- All Parsers use them to determine the timestamp and dimensions of incoming rows. + +If `format` is not included, the parseSpec defaults to `tsv`. + +### JSON ParseSpec + +Use this with the String Parser to load JSON. + +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| format | String | This should say `json`. | no | +| timestampSpec | JSON Object | Specifies the column and format of the timestamp. 
| yes | +| dimensionsSpec | JSON Object | Specifies the dimensions of the data. | yes | +| flattenSpec | JSON Object | Specifies flattening configuration for nested JSON data. See [`flattenSpec`](#flattenspec) for more info. | no | + +Sample spec: + +```json +"parseSpec": { + "format" : "json", + "timestampSpec" : { + "column" : "timestamp" + }, + "dimensionSpec" : { + "dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"] + } +} +``` + +### JSON Lowercase ParseSpec + +> The _jsonLowercase_ parser is deprecated and may be removed in a future version of Druid. + +This is a special variation of the JSON ParseSpec that lower cases all the column names in the incoming JSON data. This parseSpec is required if you are updating to Druid 0.7.x from Druid 0.6.x, are directly ingesting JSON with mixed case column names, do not have any ETL in place to lower case those column names, and would like to make queries that include the data you created using 0.6.x and 0.7.x. + +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| format | String | This should say `jsonLowercase`. | yes | +| timestampSpec | JSON Object | Specifies the column and format of the timestamp. | yes | +| dimensionsSpec | JSON Object | Specifies the dimensions of the data. | yes | + +### CSV ParseSpec + +Use this with the String Parser to load CSV. Strings are parsed using the com.opencsv library. + +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| format | String | This should say `csv`. | yes | +| timestampSpec | JSON Object | Specifies the column and format of the timestamp. | yes | +| dimensionsSpec | JSON Object | Specifies the dimensions of the data. | yes | +| listDelimiter | String | A custom delimiter for multi-value dimensions. | no (default == ctrl+A) | +| columns | JSON array | Specifies the columns of the data. | yes | + +Sample spec: + +```json +"parseSpec": { + "format" : "csv", + "timestampSpec" : { + "column" : "timestamp" + }, + "columns" : ["timestamp","page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city","added","deleted","delta"], + "dimensionsSpec" : { + "dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"] + } +} ``` #### CSV Index Tasks @@ -120,20 +1113,34 @@ tasks will fail with an exception. The `columns` field must be included and and ensure that the order of the fields matches the columns of your input data in the same order. -### TSV (Delimited) +### TSV / Delimited ParseSpec + +Use this with the String Parser to load any delimited text that does not require special escaping. By default, +the delimiter is a tab, so this will load TSV. + +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| format | String | This should say `tsv`. | yes | +| timestampSpec | JSON Object | Specifies the column and format of the timestamp. | yes | +| dimensionsSpec | JSON Object | Specifies the dimensions of the data. | yes | +| delimiter | String | A custom delimiter for data values. | no (default == \t) | +| listDelimiter | String | A custom delimiter for multi-value dimensions. | no (default == ctrl+A) | +| columns | JSON String array | Specifies the columns of the data. 
| yes | + +Sample spec: ```json - "parseSpec": { - "format" : "tsv", - "timestampSpec" : { - "column" : "timestamp" - }, - "columns" : ["timestamp","page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city","added","deleted","delta"], - "delimiter":"|", - "dimensionsSpec" : { - "dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"] - } +"parseSpec": { + "format" : "tsv", + "timestampSpec" : { + "column" : "timestamp" + }, + "columns" : ["timestamp","page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city","added","deleted","delta"], + "delimiter":"|", + "dimensionsSpec" : { + "dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"] } +} ``` Be sure to change the `delimiter` to the appropriate delimiter for your data. Like CSV, you must specify the columns and which subset of the columns you want indexed. @@ -155,38 +1162,44 @@ tasks will fail with an exception. The `columns` field must be included and and ensure that the order of the fields matches the columns of your input data in the same order. -### Regex +### Multi-value dimensions + +Dimensions can have multiple values for TSV and CSV data. To specify the delimiter for a multi-value dimension, set the `listDelimiter` in the `parseSpec`. + +JSON data can contain multi-value dimensions as well. The multiple values for a dimension must be formatted as a JSON array in the ingested data. No additional `parseSpec` configuration is needed. + +### Regex ParseSpec ```json - "parseSpec":{ - "format" : "regex", - "timestampSpec" : { - "column" : "timestamp" - }, - "dimensionsSpec" : { - "dimensions" : [] - }, - "columns" : [], - "pattern" : - } +"parseSpec":{ + "format" : "regex", + "timestampSpec" : { + "column" : "timestamp" + }, + "dimensionsSpec" : { + "dimensions" : [] + }, + "columns" : [], + "pattern" : +} ``` The `columns` field must match the columns of your regex matching groups in the same order. If columns are not provided, default columns names ("column_1", "column2", ... "column_n") will be assigned. Ensure that your column names include all your dimensions. -### JavaScript +### JavaScript ParseSpec ```json - "parseSpec":{ - "format" : "javascript", - "timestampSpec" : { - "column" : "timestamp" - }, - "dimensionsSpec" : { - "dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"] - }, - "function" : "function(str) { var parts = str.split(\"-\"); return { one: parts[0], two: parts[1] } }" - } +"parseSpec":{ + "format" : "javascript", + "timestampSpec" : { + "column" : "timestamp" + }, + "dimensionsSpec" : { + "dimensions" : ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"] + }, + "function" : "function(str) { var parts = str.split(\"-\"); return { one: parts[0], two: parts[1] } }" +} ``` Note with the JavaScript parser that data must be fully parsed and returned as a `{key:value}` format in the JS logic. @@ -194,82 +1207,7 @@ This means any flattening or parsing multi-dimensional values must be done here. > JavaScript-based functionality is disabled by default. 
Please refer to the Druid [JavaScript programming guide](../development/javascript.md) for guidelines about using Druid's JavaScript functionality, including instructions on how to enable it. -### Multi-value dimensions - -Dimensions can have multiple values for TSV and CSV data. To specify the delimiter for a multi-value dimension, set the `listDelimiter` in the `parseSpec`. - -JSON data can contain multi-value dimensions as well. The multiple values for a dimension must be formatted as a JSON array in the ingested data. No additional `parseSpec` configuration is needed. - -## Parser - -The default parser type is `string`, though a handful of extensions provide additional parser types. `string` typed parsers operate on text based inputs that can be split into individual records by newlines. For additional data formats, please see our [extensions list](../development/extensions.html). - -### String Parser - -| Field | Type | Description | Required | -|-------|------|-------------|----------| -| type | String | This should say `string` in general, or `hadoopyString` when used in a Hadoop indexing job. | no | -| parseSpec | JSON Object | Specifies the format, timestamp, and dimensions of the data. | yes | - -### ParseSpec - -ParseSpecs serve two purposes: - -- The String Parser use them to determine the format (i.e. JSON, CSV, TSV) of incoming rows. -- All Parsers use them to determine the timestamp and dimensions of incoming rows. - -If `format` is not included, the parseSpec defaults to `tsv`. - -#### JSON ParseSpec - -Use this with the String Parser to load JSON. - -| Field | Type | Description | Required | -|-------|------|-------------|----------| -| format | String | This should say `json`. | no | -| timestampSpec | JSON Object | Specifies the column and format of the timestamp. | yes | -| dimensionsSpec | JSON Object | Specifies the dimensions of the data. | yes | -| flattenSpec | JSON Object | Specifies flattening configuration for nested JSON data. See [`flattenSpec`](index.md#flattenspec) for more info. | no | - -#### JSON Lowercase ParseSpec - -> The _jsonLowercase_ parser is deprecated and may be removed in a future version of Druid. - -This is a special variation of the JSON ParseSpec that lower cases all the column names in the incoming JSON data. This parseSpec is required if you are updating to Druid 0.7.x from Druid 0.6.x, are directly ingesting JSON with mixed case column names, do not have any ETL in place to lower case those column names, and would like to make queries that include the data you created using 0.6.x and 0.7.x. - -| Field | Type | Description | Required | -|-------|------|-------------|----------| -| format | String | This should say `jsonLowercase`. | yes | -| timestampSpec | JSON Object | Specifies the column and format of the timestamp. | yes | -| dimensionsSpec | JSON Object | Specifies the dimensions of the data. | yes | - -#### CSV ParseSpec - -Use this with the String Parser to load CSV. Strings are parsed using the com.opencsv library. - -| Field | Type | Description | Required | -|-------|------|-------------|----------| -| format | String | This should say `csv`. | yes | -| timestampSpec | JSON Object | Specifies the column and format of the timestamp. | yes | -| dimensionsSpec | JSON Object | Specifies the dimensions of the data. | yes | -| listDelimiter | String | A custom delimiter for multi-value dimensions. | no (default == ctrl+A) | -| columns | JSON array | Specifies the columns of the data. 
| yes | - -#### TSV / Delimited ParseSpec - -Use this with the String Parser to load any delimited text that does not require special escaping. By default, -the delimiter is a tab, so this will load TSV. - -| Field | Type | Description | Required | -|-------|------|-------------|----------| -| format | String | This should say `tsv`. | yes | -| timestampSpec | JSON Object | Specifies the column and format of the timestamp. | yes | -| dimensionsSpec | JSON Object | Specifies the dimensions of the data. | yes | -| delimiter | String | A custom delimiter for data values. | no (default == \t) | -| listDelimiter | String | A custom delimiter for multi-value dimensions. | no (default == ctrl+A) | -| columns | JSON String array | Specifies the columns of the data. | yes | - -#### TimeAndDims ParseSpec +### TimeAndDims ParseSpec Use this with non-String Parsers to provide them with timestamp and dimensions information. Non-String Parsers handle all formatting decisions on their own, without using the ParseSpec. @@ -280,3 +1218,24 @@ handle all formatting decisions on their own, without using the ParseSpec. | timestampSpec | JSON Object | Specifies the column and format of the timestamp. | yes | | dimensionsSpec | JSON Object | Specifies the dimensions of the data. | yes | +### Orc ParseSpec + +Use this with the Hadoop ORC Parser to load ORC files. + +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| format | String | This should say `orc`. | no | +| timestampSpec | JSON Object | Specifies the column and format of the timestamp. | yes | +| dimensionsSpec | JSON Object | Specifies the dimensions of the data. | yes | +| flattenSpec | JSON Object | Specifies flattening configuration for nested JSON data. See [`flattenSpec`](#flattenspec) for more info. | no | + +### Parquet ParseSpec + +Use this with the Hadoop Parquet Parser to load Parquet files. + +| Field | Type | Description | Required | +|-------|------|-------------|----------| +| format | String | This should say `parquet`. | no | +| timestampSpec | JSON Object | Specifies the column and format of the timestamp. | yes | +| dimensionsSpec | JSON Object | Specifies the dimensions of the data. | yes | +| flattenSpec | JSON Object | Specifies flattening configuration for nested JSON data. See [`flattenSpec`](#flattenspec) for more info. | no | diff --git a/docs/ingestion/data-management.md b/docs/ingestion/data-management.md index 35ebcf4e67c..f4c17926e44 100644 --- a/docs/ingestion/data-management.md +++ b/docs/ingestion/data-management.md @@ -143,7 +143,7 @@ To control the number of result segments per time chunk, you can set [maxRowsPer Please note that you can run multiple compactionTasks at the same time. For example, you can run 12 compactionTasks per month instead of running a single task for the entire year. A compaction task internally generates an `index` task spec for performing compaction work with some fixed parameters. -For example, its `firehose` is always the [ingestSegmentFirehose](native-batch.md#segment-firehose), and `dimensionsSpec` and `metricsSpec` +For example, its `inputSource` is always the [DruidInputSource](native-batch.md#druid-input-source), and `dimensionsSpec` and `metricsSpec` include all dimensions and metrics of the input segments by default. Compaction tasks will exit with a failure status code, without doing anything, if the interval you specify has no @@ -233,7 +233,7 @@ There are other types of `inputSpec` to enable reindexing and delta ingestion. 
### Reindexing with Native Batch Ingestion This section assumes the reader understands how to do batch ingestion without Hadoop using [native batch indexing](../ingestion/native-batch.md), -which uses a "firehose" to know where and how to read the input data. The [`ingestSegment` firehose](native-batch.md#segment-firehose) +which uses an `inputSource` to know where and how to read the input data. The [`DruidInputSource`](native-batch.md#druid-input-source) can be used to read data from segments inside Druid. Note that IndexTask is to be used for prototyping purposes only as it has to do all processing inside a single process and can't scale. Please use Hadoop batch ingestion for production scenarios dealing with more than 1GB of data. diff --git a/docs/ingestion/faq.md b/docs/ingestion/faq.md index 5ce15d900e7..1e6ffe10254 100644 --- a/docs/ingestion/faq.md +++ b/docs/ingestion/faq.md @@ -81,8 +81,8 @@ You can use a [segment metadata query](../querying/segmentmetadataquery.md) for ## How can I Reindex existing data in Druid with schema changes? -You can use IngestSegmentFirehose with index task to ingest existing druid segments using a new schema and change the name, dimensions, metrics, rollup, etc. of the segment. -See [Firehose](../ingestion/native-batch.md#firehoses) for more details on IngestSegmentFirehose. +You can use DruidInputSource with the [Parallel task](../ingestion/native-batch.md) to ingest existing druid segments using a new schema and change the name, dimensions, metrics, rollup, etc. of the segment. +See [DruidInputSource](../ingestion/native-batch.md#druid-input-source) for more details. Or, if you use hadoop based ingestion, then you can use "dataSource" input spec to do reindexing. See the [Update existing data](../ingestion/data-management.md#update) section of the data management page for more details. @@ -91,7 +91,7 @@ See the [Update existing data](../ingestion/data-management.md#update) section o In a lot of situations you may want to lower the granularity of older data. Example, any data older than 1 month has only hour level granularity but newer data has minute level granularity. This use case is same as re-indexing. -To do this use the IngestSegmentFirehose and run an indexer task. The IngestSegment firehose will allow you to take in existing segments from Druid and aggregate them and feed them back into Druid. It will also allow you to filter the data in those segments while feeding it back in. This means if there are rows you want to delete, you can just filter them away during re-ingestion. +To do this use the [DruidInputSource](../ingestion/native-batch.md#druid-input-source) and run a [Parallel task](../ingestion/native-batch.md). The DruidInputSource will allow you to take in existing segments from Druid and aggregate them and feed them back into Druid. It will also allow you to filter the data in those segments while feeding it back in. This means if there are rows you want to delete, you can just filter them away during re-ingestion. Typically the above will be run as a batch job to say everyday feed in a chunk of data and aggregate it. Or, if you use hadoop based ingestion, then you can use "dataSource" input spec to do reindexing. diff --git a/docs/ingestion/hadoop.md b/docs/ingestion/hadoop.md index f2f58279a4e..2101c1b9fc9 100644 --- a/docs/ingestion/hadoop.md +++ b/docs/ingestion/hadoop.md @@ -115,7 +115,7 @@ Also note that Druid automatically computes the classpath for Hadoop job contain ## `dataSchema` -This field is required. 
See the [`dataSchema`](index.md#dataschema) section of the main ingestion page for details on +This field is required. See the [`dataSchema`](index.md#legacy-dataschema-spec) section of the main ingestion page for details on what it should contain. ## `ioConfig` @@ -145,7 +145,52 @@ A type of inputSpec where a static path to the data files is provided. For example, using the static input paths: ``` -"paths" : "s3n://billy-bucket/the/data/is/here/data.gz,s3n://billy-bucket/the/data/is/here/moredata.gz,s3n://billy-bucket/the/data/is/here/evenmoredata.gz" +"paths" : "hdfs://path/to/data/is/here/data.gz,hdfs://path/to/data/is/here/moredata.gz,hdfs://path/to/data/is/here/evenmoredata.gz" +``` + +You can also read from cloud storage such as AWS S3 or Google Cloud Storage. +To do so, you need to install the necessary library under Druid's classpath in _all MiddleManager or Indexer processes_. +For S3, you can run the below command to install the [Hadoop AWS module](https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html). + +```bash +java -classpath "${DRUID_HOME}lib/*" org.apache.druid.cli.Main tools pull-deps -h "org.apache.hadoop:hadoop-aws:${HADOOP_VERSION}"; +cp ${DRUID_HOME}/hadoop-dependencies/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar ${DRUID_HOME}/extensions/druid-hdfs-storage/ +``` + +Once you install the Hadoop AWS module in all MiddleManager and Indexer processes, you can put +your S3 paths in the inputSpec with the below job properties. +For more configurations, see the [Hadoop AWS module](https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html). + +``` +"paths" : "s3a://billy-bucket/the/data/is/here/data.gz,s3a://billy-bucket/the/data/is/here/moredata.gz,s3a://billy-bucket/the/data/is/here/evenmoredata.gz" +``` + +```json +"jobProperties" : { + "fs.s3a.impl" : "org.apache.hadoop.fs.s3a.S3AFileSystem", + "fs.AbstractFileSystem.s3a.impl" : "org.apache.hadoop.fs.s3a.S3A", + "fs.s3a.access.key" : "YOUR_ACCESS_KEY", + "fs.s3a.secret.key" : "YOUR_SECRET_KEY" +} +``` + +For Google Cloud Storage, you need to install [GCS connector jar](https://github.com/GoogleCloudPlatform/bigdata-interop/blob/master/gcs/INSTALL.md) +under `${DRUID_HOME}/hadoop-dependencies` in _all MiddleManager or Indexer processes_. +Once you install the GCS Connector jar in all MiddleManager and Indexer processes, you can put +your Google Cloud Storage paths in the inputSpec with the below job properties. +For more configurations, see the [instructions to configure Hadoop](https://github.com/GoogleCloudPlatform/bigdata-interop/blob/master/gcs/INSTALL.md#configure-hadoop), +[GCS core default](https://github.com/GoogleCloudPlatform/bigdata-interop/blob/master/gcs/conf/gcs-core-default.xml) +and [GCS core template](https://github.com/GoogleCloudPlatform/bdutil/blob/master/conf/hadoop2/gcs-core-template.xml). 
+ +``` +"paths" : "gs://billy-bucket/the/data/is/here/data.gz,gs://billy-bucket/the/data/is/here/moredata.gz,gs://billy-bucket/the/data/is/here/evenmoredata.gz" +``` + +```json +"jobProperties" : { + "fs.gs.impl" : "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem", + "fs.AbstractFileSystem.gs.impl" : "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS" +} ``` #### `granularity` diff --git a/docs/ingestion/index.md b/docs/ingestion/index.md index 54324ab65ee..a52081a66a6 100644 --- a/docs/ingestion/index.md +++ b/docs/ingestion/index.md @@ -28,8 +28,9 @@ All data in Druid is organized into _segments_, which are data files that genera Loading data in Druid is called _ingestion_ or _indexing_ and consists of reading data from a source system and creating segments based on that data. -In most ingestion methods, the work of loading data is done by Druid MiddleManager processes. One exception is -Hadoop-based ingestion, where this work is instead done using a Hadoop MapReduce job on YARN (although MiddleManager +In most ingestion methods, the work of loading data is done by Druid [MiddleManager](../design/middlemanager.md) processes +(or the [Indexer](../design/indexer.md) processes). One exception is +Hadoop-based ingestion, where this work is instead done using a Hadoop MapReduce job on YARN (although MiddleManager or Indexer processes are still involved in starting and monitoring the Hadoop jobs). Once segments have been generated and stored in [deep storage](../dependencies/deep-storage.md), they will be loaded by Historical processes. For more details on how this works under the hood, see the [Storage design](../design/architecture.md#storage-design) section of Druid's design @@ -70,25 +71,26 @@ This table compares the major available options: ### Batch -When doing batch loads from files, you should use one-time [tasks](tasks.md), and you have three options: `index` -(native batch; single-task), `index_parallel` (native batch; parallel), or `index_hadoop` (Hadoop-based). +When doing batch loads from files, you should use one-time [tasks](tasks.md), and you have three options: `index_parallel` (native batch; parallel), `index_hadoop` (Hadoop-based), +or `index` (native batch; single-task). In general, we recommend native batch whenever it meets your needs, since the setup is simpler (it does not depend on -an external Hadoop cluster). However, there are still scenarios where Hadoop-based batch ingestion is the right choice, -especially due to its support for custom partitioning options and reading binary data formats. +an external Hadoop cluster). However, there are still scenarios where Hadoop-based batch ingestion might be a better choice, +for example when you already have a running Hadoop cluster and want to +use the cluster resource of the existing cluster for batch ingestion. This table compares the three available options: -| **Method** | [Native batch (simple)](native-batch.html#simple-task) | [Native batch (parallel)](native-batch.html#parallel-task) | [Hadoop-based](hadoop.html) | +| **Method** | [Native batch (parallel)](native-batch.html#parallel-task) | [Hadoop-based](hadoop.html) | [Native batch (simple)](native-batch.html#simple-task) | |---|-----|--------------|------------| -| **Task type** | `index` | `index_parallel` | `index_hadoop` | -| **Parallel?** | No. Each task is single-threaded. | Yes, if firehose is splittable and `maxNumConcurrentSubTasks` > 1 in tuningConfig. See [firehose documentation](native-batch.md#firehoses) for details. | Yes, always. 
| -| **Can append or overwrite?** | Yes, both. | Yes, both. | Overwrite only. | -| **External dependencies** | None. | None. | Hadoop cluster (Druid submits Map/Reduce jobs). | -| **Input locations** | Any [firehose](native-batch.md#firehoses). | Any [firehose](native-batch.md#firehoses). | Any Hadoop FileSystem or Druid datasource. | -| **File formats** | Text file formats (CSV, TSV, JSON). Support for binary formats is coming in a future release. | Text file formats (CSV, TSV, JSON). Support for binary formats is coming in a future release. | Any Hadoop InputFormat. | -| **[Rollup modes](#rollup)** | Perfect if `forceGuaranteedRollup` = true in the [`tuningConfig`](native-batch.md#tuningconfig).| Perfect if `forceGuaranteedRollup` = true in the [`tuningConfig`](native-batch.md#tuningconfig). | Always perfect. | -| **Partitioning options** | Hash-based partitioning is supported when `forceGuaranteedRollup` = true in the [`tuningConfig`](native-batch.md#tuningconfig). | Hash-based or range-based partitioning (when `forceGuaranteedRollup` = true). | Hash-based or range-based partitioning via [`partitionsSpec`](hadoop.md#partitionsspec). | +| **Task type** | `index_parallel` | `index_hadoop` | `index` | +| **Parallel?** | Yes, if `inputFormat` is splittable and `maxNumConcurrentSubTasks` > 1 in `tuningConfig`. See [data format documentation](./data-formats.md) for details. | Yes, always. | No. Each task is single-threaded. | +| **Can append or overwrite?** | Yes, both. | Overwrite only. | Yes, both. | +| **External dependencies** | None. | Hadoop cluster (Druid submits Map/Reduce jobs). | None. | +| **Input locations** | Any [`inputSource`](./native-batch.md#input-sources). | Any Hadoop FileSystem or Druid datasource. | Any [`inputSource`](./native-batch.md#input-sources). | +| **File formats** | Any [`inputFormat`](./data-formats.md#input-format). | Any Hadoop InputFormat. | Any [`inputFormat`](./data-formats.md#input-format). | +| **[Rollup modes](#rollup)** | Perfect if `forceGuaranteedRollup` = true in the [`tuningConfig`](native-batch.md#tuningconfig). | Always perfect. | Perfect if `forceGuaranteedRollup` = true in the [`tuningConfig`](native-batch.md#tuningconfig). | +| **Partitioning options** | Dynamic, hash-based, and range-based partitioning methods are available. See [Partitions Spec](./native-batch.md#partitionsspec) for details. | Hash-based or range-based partitioning via [`partitionsSpec`](hadoop.md#partitionsspec). | Dynamic and hash-based partitioning methods are available. See [Partitions Spec](./native-batch.md#partitionsspec) for details. | @@ -192,11 +194,11 @@ datasource that has rollup disabled (or enabled, but with a minimal rollup ratio has fewer dimensions and a higher rollup ratio. When queries only involve dimensions in the "abbreviated" set, using that datasource leads to much faster query times. This can often be done with just a small increase in storage footprint, since abbreviated datasources tend to be substantially smaller. -- If you are using a [best-effort rollup](#best-effort-rollup) ingestion configuration that does not guarantee perfect +- If you are using a [best-effort rollup](#perfect-rollup-vs-best-effort-rollup) ingestion configuration that does not guarantee perfect rollup, you can potentially improve your rollup ratio by switching to a guaranteed perfect rollup option, or by [reindexing](data-management.md#compaction-and-reindexing) your data in the background after initial ingestion. 
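As a rough illustration of the "abbreviated" datasource and coarser-granularity tips above, a `dataSchema` for such a datasource might look like the sketch below. The datasource name, dimensions, and metrics are hypothetical; the layout follows the `timestampSpec`/`dimensionsSpec`/`metricsSpec`/`granularitySpec` structure described later on this page.

```json
"dataSchema": {
  "dataSource": "clickstream_abbreviated",
  "timestampSpec": { "column": "timestamp", "format": "auto" },
  "dimensionsSpec": {
    "dimensions": [ "country", "page" ]
  },
  "metricsSpec": [
    { "type": "count", "name": "count" },
    { "type": "longSum", "name": "clicks", "fieldName": "clicks" }
  ],
  "granularitySpec": {
    "type": "uniform",
    "segmentGranularity": "day",
    "queryGranularity": "hour",
    "rollup": true
  }
}
```

Because only two dimensions are kept and `queryGranularity` is coarsened to `hour`, many more input rows collapse into each stored row than in the full datasource, which is what drives the higher rollup ratio.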
-### Best-effort rollup +### Perfect rollup vs Best-effort rollup Some Druid ingestion methods guarantee _perfect rollup_, meaning that input data are perfectly aggregated at ingestion time. Others offer _best-effort rollup_, meaning that input data might not be perfectly aggregated and thus there could @@ -233,7 +235,7 @@ This partitioning happens for all ingestion methods, and is based on the `segmen ingestion spec's `dataSchema`. The segments within a particular time chunk may also be partitioned further, using options that vary based on the -ingestion method you have chosen. In general, doing this secondary partitioning using a particular dimension will +ingestion type you have chosen. In general, doing this secondary partitioning using a particular dimension will improve locality, meaning that rows with the same value for that dimension are stored together and can be accessed quickly. @@ -287,44 +289,31 @@ definition is an _ingestion spec_. Ingestion specs consists of three main components: -- [`dataSchema`](#dataschema), which configures the [datasource name](#datasource), [input row parser](#parser), - [primary timestamp](#timestampspec), [flattening of nested data](#flattenspec) (if needed), - [dimensions](#dimensionsspec), [metrics](#metricsspec), and [transforms and filters](#transformspec) (if needed). -- [`ioConfig`](#ioconfig), which tells Druid how to connect to the source system and . For more information, see the +- [`dataSchema`](#dataschema), which configures the [datasource name](#datasource), + [primary timestamp](#timestampspec), [dimensions](#dimensionsspec), [metrics](#metricsspec), and [transforms and filters](#transformspec) (if needed). +- [`ioConfig`](#ioconfig), which tells Druid how to connect to the source system and how to parse data. For more information, see the documentation for each [ingestion method](#ingestion-methods). - [`tuningConfig`](#tuningconfig), which controls various tuning parameters specific to each [ingestion method](#ingestion-methods). 
-Example ingestion spec for task type "index" (native batch): +Example ingestion spec for task type `index_parallel` (native batch): ``` { - "type": "index", + "type": "index_parallel", "spec": { "dataSchema": { "dataSource": "wikipedia", - "parser": { - "type": "string", - "parseSpec": { - "format": "json", - "flattenSpec": { - "useFieldDiscovery": true, - "fields": [ - { "type": "path", "name": "userId", "expr": "$.user.id" } - ] - }, - "timestampSpec": { - "column": "timestamp", - "format": "auto" - }, - "dimensionsSpec": { - "dimensions": [ - { "type": "string", "page" }, - { "type": "string", "language" }, - { "type": "long", "name": "userId" } - ] - } - } + "timestampSpec": { + "column": "timestamp", + "format": "auto" + }, + "dimensionsSpec": { + "dimensions": [ + { "type": "string", "page" }, + { "type": "string", "language" }, + { "type": "long", "name": "userId" } + ] }, "metricsSpec": [ { "type": "count", "name": "count" }, @@ -340,15 +329,24 @@ Example ingestion spec for task type "index" (native batch): } }, "ioConfig": { - "type": "index", - "firehose": { + "type": "index_parallel", + "inputSource": { "type": "local", "baseDir": "examples/indexing/", "filter": "wikipedia_data.json" + }, + "inputFormat": { + "type": "json", + "flattenSpec": { + "useFieldDiscovery": true, + "fields": [ + { "type": "path", "name": "userId", "expr": "$.user.id" } + ] + } } }, "tuningConfig": { - "type": "index" + "type": "index_parallel" } } } @@ -365,39 +363,30 @@ available in Druid's [web console](../operations/druid-console.md). Druid's visu ## `dataSchema` +> The `dataSchema` spec has been changed in 0.17.0. The new spec is supported by all ingestion methods +except for _Hadoop_ ingestion. See the [Legacy `dataSchema` spec](#legacy-dataschema-spec) for the old spec. + The `dataSchema` is a holder for the following components: -- [datasource name](#datasource), [input row parser](#parser), - [primary timestamp](#timestampspec), [flattening of nested data](#flattenspec) (if needed), - [dimensions](#dimensionsspec), [metrics](#metricsspec), and [transforms and filters](#transformspec) (if needed). +- [datasource name](#datasource), [primary timestamp](#timestampspec), + [dimensions](#dimensionsspec), [metrics](#metricsspec), and + [transforms and filters](#transformspec) (if needed). An example `dataSchema` is: ``` "dataSchema": { "dataSource": "wikipedia", - "parser": { - "type": "string", - "parseSpec": { - "format": "json", - "flattenSpec": { - "useFieldDiscovery": true, - "fields": [ - { "type": "path", "name": "userId", "expr": "$.user.id" } - ] - }, - "timestampSpec": { - "column": "timestamp", - "format": "auto" - }, - "dimensionsSpec": { - "dimensions": [ - { "type": "string", "page" }, - { "type": "string", "language" }, - { "type": "long", "name": "userId" } - ] - } - } + "timestampSpec": { + "column": "timestamp", + "format": "auto" + }, + "dimensionsSpec": { + "dimensions": [ + { "type": "string", "page" }, + { "type": "string", "language" }, + { "type": "long", "name": "userId" } + ] }, "metricsSpec": [ { "type": "count", "name": "count" }, @@ -424,50 +413,9 @@ The `dataSource` is located in `dataSchema` → `dataSource` and is simply the n "dataSource": "my-first-datasource" ``` -### `parser` - -The `parser` is located in `dataSchema` → `parser` and is responsible for configuring a wide variety of -items related to parsing input records. - -For details about supported data formats, see the ["Data formats" page](data-formats.md). 
- -For details about major components of the `parseSpec`, refer to their subsections: - -- [`timestampSpec`](#timestampspec), responsible for configuring the [primary timestamp](#primary-timestamp). -- [`dimensionsSpec`](#dimensionsspec), responsible for configuring [dimensions](#dimensions). -- [`flattenSpec`](#flattenspec), responsible for flattening nested data formats. - -An example `parser` is: - -``` -"parser": { - "type": "string", - "parseSpec": { - "format": "json", - "flattenSpec": { - "useFieldDiscovery": true, - "fields": [ - { "type": "path", "name": "userId", "expr": "$.user.id" } - ] - }, - "timestampSpec": { - "column": "timestamp", - "format": "auto" - }, - "dimensionsSpec": { - "dimensions": [ - { "type": "string", "page" }, - { "type": "string", "language" }, - { "type": "long", "name": "userId" } - ] - } - } -} -``` - ### `timestampSpec` -The `timestampSpec` is located in `dataSchema` → `parser` → `parseSpec` → `timestampSpec` and is responsible for +The `timestampSpec` is located in `dataSchema` → `timestampSpec` and is responsible for configuring the [primary timestamp](#primary-timestamp). An example `timestampSpec` is: ``` @@ -478,7 +426,7 @@ configuring the [primary timestamp](#primary-timestamp). An example `timestampSp ``` > Conceptually, after input data records are read, Druid applies ingestion spec components in a particular order: -> first [`flattenSpec`](#flattenspec), then [`timestampSpec`](#timestampspec), then [`transformSpec`](#transformspec), +> first [`flattenSpec`](data-formats.md#flattenspec) (if any), then [`timestampSpec`](#timestampspec), then [`transformSpec`](#transformspec), > and finally [`dimensionsSpec`](#dimensionsspec) and [`metricsSpec`](#metricsspec). Keep this in mind when writing > your ingestion spec. @@ -492,7 +440,7 @@ A `timestampSpec` can have the following components: ### `dimensionsSpec` -The `dimensionsSpec` is located in `dataSchema` → `parser` → `parseSpec` → `dimensionsSpec` and is responsible for +The `dimensionsSpec` is located in `dataSchema` → `dimensionsSpec` and is responsible for configuring [dimensions](#dimensions). An example `dimensionsSpec` is: ``` @@ -508,7 +456,7 @@ configuring [dimensions](#dimensions). An example `dimensionsSpec` is: ``` > Conceptually, after input data records are read, Druid applies ingestion spec components in a particular order: -> first [`flattenSpec`](#flattenspec), then [`timestampSpec`](#timestampspec), then [`transformSpec`](#transformspec), +> first [`flattenSpec`](data-formats.md#flattenspec) (if any), then [`timestampSpec`](#timestampspec), then [`transformSpec`](#transformspec), > and finally [`dimensionsSpec`](#dimensionsspec) and [`metricsSpec`](#metricsspec). Keep this in mind when writing > your ingestion spec. @@ -516,8 +464,8 @@ A `dimensionsSpec` can have the following components: | Field | Description | Default | |-------|-------------|---------| -| dimensions | A list of [dimension names or objects](#dimension-objects).

If this is an empty array, Druid will treat all non-timestamp, non-metric columns that do not appear in `dimensionExclusions` as String-typed dimension columns (see [inclusions and exclusions](#inclusions-and-exclusions) below). | `[]` | -| dimensionExclusions | The names of dimensions to exclude from ingestion. Only names are supported here, not objects. | `[]` | +| dimensions | A list of [dimension names or objects](#dimension-objects). Cannot have the same column in both `dimensions` and `dimensionExclusions`.

If this is an empty array, Druid will treat all non-timestamp, non-metric columns that do not appear in `dimensionExclusions` as String-typed dimension columns (see [inclusions and exclusions](#inclusions-and-exclusions) below). | `[]` | +| dimensionExclusions | The names of dimensions to exclude from ingestion. Only names are supported here, not objects. Cannot have the same column in both `dimensions` and `dimensionExclusions`.| `[]` | | spatialDimensions | An array of [spatial dimensions](../development/geo.md). | `[]` | #### Dimension objects @@ -537,11 +485,11 @@ Dimension objects can have the following components: Druid will interpret a `dimensionsSpec` in two possible ways: _normal_ or _schemaless_. -Normal interpretation occurs when either `dimensions` or `spatialDimensions` is non-empty. In this case, the combination of the two lists will be taken as the set of dimensions to be ingested, and `dimensionExclusions` is ignored. +Normal interpretation occurs when either `dimensions` or `spatialDimensions` is non-empty. In this case, the combination of the two lists will be taken as the set of dimensions to be ingested. Schemaless interpretation occurs when both `dimensions` and `spatialDimensions` are empty or null. In this case, the set of dimensions is determined in the following way: -1. First, start from the set of all input fields from the [`parser`](#parser) (or the [`flattenSpec`](#flattenspec), if one is being used). +1. First, start from the set of all input fields from the [`inputFormat`](./data-formats.md) (or the [`flattenSpec`](./data-formats.md#flattenspec), if one is being used). 2. Any field listed in `dimensionExclusions` is excluded. 3. The field listed as `column` in the [`timestampSpec`](#timestampspec) is excluded. 4. Any field used as an input to an aggregator from the [metricsSpec](#metricsspec) is excluded. @@ -551,58 +499,6 @@ Schemaless interpretation occurs when both `dimensions` and `spatialDimensions` > Note: Fields generated by a [`transformSpec`](#transformspec) are not currently considered candidates for > schemaless dimension interpretation. -### `flattenSpec` - -The `flattenSpec` is located in `dataSchema` → `parser` → `parseSpec` → `flattenSpec` and is responsible for -bridging the gap between potentially nested input data (such as JSON, Avro, etc) and Druid's flat data model. -An example `flattenSpec` is: - -``` -"flattenSpec": { - "useFieldDiscovery": true, - "fields": [ - { "name": "baz", "type": "root" }, - { "name": "foo_bar", "type": "path", "expr": "$.foo.bar" }, - { "name": "first_food", "type": "jq", "expr": ".thing.food[1]" } - ] -} -``` - -> Conceptually, after input data records are read, Druid applies ingestion spec components in a particular order: -> first [`flattenSpec`](#flattenspec), then [`timestampSpec`](#timestampspec), then [`transformSpec`](#transformspec), -> and finally [`dimensionsSpec`](#dimensionsspec) and [`metricsSpec`](#metricsspec). Keep this in mind when writing -> your ingestion spec. - - -Flattening is only supported for [data formats](data-formats.md) that support nesting, including `avro`, `json`, `orc`, -and `parquet`. Flattening is _not_ supported for the `timeAndDims` parseSpec type. 
- -A `flattenSpec` can have the following components: - -| Field | Description | Default | -|-------|-------------|---------| -| useFieldDiscovery | If true, interpret all root-level fields as available fields for usage by [`timestampSpec`](#timestampspec), [`transformSpec`](#transformspec), [`dimensionsSpec`](#dimensionsspec), and [`metricsSpec`](#metricsspec).

If false, only explicitly specified fields (see `fields`) will be available for use. | `true` | -| fields | Specifies the fields of interest and how they are accessed. [See below for details.](#field-flattening-specifications) | `[]` | - -#### Field flattening specifications - -Each entry in the `fields` list can have the following components: - -| Field | Description | Default | -|-------|-------------|---------| -| type | Options are as follows:

  • `root`, referring to a field at the root level of the record. Only really useful if `useFieldDiscovery` is false.
  • `path`, referring to a field using [JsonPath](https://github.com/jayway/JsonPath) notation. Supported by most data formats that offer nesting, including `avro`, `json`, `orc`, and `parquet`.
  • `jq`, referring to a field using [jackson-jq](https://github.com/eiiches/jackson-jq) notation. Only supported for the `json` format.
| none (required) | -| name | Name of the field after flattening. This name can be referred to by the [`timestampSpec`](#timestampspec), [`transformSpec`](#transformspec), [`dimensionsSpec`](#dimensionsspec), and [`metricsSpec`](#metricsspec).| none (required) | -| expr | Expression for accessing the field while flattening. For type `path`, this should be [JsonPath](https://github.com/jayway/JsonPath). For type `jq`, this should be [jackson-jq](https://github.com/eiiches/jackson-jq) notation. For other types, this parameter is ignored. | none (required for types `path` and `jq`) | - -#### Notes on flattening - -* For convenience, when defining a root-level field, it is possible to define only the field name, as a string, instead of a JSON object. For example, `{"name": "baz", "type": "root"}` is equivalent to `"baz"`. -* Enabling `useFieldDiscovery` will only autodetect "simple" fields at the root level that correspond to data types that Druid supports. This includes strings, numbers, and lists of strings or numbers. Other types will not be automatically detected, and must be specified explicitly in the `fields` list. -* Duplicate field `name`s are not allowed. An exception will be thrown. -* If `useFieldDiscovery` is enabled, any discovered field with the same name as one already defined in the `fields` list will be skipped, rather than added twice. -* [http://jsonpath.herokuapp.com/](http://jsonpath.herokuapp.com/) is useful for testing `path`-type expressions. -* jackson-jq supports a subset of the full [jq](https://stedolan.github.io/jq/) syntax. Please refer to the [jackson-jq documentation](https://github.com/eiiches/jackson-jq) for details. - ### `metricsSpec` The `metricsSpec` is located in `dataSchema` → `metricsSpec` and is a list of [aggregators](../querying/aggregations.md) @@ -655,9 +551,9 @@ A `granularitySpec` can have the following components: | Field | Description | Default | |-------|-------------|---------| | type | Either `uniform` or `arbitrary`. In most cases you want to use `uniform`.| `uniform` | -| segmentGranularity | [Time chunking](../design/architecture.html#datasources-and-segments) granularity for this datasource. Multiple segments can be created per time chunk. For example, when set to `day`, the events of the same day fall into the same time chunk which can be optionally further partitioned into multiple segments based on other configurations and input size. Any [granularity](../querying/granularities.md) can be provided here.

Ignored if `type` is set to `arbitrary`.| `day` | -| queryGranularity | The resolution of timestamp storage within each segment. This must be equal to, or finer, than `segmentGranularity`. This will be the finest granularity that you can query at and still receive sensible results, but note that you can still query at anything coarser than this granularity. E.g., a value of `minute` will mean that records will be stored at minutely granularity, and can be sensibly queried at any multiple of minutes (including minutely, 5-minutely, hourly, etc).

Any [granularity](../querying/granularities.md) can be provided here. Use `none` to store timestamps as-is, without any truncation.| `none` | -| rollup | Whether to use ingestion-time [rollup](#rollup) or not. | `true` | +| segmentGranularity | [Time chunking](../design/architecture.html#datasources-and-segments) granularity for this datasource. Multiple segments can be created per time chunk. For example, when set to `day`, the events of the same day fall into the same time chunk which can be optionally further partitioned into multiple segments based on other configurations and input size. Any [granularity](../querying/granularities.md) can be provided here. Note that all segments in the same time chunk should have the same segment granularity.

Ignored if `type` is set to `arbitrary`.| `day` | +| queryGranularity | The resolution of timestamp storage within each segment. This must be equal to, or finer, than `segmentGranularity`. This will be the finest granularity that you can query at and still receive sensible results, but note that you can still query at anything coarser than this granularity. E.g., a value of `minute` will mean that records will be stored at minutely granularity, and can be sensibly queried at any multiple of minutes (including minutely, 5-minutely, hourly, etc).

Any [granularity](../querying/granularities.md) can be provided here. Use `none` to store timestamps as-is, without any truncation. Note that `rollup` will be applied if it is set even when the `queryGranularity` is set to `none`. | `none` | +| rollup | Whether to use ingestion-time [rollup](#rollup) or not. Note that rollup is still effective even when `queryGranularity` is set to `none`. Your data will be rolled up if they have the exactly same timestamp. | `true` | | intervals | A list of intervals describing what time chunks of segments should be created. If `type` is set to `uniform`, this list will be broken up and rounded-off based on the `segmentGranularity`. If `type` is set to `arbitrary`, this list will be used as-is.

If `null` or not provided, batch ingestion tasks will generally determine which time chunks to output based on what timestamps are found in the input data.

If specified, batch ingestion tasks may be able to skip a determining-partitions phase, which can result in faster ingestion. Batch ingestion tasks may also be able to request all their locks up-front instead of one by one. Batch ingestion tasks will throw away any records with timestamps outside of the specified intervals.

Ignored for any form of streaming ingestion. | `null` | ### `transformSpec` @@ -679,7 +575,7 @@ records during ingestion time. It is optional. An example `transformSpec` is: ``` > Conceptually, after input data records are read, Druid applies ingestion spec components in a particular order: -> first [`flattenSpec`](#flattenspec), then [`timestampSpec`](#timestampspec), then [`transformSpec`](#transformspec), +> first [`flattenSpec`](data-formats.md#flattenspec) (if any), then [`timestampSpec`](#timestampspec), then [`transformSpec`](#transformspec), > and finally [`dimensionsSpec`](#dimensionsspec) and [`metricsSpec`](#metricsspec). Keep this in mind when writing > your ingestion spec. @@ -712,17 +608,91 @@ Druid currently includes one kind of built-in transform, the expression transfor The `expression` is a [Druid query expression](../misc/math-expr.md). +> Conceptually, after input data records are read, Druid applies ingestion spec components in a particular order: +> first [`flattenSpec`](data-formats.md#flattenspec) (if any), then [`timestampSpec`](#timestampspec), then [`transformSpec`](#transformspec), +> and finally [`dimensionsSpec`](#dimensionsspec) and [`metricsSpec`](#metricsspec). Keep this in mind when writing +> your ingestion spec. + #### Filter The `filter` conditionally filters input rows during ingestion. Only rows that pass the filter will be ingested. Any of Druid's standard [query filters](../querying/filters.md) can be used. Note that within a `transformSpec`, the `transforms` are applied before the `filter`, so the filter can refer to a transform. +### Legacy `dataSchema` spec + +> The `dataSchema` spec has been changed in 0.17.0. The new spec is supported by all ingestion methods +except for _Hadoop_ ingestion. See [`dataSchema`](#dataschema) for the new spec. + +The legacy `dataSchema` spec has below two more components in addition to the ones listed in the [`dataSchema`](#dataschema) section above. + +- [input row parser](#parser-deprecated), [flattening of nested data](#flattenspec) (if needed) + +#### `parser` (Deprecated) + +In legacy `dataSchema`, the `parser` is located in the `dataSchema` → `parser` and is responsible for configuring a wide variety of +items related to parsing input records. The `parser` is deprecated and it is highly recommended to use `inputFormat` instead. +For details about `inputFormat` and supported `parser` types, see the ["Data formats" page](data-formats.md). + +For details about major components of the `parseSpec`, refer to their subsections: + +- [`timestampSpec`](#timestampspec), responsible for configuring the [primary timestamp](#primary-timestamp). +- [`dimensionsSpec`](#dimensionsspec), responsible for configuring [dimensions](#dimensions). +- [`flattenSpec`](#flattenspec), responsible for flattening nested data formats. 
+ +An example `parser` is: + +``` +"parser": { + "type": "string", + "parseSpec": { + "format": "json", + "flattenSpec": { + "useFieldDiscovery": true, + "fields": [ + { "type": "path", "name": "userId", "expr": "$.user.id" } + ] + }, + "timestampSpec": { + "column": "timestamp", + "format": "auto" + }, + "dimensionsSpec": { + "dimensions": [ + { "type": "string", "page" }, + { "type": "string", "language" }, + { "type": "long", "name": "userId" } + ] + } + } +} +``` + +#### `flattenSpec` + +In the legacy `dataSchema`, the `flattenSpec` is located in `dataSchema` → `parser` → `parseSpec` → `flattenSpec` and is responsible for +bridging the gap between potentially nested input data (such as JSON, Avro, etc) and Druid's flat data model. +See [Flatten spec](./data-formats.md#flattenspec) for more details. + ## `ioConfig` The `ioConfig` influences how data is read from a source system, such as Apache Kafka, Amazon S3, a mounted -filesystem, or any other supported source system. For details, see the documentation provided by each -[ingestion method](#ingestion-methods). +filesystem, or any other supported source system. The `inputFormat` property applies to all +[ingestion method](#ingestion-methods) except for Hadoop ingestion. The Hadoop ingestion still +uses the [`parser`](#parser-deprecated) in the legacy `dataSchema`. +The rest of `ioConfig` is specific to each individual ingestion method. +An example `ioConfig` to read JSON data is: + +```json +"ioConfig": { + "type": "", + "inputFormat": { + "type": "json" + }, + ... +} +``` +For more details, see the documentation provided by each [ingestion method](#ingestion-methods). ## `tuningConfig` diff --git a/docs/ingestion/native-batch.md b/docs/ingestion/native-batch.md index e7ed5e453ac..d4117d95b95 100644 --- a/docs/ingestion/native-batch.md +++ b/docs/ingestion/native-batch.md @@ -42,48 +42,48 @@ demonstrates the "simple" (single-task) mode. ## Parallel task The Parallel task (type `index_parallel`) is a task for parallel batch indexing. This task only uses Druid's resource and -doesn't depend on other external systems like Hadoop. `index_parallel` task is a supervisor task which basically generates +doesn't depend on other external systems like Hadoop. `index_parallel` task is a supervisor task which basically creates multiple worker tasks and submits them to the Overlord. Each worker task reads input data and creates segments. Once they successfully generate segments for all input data, they report the generated segment list to the supervisor task. The supervisor task periodically checks the status of worker tasks. If one of them fails, it retries the failed task -until the number of retries reaches the configured limit. If all worker tasks succeed, then it publishes the reported segments at once. +until the number of retries reaches to the configured limit. If all worker tasks succeed, then it publishes the reported segments at once and finalize the ingestion. -The parallel Index Task can run in two different modes depending on `forceGuaranteedRollup` in `tuningConfig`. -If `forceGuaranteedRollup` = false, it's executed in a single phase. In this mode, -each sub task creates segments individually and reports them to the supervisor task. +The detailed behavior of the Parallel task is different depending on the [`partitionsSpec`](#partitionsspec). +See each `partitionsSpec` for more details. 
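For reference, the assembled Parallel task spec (such as the sample shown later in this section) is submitted to the Overlord's task API just like any other task. A minimal sketch using `curl` is shown below; the host name and the spec file name are placeholders.

```bash
# Submit the spec in parallel-index-task.json to the Overlord.
# 8090 is the default Overlord port; replace OVERLORD_IP with your Overlord host.
curl -X POST -H 'Content-Type: application/json' \
  -d @parallel-index-task.json \
  http://OVERLORD_IP:8090/druid/indexer/v1/task
```

The response contains the task ID, which can be used with the HTTP endpoints described later in this section to monitor the supervisor task and its worker tasks.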
-If `forceGuaranteedRollup` = true, it's executed in two phases with data shuffle which is similar to [MapReduce](https://en.wikipedia.org/wiki/MapReduce).
-In the first phase, each sub task partitions input data based on `segmentGranularity` (primary partition key) in `granularitySpec`
-and `partitionDimension` or `partitionDimensions` (secondary partition key) in `partitionsSpec`. The partitioned data is served by
-the [middleManager](../design/middlemanager.md) or the [indexer](../design/indexer.md)
-where the first phase tasks ran. In the second phase, each sub task fetches
-partitioned data from MiddleManagers or indexers and merges them to create the final segments.
-As in the single phase execution, the created segments are reported to the supervisor task to publish at once.
+To use this task, the [`inputSource`](#input-sources) in the `ioConfig` should be _splittable_ and `maxNumConcurrentSubTasks` should be set to a value larger than 1 in the `tuningConfig`.
+Otherwise, this task runs sequentially; the `index_parallel` task reads each input file one by one and creates segments by itself.
+The splittable input sources currently supported are:
-To use this task, the `firehose` in `ioConfig` should be _splittable_ and `maxNumConcurrentSubTasks` should be set something larger than 1 in `tuningConfig`.
-Otherwise, this task runs sequentially. Here is the list of currently splittable firehoses.
+
+- [`s3`](#s3-input-source) reads data from AWS S3 storage.
+- [`gs`](#google-cloud-storage-input-source) reads data from Google Cloud Storage.
+- [`hdfs`](#hdfs-input-source) reads data from HDFS storage.
+- [`http`](#http-input-source) reads data from HTTP servers.
+- [`local`](#local-input-source) reads data from local storage.
+- [`druid`](#druid-input-source) reads data from a Druid datasource.
+
+Some other cloud storage types are supported with the legacy [`firehose`](#firehoses-deprecated).
+The below `firehose` types are also splittable. Note that only text formats are supported
+with the `firehose`.

-- [`local`](#local-firehose)
-- [`ingestSegment`](#segment-firehose)
-- [`http`](#http-firehose)
-- [`s3`](../development/extensions-core/s3.md#firehose)
-- [`hdfs`](../development/extensions-core/hdfs.md#firehose)
 - [`static-azure-blobstore`](../development/extensions-contrib/azure.md#firehose)
-- [`static-google-blobstore`](../development/extensions-core/google.md#firehose)
 - [`static-cloudfiles`](../development/extensions-contrib/cloudfiles.md#firehose)

-The splittable firehose is responsible for generating _splits_. The supervisor task generates _worker task specs_ containing a split
+The splittable `inputSource` (and `firehose`) types are responsible for generating _splits_.
+The supervisor task generates _worker task specs_ containing a split
 and submits worker tasks using those specs. As a result, the number of worker tasks depends on
-the implementation of splittable firehoses. Please note that multiple tasks can be created for the same worker task spec
+the implementation of the splittable `inputSource`. For now, all implementations create one split per input file
+except for the Druid Input Source. Please note that multiple worker tasks can be created for the same worker task spec
 if one of them fails.

 You may want to consider the below things:

-- The number of concurrent tasks run in parallel ingestion is determined by `maxNumConcurrentSubTasks` in the `tuningConfig`.
- The supervisor task checks the number of current running sub tasks and creates more if it's smaller than `maxNumConcurrentSubTasks` no matter how many task slots are currently available. +- The number of concurrent worker tasks in parallel ingestion is determined by `maxNumConcurrentSubTasks` in the `tuningConfig`. + The supervisor task checks the number of current running worker tasks and creates more if it's smaller than `maxNumConcurrentSubTasks` + no matter how many task slots are currently available. This may affect to other ingestion performance. See the below [Capacity Planning](#capacity-planning) section for more details. - By default, batch ingestion replaces all data (in your `granularitySpec`'s intervals) in any segment that it writes to. - If you'd like to add to the segment instead, set the `appendToExisting` flag in `ioConfig`. Note that it only replaces + If you'd like to add to the segment instead, set the `appendToExisting` flag in the `ioConfig`. Note that it only replaces data in segments where it actively adds data: if there are segments in your `granularitySpec`'s intervals that have no data written by this task, they will be left alone. If any existing segments partially overlap with the `granularitySpec`'s intervals, the portion of those segments outside the new segments' intervals will still be visible. @@ -98,6 +98,25 @@ A sample task is shown below: "spec": { "dataSchema": { "dataSource": "wikipedia_parallel_index_test", + "timestampSpec": { + "column": "timestamp" + }, + "dimensionsSpec": { + "dimensions": [ + "page", + "language", + "user", + "unpatrolled", + "newPage", + "robot", + "anonymous", + "namespace", + "continent", + "country", + "region", + "city" + ] + }, "metricsSpec": [ { "type": "count", @@ -123,38 +142,17 @@ A sample task is shown below: "segmentGranularity": "DAY", "queryGranularity": "second", "intervals" : [ "2013-08-31/2013-09-02" ] - }, - "parser": { - "parseSpec": { - "format" : "json", - "timestampSpec": { - "column": "timestamp" - }, - "dimensionsSpec": { - "dimensions": [ - "page", - "language", - "user", - "unpatrolled", - "newPage", - "robot", - "anonymous", - "namespace", - "continent", - "country", - "region", - "city" - ] - } - } } }, "ioConfig": { "type": "index_parallel", - "firehose": { + "inputSource": { "type": "local", "baseDir": "examples/indexing/", "filter": "wikipedia_index_data*" + }, + "inputFormat": { + "type": "json" } }, "tuningconfig": { @@ -178,7 +176,7 @@ This field is required. See [Ingestion Spec DataSchema](../ingestion/index.md#dataschema) -If you specify `intervals` explicitly in your dataSchema's granularitySpec, batch ingestion will lock the full intervals +If you specify `intervals` explicitly in your dataSchema's `granularitySpec`, batch ingestion will lock the full intervals specified when it starts up, and you will learn quickly if the specified interval overlaps with locks held by other tasks (e.g., Kafka ingestion). Otherwise, batch ingestion will lock each interval as it is discovered, so you may only learn that the task overlaps with a higher-priority task later in ingestion. If you specify `intervals` explicitly, any @@ -191,7 +189,7 @@ that range if there's some stray data with unexpected timestamps. 
|property|description|default|required?| |--------|-----------|-------|---------| |type|The task type, this should always be `index_parallel`.|none|yes| -|firehose|Specify a [Firehose](#firehoses) here.|none|yes| +|inputFormat|[`inputFormat`](./data-formats.md#input-format) to specify how to parse input data.|none|yes| |appendToExisting|Creates segments as additional shards of the latest version, effectively appending to the segment set instead of replacing it. This will only work if the existing segment set has extendable-type shardSpecs.|false|no| ### `tuningConfig` @@ -206,7 +204,7 @@ The tuningConfig is optional and default parameters will be used if no tuningCon |maxBytesInMemory|Used in determining when intermediate persists to disk should occur. Normally this is computed internally and user does not need to set it. This value represents number of bytes to aggregate in heap memory before persisting. This is based on a rough estimate of memory usage and not actual usage. The maximum heap memory usage for indexing is maxBytesInMemory * (2 + maxPendingPersists)|1/6 of max JVM memory|no| |maxTotalRows|Deprecated. Use `partitionsSpec` instead. Total number of rows in segments waiting for being pushed. Used in determining when intermediate pushing should occur.|20000000|no| |numShards|Deprecated. Use `partitionsSpec` instead. Directly specify the number of shards to create when using a `hashed` `partitionsSpec`. If this is specified and `intervals` is specified in the `granularitySpec`, the index task can skip the determine intervals/partitions pass through the data. `numShards` cannot be specified if `maxRowsPerSegment` is set.|null|no| -|splitHintSpec|Used to give a hint to control the amount of data that each first phase task reads. This hint could be ignored depending on the implementation of firehose. See [SplitHintSpec](#splithintspec) for more details.|null|no| +|splitHintSpec|Used to give a hint to control the amount of data that each first phase task reads. This hint could be ignored depending on the implementation of the input source. See [SplitHintSpec](#splithintspec) for more details.|null|no| |partitionsSpec|Defines how to partition data in each timeChunk, see [PartitionsSpec](#partitionsspec)|`dynamic` if `forceGuaranteedRollup` = false, `hashed` or `single_dim` if `forceGuaranteedRollup` = true|no| |indexSpec|Defines segment storage format options to be used at indexing time, see [IndexSpec](index.md#indexspec)|null|no| |indexSpecForIntermediatePersists|Defines segment storage format options to be used at indexing time for intermediate persisted temporary segments. this can be used to disable dimension/metric compression on intermediate segments to reduce memory required for final merging. however, disabling compression on intermediate segments might increase page cache use while they are used before getting merged into final segment published, see [IndexSpec](index.md#indexspec) for possible values.|same as indexSpec|no| @@ -215,7 +213,7 @@ The tuningConfig is optional and default parameters will be used if no tuningCon |reportParseExceptions|If true, exceptions encountered during parsing will be thrown and will halt ingestion; if false, unparseable rows and fields will be skipped.|false|no| |pushTimeout|Milliseconds to wait for pushing segments. It must be >= 0, where 0 means to wait forever.|0|no| |segmentWriteOutMediumFactory|Segment write-out medium to use when creating segments. 
See [SegmentWriteOutMediumFactory](#segmentwriteoutmediumfactory).|Not specified, the value from `druid.peon.defaultSegmentWriteOutMediumFactory.type` is used|no| -|maxNumConcurrentSubTasks|Maximum number of sub tasks which can be run in parallel at the same time. The supervisor task would spawn worker tasks up to `maxNumConcurrentSubTasks` regardless of the current available task slots. If this value is set to 1, the supervisor task processes data ingestion on its own instead of spawning worker tasks. If this value is set to too large, too many worker tasks can be created which might block other ingestion. Check [Capacity Planning](#capacity-planning) for more details.|1|no| +|maxNumConcurrentSubTasks|Maximum number of worker tasks which can be run in parallel at the same time. The supervisor task would spawn worker tasks up to `maxNumConcurrentSubTasks` regardless of the current available task slots. If this value is set to 1, the supervisor task processes data ingestion on its own instead of spawning worker tasks. If this value is set to too large, too many worker tasks can be created which might block other ingestion. Check [Capacity Planning](#capacity-planning) for more details.|1|no| |maxRetry|Maximum number of retries on task failures.|3|no| |maxNumSegmentsToMerge|Max limit for the number of segments that a single task can merge at the same time in the second phase. Used only `forceGuaranteedRollup` is set.|100|no| |totalNumMergeTasks|Total number of tasks to merge segments in the second phase when `forceGuaranteedRollup` is set.|10|no| @@ -226,13 +224,13 @@ The tuningConfig is optional and default parameters will be used if no tuningCon ### `splitHintSpec` `SplitHintSpec` is used to give a hint when the supervisor task creates input splits. -Note that each sub task processes a single input split. You can control the amount of data each sub task will read during the first phase. +Note that each worker task processes a single input split. You can control the amount of data each worker task will read during the first phase. Currently only one splitHintSpec, i.e., `segments`, is available. #### `SegmentsSplitHintSpec` -`SegmentsSplitHintSpec` is used only for `IngestSegmentFirehose`. +`SegmentsSplitHintSpec` is used only for [`DruidInputSource`](#druid-input-source) (and legacy [`IngestSegmentFirehose`](#ingestsegmentfirehose)). |property|description|default|required?| |--------|-----------|-------|---------| @@ -246,32 +244,22 @@ You should use different partitionsSpec depending on the [rollup mode](../ingest For perfect rollup, you should use either `hashed` (partitioning based on the hash of dimensions in each row) or `single_dim` (based on ranges of a single dimension). For best-effort rollup, you should use `dynamic`. -The three `partitionsSpec` types have different pros and cons: -- `dynamic`: Fastest ingestion speed. Guarantees a well-balanced distribution in segment size. Only best-effort rollup. -- `hashed`: Moderate ingestion speed. Creates a well-balanced distribution in segment size. Allows perfect rollup. -- `single_dim`: Slowest ingestion speed. Segment sizes may be skewed depending on the partition key, but the broker can - use the partition information to efficiently prune segments early to speed up queries. Allows perfect rollup. +The three `partitionsSpec` types have different characteristics. 
-#### Hash-based partitioning
+
+| PartitionsSpec | Ingestion speed | Partitioning method | Supported rollup mode | Segment pruning at query time |
+|----------------|-----------------|---------------------|-----------------------|-------------------------------|
+| `dynamic` | Fastest | Partitioning based on number of rows in segment. | Best-effort rollup | N/A |
+| `hashed` | Moderate | Partitioning based on the hash value of partition dimensions. This partitioning may reduce your datasource size and query latency by improving data locality. See [Partitioning](./index.md#partitioning) for more details. | Perfect rollup | N/A |
+| `single_dim` | Slowest | Range partitioning based on the value of the partition dimension. Segment sizes may be skewed depending on the partition key distribution. This may reduce your datasource size and query latency by improving data locality. See [Partitioning](./index.md#partitioning) for more details. | Perfect rollup | The broker can use the partition information to prune segments early to speed up queries. Since the broker knows the range of `partitionDimension` values in each segment, given a query including a filter on the `partitionDimension`, the broker can pick up only the segments holding the rows satisfying the filter on `partitionDimension` for query processing. |
-|property|description|default|required?|
-|--------|-----------|-------|---------|
-|type|This should always be `hashed`|none|yes|
-|numShards|Directly specify the number of shards to create. If this is specified and `intervals` is specified in the `granularitySpec`, the index task can skip the determine intervals/partitions pass through the data. `numShards` cannot be specified if `targetRowsPerSegment` is set.|null|yes|
-|partitionDimensions|The dimensions to partition on. Leave blank to select all dimensions.|null|no|
-
-#### Single-dimension range partitioning
-
-> Because single-range partitioning makes two passes over the input, the index task may fail if the input changes
-> in between the two passes.
-
-|property|description|default|required?|
-|--------|-----------|-------|---------|
-|type|This should always be `single_dim`|none|yes|
-|partitionDimension|The dimension to partition on. Only rows with a single dimension value are allowed.|none|yes|
-|targetRowsPerSegment|Target number of rows to include in a partition, should be a number that targets segments of 500MB\~1GB.|none|either this or `maxRowsPerSegment`|
-|maxRowsPerSegment|Maximum number of rows to include in a partition. Defaults to 50% larger than the `targetRowsPerSegment`.|none|either this or `targetRowsPerSegment`|
-|assumeGrouped|Assume that input data has already been grouped on time and dimensions. Ingestion will run faster, but may choose sub-optimal partitions if this assumption is violated.|false|no|
+The recommended use case for each partitionsSpec is:
+- If your data has a uniformly distributed column which is frequently used in your queries,
+consider using `single_dim` partitionsSpec to maximize the performance of most of your queries.
+- If your data doesn't have a uniformly distributed column, but is expected to have a [high rollup ratio](./index.md#maximizing-rollup-ratio)
+when you roll up with some dimensions, consider using `hashed` partitionsSpec.
+It can reduce the size of your datasource and query latency by improving data locality.
+- If neither of the above scenarios applies, or you don't need to roll up your datasource,
+consider using `dynamic` partitionsSpec.
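For example, a `tuningConfig` using the default dynamic partitioning might look like the sketch below; the `maxRowsPerSegment` and `maxTotalRows` values are the documented defaults, while `maxNumConcurrentSubTasks` is an illustrative value. Switching to `hashed` or `single_dim` means replacing the `partitionsSpec` object and setting `forceGuaranteedRollup` to true, as described in the sections that follow.

```json
"tuningConfig": {
  "type": "index_parallel",
  "maxNumConcurrentSubTasks": 4,
  "partitionsSpec": {
    "type": "dynamic",
    "maxRowsPerSegment": 5000000,
    "maxTotalRows": 20000000
  }
}
```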
#### Dynamic partitioning @@ -279,7 +267,83 @@ The three `partitionsSpec` types have different pros and cons: |--------|-----------|-------|---------| |type|This should always be `dynamic`|none|yes| |maxRowsPerSegment|Used in sharding. Determines how many rows are in each segment.|5000000|no| -|maxTotalRows|Total number of rows in segments waiting for being pushed. Used in determining when intermediate segment push should occur.|20000000|no| +|maxTotalRows|Total number of rows across all segments waiting for being pushed. Used in determining when intermediate segment push should occur.|20000000|no| + +With the Dynamic partitioning, the parallel index task runs in a single phase: +it will spawn multiple worker tasks (type `single_phase_sub_task`), each of which creates segments. +How the worker task creates segments is: + +- The task creates a new segment whenever the number of rows in the current segment exceeds + `maxRowsPerSegment`. +- Once the total number of rows in all segments across all time chunks reaches to `maxTotalRows`, + the task pushes all segments created so far to the deep storage and creates new ones. + +#### Hash-based partitioning + +|property|description|default|required?| +|--------|-----------|-------|---------| +|type|This should always be `hashed`|none|yes| +|numShards|Directly specify the number of shards to create. If this is specified and `intervals` is specified in the `granularitySpec`, the index task can skip the determine intervals/partitions pass through the data.|null|yes| +|partitionDimensions|The dimensions to partition on. Leave blank to select all dimensions.|null|no| + +The Parallel task with hash-based partitioning is similar to [MapReduce](https://en.wikipedia.org/wiki/MapReduce). +The task runs in 2 phases, i.e., `partial segment generation` and `partial segment merge`. +- In the `partial segment generation` phase, just like the Map phase in MapReduce, +the Parallel task splits the input data (currently one split for +each input file or based on `splitHintSpec` for `DruidInputSource`) +and assigns each split to a worker task. Each worker task (type `partial_index_generate`) reads the assigned split, +and partitions rows by the time chunk from `segmentGranularity` (primary partition key) in the `granularitySpec` +and then by the hash value of `partitionDimensions` (secondary partition key) in the `partitionsSpec`. +The partitioned data is stored in local storage of +the [middleManager](../design/middlemanager.md) or the [indexer](../design/indexer.md). +- The `partial segment merge` phase is similar to the Reduce phase in MapReduce. +The Parallel task spawns a new set of worker tasks (type `partial_index_merge`) to merge the partitioned data +created in the previous phase. Here, the partitioned data is shuffled based on +the time chunk and the hash value of `partitionDimensions` to be merged; each worker task reads the data +falling in the same time chunk and the same hash value from multiple MiddleManager/Indexer processes and merges +them to create the final segments. Finally, they push the final segments to the deep storage at once. + +#### Single-dimension range partitioning + +> Single dimension range partitioning is currently not supported in the sequential mode of the Parallel task. +Try set `maxNumConcurrentSubTasks` to larger than 1 to use this partitioning. 
+ +|property|description|default|required?| +|--------|-----------|-------|---------| +|type|This should always be `single_dim`|none|yes| +|partitionDimension|The dimension to partition on. Only rows with a single dimension value are allowed.|none|yes| +|targetRowsPerSegment|Target number of rows to include in a partition, should be a number that targets segments of 500MB\~1GB.|none|either this or `maxRowsPerSegment`| +|maxRowsPerSegment|Soft max for the number of rows to include in a partition.|none|either this or `targetRowsPerSegment`| +|assumeGrouped|Assume that input data has already been grouped on time and dimensions. Ingestion will run faster, but may choose sub-optimal partitions if this assumption is violated.|false|no| + +With `single-dim` partitioning, the Parallel task runs in 3 phases, +i.e., `partial dimension distribution`, `partial segment generation`, and `partial segment merge`. +The first phase is to collect some statistics to find +the best partitioning and the other 2 phases are to create partial segments +and to merge them, respectively, as in hash-based partitioning. +- In the `partial dimension distribution` phase, the Parallel task splits the input data and +assigns them to worker tasks (currently one split for +each input file or based on `splitHintSpec` for `DruidInputSource`). Each worker task (type `partial_dimension_distribution`) reads +the assigned split and builds a histogram for `partitionDimension`. +The Parallel task collects those histograms from worker tasks and finds +the best range partitioning based on `partitionDimension` to evenly +distribute rows across partitions. Note that either `targetRowsPerSegment` +or `maxRowsPerSegment` will be used to find the best partitioning. +- In the `partial segment generation` phase, the Parallel task spawns new worker tasks (type `partial_range_index_generate`) +to create partitioned data. Each worker task reads a split created as in the previous phase, +partitions rows by the time chunk from the `segmentGranularity` (primary partition key) in the `granularitySpec` +and then by the range partitioning found in the previous phase. +The partitioned data is stored in local storage of +the [middleManager](../design/middlemanager.md) or the [indexer](../design/indexer.md). +- In the `partial segment merge` phase, the parallel index task spawns a new set of worker tasks (type `partial_index_generic_merge`) to merge the partitioned +data created in the previous phase. Here, the partitioned data is shuffled based on +the time chunk and the value of `partitionDimension`; each worker task reads the segments +falling in the same partition of the same range from multiple MiddleManager/Indexer processes and merges +them to create the final segments. Finally, they push the final segments to the deep storage. + +> Because the task with single-dimension range partitioning makes two passes over the input +> in `partial dimension distribution` and `partial segment generation` phases, +> the task may fail if the input changes in between the two passes. 
### HTTP status endpoints @@ -350,50 +414,25 @@ An example of the result is "ingestionSpec": { "dataSchema": { "dataSource": "lineitem", - "parser": { - "type": "hadoopyString", - "parseSpec": { - "format": "tsv", - "delimiter": "|", - "timestampSpec": { - "column": "l_shipdate", - "format": "yyyy-MM-dd" - }, - "dimensionsSpec": { - "dimensions": [ - "l_orderkey", - "l_partkey", - "l_suppkey", - "l_linenumber", - "l_returnflag", - "l_linestatus", - "l_shipdate", - "l_commitdate", - "l_receiptdate", - "l_shipinstruct", - "l_shipmode", - "l_comment" - ] - }, - "columns": [ - "l_orderkey", - "l_partkey", - "l_suppkey", - "l_linenumber", - "l_quantity", - "l_extendedprice", - "l_discount", - "l_tax", - "l_returnflag", - "l_linestatus", - "l_shipdate", - "l_commitdate", - "l_receiptdate", - "l_shipinstruct", - "l_shipmode", - "l_comment" - ] - } + "timestampSpec": { + "column": "l_shipdate", + "format": "yyyy-MM-dd" + }, + "dimensionsSpec": { + "dimensions": [ + "l_orderkey", + "l_partkey", + "l_suppkey", + "l_linenumber", + "l_returnflag", + "l_linestatus", + "l_shipdate", + "l_commitdate", + "l_receiptdate", + "l_shipinstruct", + "l_shipmode", + "l_comment" + ] }, "metricsSpec": [ { @@ -443,11 +482,32 @@ An example of the result is }, "ioConfig": { "type": "index_parallel", - "firehose": { + "inputSource": { "type": "local", "baseDir": "/path/to/data/", - "filter": "lineitem.tbl.5", - "parser": null + "filter": "lineitem.tbl.5" + }, + "inputFormat": { + "format": "tsv", + "delimiter": "|", + "columns": [ + "l_orderkey", + "l_partkey", + "l_suppkey", + "l_linenumber", + "l_quantity", + "l_extendedprice", + "l_discount", + "l_tax", + "l_returnflag", + "l_linestatus", + "l_shipdate", + "l_commitdate", + "l_receiptdate", + "l_shipinstruct", + "l_shipmode", + "l_comment" + ] }, "appendToExisting": false }, @@ -548,20 +608,13 @@ A sample task is shown below: "spec" : { "dataSchema" : { "dataSource" : "wikipedia", - "parser" : { - "type" : "string", - "parseSpec" : { - "format" : "json", - "timestampSpec" : { - "column" : "timestamp", - "format" : "auto" - }, - "dimensionsSpec" : { - "dimensions": ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"], - "dimensionExclusions" : [], - "spatialDimensions" : [] - } - } + "timestampSpec" : { + "column" : "timestamp", + "format" : "auto" + }, + "dimensionsSpec" : { + "dimensions": ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"], + "dimensionExclusions" : [] }, "metricsSpec" : [ { @@ -593,10 +646,13 @@ A sample task is shown below: }, "ioConfig" : { "type" : "index", - "firehose" : { + "inputSource" : { "type" : "local", "baseDir" : "examples/indexing/", "filter" : "wikipedia_data.json" + }, + "inputFormat": { + "type": "json" } }, "tuningConfig" : { @@ -632,7 +688,7 @@ that range if there's some stray data with unexpected timestamps. |property|description|default|required?| |--------|-----------|-------|---------| |type|The task type, this should always be "index".|none|yes| -|firehose|Specify a [Firehose](#firehoses) here.|none|yes| +|inputFormat|[`inputFormat`](./data-formats.md#input-format) to specify how to parse input data.|none|yes| |appendToExisting|Creates segments as additional shards of the latest version, effectively appending to the segment set instead of replacing it. 
This will only work if the existing segment set has extendable-type shardSpecs.|false|no| ### `tuningConfig` @@ -679,7 +735,7 @@ For best-effort rollup, you should use `dynamic`. |--------|-----------|-------|---------| |type|This should always be `dynamic`|none|yes| |maxRowsPerSegment|Used in sharding. Determines how many rows are in each segment.|5000000|no| -|maxTotalRows|Total number of rows in segments waiting for being pushed. Used in determining when intermediate segment push should occur.|20000000|no| +|maxTotalRows|Total number of rows in segments waiting for being pushed.|20000000|no| ### `segmentWriteOutMediumFactory` @@ -706,19 +762,573 @@ continues to ingest remaining data. To enable bulk pushing mode, `forceGuaranteedRollup` should be set in the TuningConfig. Note that this option cannot be used with `appendToExisting` of IOConfig. -Firehoses are pluggable and thus the configuration schema can and will vary based on the `type` of the firehose. +## Input Sources -| Field | Type | Description | Required | -|-------|------|-------------|----------| -| type | String | Specifies the type of firehose. Each value will have its own configuration schema, firehoses packaged with Druid are described below. | yes | +The input source is the place to define from where your index task reads data. +Only the native Parallel task and Simple task support the input source. -## Firehoses +### S3 Input Source +> You need to include the [`druid-s3-extensions`](../development/extensions-core/s3.md) as an extension to use the S3 input source. + +The S3 input source is to support reading objects directly from S3. +Objects can be specified either via a list of S3 URI strings or a list of +S3 location prefixes, which will attempt to list the contents and ingest +all objects contained in the locations. The S3 input source is splittable +and can be used by the [Parallel task](#parallel-task), +where each worker task of `index_parallel` will read a single object. + +Sample specs: + +```json +... + "ioConfig": { + "type": "index_parallel", + "inputSource": { + "type": "s3", + "uris": ["s3://foo/bar/file.json", "s3://bar/foo/file2.json"] + }, + "inputFormat": { + "type": "json" + }, + ... + }, +... +``` + +```json +... + "ioConfig": { + "type": "index_parallel", + "inputSource": { + "type": "s3", + "prefixes": ["s3://foo/bar", "s3://bar/foo"] + }, + "inputFormat": { + "type": "json" + }, + ... + }, +... +``` + + +```json +... + "ioConfig": { + "type": "index_parallel", + "inputSource": { + "type": "s3", + "objects": [ + { "bucket": "foo", "path": "bar/file1.json"}, + { "bucket": "bar", "path": "foo/file2.json"} + ] + }, + "inputFormat": { + "type": "json" + }, + ... + }, +... 
+```
+
+|property|description|default|required?|
+|--------|-----------|-------|---------|
+|type|This should be `s3`.|None|yes|
+|uris|JSON array of URIs where S3 objects to be ingested are located.|None|`uris` or `prefixes` or `objects` must be set|
+|prefixes|JSON array of URI prefixes for the locations of S3 objects to be ingested.|None|`uris` or `prefixes` or `objects` must be set|
+|objects|JSON array of S3 Objects to be ingested.|None|`uris` or `prefixes` or `objects` must be set|
+
+S3 Object:
+
+|property|description|default|required?|
+|--------|-----------|-------|---------|
+|bucket|Name of the S3 bucket|None|yes|
+|path|The path where data is located.|None|yes|
+
+### Google Cloud Storage Input Source
+
+> You need to include the [`druid-google-extensions`](../development/extensions-core/google.md) as an extension to use the Google Cloud Storage input source.
+
+The Google Cloud Storage input source is to support reading objects directly
+from Google Cloud Storage. Objects can be specified as a list of Google
+Cloud Storage URI strings. The Google Cloud Storage input source is splittable
+and can be used by the [Parallel task](#parallel-task), where each worker task of `index_parallel` will read a single object.
+
+Sample specs:
+
+```json
+...
+  "ioConfig": {
+    "type": "index_parallel",
+    "inputSource": {
+      "type": "google",
+      "uris": ["gs://foo/bar/file.json", "gs://bar/foo/file2.json"]
+    },
+    "inputFormat": {
+      "type": "json"
+    },
+    ...
+  },
+...
+```
+
+```json
+...
+  "ioConfig": {
+    "type": "index_parallel",
+    "inputSource": {
+      "type": "google",
+      "prefixes": ["gs://foo/bar", "gs://bar/foo"]
+    },
+    "inputFormat": {
+      "type": "json"
+    },
+    ...
+  },
+...
+```
+
+
+```json
+...
+  "ioConfig": {
+    "type": "index_parallel",
+    "inputSource": {
+      "type": "google",
+      "objects": [
+        { "bucket": "foo", "path": "bar/file1.json"},
+        { "bucket": "bar", "path": "foo/file2.json"}
+      ]
+    },
+    "inputFormat": {
+      "type": "json"
+    },
+    ...
+  },
+...
+```
+
+|property|description|default|required?|
+|--------|-----------|-------|---------|
+|type|This should be `google`.|None|yes|
+|uris|JSON array of URIs where Google Cloud Storage objects to be ingested are located.|None|`uris` or `prefixes` or `objects` must be set|
+|prefixes|JSON array of URI prefixes for the locations of Google Cloud Storage objects to be ingested.|None|`uris` or `prefixes` or `objects` must be set|
+|objects|JSON array of Google Cloud Storage objects to be ingested.|None|`uris` or `prefixes` or `objects` must be set|
+
+Google Cloud Storage object:
+
+|property|description|default|required?|
+|--------|-----------|-------|---------|
+|bucket|Name of the Google Cloud Storage bucket|None|yes|
+|path|The path where data is located.|None|yes|
+
+### HDFS Input Source
+
+> You need to include the [`druid-hdfs-storage`](../development/extensions-core/hdfs.md) as an extension to use the HDFS input source.
+
+The HDFS input source is to support reading files directly
+from HDFS storage. File paths can be specified as an HDFS URI string or a list
+of HDFS URI strings. The HDFS input source is splittable and can be used by the [Parallel task](#parallel-task),
+where each worker task of `index_parallel` will read a single file.
+
+Sample specs:
+
+```json
+...
+  "ioConfig": {
+    "type": "index_parallel",
+    "inputSource": {
+      "type": "hdfs",
+      "paths": "hdfs://foo/bar,hdfs://bar/foo"
+    },
+    "inputFormat": {
+      "type": "json"
+    },
+    ...
+  },
+...
+```
+
+```json
+...
+  "ioConfig": {
+    "type": "index_parallel",
+    "inputSource": {
+      "type": "hdfs",
+      "paths": ["hdfs://foo/bar", "hdfs://bar/foo"]
+    },
+    "inputFormat": {
+      "type": "json"
+    },
+    ...
+  },
+...
+```
+
+```json
+...
+  "ioConfig": {
+    "type": "index_parallel",
+    "inputSource": {
+      "type": "hdfs",
+      "paths": "hdfs://foo/bar/file.json,hdfs://bar/foo/file2.json"
+    },
+    "inputFormat": {
+      "type": "json"
+    },
+    ...
+  },
+...
+```
+
+```json
+...
+  "ioConfig": {
+    "type": "index_parallel",
+    "inputSource": {
+      "type": "hdfs",
+      "paths": ["hdfs://foo/bar/file.json", "hdfs://bar/foo/file2.json"]
+    },
+    "inputFormat": {
+      "type": "json"
+    },
+    ...
+  },
+...
+```
+
+|property|description|default|required?|
+|--------|-----------|-------|---------|
+|type|This should be `hdfs`.|None|yes|
+|paths|HDFS paths. Can be either a JSON array or comma-separated string of paths. Wildcards like `*` are supported in these paths.|None|yes|
+
+You can also ingest from cloud storage using the HDFS input source.
+However, if you want to read from AWS S3 or Google Cloud Storage, consider using
+the [S3 input source](#s3-input-source) or the [Google Cloud Storage input source](#google-cloud-storage-input-source) instead.
+
+### HTTP Input Source
+
+The HTTP input source is to support reading files directly
+from remote sites via HTTP.
+The HTTP input source is _splittable_ and can be used by the [Parallel task](#parallel-task),
+where each worker task of `index_parallel` will read a file.
+
+Sample specs:
+
+```json
+...
+  "ioConfig": {
+    "type": "index_parallel",
+    "inputSource": {
+      "type": "http",
+      "uris": ["http://example.com/uri1", "http://example2.com/uri2"]
+    },
+    "inputFormat": {
+      "type": "json"
+    },
+    ...
+  },
+...
+```
+
+Example with authentication fields using the DefaultPassword provider (this requires the password to be in the ingestion spec):
+
+```json
+...
+  "ioConfig": {
+    "type": "index_parallel",
+    "inputSource": {
+      "type": "http",
+      "uris": ["http://example.com/uri1", "http://example2.com/uri2"],
+      "httpAuthenticationUsername": "username",
+      "httpAuthenticationPassword": "password123"
+    },
+    "inputFormat": {
+      "type": "json"
+    },
+    ...
+  },
+...
+```
+
+You can also use the other existing Druid PasswordProviders. Here is an example using the EnvironmentVariablePasswordProvider:
+
+```json
+...
+  "ioConfig": {
+    "type": "index_parallel",
+    "inputSource": {
+      "type": "http",
+      "uris": ["http://example.com/uri1", "http://example2.com/uri2"],
+      "httpAuthenticationUsername": "username",
+      "httpAuthenticationPassword": {
+        "type": "environment",
+        "variable": "HTTP_INPUT_SOURCE_PW"
+      }
+    },
+    "inputFormat": {
+      "type": "json"
+    },
+    ...
+  },
+...
+```
+
+|property|description|default|required?|
+|--------|-----------|-------|---------|
+|type|This should be `http`|None|yes|
+|uris|URIs of the input files.|None|yes|
+|httpAuthenticationUsername|Username to use for authentication with specified URIs. Can be optionally used if the URIs specified in the spec require a Basic Authentication Header.|None|no|
+|httpAuthenticationPassword|PasswordProvider to use with specified URIs. Can be optionally used if the URIs specified in the spec require a Basic Authentication Header.|None|no|
+
+### Inline Input Source
+
+The Inline input source can be used to read the data inlined in its own spec.
+It can be used for demos or for quickly testing out parsing and schema.
+
+Sample spec:
+
+```json
+...
+ "ioConfig": { + "type": "index_parallel", + "inputSource": { + "type": "inline", + "data": "0,values,formatted\n1,as,CSV" + }, + "inputFormat": { + "type": "csv" + }, + ... + }, +... +``` + +|property|description|required?| +|--------|-----------|---------| +|type|This should be "inline".|yes| +|data|Inlined data to ingest.|yes| + +### Local Input Source + +The Local input source is to support reading files directly from local storage, +and is mainly intended for proof-of-concept testing. +The Local input source is _splittable_ and can be used by the [Parallel task](#parallel-task), +where each worker task of `index_parallel` will read a file. + +Sample spec: + +```json +... + "ioConfig": { + "type": "index_parallel", + "inputSource": { + "type": "local", + "filter" : "*.csv", + "baseDir": "/data/directory" + }, + "inputFormat": { + "type": "csv" + }, + ... + }, +... +``` + +|property|description|required?| +|--------|-----------|---------| +|type|This should be "local".|yes| +|filter|A wildcard filter for files. See [here](http://commons.apache.org/proper/commons-io/apidocs/org/apache/commons/io/filefilter/WildcardFileFilter.html) for more information.|yes| +|baseDir|directory to search recursively for files to be ingested. |yes| + +### Druid Input Source + +The Druid input source is to support reading data directly from existing Druid segments, +potentially using a new schema and changing the name, dimensions, metrics, rollup, etc. of the segment. +The Druid input source is _splittable_ and can be used by the [Parallel task](#parallel-task). +This input source has a fixed input format for reading from Druid segments; +no `inputFormat` field needs to be specified in the ingestion spec when using this input source. + +|property|description|required?| +|--------|-----------|---------| +|type|This should be "druid".|yes| +|dataSource|A String defining the Druid datasource to fetch rows from|yes| +|interval|A String representing an ISO-8601 interval, which defines the time range to fetch the data over.|yes| +|dimensions|A list of Strings containing the names of dimension columns to select from the Druid datasource. If the list is empty, no dimensions are returned. If null, all dimensions are returned. |no| +|metrics|The list of Strings containing the names of metric columns to select. If the list is empty, no metrics are returned. If null, all metrics are returned.|no| +|filter| See [Filters](../querying/filters.md). Only rows that match the filter, if specified, will be returned.|no| + +A minimal example DruidInputSource spec is shown below: + +```json +... + "ioConfig": { + "type": "index_parallel", + "inputSource": { + "type": "druid", + "dataSource": "wikipedia", + "interval": "2013-01-01/2013-01-02" + } + ... + }, +... +``` + +The spec above will read all existing dimension and metric columns from +the `wikipedia` datasource, including all rows with a timestamp (the `__time` column) +within the interval `2013-01-01/2013-01-02`. + +A spec that applies a filter and reads a subset of the original datasource's columns is shown below. + +```json +... + "ioConfig": { + "type": "index_parallel", + "inputSource": { + "type": "druid", + "dataSource": "wikipedia", + "interval": "2013-01-01/2013-01-02", + "dimensions": [ + "page", + "user" + ], + "metrics": [ + "added" + ], + "filter": { + "type": "selector", + "dimension": "page", + "value": "Druid" + } + } + ... + }, +... +``` + +This spec above will only return the `page`, `user` dimensions and `added` metric. 
+Only rows where `page` = `Druid` will be returned.
+
+## Firehoses (Deprecated)
+
+Firehoses are deprecated in 0.17.0. It's highly recommended to use the [Input source](#input-sources) instead.

There are several firehoses readily available in Druid, some are meant for examples, others can be used directly in a production environment.

-For additional firehoses, please see our [extensions list](../development/extensions.md).
+### StaticS3Firehose
-
+> You need to include the [`druid-s3-extensions`](../development/extensions-core/s3.md) as an extension to use the StaticS3Firehose.
+
+This firehose ingests events from a predefined list of S3 objects.
+This firehose is _splittable_ and can be used by the [Parallel task](#parallel-task).
+Since each split represents an object in this firehose, each worker task of `index_parallel` will read an object.
+
+Sample spec:
+
+```json
+"firehose" : {
+  "type" : "static-s3",
+  "uris": ["s3://foo/bar/file.gz", "s3://bar/foo/file2.gz"]
+}
+```
+
+This firehose provides caching and prefetching features. In the Simple task, a firehose can be read twice if intervals or
+shardSpecs are not specified, and, in this case, caching can be useful. Prefetching is preferred when direct scan of objects is slow.
+Note that prefetching or caching isn't that useful in the Parallel task.
+
+|property|description|default|required?|
+|--------|-----------|-------|---------|
+|type|This should be `static-s3`.|None|yes|
+|uris|JSON array of URIs where s3 files to be ingested are located.|None|`uris` or `prefixes` must be set|
+|prefixes|JSON array of URI prefixes for the locations of s3 files to be ingested.|None|`uris` or `prefixes` must be set|
+|maxCacheCapacityBytes|Maximum size of the cache space in bytes. 0 means disabling cache. Cached files are not removed until the ingestion task completes.|1073741824|no|
+|maxFetchCapacityBytes|Maximum size of the fetch space in bytes. 0 means disabling prefetch. Prefetched files are removed immediately once they are read.|1073741824|no|
+|prefetchTriggerBytes|Threshold to trigger prefetching s3 objects.|maxFetchCapacityBytes / 2|no|
+|fetchTimeout|Timeout for fetching an s3 object.|60000|no|
+|maxFetchRetry|Maximum retry for fetching an s3 object.|3|no|
+
+### StaticGoogleBlobStoreFirehose
+
+> You need to include the [`druid-google-extensions`](../development/extensions-core/google.md) as an extension to use the StaticGoogleBlobStoreFirehose.
+
+This firehose ingests events, similar to the StaticS3Firehose, but from Google Cloud Storage.
+
+As with the S3 blobstore, an object is assumed to be gzipped if its name ends in `.gz`.
+
+This firehose is _splittable_ and can be used by the [Parallel task](#parallel-task).
+Since each split represents an object in this firehose, each worker task of `index_parallel` will read an object.
+
+Sample spec:
+
+```json
+"firehose" : {
+  "type" : "static-google-blobstore",
+  "blobs": [
+    {
+      "bucket": "foo",
+      "path": "/path/to/your/file.json"
+    },
+    {
+      "bucket": "bar",
+      "path": "/another/path.json"
+    }
+  ]
+}
+```
+
+This firehose provides caching and prefetching features. In the Simple task, a firehose can be read twice if intervals or
+shardSpecs are not specified, and, in this case, caching can be useful. Prefetching is preferred when direct scan of objects is slow.
+Note that prefetching or caching isn't that useful in the Parallel task.
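+
+If you do tune the caching and prefetching behavior, the relevant properties (listed in the table below) sit directly on the firehose object. The following sketch is illustrative only; the values shown are placeholders, not recommendations:
+
+```json
+"firehose" : {
+  "type" : "static-google-blobstore",
+  "blobs": [
+    {
+      "bucket": "foo",
+      "path": "/path/to/your/file.json"
+    }
+  ],
+  "maxCacheCapacityBytes": 1073741824,
+  "maxFetchCapacityBytes": 1073741824,
+  "prefetchTriggerBytes": 536870912,
+  "fetchTimeout": 60000
+}
+```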
+ +|property|description|default|required?| +|--------|-----------|-------|---------| +|type|This should be `static-google-blobstore`.|None|yes| +|blobs|JSON array of Google Blobs.|None|yes| +|maxCacheCapacityBytes|Maximum size of the cache space in bytes. 0 means disabling cache. Cached files are not removed until the ingestion task completes.|1073741824|no| +|maxFetchCapacityBytes|Maximum size of the fetch space in bytes. 0 means disabling prefetch. Prefetched files are removed immediately once they are read.|1073741824|no| +|prefetchTriggerBytes|Threshold to trigger prefetching Google Blobs.|maxFetchCapacityBytes / 2|no| +|fetchTimeout|Timeout for fetching a Google Blob.|60000|no| +|maxFetchRetry|Maximum retry for fetching a Google Blob.|3|no| + +Google Blobs: + +|property|description|default|required?| +|--------|-----------|-------|---------| +|bucket|Name of the Google Cloud bucket|None|yes| +|path|The path where data is located.|None|yes| + +### HDFSFirehose + +> You need to include the [`druid-hdfs-storage`](../development/extensions-core/hdfs.md) as an extension to use the HDFSFirehose. + +This firehose ingests events from a predefined list of files from the HDFS storage. +This firehose is _splittable_ and can be used by the [Parallel task](#parallel-task). +Since each split represents an HDFS file, each worker task of `index_parallel` will read a file. + +Sample spec: + +```json +"firehose" : { + "type" : "hdfs", + "paths": "/foo/bar,/foo/baz" +} +``` + +This firehose provides caching and prefetching features. During native batch indexing, a firehose can be read twice if +`intervals` are not specified, and, in this case, caching can be useful. Prefetching is preferred when direct scanning +of files is slow. +Note that prefetching or caching isn't that useful in the Parallel task. + +|Property|Description|Default| +|--------|-----------|-------| +|type|This should be `hdfs`.|none (required)| +|paths|HDFS paths. Can be either a JSON array or comma-separated string of paths. Wildcards like `*` are supported in these paths.|none (required)| +|maxCacheCapacityBytes|Maximum size of the cache space in bytes. 0 means disabling cache. Cached files are not removed until the ingestion task completes.|1073741824| +|maxFetchCapacityBytes|Maximum size of the fetch space in bytes. 0 means disabling prefetch. Prefetched files are removed immediately once they are read.|1073741824| +|prefetchTriggerBytes|Threshold to trigger prefetching files.|maxFetchCapacityBytes / 2| +|fetchTimeout|Timeout for fetching each file.|60000| +|maxFetchRetry|Maximum number of retries for fetching each file.|3| ### LocalFirehose @@ -791,6 +1401,7 @@ You can also use the other existing Druid PasswordProviders. Here is an example ``` The below configurations can optionally be used for tuning the Firehose performance. +Note that prefetching or caching isn't that useful in the Parallel task. |property|description|default| |--------|-----------|-------| @@ -874,8 +1485,6 @@ Requires one of the following extensions: |type|The type of database to query. Valid values are `mysql` and `postgresql`_||Yes| |connectorConfig|Specify the database connection properties via `connectURI`, `user` and `password`||Yes| - - ### InlineFirehose This Firehose can be used to read the data inlined in its own spec. 
@@ -894,8 +1503,6 @@ A sample inline Firehose spec is shown below: |type|This should be "inline".|yes| |data|Inlined data to ingest.|yes| - - ### CombiningFirehose This Firehose can be used to combine and merge data from a list of different Firehoses. @@ -911,57 +1518,3 @@ This Firehose can be used to combine and merge data from a list of different Fir |--------|-----------|---------| |type|This should be "combining"|yes| |delegates|List of Firehoses to combine data from|yes| - - -## Input Sources - -### DruidInputSource - -This InputSource can be used to read data from existing Druid segments, potentially using a new schema and changing the name, dimensions, metrics, rollup, etc. of the segment. -This InputSource is _splittable_ and can be used by [native parallel index tasks](native-batch.md#parallel-task). -This InputSource has a fixed InputFormat for reading from Druid segments; no InputFormat needs to be specified in the ingestion spec when using this InputSource. - -|property|description|required?| -|--------|-----------|---------| -|type|This should be "druid".|yes| -|dataSource|A String defining the Druid datasource to fetch rows from|yes| -|interval|A String representing an ISO-8601 interval, which defines the time range to fetch the data over.|yes| -|dimensions|A list of Strings containing the names of dimension columns to select from the Druid datasource. If the list is empty, no dimensions are returned. If null, all dimensions are returned. |no| -|metrics|The list of Strings containing the names of metric columns to select. If the list is empty, no metrics are returned. If null, all metrics are returned.|no| -|filter| See [Filters](../querying/filters.md). Only rows that match the filter, if specified, will be returned.|no| - -A minimal example DruidInputSource spec is shown below: - -```json -{ - "type": "druid", - "dataSource": "wikipedia", - "interval": "2013-01-01/2013-01-02" -} -``` - -The spec above will read all existing dimension and metric columns from the `wikipedia` datasource, including all rows with a timestamp (the `__time` column) within the interval `2013-01-01/2013-01-02`. - -A spec that applies a filter and reads a subset of the original datasource's columns is shown below. - -```json -{ - "type": "druid", - "dataSource": "wikipedia", - "interval": "2013-01-01/2013-01-02", - "dimensions": [ - "page", - "user" - ], - "metrics": [ - "added" - ], - "filter": { - "type": "selector", - "dimension": "page", - "value": "Druid" - } -} -``` - -This spec above will only return the `page`, `user` dimensions and `added` metric. Only rows where `page` = `Druid` will be returned. 
diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionParallelIndexTaskRunner.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionParallelIndexTaskRunner.java index 239976b77ca..8f10102fc8f 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionParallelIndexTaskRunner.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialDimensionDistributionParallelIndexTaskRunner.java @@ -34,6 +34,8 @@ import java.util.Map; class PartialDimensionDistributionParallelIndexTaskRunner extends InputSourceSplitParallelIndexTaskRunner { + private static final String PHASE_NAME = "partial dimension distribution"; + // For tests private final IndexTaskClientFactory taskClientFactory; @@ -82,7 +84,7 @@ class PartialDimensionDistributionParallelIndexTaskRunner @Override public String getName() { - return PartialDimensionDistributionTask.TYPE; + return PHASE_NAME; } @Override diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeParallelIndexTaskRunner.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeParallelIndexTaskRunner.java index dab5bd5c13d..d01658a5193 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeParallelIndexTaskRunner.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialGenericSegmentMergeParallelIndexTaskRunner.java @@ -35,6 +35,8 @@ import java.util.Map; class PartialGenericSegmentMergeParallelIndexTaskRunner extends ParallelIndexPhaseRunner { + private static final String PHASE_NAME = "partial segment merge"; + private final DataSchema dataSchema; private final List mergeIOConfigs; @@ -58,7 +60,7 @@ class PartialGenericSegmentMergeParallelIndexTaskRunner @Override public String getName() { - return PartialGenericSegmentMergeTask.TYPE; + return PHASE_NAME; } @Override diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialHashSegmentGenerateParallelIndexTaskRunner.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialHashSegmentGenerateParallelIndexTaskRunner.java index db92233aab3..41fdcd003ee 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialHashSegmentGenerateParallelIndexTaskRunner.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialHashSegmentGenerateParallelIndexTaskRunner.java @@ -36,6 +36,8 @@ import java.util.Map; class PartialHashSegmentGenerateParallelIndexTaskRunner extends InputSourceSplitParallelIndexTaskRunner { + private static final String PHASE_NAME = "partial segment generation"; + // For tests private final IndexTaskClientFactory taskClientFactory; private final AppenderatorsManager appenderatorsManager; @@ -72,7 +74,7 @@ class PartialHashSegmentGenerateParallelIndexTaskRunner @Override public String getName() { - return PartialHashSegmentGenerateTask.TYPE; + return PHASE_NAME; } @Override diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialHashSegmentMergeParallelIndexTaskRunner.java 
b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialHashSegmentMergeParallelIndexTaskRunner.java index 36cf8d1979b..c693513c652 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialHashSegmentMergeParallelIndexTaskRunner.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialHashSegmentMergeParallelIndexTaskRunner.java @@ -37,6 +37,8 @@ import java.util.Map; class PartialHashSegmentMergeParallelIndexTaskRunner extends ParallelIndexPhaseRunner { + private static final String PHASE_NAME = "partial segment merge"; + private final DataSchema dataSchema; private final List mergeIOConfigs; @@ -60,7 +62,7 @@ class PartialHashSegmentMergeParallelIndexTaskRunner @Override public String getName() { - return PartialHashSegmentMergeTask.TYPE; + return PHASE_NAME; } @Override diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateParallelIndexTaskRunner.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateParallelIndexTaskRunner.java index 71f084dab86..86e4d4b75f7 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateParallelIndexTaskRunner.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateParallelIndexTaskRunner.java @@ -38,6 +38,8 @@ import java.util.Map; class PartialRangeSegmentGenerateParallelIndexTaskRunner extends InputSourceSplitParallelIndexTaskRunner> { + private static final String PHASE_NAME = "partial segment generation"; + private final IndexTaskClientFactory taskClientFactory; private final AppenderatorsManager appenderatorsManager; private final Map intervalToPartitions; @@ -87,7 +89,7 @@ class PartialRangeSegmentGenerateParallelIndexTaskRunner @Override public String getName() { - return PartialRangeSegmentGenerateTask.TYPE; + return PHASE_NAME; } @Override diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/SinglePhaseParallelIndexTaskRunner.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/SinglePhaseParallelIndexTaskRunner.java index c98ec2c9964..52ebb04a4b8 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/SinglePhaseParallelIndexTaskRunner.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/SinglePhaseParallelIndexTaskRunner.java @@ -38,9 +38,10 @@ import java.util.Map; * As its name indicates, distributed indexing is done in a single phase, i.e., without shuffling intermediate data. As * a result, this task can't be used for perfect rollup. 
*/ -class SinglePhaseParallelIndexTaskRunner - extends ParallelIndexPhaseRunner +class SinglePhaseParallelIndexTaskRunner extends ParallelIndexPhaseRunner { + private static final String PHASE_NAME = "segment generation"; + private final ParallelIndexIngestionSpec ingestionSchema; private final SplittableInputSource baseInputSource; @@ -70,7 +71,7 @@ class SinglePhaseParallelIndexTaskRunner @Override public String getName() { - return SinglePhaseSubTask.TYPE; + return PHASE_NAME; } @VisibleForTesting diff --git a/website/.spelling b/website/.spelling index cdf9afd275d..311b0b61556 100644 --- a/website/.spelling +++ b/website/.spelling @@ -58,6 +58,7 @@ Double.POSITIVE_INFINITY Double.POSITIVE_INFINITY. Dropwizard dropwizard +DruidInputSource DruidSQL EC2 EC2ContainerCredentialsProviderWrapper @@ -67,6 +68,7 @@ EMRFS ETL Elasticsearch FirehoseFactory +FlattenSpec Float.NEGATIVE_INFINITY Float.POSITIVE_INFINITY ForwardedRequestCustomizer @@ -77,6 +79,7 @@ GUIs GroupBy Guice HDFS +HDFSFirehose HLL HashSet Homebrew @@ -92,6 +95,7 @@ IndexSpec IndexTask InfluxDB InputFormat +InputSource Integer.MAX_VALUE JBOD JDBC @@ -103,6 +107,7 @@ JMX JRE JS JSON +JsonPath JVM JVMs Joda @@ -220,7 +225,10 @@ e.g. encodings endian enum +expr failover +featureSpec +findColumnsFromHeader filenames filesystem firefox @@ -243,6 +251,7 @@ influxdb injective inlined interruptible +jackson-jq javadoc kerberos keystore @@ -329,6 +338,7 @@ searchable servlet sharded sharding +skipHeaderRows smooshed splittable stdout @@ -362,6 +372,7 @@ unparseable unparsed uptime uris +useFieldDiscovery v1 v2 vCPUs