From c7a8bb90e9cc207da2310ad942e6f527d8058a0a Mon Sep 17 00:00:00 2001 From: Igal Levy Date: Mon, 4 Nov 2013 12:45:06 -0800 Subject: [PATCH 01/12] updated link to removed Data-Flow page --- docs/content/Tutorial:-The-Druid-Cluster.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/content/Tutorial:-The-Druid-Cluster.md b/docs/content/Tutorial:-The-Druid-Cluster.md index 31a87cb0bb2..2b6c5c9aea7 100644 --- a/docs/content/Tutorial:-The-Druid-Cluster.md +++ b/docs/content/Tutorial:-The-Druid-Cluster.md @@ -244,5 +244,5 @@ druid.processing.buffer.sizeBytes=10000000 Next Steps ---------- -If you are intested in how data flows through the different Druid components, check out the Druid [Data Flow](Data-Flow.html). Now that you have an understanding of what the Druid cluster looks like, why not load some of your own data? -Check out the next [tutorial](Tutorial%3A-Loading-Your-Data-Part-1.html) section for more info! \ No newline at end of file +If you are interested in how data flows through the different Druid components, check out the [Druid data flow architecture](Design.html). Now that you have an understanding of what the Druid cluster looks like, why not load some of your own data? +Check out the next [tutorial](Tutorial%3A-Loading-Your-Data-Part-1.html) section for more info! From 50327cb841ccb52e538fe4963d8d1107af4645c6 Mon Sep 17 00:00:00 2001 From: Igal Levy Date: Mon, 4 Nov 2013 13:24:02 -0800 Subject: [PATCH 02/12] moved Concepts-and-Terminology to Getting Started section --- docs/content/toc.textile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/content/toc.textile b/docs/content/toc.textile index 7abe247cb00..c9208a8ba60 100644 --- a/docs/content/toc.textile +++ b/docs/content/toc.textile @@ -10,6 +10,7 @@ h1. Contents * "Contribute":./Contribute.html h2. 
Getting Started +* "Concepts and Terminology":./Concepts-and-Terminology.html * "Tutorial: A First Look at Druid":./Tutorial:-A-First-Look-at-Druid.html * "Tutorial: The Druid Cluster":./Tutorial:-The-Druid-Cluster.html * "Tutorial: Loading Your Data Part 1":./Tutorial:-Loading-Your-Data-Part-1.html @@ -62,7 +63,6 @@ h2. Architecture ** "Deep Storage":./Deep-Storage.html ** "MySQL":./MySQL.html ** "ZooKeeper":./ZooKeeper.html -* "Concepts and Terminology":./Concepts-and-Terminology.html h2. Development * "Versioning":./Versioning.html From 61a3c90d7ad8d62e6a7e7ac962becbb5c1a4d6e9 Mon Sep 17 00:00:00 2001 From: Igal Levy Date: Tue, 12 Nov 2013 13:42:39 -0800 Subject: [PATCH 03/12] svg versions of druid architecture diagrams --- docs/_graphics/druid-dataflow-3.svg | 3 +++ docs/_graphics/druid-manage-1.svg | 3 +++ 2 files changed, 6 insertions(+) create mode 100644 docs/_graphics/druid-dataflow-3.svg create mode 100644 docs/_graphics/druid-manage-1.svg diff --git a/docs/_graphics/druid-dataflow-3.svg b/docs/_graphics/druid-dataflow-3.svg new file mode 100644 index 00000000000..7dd97112b5e --- /dev/null +++ b/docs/_graphics/druid-dataflow-3.svg @@ -0,0 +1,3 @@ + + + Produced by OmniGraffle 6.0.1 2013-11-07 21:29ZCanvas 1Layer 1BATCH DATACLIENTCLIENTDATA STREAMCLIENTDEEPSTORAGEDATAQUERIESINDEXINGINDEXINGREALTIMEBROKERHISTORICAL diff --git a/docs/_graphics/druid-manage-1.svg b/docs/_graphics/druid-manage-1.svg new file mode 100644 index 00000000000..3e6dc289e4d --- /dev/null +++ b/docs/_graphics/druid-manage-1.svg @@ -0,0 +1,3 @@ + + + Produced by OmniGraffle 6.0.1 2013-11-07 21:29ZmanagementLayer 1BATCH DATACLIENTCLIENTDATA STREAMCLIENTDEEPSTORAGEDATAQUERIESINDEXINGINDEXINGREALTIMEBROKERHISTORICALmaskmanodesCOORDINATORMySQLZOOKEEPER From d0fe70a21f716a26bbaf6dc0e38c50e535a539e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xavier=20L=C3=A9aut=C3=A9?= Date: Wed, 20 Nov 2013 17:14:23 -0800 Subject: [PATCH 04/12] replace deprecated calls to getJsonFactory and createJsonParser 
--- processing/src/main/java/io/druid/jackson/JacksonModule.java | 2 +- server/src/main/java/io/druid/client/DirectDruidClient.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/processing/src/main/java/io/druid/jackson/JacksonModule.java b/processing/src/main/java/io/druid/jackson/JacksonModule.java index 7f6162e7927..16d19cf9544 100644 --- a/processing/src/main/java/io/druid/jackson/JacksonModule.java +++ b/processing/src/main/java/io/druid/jackson/JacksonModule.java @@ -49,7 +49,7 @@ public class JacksonModule implements Module public ObjectMapper smileMapper() { ObjectMapper retVal = new DefaultObjectMapper(new SmileFactory()); - retVal.getJsonFactory().setCodec(retVal); + retVal.getFactory().setCodec(retVal); return retVal; } } diff --git a/server/src/main/java/io/druid/client/DirectDruidClient.java b/server/src/main/java/io/druid/client/DirectDruidClient.java index 80844ccbeda..5292628d8ea 100644 --- a/server/src/main/java/io/druid/client/DirectDruidClient.java +++ b/server/src/main/java/io/druid/client/DirectDruidClient.java @@ -89,7 +89,7 @@ public class DirectDruidClient implements QueryRunner this.httpClient = httpClient; this.host = host; - this.isSmile = this.objectMapper.getJsonFactory() instanceof SmileFactory; + this.isSmile = this.objectMapper.getFactory() instanceof SmileFactory; this.openConnections = new AtomicInteger(); } @@ -269,7 +269,7 @@ public class DirectDruidClient implements QueryRunner { if (jp == null) { try { - jp = objectMapper.getJsonFactory().createJsonParser(future.get()); + jp = objectMapper.getFactory().createParser(future.get()); if (jp.nextToken() != JsonToken.START_ARRAY) { throw new IAE("Next token wasn't a START_ARRAY, was[%s]", jp.getCurrentToken()); } else { From 3134affac9e1f6fc628289f345c1f0ebfdbf6307 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xavier=20L=C3=A9aut=C3=A9?= Date: Wed, 20 Nov 2013 17:15:26 -0800 Subject: [PATCH 05/12] fix NPE in DirectDruidClient --- 
server/src/main/java/io/druid/client/DirectDruidClient.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/io/druid/client/DirectDruidClient.java b/server/src/main/java/io/druid/client/DirectDruidClient.java index 5292628d8ea..8befef5cabb 100644 --- a/server/src/main/java/io/druid/client/DirectDruidClient.java +++ b/server/src/main/java/io/druid/client/DirectDruidClient.java @@ -292,7 +292,9 @@ public class DirectDruidClient implements QueryRunner @Override public void close() throws IOException { - jp.close(); + if(jp != null) { + jp.close(); + } } } } From a3b60557e2d9d9988c3c7a42155d0668a7cb3c63 Mon Sep 17 00:00:00 2001 From: Igal Levy Date: Wed, 20 Nov 2013 21:32:41 -0800 Subject: [PATCH 06/12] Refactored with new and updated sections and definitions. --- docs/content/Concepts-and-Terminology.md | 44 +++++++++++++++++++----- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/docs/content/Concepts-and-Terminology.md b/docs/content/Concepts-and-Terminology.md index 3b5bc3ca730..722f44f2e31 100644 --- a/docs/content/Concepts-and-Terminology.md +++ b/docs/content/Concepts-and-Terminology.md @@ -4,11 +4,39 @@ layout: doc_page Concepts and Terminology ======================== -* **Aggregators**: A mechanism for combining records during realtime incremental indexing, Hadoop batch indexing, and in queries. -* **DataSource**: A table-like view of data; specified in a "specFile" and in a query. -* **Granularity**: The time interval corresponding to aggregation by time. - * **indexGranularity**: specifies the granularity used to bucket timestamps within a segment. - * **segmentGranularity**: specifies the granularity of the segment, i.e. the amount of time a segment will represent -* **Segment**: A collection of (internal) records that are stored and processed together. -* **Shard**: A sub-partition of the data in a segment. It is possible to have multiple segments represent all data for a given segmentGranularity. 
-* **specFile**: is specification for services in JSON format; see [Realtime](Realtime.html) and [Batch-ingestion](Batch-ingestion.html) +The following definitions are with respect to the Druid data store. They are intended to help you better understand the Druid documentation, where the defined terms are used. While reading the definitions in order isn't necessary, some entries do build on previous definitions. + +More definitions are also available on the [architecture design page](Design). + +## Data + +* **Timeseries Data** Data points which are ordered in time. The closing value of a financial index or the number of tweets per hour with a certain hashtag are examples of timeseries data. + +* **Timestamp** An absolute position on a timeline, given in a standard alpha-numerical format such as with UTC time. Timeseries data points can be ordered by timestamp, and in Druid, they are. + +* **Columns** The format for storing records (as opposed to rows). Druid stores records in columns rather than using the classic row-oriented format of traditional RDBMS. This columnar format allows for performing analytics at speeds magnitudes faster than on row-oriented data. + +* **Dimensions** Aspects or categories of data, such as languages or locations. For example, with *language* and *country* as the type of dimension, values could be "English" or "Mandarin" for language, or "USA" or "China" for country. In Druid, dimensions can serve as filters for narrowing down hits (for example, language = "English" or country = "China"). + +* **Metrics** Countable data that can be aggregated. Metrics, for example, can be the number of visitors to a website, number of tweets per day, or average revenue. + +* **Segment** A collection of (internal) records that are stored and processed together. Druid chunks data into segments representing a time interval, and these are stored and manipulated in the cluster. 
+ +* **Shard** A sub-partition of the data, allowing multiple segments to represent the data in a certain time interval. Sharding occurs along time partitions to better handle amounts of data that exceed certain limits on segment size, although sharding along dimensions may also occur to optimize efficiency. + + +## Ingestion + +* **Aggregation** The summarizing of data meeting certain specifications. Druid aggregates timeseries data, which in effect compacts the data. Time intervals (set in configuration) are used to create buckets, while timestamps determine which buckets data is sent to. + +* **Granularity** The time interval corresponding to aggregation by time. Druid configuration settings specify the granularity of timestamp buckets in a segment (for example, by minute or by hour), as well as the granularity of the segment itself. The latter is essentially the overall range of absolute time covered by the segment. + + + +## Queries + +* **Aggregators** A mechanism for combining records during realtime incremental indexing, Hadoop batch indexing, and in queries. + +* **specFile** The specification for services in JSON format; see [Realtime](Realtime.html) and [Batch-ingestion](Batch-ingestion.html) + +* **DataSource** A table-like view of data; specified in specFiles and in queries. From b645eb19b5ec9d17868af0c3aea9e0282d52dc69 Mon Sep 17 00:00:00 2001 From: Igal Levy Date: Thu, 21 Nov 2013 11:04:22 -0800 Subject: [PATCH 07/12] Refactor to remove sections and order entries. 
--- docs/content/Concepts-and-Terminology.md | 41 ++++++++++++------------ 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/docs/content/Concepts-and-Terminology.md b/docs/content/Concepts-and-Terminology.md index 722f44f2e31..0e6561c2905 100644 --- a/docs/content/Concepts-and-Terminology.md +++ b/docs/content/Concepts-and-Terminology.md @@ -4,39 +4,38 @@ layout: doc_page Concepts and Terminology ======================== -The following definitions are with respect to the Druid data store. They are intended to help you better understand the Druid documentation, where the defined terms are used. While reading the definitions in order isn't necessary, some entries do build on previous definitions. +The following definitions are given with respect to the Druid data store. They are intended to help you better understand the Druid documentation, where these terms and concepts occur. -More definitions are also available on the [architecture design page](Design). +More definitions are also available on the [design page](Design.html). -## Data +* **Aggregation** The summarizing of data meeting certain specifications. Druid aggregates [timeseries data](#timeseries), which in effect compacts the data. Time intervals (set in configuration) are used to create buckets, while [timestamps](#timestamp) determine which buckets data aggregated in. -* **Timeseries Data** Data points which are ordered in time. The closing value of a financial index or the number of tweets per hour with a certain hashtag are examples of timeseries data. +* **Aggregators** A mechanism for combining records during realtime incremental indexing, Hadoop batch indexing, and in queries. -* **Timestamp** An absolute position on a timeline, given in a standard alpha-numerical format such as with UTC time. Timeseries data points can be ordered by timestamp, and in Druid, they are. - -* **Columns** The format for storing records (as opposed to rows). 
Druid stores records in columns rather than using the classic row-oriented format of traditional RDBMS. This columnar format allows for performing analytics at speeds magnitudes faster than on row-oriented data. +* **DataSource** A table-like view of data; specified in [specFiles](#specfile) and in queries. Datasources specify the source of data being ingested and ultimately stored in [segments](#segment). * **Dimensions** Aspects or categories of data, such as languages or locations. For example, with *language* and *country* as the type of dimension, values could be "English" or "Mandarin" for language, or "USA" or "China" for country. In Druid, dimensions can serve as filters for narrowing down hits (for example, language = "English" or country = "China"). +* **Granularity** The time interval corresponding to aggregation by time. Druid configuration settings specify the granularity of timestamp buckets in a segment (for example, by minute or by hour), as well as the granularity of the segment itself. The latter is essentially the overall range of absolute time covered by the segment. In queries, granularity settings control the summarization of findings. + +* **Ingestion** The pulling and initial storing and processing of data. Druid supports realtime and batch ingestion of data, and applies indexing in both cases. + +* **Interval** + * **Metrics** Countable data that can be aggregated. Metrics, for example, can be the number of visitors to a website, number of tweets per day, or average revenue. +* **Rollup** The aggregation of data that occurs at one or more stages, based on settings in a [configuration file](#specFile). + + * **Segment** A collection of (internal) records that are stored and processed together. Druid chunks data into segments representing a time interval, and these are stored and manipulated in the cluster. * **Shard** A sub-partition of the data, allowing multiple segments to represent the data in a certain time interval. 
Sharding occurs along time partitions to better handle amounts of data that exceed certain limits on segment size, although sharding along dimensions may also occur to optimize efficiency. - -## Ingestion - -* **Aggregation** The summarizing of data meeting certain specifications. Druid aggregates timeseries data, which in effect compacts the data. Time intervals (set in configuration) are used to create buckets, while timestamps determine which buckets data is sent to. - -* **Granularity** The time interval corresponding to aggregation by time. Druid configuration settings specify the granularity of timestamp buckets in a segment (for example, by minute or by hour), as well as the granularity of the segment itself. The latter is essentially the overall range of absolute time covered by the segment. - - - -## Queries - -* **Aggregators** A mechanism for combining records during realtime incremental indexing, Hadoop batch indexing, and in queries. - + * **specFile** The specification for services in JSON format; see [Realtime](Realtime.html) and [Batch-ingestion](Batch-ingestion.html) -* **DataSource** A table-like view of data; specified in specFiles and in queries. + +* **Timeseries Data** Data points which are ordered in time. The closing value of a financial index or the number of tweets per hour with a certain hashtag are examples of timeseries data. + + +* **Timestamp** An absolute position on a timeline, given in a standard alpha-numerical format such as with UTC time. Timeseries data points can be ordered by timestamp, and in Druid, they are. 
From 9d36b9962cafa7ce7bb4789a381ff40d39267b2a Mon Sep 17 00:00:00 2001 From: Igal Levy Date: Thu, 21 Nov 2013 11:08:26 -0800 Subject: [PATCH 08/12] fixed awkward wording --- docs/content/Design.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/content/Design.md b/docs/content/Design.md index 8bab38d6a96..c497a3ea47a 100644 --- a/docs/content/Design.md +++ b/docs/content/Design.md @@ -49,7 +49,7 @@ Aside from these nodes, there are 3 external dependencies to the system: 2. A [MySQL instance](MySQL.html) for maintenance of metadata about the data segments that should be served by the system 3. A ["deep storage" LOB store/file system](Deep-Storage.html) to hold the stored segments -The following diagram shows how certain nodes and dependencies help manage the cluster by tracking and exchanging metadata. This management layer is illustrated in the following diagram: +The following diagram illustrates the cluster's management layer, showing how certain nodes and dependencies help manage the cluster by tracking and exchanging metadata: From c71c33ed69ae5d3a341ace0577d06e755f052e07 Mon Sep 17 00:00:00 2001 From: Igal Levy Date: Thu, 21 Nov 2013 15:10:44 -0800 Subject: [PATCH 09/12] * removed terms that are widespread -- such as "column" and "interval" -- since there are too many ways to define them, in favor of allowing them to be defined in context on pages where they occur. For example, "interval" shows up in the JSON blob containing segment metadata, as well as in queries. * removed sections -- it's hard for a new user to understand exactly why these terms are in those sections, or what that means, so better to allow specific definitions or the context in pages to impart those relationships. 
* Added links between entries --- docs/content/Concepts-and-Terminology.md | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/docs/content/Concepts-and-Terminology.md b/docs/content/Concepts-and-Terminology.md index 0e6561c2905..a0422f783cb 100644 --- a/docs/content/Concepts-and-Terminology.md +++ b/docs/content/Concepts-and-Terminology.md @@ -6,22 +6,20 @@ Concepts and Terminology The following definitions are given with respect to the Druid data store. They are intended to help you better understand the Druid documentation, where these terms and concepts occur. -More definitions are also available on the [design page](Design.html). +More definitions are available on the [design page](Design.html). * **Aggregation** The summarizing of data meeting certain specifications. Druid aggregates [timeseries data](#timeseries), which in effect compacts the data. Time intervals (set in configuration) are used to create buckets, while [timestamps](#timestamp) determine which buckets data aggregated in. * **Aggregators** A mechanism for combining records during realtime incremental indexing, Hadoop batch indexing, and in queries. -* **DataSource** A table-like view of data; specified in [specFiles](#specfile) and in queries. Datasources specify the source of data being ingested and ultimately stored in [segments](#segment). +* **DataSource** A table-like view of data; specified in [specFiles](#specfile) and in queries. A dataSource specifies the source of data being ingested and ultimately stored in [segments](#segment). * **Dimensions** Aspects or categories of data, such as languages or locations. For example, with *language* and *country* as the type of dimension, values could be "English" or "Mandarin" for language, or "USA" or "China" for country. In Druid, dimensions can serve as filters for narrowing down hits (for example, language = "English" or country = "China"). -* **Granularity** The time interval corresponding to aggregation by time. 
Druid configuration settings specify the granularity of timestamp buckets in a segment (for example, by minute or by hour), as well as the granularity of the segment itself. The latter is essentially the overall range of absolute time covered by the segment. In queries, granularity settings control the summarization of findings. +* **Granularity** The time interval corresponding to aggregation by time. Druid configuration settings specify the granularity of [timestamp](#timestamp) buckets in a [segment](#segment) (for example, by minute or by hour), as well as the granularity of the segment itself. The latter is essentially the overall range of absolute time covered by the segment. In queries, granularity settings control the summarization of findings. * **Ingestion** The pulling and initial storing and processing of data. Druid supports realtime and batch ingestion of data, and applies indexing in both cases. -* **Interval** - * **Metrics** Countable data that can be aggregated. Metrics, for example, can be the number of visitors to a website, number of tweets per day, or average revenue. * **Rollup** The aggregation of data that occurs at one or more stages, based on settings in a [configuration file](#specFile). @@ -29,7 +27,7 @@ More definitions are also available on the [design page](Design.html). * **Segment** A collection of (internal) records that are stored and processed together. Druid chunks data into segments representing a time interval, and these are stored and manipulated in the cluster. -* **Shard** A sub-partition of the data, allowing multiple segments to represent the data in a certain time interval. Sharding occurs along time partitions to better handle amounts of data that exceed certain limits on segment size, although sharding along dimensions may also occur to optimize efficiency. +* **Shard** A sub-partition of the data, allowing multiple [segments](#segment) to represent the data in a certain time interval. 
Sharding occurs along time partitions to better handle amounts of data that exceed certain limits on segment size, although sharding along dimensions may also occur to optimize efficiency. * **specFile** The specification for services in JSON format; see [Realtime](Realtime.html) and [Batch-ingestion](Batch-ingestion.html) @@ -38,4 +36,4 @@ More definitions are also available on the [design page](Design.html). * **Timeseries Data** Data points which are ordered in time. The closing value of a financial index or the number of tweets per hour with a certain hashtag are examples of timeseries data. -* **Timestamp** An absolute position on a timeline, given in a standard alpha-numerical format such as with UTC time. Timeseries data points can be ordered by timestamp, and in Druid, they are. +* **Timestamp** An absolute position on a timeline, given in a standard alpha-numerical format such as with UTC time. [Timeseries data](#timeseries) points can be ordered by timestamp, and in Druid, they are. From be5ce88093e4930169a5875b85520731075d6dca Mon Sep 17 00:00:00 2001 From: hkmurakami Date: Sat, 23 Nov 2013 12:57:51 -0800 Subject: [PATCH 10/12] fix header formatting in tutorial doc --- docs/content/Tutorial:-A-First-Look-at-Druid.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/content/Tutorial:-A-First-Look-at-Druid.md b/docs/content/Tutorial:-A-First-Look-at-Druid.md index ee8ceba5122..389863d501b 100644 --- a/docs/content/Tutorial:-A-First-Look-at-Druid.md +++ b/docs/content/Tutorial:-A-First-Look-at-Druid.md @@ -140,7 +140,8 @@ The result looks something like this: This groupBy query is a bit complicated and we'll return to it later. For the time being, just make sure you are getting some blocks of data back. If you are having problems, make sure you have [curl](http://curl.haxx.se/) installed. Control+C to break out of the client script. -h2. 
Querying Druid +Querying Druid +-------------- In your favorite editor, create the file: From 964bff065a99fa2a2abf192a5e98d7680151886e Mon Sep 17 00:00:00 2001 From: hkmurakami Date: Sat, 23 Nov 2013 13:13:05 -0800 Subject: [PATCH 11/12] Rename Druid-vs-redshift.md to Druid-vs-Redshift.md links in documentation point to capitalized version. (http://druid.io/docs/0.6.10/) --- docs/content/{Druid-vs-redshift.md => Druid-vs-Redshift.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/content/{Druid-vs-redshift.md => Druid-vs-Redshift.md} (100%) diff --git a/docs/content/Druid-vs-redshift.md b/docs/content/Druid-vs-Redshift.md similarity index 100% rename from docs/content/Druid-vs-redshift.md rename to docs/content/Druid-vs-Redshift.md From eed215e53e165c47d47ae9a43a9dcf398b5c91a1 Mon Sep 17 00:00:00 2001 From: hkmurakami Date: Sat, 23 Nov 2013 13:13:46 -0800 Subject: [PATCH 12/12] Rename Druid-vs-vertica.md to Druid-vs-Vertica.md links in documentation point to capitalized version (http://druid.io/docs/0.6.10/) --- docs/content/{Druid-vs-vertica.md => Druid-vs-Vertica.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/content/{Druid-vs-vertica.md => Druid-vs-Vertica.md} (100%) diff --git a/docs/content/Druid-vs-vertica.md b/docs/content/Druid-vs-Vertica.md similarity index 100% rename from docs/content/Druid-vs-vertica.md rename to docs/content/Druid-vs-Vertica.md