From c06b37f36e264f94a1f3612182a170b7b76b0839 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Mon, 16 Sep 2013 14:08:22 -0700 Subject: [PATCH 1/6] Won't serve html --- docs/_config.yml | 2 + docs/_layouts/default.html | 44 +++++ docs/_layouts/post.html | 9 + .../2013-09-16-welcome-to-jekyll.markdown | 24 +++ docs/css/main.css | 160 ++++++++++++++++++ docs/css/syntax.css | 60 +++++++ docs/index.html | 13 ++ 7 files changed, 312 insertions(+) create mode 100644 docs/_config.yml create mode 100644 docs/_layouts/default.html create mode 100644 docs/_layouts/post.html create mode 100644 docs/_posts/2013-09-16-welcome-to-jekyll.markdown create mode 100755 docs/css/main.css create mode 100644 docs/css/syntax.css create mode 100644 docs/index.html diff --git a/docs/_config.yml b/docs/_config.yml new file mode 100644 index 00000000000..362c8bf5f91 --- /dev/null +++ b/docs/_config.yml @@ -0,0 +1,2 @@ +name: Your New Jekyll Site +pygments: true diff --git a/docs/_layouts/default.html b/docs/_layouts/default.html new file mode 100644 index 00000000000..22e7e3f2a31 --- /dev/null +++ b/docs/_layouts/default.html @@ -0,0 +1,44 @@ + + + + + + {{ page.title }} + + + + + + + + + + + +
+ + + {{ content }} + + +
+ + + diff --git a/docs/_layouts/post.html b/docs/_layouts/post.html new file mode 100644 index 00000000000..04e3586b82a --- /dev/null +++ b/docs/_layouts/post.html @@ -0,0 +1,9 @@ +--- +layout: default +--- +

{{ page.title }}

+

{{ page.date | date_to_string }}

+ +
+{{ content }} +
diff --git a/docs/_posts/2013-09-16-welcome-to-jekyll.markdown b/docs/_posts/2013-09-16-welcome-to-jekyll.markdown new file mode 100644 index 00000000000..c50a740d847 --- /dev/null +++ b/docs/_posts/2013-09-16-welcome-to-jekyll.markdown @@ -0,0 +1,24 @@ +--- +layout: post +title: "Welcome to Jekyll!" +date: 2013-09-16 13:06:49 +categories: jekyll update +--- + +You'll find this post in your `_posts` directory - edit this post and re-build (or run with the `-w` switch) to see your changes! +To add new posts, simply add a file in the `_posts` directory that follows the convention: YYYY-MM-DD-name-of-post.ext. + +Jekyll also offers powerful support for code snippets: + +{% highlight ruby %} +def print_hi(name) + puts "Hi, #{name}" +end +print_hi('Tom') +#=> prints 'Hi, Tom' to STDOUT. +{% endhighlight %} + +Check out the [Jekyll docs][jekyll] for more info on how to get the most out of Jekyll. File all bugs/feature requests at [Jekyll's GitHub repo][jekyll-gh]. + +[jekyll-gh]: https://github.com/mojombo/jekyll +[jekyll]: http://jekyllrb.com diff --git a/docs/css/main.css b/docs/css/main.css new file mode 100755 index 00000000000..50a818048ad --- /dev/null +++ b/docs/css/main.css @@ -0,0 +1,160 @@ +/*****************************************************************************/ +/* +/* Common +/* +/*****************************************************************************/ + +/* Global Reset */ +* { + margin: 0; + padding: 0; +} + +html, body { height: 100%; } + +body { + background-color: #FFF; + font: 13.34px Helvetica, Arial, sans-serif; + font-size: small; + text-align: center; +} + +h1, h2, h3, h4, h5, h6 { + font-size: 100%; } + +h1 { margin-bottom: 1em; } +p { margin: 1em 0; } + +a { color: #00a; } +a:hover { color: #000; } +a:visited { color: #a0a; } + +/*****************************************************************************/ +/* +/* Home +/* +/*****************************************************************************/ +ul.posts { + list-style-type: 
none; + margin-bottom: 2em; +} + +ul.posts li { + line-height: 1.75em; +} + +ul.posts span { + color: #aaa; + font-family: Monaco, "Courier New", monospace; + font-size: 80%; +} + +/*****************************************************************************/ +/* +/* Site +/* +/*****************************************************************************/ + +.site { + font-size: 115%; + text-align: justify; + width: 42em; + margin: 3em auto 2em; + line-height: 1.5em; +} + +.site .header a { + font-weight: bold; + text-decoration: none; +} + +.site .header h1.title { + display: inline-block; + margin-bottom: 2em; +} + +.site .header h1.title a { + color: #a00; +} + +.site .header h1.title a:hover { + color: #000; +} + +.site .header a.extra { + color: #aaa; + margin-left: 1em; +} + +.site .header a.extra:hover { + color: #000; +} + +.site .meta { + color: #aaa; +} + +.site .footer { + font-size: 80%; + color: #666; + border-top: 4px solid #eee; + margin-top: 2em; + overflow: hidden; +} + +.site .footer .contact { + float: left; + margin-right: 3em; +} + +.site .footer .contact a { + color: #8085C1; +} + +.site .footer .rss { + margin-top: 1.1em; + margin-right: -.2em; + float: right; +} + +.site .footer .rss img { + border: 0; +} + +/*****************************************************************************/ +/* +/* Posts +/* +/*****************************************************************************/ + +/* standard */ +.post pre { + border: 1px solid #ddd; + background-color: #eef; + padding: 0 .4em; +} + +.post ul, .post ol { + margin-left: 1.35em; +} + +.post code { + border: 1px solid #ddd; + background-color: #eef; + padding: 0 .2em; +} + +.post pre code { + border: none; +} + +/* terminal */ +.post pre.terminal { + border: 1px solid #000; + background-color: #333; + color: #FFF; +} + +.post pre.terminal code { + background-color: #333; +} diff --git a/docs/css/syntax.css b/docs/css/syntax.css new file mode 100644 index 00000000000..2774b764926 --- 
/dev/null +++ b/docs/css/syntax.css @@ -0,0 +1,60 @@ +.highlight { background: #ffffff; } +.highlight .c { color: #999988; font-style: italic } /* Comment */ +.highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */ +.highlight .k { font-weight: bold } /* Keyword */ +.highlight .o { font-weight: bold } /* Operator */ +.highlight .cm { color: #999988; font-style: italic } /* Comment.Multiline */ +.highlight .cp { color: #999999; font-weight: bold } /* Comment.Preproc */ +.highlight .c1 { color: #999988; font-style: italic } /* Comment.Single */ +.highlight .cs { color: #999999; font-weight: bold; font-style: italic } /* Comment.Special */ +.highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */ +.highlight .gd .x { color: #000000; background-color: #ffaaaa } /* Generic.Deleted.Specific */ +.highlight .ge { font-style: italic } /* Generic.Emph */ +.highlight .gr { color: #aa0000 } /* Generic.Error */ +.highlight .gh { color: #999999 } /* Generic.Heading */ +.highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */ +.highlight .gi .x { color: #000000; background-color: #aaffaa } /* Generic.Inserted.Specific */ +.highlight .go { color: #888888 } /* Generic.Output */ +.highlight .gp { color: #555555 } /* Generic.Prompt */ +.highlight .gs { font-weight: bold } /* Generic.Strong */ +.highlight .gu { color: #aaaaaa } /* Generic.Subheading */ +.highlight .gt { color: #aa0000 } /* Generic.Traceback */ +.highlight .kc { font-weight: bold } /* Keyword.Constant */ +.highlight .kd { font-weight: bold } /* Keyword.Declaration */ +.highlight .kp { font-weight: bold } /* Keyword.Pseudo */ +.highlight .kr { font-weight: bold } /* Keyword.Reserved */ +.highlight .kt { color: #445588; font-weight: bold } /* Keyword.Type */ +.highlight .m { color: #009999 } /* Literal.Number */ +.highlight .s { color: #d14 } /* Literal.String */ +.highlight .na { color: #008080 } /* Name.Attribute */ +.highlight .nb { color: #0086B3 
} /* Name.Builtin */ +.highlight .nc { color: #445588; font-weight: bold } /* Name.Class */ +.highlight .no { color: #008080 } /* Name.Constant */ +.highlight .ni { color: #800080 } /* Name.Entity */ +.highlight .ne { color: #990000; font-weight: bold } /* Name.Exception */ +.highlight .nf { color: #990000; font-weight: bold } /* Name.Function */ +.highlight .nn { color: #555555 } /* Name.Namespace */ +.highlight .nt { color: #000080 } /* Name.Tag */ +.highlight .nv { color: #008080 } /* Name.Variable */ +.highlight .ow { font-weight: bold } /* Operator.Word */ +.highlight .w { color: #bbbbbb } /* Text.Whitespace */ +.highlight .mf { color: #009999 } /* Literal.Number.Float */ +.highlight .mh { color: #009999 } /* Literal.Number.Hex */ +.highlight .mi { color: #009999 } /* Literal.Number.Integer */ +.highlight .mo { color: #009999 } /* Literal.Number.Oct */ +.highlight .sb { color: #d14 } /* Literal.String.Backtick */ +.highlight .sc { color: #d14 } /* Literal.String.Char */ +.highlight .sd { color: #d14 } /* Literal.String.Doc */ +.highlight .s2 { color: #d14 } /* Literal.String.Double */ +.highlight .se { color: #d14 } /* Literal.String.Escape */ +.highlight .sh { color: #d14 } /* Literal.String.Heredoc */ +.highlight .si { color: #d14 } /* Literal.String.Interpol */ +.highlight .sx { color: #d14 } /* Literal.String.Other */ +.highlight .sr { color: #009926 } /* Literal.String.Regex */ +.highlight .s1 { color: #d14 } /* Literal.String.Single */ +.highlight .ss { color: #990073 } /* Literal.String.Symbol */ +.highlight .bp { color: #999999 } /* Name.Builtin.Pseudo */ +.highlight .vc { color: #008080 } /* Name.Variable.Class */ +.highlight .vg { color: #008080 } /* Name.Variable.Global */ +.highlight .vi { color: #008080 } /* Name.Variable.Instance */ +.highlight .il { color: #009999 } /* Literal.Number.Integer.Long */ diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 00000000000..c7268192374 --- /dev/null +++ b/docs/index.html @@ -0,0 +1,13 
@@ +--- +layout: default +title: Your New Jekyll Site +--- + +
+

Blog Posts

+ +
\ No newline at end of file From 248fba683a3cfe8a99c6ebb0d101d3693445e87b Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Mon, 16 Sep 2013 14:49:36 -0700 Subject: [PATCH 2/6] Added prepend tag to make pages display. --- docs/Aggregations.md | 93 ++----------------------- docs/Batch-ingestion.md | 3 + docs/Booting-a-production-cluster.md | 3 + docs/Broker.md | 3 + docs/Build-from-source.md | 3 + docs/Cluster-setup.md | 3 + docs/Compute.md | 3 + docs/Concepts-and-Terminology.md | 3 + docs/Configuration.md | 3 + docs/Contribute.md | 3 + docs/Deep-Storage.md | 3 + docs/Design.md | 3 + docs/Download.md | 3 + docs/Druid-Personal-Demo-Cluster.md | 3 + docs/Druid-vs-Cassandra.md | 3 + docs/Druid-vs-Hadoop.md | 3 + docs/Druid-vs-Impala-or-Shark.md | 3 + docs/Druid-vs-redshift.md | 3 + docs/Druid-vs-vertica.md | 3 + docs/Examples.md | 3 + docs/Filters.md | 3 + docs/Firehose.md | 3 + docs/Granularities.md | 3 + docs/GroupByQuery.md | 3 + docs/Having.md | 3 + docs/Home.md | 3 + docs/Indexing-Service.md | 3 + docs/Libraries.md | 3 + docs/Loading-Your-Data.md | 3 + docs/Master.md | 5 +- docs/MySQL.md | 3 + docs/OrderBy.md | 3 + docs/Plumber.md | 3 + docs/Post-aggregations.md | 3 + docs/Querying-your-data.md | 3 + docs/Querying.md | 3 + docs/Realtime.md | 3 + docs/Rule-Configuration.md | 3 + docs/SearchQuery.md | 3 + docs/SearchQuerySpec.md | 3 + docs/SegmentMetadataQuery.md | 3 + docs/Segments.md | 3 + docs/Spatial-Filters.md | 3 + docs/Spatial-Indexing.md | 3 + docs/Stand-Alone-With-Riak-CS.md | 3 + docs/Support.md | 3 + docs/Tasks.md | 3 + docs/Thanks.md | 3 + docs/TimeBoundaryQuery.md | 3 + docs/TimeseriesQuery.md | 3 + docs/Tutorial:-A-First-Look-at-Druid.md | 3 + docs/Tutorial:-The-Druid-Cluster.md | 3 + docs/Tutorial:-Webstream.md | 3 + docs/Twitter-Tutorial.md | 3 + docs/Versioning.md | 3 + docs/ZooKeeper.md | 3 + docs/_config.yml | 1 + docs/contents.md | 3 + 58 files changed, 176 insertions(+), 88 deletions(-) diff --git a/docs/Aggregations.md b/docs/Aggregations.md 
index ffdbd18a4f2..886dac7a572 100644 --- a/docs/Aggregations.md +++ b/docs/Aggregations.md @@ -1,87 +1,6 @@ -Aggregations are specifications of processing over metrics available in Druid. -Available aggregations are: - -### Sum aggregators - -#### `longSum` aggregator - -computes the sum of values as a 64-bit, signed integer - - { - "type" : "longSum", - "name" : , - "fieldName" : - } - -`name` – output name for the summed value -`fieldName` – name of the metric column to sum over - -#### `doubleSum` aggregator - -Computes the sum of values as 64-bit floating point value. Similar to `longSum` - - { - "type" : "doubleSum", - "name" : , - "fieldName" : - } - -### Count aggregator - -`count` computes the row count that match the filters - - { - "type" : "count", - "name" : , - } - -### Min / Max aggregators - -#### `min` aggregator - -`min` computes the minimum metric value - - { - "type" : "min", - "name" : , - "fieldName" : - } - -#### `max` aggregator - -`max` computes the maximum metric value - - { - "type" : "max", - "name" : , - "fieldName" : - } - -### JavaScript aggregator - -Computes an arbitrary JavaScript function over a set of columns (both metrics and dimensions). - -All JavaScript functions must return numerical values. - - { - "type": "javascript", - "name": "", - "fieldNames" : [ , , ... ], - "fnAggregate" : "function(current, column1, column2, ...) 
{ - - return - }" - "fnCombine" : "function(partialA, partialB) { return ; }" - "fnReset" : "function() { return ; }" - } - -**Example** - - { - "type": "javascript", - "name": "sum(log(x)/y) + 10", - "fieldNames": ["x", "y"], - "fnAggregate" : "function(current, a, b) { return current + (Math.log(a) * b); }" - "fnCombine" : "function(partialA, partialB) { return partialA + partialB; }" - "fnReset" : "function() { return 10; }" - } +--- +layout: default +--- +--- +layout: default +--- diff --git a/docs/Batch-ingestion.md b/docs/Batch-ingestion.md index 97212777bc4..f91f0dbb081 100644 --- a/docs/Batch-ingestion.md +++ b/docs/Batch-ingestion.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Batch Data Ingestion ==================== diff --git a/docs/Booting-a-production-cluster.md b/docs/Booting-a-production-cluster.md index 32181fce2c1..c25ef25c607 100644 --- a/docs/Booting-a-production-cluster.md +++ b/docs/Booting-a-production-cluster.md @@ -1,3 +1,6 @@ +--- +layout: default +--- # Booting a Single Node Cluster # [[Loading Your Data]] and [[Querying Your Data]] contain recipes to boot a small druid cluster on localhost. Here we will boot a small cluster on EC2. You can checkout the code, or download a tarball from [here](http://static.druid.io/artifacts/druid-services-0.5.51-SNAPSHOT-bin.tar.gz). diff --git a/docs/Broker.md b/docs/Broker.md index 6d8f3db2ca0..e71100e9915 100644 --- a/docs/Broker.md +++ b/docs/Broker.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Broker ====== diff --git a/docs/Build-from-source.md b/docs/Build-from-source.md index aaa5411368d..3f323259b80 100644 --- a/docs/Build-from-source.md +++ b/docs/Build-from-source.md @@ -1,3 +1,6 @@ +--- +layout: default +--- ### Clone and Build from Source The other way to setup Druid is from source via git. 
To do so, run these commands: diff --git a/docs/Cluster-setup.md b/docs/Cluster-setup.md index 23cb806d4fc..29837e94296 100644 --- a/docs/Cluster-setup.md +++ b/docs/Cluster-setup.md @@ -1,3 +1,6 @@ +--- +layout: default +--- A Druid cluster consists of various node types that need to be set up depending on your use case. See our [[Design]] docs for a description of the different node types. Setup Scripts diff --git a/docs/Compute.md b/docs/Compute.md index 755f2475707..8df11f8ca2f 100644 --- a/docs/Compute.md +++ b/docs/Compute.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Compute ======= diff --git a/docs/Concepts-and-Terminology.md b/docs/Concepts-and-Terminology.md index a9accabf88a..1e7f535388d 100644 --- a/docs/Concepts-and-Terminology.md +++ b/docs/Concepts-and-Terminology.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Concepts and Terminology ======================== diff --git a/docs/Configuration.md b/docs/Configuration.md index c3150d44805..353b8be77b3 100644 --- a/docs/Configuration.md +++ b/docs/Configuration.md @@ -1,3 +1,6 @@ +--- +layout: default +--- This describes the basic server configuration that is loaded by all the server processes; the same file is loaded by all. See also the json “specFile” descriptions in [[Realtime]] and [[Batch-ingestion]]. JVM Configuration Best Practices diff --git a/docs/Contribute.md b/docs/Contribute.md index 8a5bcc75f99..58d53a6d224 100644 --- a/docs/Contribute.md +++ b/docs/Contribute.md @@ -1,3 +1,6 @@ +--- +layout: default +--- If you are interested in contributing to the code, we accept [pull requests](https://help.github.com/articles/using-pull-requests). Note: we have only just completed decoupling our Metamarkets-specific code from the code base and we took some short-cuts in interface design to make it happen. So, there are a number of interfaces that exist right now which are likely to be in flux. 
If you are embedding Druid in your system, it will be safest for the time being to only extend/implement interfaces that this wiki describes, as those are intended as stable (unless otherwise mentioned). For issue tracking, we are using the github issue tracker. Please fill out an issue from the Issues tab on the github screen. diff --git a/docs/Deep-Storage.md b/docs/Deep-Storage.md index f30aa50333e..bd9a0ec8a66 100644 --- a/docs/Deep-Storage.md +++ b/docs/Deep-Storage.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Deep storage is where segments are stored. It is a storage mechanism that Druid does not provide. This deep storage infrastructure defines the level of durability of your data, as long as Druid nodes can see this storage infrastructure and get at the segments stored on it, you will not lose data no matter how many Druid nodes you lose. If segments disappear from this storage layer, then you will lose whatever data those segments represented. The currently supported types of deep storage follow. diff --git a/docs/Design.md b/docs/Design.md index 2d67f1e3139..888d0b871b3 100644 --- a/docs/Design.md +++ b/docs/Design.md @@ -1,3 +1,6 @@ +--- +layout: default +--- For a comprehensive look at the architecture of Druid, read the [White Paper](http://static.druid.io/docs/druid.pdf). What is Druid? diff --git a/docs/Download.md b/docs/Download.md index 1bdbe799c50..00de8597f11 100644 --- a/docs/Download.md +++ b/docs/Download.md @@ -1,3 +1,6 @@ +--- +layout: default +--- A version may be declared as a release candidate if it has been deployed to a sizable production cluster. Release candidates are declared as stable after we feel fairly confident there are no major bugs in the version. Check out the [[Versioning]] section for how we describe software versions. 
Release Candidate diff --git a/docs/Druid-Personal-Demo-Cluster.md b/docs/Druid-Personal-Demo-Cluster.md index 81a088226f5..ab49d828dbc 100644 --- a/docs/Druid-Personal-Demo-Cluster.md +++ b/docs/Druid-Personal-Demo-Cluster.md @@ -1,3 +1,6 @@ +--- +layout: default +--- # Druid Personal Demo Cluster (DPDC) Note, there are currently some issues with the CloudFormation. We are working through them and will update the documentation here when things work properly. In the meantime, the simplest way to get your feet wet with a cluster setup is to run through the instructions at [housejester/druid-test-harness](https://github.com/housejester/druid-test-harness), though it is based on an older version. If you just want to get a feel for the types of data and queries that you can issue, check out [[Realtime Examples]] diff --git a/docs/Druid-vs-Cassandra.md b/docs/Druid-vs-Cassandra.md index 4cac3922324..e191dde2af7 100644 --- a/docs/Druid-vs-Cassandra.md +++ b/docs/Druid-vs-Cassandra.md @@ -1,3 +1,6 @@ +--- +layout: default +--- We are not experts on Cassandra, if anything is incorrect about our portrayal, please let us know on the mailing list or via some other means. We will fix this page. Druid is highly optimized for scans and aggregations, it supports arbitrarily deep drill downs into data sets without the need to pre-compute, and it can ingest event streams in real-time and allow users to query events as they come in. Cassandra is a great key-value store and it has some features that allow you to use it to do more interesting things than what you can do with a pure key-value store. But, it is not built for the same use cases that Druid handles, namely regularly scanning over billions of entries per query. diff --git a/docs/Druid-vs-Hadoop.md b/docs/Druid-vs-Hadoop.md index 68744179b1e..37559b1da8f 100644 --- a/docs/Druid-vs-Hadoop.md +++ b/docs/Druid-vs-Hadoop.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Druid is a complementary addition to Hadoop. 
Hadoop is great at storing and making accessible large amounts of individually low-value data. Unfortunately, Hadoop is not great at providing query speed guarantees on top of that data, nor does it have very good operational characteristics for a customer-facing production system. Druid, on the other hand, excels at taking high-value summaries of the low-value data on Hadoop, making it available in a fast and always-on fashion, such that it could be exposed directly to a customer. Druid also requires some infrastructure to exist for “deep storage”. HDFS is one of the implemented options for this “deep storage”. diff --git a/docs/Druid-vs-Impala-or-Shark.md b/docs/Druid-vs-Impala-or-Shark.md index e9a0c673b87..3174fbbea5f 100644 --- a/docs/Druid-vs-Impala-or-Shark.md +++ b/docs/Druid-vs-Impala-or-Shark.md @@ -1,3 +1,6 @@ +--- +layout: default +--- The question of Druid versus Impala or Shark basically comes down to your product requirements and what the systems were designed to do. Druid was designed to diff --git a/docs/Druid-vs-redshift.md b/docs/Druid-vs-redshift.md index 2b360a4668b..8469209b10b 100644 --- a/docs/Druid-vs-redshift.md +++ b/docs/Druid-vs-redshift.md @@ -1,3 +1,6 @@ +--- +layout: default +--- ###How does Druid compare to Redshift? In terms of drawing a differentiation, Redshift is essentially ParAccel (Actian) which Amazon is licensing. diff --git a/docs/Druid-vs-vertica.md b/docs/Druid-vs-vertica.md index b35f62e9f03..b20976b74a6 100644 --- a/docs/Druid-vs-vertica.md +++ b/docs/Druid-vs-vertica.md @@ -1,3 +1,6 @@ +--- +layout: default +--- How does Druid compare to Vertica? Vertica is similar to ParAccel/Redshift ([[Druid-vs-Redshift]]) described above in that it wasn’t built for real-time streaming data ingestion and it supports full SQL. 
diff --git a/docs/Examples.md b/docs/Examples.md index 88ca41fb4fa..9ab10466e56 100644 --- a/docs/Examples.md +++ b/docs/Examples.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Examples ======== diff --git a/docs/Filters.md b/docs/Filters.md index f655861d5fb..41ae91f93e2 100644 --- a/docs/Filters.md +++ b/docs/Filters.md @@ -1,3 +1,6 @@ +--- +layout: default +--- A filter is a JSON object indicating which rows of data should be included in the computation for a query. It’s essentially the equivalent of the WHERE clause in SQL. Druid supports the following types of filters. ### Selector filter diff --git a/docs/Firehose.md b/docs/Firehose.md index ab9b2ac53d2..c571f035a10 100644 --- a/docs/Firehose.md +++ b/docs/Firehose.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Firehoses describe the data stream source. They are pluggable and thus the configuration schema can and will vary based on the `type` of the firehose. |Field|Type|Description|Required| diff --git a/docs/Granularities.md b/docs/Granularities.md index ea568dd7d62..cf5283841c0 100644 --- a/docs/Granularities.md +++ b/docs/Granularities.md @@ -1,3 +1,6 @@ +--- +layout: default +--- The granularity field determines how data gets bucketed across the time dimension, i.e how it gets aggregated by hour, day, minute, etc. It can be specified either as a string for simple granularities or as an object for arbitrary granularities. diff --git a/docs/GroupByQuery.md b/docs/GroupByQuery.md index 735dd5c393a..656ff1a41a1 100644 --- a/docs/GroupByQuery.md +++ b/docs/GroupByQuery.md @@ -1,3 +1,6 @@ +--- +layout: default +--- These types of queries take a groupBy query object and return an array of JSON objects where each object represents a grouping asked for by the query. 
An example groupBy query object is shown below: diff --git a/docs/Having.md b/docs/Having.md index 47226f1b88e..62ab4644451 100644 --- a/docs/Having.md +++ b/docs/Having.md @@ -1,3 +1,6 @@ +--- +layout: default +--- A having clause is a JSON object identifying which rows from a groupBy query should be returned, by specifying conditions on aggregated values. It is essentially the equivalent of the HAVING clause in SQL. diff --git a/docs/Home.md b/docs/Home.md index 88e1c86b8aa..934f11b8c92 100644 --- a/docs/Home.md +++ b/docs/Home.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Druid is an open-source analytics datastore designed for realtime, exploratory, queries on large-scale data sets (100’s of Billions entries, 100’s TB data). Druid provides for cost effective, always-on, realtime data ingestion and arbitrary data exploration. - Check out some [[Examples]] diff --git a/docs/Indexing-Service.md b/docs/Indexing-Service.md index 0e4ff939f4a..60abbd73b9f 100644 --- a/docs/Indexing-Service.md +++ b/docs/Indexing-Service.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Disclaimer: We are still in the process of finalizing the indexing service and these configs are prone to change at any time. We will announce when we feel the indexing service and the configurations described are stable. The indexing service is a distributed task/job queue. It accepts requests in the form of [[Tasks]] and executes those tasks across a set of worker nodes. Worker capacity can be automatically adjusted based on the number of tasks pending in the system. The indexing service is highly available, has built in retry logic, and can backup per task logs in deep storage. 
diff --git a/docs/Libraries.md b/docs/Libraries.md index 41374e310c1..75bc17c633c 100644 --- a/docs/Libraries.md +++ b/docs/Libraries.md @@ -1,3 +1,6 @@ +--- +layout: default +--- ### R - [RDruid](https://github.com/metamx/RDruid) - Druid connector for R diff --git a/docs/Loading-Your-Data.md b/docs/Loading-Your-Data.md index 568a20767ac..dd4b0f8a7fb 100644 --- a/docs/Loading-Your-Data.md +++ b/docs/Loading-Your-Data.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Once you have a realtime node working, it is time to load your own data to see how Druid performs. Druid can ingest data in three ways: via Kafka and a realtime node, via the indexing service, and via the Hadoop batch loader. Data is ingested in realtime using a [[Firehose]]. diff --git a/docs/Master.md b/docs/Master.md index 891f6b854ef..f7345524980 100644 --- a/docs/Master.md +++ b/docs/Master.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Master ====== @@ -12,7 +15,7 @@ Rules Segments are loaded and dropped from the cluster based on a set of rules. Rules indicate how segments should be assigned to different compute node tiers and how many replicants of a segment should exist in each tier. Rules may also indicate when segments should be dropped entirely from the cluster. The master loads a set of rules from the database. Rules may be specific to a certain datasource and/or a default set of rules can be configured. Rules are read in order and hence the ordering of rules is important. The master will cycle through all available segments and match each segment with the first rule that applies. Each segment may only match a single rule -For more information on rules, see [[Rule Configuration]]. +For more information on rules, see [[Rule Configuration.md]]. Cleaning Up Segments -------------------- diff --git a/docs/MySQL.md b/docs/MySQL.md index 79cf6ed6d8b..f7ee2ec4db1 100644 --- a/docs/MySQL.md +++ b/docs/MySQL.md @@ -1,3 +1,6 @@ +--- +layout: default +--- MySQL is an external dependency of Druid. 
We use it to store various metadata about the system, but not to store the actual data. There are a number of tables used for various purposes described below. Segments Table diff --git a/docs/OrderBy.md b/docs/OrderBy.md index 993df6f4674..9dcffff7886 100644 --- a/docs/OrderBy.md +++ b/docs/OrderBy.md @@ -1,3 +1,6 @@ +--- +layout: default +--- The orderBy field provides the functionality to sort and limit the set of results from a groupBy query. Available options are: ### DefaultLimitSpec diff --git a/docs/Plumber.md b/docs/Plumber.md index cf650fb6cdd..b2123e94393 100644 --- a/docs/Plumber.md +++ b/docs/Plumber.md @@ -1,3 +1,6 @@ +--- +layout: default +--- The Plumber is the thing that handles generated segments both while they are being generated and when they are “done”. This is also technically a pluggable interface and there are multiple implementations, but there are a lot of details handled by the plumber such that it is expected that there will only be a few implementations and only more advanced third-parties will implement their own. See [here](https://github.com/metamx/druid/wiki/Plumber#available-plumbers) for a description of the plumbers included with Druid. |Field|Type|Description|Required| diff --git a/docs/Post-aggregations.md b/docs/Post-aggregations.md index 8ff7a91ecb5..4aa6c7f8db7 100644 --- a/docs/Post-aggregations.md +++ b/docs/Post-aggregations.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Post-aggregations are specifications of processing that should happen on aggregated values as they come out of Druid. If you include a post aggregation as part of a query, make sure to include all aggregators the post-aggregator requires. There are several post-aggregators available. 
diff --git a/docs/Querying-your-data.md b/docs/Querying-your-data.md index 520edcaf613..39d22ab3a32 100644 --- a/docs/Querying-your-data.md +++ b/docs/Querying-your-data.md @@ -1,3 +1,6 @@ +--- +layout: default +--- # Setup # Before we start querying druid, we're going to finish setting up a complete cluster on localhost. In [[Loading Your Data]] we setup a [[Realtime]], [[Compute]] and [[Master]] node. If you've already completed that tutorial, you need only follow the directions for 'Booting a Broker Node'. diff --git a/docs/Querying.md b/docs/Querying.md index 21ed93c7bb5..db845bc694f 100644 --- a/docs/Querying.md +++ b/docs/Querying.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Querying ======== diff --git a/docs/Realtime.md b/docs/Realtime.md index 1908a469f80..c92cc7f7175 100644 --- a/docs/Realtime.md +++ b/docs/Realtime.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Realtime ======== diff --git a/docs/Rule-Configuration.md b/docs/Rule-Configuration.md index 1d2b4c03461..2695da646ab 100644 --- a/docs/Rule-Configuration.md +++ b/docs/Rule-Configuration.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Note: It is recommended that the master console is used to configure rules. However, the master node does have HTTP endpoints to programmatically configure rules. Load Rules diff --git a/docs/SearchQuery.md b/docs/SearchQuery.md index af125889c32..7acf04419fa 100644 --- a/docs/SearchQuery.md +++ b/docs/SearchQuery.md @@ -1,3 +1,6 @@ +--- +layout: default +--- A search query returns dimension values that match the search specification. { diff --git a/docs/SearchQuerySpec.md b/docs/SearchQuerySpec.md index 48036c65d56..9b9db04b8e6 100644 --- a/docs/SearchQuerySpec.md +++ b/docs/SearchQuerySpec.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Search query specs define how a “match” is defined between a search value and a dimension value. 
The available search query specs are: InsensitiveContainsSearchQuerySpec diff --git a/docs/SegmentMetadataQuery.md b/docs/SegmentMetadataQuery.md index 606d0800447..0e6eefb78e1 100644 --- a/docs/SegmentMetadataQuery.md +++ b/docs/SegmentMetadataQuery.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Segment metadata queries return per segment information about: \* Cardinality of all columns in the segment \* Estimated byte size for the segment columns in TSV format diff --git a/docs/Segments.md b/docs/Segments.md index 5bffdd30b10..7da12950d15 100644 --- a/docs/Segments.md +++ b/docs/Segments.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Segments ======== diff --git a/docs/Spatial-Filters.md b/docs/Spatial-Filters.md index c9ce15d5cc9..2ca83b9a3f9 100644 --- a/docs/Spatial-Filters.md +++ b/docs/Spatial-Filters.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Note: This feature is highly experimental and only works with spatially indexed dimensions. The grammar for a spatial filter is as follows: diff --git a/docs/Spatial-Indexing.md b/docs/Spatial-Indexing.md index 5f7dc2b174c..1df36593433 100644 --- a/docs/Spatial-Indexing.md +++ b/docs/Spatial-Indexing.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Note: This feature is highly experimental. In any of the data specs, there is now the option of providing spatial dimensions. For example, for a JSON data spec, spatial dimensions can be specified as follows: diff --git a/docs/Stand-Alone-With-Riak-CS.md b/docs/Stand-Alone-With-Riak-CS.md index aaa77b3151c..505b59f9283 100644 --- a/docs/Stand-Alone-With-Riak-CS.md +++ b/docs/Stand-Alone-With-Riak-CS.md @@ -1,3 +1,6 @@ +--- +layout: default +--- This page describes how to use Riak-CS for deep storage instead of S3. We are still setting up some of the peripheral stuff (file downloads, etc.). This guide provided by Pablo Nebrera, thanks! 
diff --git a/docs/Support.md b/docs/Support.md index 1561e935381..3dd512e050f 100644 --- a/docs/Support.md +++ b/docs/Support.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Numerous backend engineers at [Metamarkets](http://www.metamarkets.com) work on Druid full-time. If you have any questions about usage or code, feel free to contact any of us. Google Groups Mailing List diff --git a/docs/Tasks.md b/docs/Tasks.md index 53f441696d9..95341f581ec 100644 --- a/docs/Tasks.md +++ b/docs/Tasks.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Tasks are run on workers and always operate on a single datasource. Once an indexer coordinator node accepts a task, a lock is created for the datasource and interval specified in the task. Tasks do not need to explicitly release locks, they are released upon task completion. Tasks may potentially release locks early if they desire. Tasks ids are unique by naming them using UUIDs or the timestamp in which the task was created. Tasks are also part of a “task group”, which is a set of tasks that can share interval locks. There are several different types of tasks. diff --git a/docs/Thanks.md b/docs/Thanks.md index f84708fb6c8..cb1c873cca0 100644 --- a/docs/Thanks.md +++ b/docs/Thanks.md @@ -1,3 +1,6 @@ +--- +layout: default +--- YourKit supports the Druid open source projects with its full-featured Java Profiler. YourKit, LLC is the creator of innovative and intelligent tools for profiling diff --git a/docs/TimeBoundaryQuery.md b/docs/TimeBoundaryQuery.md index 432df69961d..bde4ca1c812 100644 --- a/docs/TimeBoundaryQuery.md +++ b/docs/TimeBoundaryQuery.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Time boundary queries return the earliest and latest data points of a data set. 
The grammar is: { diff --git a/docs/TimeseriesQuery.md b/docs/TimeseriesQuery.md index d189b176a01..56f2ce733b9 100644 --- a/docs/TimeseriesQuery.md +++ b/docs/TimeseriesQuery.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Timeseries queries ================== diff --git a/docs/Tutorial:-A-First-Look-at-Druid.md b/docs/Tutorial:-A-First-Look-at-Druid.md index ef725135aa4..4722dd173c0 100644 --- a/docs/Tutorial:-A-First-Look-at-Druid.md +++ b/docs/Tutorial:-A-First-Look-at-Druid.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Greetings! This tutorial will help clarify some core Druid concepts. We will use a realtime dataset and issue some basic Druid queries. If you are ready to explore Druid, and learn a thing or two, read on! About the data diff --git a/docs/Tutorial:-The-Druid-Cluster.md b/docs/Tutorial:-The-Druid-Cluster.md index b01824e52a2..e2eff84f505 100644 --- a/docs/Tutorial:-The-Druid-Cluster.md +++ b/docs/Tutorial:-The-Druid-Cluster.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Welcome back! In our first [tutorial](https://github.com/metamx/druid/wiki/Tutorial%3A-A-First-Look-at-Druid), we introduced you to the most basic Druid setup: a single realtime node. We streamed in some data and queried it. Realtime nodes collect very recent data and periodically hand that data off to the rest of the Druid cluster. Some questions about the architecture must naturally come to mind. What does the rest of Druid cluster look like? How does Druid load available static data? This tutorial will hopefully answer these questions! diff --git a/docs/Tutorial:-Webstream.md b/docs/Tutorial:-Webstream.md index c8b0bcada8b..973204f31d4 100644 --- a/docs/Tutorial:-Webstream.md +++ b/docs/Tutorial:-Webstream.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Greetings! This tutorial will help clarify some core Druid concepts. We will use a realtime dataset and issue some basic Druid queries. If you are ready to explore Druid, and learn a thing or two, read on! 
About the data diff --git a/docs/Twitter-Tutorial.md b/docs/Twitter-Tutorial.md index c113282e937..cedd26b9250 100644 --- a/docs/Twitter-Tutorial.md +++ b/docs/Twitter-Tutorial.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Greetings! We see you’ve taken an interest in Druid. That’s awesome! Hopefully this tutorial will help clarify some core Druid concepts. We will go through one of the Real-time [[Examples]], and issue some basic Druid queries. The data source we’ll be working with is the [Twitter spritzer stream](https://dev.twitter.com/docs/streaming-apis/streams/public). If you are ready to explore Druid, brave its challenges, and maybe learn a thing or two, read on! Setting Up diff --git a/docs/Versioning.md b/docs/Versioning.md index 33c665d6542..7b9fa24045c 100644 --- a/docs/Versioning.md +++ b/docs/Versioning.md @@ -1,3 +1,6 @@ +--- +layout: default +--- This page discusses how we do versioning and provides information on our stable releases. Versioning Strategy diff --git a/docs/ZooKeeper.md b/docs/ZooKeeper.md index 250d0280bcb..03f2b1b8e0c 100644 --- a/docs/ZooKeeper.md +++ b/docs/ZooKeeper.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Druid uses ZooKeeper (ZK) for management of current cluster state. The operations that happen over ZK are 1. 
[[Master]] leader election diff --git a/docs/_config.yml b/docs/_config.yml index 362c8bf5f91..1ba74937d8b 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -1,2 +1,3 @@ name: Your New Jekyll Site pygments: true +markdown: redcarpet diff --git a/docs/contents.md b/docs/contents.md index 2298a7c8bdf..23b56bc33a5 100644 --- a/docs/contents.md +++ b/docs/contents.md @@ -1,3 +1,6 @@ +--- +layout: default +--- Contents \* [[Introduction|Home]] \* [[Download]] From 063a068ab238f2412c090c33c756a73540a77b17 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Mon, 16 Sep 2013 16:01:14 -0700 Subject: [PATCH 3/6] Converted links, sans space to slash --- docs/Aggregations.md | 90 ++++++++++++++++++++++++- docs/Batch-ingestion.md | 8 +-- docs/Booting-a-production-cluster.md | 2 +- docs/Broker.md | 6 +- docs/Cluster-setup.md | 40 +++++------ docs/Compute.md | 8 +-- docs/Concepts-and-Terminology.md | 2 +- docs/Configuration.md | 12 ++-- docs/Contribute.md | 2 +- docs/Design.md | 2 +- docs/Download.md | 2 +- docs/Druid-Personal-Demo-Cluster.md | 4 +- docs/Druid-vs-Impala-or-Shark.md | 4 +- docs/Druid-vs-vertica.md | 2 +- docs/Examples.md | 4 +- docs/Firehose.md | 4 +- docs/Granularities.md | 2 +- docs/GroupByQuery.md | 12 ++-- docs/Home.md | 14 ++-- docs/Indexing-Service.md | 4 +- docs/Loading-Your-Data.md | 16 ++--- docs/Master.md | 4 +- docs/MySQL.md | 6 +- docs/Post-aggregations.md | 4 +- docs/Querying-your-data.md | 16 ++--- docs/Querying.md | 18 ++--- docs/Realtime.md | 10 +-- docs/SearchQuery.md | 6 +- docs/Segments.md | 2 +- docs/Stand-Alone-With-Riak-CS.md | 10 +-- docs/TimeseriesQuery.md | 8 +-- docs/Tutorial:-A-First-Look-at-Druid.md | 16 ++--- docs/Tutorial:-The-Druid-Cluster.md | 2 +- docs/Tutorial:-Webstream.md | 12 ++-- docs/Twitter-Tutorial.md | 14 ++-- docs/Versioning.md | 2 +- docs/ZooKeeper.md | 12 ++-- docs/contents.md | 64 +++++++++--------- 38 files changed, 265 insertions(+), 181 deletions(-) diff --git a/docs/Aggregations.md b/docs/Aggregations.md 
index 886dac7a572..37b99aeffc2 100644 --- a/docs/Aggregations.md +++ b/docs/Aggregations.md @@ -1,6 +1,90 @@ --- layout: default --- ---- -layout: default ---- +Aggregations are specifications of processing over metrics available in Druid. +Available aggregations are: + +### Sum aggregators + +#### `longSum` aggregator + +computes the sum of values as a 64-bit, signed integer + + { + "type" : "longSum", + "name" : <output_name>, + "fieldName" : <metric_name> + } + +`name` – output name for the summed value +`fieldName` – name of the metric column to sum over + +#### `doubleSum` aggregator + +Computes the sum of values as a 64-bit floating point value. Similar to `longSum` + + { + "type" : "doubleSum", + "name" : <output_name>, + "fieldName" : <metric_name> + } + +### Count aggregator + +`count` computes the number of rows that match the filters + + { + "type" : "count", + "name" : <output_name> + } + +### Min / Max aggregators + +#### `min` aggregator + +`min` computes the minimum metric value + + { + "type" : "min", + "name" : <output_name>, + "fieldName" : <metric_name> + } + +#### `max` aggregator + +`max` computes the maximum metric value + + { + "type" : "max", + "name" : <output_name>, + "fieldName" : <metric_name> + } + +### JavaScript aggregator + +Computes an arbitrary JavaScript function over a set of columns (both metrics and dimensions). + +All JavaScript functions must return numerical values. + + { + "type": "javascript", + "name": "<output_name>", + "fieldNames" : [ <column1>, <column2>, ... ], + "fnAggregate" : "function(current, column1, column2, ...) 
{ + <updates partial aggregate (current) based on the current row values> + return <updated partial aggregate> + }", + "fnCombine" : "function(partialA, partialB) { return <combined partial results>; }", + "fnReset" : "function() { return <initial value>; }" + } + +**Example** + + { + "type": "javascript", + "name": "sum(log(x)*y) + 10", + "fieldNames": ["x", "y"], + "fnAggregate" : "function(current, a, b) { return current + (Math.log(a) * b); }", + "fnCombine" : "function(partialA, partialB) { return partialA + partialB; }", + "fnReset" : "function() { return 10; }" + } \ No newline at end of file diff --git a/docs/Batch-ingestion.md b/docs/Batch-ingestion.md index f91f0dbb081..6511b85b452 100644 --- a/docs/Batch-ingestion.md +++ b/docs/Batch-ingestion.md @@ -4,14 +4,14 @@ layout: default Batch Data Ingestion ==================== -There are two choices for batch data ingestion to your Druid cluster, you can use the [[Indexing service]] or you can use the `HadoopDruidIndexerMain`. This page describes how to use the `HadoopDruidIndexerMain`. +There are two choices for batch data ingestion to your Druid cluster, you can use the [Indexing service](Indexing-service.html) or you can use the `HadoopDruidIndexerMain`. This page describes how to use the `HadoopDruidIndexerMain`. Which should I use? ------------------- -The [[Indexing service]] is a node that can run as part of your Druid cluster and can accomplish a number of different types of indexing tasks. Even if all you care about is batch indexing, it provides for the encapsulation of things like the Database that is used for segment metadata and other things, so that your indexing tasks do not need to include such information. Long-term, the indexing service is going to be the preferred method of ingesting data. +The [Indexing service](Indexing service.html) is a node that can run as part of your Druid cluster and can accomplish a number of different types of indexing tasks. 
Even if all you care about is batch indexing, it provides for the encapsulation of things like the Database that is used for segment metadata and other things, so that your indexing tasks do not need to include such information. Long-term, the indexing service is going to be the preferred method of ingesting data. -The `HadoopDruidIndexerMain` runs hadoop jobs in order to separate and index data segments. It takes advantage of Hadoop as a job scheduling and distributed job execution platform. It is a simple method if you already have Hadoop running and don’t want to spend the time configuring and deploying the [[Indexing service]] just yet. +The `HadoopDruidIndexerMain` runs hadoop jobs in order to separate and index data segments. It takes advantage of Hadoop as a job scheduling and distributed job execution platform. It is a simple method if you already have Hadoop running and don’t want to spend the time configuring and deploying the [Indexing service](Indexing-Service.html) just yet. HadoopDruidIndexer ------------------ @@ -138,4 +138,4 @@ This is a specification of the properties that tell the job how to update metada |password|password for db|yes| |segmentTable|table to use in DB|yes| -These properties should parrot what you have configured for your [[Master]]. +These properties should parrot what you have configured for your [Master](Master.html). diff --git a/docs/Booting-a-production-cluster.md b/docs/Booting-a-production-cluster.md index c25ef25c607..d5fc38c8ce5 100644 --- a/docs/Booting-a-production-cluster.md +++ b/docs/Booting-a-production-cluster.md @@ -3,7 +3,7 @@ layout: default --- # Booting a Single Node Cluster # -[[Loading Your Data]] and [[Querying Your Data]] contain recipes to boot a small druid cluster on localhost. Here we will boot a small cluster on EC2. You can checkout the code, or download a tarball from [here](http://static.druid.io/artifacts/druid-services-0.5.51-SNAPSHOT-bin.tar.gz). 
+[Loading Your Data](Loading-Your-Data.html) and [Querying Your Data](Querying-Your-Data.html) contain recipes to boot a small druid cluster on localhost. Here we will boot a small cluster on EC2. You can check out the code, or download a tarball from [here](http://static.druid.io/artifacts/druid-services-0.5.51-SNAPSHOT-bin.tar.gz). The [ec2 run script](https://github.com/metamx/druid/blob/master/examples/bin/run_ec2.sh), run_ec2.sh, is located at 'examples/bin' if you have checked out the code, or at the root of the project if you've downloaded a tarball. The scripts rely on the [Amazon EC2 API Tools](http://aws.amazon.com/developertools/351), and you will need to set three environment variables: diff --git a/docs/Broker.md b/docs/Broker.md index e71100e9915..fee33bedc83 100644 --- a/docs/Broker.md +++ b/docs/Broker.md @@ -9,9 +9,9 @@ The Broker is the node to route queries to if you want to run a distributed clus Forwarding Queries ------------------ -Most druid queries contain an interval object that indicates a span of time for which data is requested. Likewise, Druid [[Segments]] are partitioned to contain data for some interval of time and segments are distributed across a cluster. Consider a simple datasource with 7 segments where each segment contains data for a given day of the week. Any query issued to the datasource for more than one day of data will hit more than one segment. These segments will likely be distributed across multiple nodes, and hence, the query will likely hit multiple nodes. +Most druid queries contain an interval object that indicates a span of time for which data is requested. Likewise, Druid [Segments](Segments.html) are partitioned to contain data for some interval of time and segments are distributed across a cluster. Consider a simple datasource with 7 segments where each segment contains data for a given day of the week. Any query issued to the datasource for more than one day of data will hit more than one segment. 
These segments will likely be distributed across multiple nodes, and hence, the query will likely hit multiple nodes. -To determine which nodes to forward queries to, the Broker node first builds a view of the world from information in Zookeeper. Zookeeper maintains information about [[Compute]] and [[Realtime]] nodes and the segments they are serving. For every datasource in Zookeeper, the Broker node builds a timeline of segments and the nodes that serve them. When queries are received for a specific datasource and interval, the Broker node performs a lookup into the timeline associated with the query datasource for the query interval and retrieves the nodes that contain data for the query. The Broker node then forwards down the query to the selected nodes. +To determine which nodes to forward queries to, the Broker node first builds a view of the world from information in Zookeeper. Zookeeper maintains information about [Compute](Compute.html) and [Realtime](Realtime.html) nodes and the segments they are serving. For every datasource in Zookeeper, the Broker node builds a timeline of segments and the nodes that serve them. When queries are received for a specific datasource and interval, the Broker node performs a lookup into the timeline associated with the query datasource for the query interval and retrieves the nodes that contain data for the query. The Broker node then forwards down the query to the selected nodes. Caching ------- @@ -27,4 +27,4 @@ Broker nodes can be run using the `com.metamx.druid.http.BrokerMain` class. Configuration ------------- -See [[Configuration]]. +See [Configuration](Configuration.html). diff --git a/docs/Cluster-setup.md b/docs/Cluster-setup.md index 29837e94296..b8281e99468 100644 --- a/docs/Cluster-setup.md +++ b/docs/Cluster-setup.md @@ -1,7 +1,7 @@ --- layout: default --- -A Druid cluster consists of various node types that need to be set up depending on your use case. 
See our [[Design]] docs for a description of the different node types. +A Druid cluster consists of various node types that need to be set up depending on your use case. See our [Design](Design.html) docs for a description of the different node types. Setup Scripts ------------- @@ -11,14 +11,14 @@ One of our community members, [housejester](https://github.com/housejester/), co Minimum Physical Layout: Absolute Minimum ----------------------------------------- -As a special case, the absolute minimum setup is one of the standalone examples for realtime ingestion and querying; see [[Examples]] that can easily run on one machine with one core and 1GB RAM. This layout can be set up to try some basic queries with Druid. +As a special case, the absolute minimum setup is one of the standalone examples for realtime ingestion and querying; see [Examples](Examples.html) that can easily run on one machine with one core and 1GB RAM. This layout can be set up to try some basic queries with Druid. Minimum Physical Layout: Experimental Testing with 4GB of RAM ------------------------------------------------------------- This layout can be used to load some data from deep storage onto a Druid compute node for the first time. A minimal physical layout for a 1 or 2 core machine with 4GB of RAM is: -1. node1: [[Master]] + metadata service + zookeeper + [[Compute]] +1. node1: [Master](Master.html) + metadata service + zookeeper + [Compute](Compute.html) 2. transient nodes: indexer This setup is only reasonable to prove that a configuration works. It would not be worthwhile to use this layout for performance measurement. @@ -30,13 +30,13 @@ Comfortable Physical Layout: Pilot Project with Multiple Machines A minimal physical layout not constrained by cores that demonstrates parallel querying and realtime, using AWS-EC2 “small”/m1.small (one core, with 1.7GB of RAM) or larger, no realtime, is: -1. node1: [[Master]] (m1.small) +1. node1: [Master](Master.html) (m1.small) 2. 
node2: metadata service (m1.small) 3. node3: zookeeper (m1.small) -4. node4: [[Broker]] (m1.small or m1.medium or m1.large) -5. node5: [[Compute]] (m1.small or m1.medium or m1.large) -6. node6: [[Compute]] (m1.small or m1.medium or m1.large) -7. node7: [[Realtime]] (m1.small or m1.medium or m1.large) +4. node4: [Broker](Broker.html) (m1.small or m1.medium or m1.large) +5. node5: [Compute](Compute.html) (m1.small or m1.medium or m1.large) +6. node6: [Compute](Compute.html) (m1.small or m1.medium or m1.large) +7. node7: [Realtime](Realtime.html) (m1.small or m1.medium or m1.large) 8. transient nodes: indexer This layout naturally lends itself to adding more RAM and core to Compute nodes, and to adding many more Compute nodes. Depending on the actual load, the Master, metadata server, and Zookeeper might need to use larger machines. @@ -48,18 +48,18 @@ High Availability Physical Layout An HA layout allows full rolling restarts and heavy volume: -1. node1: [[Master]] (m1.small or m1.medium or m1.large) -2. node2: [[Master]] (m1.small or m1.medium or m1.large) (backup) +1. node1: [Master](Master.html) (m1.small or m1.medium or m1.large) +2. node2: [Master](Master.html) (m1.small or m1.medium or m1.large) (backup) 3. node3: metadata service (c1.medium or m1.large) 4. node4: metadata service (c1.medium or m1.large) (backup) 5. node5: zookeeper (c1.medium) 6. node6: zookeeper (c1.medium) 7. node7: zookeeper (c1.medium) -8. node8: [[Broker]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge) -9. node9: [[Broker]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge) (backup) -10. node10: [[Compute]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge) -11. node11: [[Compute]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge) -12. node12: [[Realtime]] (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge) +8. 
node8: [Broker](Broker.html) (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge) +9. node9: [Broker](Broker.html) (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge) (backup) +10. node10: [Compute](Compute.html) (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge) +11. node11: [Compute](Compute.html) (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge) +12. node12: [Realtime](Realtime.html) (m1.small or m1.medium or m1.large or m2.xlarge or m2.2xlarge or m2.4xlarge) 13. transient nodes: indexer Sizing for Cores and RAM @@ -79,7 +79,7 @@ Local disk (“ephemeral” on AWS EC2) for caching is recommended over network Setup ----- -Setting up a cluster is essentially just firing up all of the nodes you want with the proper [[configuration]]. One thing to be aware of is that there are a few properties in the configuration that potentially need to be set individually for each process: +Setting up a cluster is essentially just firing up all of the nodes you want with the proper [configuration](Configuration.html). 
One thing to be aware of is that there are a few properties in the configuration that potentially need to be set individually for each process: druid.server.type=historical|realtime @@ -107,8 +107,8 @@ The following table shows the possible services and fully qualified class for ma |service|main class| |-------|----------| -|[[ Realtime ]]|com.metamx.druid.realtime.RealtimeMain| -|[[ Master ]]|com.metamx.druid.http.MasterMain| -|[[ Broker ]]|com.metamx.druid.http.BrokerMain| -|[[ Compute ]]|com.metamx.druid.http.ComputeMain| +|[Realtime](Realtime.html)|com.metamx.druid.realtime.RealtimeMain| +|[Master](Master.html)|com.metamx.druid.http.MasterMain| +|[Broker](Broker.html)|com.metamx.druid.http.BrokerMain| +|[Compute](Compute.html)|com.metamx.druid.http.ComputeMain| diff --git a/docs/Compute.md b/docs/Compute.md index 8df11f8ca2f..e7df17ebbd5 100644 --- a/docs/Compute.md +++ b/docs/Compute.md @@ -11,9 +11,9 @@ Loading and Serving Segments Each compute node maintains a constant connection to Zookeeper and watches a configurable set of Zookeeper paths for new segment information. Compute nodes do not communicate directly with each other or with the master nodes but instead rely on Zookeeper for coordination. -The [[Master]] node is responsible for assigning new segments to compute nodes. Assignment is done by creating an ephemeral Zookeeper entry under a load queue path associated with a compute node. For more information on how the master assigns segments to compute nodes, please see [[Master]]. +The [Master](Master.html) node is responsible for assigning new segments to compute nodes. Assignment is done by creating an ephemeral Zookeeper entry under a load queue path associated with a compute node. For more information on how the master assigns segments to compute nodes, please see [Master](Master.html). 
-When a compute node notices a new load queue entry in its load queue path, it will first check a local disk directory (cache) for the information about segment. If no information about the segment exists in the cache, the compute node will download metadata about the new segment to serve from Zookeeper. This metadata includes specifications about where the segment is located in deep storage and about how to decompress and process the segment. For more information about segment metadata and Druid segments in general, please see [[Segments]]. Once a compute node completes processing a segment, the segment is announced in Zookeeper under a served segments path associated with the node. At this point, the segment is available for querying. +When a compute node notices a new load queue entry in its load queue path, it will first check a local disk directory (cache) for the information about segment. If no information about the segment exists in the cache, the compute node will download metadata about the new segment to serve from Zookeeper. This metadata includes specifications about where the segment is located in deep storage and about how to decompress and process the segment. For more information about segment metadata and Druid segments in general, please see [Segments](Segments.html). Once a compute node completes processing a segment, the segment is announced in Zookeeper under a served segments path associated with the node. At this point, the segment is available for querying. Loading and Serving Segments From Cache --------------------------------------- @@ -25,7 +25,7 @@ The segment cache is also leveraged when a compute node is first started. On sta Querying Segments ----------------- -Please see [[Querying]] for more information on querying compute nodes. +Please see [Querying](Querying.html) for more information on querying compute nodes. 
For every query that a compute node services, it will log the query and report metrics on the time taken to run the query. @@ -37,4 +37,4 @@ Compute nodes can be run using the `com.metamx.druid.http.ComputeMain` class. Configuration ------------- -See [[Configuration]]. +See [Configuration](Configuration.html). diff --git a/docs/Concepts-and-Terminology.md b/docs/Concepts-and-Terminology.md index 1e7f535388d..925941dd8c5 100644 --- a/docs/Concepts-and-Terminology.md +++ b/docs/Concepts-and-Terminology.md @@ -12,4 +12,4 @@ Concepts and Terminology - **Segment:** A collection of (internal) records that are stored and processed together. - **Shard:** A unit of partitioning data across machine. TODO: clarify; by time or other dimensions? -- **specFile** is specification for services in JSON format; see [[Realtime]] and [[Batch-ingestion]] +- **specFile** is specification for services in JSON format; see [Realtime](Realtime.html) and [Batch-ingestion](Batch-ingestion.html) diff --git a/docs/Configuration.md b/docs/Configuration.md index 353b8be77b3..544b9ea4f55 100644 --- a/docs/Configuration.md +++ b/docs/Configuration.md @@ -1,7 +1,7 @@ --- layout: default --- -This describes the basic server configuration that is loaded by all the server processes; the same file is loaded by all. See also the json “specFile” descriptions in [[Realtime]] and [[Batch-ingestion]]. +This describes the basic server configuration that is loaded by all the server processes; the same file is loaded by all. See also the json “specFile” descriptions in [Realtime](Realtime.html) and [Batch-ingestion](Batch-ingestion.html). JVM Configuration Best Practices ================================ @@ -80,7 +80,7 @@ Configuration groupings ### S3 Access -These properties are for connecting with S3 and using it to pull down segments. In the future, we plan on being able to use other deep storage file systems as well, like HDFS. 
The file system is actually only accessed by the [[Compute]], [[Realtime]] and [[Indexing service]] nodes. +These properties are for connecting with S3 and using it to pull down segments. In the future, we plan on being able to use other deep storage file systems as well, like HDFS. The file system is actually only accessed by the [Compute](Compute.html), [Realtime](Realtime.html) and [Indexing service](Indexing-Service.html) nodes. |Property|Description|Default| |--------|-----------|-------| @@ -91,7 +91,7 @@ These properties are for connecting with S3 and using it to pull down segments. ### JDBC connection -These properties specify the jdbc connection and other configuration around the “segments table” database. The only processes that connect to the DB with these properties are the [[Master]] and [[Indexing service]]. This is tested on MySQL. +These properties specify the jdbc connection and other configuration around the “segments table” database. The only processes that connect to the DB with these properties are the [Master](Master.html) and [Indexing service](Indexing-Service.html). This is tested on MySQL. |Property|Description|Default| |--------|-----------|-------| @@ -113,7 +113,7 @@ These properties specify the jdbc connection and other configuration around the ### Zk properties -See [[ZooKeeper]] for a description of these properties. +See [ZooKeeper](ZooKeeper.html) for a description of these properties. ### Service properties @@ -146,7 +146,7 @@ These are properties that the compute nodes use ### Emitter Properties -The Druid servers emit various metrics and alerts via something we call an [[Emitter]]. There are two emitter implementations included with the code, one that just logs to log4j and one that does POSTs of JSON events to a server. More information can be found on the [[Emitter]] page. The properties for using the logging emitter are described below. 
+The Druid servers emit various metrics and alerts via something we call an [Emitter](Emitter.html). There are two emitter implementations included with the code, one that just logs to log4j and one that does POSTs of JSON events to a server. More information can be found on the [Emitter](Emitter.html) page. The properties for using the logging emitter are described below. |Property|Description|Default| |--------|-----------|-------| @@ -158,5 +158,5 @@ The Druid servers emit various metrics and alerts via something we call an [[Emi |Property|Description|Default| |--------|-----------|-------| -|`druid.realtime.specFile`|The file with realtime specifications in it. See [[Realtime]].|none| +|`druid.realtime.specFile`|The file with realtime specifications in it. See [Realtime](Realtime.html).|none| diff --git a/docs/Contribute.md b/docs/Contribute.md index 58d53a6d224..a853eb430d2 100644 --- a/docs/Contribute.md +++ b/docs/Contribute.md @@ -5,4 +5,4 @@ If you are interested in contributing to the code, we accept [pull requests](htt For issue tracking, we are using the github issue tracker. Please fill out an issue from the Issues tab on the github screen. -We also have a [[Libraries]] page that lists external libraries that people have created for working with Druid. +We also have a [Libraries](Libraries.html) page that lists external libraries that people have created for working with Druid. diff --git a/docs/Design.md b/docs/Design.md index 888d0b871b3..25d69d95fe8 100644 --- a/docs/Design.md +++ b/docs/Design.md @@ -53,7 +53,7 @@ Getting data into the Druid system requires an indexing process. This gives the - Bitmap compression - RLE (on the roadmap, but not yet implemented) -The output of the indexing process is stored in a “deep storage” LOB store/file system ([[Deep Storage]] for information about potential options). Data is then loaded by compute nodes by first downloading the data to their local disk and then memory mapping it before serving queries. 
+The output of the indexing process is stored in a “deep storage” LOB store/file system (see [Deep Storage](Deep-Storage.html) for information about potential options). Data is then loaded by compute nodes by first downloading the data to their local disk and then memory mapping it before serving queries. If a compute node dies, it will no longer serve its segments, but given that the segments are still available on the “deep storage” any other node can simply download the segment and start serving it. This means that it is possible to actually remove all compute nodes from the cluster and then re-provision them without any data loss. It also means that if the “deep storage” is not available, the nodes can continue to serve the segments they have already pulled down (i.e. the cluster goes stale, not down). diff --git a/docs/Download.md b/docs/Download.md index 00de8597f11..1bf1352de58 100644 --- a/docs/Download.md +++ b/docs/Download.md @@ -1,7 +1,7 @@ --- layout: default --- -A version may be declared as a release candidate if it has been deployed to a sizable production cluster. Release candidates are declared as stable after we feel fairly confident there are no major bugs in the version. Check out the [[Versioning]] section for how we describe software versions. +A version may be declared as a release candidate if it has been deployed to a sizable production cluster. Release candidates are declared as stable after we feel fairly confident there are no major bugs in the version. Check out the [Versioning](Versioning.html) section for how we describe software versions. Release Candidate ----------------- diff --git a/docs/Druid-Personal-Demo-Cluster.md b/docs/Druid-Personal-Demo-Cluster.md index ab49d828dbc..498f8ff8e14 100644 --- a/docs/Druid-Personal-Demo-Cluster.md +++ b/docs/Druid-Personal-Demo-Cluster.md @@ -3,7 +3,7 @@ layout: default --- # Druid Personal Demo Cluster (DPDC) -Note, there are currently some issues with the CloudFormation. 
We are working through them and will update the documentation here when things work properly. In the meantime, the simplest way to get your feet wet with a cluster setup is to run through the instructions at [housejester/druid-test-harness](https://github.com/housejester/druid-test-harness), though it is based on an older version. If you just want to get a feel for the types of data and queries that you can issue, check out [[Realtime Examples]] +Note, there are currently some issues with the CloudFormation. We are working through them and will update the documentation here when things work properly. In the meantime, the simplest way to get your feet wet with a cluster setup is to run through the instructions at [housejester/druid-test-harness](https://github.com/housejester/druid-test-harness), though it is based on an older version. If you just want to get a feel for the types of data and queries that you can issue, check out [Realtime Examples](Realtime-Examples.html). ## Introduction To make it easy for you to get started with Druid, we created an AWS (Amazon Web Services) [CloudFormation](http://aws.amazon.com/cloudformation/) Template that allows you to create a small pre-configured Druid cluster using your own AWS account. The cluster contains a pre-loaded sample workload, the Wikipedia edit stream, and a basic query interface that gets you familiar with Druid capabilities like drill-downs and filters. @@ -14,7 +14,7 @@ This guide walks you through the steps to create the cluster and then how to cre ## What’s in this Druid Demo Cluster? -1. A single "Master" node. This node co-locates the [[Master]] process, the [[Broker]] process, Zookeeper, and the MySQL instance. You can read more about Druid architecture [[Design]]. +1. A single "Master" node. This node co-locates the [Master](Master.html) process, the [Broker](Broker.html) process, Zookeeper, and the MySQL instance. You can read more about Druid architecture in [Design](Design.html). 1. 
Three compute nodes; these compute nodes, have been pre-configured to work with the Master node and should automatically load up the Wikipedia edit stream data (no specific setup is required). diff --git a/docs/Druid-vs-Impala-or-Shark.md b/docs/Druid-vs-Impala-or-Shark.md index 3174fbbea5f..ee59b3def0c 100644 --- a/docs/Druid-vs-Impala-or-Shark.md +++ b/docs/Druid-vs-Impala-or-Shark.md @@ -20,11 +20,11 @@ What does this mean? We can talk about it in terms of four general areas ## Fault Tolerance -Druid pulls segments down from [[Deep Storage]] before serving queries on top of it. This means that for the data to exist in the Druid cluster, it must exist as a local copy on a historical node. If deep storage becomes unavailable for any reason, new segments will not be loaded into the system, but the cluster will continue to operate exactly as it was when the backing store disappeared. +Druid pulls segments down from [Deep Storage](Deep-Storage.html) before serving queries on top of it. This means that for the data to exist in the Druid cluster, it must exist as a local copy on a historical node. If deep storage becomes unavailable for any reason, new segments will not be loaded into the system, but the cluster will continue to operate exactly as it was when the backing store disappeared. Impala and Shark, on the other hand, pull their data in from HDFS (or some other Hadoop FileSystem) in response to a query. This has implications for the operation of queries if you need to take HDFS down for a bit (say a software upgrade). It's possible that data that has been cached in the nodes is still available when the backing file system goes down, but I'm not sure. -This is just one example, but Druid was built to continue operating in the face of failures of any one of its various pieces. The [[Design]] describes these design decisions from the Druid side in more detail. 
+This is just one example, but Druid was built to continue operating in the face of failures of any one of its various pieces. The [Design](Design.html) describes these design decisions from the Druid side in more detail. ## Query Speed diff --git a/docs/Druid-vs-vertica.md b/docs/Druid-vs-vertica.md index b20976b74a6..535e5e06300 100644 --- a/docs/Druid-vs-vertica.md +++ b/docs/Druid-vs-vertica.md @@ -3,7 +3,7 @@ layout: default --- How does Druid compare to Vertica? -Vertica is similar to ParAccel/Redshift ([[Druid-vs-Redshift]]) described above in that it wasn’t built for real-time streaming data ingestion and it supports full SQL. +Vertica is similar to ParAccel/Redshift ([Druid-vs-Redshift](Druid-vs-Redshift.html)) described above in that it wasn’t built for real-time streaming data ingestion and it supports full SQL. The other big difference is that instead of employing indexing, Vertica tries to optimize processing by leveraging run-length encoding (RLE) and other compression techniques along with a “projection” system that creates materialized copies of the data in a different sort order (to maximize the effectiveness of RLE). diff --git a/docs/Examples.md b/docs/Examples.md index 9ab10466e56..2f48f60b1b5 100644 --- a/docs/Examples.md +++ b/docs/Examples.md @@ -34,7 +34,7 @@ Clone Druid and build it: Twitter Example --------------- -For a full tutorial based on the twitter example, check out this [[Twitter Tutorial]]. +For a full tutorial based on the twitter example, check out this [Twitter Tutorial](Twitter-Tutorial.html). This Example uses a feature of Twitter that allows for sampling of it’s stream. We sample the Twitter stream via our [TwitterSpritzerFirehoseFactory](https://github.com/metamx/druid/blob/master/examples/src/main/java/druid/examples/twitter/TwitterSpritzerFirehoseFactory.java) class and use it to simulate the kinds of data you might ingest into Druid. 
Then, with the client part, the sample shows what kinds of analytics explorations you can do during and after the data is loaded. @@ -48,7 +48,7 @@ This Example uses a feature of Twitter that allows for sampling of it’s stream ### What you’ll do -See [[Tutorial]] +See [Tutorial](Tutorial.html) Rand Example ------------ diff --git a/docs/Firehose.md b/docs/Firehose.md index c571f035a10..92c5caa2386 100644 --- a/docs/Firehose.md +++ b/docs/Firehose.md @@ -28,11 +28,11 @@ This firehose ingests events from a predefined list of S3 objects. #### TwitterSpritzerFirehose -See [[Examples]]. This firehose connects directly to the twitter spritzer data stream. +See [Examples](Examples.html). This firehose connects directly to the twitter spritzer data stream. #### RandomFirehose -See [[Examples]]. This firehose creates a stream of random numbers. +See [Examples](Examples.html). This firehose creates a stream of random numbers. #### RabbitMqFirehouse diff --git a/docs/Granularities.md b/docs/Granularities.md index cf5283841c0..0cb25a7a5df 100644 --- a/docs/Granularities.md +++ b/docs/Granularities.md @@ -11,7 +11,7 @@ Simple granularities are specified as a string and bucket timestamps by their UT Supported granularity strings are: `all`, `none`, `minute`, `fifteen_minute`, `thirty_minute`, `hour` and `day` \* **`all`** buckets everything into a single bucket -\* **`none`** does not bucket data (it actually uses the granularity of the index - minimum here is `none` which means millisecond granularity). Using `none` in a [[timeseries query|TimeSeriesQuery]] is currently not recommended (the system will try to generate 0 values for all milliseconds that didn’t exist, which is often a lot). +\* **`none`** does not bucket data (it actually uses the granularity of the index - minimum here is `none` which means millisecond granularity). 
Using `none` in a [timeseries query](TimeSeriesQuery.html) is currently not recommended (the system will try to generate 0 values for all milliseconds that didn’t exist, which is often a lot). ### Duration Granularities diff --git a/docs/GroupByQuery.md b/docs/GroupByQuery.md index 656ff1a41a1..7e95ebcbdee 100644 --- a/docs/GroupByQuery.md +++ b/docs/GroupByQuery.md @@ -93,12 +93,12 @@ There are 9 main parts to a groupBy query: |queryType|This String should always be “groupBy”; this is the first thing Druid looks at to figure out how to interpret the query|yes| |dataSource|A String defining the data source to query, very similar to a table in a relational database|yes| |dimensions|A JSON list of dimensions to do the groupBy over|yes| -|orderBy|See [[OrderBy]].|no| -|having|See [[Having]].|no| -|granularity|Defines the granularity of the query. See [[Granularities]]|yes| -|filter|See [[Filters]]|no| -|aggregations|See [[Aggregations]]|yes| -|postAggregations|See [[Post Aggregations]]|no| +|orderBy|See [OrderBy](OrderBy.html).|no| +|having|See [Having](Having.html).|no| +|granularity|Defines the granularity of the query. See [Granularities](Granularities.html)|yes| +|filter|See [Filters](Filters.html)|no| +|aggregations|See [Aggregations](Aggregations.html)|yes| +|postAggregations|See [Post Aggregations](Post-Aggregations.html)|no| |intervals|A JSON Object representing ISO-8601 Intervals. This defines the time ranges to run the query over.|yes| |context|An additional JSON Object which can be used to specify certain flags.|no| diff --git a/docs/Home.md b/docs/Home.md index 934f11b8c92..8587aae4749 100644 --- a/docs/Home.md +++ b/docs/Home.md @@ -3,7 +3,7 @@ layout: default --- Druid is an open-source analytics datastore designed for realtime, exploratory, queries on large-scale data sets (100’s of Billions entries, 100’s TB data). Druid provides for cost effective, always-on, realtime data ingestion and arbitrary data exploration. 
-- Check out some [[Examples]] +- Check out some [Examples](Examples.html) - Try out Druid with our Getting Started [Tutorial](https://github.com/metamx/druid/wiki/Tutorial%3A-A-First-Look-at-Druid) - Learn more by reading the [White Paper](http://static.druid.io/docs/druid.pdf) @@ -19,7 +19,7 @@ The first one is the joy that everyone feels the first time they get Hadoop runn Druid is especially useful if you are summarizing your data sets and then querying the summarizations. If you put your summarizations into Druid, you will get quick queryability out of a system that you can be confident will scale up as your data volumes increase. Deployments have scaled up to 2TB of data per hour at peak ingested and aggregated in real-time. -We have more details about the general design of the system and why you might want to use it in our [White Paper](http://static.druid.io/docs/druid.pdf) or in our [[Design]] doc. +We have more details about the general design of the system and why you might want to use it in our [White Paper](http://static.druid.io/docs/druid.pdf) or in our [Design](Design.html) doc. The data store world is vast, confusing and constantly in flux. This page is meant to help potential evaluators decide whether Druid is a good fit for the problem one needs to solve. If anything about it is incorrect please provide that feedback on the mailing list or via some other means, we will fix this page. @@ -38,11 +38,11 @@ The data store world is vast, confusing and constantly in flux. 
This page is mea \* Downtime is no big deal #### Druid vs… -\* [[Druid-vs-Impala-or-Shark]] -\* [[Druid-vs-Redshift]] -\* [[Druid-vs-Vertica]] -\* [[Druid-vs-Cassandra]] -\* [[Druid-vs-Hadoop]] +\* [Druid-vs-Impala-or-Shark](Druid-vs-Impala-or-Shark.html) +\* [Druid-vs-Redshift](Druid-vs-Redshift.html) +\* [Druid-vs-Vertica](Druid-vs-Vertica.html) +\* [Druid-vs-Cassandra](Druid-vs-Cassandra.html) +\* [Druid-vs-Hadoop](Druid-vs-Hadoop.html) Key Features ------------ diff --git a/docs/Indexing-Service.md b/docs/Indexing-Service.md index 60abbd73b9f..d878e1b4176 100644 --- a/docs/Indexing-Service.md +++ b/docs/Indexing-Service.md @@ -3,7 +3,7 @@ layout: default --- Disclaimer: We are still in the process of finalizing the indexing service and these configs are prone to change at any time. We will announce when we feel the indexing service and the configurations described are stable. -The indexing service is a distributed task/job queue. It accepts requests in the form of [[Tasks]] and executes those tasks across a set of worker nodes. Worker capacity can be automatically adjusted based on the number of tasks pending in the system. The indexing service is highly available, has built in retry logic, and can backup per task logs in deep storage. +The indexing service is a distributed task/job queue. It accepts requests in the form of [Tasks](Tasks.html) and executes those tasks across a set of worker nodes. Worker capacity can be automatically adjusted based on the number of tasks pending in the system. The indexing service is highly available, has built in retry logic, and can backup per task logs in deep storage. The indexing service is composed of two main components, a coordinator node that manages task distribution and worker capacity, and worker nodes that execute tasks in separate JVMs. 
@@ -45,7 +45,7 @@ The coordinator also exposes a simple UI to show what tasks are currently runnin #### Task Execution -The coordinator retrieves worker setup metadata from the Druid [[MySQL]] config table. This metadata contains information about the version of workers to create, the maximum and minimum number of workers in the cluster at one time, and additional information required to automatically create workers. +The coordinator retrieves worker setup metadata from the Druid [MySQL](MySQL.html) config table. This metadata contains information about the version of workers to create, the maximum and minimum number of workers in the cluster at one time, and additional information required to automatically create workers. Tasks are assigned to workers by creating entries under specific /tasks paths associated with a worker, similar to how the Druid master node assigns segments to compute nodes. See [Worker Configuration](Indexing-Service#configuration-1). Once a worker picks up a task, it deletes the task entry and announces a task status under a /status path associated with the worker. Tasks are submitted to a worker until the worker hits capacity. If all workers in a cluster are at capacity, the indexer coordinator node automatically creates new worker resources. diff --git a/docs/Loading-Your-Data.md b/docs/Loading-Your-Data.md index dd4b0f8a7fb..a5edd9d65ea 100644 --- a/docs/Loading-Your-Data.md +++ b/docs/Loading-Your-Data.md @@ -3,7 +3,7 @@ layout: default --- Once you have a realtime node working, it is time to load your own data to see how Druid performs. -Druid can ingest data in three ways: via Kafka and a realtime node, via the indexing service, and via the Hadoop batch loader. Data is ingested in realtime using a [[Firehose]]. +Druid can ingest data in three ways: via Kafka and a realtime node, via the indexing service, and via the Hadoop batch loader. Data is ingested in realtime using a [Firehose](Firehose.html). 
## Create Config Directories ## Each type of node needs its own config file and directory, so create them as subdirectories under the druid directory. @@ -17,7 +17,7 @@ mkdir config/broker ## Loading Data with Kafka ## -[KafkaFirehoseFactory](https://github.com/metamx/druid/blob/master/realtime/src/main/java/com/metamx/druid/realtime/firehose/KafkaFirehoseFactory.java) is how druid communicates with Kafka. Using this [[Firehose]] with the right configuration, we can import data into Druid in realtime without writing any code. To load data to a realtime node via Kafka, we'll first need to initialize Zookeeper and Kafka, and then configure and initialize a [[Realtime]] node. +[KafkaFirehoseFactory](https://github.com/metamx/druid/blob/master/realtime/src/main/java/com/metamx/druid/realtime/firehose/KafkaFirehoseFactory.java) is how druid communicates with Kafka. Using this [Firehose](Firehose.html) with the right configuration, we can import data into Druid in realtime without writing any code. To load data to a realtime node via Kafka, we'll first need to initialize Zookeeper and Kafka, and then configure and initialize a [Realtime](Realtime.html) node. ### Booting Kafka ### @@ -165,7 +165,7 @@ curl -X POST "http://localhost:8080/druid/v2/?pretty" \ } } ] ``` -Now you're ready for [[Querying Your Data]]! +Now you're ready for [Querying Your Data](Querying Your Data.html)! ## Loading Data with the HadoopDruidIndexer ## @@ -184,7 +184,7 @@ mysql -u root GRANT ALL ON druid.* TO 'druid'@'localhost' IDENTIFIED BY 'diurd'; CREATE database druid; ``` -The [[Master]] node will create the tables it needs based on its configuration. +The [Master](Master.html) node will create the tables it needs based on its configuration. ### Make sure you have ZooKeeper Running ### @@ -206,7 +206,7 @@ cd .. 
``` ### Launch a Master Node ### -If you've already setup a realtime node, be aware that although you can run multiple node types on one physical computer, you must assign them unique ports. Having used 8080 for the [[Realtime]] node, we use 8081 for the [[Master]]. +If you've already setup a realtime node, be aware that although you can run multiple node types on one physical computer, you must assign them unique ports. Having used 8080 for the [Realtime](Realtime.html) node, we use 8081 for the [Master](Master.html). 1. Setup a configuration file called config/master/runtime.properties similar to: ```bash @@ -251,7 +251,7 @@ druid.paths.indexCache=/tmp/druid/indexCache # Path on local FS for storage of segment metadata; dir will be created if needed druid.paths.segmentInfoCache=/tmp/druid/segmentInfoCache ``` -2. Launch the [[Master]] node +2. Launch the [Master](Master.html) node ```bash java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 \ -classpath lib/*:config/master \ @@ -324,7 +324,7 @@ We can use the same records we have been, in a file called records.json: ### Run the Hadoop Job ### -Now its time to run the Hadoop [[Batch-ingestion]] job, HadoopDruidIndexer, which will fill a historical [[Compute]] node with data. First we'll need to configure the job. +Now its time to run the Hadoop [Batch-ingestion](Batch-ingestion.html) job, HadoopDruidIndexer, which will fill a historical [Compute](Compute.html) node with data. First we'll need to configure the job. 1. Create a config called batchConfig.json similar to: ```json @@ -367,4 +367,4 @@ Now its time to run the Hadoop [[Batch-ingestion]] job, HadoopDruidIndexer, whic java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -Ddruid.realtime.specFile=realtime.spec -classpath lib/* com.metamx.druid.indexer.HadoopDruidIndexerMain batchConfig.json ``` -You can now move on to [[Querying Your Data]]! \ No newline at end of file +You can now move on to [Querying Your Data](Querying Your Data.html)! 
\ No newline at end of file diff --git a/docs/Master.md b/docs/Master.md index f7345524980..c96af56dea9 100644 --- a/docs/Master.md +++ b/docs/Master.md @@ -15,7 +15,7 @@ Rules Segments are loaded and dropped from the cluster based on a set of rules. Rules indicate how segments should be assigned to different compute node tiers and how many replicants of a segment should exist in each tier. Rules may also indicate when segments should be dropped entirely from the cluster. The master loads a set of rules from the database. Rules may be specific to a certain datasource and/or a default set of rules can be configured. Rules are read in order and hence the ordering of rules is important. The master will cycle through all available segments and match each segment with the first rule that applies. Each segment may only match a single rule -For more information on rules, see [[Rule Configuration.md]]. +For more information on rules, see [Rule Configuration](Rule-Configuration.html). Cleaning Up Segments -------------------- @@ -103,4 +103,4 @@ Master nodes can be run using the `com.metamx.druid.http.MasterMain` class. Configuration ------------- -See [[Configuration]]. +See [Configuration](Configuration.html). diff --git a/docs/MySQL.md b/docs/MySQL.md index f7ee2ec4db1..88ef75006cf 100644 --- a/docs/MySQL.md +++ b/docs/MySQL.md @@ -8,7 +8,7 @@ Segments Table This is dictated by the `druid.database.segmentTable` property (Note that these properties are going to change in the next stable version after 0.4.12). -This table stores metadata about the segments that are available in the system. 
The table is polled by the [Master](Master.html) to determine the set of segments that should be available for querying in the system. The table has two main functional columns, the other columns are for indexing purposes. The `used` column is a boolean “tombstone”. A 1 means that the segment should be “used” by the cluster (i.e. it should be loaded and available for requests). A 0 means that the segment should not be actively loaded into the cluster. We do this as a means of removing segments from the cluster without actually removing their metadata (which allows for simpler rolling back if that is ever an issue). @@ -34,7 +34,7 @@ Note that the format of this blob can and will change from time-to-time. Rule Table ---------- -The rule table is used to store the various rules about where segments should land. These rules are used by the [[Master]] when making segment (re-)allocation decisions about the cluster. +The rule table is used to store the various rules about where segments should land. These rules are used by the [Master](Master.html) when making segment (re-)allocation decisions about the cluster. Config Table ------------ @@ -44,4 +44,4 @@ The config table is used to store runtime configuration objects. We do not have Task-related Tables ------------------- -There are also a number of tables created and used by the [[Indexing Service]] in the course of its work. +There are also a number of tables created and used by the [Indexing Service](Indexing Service.html) in the course of its work. diff --git a/docs/Post-aggregations.md b/docs/Post-aggregations.md index 4aa6c7f8db7..2e11f98d0e0 100644 --- a/docs/Post-aggregations.md +++ b/docs/Post-aggregations.md @@ -22,9 +22,9 @@ The grammar for an arithmetic post aggregation is: ### Field accessor post-aggregator -This returns the value produced by the specified [[aggregator|Aggregations]]. +This returns the value produced by the specified [aggregator|Aggregations](aggregator|Aggregations.html). 
-`fieldName` refers to the output name of the aggregator given in the [[aggregations|Aggregations]] portion of the query. +`fieldName` refers to the output name of the aggregator given in the [aggregations](Aggregations.html) portion of the query. field_accessor : { "type" : "fieldAccess", diff --git a/docs/Querying-your-data.md b/docs/Querying-your-data.md index 39d22ab3a32..5bf72a6fa54 100644 --- a/docs/Querying-your-data.md +++ b/docs/Querying-your-data.md @@ -3,7 +3,7 @@ layout: default --- # Setup # -Before we start querying druid, we're going to finish setting up a complete cluster on localhost. In [[Loading Your Data]] we setup a [[Realtime]], [[Compute]] and [[Master]] node. If you've already completed that tutorial, you need only follow the directions for 'Booting a Broker Node'. +Before we start querying druid, we're going to finish setting up a complete cluster on localhost. In [Loading Your Data](Loading-Your-Data.html) we setup a [Realtime](Realtime.html), [Compute](Compute.html) and [Master](Master.html) node. If you've already completed that tutorial, you need only follow the directions for 'Booting a Broker Node'. ## Booting a Broker Node ## @@ -98,11 +98,11 @@ com.metamx.druid.http.ComputeMain # Querying Your Data # -Now that we have a complete cluster setup on localhost, we need to load data. To do so, refer to [[Loading Your Data]]. Having done that, its time to query our data! For a complete specification of queries, see [[Querying]]. +Now that we have a complete cluster setup on localhost, we need to load data. To do so, refer to [Loading Your Data](Loading-Your-Data.html). Having done that, it's time to query our data! For a complete specification of queries, see [Querying](Querying.html). ## Querying Different Nodes ## -As a shared-nothing system, there are three ways to query druid, against the [[Realtime]], [[Compute]] or [[Broker]] node. 
Querying a Realtime node returns only realtime data, querying a compute node returns only historical segments. Querying the broker will query both realtime and compute segments and compose an overall result for the query. This is the normal mode of operation for queries in druid. +As a shared-nothing system, there are three ways to query druid, against the [Realtime](Realtime.html), [Compute](Compute.html) or [Broker](Broker.html) node. Querying a Realtime node returns only realtime data, querying a compute node returns only historical segments. Querying the broker will query both realtime and compute segments and compose an overall result for the query. This is the normal mode of operation for queries in druid. ### Construct a Query ### @@ -183,7 +183,7 @@ Now that we know what nodes can be queried (although you should usually use the ## Querying Against the realtime.spec ## -How are we to know what queries we can run? Although [[Querying]] is a helpful index, to get a handle on querying our data we need to look at our [[Realtime]] node's realtime.spec file: +How are we to know what queries we can run? Although [Querying](Querying.html) is a helpful index, to get a handle on querying our data we need to look at our [Realtime](Realtime.html) node's realtime.spec file: ```json [{ @@ -225,7 +225,7 @@ Our dataSource tells us the name of the relation/table, or 'source of data', to ### aggregations ### -Note the [[Aggregations]] in our query: +Note the [Aggregations](Aggregations.html) in our query: ```json "aggregations": [ @@ -244,7 +244,7 @@ this matches up to the aggregators in the schema of our realtime.spec! ### dimensions ### -Lets look back at our actual records (from [[Loading Your Data]]): +Lets look back at our actual records (from [Loading Your Data](Loading Your Data.html)): ```json {"utcdt": "2010-01-01T01:01:01", "wp": 1000, "gender": "male", "age": 100} @@ -359,8 +359,8 @@ Which gets us just people aged 40: } ] ``` -Check out [[Filters]] for more. 
+Check out [Filters](Filters.html) for more. ## Learn More ## -You can learn more about querying at [[Querying]]! Now check out [[Booting a production cluster]]! \ No newline at end of file +You can learn more about querying at [Querying](Querying.html)! Now check out [Booting a production cluster](Booting a production cluster.html)! \ No newline at end of file diff --git a/docs/Querying.md b/docs/Querying.md index db845bc694f..7e613a074ec 100644 --- a/docs/Querying.md +++ b/docs/Querying.md @@ -4,7 +4,7 @@ layout: default Querying ======== -Queries are made using an HTTP REST style request to a [[Broker]], [[Compute]], or [[Realtime]] node. The query is expressed in JSON and each of these node types expose the same REST query interface. +Queries are made using an HTTP REST style request to a [Broker](Broker.html), [Compute](Compute.html), or [Realtime](Realtime.html) node. The query is expressed in JSON and each of these node types expose the same REST query interface. We start by describing an example query with additional comments that mention possible variations. Query operators are also summarized in a table below. @@ -55,7 +55,7 @@ The dataSource JSON field shown next identifies where to apply the query. In thi \`\`\`javascript [dataSource]() “randSeq”, \`\`\` -The granularity JSON field specifies the bucket size for values. It could be a built-in time interval like “second”, “minute”, “fifteen\_minute”, “thirty\_minute”, “hour” or “day”. It can also be an expression like `{"type": "period", "period":"PT6m"}` meaning “6 minute buckets”. See [[Granularities]] for more information on the different options for this field. In this example, it is set to the special value “all” which means [bucket all data points together into the same time bucket]() +The granularity JSON field specifies the bucket size for values. It could be a built-in time interval like “second”, “minute”, “fifteen\_minute”, “thirty\_minute”, “hour” or “day”. 
It can also be an expression like `{"type": "period", "period":"PT6m"}` meaning “6 minute buckets”. See [Granularities](Granularities.html) for more information on the different options for this field. In this example, it is set to the special value “all” which means [bucket all data points together into the same time bucket]() \`\`\`javascript [granularity]() “all”, \`\`\` @@ -63,7 +63,7 @@ The dimensions JSON field value is an array of zero or more fields as defined in \`\`\`javascript [dimensions]() [], \`\`\` -A groupBy also requires the JSON field “aggregations” (See [[Aggregations]]), which are applied to the column specified by fieldName and the output of the aggregation will be named according to the value in the “name” field: +A groupBy also requires the JSON field “aggregations” (See [Aggregations](Aggregations.html)), which are applied to the column specified by fieldName and the output of the aggregation will be named according to the value in the “name” field: \`\`\`javascript [aggregations]() [ { [type]() “count”, [name]() “rows” }, @@ -71,7 +71,7 @@ A groupBy also requires the JSON field “aggregations” (See [[Aggregations]]) { [type]() “doubleSum”, [fieldName]() “outColumn”, [name]() “randomNumberSum” } ], \`\`\` -You can also specify postAggregations, which are applied after data has been aggregated for the current granularity and dimensions bucket. See [[Post Aggregations]] for a detailed description. In the rand example, an arithmetic type operation (division, as specified by “fn”) is performed with the result “name” of “avg\_random”. The “fields” field specifies the inputs from the aggregation stage to this expression. Note that identifiers corresponding to “name” JSON field inside the type “fieldAccess” are required but not used outside this expression, so they are prefixed with “dummy” for clarity: +You can also specify postAggregations, which are applied after data has been aggregated for the current granularity and dimensions bucket. 
See [Post Aggregations](Post Aggregations.html) for a detailed description. In the rand example, an arithmetic type operation (division, as specified by “fn”) is performed with the result “name” of “avg\_random”. The “fields” field specifies the inputs from the aggregation stage to this expression. Note that identifiers corresponding to “name” JSON field inside the type “fieldAccess” are required but not used outside this expression, so they are prefixed with “dummy” for clarity: \`\`\`javascript [postAggregations]() [{ [type]() “arithmetic”, @@ -99,11 +99,11 @@ The following table summarizes query properties. |timeseries, groupBy, search, timeBoundary|dataSource|query is applied to this data source|yes| |timeseries, groupBy, search|intervals|range of time series to include in query|yes| |timeseries, groupBy, search, timeBoundary|context|This is a key-value map that can allow the query to alter some of the behavior of a query. It is primarily used for debugging, for example if you include `"bySegment":true` in the map, you will get results associated with the data segment they came from.|no| -|timeseries, groupBy, search|filter|Specifies the filter (the “WHERE” clause in SQL) for the query. See [[Filters]]|no| -|timeseries, groupBy, search|granularity|the timestamp granularity to bucket results into (i.e. “hour”). See [[Granularities]] for more information.|no| +|timeseries, groupBy, search|filter|Specifies the filter (the “WHERE” clause in SQL) for the query. See [Filters](Filters.html)|no| +|timeseries, groupBy, search|granularity|the timestamp granularity to bucket results into (i.e. “hour”). See [Granularities](Granularities.html) for more information.|no| |groupBy|dimensions|constrains the groupings; if empty, then one value per time granularity bucket|yes| -|timeseries, groupBy|aggregations|aggregations that combine values in a bucket. See [[Aggregations]].|yes| -|timeseries, groupBy|postAggregations|aggregations of aggregations. 
See [[Post Aggregations]].|yes| +|timeseries, groupBy|aggregations|aggregations that combine values in a bucket. See [Aggregations](Aggregations.html).|yes| +|timeseries, groupBy|postAggregations|aggregations of aggregations. See [Post Aggregations](Post Aggregations.html).|yes| |search|limit|maximum number of results (default is 1000), a system-level maximum can also be set via `com.metamx.query.search.maxSearchLimit`|no| |search|searchDimensions|Dimensions to apply the search query to. If not specified, it will search through all dimensions.|no| |search|query|The query portion of the search query. This is essentially a predicate that specifies if something matches.|yes| @@ -111,4 +111,4 @@ The following table summarizes query properties. Additional Information about Query Types ---------------------------------------- -[[TimeseriesQuery]] +[TimeseriesQuery](TimeseriesQuery.html) diff --git a/docs/Realtime.md b/docs/Realtime.md index c92cc7f7175..855607d7eb5 100644 --- a/docs/Realtime.md +++ b/docs/Realtime.md @@ -4,7 +4,7 @@ layout: default Realtime ======== -Realtime nodes provide a realtime index. Data indexed via these nodes is immediately available for querying. Realtime nodes will periodically build segments representing the data they’ve collected over some span of time and hand these segments off to [[Compute]] nodes. +Realtime nodes provide a realtime index. Data indexed via these nodes is immediately available for querying. Realtime nodes will periodically build segments representing the data they’ve collected over some span of time and hand these segments off to [Compute](Compute.html) nodes. Running ------- @@ -21,7 +21,7 @@ The segment propagation diagram for real-time data ingestion can be seen below: Configuration ------------- -Realtime nodes take a mix of base server configuration and spec files that describe how to connect, process and expose the realtime feed. See [[Configuration]] for information about general server configuration. 
+Realtime nodes take a mix of base server configuration and spec files that describe how to connect, process and expose the realtime feed. See [Configuration](Configuration.html) for information about general server configuration. ### Realtime “specFile” @@ -62,7 +62,7 @@ There are four parts to a realtime stream specification, `schema`, `config`, `fi #### Schema -This describes the data schema for the output Druid segment. More information about concepts in Druid and querying can be found at [[Concepts-and-Terminology]] and [[Querying]]. +This describes the data schema for the output Druid segment. More information about concepts in Druid and querying can be found at [Concepts-and-Terminology](Concepts-and-Terminology.html) and [Querying](Querying.html). |Field|Type|Description|Required| |-----|----|-----------|--------| @@ -83,11 +83,11 @@ This provides configuration for the data processing portion of the realtime stre ### Firehose -See [[Firehose]]. +See [Firehose](Firehose.html). ### Plumber -See [[Plumber]] +See [Plumber](Plumber.html) Constraints ----------- diff --git a/docs/SearchQuery.md b/docs/SearchQuery.md index 7acf04419fa..b206f652c6e 100644 --- a/docs/SearchQuery.md +++ b/docs/SearchQuery.md @@ -30,11 +30,11 @@ There are several main parts to a search query: |--------|-----------|---------| |queryType|This String should always be “search”; this is the first thing Druid looks at to figure out how to interpret the query|yes| |dataSource|A String defining the data source to query, very similar to a table in a relational database|yes| -|granularity|Defines the granularity of the query. See [[Granularities]]|yes| -|filter|See [[Filters]]|no| +|granularity|Defines the granularity of the query. See [Granularities](Granularities.html)|yes| +|filter|See [Filters](Filters.html)|no| |intervals|A JSON Object representing ISO-8601 Intervals. This defines the time ranges to run the query over.|yes| |searchDimensions|The dimensions to run the search over. 
Excluding this means the search is run over all dimensions.|no| -|query|See [[SearchQuerySpec]].|yes| +|query|See [SearchQuerySpec](SearchQuerySpec.html).|yes| |sort|How the results of the search should sorted. Two possible types here are “lexicographic” and “strlen”.|yes| |context|An additional JSON Object which can be used to specify certain flags.|no| diff --git a/docs/Segments.md b/docs/Segments.md index 7da12950d15..5af50cd8e48 100644 --- a/docs/Segments.md +++ b/docs/Segments.md @@ -4,7 +4,7 @@ layout: default Segments ======== -Segments are the fundamental structure to store data in Druid. [[Compute]] and [[Realtime]] nodes load and serve segments for querying. To construct segments, Druid will always shard data by a time partition. Data may be further sharded based on dimension cardinality and row count. +Segments are the fundamental structure to store data in Druid. [Compute](Compute.html) and [Realtime](Realtime.html) nodes load and serve segments for querying. To construct segments, Druid will always shard data by a time partition. Data may be further sharded based on dimension cardinality and row count. The latest Druid segment version is `v9`. diff --git a/docs/Stand-Alone-With-Riak-CS.md b/docs/Stand-Alone-With-Riak-CS.md index 505b59f9283..d1dc8f780d8 100644 --- a/docs/Stand-Alone-With-Riak-CS.md +++ b/docs/Stand-Alone-With-Riak-CS.md @@ -22,12 +22,12 @@ We started with a minimal CentOS installation but you can use any other compatib 1. A Kafka Broker 1. A single-node Zookeeper ensemble 1. A single-node Riak-CS cluster -1. A Druid [[Master]] -1. A Druid [[Broker]] -1. A Druid [[Compute]] -1. A Druid [[Realtime]] +1. A Druid [Master](Master.html) +1. A Druid [Broker](Broker.html) +1. A Druid [Compute](Compute.html) +1. A Druid [Realtime](Realtime.html) -This just walks through getting the relevant software installed and running. You will then need to configure the [[Realtime]] node to take in your data. 
+This just walks through getting the relevant software installed and running. You will then need to configure the [Realtime](Realtime.html) node to take in your data. ### Configure System diff --git a/docs/TimeseriesQuery.md b/docs/TimeseriesQuery.md index 56f2ce733b9..9ea79fcfa75 100644 --- a/docs/TimeseriesQuery.md +++ b/docs/TimeseriesQuery.md @@ -84,10 +84,10 @@ There are 7 main parts to a timeseries query: |--------|-----------|---------| |queryType|This String should always be “timeseries”; this is the first thing Druid looks at to figure out how to interpret the query|yes| |dataSource|A String defining the data source to query, very similar to a table in a relational database|yes| -|granularity|Defines the granularity of the query. See [[Granularities]]|yes| -|filter|See [[Filters]]|no| -|aggregations|See [[Aggregations]]|yes| -|postAggregations|See [[Post Aggregations]]|no| +|granularity|Defines the granularity of the query. See [Granularities](Granularities.html)|yes| +|filter|See [Filters](Filters.html)|no| +|aggregations|See [Aggregations](Aggregations.html)|yes| +|postAggregations|See [Post Aggregations](Post Aggregations.html)|no| |intervals|A JSON Object representing ISO-8601 Intervals. This defines the time ranges to run the query over.|yes| |context|An additional JSON Object which can be used to specify certain flags.|no| diff --git a/docs/Tutorial:-A-First-Look-at-Druid.md b/docs/Tutorial:-A-First-Look-at-Druid.md index 4722dd173c0..c3de0df1d91 100644 --- a/docs/Tutorial:-A-First-Look-at-Druid.md +++ b/docs/Tutorial:-A-First-Look-at-Druid.md @@ -41,7 +41,7 @@ These metrics track the number of characters added, deleted, and changed. Setting Up ---------- -There are two ways to setup Druid: download a tarball, or [[Build From Source]]. You only need to do one of these. +There are two ways to setup Druid: download a tarball, or [Build From Source](Build From Source.html). You only need to do one of these. 
### Download a Tarball @@ -64,7 +64,7 @@ You should see a bunch of files: Running Example Scripts ----------------------- -Let’s start doing stuff. You can start a Druid [[Realtime]] node by issuing: +Let’s start doing stuff. You can start a Druid [Realtime](Realtime.html) node by issuing: ./run_example_server.sh @@ -176,7 +176,7 @@ As you can probably tell, the result is indicating the maximum and minimum times Return to your favorite editor and create the file:
timeseries_query.body
-We are going to make a slightly more complicated query, the [[TimeseriesQuery]]. Copy and paste the following into the file: +We are going to make a slightly more complicated query, the [TimeseriesQuery](TimeseriesQuery.html). Copy and paste the following into the file:

 {
     "queryType": "timeseries", 
@@ -200,7 +200,7 @@ We are going to make a slightly more complicated query, the [[TimeseriesQuery]].
 }
 
-You are probably wondering, what are these [[Granularities]] and [[Aggregations]] things? What the query is doing is aggregating some metrics over some span of time. +You are probably wondering, what are these [Granularities](Granularities.html) and [Aggregations](Aggregations.html) things? What the query is doing is aggregating some metrics over some span of time. To issue the query and get some results, run the following in your command line:
curl -X POST 'http://localhost:8083/druid/v2/?pretty' -H 'content-type: application/json'  -d  ````timeseries\_query.body
 
@@ -275,7 +275,7 @@ This gives us something like the following:
 Solving a Problem
 -----------------
 
-One of Druid’s main powers is to provide answers to problems, so let’s pose a problem. What if we wanted to know what the top pages in the US are, ordered by the number of edits over the last few minutes you’ve been going through this tutorial? To solve this problem, we have to return to the query we introduced at the very beginning of this tutorial, the [[GroupByQuery]]. It would be nice if we could group by results by dimension value and somehow sort those results… and it turns out we can!
+One of Druid’s main powers is to provide answers to problems, so let’s pose a problem. What if we wanted to know what the top pages in the US are, ordered by the number of edits over the last few minutes you’ve been going through this tutorial? To solve this problem, we have to return to the query we introduced at the very beginning of this tutorial, the [GroupByQuery](GroupByQuery.html). It would be nice if we could group results by dimension value and somehow sort those results… and it turns out we can!
 
 Let’s create the file:
 
@@ -317,7 +317,7 @@ Let’s create the file:
     }
     
 
-Woah! Our query just got a way more complicated. Now we have these [[Filters]] things and this [[OrderBy]] thing. Fear not, it turns out the new objects we’ve introduced to our query can help define the format of our results and provide an answer to our question.
+Woah! Our query just got way more complicated. Now we have these [Filters](Filters.html) things and this [OrderBy](OrderBy.html) thing. Fear not, it turns out the new objects we’ve introduced to our query can help define the format of our results and provide an answer to our question.
 
 If you issue the query:
 
@@ -357,9 +357,9 @@ Feel free to tweak other query parameters to answer other questions you may have
 Next Steps
 ----------
 
-What to know even more information about the Druid Cluster? Check out [[Tutorial: The Druid Cluster]]
+Want to know even more about the Druid Cluster? Check out [Tutorial: The Druid Cluster](Tutorial%3A-The-Druid-Cluster.html)
 
-Druid is even more fun if you load your own data into it! To learn how to load your data, see [[Loading Your Data]].
+Druid is even more fun if you load your own data into it! To learn how to load your data, see [Loading Your Data](Loading-Your-Data.html).
 
 Additional Information
 ----------------------
diff --git a/docs/Tutorial:-The-Druid-Cluster.md b/docs/Tutorial:-The-Druid-Cluster.md
index e2eff84f505..286447cc3cd 100644
--- a/docs/Tutorial:-The-Druid-Cluster.md
+++ b/docs/Tutorial:-The-Druid-Cluster.md
@@ -19,7 +19,7 @@ tar -zxvf druid-services-*-bin.tar.gz
 cd druid-services-*
 ```
 
-You can also [[Build From Source]].
+You can also [Build From Source](Build-From-Source.html).
 
 ## External Dependencies ##
 
diff --git a/docs/Tutorial:-Webstream.md b/docs/Tutorial:-Webstream.md
index 973204f31d4..bbfb42450fd 100644
--- a/docs/Tutorial:-Webstream.md
+++ b/docs/Tutorial:-Webstream.md
@@ -145,7 +145,7 @@ As you can probably tell, the result is indicating the maximum and minimum times
 Return to your favorite editor and create the file:
 
timeseries_query.body
-We are going to make a slightly more complicated query, the [[TimeseriesQuery]]. Copy and paste the following into the file:
+We are going to make a slightly more complicated query, the [TimeseriesQuery](TimeseriesQuery.html). Copy and paste the following into the file:

 {
     "queryType": "timeseries", 
@@ -168,7 +168,7 @@ We are going to make a slightly more complicated query, the [[TimeseriesQuery]].
 }
 
-You are probably wondering, what are these [[Granularities]] and [[Aggregations]] things? What the query is doing is aggregating some metrics over some span of time.
+You are probably wondering, what are these [Granularities](Granularities.html) and [Aggregations](Aggregations.html) things? What the query is doing is aggregating some metrics over some span of time.
 To issue the query and get some results, run the following in your command line:
curl -X POST 'http://localhost:8083/druid/v2/?pretty' -H 'content-type: application/json'  -d  ````timeseries\_query.body
 
@@ -246,7 +246,7 @@ This gives us something like the following:
 Solving a Problem
 -----------------
 
-One of Druid’s main powers is to provide answers to problems, so let’s pose a problem. What if we wanted to know what the top states in the US are, ordered by the number of visits by known users over the last few minutes? To solve this problem, we have to return to the query we introduced at the very beginning of this tutorial, the [[GroupByQuery]]. It would be nice if we could group by results by dimension value and somehow sort those results… and it turns out we can!
+One of Druid’s main powers is to provide answers to problems, so let’s pose a problem. What if we wanted to know what the top states in the US are, ordered by the number of visits by known users over the last few minutes? To solve this problem, we have to return to the query we introduced at the very beginning of this tutorial, the [GroupByQuery](GroupByQuery.html). It would be nice if we could group results by dimension value and somehow sort those results… and it turns out we can!
 
 Let’s create the file:
 
@@ -292,7 +292,7 @@ Let’s create the file:
     }
     
 
-Woah! Our query just got a way more complicated. Now we have these [[Filters]] things and this [[OrderBy]] thing. Fear not, it turns out the new objects we’ve introduced to our query can help define the format of our results and provide an answer to our question.
+Woah! Our query just got way more complicated. Now we have these [Filters](Filters.html) things and this [OrderBy](OrderBy.html) thing. Fear not, it turns out the new objects we’ve introduced to our query can help define the format of our results and provide an answer to our question.
 
 If you issue the query:
 
@@ -346,8 +346,8 @@ Feel free to tweak other query parameters to answer other questions you may have
 Next Steps
 ----------
 
-What to know even more information about the Druid Cluster? Check out [[Tutorial: The Druid Cluster]]
-Druid is even more fun if you load your own data into it! To learn how to load your data, see [[Loading Your Data]].
+What to know even more information about the Druid Cluster? Check out [Tutorial: The Druid Cluster](Tutorial: The Druid Cluster.html)
+Druid is even more fun if you load your own data into it! To learn how to load your data, see [Loading Your Data](Loading Your Data.html).
 
 Additional Information
 ----------------------
diff --git a/docs/Twitter-Tutorial.md b/docs/Twitter-Tutorial.md
index cedd26b9250..dc2151ec3e4 100644
--- a/docs/Twitter-Tutorial.md
+++ b/docs/Twitter-Tutorial.md
@@ -1,7 +1,7 @@
 ---
 layout: default
 ---
-Greetings! We see you’ve taken an interest in Druid. That’s awesome! Hopefully this tutorial will help clarify some core Druid concepts. We will go through one of the Real-time [[Examples]], and issue some basic Druid queries. The data source we’ll be working with is the [Twitter spritzer stream](https://dev.twitter.com/docs/streaming-apis/streams/public). If you are ready to explore Druid, brave its challenges, and maybe learn a thing or two, read on!
+Greetings! We see you’ve taken an interest in Druid. That’s awesome! Hopefully this tutorial will help clarify some core Druid concepts. We will go through one of the Real-time [Examples](Examples.html), and issue some basic Druid queries. The data source we’ll be working with is the [Twitter spritzer stream](https://dev.twitter.com/docs/streaming-apis/streams/public). If you are ready to explore Druid, brave its challenges, and maybe learn a thing or two, read on!
 
 Setting Up
 ----------
@@ -52,7 +52,7 @@ You can find the example executables in the examples/bin directory:
 Running Example Scripts
 -----------------------
 
-Let’s start doing stuff. You can start a Druid [[Realtime]] node by issuing:
+Let’s start doing stuff. You can start a Druid [Realtime](Realtime.html) node by issuing:
 
     ./run_example_server.sh
 
@@ -175,7 +175,7 @@ If you said the result is indicating the maximum and minimum timestamps we've se
 Return to your favorite editor and create the file:
 
timeseries_query.body
-We are going to make a slightly more complicated query, the [[TimeseriesQuery]]. Copy and paste the following into the file:
+We are going to make a slightly more complicated query, the [TimeseriesQuery](TimeseriesQuery.html). Copy and paste the following into the file:
{
   "queryType":"timeseries",
   "dataSource":"twitterstream",
@@ -188,7 +188,7 @@ We are going to make a slightly more complicated query, the [[TimeseriesQuery]].
 }
 
-You are probably wondering, what are these [[Granularities]] and [[Aggregations]] things? What the query is doing is aggregating some metrics over some span of time.
+You are probably wondering, what are these [Granularities](Granularities.html) and [Aggregations](Aggregations.html) things? What the query is doing is aggregating some metrics over some span of time.
 To issue the query and get some results, run the following in your command line:
curl -X POST 'http://localhost:8080/druid/v2/?pretty' -H 'content-type: application/json'  -d  ````timeseries\_query.body
 
@@ -252,7 +252,7 @@ This gives us something like the following:
 Solving a Problem
 -----------------
 
-One of Druid’s main powers (see what we did there?) is to provide answers to problems, so let’s pose a problem. What if we wanted to know what the top hash tags are, ordered by the number tweets, where the language is english, over the last few minutes you’ve been reading this tutorial? To solve this problem, we have to return to the query we introduced at the very beginning of this tutorial, the [[GroupByQuery]]. It would be nice if we could group by results by dimension value and somehow sort those results… and it turns out we can!
+One of Druid’s main powers (see what we did there?) is to provide answers to problems, so let’s pose a problem. What if we wanted to know what the top hash tags are, ordered by the number of tweets, where the language is English, over the last few minutes you’ve been reading this tutorial? To solve this problem, we have to return to the query we introduced at the very beginning of this tutorial, the [GroupByQuery](GroupByQuery.html). It would be nice if we could group results by dimension value and somehow sort those results… and it turns out we can!
 
 Let’s create the file:
 
@@ -272,7 +272,7 @@ Let’s create the file:
     }
     
 
-Woah! Our query just got a way more complicated. Now we have these [[Filters]] things and this [[OrderBy]] thing. Fear not, it turns out the new objects we’ve introduced to our query can help define the format of our results and provide an answer to our question.
+Woah! Our query just got a way more complicated. Now we have these [Filters](Filters.html) things and this [OrderBy](OrderBy.html) thing. Fear not, it turns out the new objects we’ve introduced to our query can help define the format of our results and provide an answer to our question.
 
 If you issue the query:
 
@@ -324,6 +324,6 @@ Feel free to tweak other query parameters to answer other questions you may have
 Additional Information
 ----------------------
 
-This tutorial is merely showcasing a small fraction of what Druid can do. Next, continue on to [[Loading Your Data]].
+This tutorial is merely showcasing a small fraction of what Druid can do. Next, continue on to [Loading Your Data](Loading Your Data.html).
 
 And thus concludes our journey! Hopefully you learned a thing or two about Druid real-time ingestion, querying Druid, and how Druid can be used to solve problems. If you have additional questions, feel free to post in our [google groups page](http://www.groups.google.com/forum/#!forum/druid-development).
diff --git a/docs/Versioning.md b/docs/Versioning.md
index 7b9fa24045c..6b9e79fe9d3 100644
--- a/docs/Versioning.md
+++ b/docs/Versioning.md
@@ -21,4 +21,4 @@ For external deployments, we recommend running the stable release tag. Releases
 Tagging strategy
 ----------------
 
-Tags of the codebase are equivalent to release candidates. We tag the code every time we want to take it through our release process, which includes some QA cycles and deployments. So, it is not safe to assume that a tag is a stable release, it is a solidification of the code as it goes through our production QA cycle and deployment. Tags will never change, but we often go through a number of iterations of tags before actually getting a stable release onto production. So, it is recommended that if you are not aware of what is on a tag, to stick to the stable releases listed on the [[Download]] page.
+Tags of the codebase are equivalent to release candidates. We tag the code every time we want to take it through our release process, which includes some QA cycles and deployments. So, it is not safe to assume that a tag is a stable release, it is a solidification of the code as it goes through our production QA cycle and deployment. Tags will never change, but we often go through a number of iterations of tags before actually getting a stable release onto production. So, it is recommended that if you are not aware of what is on a tag, to stick to the stable releases listed on the [Download](Download.html) page.
diff --git a/docs/ZooKeeper.md b/docs/ZooKeeper.md
index 03f2b1b8e0c..d3e24e29ceb 100644
--- a/docs/ZooKeeper.md
+++ b/docs/ZooKeeper.md
@@ -3,9 +3,9 @@ layout: default
 ---
 Druid uses ZooKeeper (ZK) for management of current cluster state. The operations that happen over ZK are
 
-1.  [[Master]] leader election
-2.  Segment “publishing” protocol from [[Compute]] and [[Realtime]]
-3.  Segment load/drop protocol between [[Master]] and [[Compute]]
+1.  [Master](Master.html) leader election
+2.  Segment “publishing” protocol from [Compute](Compute.html) and [Realtime](Realtime.html)
+3.  Segment load/drop protocol between [Master](Master.html) and [Compute](Compute.html)
 
 ### Property Configuration
 
@@ -41,7 +41,7 @@ We use the Curator LeadershipLatch recipe to do leader election at path
 
 The `announcementsPath` and `servedSegmentsPath` are used for this.
 
-All [[Compute]] and [[Realtime]] nodes publish themselves on the `announcementsPath`, specifically, they will create an ephemeral znode at
+All [Compute](Compute.html) and [Realtime](Realtime.html) nodes publish themselves on the `announcementsPath`, specifically, they will create an ephemeral znode at
 
     ${druid.zk.paths.announcementsPath}/${druid.host}
 
@@ -53,13 +53,13 @@ And as they load up segments, they will attach ephemeral znodes that look like
 
     ${druid.zk.paths.servedSegmentsPath}/${druid.host}/_segment_identifier_
 
-Nodes like the [[Master]] and [[Broker]] can then watch these paths to see which nodes are currently serving which segments.
+Nodes like the [Master](Master.html) and [Broker](Broker.html) can then watch these paths to see which nodes are currently serving which segments.
 
 ### Segment load/drop protocol between Master and Compute
 
 The `loadQueuePath` is used for this.
 
-When the [[Master]] decides that a [[Compute]] node should load or drop a segment, it writes an ephemeral znode to
+When the [Master](Master.html) decides that a [Compute](Compute.html) node should load or drop a segment, it writes an ephemeral znode to
 
     ${druid.zk.paths.loadQueuePath}/_host_of_compute_node/_segment_identifier
 
diff --git a/docs/contents.md b/docs/contents.md
index 23b56bc33a5..0d3f7f9cb62 100644
--- a/docs/contents.md
+++ b/docs/contents.md
@@ -2,70 +2,70 @@
 layout: default
 ---
 Contents
-\* [[Introduction|Home]]
-\* [[Download]]
-\* [[Support]]
-\* [[Contribute]]
+\* [Introduction](Home.html)
+\* [Download](Download.html)
+\* [Support](Support.html)
+\* [Contribute](Contribute.html)
 ========================
 
 Getting Started
-\* [[Tutorial: A First Look at Druid]]
-\* [[Tutorial: The Druid Cluster]]
-\* [[Loading Your Data]]
-\* [[Querying Your Data]]
-\* [[Booting a Production Cluster]]
-\* [[Examples]]
-\* [[Cluster Setup]]
-\* [[Configuration]]
+\* [Tutorial: A First Look at Druid](Tutorial: A First Look at Druid.html)
+\* [Tutorial: The Druid Cluster](Tutorial: The Druid Cluster.html)
+\* [Loading Your Data](Loading Your Data.html)
+\* [Querying Your Data](Querying Your Data.html)
+\* [Booting a Production Cluster](Booting a Production Cluster.html)
+\* [Examples](Examples.html)
+\* [Cluster Setup](Cluster Setup.html)
+\* [Configuration](Configuration.html)
 --------------------------------------
 
 Data Ingestion
-\* [[Realtime]]
-\* [[Batch|Batch Ingestion]]
-\* [[Indexing Service]]
+\* [Realtime](Realtime.html)
+\* [Batch|Batch Ingestion](Batch|Batch Ingestion.html)
+\* [Indexing Service](Indexing Service.html)
 ----------------------------
 
 Querying
-\* [[Querying]]
+\* [Querying](Querying.html)
 **\* ]
-**\* [[Aggregations]]
+**\* [Aggregations](Aggregations.html)
 **\* ]
-**\* [[Granularities]]
+**\* [Granularities](Granularities.html)
 \* Query Types
 **\* ]
 ****\* ]
 ****\* ]
-**\* [[SearchQuery]]
+**\* [SearchQuery](SearchQuery.html)
 **\* ]
-** [[SegmentMetadataQuery]]
+** [SegmentMetadataQuery](SegmentMetadataQuery.html)
 **\* ]
-**\* [[TimeseriesQuery]]
+**\* [TimeseriesQuery](TimeseriesQuery.html)
 ---------------------------
 
 Architecture
-\* [[Design]]
-\* [[Segments]]
+\* [Design](Design.html)
+\* [Segments](Segments.html)
 \* Node Types
 **\* ]
-**\* [[Broker]]
+**\* [Broker](Broker.html)
 **\* ]
 ****\* ]
-**\* [[Realtime]]
+**\* [Realtime](Realtime.html)
 **\* ]
-**\* [[Plumber]]
+**\* [Plumber](Plumber.html)
 \* External Dependencies
 **\* ]
-**\* [[MySQL]]
+**\* [MySQL](MySQL.html)
 **\* ]
-** [[Concepts and Terminology]]
+** [Concepts and Terminology](Concepts and Terminology.html)
 -------------------------------
 
 Development
-\* [[Versioning]]
-\* [[Build From Source]]
-\* [[Libraries]]
+\* [Versioning](Versioning.html)
+\* [Build From Source](Build From Source.html)
+\* [Libraries](Libraries.html)
 ------------------------
 
 Misc
-\* [[Thanks]]
+\* [Thanks](Thanks.html)
 -------------

From 946a9e502fe544ea8e08218a4fd0d0a683cab44b Mon Sep 17 00:00:00 2001
From: Russell Jurney 
Date: Mon, 16 Sep 2013 16:19:49 -0700
Subject: [PATCH 4/6] Replaced spaces with dashes

---
 docs/Batch-ingestion.md                 |  2 +-
 docs/Booting-a-production-cluster.md    |  2 +-
 docs/Configuration.md                   |  2 +-
 docs/Druid-Personal-Demo-Cluster.md     |  2 +-
 docs/Examples.md                        |  2 +-
 docs/GroupByQuery.md                    |  2 +-
 docs/Libraries.md                       |  3 +++
 docs/Loading-Your-Data.md               |  4 ++--
 docs/Master.md                          |  2 +-
 docs/MySQL.md                           |  2 +-
 docs/Querying-your-data.md              |  6 +++---
 docs/TimeseriesQuery.md                 |  2 +-
 docs/Tutorial:-A-First-Look-at-Druid.md |  4 ++--
 docs/Tutorial:-The-Druid-Cluster.md     |  2 +-
 docs/Tutorial:-Webstream.md             |  4 ++--
 docs/contents.md                        | 20 ++++++++++----------
 16 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/docs/Batch-ingestion.md b/docs/Batch-ingestion.md
index 6511b85b452..42a42ac7b29 100644
--- a/docs/Batch-ingestion.md
+++ b/docs/Batch-ingestion.md
@@ -9,7 +9,7 @@ There are two choices for batch data ingestion to your Druid cluster, you can us
 Which should I use?
 -------------------
 
-The [Indexing service](Indexing service.html) is a node that can run as part of your Druid cluster and can accomplish a number of different types of indexing tasks. Even if all you care about is batch indexing, it provides for the encapsulation of things like the Database that is used for segment metadata and other things, so that your indexing tasks do not need to include such information. Long-term, the indexing service is going to be the preferred method of ingesting data.
+The [Indexing service](Indexing-service.html) is a node that can run as part of your Druid cluster and can accomplish a number of different types of indexing tasks. Even if all you care about is batch indexing, it provides for the encapsulation of things like the Database that is used for segment metadata and other things, so that your indexing tasks do not need to include such information. Long-term, the indexing service is going to be the preferred method of ingesting data.
 
 The `HadoopDruidIndexerMain` runs hadoop jobs in order to separate and index data segments. It takes advantage of Hadoop as a job scheduling and distributed job execution platform. It is a simple method if you already have Hadoop running and don’t want to spend the time configuring and deploying the [Indexing service](Indexing service.html) just yet.
 
diff --git a/docs/Booting-a-production-cluster.md b/docs/Booting-a-production-cluster.md
index d5fc38c8ce5..f7e5444ab8e 100644
--- a/docs/Booting-a-production-cluster.md
+++ b/docs/Booting-a-production-cluster.md
@@ -3,7 +3,7 @@ layout: default
 ---
 # Booting a Single Node Cluster #
 
-[Loading Your Data](Loading Your Data.html) and [Querying Your Data](Querying Your Data.html) contain recipes to boot a small druid cluster on localhost. Here we will boot a small cluster on EC2. You can checkout the code, or download a tarball from [here](http://static.druid.io/artifacts/druid-services-0.5.51-SNAPSHOT-bin.tar.gz).
+[Loading Your Data](Loading-Your-Data.html) and [Querying Your Data](Querying-your-data.html) contain recipes to boot a small druid cluster on localhost. Here we will boot a small cluster on EC2. You can checkout the code, or download a tarball from [here](http://static.druid.io/artifacts/druid-services-0.5.51-SNAPSHOT-bin.tar.gz).
 
 The [ec2 run script](https://github.com/metamx/druid/blob/master/examples/bin/run_ec2.sh), run_ec2.sh, is located at 'examples/bin' if you have checked out the code, or at the root of the project if you've downloaded a tarball. The scripts rely on the [Amazon EC2 API Tools](http://aws.amazon.com/developertools/351), and you will need to set three environment variables:
 
diff --git a/docs/Configuration.md b/docs/Configuration.md
index 544b9ea4f55..4042d02d825 100644
--- a/docs/Configuration.md
+++ b/docs/Configuration.md
@@ -91,7 +91,7 @@ These properties are for connecting with S3 and using it to pull down segments.
 
 ### JDBC connection
 
-These properties specify the jdbc connection and other configuration around the “segments table” database. The only processes that connect to the DB with these properties are the [Master](Master.html) and [Indexing service](Indexing service.html). This is tested on MySQL.
+These properties specify the jdbc connection and other configuration around the “segments table” database. The only processes that connect to the DB with these properties are the [Master](Master.html) and [Indexing service](Indexing-service.html). This is tested on MySQL.
 
 |Property|Description|Default|
 |--------|-----------|-------|
diff --git a/docs/Druid-Personal-Demo-Cluster.md b/docs/Druid-Personal-Demo-Cluster.md
index 498f8ff8e14..0ef9834f198 100644
--- a/docs/Druid-Personal-Demo-Cluster.md
+++ b/docs/Druid-Personal-Demo-Cluster.md
@@ -3,7 +3,7 @@ layout: default
 ---
 # Druid Personal Demo Cluster (DPDC)
 
-Note, there are currently some issues with the CloudFormation.  We are working through them and will update the documentation here when things work properly.  In the meantime, the simplest way to get your feet wet with a cluster setup is to run through the instructions at [housejester/druid-test-harness](https://github.com/housejester/druid-test-harness), though it is based on an older version.  If you just want to get a feel for the types of data and queries that you can issue, check out [Realtime Examples](Realtime Examples.html)
+Note, there are currently some issues with the CloudFormation.  We are working through them and will update the documentation here when things work properly.  In the meantime, the simplest way to get your feet wet with a cluster setup is to run through the instructions at [housejester/druid-test-harness](https://github.com/housejester/druid-test-harness), though it is based on an older version.  If you just want to get a feel for the types of data and queries that you can issue, check out [Realtime Examples](Realtime-Examples.html)
 
 ## Introduction
 To make it easy for you to get started with Druid, we created an AWS (Amazon Web Services) [CloudFormation](http://aws.amazon.com/cloudformation/) Template that allows you to create a small pre-configured Druid cluster using your own AWS account. The cluster contains a pre-loaded sample workload, the Wikipedia edit stream, and a basic query interface that gets you familiar with Druid capabilities like drill-downs and filters. 
diff --git a/docs/Examples.md b/docs/Examples.md
index 2f48f60b1b5..4207911464b 100644
--- a/docs/Examples.md
+++ b/docs/Examples.md
@@ -34,7 +34,7 @@ Clone Druid and build it:
 Twitter Example
 ---------------
 
-For a full tutorial based on the twitter example, check out this [Twitter Tutorial](Twitter Tutorial.html).
+For a full tutorial based on the twitter example, check out this [Twitter Tutorial](Twitter-Tutorial.html).
 
 This Example uses a feature of Twitter that allows for sampling of it’s stream. We sample the Twitter stream via our [TwitterSpritzerFirehoseFactory](https://github.com/metamx/druid/blob/master/examples/src/main/java/druid/examples/twitter/TwitterSpritzerFirehoseFactory.java) class and use it to simulate the kinds of data you might ingest into Druid. Then, with the client part, the sample shows what kinds of analytics explorations you can do during and after the data is loaded.
 
diff --git a/docs/GroupByQuery.md b/docs/GroupByQuery.md
index 7e95ebcbdee..01edc6bdc7e 100644
--- a/docs/GroupByQuery.md
+++ b/docs/GroupByQuery.md
@@ -98,7 +98,7 @@ There are 9 main parts to a groupBy query:
 |granularity|Defines the granularity of the query. See [Granularities](Granularities.html)|yes|
 |filter|See [Filters](Filters.html)|no|
 |aggregations|See [Aggregations](Aggregations.html)|yes|
-|postAggregations|See [Post Aggregations](Post Aggregations.html)|no|
+|postAggregations|See [Post Aggregations](Post-Aggregations.html)|no|
 |intervals|A JSON Object representing ISO-8601 Intervals. This defines the time ranges to run the query over.|yes|
 |context|An additional JSON Object which can be used to specify certain flags.|no|
 
diff --git a/docs/Libraries.md b/docs/Libraries.md
index 75bc17c633c..0c57ffab3e8 100644
--- a/docs/Libraries.md
+++ b/docs/Libraries.md
@@ -13,6 +13,9 @@ Some great folks have written their own libraries to interact with Druid
 #### Ruby
 \* [madvertise/ruby-druid](https://github.com/madvertise/ruby-druid) - A ruby client for Druid
 
+#### Python
+\* [metamx/pydruid](https://github.com/metamx/pydruid) - A python client for Druid
+
 #### Helper Libraries
 
 -   [madvertise/druid-dumbo](https://github.com/madvertise/druid-dumbo) - Scripts to help generate batch configs for the ingestion of data into Druid
diff --git a/docs/Loading-Your-Data.md b/docs/Loading-Your-Data.md
index a5edd9d65ea..2e27fad8303 100644
--- a/docs/Loading-Your-Data.md
+++ b/docs/Loading-Your-Data.md
@@ -165,7 +165,7 @@ curl -X POST "http://localhost:8080/druid/v2/?pretty" \
   }
 } ]
 ```
-Now you're ready for [Querying Your Data](Querying Your Data.html)!
+Now you're ready for [Querying Your Data](Querying-your-data.html)!
 
 ## Loading Data with the HadoopDruidIndexer ##
 
@@ -367,4 +367,4 @@ Now its time to run the Hadoop [Batch-ingestion](Batch-ingestion.html) job, Hado
 java -Xmx256m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -Ddruid.realtime.specFile=realtime.spec -classpath lib/* com.metamx.druid.indexer.HadoopDruidIndexerMain batchConfig.json
 ```
 
-You can now move on to [Querying Your Data](Querying Your Data.html)!
\ No newline at end of file
+You can now move on to [Querying Your Data](Querying-your-data.html)!
\ No newline at end of file
diff --git a/docs/Master.md b/docs/Master.md
index c96af56dea9..eb86a3e81fd 100644
--- a/docs/Master.md
+++ b/docs/Master.md
@@ -15,7 +15,7 @@ Rules
 
 Segments are loaded and dropped from the cluster based on a set of rules. Rules indicate how segments should be assigned to different compute node tiers and how many replicants of a segment should exist in each tier. Rules may also indicate when segments should be dropped entirely from the cluster. The master loads a set of rules from the database. Rules may be specific to a certain datasource and/or a default set of rules can be configured. Rules are read in order and hence the ordering of rules is important. The master will cycle through all available segments and match each segment with the first rule that applies. Each segment may only match a single rule
 
-For more information on rules, see [Rule Configuration](Rule Configuration.html).
+For more information on rules, see [Rule Configuration](Rule-Configuration.html).
 
 Cleaning Up Segments
 --------------------
diff --git a/docs/MySQL.md b/docs/MySQL.md
index 88ef75006cf..713ad0ab18d 100644
--- a/docs/MySQL.md
+++ b/docs/MySQL.md
@@ -44,4 +44,4 @@ The config table is used to store runtime configuration objects. We do not have
 Task-related Tables
 -------------------
 
-There are also a number of tables created and used by the [Indexing Service](Indexing Service.html) in the course of its work.
+There are also a number of tables created and used by the [Indexing Service](Indexing-Service.html) in the course of its work.
diff --git a/docs/Querying-your-data.md b/docs/Querying-your-data.md
index 5bf72a6fa54..dc3e04d645c 100644
--- a/docs/Querying-your-data.md
+++ b/docs/Querying-your-data.md
@@ -3,7 +3,7 @@ layout: default
 ---
 # Setup #
 
-Before we start querying druid, we're going to finish setting up a complete cluster on localhost. In [Loading Your Data](Loading Your Data.html) we setup a [Realtime](Realtime.html), [Compute](Compute.html) and [Master](Master.html) node. If you've already completed that tutorial, you need only follow the directions for 'Booting a Broker Node'.
+Before we start querying druid, we're going to finish setting up a complete cluster on localhost. In [Loading Your Data](Loading-Your-Data.html) we set up a [Realtime](Realtime.html), [Compute](Compute.html) and [Master](Master.html) node. If you've already completed that tutorial, you need only follow the directions for 'Booting a Broker Node'.
 
 ## Booting a Broker Node ##
 
@@ -98,7 +98,7 @@ com.metamx.druid.http.ComputeMain
 
 # Querying Your Data #
 
-Now that we have a complete cluster setup on localhost, we need to load data. To do so, refer to [Loading Your Data](Loading Your Data.html). Having done that, its time to query our data! For a complete specification of queries, see [Querying](Querying.html).
+Now that we have a complete cluster setup on localhost, we need to load data. To do so, refer to [Loading Your Data](Loading-Your-Data.html). Having done that, it's time to query our data! For a complete specification of queries, see [Querying](Querying.html).
 
 ## Querying Different Nodes ##
 
@@ -363,4 +363,4 @@ Check out [Filters](Filters.html) for more.
 
 ## Learn More ##
 
-You can learn more about querying at [Querying](Querying.html)! Now check out [Booting a production cluster](Booting a production cluster.html)!
\ No newline at end of file
+You can learn more about querying at [Querying](Querying.html)! Now check out [Booting a production cluster](Booting-a-production-cluster.html)!
\ No newline at end of file
diff --git a/docs/TimeseriesQuery.md b/docs/TimeseriesQuery.md
index 9ea79fcfa75..62ebcee59f1 100644
--- a/docs/TimeseriesQuery.md
+++ b/docs/TimeseriesQuery.md
@@ -87,7 +87,7 @@ There are 7 main parts to a timeseries query:
 |granularity|Defines the granularity of the query. See [Granularities](Granularities.html)|yes|
 |filter|See [Filters](Filters.html)|no|
 |aggregations|See [Aggregations](Aggregations.html)|yes|
-|postAggregations|See [Post Aggregations](Post Aggregations.html)|no|
+|postAggregations|See [Post Aggregations](Post-Aggregations.html)|no|
 |intervals|A JSON Object representing ISO-8601 Intervals. This defines the time ranges to run the query over.|yes|
 |context|An additional JSON Object which can be used to specify certain flags.|no|
 
diff --git a/docs/Tutorial:-A-First-Look-at-Druid.md b/docs/Tutorial:-A-First-Look-at-Druid.md
index c3de0df1d91..987cf89fa28 100644
--- a/docs/Tutorial:-A-First-Look-at-Druid.md
+++ b/docs/Tutorial:-A-First-Look-at-Druid.md
@@ -357,9 +357,9 @@ Feel free to tweak other query parameters to answer other questions you may have
 Next Steps
 ----------
 
-What to know even more information about the Druid Cluster? Check out [Tutorial: The Druid Cluster](Tutorial: The Druid Cluster.html)
+Want to know even more about the Druid Cluster? Check out [Tutorial: The Druid Cluster](Tutorial:-The-Druid-Cluster.html).
 
-Druid is even more fun if you load your own data into it! To learn how to load your data, see [Loading Your Data](Loading Your Data.html).
+Druid is even more fun if you load your own data into it! To learn how to load your data, see [Loading Your Data](Loading-Your-Data.html).
 
 Additional Information
 ----------------------
diff --git a/docs/Tutorial:-The-Druid-Cluster.md b/docs/Tutorial:-The-Druid-Cluster.md
index 286447cc3cd..282ec9fa7f8 100644
--- a/docs/Tutorial:-The-Druid-Cluster.md
+++ b/docs/Tutorial:-The-Druid-Cluster.md
@@ -19,7 +19,7 @@ tar -zxvf druid-services-*-bin.tar.gz
 cd druid-services-*
 ```
 
-You can also [Build From Source](Build From Source.html).
+You can also [Build From Source](Build-From-Source.html).
 
 ## External Dependencies ##
 
diff --git a/docs/Tutorial:-Webstream.md b/docs/Tutorial:-Webstream.md
index bbfb42450fd..bfb7ed73bed 100644
--- a/docs/Tutorial:-Webstream.md
+++ b/docs/Tutorial:-Webstream.md
@@ -346,8 +346,8 @@ Feel free to tweak other query parameters to answer other questions you may have
 Next Steps
 ----------
 
-What to know even more information about the Druid Cluster? Check out [Tutorial: The Druid Cluster](Tutorial: The Druid Cluster.html)
-Druid is even more fun if you load your own data into it! To learn how to load your data, see [Loading Your Data](Loading Your Data.html).
+Want to know even more about the Druid Cluster? Check out [Tutorial: The Druid Cluster](Tutorial:-The-Druid-Cluster.html).
+Druid is even more fun if you load your own data into it! To learn how to load your data, see [Loading Your Data](Loading-Your-Data.html).
 
 Additional Information
 ----------------------
diff --git a/docs/contents.md b/docs/contents.md
index 0d3f7f9cb62..963f88926e1 100644
--- a/docs/contents.md
+++ b/docs/contents.md
@@ -9,20 +9,20 @@ Contents
 ========================
 
 Getting Started
-\* [Tutorial: A First Look at Druid](Tutorial: A First Look at Druid.html)
-\* [Tutorial: The Druid Cluster](Tutorial: The Druid Cluster.html)
-\* [Loading Your Data](Loading Your Data.html)
-\* [Querying Your Data](Querying Your Data.html)
-\* [Booting a Production Cluster](Booting a Production Cluster.html)
+\* [Tutorial: A First Look at Druid](Tutorial:-A-First-Look-at-Druid.html)
+\* [Tutorial: The Druid Cluster](Tutorial:-The-Druid-Cluster.html)
+\* [Loading Your Data](Loading-Your-Data.html)
+\* [Querying Your Data](Querying-Your-Data.html)
+\* [Booting a Production Cluster](Booting-a-Production-Cluster.html)
 \* [Examples](Examples.html)
-\* [Cluster Setup](Cluster Setup.html)
+\* [Cluster Setup](Cluster-Setup.html)
 \* [Configuration](Configuration.html)
 --------------------------------------
 
 Data Ingestion
 \* [Realtime](Realtime.html)
-\* [Batch|Batch Ingestion](Batch|Batch Ingestion.html)
-\* [Indexing Service](Indexing Service.html)
+\* [Batch](Batch-Ingestion.html)
+\* [Indexing Service](Indexing-Service.html)
 ----------------------------
 
 Querying
@@ -57,12 +57,12 @@ Architecture
 **\* ]
 **\* [MySQL](MySQL.html)
 **\* ]
-** [Concepts and Terminology](Concepts and Terminology.html)
+** [Concepts and Terminology](Concepts-and-Terminology.html)
 -------------------------------
 
 Development
 \* [Versioning](Versioning.html)
-\* [Build From Source](Build From Source.html)
+\* [Build From Source](Build-From-Source.html)
 \* [Libraries](Libraries.html)
 ------------------------
 

From 6df31408d9fbdab0242113cbd3ebbb6485a2a812 Mon Sep 17 00:00:00 2001
From: Russell Jurney 
Date: Mon, 16 Sep 2013 17:09:59 -0700
Subject: [PATCH 5/6] made layouts work

---
 docs/_layouts/default.html            |  179 +-
 docs/_layouts/docs.html               |    8 +
 docs/_layouts/page.html               |   11 +
 docs/_layouts/post.html               |   43 +-
 docs/css/bootstrap-responsive.css     | 1058 +++++
 docs/css/bootstrap-responsive.min.css |    9 +
 docs/css/bootstrap.css                | 5774 +++++++++++++++++++++++++
 docs/css/bootstrap.min.css            |    9 +
 docs/css/custom.css                   |  592 +++
 docs/css/default.html                 |  147 +
 docs/css/docs.html                    |    8 +
 docs/css/page.html                    |   11 +
 docs/css/pie.htc                      |   96 +
 docs/css/post.html                    |   44 +
 docs/css/syntax.css                   |    2 +-
 15 files changed, 7948 insertions(+), 43 deletions(-)
 create mode 100644 docs/_layouts/docs.html
 create mode 100644 docs/_layouts/page.html
 create mode 100644 docs/css/bootstrap-responsive.css
 create mode 100644 docs/css/bootstrap-responsive.min.css
 create mode 100644 docs/css/bootstrap.css
 create mode 100644 docs/css/bootstrap.min.css
 create mode 100644 docs/css/custom.css
 create mode 100644 docs/css/default.html
 create mode 100644 docs/css/docs.html
 create mode 100644 docs/css/page.html
 create mode 100644 docs/css/pie.htc
 create mode 100644 docs/css/post.html

diff --git a/docs/_layouts/default.html b/docs/_layouts/default.html
index 22e7e3f2a31..12106274e0f 100644
--- a/docs/_layouts/default.html
+++ b/docs/_layouts/default.html
@@ -1,44 +1,147 @@
 
-
-    
-        
-        
-        {{ page.title }}
-        
-
-        
-        
-
-        
-        
-
-    
-    
-
-        
- - - {{ content }} - -