diff --git a/.gitignore b/.gitignore index c0def8f6c5c..fb2e57e5db9 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,8 @@ logs/ build/ target/ .local-execution-hints.log +docs/html/ +docs/build.log ## eclipse ignores (use 'mvn eclipse:eclipse' to build eclipse projects) ## The only configuration files which are not ignored are .settings since @@ -19,6 +21,7 @@ target/ */.project */.classpath */eclipse-build +.settings/ ## netbeans ignores nb-configuration.xml diff --git a/docs/community/clients.asciidoc b/docs/community/clients.asciidoc new file mode 100644 index 00000000000..ded07d5d94d --- /dev/null +++ b/docs/community/clients.asciidoc @@ -0,0 +1,152 @@ +== Clients + +[float] +=== Perl + +* http://github.com/clintongormley/ElasticSearch.pm[ElasticSearch.pm]: + Perl client. + +[float] +=== Python + +* http://github.com/aparo/pyes[pyes]: + Python client. + +* http://github.com/rhec/pyelasticsearch[pyelasticsearch]: + Python client. + +* https://github.com/eriky/ESClient[ESClient]: + A lightweight and easy to use Python client for ElasticSearch. + +* https://github.com/humangeo/rawes[rawes]: + Python low level client. + +* https://github.com/mozilla/elasticutils/[elasticutils]: + A friendly chainable ElasticSearch interface for Python. + +* http://intridea.github.io/surfiki-refine-elasticsearch/[Surfiki Refine]: + Python Map-Reduce engine targeting Elasticsearch indices. + +[float] +=== Ruby + +* http://github.com/karmi/tire[Tire]: + Ruby API & DSL, with ActiveRecord/ActiveModel integration. + +* http://github.com/grantr/rubberband[rubberband]: + Ruby client. + +* https://github.com/PoseBiz/stretcher[stretcher]: + Ruby client. + +* https://github.com/wireframe/elastic_searchable/[elastic_searchable]: + Ruby client + Rails integration. + +[float] +=== PHP + +* http://github.com/ruflin/Elastica[Elastica]: + PHP client. + +* http://github.com/nervetattoo/elasticsearch[elasticsearch] PHP client. + +* http://github.com/polyfractal/Sherlock[Sherlock]: + PHP client, one-to-one mapping with query DSL, fluid interface. + +[float] +=== Java + +* https://github.com/searchbox-io/Jest[Jest]: + Java Rest client. + +[float] +=== Javascript + +* https://github.com/fullscale/elastic.js[Elastic.js]: + A JavaScript implementation of the ElasticSearch Query DSL and Core API. + +* https://github.com/phillro/node-elasticsearch-client[node-elasticsearch-client]: + A NodeJS client for elastic search. + +* https://github.com/ramv/node-elastical[node-elastical]: + Node.js client for the ElasticSearch REST API + +[float] +=== .Net + +* https://github.com/Yegoroff/PlainElastic.Net[PlainElastic.Net]: + .NET client. + +* https://github.com/Mpdreamz/NEST[NEST]: + .NET client. + +* https://github.com/medcl/ElasticSearch.Net[ElasticSearch.NET]: + .NET client. + +[float] +=== Scala + +* https://github.com/sksamuel/elastic4s[elastic4s]: + Scala DSL. + +* https://github.com/scalastuff/esclient[esclient]: + Thin Scala client. + +* https://github.com/bsadeh/scalastic[scalastic]: + Scala client. + +[float] +=== Clojure + +* http://github.com/clojurewerkz/elastisch[Elastisch]: + Clojure client. + +[float] +=== Go + +* https://github.com/mattbaird/elastigo[elastigo]: + Go client. + +* https://github.com/belogik/goes[goes]: + Go lib. + +[float] +=== Erlang + +* http://github.com/tsloughter/erlastic_search[erlastic_search]: + Erlang client using HTTP. + +* https://github.com/dieswaytoofast/erlasticsearch[erlasticsearch]: + Erlang client using Thrift. 
+ +* https://github.com/datahogs/tirexs[Tirexs]: + An https://github.com/elixir-lang/elixir[Elixir] based API/DSL, inspired by + http://github.com/karmi/tire[Tire]. Ready to use in pure Erlang + environment. + +[float] +=== EventMachine + +* http://github.com/vangberg/em-elasticsearch[em-elasticsearch]: + elasticsearch library for eventmachine. + +[float] +=== Command Line + +* https://github.com/elasticsearch/es2unix[es2unix]: + Elasticsearch API consumable by the Linux command line. + +* https://github.com/javanna/elasticshell[elasticshell]: + command line shell for elasticsearch. + +[float] +=== OCaml + +* https://github.com/tovbinm/ocaml-elasticsearch[ocaml-elasticsearch]: + OCaml client for Elasticsearch + +[float] +=== Smalltalk + +* http://ss3.gemstone.com/ss/Elasticsearch.html[Elasticsearch] - + Smalltalk client for Elasticsearch diff --git a/docs/community/frontends.asciidoc b/docs/community/frontends.asciidoc new file mode 100644 index 00000000000..e856b5747f9 --- /dev/null +++ b/docs/community/frontends.asciidoc @@ -0,0 +1,16 @@ +== Front Ends + +* https://chrome.google.com/webstore/detail/sense/doinijnbnggojdlcjifpdckfokbbfpbo[Sense]: + Chrome curl-like plugin for runninq requests against an Elasticsearch node + +* https://github.com/mobz/elasticsearch-head[elasticsearch-head]: + A web front end for an elastic search cluster. + +* https://github.com/OlegKunitsyn/elasticsearch-browser[browser]: + Web front-end over elasticsearch data. + +* https://github.com/polyfractal/elasticsearch-inquisitor[Inquisitor]: + Front-end to help debug/diagnose queries and analyzers + +* http://elastichammer.exploringelasticsearch.com/[Hammer]: + Web front-end for elasticsearch diff --git a/docs/community/github.asciidoc b/docs/community/github.asciidoc new file mode 100644 index 00000000000..74d6f664bf4 --- /dev/null +++ b/docs/community/github.asciidoc @@ -0,0 +1,5 @@ +== GitHub + +GitHub is a place where a lot of development is done around +*elasticsearch*, here is a simple search for +https://github.com/search?q=elasticsearch&type=Repositories[repositories]. diff --git a/docs/community/index.asciidoc b/docs/community/index.asciidoc new file mode 100644 index 00000000000..945c53e8be4 --- /dev/null +++ b/docs/community/index.asciidoc @@ -0,0 +1,15 @@ += Community Supported Clients + + +include::clients.asciidoc[] + +include::frontends.asciidoc[] + +include::integrations.asciidoc[] + +include::misc.asciidoc[] + +include::monitoring.asciidoc[] + +include::github.asciidoc[] + diff --git a/docs/community/integrations.asciidoc b/docs/community/integrations.asciidoc new file mode 100644 index 00000000000..486e26690e7 --- /dev/null +++ b/docs/community/integrations.asciidoc @@ -0,0 +1,71 @@ +== Integrations + + +* http://grails.org/plugin/elasticsearch[Grails]: + ElasticSearch Grails plugin. + +* https://github.com/carrot2/elasticsearch-carrot2[carrot2]: + Results clustering with carrot2 + +* https://github.com/angelf/escargot[escargot]: + ElasticSearch connector for Rails (WIP). + +* https://metacpan.org/module/Catalyst::Model::Search::ElasticSearch[Catalyst]: + ElasticSearch and Catalyst integration. + +* http://github.com/aparo/django-elasticsearch[django-elasticsearch]: + Django ElasticSearch Backend. + +* http://github.com/Aconex/elasticflume[elasticflume]: + http://github.com/cloudera/flume[Flume] sink implementation. + +* http://code.google.com/p/terrastore/wiki/Search_Integration[Terrastore Search]: + http://code.google.com/p/terrastore/[Terrastore] integration module with elasticsearch. 
+ +* https://github.com/infochimps/wonderdog[Wonderdog]: + Hadoop bulk loader into elasticsearch. + +* http://geeks.aretotally.in/play-framework-module-elastic-search-distributed-searching-with-json-http-rest-or-java[Play!Framework]: + Integrate with Play! Framework Application. + +* https://github.com/Exercise/FOQElasticaBundle[ElasticaBundle]: + Symfony2 Bundle wrapping Elastica. + +* http://drupal.org/project/elasticsearch[Drupal]: + Drupal ElasticSearch integration. + +* https://github.com/refuge/couch_es[couch_es]: + elasticsearch helper for couchdb based products (apache couchdb, bigcouch & refuge) + +* https://github.com/sonian/elasticsearch-jetty[Jetty]: + Jetty HTTP Transport + +* https://github.com/dadoonet/spring-elasticsearch[Spring Elasticsearch]: + Spring Factory for Elasticsearch + +* https://camel.apache.org/elasticsearch.html[Apache Camel Integration]: + An Apache camel component to integrate elasticsearch + +* https://github.com/tlrx/elasticsearch-test[elasticsearch-test]: + Elasticsearch Java annotations for unit testing with + http://www.junit.org/[JUnit] + +* http://searchbox-io.github.com/wp-elasticsearch/[Wp-ElasticSearch]: + ElasticSearch WordPress Plugin + +* https://github.com/OlegKunitsyn/eslogd[eslogd]: + Linux daemon that replicates events to a central ElasticSearch server in real-time + +* https://github.com/drewr/elasticsearch-clojure-repl[elasticsearch-clojure-repl]: + Plugin that embeds nREPL for run-time introspective adventure! Also + serves as an nREPL transport. + +* http://haystacksearch.org/[Haystack]: + Modular search for Django + +* https://github.com/cleverage/play2-elasticsearch[play2-elasticsearch]: + ElasticSearch module for Play Framework 2.x + +* https://github.com/fullscale/dangle[dangle]: + A set of AngularJS directives that provide common visualizations for elasticsearch based on + D3. diff --git a/docs/community/misc.asciidoc b/docs/community/misc.asciidoc new file mode 100644 index 00000000000..b12c0f8e9b2 --- /dev/null +++ b/docs/community/misc.asciidoc @@ -0,0 +1,17 @@ +== Misc + +* https://github.com/electrical/puppet-elasticsearch[Puppet]: + Elasticsearch puppet module. + +* http://github.com/elasticsearch/cookbook-elasticsearch[Chef]: + Chef cookbook for Elasticsearch + +* https://github.com/tavisto/elasticsearch-rpms[elasticsearch-rpms]: + RPMs for elasticsearch. + +* http://www.github.com/neogenix/daikon[daikon]: + Daikon ElasticSearch CLI + +* https://github.com/Aconex/scrutineer[Scrutineer]: + A high performance consistency checker to compare what you've indexed + with your source of truth content (e.g. DB) diff --git a/docs/community/monitoring.asciidoc b/docs/community/monitoring.asciidoc new file mode 100644 index 00000000000..5a39b308d06 --- /dev/null +++ b/docs/community/monitoring.asciidoc @@ -0,0 +1,27 @@ +== Health and Performance Monitoring + +* https://github.com/lukas-vlcek/bigdesk[bigdesk]: + Live charts and statistics for elasticsearch cluster. + +* https://github.com/karmi/elasticsearch-paramedic[paramedic]: + Live charts with cluster stats and indices/shards information. + +* http://www.elastichq.org/[ElasticSearchHQ]: + Free cluster health monitoring tool + +* http://sematext.com/spm/index.html[SPM for ElasticSearch]: + Performance monitoring with live charts showing cluster and node stats, integrated + alerts, email reports, etc. 
+ +* https://github.com/radu-gheorghe/check-es[check-es]: + Nagios/Shinken plugins for checking on elasticsearch + +* https://github.com/anchor/nagios-plugin-elasticsearch[check_elasticsearch]: + An ElasticSearch availability and performance monitoring plugin for + Nagios. + +* https://github.com/rbramley/Opsview-elasticsearch[opsview-elasticsearch]: + Opsview plugin written in Perl for monitoring ElasticSearch + +* https://github.com/polyfractal/elasticsearch-segmentspy[SegmentSpy]: + Plugin to watch Lucene segment merges across your cluster diff --git a/docs/groovy-api/anatomy.asciidoc b/docs/groovy-api/anatomy.asciidoc new file mode 100644 index 00000000000..4f0f18b4916 --- /dev/null +++ b/docs/groovy-api/anatomy.asciidoc @@ -0,0 +1,99 @@ +[[anatomy]] +== API Anatomy + +Once a <> has been +obtained, all of ElasticSearch APIs can be executed on it. Each Groovy +API is exposed using three different mechanisms. + +[float] +=== Closure Request + +The first type is to simply provide the request as a Closure, which +automatically gets resolved into the respective request instance (for +the index API, its the `IndexRequest` class). The API returns a special +future, called `GActionFuture`. This is a groovier version of +elasticsearch Java `ActionFuture` (in turn a nicer extension to Java own +`Future`) which allows to register listeners (closures) on it for +success and failures, as well as blocking for the response. For example: + +[source,js] +-------------------------------------------------- +def indexR = client.index { + index "test" + type "type1" + id "1" + source { + test = "value" + complex { + value1 = "value1" + value2 = "value2" + } + } +} + +println "Indexed $indexR.response.id into $indexR.response.index/$indexR.response.type" +-------------------------------------------------- + +In the above example, calling `indexR.response` will simply block for +the response. We can also block for the response for a specific timeout: + +[source,js] +-------------------------------------------------- +IndexResponse response = indexR.response "5s" // block for 5 seconds, same as: +response = indexR.response 5, TimeValue.SECONDS // +-------------------------------------------------- + +We can also register closures that will be called on success and on +failure: + +[source,js] +-------------------------------------------------- +indexR.success = {IndexResponse response -> + pritnln "Indexed $response.id into $response.index/$response.type" +} +indexR.failure = {Throwable t -> + println "Failed to index: $t.message" +} +-------------------------------------------------- + +[float] +=== Request + +This option allows to pass the actual instance of the request (instead +of a closure) as a parameter. The rest is similar to the closure as a +parameter option (the `GActionFuture` handling). For example: + +[source,js] +-------------------------------------------------- +def indexR = client.index (new IndexRequest( + index: "test", + type: "type1", + id: "1", + source: { + test = "value" + complex { + value1 = "value1" + value2 = "value2" + } + })) + +println "Indexed $indexR.response.id into $indexR.response.index/$indexR.response.type" +-------------------------------------------------- + +[float] +=== Java Like + +The last option is to provide an actual instance of the API request, and +an `ActionListener` for the callback. 
This is exactly like the Java API +with the added `gexecute` which returns the `GActionFuture`: + +[source,js] +-------------------------------------------------- +def indexR = node.client.prepareIndex("test", "type1", "1").setSource({ + test = "value" + complex { + value1 = "value1" + value2 = "value2" + } +}).gexecute() +-------------------------------------------------- diff --git a/docs/groovy-api/client.asciidoc b/docs/groovy-api/client.asciidoc new file mode 100644 index 00000000000..28d5ba00bf0 --- /dev/null +++ b/docs/groovy-api/client.asciidoc @@ -0,0 +1,58 @@ +[[client]] +== Client + +Obtaining an elasticsearch Groovy `GClient` (a `GClient` is a simple +wrapper on top of the Java `Client`) is simple. The most common way to +get a client is by starting an embedded `Node` which acts as a node +within the cluster. + +[float] +=== Node Client + +A Node based client is the simplest form to get a `GClient` to start +executing operations against elasticsearch. + +[source,js] +-------------------------------------------------- +import org.elasticsearch.groovy.client.GClient +import org.elasticsearch.groovy.node.GNode +import static org.elasticsearch.groovy.node.GNodeBuilder.nodeBuilder + +// on startup + +GNode node = nodeBuilder().node(); +GClient client = node.client(); + +// on shutdown + +node.close(); +-------------------------------------------------- + +Since elasticsearch allows to configure it using JSON based settings, +the configuration itself can be done using a closure that represent the +JSON: + +[source,js] +-------------------------------------------------- +import org.elasticsearch.groovy.node.GNode +import org.elasticsearch.groovy.node.GNodeBuilder +import static org.elasticsearch.groovy.node.GNodeBuilder.* + +// on startup + +GNodeBuilder nodeBuilder = nodeBuilder(); +nodeBuilder.settings { + node { + client = true + } + cluster { + name = "test" + } +} + +GNode node = nodeBuilder.node() + +// on shutdown + +node.stop().close() +-------------------------------------------------- diff --git a/docs/groovy-api/count.asciidoc b/docs/groovy-api/count.asciidoc new file mode 100644 index 00000000000..066bb054c5b --- /dev/null +++ b/docs/groovy-api/count.asciidoc @@ -0,0 +1,22 @@ +[[count]] +== Count API + +The count API is very similar to the +link:{java}/count.html[Java count API]. The Groovy +extension allows to provide the query to execute as a `Closure` (similar +to GORM criteria builder): + +[source,js] +-------------------------------------------------- +def count = client.count { + indices "test" + types "type1" + query { + term { + test = "value" + } + } +} +-------------------------------------------------- + +The query follows the same link:{ref}/query-dsl.html[Query DSL]. 
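+
+For instance, a `query_string` query can be passed with the same closure
+notation (a sketch; the `test` index, type and field are the placeholders
+used above, and the `count` property on the response is assumed):
+
+[source,js]
+--------------------------------------------------
+def count = client.count {
+    indices "test"
+    types "type1"
+    query {
+        query_string(
+            fields: ["test"],
+            query: "value1 value2")
+    }
+}
+
+println "Matched $count.response.count documents"
+--------------------------------------------------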
diff --git a/docs/groovy-api/delete.asciidoc b/docs/groovy-api/delete.asciidoc new file mode 100644 index 00000000000..c339b4919b5 --- /dev/null +++ b/docs/groovy-api/delete.asciidoc @@ -0,0 +1,15 @@ +[[delete]] +== Delete API + +The delete API is very similar to the +link:{java}/delete.html[Java delete API], here is an +example: + +[source,js] +-------------------------------------------------- +def deleteF = node.client.delete { + index "test" + type "type1" + id "1" +} +-------------------------------------------------- diff --git a/docs/groovy-api/get.asciidoc b/docs/groovy-api/get.asciidoc new file mode 100644 index 00000000000..aa206ccf19a --- /dev/null +++ b/docs/groovy-api/get.asciidoc @@ -0,0 +1,18 @@ +[[get]] +== Get API + +The get API is very similar to the +link:{java}/get.html[Java get API]. The main benefit +of using groovy is handling the source content. It can be automatically +converted to a `Map` which means using Groovy to navigate it is simple: + +[source,js] +-------------------------------------------------- +def getF = node.client.get { + index "test" + type "type1" + id "1" +} + +println "Result of field2: $getF.response.source.complex.field2" +-------------------------------------------------- diff --git a/docs/groovy-api/index.asciidoc b/docs/groovy-api/index.asciidoc new file mode 100644 index 00000000000..f089642400b --- /dev/null +++ b/docs/groovy-api/index.asciidoc @@ -0,0 +1,50 @@ += Groovy API +:ref: http://www.elasticsearch.org/guide/elasticsearch/reference/current +:java: http://www.elasticsearch.org/guide/elasticsearch/client/java-api/current + +[preface] +== Preface + +This section describes the http://groovy.codehaus.org/[Groovy] API +elasticsearch provides. All elasticsearch APIs are executed using a +<>, and are completely +asynchronous in nature (they either accept a listener, or return a +future). + +The Groovy API is a wrapper on top of the +link:{java}[Java API] exposing it in a groovier +manner. The execution options for each API follow a similar manner and +covered in <>. + +[float] +==== Maven Repository + +The Groovy API is hosted on +http://search.maven.org/#search%7Cga%7C1%7Ca%3A%22elasticsearch-client-groovy%22[Maven +Central]. + +For example, you can define the latest version in your `pom.xml` file: + +[source,xml] +-------------------------------------------------- + + org.elasticsearch + elasticsearch-client-groovy + ${es.version} + +-------------------------------------------------- + +include::anatomy.asciidoc[] + +include::client.asciidoc[] + +include::index_.asciidoc[] + +include::get.asciidoc[] + +include::delete.asciidoc[] + +include::search.asciidoc[] + +include::count.asciidoc[] + diff --git a/docs/groovy-api/index_.asciidoc b/docs/groovy-api/index_.asciidoc new file mode 100644 index 00000000000..7c35b1ada5c --- /dev/null +++ b/docs/groovy-api/index_.asciidoc @@ -0,0 +1,31 @@ +[[index_]] +== Index API + +The index API is very similar to the +link:{java}/index_.html[Java index API]. The Groovy +extension to it is the ability to provide the indexed source using a +closure. For example: + +[source,js] +-------------------------------------------------- +def indexR = client.index { + index "test" + type "type1" + id "1" + source { + test = "value" + complex { + value1 = "value1" + value2 = "value2" + } + } +} +-------------------------------------------------- + +In the above example, the source closure itself gets transformed into an +XContent (defaults to JSON). 
In order to change how the source closure +is serialized, a global (static) setting can be set on the `GClient` by +changing the `indexContentType` field. + +Note also that the `source` can be set using the typical Java based +APIs, the `Closure` option is a Groovy extension. diff --git a/docs/groovy-api/search.asciidoc b/docs/groovy-api/search.asciidoc new file mode 100644 index 00000000000..2e2dadf79f0 --- /dev/null +++ b/docs/groovy-api/search.asciidoc @@ -0,0 +1,114 @@ +[[search]] +== Search API + +The search API is very similar to the +link:{java}/search.html[Java search API]. The Groovy +extension allows to provide the search source to execute as a `Closure` +including the query itself (similar to GORM criteria builder): + +[source,js] +-------------------------------------------------- +def search = node.client.search { + indices "test" + types "type1" + source { + query { + term(test: "value") + } + } +} + +search.response.hits.each {SearchHit hit -> + println "Got hit $hit.id from $hit.index/$hit.type" +} +-------------------------------------------------- + +It can also be executed using the "Java API" while still using a closure +for the query: + +[source,js] +-------------------------------------------------- +def search = node.client.prepareSearch("test").setQuery({ + term(test: "value") +}).gexecute(); + +search.response.hits.each {SearchHit hit -> + println "Got hit $hit.id from $hit.index/$hit.type" +} +-------------------------------------------------- + +The format of the search `Closure` follows the same JSON syntax as the +link:{ref}/search-search.html[Search API] request. + +[float] +=== More examples + +Term query where multiple values are provided (see +link:{ref}/query-dsl-terms-query.html[terms]): + +[source,js] +-------------------------------------------------- +def search = node.client.search { + indices "test" + types "type1" + source { + query { + terms(test: ["value1", "value2"]) + } + } +} +-------------------------------------------------- + +Query string (see +link:{ref}/query-dsl-query-string-query.html[query string]): + +[source,js] +-------------------------------------------------- +def search = node.client.search { + indices "test" + types "type1" + source { + query { + query_string( + fields: ["test"], + query: "value1 value2") + } + } +} +-------------------------------------------------- + +Pagination (see +link:{ref}/search-request-from-size.html[from/size]): + +[source,js] +-------------------------------------------------- +def search = node.client.search { + indices "test" + types "type1" + source { + from = 0 + size = 10 + query { + term(test: "value") + } + } +} +-------------------------------------------------- + +Sorting (see link:{ref}/search-request-sort.html[sort]): + +[source,js] +-------------------------------------------------- +def search = node.client.search { + indices "test" + types "type1" + source { + query { + term(test: "value") + } + sort = [ + date : [ order: "desc"] + ] + } +} +-------------------------------------------------- diff --git a/docs/java-api/bulk.asciidoc b/docs/java-api/bulk.asciidoc new file mode 100644 index 00000000000..031f3dcef5f --- /dev/null +++ b/docs/java-api/bulk.asciidoc @@ -0,0 +1,38 @@ +[[bulk]] +== Bulk API + +The bulk API allows one to index and delete several documents in a +single request. 
Here is a sample usage: + +[source,java] +-------------------------------------------------- +import static org.elasticsearch.common.xcontent.XContentFactory.*; + +BulkRequestBuilder bulkRequest = client.prepareBulk(); + +// either use client#prepare, or use Requests# to directly build index/delete requests +bulkRequest.add(client.prepareIndex("twitter", "tweet", "1") + .setSource(jsonBuilder() + .startObject() + .field("user", "kimchy") + .field("postDate", new Date()) + .field("message", "trying out Elastic Search") + .endObject() + ) + ); + +bulkRequest.add(client.prepareIndex("twitter", "tweet", "2") + .setSource(jsonBuilder() + .startObject() + .field("user", "kimchy") + .field("postDate", new Date()) + .field("message", "another post") + .endObject() + ) + ); + +BulkResponse bulkResponse = bulkRequest.execute().actionGet(); +if (bulkResponse.hasFailures()) { + // process failures by iterating through each bulk response item +} +-------------------------------------------------- diff --git a/docs/java-api/client.asciidoc b/docs/java-api/client.asciidoc new file mode 100644 index 00000000000..f6d80cc5c76 --- /dev/null +++ b/docs/java-api/client.asciidoc @@ -0,0 +1,185 @@ +[[client]] +== Client + +You can use the *java client* in multiple ways: + +* Perform standard <>, <>, + <> and <> operations on an + existing cluster +* Perform administrative tasks on a running cluster +* Start full nodes when you want to run Elasticsearch embedded in your + own application or when you want to launch unit or integration tests + +Obtaining an elasticsearch `Client` is simple. The most common way to +get a client is by: + +1. creating an embedded link:#nodeclient[`Node`] that acts as a node +within a cluster +2. requesting a `Client` from your embedded `Node`. + +Another manner is by creating a link:#transportclient[`TransportClient`] +that connects to a cluster. + +*Important:* + +______________________________________________________________________________________________________________________________________________________________ +Please note that you are encouraged to use the same version on client +and cluster sides. You may hit some incompatibilities issues when mixing +major versions. +______________________________________________________________________________________________________________________________________________________________ + +[float] +=== Node Client + +Instantiating a node based client is the simplest way to get a `Client` +that can execute operations against elasticsearch. + +[source,java] +-------------------------------------------------- +import static org.elasticsearch.node.NodeBuilder.*; + +// on startup + +Node node = nodeBuilder().node(); +Client client = node.client(); + +// on shutdown + +node.close(); +-------------------------------------------------- + +When you start a `Node`, it joins an elasticsearch cluster. You can have +different clusters by simple setting the `cluster.name` setting, or +explicitly using the `clusterName` method on the builder. + +You can define `cluster.name` in `/src/main/resources/elasticsearch.yml` +dir in your project. As long as `elasticsearch.yml` is present in the +classloader, it will be used when you start your node. 
+ +[source,java] +-------------------------------------------------- +cluster.name=yourclustername +-------------------------------------------------- + +Or in Java: + +[source,java] +-------------------------------------------------- +Node node = nodeBuilder().clusterName("yourclustername").node(); +Client client = node.client(); +-------------------------------------------------- + +The benefit of using the `Client` is the fact that operations are +automatically routed to the node(s) the operations need to be executed +on, without performing a "double hop". For example, the index operation +will automatically be executed on the shard that it will end up existing +at. + +When you start a `Node`, the most important decision is whether it +should hold data or not. In other words, should indices and shards be +allocated to it. Many times we would like to have the clients just be +clients, without shards being allocated to them. This is simple to +configure by setting either `node.data` setting to `false` or +`node.client` to `true` (the `NodeBuilder` respective helper methods on +it): + +[source,java] +-------------------------------------------------- +import static org.elasticsearch.node.NodeBuilder.*; + +// on startup + +Node node = nodeBuilder().client(true).node(); +Client client = node.client(); + +// on shutdown + +node.close(); +-------------------------------------------------- + +Another common usage is to start the `Node` and use the `Client` in +unit/integration tests. In such a case, we would like to start a "local" +`Node` (with a "local" discovery and transport). Again, this is just a +matter of a simple setting when starting the `Node`. Note, "local" here +means local on the JVM (well, actually class loader) level, meaning that +two *local* servers started within the same JVM will discover themselves +and form a cluster. + +[source,java] +-------------------------------------------------- +import static org.elasticsearch.node.NodeBuilder.*; + +// on startup + +Node node = nodeBuilder().local(true).node(); +Client client = node.client(); + +// on shutdown + +node.close(); +-------------------------------------------------- + +[float] +=== Transport Client + +The `TransportClient` connects remotely to an elasticsearch cluster +using the transport module. It does not join the cluster, but simply +gets one or more initial transport addresses and communicates with them +in round robin fashion on each action (though most actions will probably +be "two hop" operations). + +[source,java] +-------------------------------------------------- +// on startup + +Client client = new TransportClient() + .addTransportAddress(new InetSocketTransportAddress("host1", 9300)) + .addTransportAddress(new InetSocketTransportAddress("host2", 9300)); + +// on shutdown + +client.close(); +-------------------------------------------------- + +Note that you have to set the cluster name if you use one different to +"elasticsearch": + +[source,java] +-------------------------------------------------- +Settings settings = ImmutableSettings.settingsBuilder() + .put("cluster.name", "myClusterName").build(); +Client client = new TransportClient(settings); +//Add transport addresses and do something with the client... +-------------------------------------------------- + +Or using `elasticsearch.yml` file as shown in the link:#nodeclient[Node +Client section] + +The client allows to sniff the rest of the cluster, and add those into +its list of machines to use. 
In this case, note that the ip addresses +used will be the ones that the other nodes were started with (the +"publish" address). In order to enable it, set the +`client.transport.sniff` to `true`: + +[source,java] +-------------------------------------------------- +Settings settings = ImmutableSettings.settingsBuilder() + .put("client.transport.sniff", true).build(); +TransportClient client = new TransportClient(settings); +-------------------------------------------------- + +Other transport client level settings include: + +[cols="<,<",options="header",] +|======================================================================= +|Parameter |Description +|`client.transport.ignore_cluster_name` |Set to `true` to ignore cluster +name validation of connected nodes. (since 0.19.4) + +|`client.transport.ping_timeout` |The time to wait for a ping response +from a node. Defaults to `5s`. + +|`client.transport.nodes_sampler_interval` |How often to sample / ping +the nodes listed and connected. Defaults to `5s`. +|======================================================================= + diff --git a/docs/java-api/count.asciidoc b/docs/java-api/count.asciidoc new file mode 100644 index 00000000000..8dd75f2c61a --- /dev/null +++ b/docs/java-api/count.asciidoc @@ -0,0 +1,38 @@ +[[count]] +== Count API + +The count API allows to easily execute a query and get the number of +matches for that query. It can be executed across one or more indices +and across one or more types. The query can be provided using the +link:{ref}/query-dsl.html[Query DSL]. + +[source,java] +-------------------------------------------------- +import static org.elasticsearch.index.query.xcontent.FilterBuilders.*; +import static org.elasticsearch.index.query.xcontent.QueryBuilders.*; + +CountResponse response = client.prepareCount("test") + .setQuery(termQuery("_type", "type1")) + .execute() + .actionGet(); +-------------------------------------------------- + +For more information on the count operation, check out the REST +link:{ref}/search-count.html[count] docs. + +[float] +=== Operation Threading + +The count API allows to set the threading model the operation will be +performed when the actual execution of the API is performed on the same +node (the API is executed on a shard that is allocated on the same +server). + +There are three threading modes.The `NO_THREADS` mode means that the +count operation will be executed on the calling thread. The +`SINGLE_THREAD` mode means that the count operation will be executed on +a single different thread for all local shards. The `THREAD_PER_SHARD` +mode means that the count operation will be executed on a different +thread for each local shard. + +The default mode is `SINGLE_THREAD`. diff --git a/docs/java-api/delete-by-query.asciidoc b/docs/java-api/delete-by-query.asciidoc new file mode 100644 index 00000000000..04c5f57b267 --- /dev/null +++ b/docs/java-api/delete-by-query.asciidoc @@ -0,0 +1,21 @@ +[[delete-by-query]] +== Delete By Query API + +The delete by query API allows to delete documents from one or more +indices and one or more types based on a <>. 
Here +is an example: + +[source,java] +-------------------------------------------------- +import static org.elasticsearch.index.query.FilterBuilders.*; +import static org.elasticsearch.index.query.QueryBuilders.*; + +DeleteByQueryResponse response = client.prepareDeleteByQuery("test") + .setQuery(termQuery("_type", "type1")) + .execute() + .actionGet(); +-------------------------------------------------- + +For more information on the delete by query operation, check out the +link:{ref}/docs-delete-by-query.html[delete_by_query API] +docs. diff --git a/docs/java-api/delete.asciidoc b/docs/java-api/delete.asciidoc new file mode 100644 index 00000000000..52f245d4478 --- /dev/null +++ b/docs/java-api/delete.asciidoc @@ -0,0 +1,39 @@ +[[delete]] +== Delete API + +The delete API allows to delete a typed JSON document from a specific +index based on its id. The following example deletes the JSON document +from an index called twitter, under a type called tweet, with id valued +1: + +[source,java] +-------------------------------------------------- +DeleteResponse response = client.prepareDelete("twitter", "tweet", "1") + .execute() + .actionGet(); +-------------------------------------------------- + +For more information on the delete operation, check out the +link:{ref}/docs-delete.html[delete API] docs. + +[float] +=== Operation Threading + +The delete API allows to set the threading model the operation will be +performed when the actual execution of the API is performed on the same +node (the API is executed on a shard that is allocated on the same +server). + +The options are to execute the operation on a different thread, or to +execute it on the calling thread (note that the API is still async). By +default, `operationThreaded` is set to `true` which means the operation +is executed on a different thread. Here is an example that sets it to +`false`: + +[source,java] +-------------------------------------------------- +DeleteResponse response = client.prepareDelete("twitter", "tweet", "1") + .setOperationThreaded(false) + .execute() + .actionGet(); +-------------------------------------------------- diff --git a/docs/java-api/facets.asciidoc b/docs/java-api/facets.asciidoc new file mode 100644 index 00000000000..4920e3cb043 --- /dev/null +++ b/docs/java-api/facets.asciidoc @@ -0,0 +1,483 @@ +[[facets]] +== Facets + +Elasticsearch provides a full Java API to play with facets. See the +link:{ref}/search-facets.html[Facets guide]. + +Use the factory for facet builders (`FacetBuilders`) and add each facet +you want to compute when querying and add it to your search request: + +[source,java] +-------------------------------------------------- +SearchResponse sr = node.client().prepareSearch() + .setQuery( /* your query */ ) + .addFacet( /* add a facet */ ) + .execute().actionGet(); +-------------------------------------------------- + +Note that you can add more than one facet. See +link:{ref}/search-search.html[Search Java API] for details. + +To build facet requests, use `FacetBuilders` helpers. Just import them +in your class: + +[source,java] +-------------------------------------------------- +import org.elasticsearch.search.facet.FacetBuilders.*; +-------------------------------------------------- + +[float] +=== Facets + +[float] +==== Terms Facet + +Here is how you can use +link:{ref}/search-facets-terms-facet.html[Terms Facet] +with Java API. 
+ +[float] +===== Prepare facet request + +Here is an example on how to create the facet request: + +[source,java] +-------------------------------------------------- +FacetBuilders.termsFacet("f") + .field("brand") + .size(10); +-------------------------------------------------- + +[float] +===== Use facet response + +Import Facet definition classes: + +[source,java] +-------------------------------------------------- +import org.elasticsearch.search.facet.terms.*; +-------------------------------------------------- + +[source,java] +-------------------------------------------------- +// sr is here your SearchResponse object +TermsFacet f = (TermsFacet) sr.facets().facetsAsMap().get("f"); + +f.getTotalCount(); // Total terms doc count +f.getOtherCount(); // Not shown terms doc count +f.getMissingCount(); // Without term doc count + +// For each entry +for (TermsFacet.Entry entry : f) { + entry.getTerm(); // Term + entry.getCount(); // Doc count +} +-------------------------------------------------- + +[float] +==== Range Facet + +Here is how you can use +link:{ref}/search-facets-range-facet.html[Range Facet] +with Java API. + +[float] +===== Prepare facet request + +Here is an example on how to create the facet request: + +[source,java] +-------------------------------------------------- +FacetBuilders.rangeFacet("f") + .field("price") // Field to compute on + .addUnboundedFrom(3) // from -infinity to 3 (excluded) + .addRange(3, 6) // from 3 to 6 (excluded) + .addUnboundedTo(6); // from 6 to +infinity +-------------------------------------------------- + +[float] +===== Use facet response + +Import Facet definition classes: + +[source,java] +-------------------------------------------------- +import org.elasticsearch.search.facet.range.*; +-------------------------------------------------- + +[source,java] +-------------------------------------------------- +// sr is here your SearchResponse object +RangeFacet f = (RangeFacet) sr.facets().facetsAsMap().get("f"); + +// For each entry +for (RangeFacet.Entry entry : f) { + entry.getFrom(); // Range from requested + entry.getTo(); // Range to requested + entry.getCount(); // Doc count + entry.getMin(); // Min value + entry.getMax(); // Max value + entry.getMean(); // Mean + entry.getTotal(); // Sum of values +} +-------------------------------------------------- + +[float] +==== Histogram Facet + +Here is how you can use +link:{ref}/search-facets-histogram-facet.html[Histogram +Facet] with Java API. 
+ +[float] +===== Prepare facet request + +Here is an example on how to create the facet request: + +[source,java] +-------------------------------------------------- +HistogramFacetBuilder facet = FacetBuilders.histogramFacet("f") + .field("price") + .interval(1); +-------------------------------------------------- + +[float] +===== Use facet response + +Import Facet definition classes: + +[source,java] +-------------------------------------------------- +import org.elasticsearch.search.facet.histogram.*; +-------------------------------------------------- + +[source,java] +-------------------------------------------------- +// sr is here your SearchResponse object +HistogramFacet f = (HistogramFacet) sr.facets().facetsAsMap().get("f"); + +// For each entry +for (HistogramFacet.Entry entry : f) { + entry.getKey(); // Key (X-Axis) + entry.getCount(); // Doc count (Y-Axis) +} +-------------------------------------------------- + +[float] +==== Date Histogram Facet + +Here is how you can use +link:{ref}/search-facets-date-histogram-facet.html[Date +Histogram Facet] with Java API. + +[float] +===== Prepare facet request + +Here is an example on how to create the facet request: + +[source,java] +-------------------------------------------------- +FacetBuilders.dateHistogramFacet("f") + .field("date") // Your date field + .interval("year"); // You can also use "quarter", "month", "week", "day", + // "hour" and "minute" or notation like "1.5h" or "2w" +-------------------------------------------------- + +[float] +===== Use facet response + +Import Facet definition classes: + +[source,java] +-------------------------------------------------- +import org.elasticsearch.search.facet.datehistogram.*; +-------------------------------------------------- + +[source,java] +-------------------------------------------------- +// sr is here your SearchResponse object +DateHistogramFacet f = (DateHistogramFacet) sr.facets().facetsAsMap().get("f"); + +// For each entry +for (DateHistogramFacet.Entry entry : f) { + entry.getTime(); // Date in ms since epoch (X-Axis) + entry.getCount(); // Doc count (Y-Axis) +} +-------------------------------------------------- + +[float] +==== Filter Facet (not facet filter) + +Here is how you can use +link:{ref}/search-facets-filter-facet.html[Filter Facet] +with Java API. + +If you are looking on how to apply a filter to a facet, have a look at +link:#facet-filter[facet filter] using Java API. + +[float] +===== Prepare facet request + +Here is an example on how to create the facet request: + +[source,java] +-------------------------------------------------- +FacetBuilders.filterFacet("f", + FilterBuilders.termFilter("brand", "heineken")); // Your Filter here +-------------------------------------------------- + +See <> to +learn how to build filters using Java. + +[float] +===== Use facet response + +Import Facet definition classes: + +[source,java] +-------------------------------------------------- +import org.elasticsearch.search.facet.filter.*; +-------------------------------------------------- + +[source,java] +-------------------------------------------------- +// sr is here your SearchResponse object +FilterFacet f = (FilterFacet) sr.facets().facetsAsMap().get("f"); + +f.getCount(); // Number of docs that matched +-------------------------------------------------- + +[float] +==== Query Facet + +Here is how you can use +link:{ref}/search-facets-query-facet.html[Query Facet] +with Java API. 
+ +[float] +===== Prepare facet request + +Here is an example on how to create the facet request: + +[source,java] +-------------------------------------------------- +FacetBuilders.queryFacet("f", + QueryBuilders.matchQuery("brand", "heineken")); +-------------------------------------------------- + +[float] +===== Use facet response + +Import Facet definition classes: + +[source,java] +-------------------------------------------------- +import org.elasticsearch.search.facet.query.*; +-------------------------------------------------- + +[source,java] +-------------------------------------------------- +// sr is here your SearchResponse object +QueryFacet f = (QueryFacet) sr.facets().facetsAsMap().get("f"); + +f.getCount(); // Number of docs that matched +-------------------------------------------------- + +See <> to +learn how to build queries using Java. + +[float] +==== Statistical + +Here is how you can use +link:{ref}/search-facets-statistical-facet.html[Statistical +Facet] with Java API. + +[float] +===== Prepare facet request + +Here is an example on how to create the facet request: + +[source,java] +-------------------------------------------------- +FacetBuilders.statisticalFacet("f") + .field("price"); +-------------------------------------------------- + +[float] +===== Use facet response + +Import Facet definition classes: + +[source,java] +-------------------------------------------------- +import org.elasticsearch.search.facet.statistical.*; +-------------------------------------------------- + +[source,java] +-------------------------------------------------- +// sr is here your SearchResponse object +StatisticalFacet f = (StatisticalFacet) sr.facets().facetsAsMap().get("f"); + +f.getCount(); // Doc count +f.getMin(); // Min value +f.getMax(); // Max value +f.getMean(); // Mean +f.getTotal(); // Sum of values +f.getStdDeviation(); // Standard Deviation +f.getSumOfSquares(); // Sum of Squares +f.getVariance(); // Variance +-------------------------------------------------- + +[float] +==== Terms Stats Facet + +Here is how you can use +link:{ref}/search-facets-terms-stats-facet.html[Terms +Stats Facet] with Java API. + +[float] +===== Prepare facet request + +Here is an example on how to create the facet request: + +[source,java] +-------------------------------------------------- +FacetBuilders.termsStatsFacet("f") + .keyField("brand") + .valueField("price"); +-------------------------------------------------- + +[float] +===== Use facet response + +Import Facet definition classes: + +[source,java] +-------------------------------------------------- +import org.elasticsearch.search.facet.termsstats.*; +-------------------------------------------------- + +[source,java] +-------------------------------------------------- +// sr is here your SearchResponse object +TermsStatsFacet f = (TermsStatsFacet) sr.facets().facetsAsMap().get("f"); +f.getTotalCount(); // Total terms doc count +f.getOtherCount(); // Not shown terms doc count +f.getMissingCount(); // Without term doc count + +// For each entry +for (TermsStatsFacet.Entry entry : f) { + entry.getTerm(); // Term + entry.getCount(); // Doc count + entry.getMin(); // Min value + entry.getMax(); // Max value + entry.getMean(); // Mean + entry.getTotal(); // Sum of values +} +-------------------------------------------------- + +[float] +==== Geo Distance Facet + +Here is how you can use +link:{ref}/search-facets-geo-distance-facet.html[Geo +Distance Facet] with Java API. 
+ +[float] +===== Prepare facet request + +Here is an example on how to create the facet request: + +[source,java] +-------------------------------------------------- +FacetBuilders.geoDistanceFacet("f") + .field("pin.location") // Field containing coordinates we want to compare with + .point(40, -70) // Point from where we start (0) + .addUnboundedFrom(10) // 0 to 10 km (excluded) + .addRange(10, 20) // 10 to 20 km (excluded) + .addRange(20, 100) // 20 to 100 km (excluded) + .addUnboundedTo(100) // from 100 km to infinity (and beyond ;-) ) + .unit(DistanceUnit.KILOMETERS); // All distances are in kilometers. Can be MILES +-------------------------------------------------- + +[float] +===== Use facet response + +Import Facet definition classes: + +[source,java] +-------------------------------------------------- +import org.elasticsearch.search.facet.geodistance.*; +-------------------------------------------------- + +[source,java] +-------------------------------------------------- +// sr is here your SearchResponse object +GeoDistanceFacet f = (GeoDistanceFacet) sr.facets().facetsAsMap().get("f"); + +// For each entry +for (GeoDistanceFacet.Entry entry : f) { + entry.getFrom(); // Distance from requested + entry.getTo(); // Distance to requested + entry.getCount(); // Doc count + entry.getMin(); // Min value + entry.getMax(); // Max value + entry.getTotal(); // Sum of values + entry.getMean(); // Mean +} +-------------------------------------------------- + +[float] +=== Facet filters (not Filter Facet) + +By default, facets are applied on the query resultset whatever filters +exists or are. + +If you need to compute facets with the same filters or even with other +filters, you can add the filter to any facet using +`AbstractFacetBuilder#facetFilter(FilterBuilder)` method: + +[source,java] +-------------------------------------------------- +FacetBuilders + .termsFacet("f").field("brand") // Your facet + .facetFilter( // Your filter here + FilterBuilders.termFilter("colour", "pale") + ); +-------------------------------------------------- + +For example, you can reuse the same filter you created for your query: + +[source,java] +-------------------------------------------------- +// A common filter +FilterBuilder filter = FilterBuilders.termFilter("colour", "pale"); + +TermsFacetBuilder facet = FacetBuilders.termsFacet("f") + .field("brand") + .facetFilter(filter); // We apply it to the facet + +SearchResponse sr = node.client().prepareSearch() + .setQuery(QueryBuilders.matchAllQuery()) + .setFilter(filter) // We apply it to the query + .addFacet(facet) + .execute().actionGet(); +-------------------------------------------------- + +See documentation on how to build +<>. + +[float] +=== Scope + +By default, facets are computed within the query resultset. But, you can +compute facets from all documents in the index whatever the query is, +using `global` parameter: + +[source,java] +-------------------------------------------------- +TermsFacetBuilder facet = FacetBuilders.termsFacet("f") + .field("brand") + .global(true); +-------------------------------------------------- diff --git a/docs/java-api/get.asciidoc b/docs/java-api/get.asciidoc new file mode 100644 index 00000000000..d83c12f2480 --- /dev/null +++ b/docs/java-api/get.asciidoc @@ -0,0 +1,38 @@ +[[get]] +== Get API + +The get API allows to get a typed JSON document from the index based on +its id. 
The following example gets a JSON document from an index called +twitter, under a type called tweet, with id valued 1: + +[source,java] +-------------------------------------------------- +GetResponse response = client.prepareGet("twitter", "tweet", "1") + .execute() + .actionGet(); +-------------------------------------------------- + +For more information on the index operation, check out the REST +link:{ref}/docs-get.html[get] docs. + +[float] +=== Operation Threading + +The get API allows to set the threading model the operation will be +performed when the actual execution of the API is performed on the same +node (the API is executed on a shard that is allocated on the same +server). + +The options are to execute the operation on a different thread, or to +execute it on the calling thread (note that the API is still async). By +default, `operationThreaded` is set to `true` which means the operation +is executed on a different thread. Here is an example that sets it to +`false`: + +[source,java] +-------------------------------------------------- +GetResponse response = client.prepareGet("twitter", "tweet", "1") + .setOperationThreaded(false) + .execute() + .actionGet(); +-------------------------------------------------- diff --git a/docs/java-api/index.asciidoc b/docs/java-api/index.asciidoc new file mode 100644 index 00000000000..c2e10170e14 --- /dev/null +++ b/docs/java-api/index.asciidoc @@ -0,0 +1,61 @@ +[[java-api]] += Java API +:ref: http://www.elasticsearch.org/guide/elasticsearch/reference/current + +[preface] +== Preface +This section describes the Java API that elasticsearch provides. All +elasticsearch operations are executed using a +<> object. All +operations are completely asynchronous in nature (either accepts a +listener, or return a future). + +Additionally, operations on a client may be accumulated and executed in +<>. + +Note, all the APIs are exposed through the +Java API (actually, the Java API is used internally to execute them). + +[float] +== Maven Repository + +Elasticsearch is hosted on +http://search.maven.org/#search%7Cga%7C1%7Ca%3A%22elasticsearch%22[Maven +Central]. + +For example, you can define the latest version in your `pom.xml` file: + +[source,xml] +-------------------------------------------------- + + org.elasticsearch + elasticsearch + ${es.version} + +-------------------------------------------------- + + +include::client.asciidoc[] + +include::index_.asciidoc[] + +include::get.asciidoc[] + +include::delete.asciidoc[] + +include::bulk.asciidoc[] + +include::search.asciidoc[] + +include::count.asciidoc[] + +include::delete-by-query.asciidoc[] + +include::facets.asciidoc[] + +include::percolate.asciidoc[] + +include::query-dsl-queries.asciidoc[] + +include::query-dsl-filters.asciidoc[] + diff --git a/docs/java-api/index_.asciidoc b/docs/java-api/index_.asciidoc new file mode 100644 index 00000000000..9b71ab8c92d --- /dev/null +++ b/docs/java-api/index_.asciidoc @@ -0,0 +1,201 @@ +[[index_]] +== Index API + +The index API allows one to index a typed JSON document into a specific +index and make it searchable. 
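+
+In its simplest form, indexing takes an index name, a type, an optional id
+and a source document; the snippet below is a minimal sketch (the
+twitter/tweet names and the inline JSON are illustrative), and the sections
+that follow cover how to build the source and the available options in more
+detail:
+
+[source,java]
+--------------------------------------------------
+IndexResponse response = client.prepareIndex("twitter", "tweet", "1")
+        .setSource("{\"user\":\"kimchy\",\"message\":\"trying out Elastic Search\"}")
+        .execute()
+        .actionGet();
+--------------------------------------------------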
+ +[float] +=== Generate JSON document + +There are different way of generating JSON document: + +* Manually (aka do it yourself) using native `byte[]` or as a `String` + +* Using `Map` that will be automatically converted to its JSON +equivalent + +* Using a third party library to serialize your beans such as +http://wiki.fasterxml.com/JacksonHome[Jackson] + +* Using built-in helpers XContentFactory.jsonBuilder() + +Internally, each type is converted to `byte[]` (so a String is converted +to a `byte[]`). Therefore, if the object is in this form already, then +use it. The `jsonBuilder` is highly optimized JSON generator that +directly constructs a `byte[]`. + +[float] +==== Do It Yourself + +Nothing really difficult here but note that you will have to encode +dates regarding to the +link:{ref}/mapping-date-format.html[Date Format]. + +[source,java] +-------------------------------------------------- +String json = "{" + + "\"user\":\"kimchy\"," + + "\"postDate\":\"2013-01-30\"," + + "\"message\":\"trying out Elastic Search\"," + + "}"; +-------------------------------------------------- + +[float] +==== Using Map + +Map is a key:values pair collection. It represents very well a JSON +structure: + +[source,java] +-------------------------------------------------- +Map json = new HashMap(); +json.put("user","kimchy"); +json.put("postDate",new Date()); +json.put("message","trying out Elastic Search"); +-------------------------------------------------- + +[float] +==== Serialize your beans + +Elasticsearch already use Jackson but shade it under +`org.elasticsearch.common.jackson` package. + + So, you can add your own Jackson version in your `pom.xml` file or in +your classpath. See http://wiki.fasterxml.com/JacksonDownload[Jackson +Download Page]. + +For example: + +[source,java] +-------------------------------------------------- + + com.fasterxml.jackson.core + jackson-databind + 2.1.3 + +-------------------------------------------------- + +Then, you can start serializing your beans to JSON: + +[source,java] +-------------------------------------------------- +import com.fasterxml.jackson.databind.*; + +// instance a json mapper +ObjectMapper mapper = new ObjectMapper(); // create once, reuse + +// generate json +String json = mapper.writeValueAsString(yourbeaninstance); +-------------------------------------------------- + +[float] +==== Use Elasticsearch helpers + +Elasticsearch provides built-in helpers to generate JSON content. + +[source,java] +-------------------------------------------------- +import static org.elasticsearch.common.xcontent.XContentFactory.*; + +XContentBuilder builder = jsonBuilder() + .startObject() + .field("user", "kimchy") + .field("postDate", new Date()) + .field("message", "trying out Elastic Search") + .endObject() +-------------------------------------------------- + +Note that you can also add arrays with `startArray(String)` and +`endArray()` methods. By the way, `field` method + + accept many object types. You can pass directly numbers, dates and even +other XContentBuilder objects. + +If you need to see the generated JSON content, you can use the +@string()@method. 
+ +[source,java] +-------------------------------------------------- +String json = builder.string(); +-------------------------------------------------- + +[float] +=== Index document + +The following example indexes a JSON document into an index called +twitter, under a type called tweet, with id valued 1: + +[source,java] +-------------------------------------------------- +import static org.elasticsearch.common.xcontent.XContentFactory.*; + +IndexResponse response = client.prepareIndex("twitter", "tweet", "1") + .setSource(jsonBuilder() + .startObject() + .field("user", "kimchy") + .field("postDate", new Date()) + .field("message", "trying out Elastic Search") + .endObject() + ) + .execute() + .actionGet(); +-------------------------------------------------- + +Note that you can also index your documents as JSON String and that you +don't have to give an ID: + +[source,java] +-------------------------------------------------- +String json = "{" + + "\"user\":\"kimchy\"," + + "\"postDate\":\"2013-01-30\"," + + "\"message\":\"trying out Elastic Search\"," + + "}"; + +IndexResponse response = client.prepareIndex("twitter", "tweet") + .setSource(json) + .execute() + .actionGet(); +-------------------------------------------------- + +`IndexResponse` object will give you report: + +[source,java] +-------------------------------------------------- +// Index name +String _index = response.index(); +// Type name +String _type = response.type(); +// Document ID (generated or not) +String _id = response.id(); +// Version (if it's the first time you index this document, you will get: 1) +long _version = response.version(); +-------------------------------------------------- + +If you use percolation while indexing, `IndexResponse` object will give +you percolator that have matched: + +[source,java] +-------------------------------------------------- +IndexResponse response = client.prepareIndex("twitter", "tweet", "1") + .setSource(json) + .setPercolate("*") + .execute() + .actionGet(); + +List matches = response.matches(); +-------------------------------------------------- + +For more information on the index operation, check out the REST +link:{ref}/docs-index_.html[index] docs. + +[float] +=== Operation Threading + +The index API allows to set the threading model the operation will be +performed when the actual execution of the API is performed on the same +node (the API is executed on a shard that is allocated on the same +server). + +The options are to execute the operation on a different thread, or to +execute it on the calling thread (note that the API is still async). By +default, `operationThreaded` is set to `true` which means the operation +is executed on a different thread. diff --git a/docs/java-api/percolate.asciidoc b/docs/java-api/percolate.asciidoc new file mode 100644 index 00000000000..f893674c656 --- /dev/null +++ b/docs/java-api/percolate.asciidoc @@ -0,0 +1,48 @@ +[[percolate]] +== Percolate API + +The percolator allows to register queries against an index, and then +send `percolate` requests which include a doc, and getting back the +queries that match on that doc out of the set of registered queries. + +Read the main {ref}/search-percolate.html[percolate] +documentation before reading this guide. 
+ +[source,java] +-------------------------------------------------- +//This is the query we're registering in the percolator +QueryBuilder qb = termQuery("content", "amazing"); + +//Index the query = register it in the percolator +client.prepareIndex("_percolator", "myIndexName", "myDesignatedQueryName") + .setSource(jsonBuilder() + .startObject() + .field("query", qb) // Register the query + .endObject()) + .setRefresh(true) // Needed when the query shall be available immediately + .execute().actionGet(); +-------------------------------------------------- + +This indexes the above term query under the name +*myDesignatedQueryName*. + +In order to check a document against the registered queries, use this +code: + +[source,java] +-------------------------------------------------- +//Build a document to check against the percolator +XContentBuilder docBuilder = XContentFactory.jsonBuilder().startObject(); +docBuilder.field("doc").startObject(); //This is needed to designate the document +docBuilder.field("content", "This is amazing!"); +docBuilder.endObject(); //End of the doc field +docBuilder.endObject(); //End of the JSON root object +//Percolate +PercolateResponse response = + client.preparePercolate("myIndexName", "myDocumentType").setSource(docBuilder).execute().actionGet(); +//Iterate over the results +for(String result : response) { + //Handle the result which is the name of + //the query in the percolator +} +-------------------------------------------------- diff --git a/docs/java-api/query-dsl-filters.asciidoc b/docs/java-api/query-dsl-filters.asciidoc new file mode 100644 index 00000000000..5b40f07dac4 --- /dev/null +++ b/docs/java-api/query-dsl-filters.asciidoc @@ -0,0 +1,459 @@ +[[query-dsl-filters]] +== Query DSL - Filters + +elasticsearch provides a full Java query dsl in a similar manner to the +REST link:{ref}/query-dsl.html[Query DSL]. The factory for filter +builders is `FilterBuilders`. + +Once your query is ready, you can use the <>. + +See also how to build <>. + +To use `FilterBuilders` just import them in your class: + +[source,java] +-------------------------------------------------- +import org.elasticsearch.index.query.FilterBuilders.*; +-------------------------------------------------- + +Note that you can easily print (aka debug) JSON generated queries using +`toString()` method on `FilterBuilder` object. + +[float] +=== And Filter + +See link:{ref}/query-dsl-and-filter.html[And Filter] + + +[source,java] +-------------------------------------------------- +FilterBuilders.andFilter( + FilterBuilders.rangeFilter("postDate").from("2010-03-01").to("2010-04-01"), + FilterBuilders.prefixFilter("name.second", "ba") + ); +-------------------------------------------------- + +Note that you can cache the result using +`AndFilterBuilder#cache(boolean)` method. See <>. + +[float] +=== Bool Filter + +See link:{ref}/query-dsl-bool-filter.html[Bool Filter] + + +[source,java] +-------------------------------------------------- +FilterBuilders.boolFilter() + .must(FilterBuilders.termFilter("tag", "wow")) + .mustNot(FilterBuilders.rangeFilter("age").from("10").to("20")) + .should(FilterBuilders.termFilter("tag", "sometag")) + .should(FilterBuilders.termFilter("tag", "sometagtag")); +-------------------------------------------------- + +Note that you can cache the result using +`BoolFilterBuilder#cache(boolean)` method. See <>. + +[float] +=== Exists Filter + +See link:{ref}/query-dsl-exists-filter.html[Exists Filter]. 
+ + +[source,java] +-------------------------------------------------- +FilterBuilders.existsFilter("user"); +-------------------------------------------------- + +[float] +=== Ids Filter + +See link:{ref}/query-dsl-ids-filter.html[IDs Filter] + + +[source,java] +-------------------------------------------------- +FilterBuilders.idsFilter("my_type", "type2").addIds("1", "4", "100"); + +// Type is optional +FilterBuilders.idsFilter().addIds("1", "4", "100"); +-------------------------------------------------- + +[float] +=== Limit Filter + +See link:{ref}/query-dsl-limit-filter.html[Limit Filter] + + +[source,java] +-------------------------------------------------- +FilterBuilders.limitFilter(100); +-------------------------------------------------- + +[float] +=== Type Filter + +See link:{ref}/query-dsl-type-filter.html[Type Filter] + + +[source,java] +-------------------------------------------------- +FilterBuilders.typeFilter("my_type"); +-------------------------------------------------- + +[float] +=== Geo Bounding Box Filter + +See link:{ref}/query-dsl-geo-bounding-box-filter.html[Geo +Bounding Box Filter] + +[source,java] +-------------------------------------------------- +FilterBuilders.geoBoundingBoxFilter("pin.location") + .topLeft(40.73, -74.1) + .bottomRight(40.717, -73.99); +-------------------------------------------------- + +Note that you can cache the result using +`GeoBoundingBoxFilterBuilder#cache(boolean)` method. See +<>. + +[float] +=== GeoDistance Filter + +See link:{ref}/query-dsl-geo-distance-filter.html[Geo +Distance Filter] + +[source,java] +-------------------------------------------------- +FilterBuilders.geoDistanceFilter("pin.location") + .point(40, -70) + .distance(200, DistanceUnit.KILOMETERS) + .optimizeBbox("memory") // Can be also "indexed" or "none" + .geoDistance(GeoDistance.ARC); // Or GeoDistance.PLANE +-------------------------------------------------- + +Note that you can cache the result using +`GeoDistanceFilterBuilder#cache(boolean)` method. See +<>. + +[float] +=== Geo Distance Range Filter + +See link:{ref}/query-dsl-geo-distance-range-filter.html[Geo +Distance Range Filter] + +[source,java] +-------------------------------------------------- +FilterBuilders.geoDistanceRangeFilter("pin.location") + .point(40, -70) + .from("200km") + .to("400km") + .includeLower(true) + .includeUpper(false) + .optimizeBbox("memory") // Can be also "indexed" or "none" + .geoDistance(GeoDistance.ARC); // Or GeoDistance.PLANE +-------------------------------------------------- + +Note that you can cache the result using +`GeoDistanceRangeFilterBuilder#cache(boolean)` method. See +<>. + +[float] +=== Geo Polygon Filter + +See link:{ref}/query-dsl-geo-polygon-filter.html[Geo Polygon +Filter] + +[source,java] +-------------------------------------------------- +FilterBuilders.geoPolygonFilter("pin.location") + .addPoint(40, -70) + .addPoint(30, -80) + .addPoint(20, -90); +-------------------------------------------------- + +Note that you can cache the result using +`GeoPolygonFilterBuilder#cache(boolean)` method. See +<>. + +[float] +=== Geo Shape Filter + +See link:{ref}/query-dsl-geo-shape-filter.html[Geo Shape +Filter] + +Note: the `geo_shape` type uses `Spatial4J` and `JTS`, both of which are +optional dependencies. 
Consequently you must add `Spatial4J` and `JTS`
+to your classpath in order to use this type:
+
+[source,xml]
+-----------------------------------------------
+<dependency>
+    <groupId>com.spatial4j</groupId>
+    <artifactId>spatial4j</artifactId>
+    <version>0.3</version>
+</dependency>
+
+<dependency>
+    <groupId>com.vividsolutions</groupId>
+    <artifactId>jts</artifactId>
+    <version>1.12</version>
+    <exclusions>
+        <exclusion>
+            <groupId>xerces</groupId>
+            <artifactId>xercesImpl</artifactId>
+        </exclusion>
+    </exclusions>
+</dependency>
+-----------------------------------------------
+
+[source,java]
+--------------------------------------------------
+// Import Spatial4J shapes
+import com.spatial4j.core.context.SpatialContext;
+import com.spatial4j.core.shape.Shape;
+import com.spatial4j.core.shape.impl.PointImpl;
+import com.spatial4j.core.shape.impl.RectangleImpl;
+
+// Also import ShapeRelation
+import org.elasticsearch.common.geo.ShapeRelation;
+--------------------------------------------------
+
+[source,java]
+--------------------------------------------------
+// Shape within another
+FilterBuilder filter = FilterBuilders.geoShapeFilter("location",
+    new RectangleImpl(0,10,0,10,SpatialContext.GEO))
+    .relation(ShapeRelation.WITHIN);
+
+// Intersect shapes
+filter = FilterBuilders.geoShapeFilter("location",
+    new PointImpl(0, 0, SpatialContext.GEO))
+    .relation(ShapeRelation.INTERSECTS);
+
+// Using pre-indexed shapes
+filter = FilterBuilders.geoShapeFilter("location", "New Zealand", "countries")
+    .relation(ShapeRelation.DISJOINT);
+--------------------------------------------------
+
+[float]
+=== Has Child / Has Parent Filters
+
+See:
+ * link:{ref}/query-dsl-has-child-filter.html[Has Child Filter]
+ * link:{ref}/query-dsl-has-parent-filter.html[Has Parent Filter]
+
+[source,java]
+--------------------------------------------------
+// Has Child
+FilterBuilders.hasChildFilter("blog_tag",
+    QueryBuilders.termQuery("tag", "something"));
+
+// Has Parent
+FilterBuilders.hasParentFilter("blog",
+    QueryBuilders.termQuery("tag", "something"));
+--------------------------------------------------
+
+[float]
+=== Match All Filter
+
+See link:{ref}/query-dsl-match-all-filter.html[Match All Filter]
+
+[source,java]
+--------------------------------------------------
+FilterBuilders.matchAllFilter();
+--------------------------------------------------
+
+[float]
+=== Missing Filter
+
+See link:{ref}/query-dsl-missing-filter.html[Missing Filter]
+
+[source,java]
+--------------------------------------------------
+FilterBuilders.missingFilter("user")
+    .existence(true)
+    .nullValue(true);
+--------------------------------------------------
+
+[float]
+=== Not Filter
+
+See link:{ref}/query-dsl-not-filter.html[Not Filter]
+
+[source,java]
+--------------------------------------------------
+FilterBuilders.notFilter(
+    FilterBuilders.rangeFilter("price").from("1").to("2"));
+--------------------------------------------------
+
+[float]
+=== Numeric Range Filter
+
+See link:{ref}/query-dsl-numeric-range-filter.html[Numeric
+Range Filter]
+
+[source,java]
+--------------------------------------------------
+FilterBuilders.numericRangeFilter("age")
+    .from(10)
+    .to(20)
+    .includeLower(true)
+    .includeUpper(false);
+--------------------------------------------------
+
+Note that you can cache the result using the
+`NumericRangeFilterBuilder#cache(boolean)` method. See
+<<query-dsl-filters-caching,Caching>>.
+
+[float]
+=== Or Filter
+
+See link:{ref}/query-dsl-or-filter.html[Or Filter]
+
+[source,java]
+--------------------------------------------------
+FilterBuilders.orFilter(
+        FilterBuilders.termFilter("name.second", "banon"),
+        FilterBuilders.termFilter("name.nick", "kimchy")
+    );
+--------------------------------------------------
+
+Note that you can cache the result using the
+`OrFilterBuilder#cache(boolean)` method. See
+<<query-dsl-filters-caching,Caching>>.
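+
+As an aside, once built, any of these filters can be passed to a search
+request. The following sketch (the index and field names are illustrative
+only, not part of the reference above) shows the `or` filter from the
+previous example marked as cacheable and attached to a search via
+`setFilter`:
+
+[source,java]
+--------------------------------------------------
+// Build the filter and mark it as cacheable
+FilterBuilder filter = FilterBuilders.orFilter(
+        FilterBuilders.termFilter("name.second", "banon"),
+        FilterBuilders.termFilter("name.nick", "kimchy")
+    )
+    .cache(true);
+
+// Use it to filter a match_all search on a hypothetical "users" index
+SearchResponse response = client.prepareSearch("users")
+    .setQuery(QueryBuilders.matchAllQuery())
+    .setFilter(filter)
+    .execute()
+    .actionGet();
+--------------------------------------------------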
+ +[float] +=== Prefix Filter + +See link:{ref}/query-dsl-prefix-filter.html[Prefix Filter] + + +[source,java] +-------------------------------------------------- +FilterBuilders.prefixFilter("user", "ki"); +-------------------------------------------------- + +Note that you can cache the result using +`PrefixFilterBuilder#cache(boolean)` method. See <>. + +[float] +=== Query Filter + +See link:{ref}/query-dsl-query-filter.html[Query Filter] + + +[source,java] +-------------------------------------------------- +FilterBuilders.queryFilter( + QueryBuilders.queryString("this AND that OR thus") + ); +-------------------------------------------------- + +Note that you can cache the result using +`QueryFilterBuilder#cache(boolean)` method. See <>. + +[float] +=== Range Filter + +See link:{ref}/query-dsl-range-filter.html[Range Filter] + + +[source,java] +-------------------------------------------------- +FilterBuilders.rangeFilter("age") + .from("10") + .to("20") + .includeLower(true) + .includeUpper(false); + +// A simplified form using gte, gt, lt or lte +FilterBuilders.rangeFilter("age") + .gte("10") + .lt("20"); +-------------------------------------------------- + +Note that you can ask not to cache the result using +`RangeFilterBuilder#cache(boolean)` method. See <>. + +[float] +=== Script Filter + +See link:{ref}/query-dsl-script-filter.html[Script Filter] + + +[source,java] +-------------------------------------------------- +FilterBuilder filter = FilterBuilders.scriptFilter( + "doc['age'].value > param1" + ).addParam("param1", 10); +-------------------------------------------------- + +Note that you can cache the result using +`ScriptFilterBuilder#cache(boolean)` method. See <>. + +[float] +=== Term Filter + +See link:{ref}/query-dsl-term-filter.html[Term Filter] + + +[source,java] +-------------------------------------------------- +FilterBuilders.termFilter("user", "kimchy"); +-------------------------------------------------- + +Note that you can ask not to cache the result using +`TermFilterBuilder#cache(boolean)` method. See <>. + +[float] +=== Terms Filter + +See link:{ref}/query-dsl-terms-filter.html[Terms Filter] + + +[source,java] +-------------------------------------------------- +FilterBuilders.termsFilter("user", "kimchy", "elasticsearch") + .execution("plain"); // Optional, can be also "bool", "and" or "or" + // or "bool_nocache", "and_nocache" or "or_nocache" +-------------------------------------------------- + +Note that you can ask not to cache the result using +`TermsFilterBuilder#cache(boolean)` method. See <>. + +[float] +=== Nested Filter + +See link:{ref}/query-dsl-nested-filter.html[Nested Filter] + + +[source,java] +-------------------------------------------------- +FilterBuilders.nestedFilter("obj1", + QueryBuilders.boolQuery() + .must(QueryBuilders.matchQuery("obj1.name", "blue")) + .must(QueryBuilders.rangeQuery("obj1.count").gt(5)) + ); +-------------------------------------------------- + +Note that you can ask not to cache the result using +`NestedFilterBuilder#cache(boolean)` method. See <>. + +[[query-dsl-filters-caching]] +[float] +=== Caching + +By default, some filters are cached or not cached. You can have a fine +tuning control using `cache(boolean)` method when exists. 
For example: + +[source,java] +-------------------------------------------------- +FilterBuilder filter = FilterBuilders.andFilter( + FilterBuilders.rangeFilter("postDate").from("2010-03-01").to("2010-04-01"), + FilterBuilders.prefixFilter("name.second", "ba") + ) + .cache(true); +-------------------------------------------------- diff --git a/docs/java-api/query-dsl-queries.asciidoc b/docs/java-api/query-dsl-queries.asciidoc new file mode 100644 index 00000000000..d5010f2e06e --- /dev/null +++ b/docs/java-api/query-dsl-queries.asciidoc @@ -0,0 +1,489 @@ +[[query-dsl-queries]] +== Query DSL - Queries + +elasticsearch provides a full Java query dsl in a similar manner to the +REST link:{ref}/query-dsl.html[Query DSL]. The factory for query +builders is `QueryBuilders`. Once your query is ready, you can use the +<>. + +See also how to build <> + +To use `QueryBuilders` just import them in your class: + +[source,java] +-------------------------------------------------- +import org.elasticsearch.index.query.QueryBuilders.*; +-------------------------------------------------- + +Note that you can easily print (aka debug) JSON generated queries using +`toString()` method on `QueryBuilder` object. + +The `QueryBuilder` can then be used with any API that accepts a query, +such as `count` and `search`. + +[float] +=== Match Query + +See link:{ref}/query-dsl-match-query.html[Match Query] + + +[source,java] +-------------------------------------------------- +QueryBuilder qb = QueryBuilders.matchQuery("name", "kimchy elasticsearch"); +-------------------------------------------------- + +[float] +=== MultiMatch Query + +See link:{ref}/query-dsl-multi-match-query.html[MultiMatch +Query] + +[source,java] +-------------------------------------------------- +QueryBuilder qb = QueryBuilders.multiMatchQuery( + "kimchy elasticsearch", // Text you are looking for + "user", "message" // Fields you query on + ); +-------------------------------------------------- + +[float] +=== Boolean Query + +See link:{ref}/query-dsl-bool-query.html[Boolean Query] + + +[source,java] +-------------------------------------------------- +QueryBuilder qb = QueryBuilders + .boolQuery() + .must(termQuery("content", "test1")) + .must(termQuery("content", "test4")) + .mustNot(termQuery("content", "test2")) + .should(termQuery("content", "test3")); +-------------------------------------------------- + +[float] +=== Boosting Query + +See link:{ref}/query-dsl-boosting-query.html[Boosting Query] + + +[source,java] +-------------------------------------------------- +QueryBuilders.boostingQuery() + .positive(QueryBuilders.termQuery("name","kimchy")) + .negative(QueryBuilders.termQuery("name","dadoonet")) + .negativeBoost(0.2f); +-------------------------------------------------- + +[float] +=== IDs Query + +See link:{ref}/query-dsl-ids-query.html[IDs Query] + + +[source,java] +-------------------------------------------------- +QueryBuilders.idsQuery().ids("1", "2"); +-------------------------------------------------- + +[float] +=== Custom Score Query + +See link:{ref}/query-dsl-custom-score-query.html[Custom Score +Query] + +[source,java] +-------------------------------------------------- +QueryBuilders.customScoreQuery(QueryBuilders.matchAllQuery()) // Your query here + .script("_score * doc['price'].value"); // Your script here + +// If the script have parameters, use the same script and provide parameters to it. 
+QueryBuilders.customScoreQuery(QueryBuilders.matchAllQuery()) + .script("_score * doc['price'].value / pow(param1, param2)") + .param("param1", 2) + .param("param2", 3.1); +-------------------------------------------------- + +[float] +=== Custom Boost Factor Query + +See +link:{ref}/query-dsl-custom-boost-factor-query.html[Custom +Boost Factor Query] + +[source,java] +-------------------------------------------------- +QueryBuilders.customBoostFactorQuery(QueryBuilders.matchAllQuery()) // Your query + .boostFactor(3.1f); +-------------------------------------------------- + +[float] +=== Constant Score Query + +See link:{ref}/query-dsl-constant-score-query.html[Constant +Score Query] + +[source,java] +-------------------------------------------------- +// Using with Filters +QueryBuilders.constantScoreQuery(FilterBuilders.termFilter("name","kimchy")) + .boost(2.0f); + +// With Queries +QueryBuilders.constantScoreQuery(QueryBuilders.termQuery("name","kimchy")) + .boost(2.0f); +-------------------------------------------------- + +[float] +=== Disjunction Max Query + +See link:{ref}/query-dsl-dis-max-query.html[Disjunction Max +Query] + +[source,java] +-------------------------------------------------- +QueryBuilders.disMaxQuery() + .add(QueryBuilders.termQuery("name","kimchy")) // Your queries + .add(QueryBuilders.termQuery("name","elasticsearch")) // Your queries + .boost(1.2f) + .tieBreaker(0.7f); +-------------------------------------------------- + +[float] +=== Field Query + +See link:{ref}/query-dsl-field-query.html[Field Query] + + +[source,java] +-------------------------------------------------- +QueryBuilders.fieldQuery("name", "+kimchy -dadoonet"); + +// Note that you can write the same query using queryString query. +QueryBuilders.queryString("+kimchy -dadoonet").field("name"); +-------------------------------------------------- + +[float] +=== Fuzzy Like This (Field) Query (flt and flt_field) + +See: + * link:{ref}/query-dsl-flt-query.html[Fuzzy Like This Query] + * link:{ref}/query-dsl-flt-field-query.html[Fuzzy Like This Field Query] + +[source,java] +-------------------------------------------------- +// flt Query +QueryBuilders.fuzzyLikeThisQuery("name.first", "name.last") // Fields + .likeText("text like this one") // Text + .maxQueryTerms(12); // Max num of Terms + // in generated queries + +// flt_field Query +QueryBuilders.fuzzyLikeThisFieldQuery("name.first") // Only on single field + .likeText("text like this one") + .maxQueryTerms(12); +-------------------------------------------------- + +[float] +=== FuzzyQuery + +See link:{ref}/query-dsl-fuzzy-query.html[Fuzzy Query] + + +[source,java] +-------------------------------------------------- +QueryBuilder qb = QueryBuilders.fuzzyQuery("name", "kimzhy"); +-------------------------------------------------- + +[float] +=== Has Child / Has Parent + +See: + * link:{ref}/query-dsl-has-child-query.html[Has Child Query] + * link:{ref}/query-dsl-has-parent-query.html[Has Parent] + +[source,java] +-------------------------------------------------- +// Has Child +QueryBuilders.hasChildQuery("blog_tag", + QueryBuilders.termQuery("tag","something")) + +// Has Parent +QueryBuilders.hasParentQuery("blog", + QueryBuilders.termQuery("tag","something")); +-------------------------------------------------- + +[float] +=== MatchAll Query + +See link:{ref}/query-dsl-match-all-query.html[Match All +Query] + +[source,java] +-------------------------------------------------- +QueryBuilder qb = QueryBuilders.matchAllQuery(); 
+-------------------------------------------------- + +[float] +=== Fuzzy Like This (Field) Query (flt and flt_field) + +See: + * link:{ref}/query-dsl-mlt-query.html[More Like This Query] + * link:{ref}/query-dsl-mlt-field-query.html[More Like This Field Query] + +[source,java] +-------------------------------------------------- +// mlt Query +QueryBuilders.moreLikeThisQuery("name.first", "name.last") // Fields + .likeText("text like this one") // Text + .minTermFreq(1) // Ignore Threshold + .maxQueryTerms(12); // Max num of Terms + // in generated queries + +// mlt_field Query +QueryBuilders.moreLikeThisFieldQuery("name.first") // Only on single field + .likeText("text like this one") + .minTermFreq(1) + .maxQueryTerms(12); +-------------------------------------------------- + +[float] +=== Prefix Query + +See link:{ref}/query-dsl-prefix-query.html[Prefix Query] + +[source,java] +-------------------------------------------------- +QueryBuilders.prefixQuery("brand", "heine"); +-------------------------------------------------- + +[float] +=== QueryString Query + +See link:{ref}/query-dsl-query-string-query.html[QueryString Query] + +[source,java] +-------------------------------------------------- +QueryBuilder qb = QueryBuilders.queryString("+kimchy -elasticsearch"); +-------------------------------------------------- + +[float] +=== Range Query + +See link:{ref}/query-dsl-range-query.html[Range Query] + +[source,java] +-------------------------------------------------- +QueryBuilder qb = QueryBuilders + .rangeQuery("price") + .from(5) + .to(10) + .includeLower(true) + .includeUpper(false); +-------------------------------------------------- + +[float] +=== Span Queries (first, near, not, or, term) + +See: + * link:{ref}/query-dsl-span-first-query.html[Span First Query] + * link:{ref}/query-dsl-span-near-query.html[Span Near Query] + * link:{ref}/query-dsl-span-not-query.html[Span Not Query] + * link:{ref}/query-dsl-span-or-query.html[Span Or Query] + * link:{ref}/query-dsl-span-term-query.html[Span Term Query] + +[source,java] +-------------------------------------------------- +// Span First +QueryBuilders.spanFirstQuery( + QueryBuilders.spanTermQuery("user", "kimchy"), // Query + 3 // Max End position + ); + +// Span Near +QueryBuilders.spanNearQuery() + .clause(QueryBuilders.spanTermQuery("field","value1")) // Span Term Queries + .clause(QueryBuilders.spanTermQuery("field","value2")) + .clause(QueryBuilders.spanTermQuery("field","value3")) + .slop(12) // Slop factor + .inOrder(false) + .collectPayloads(false); + +// Span Not +QueryBuilders.spanNotQuery() + .include(QueryBuilders.spanTermQuery("field","value1")) + .exclude(QueryBuilders.spanTermQuery("field","value2")); + +// Span Or +QueryBuilders.spanOrQuery() + .clause(QueryBuilders.spanTermQuery("field","value1")) + .clause(QueryBuilders.spanTermQuery("field","value2")) + .clause(QueryBuilders.spanTermQuery("field","value3")); + +// Span Term +QueryBuilders.spanTermQuery("user","kimchy"); +-------------------------------------------------- + +[float] +=== Term Query + +See link:{ref}/query-dsl-term-query.html[Term Query] + +[source,java] +-------------------------------------------------- +QueryBuilder qb = QueryBuilders.termQuery("name", "kimchy"); +-------------------------------------------------- + +[float] +=== Terms Query + +See link:{ref}/query-dsl-terms-query.html[Terms Query] + +[source,java] +-------------------------------------------------- +QueryBuilders.termsQuery("tags", // field + "blue", "pill") // values + 
.minimumMatch(1); // How many terms must match +-------------------------------------------------- + +[float] +=== Top Children Query + +See link:{ref}/query-dsl-top-children-query.html[Top Children Query] + +[source,java] +-------------------------------------------------- +QueryBuilders.topChildrenQuery( + "blog_tag", // field + QueryBuilders.termQuery("tag", "something") // Query + ) + .score("max") // max, sum or avg + .factor(5) + .incrementalFactor(2); +-------------------------------------------------- + +[float] +=== Wildcard Query + +See link:{ref}/query-dsl-wildcard-query.html[Wildcard Query] + + +[source,java] +-------------------------------------------------- +QueryBuilders.wildcardQuery("user", "k?mc*"); +-------------------------------------------------- + +[float] +=== Nested Query + +See link:{ref}/query-dsl-nested-query.html[Nested Query] + + +[source,java] +-------------------------------------------------- +QueryBuilders.nestedQuery("obj1", // Path + QueryBuilders.boolQuery() // Your query + .must(QueryBuilders.matchQuery("obj1.name", "blue")) + .must(QueryBuilders.rangeQuery("obj1.count").gt(5)) + ) + .scoreMode("avg"); // max, total, avg or none +-------------------------------------------------- + +[float] +=== Custom Filters Score Query + +See +link:{ref}/query-dsl-custom-filters-score-query.html[Custom Filters Score Query] + +[source,java] +-------------------------------------------------- +QueryBuilders.customFiltersScoreQuery( + QueryBuilders.matchAllQuery()) // Query + // Filters with their boost factors + .add(FilterBuilders.rangeFilter("age").from(0).to(10), 3) + .add(FilterBuilders.rangeFilter("age").from(10).to(20), 2) + .scoreMode("first"); // first, min, max, total, avg or multiply +-------------------------------------------------- + +[float] +=== Indices Query + +See link:{ref}/query-dsl-indices-query.html[Indices Query] + + +[source,java] +-------------------------------------------------- +// Using another query when no match for the main one +QueryBuilders.indicesQuery( + QueryBuilders.termQuery("tag", "wow"), + "index1", "index2" + ) + .noMatchQuery(QueryBuilders.termQuery("tag", "kow")); + +// Using all (match all) or none (match no documents) +QueryBuilders.indicesQuery( + QueryBuilders.termQuery("tag", "wow"), + "index1", "index2" + ) + .noMatchQuery("all"); // all or none +-------------------------------------------------- + +[float] +=== GeoShape Query + +See link:{ref}/query-dsl-geo-shape-query.html[GeoShape Query] + + +Note: the `geo_shape` type uses `Spatial4J` and `JTS`, both of which are +optional dependencies. 
Consequently you must add `Spatial4J` and `JTS` +to your classpath in order to use this type: + +[source,java] +-------------------------------------------------- + + com.spatial4j + spatial4j + 0.3 + + + + com.vividsolutions + jts + 1.12 + + + xerces + xercesImpl + + + +-------------------------------------------------- + +[source,java] +-------------------------------------------------- +// Import Spatial4J shapes +import com.spatial4j.core.context.SpatialContext; +import com.spatial4j.core.shape.Shape; +import com.spatial4j.core.shape.impl.RectangleImpl; + +// Also import ShapeRelation +import org.elasticsearch.common.geo.ShapeRelation; +-------------------------------------------------- + +[source,java] +-------------------------------------------------- +// Shape within another +QueryBuilders.geoShapeQuery("location", + new RectangleImpl(0,10,0,10,SpatialContext.GEO)) + .relation(ShapeRelation.WITHIN); + +// Intersect shapes +QueryBuilders.geoShapeQuery("location", + new PointImpl(0, 0, SpatialContext.GEO)) + .relation(ShapeRelation.INTERSECTS); + +// Using pre-indexed shapes +QueryBuilders.geoShapeQuery("location", "New Zealand", "countries") + .relation(ShapeRelation.DISJOINT); +-------------------------------------------------- diff --git a/docs/java-api/search.asciidoc b/docs/java-api/search.asciidoc new file mode 100644 index 00000000000..110cac954f6 --- /dev/null +++ b/docs/java-api/search.asciidoc @@ -0,0 +1,137 @@ +[[search]] +== Search API + +The search API allows to execute a search query and get back search hits +that match the query. It can be executed across one or more indices and +across one or more types. The query can either be provided using the +<> or +the <>. +The body of the search request is built using the +`SearchSourceBuilder`. Here is an example: + +[source,java] +-------------------------------------------------- +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.action.search.SearchType; +import org.elasticsearch.index.query.FilterBuilders.*; +import org.elasticsearch.index.query.QueryBuilders.*; +-------------------------------------------------- + +[source,java] +-------------------------------------------------- +SearchResponse response = client.prepareSearch("index1", "index2") + .setTypes("type1", "type2") + .setSearchType(SearchType.DFS_QUERY_THEN_FETCH) + .setQuery(QueryBuilders.termQuery("multi", "test")) // Query + .setFilter(FilterBuilders.rangeFilter("age").from(12).to(18)) // Filter + .setFrom(0).setSize(60).setExplain(true) + .execute() + .actionGet(); +-------------------------------------------------- + +Note that all parameters are optional. Here is the smallest search call +you can write: + +[source,java] +-------------------------------------------------- +// MatchAll on the whole cluster with all default options +SearchResponse response = client.prepareSearch().execute().actionGet(); +-------------------------------------------------- + +For more information on the search operation, check out the REST +link:{ref}/search.html[search] docs. + +[float] +=== Using scrolls in Java + +Read the link:{ref}/search-request-scroll.html[scroll documentation] +first! 
+ +[source,java] +-------------------------------------------------- +import static org.elasticsearch.index.query.FilterBuilders.*; +import static org.elasticsearch.index.query.QueryBuilders.*; + +QueryBuilder qb = termQuery("multi", "test"); + +SearchResponse scrollResp = client.prepareSearch(test) + .setSearchType(SearchType.SCAN) + .setScroll(new TimeValue(60000)) + .setQuery(qb) + .setSize(100).execute().actionGet(); //100 hits per shard will be returned for each scroll +//Scroll until no hits are returned +while (true) { + scrollResp = client.prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet(); + for (SearchHit hit : scrollResp.getHits()) { + //Handle the hit... + } + //Break condition: No hits are returned + if (scrollResp.hits().hits().length == 0) { + break; + } +} +-------------------------------------------------- + +[float] +=== Operation Threading + +The search API allows to set the threading model the operation will be +performed when the actual execution of the API is performed on the same +node (the API is executed on a shard that is allocated on the same +server). + +There are three threading modes.The `NO_THREADS` mode means that the +search operation will be executed on the calling thread. The +`SINGLE_THREAD` mode means that the search operation will be executed on +a single different thread for all local shards. The `THREAD_PER_SHARD` +mode means that the search operation will be executed on a different +thread for each local shard. + +The default mode is `SINGLE_THREAD`. + +[float] +=== MultiSearch API + +See link:{ref}/search-multi-search.html[MultiSearch API Query] +documentation + +[source,java] +-------------------------------------------------- +SearchRequestBuilder srb1 = node.client() + .prepareSearch().setQuery(QueryBuilders.queryString("elasticsearch")).setSize(1); +SearchRequestBuilder srb2 = node.client() + .prepareSearch().setQuery(QueryBuilders.matchQuery("name", "kimchy")).setSize(1); + +MultiSearchResponse sr = node.client().prepareMultiSearch() + .add(srb1) + .add(srb2) + .execute().actionGet(); + +// You will get all individual responses from MultiSearchResponse#responses() +long nbHits = 0; +for (MultiSearchResponse.Item item : sr.responses()) { + SearchResponse response = item.response(); + nbHits += response.hits().totalHits(); +} +-------------------------------------------------- + +[float] +=== Using Facets + +The following code shows how to add two facets within your search: + +[source,java] +-------------------------------------------------- +SearchResponse sr = node.client().prepareSearch() + .setQuery(QueryBuilders.matchAllQuery()) + .addFacet(FacetBuilders.termsFacet("f1").field("field")) + .addFacet(FacetBuilders.dateHistogramFacet("f2").field("birth").interval("year")) + .execute().actionGet(); + +// Get your facet results +TermsFacet f1 = (TermsFacet) sr.facets().facetsAsMap().get("f1"); +DateHistogramFacet f2 = (DateHistogramFacet) sr.facets().facetsAsMap().get("f2"); +-------------------------------------------------- + +See <> +documentation for details. diff --git a/docs/reference/analysis.asciidoc b/docs/reference/analysis.asciidoc new file mode 100644 index 00000000000..467724ffb93 --- /dev/null +++ b/docs/reference/analysis.asciidoc @@ -0,0 +1,76 @@ +[[analysis]] += Analysis + +[partintro] +-- +The index analysis module acts as a configurable registry of Analyzers +that can be used in order to both break indexed (analyzed) fields when a +document is indexed and process query strings. 
It maps to the Lucene +`Analyzer`. + + +Analyzers are composed of a single <> +and zero or more <>. The tokenizer may +be preceded by one or more <>. The +analysis module allows one to register `TokenFilters`, `Tokenizers` and +`Analyzers` under logical names that can then be referenced either in +mapping definitions or in certain APIs. The Analysis module +automatically registers (*if not explicitly defined*) built in +analyzers, token filters, and tokenizers. + +Here is a sample configuration: + +[source,js] +-------------------------------------------------- +index : + analysis : + analyzer : + standard : + type : standard + stopwords : [stop1, stop2] + myAnalyzer1 : + type : standard + stopwords : [stop1, stop2, stop3] + max_token_length : 500 + # configure a custom analyzer which is + # exactly like the default standard analyzer + myAnalyzer2 : + tokenizer : standard + filter : [standard, lowercase, stop] + tokenizer : + myTokenizer1 : + type : standard + max_token_length : 900 + myTokenizer2 : + type : keyword + buffer_size : 512 + filter : + myTokenFilter1 : + type : stop + stopwords : [stop1, stop2, stop3, stop4] + myTokenFilter2 : + type : length + min : 0 + max : 2000 +-------------------------------------------------- + +[float] +=== Backwards compatibility + +All analyzers, tokenizers, and token filters can be configured with a +`version` parameter to control which Lucene version behavior they should +use. Possible values are: `3.0` - `3.6`, `4.0` - `4.3` (the highest +version number is the default option). + +-- + +include::analysis/analyzers.asciidoc[] + +include::analysis/tokenizers.asciidoc[] + +include::analysis/tokenfilters.asciidoc[] + +include::analysis/charfilters.asciidoc[] + +include::analysis/icu-plugin.asciidoc[] + diff --git a/docs/reference/analysis/analyzers.asciidoc b/docs/reference/analysis/analyzers.asciidoc new file mode 100644 index 00000000000..d6348d26368 --- /dev/null +++ b/docs/reference/analysis/analyzers.asciidoc @@ -0,0 +1,69 @@ +[[analysis-analyzers]] +== Analyzers + +Analyzers are composed of a single <> +and zero or more <>. The tokenizer may +be preceded by one or more <>. +The analysis module allows you to register `Analyzers` under logical +names which can then be referenced either in mapping definitions or in +certain APIs. + +Elasticsearch comes with a number of prebuilt analyzers which are +ready to use. Alternatively, you can combine the built in +character filters, tokenizers and token filters to create +<>. + +[float] +=== Default Analyzers + +An analyzer is registered under a logical name. It can then be +referenced from mapping definitions or certain APIs. When none are +defined, defaults are used. There is an option to define which analyzers +will be used by default when none can be derived. + +The `default` logical name allows one to configure an analyzer that will +be used both for indexing and for searching APIs. The `default_index` +logical name can be used to configure a default analyzer that will be +used just when indexing, and the `default_search` can be used to +configure a default analyzer that will be used just when searching. + +[float] +=== Aliasing Analyzers + +Analyzers can be aliased to have several registered lookup names +associated with them. For example, the following will allow +the `standard` analyzer to also be referenced with `alias1` +and `alias2` values. 
+ + +[source,js] +-------------------------------------------------- +index : + analysis : + analyzer : + standard : + alias: [alias1, alias2] + type : standard + stopwords : [test1, test2, test3] +-------------------------------------------------- + +Below is a list of the built in analyzers. + +include::analyzers/standard-analyzer.asciidoc[] + +include::analyzers/simple-analyzer.asciidoc[] + +include::analyzers/whitespace-analyzer.asciidoc[] + +include::analyzers/stop-analyzer.asciidoc[] + +include::analyzers/keyword-analyzer.asciidoc[] + +include::analyzers/pattern-analyzer.asciidoc[] + +include::analyzers/lang-analyzer.asciidoc[] + +include::analyzers/snowball-analyzer.asciidoc[] + +include::analyzers/custom-analyzer.asciidoc[] + diff --git a/docs/reference/analysis/analyzers/custom-analyzer.asciidoc b/docs/reference/analysis/analyzers/custom-analyzer.asciidoc new file mode 100644 index 00000000000..5c778a6c83d --- /dev/null +++ b/docs/reference/analysis/analyzers/custom-analyzer.asciidoc @@ -0,0 +1,52 @@ +[[analysis-custom-analyzer]] +=== Custom Analyzer + +An analyzer of type `custom` that allows to combine a `Tokenizer` with +zero or more `Token Filters`, and zero or more `Char Filters`. The +custom analyzer accepts a logical/registered name of the tokenizer to +use, and a list of logical/registered names of token filters. + +The following are settings that can be set for a `custom` analyzer type: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`tokenizer` |The logical / registered name of the tokenizer to use. + +|`filter` |An optional list of logical / registered name of token +filters. + +|`char_filter` |An optional list of logical / registered name of char +filters. +|======================================================================= + +Here is an example: + +[source,js] +-------------------------------------------------- +index : + analysis : + analyzer : + myAnalyzer2 : + type : custom + tokenizer : myTokenizer1 + filter : [myTokenFilter1, myTokenFilter2] + char_filter : [my_html] + tokenizer : + myTokenizer1 : + type : standard + max_token_length : 900 + filter : + myTokenFilter1 : + type : stop + stopwords : [stop1, stop2, stop3, stop4] + myTokenFilter2 : + type : length + min : 0 + max : 2000 + char_filter : + my_html : + type : html_strip + escaped_tags : [xxx, yyy] + read_ahead : 1024 +-------------------------------------------------- diff --git a/docs/reference/analysis/analyzers/keyword-analyzer.asciidoc b/docs/reference/analysis/analyzers/keyword-analyzer.asciidoc new file mode 100644 index 00000000000..7704895c9da --- /dev/null +++ b/docs/reference/analysis/analyzers/keyword-analyzer.asciidoc @@ -0,0 +1,7 @@ +[[analysis-keyword-analyzer]] +=== Keyword Analyzer + +An analyzer of type `keyword` that "tokenizes" an entire stream as a +single token. This is useful for data like zip codes, ids and so on. +Note, when using mapping definitions, it might make more sense to simply +mark the field as `not_analyzed`. diff --git a/docs/reference/analysis/analyzers/lang-analyzer.asciidoc b/docs/reference/analysis/analyzers/lang-analyzer.asciidoc new file mode 100644 index 00000000000..d3505dd1d37 --- /dev/null +++ b/docs/reference/analysis/analyzers/lang-analyzer.asciidoc @@ -0,0 +1,20 @@ +[[analysis-lang-analyzer]] +=== Language Analyzers + +A set of analyzers aimed at analyzing specific language text. 
The +following types are supported: `arabic`, `armenian`, `basque`, +`brazilian`, `bulgarian`, `catalan`, `chinese`, `cjk`, `czech`, +`danish`, `dutch`, `english`, `finnish`, `french`, `galician`, `german`, +`greek`, `hindi`, `hungarian`, `indonesian`, `italian`, `norwegian`, +`persian`, `portuguese`, `romanian`, `russian`, `spanish`, `swedish`, +`turkish`, `thai`. + +All analyzers support setting custom `stopwords` either internally in +the config, or by using an external stopwords file by setting +`stopwords_path`. + +The following analyzers support setting custom `stem_exclusion` list: +`arabic`, `armenian`, `basque`, `brazilian`, `bulgarian`, `catalan`, +`czech`, `danish`, `dutch`, `english`, `finnish`, `french`, `galician`, +`german`, `hindi`, `hungarian`, `indonesian`, `italian`, `norwegian`, +`portuguese`, `romanian`, `russian`, `spanish`, `swedish`, `turkish`. diff --git a/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc b/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc new file mode 100644 index 00000000000..97464677331 --- /dev/null +++ b/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc @@ -0,0 +1,126 @@ +[[analysis-pattern-analyzer]] +=== Pattern Analyzer + +An analyzer of type `pattern` that can flexibly separate text into terms +via a regular expression. Accepts the following settings: + +The following are settings that can be set for a `pattern` analyzer +type: + +[cols="<,<",options="header",] +|=================================================================== +|Setting |Description +|`lowercase` |Should terms be lowercased or not. Defaults to `true`. +|`pattern` |The regular expression pattern, defaults to `\W+`. +|`flags` |The regular expression flags. +|=================================================================== + +*IMPORTANT*: The regular expression should match the *token separators*, +not the tokens themselves. + +Flags should be pipe-separated, eg `"CASE_INSENSITIVE|COMMENTS"`. Check +http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html#field_summary[Java +Pattern API] for more details about `flags` options. 
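+
+For illustration, a `pattern` analyzer that sets all three options might
+be configured as below. This is only a sketch: the analyzer name
+`my_pattern_analyzer`, the pattern and the chosen flags are made up for
+the example.
+
+[source,js]
+--------------------------------------------------
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "my_pattern_analyzer" : {
+                    "type" : "pattern",
+                    "lowercase" : true,
+                    "pattern" : "[,;]+",
+                    "flags" : "CASE_INSENSITIVE|COMMENTS"
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------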
+ +[float] +==== Pattern Analyzer Examples + +In order to try out these examples, you should delete the `test` index +before running each example: + +[source,js] +-------------------------------------------------- + curl -XDELETE localhost:9200/test +-------------------------------------------------- + +[float] +===== Whitespace tokenizer + +[source,js] +-------------------------------------------------- + curl -XPUT 'localhost:9200/test' -d ' + { + "settings":{ + "analysis": { + "analyzer": { + "whitespace":{ + "type": "pattern", + "pattern":"\\\\s+" + } + } + } + } + }' + + curl 'localhost:9200/test/_analyze?pretty=1&analyzer=whitespace' -d 'foo,bar baz' + # "foo,bar", "baz" +-------------------------------------------------- + +[float] +===== Non-word character tokenizer + +[source,js] +-------------------------------------------------- + + curl -XPUT 'localhost:9200/test' -d ' + { + "settings":{ + "analysis": { + "analyzer": { + "nonword":{ + "type": "pattern", + "pattern":"[^\\\\w]+" + } + } + } + } + }' + + curl 'localhost:9200/test/_analyze?pretty=1&analyzer=nonword' -d 'foo,bar baz' + # "foo,bar baz" becomes "foo", "bar", "baz" + + curl 'localhost:9200/test/_analyze?pretty=1&analyzer=nonword' -d 'type_1-type_4' + # "type_1","type_4" +-------------------------------------------------- + +[float] +===== CamelCase tokenizer + +[source,js] +-------------------------------------------------- + + curl -XPUT 'localhost:9200/test?pretty=1' -d ' + { + "settings":{ + "analysis": { + "analyzer": { + "camel":{ + "type": "pattern", + "pattern":"([^\\\\p{L}\\\\d]+)|(?<=\\\\D)(?=\\\\d)|(?<=\\\\d)(?=\\\\D)|(?<=[\\\\p{L}&&[^\\\\p{Lu}]])(?=\\\\p{Lu})|(?<=\\\\p{Lu})(?=\\\\p{Lu}[\\\\p{L}&&[^\\\\p{Lu}]])" + } + } + } + } + }' + + curl 'localhost:9200/test/_analyze?pretty=1&analyzer=camel' -d ' + MooseX::FTPClass2_beta + ' + # "moose","x","ftp","class","2","beta" +-------------------------------------------------- + +The regex above is easier to understand as: + +[source,js] +-------------------------------------------------- + + ([^\\p{L}\\d]+) # swallow non letters and numbers, + | (?<=\\D)(?=\\d) # or non-number followed by number, + | (?<=\\d)(?=\\D) # or number followed by non-number, + | (?<=[ \\p{L} && [^\\p{Lu}]]) # or lower case + (?=\\p{Lu}) # followed by upper case, + | (?<=\\p{Lu}) # or upper case + (?=\\p{Lu} # followed by upper case + [\\p{L}&&[^\\p{Lu}]] # then lower case + ) +-------------------------------------------------- diff --git a/docs/reference/analysis/analyzers/simple-analyzer.asciidoc b/docs/reference/analysis/analyzers/simple-analyzer.asciidoc new file mode 100644 index 00000000000..9d7a7c30423 --- /dev/null +++ b/docs/reference/analysis/analyzers/simple-analyzer.asciidoc @@ -0,0 +1,6 @@ +[[analysis-simple-analyzer]] +=== Simple Analyzer + +An analyzer of type `simple` that is built using a +<>. diff --git a/docs/reference/analysis/analyzers/snowball-analyzer.asciidoc b/docs/reference/analysis/analyzers/snowball-analyzer.asciidoc new file mode 100644 index 00000000000..234f41db155 --- /dev/null +++ b/docs/reference/analysis/analyzers/snowball-analyzer.asciidoc @@ -0,0 +1,63 @@ +[[analysis-snowball-analyzer]] +=== Snowball Analyzer + +An analyzer of type `snowball` that uses the +<>, with +<>, +<>, +<>, and +<>. + +The Snowball Analyzer is a stemming analyzer from Lucene that is +originally based on the snowball project from +http://snowball.tartarus.org[snowball.tartarus.org]. 
+ +Sample usage: + +[source,js] +-------------------------------------------------- +{ + "index" : { + "analysis" : { + "analyzer" : { + "my_analyzer" : { + "type" : "snowball", + "language" : "English" + } + } + } + } +} +-------------------------------------------------- + +The `language` parameter can have the same values as the +<> and defaults to `English`. Note that not all the language +analyzers have a default set of stopwords provided. + +The `stopwords` parameter can be used to provide stopwords for the +languages that has no defaults, or to simply replace the default set +with your custom list. A default set of stopwords for many of these +languages is available from for instance +https://github.com/apache/lucene-solr/tree/trunk/lucene/analysis/common/src/resources/org/apache/lucene/analysis/[here] +and +https://github.com/apache/lucene-solr/tree/trunk/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball[here.] + +A sample configuration (in YAML format) specifying Swedish with +stopwords: + +[source,js] +-------------------------------------------------- +index : + analysis : + analyzer : + my_analyzer: + type: snowball + language: Swedish + stopwords: "och,det,att,i,en,jag,hon,som,han,på,den,med,var,sig,för,så,till,är,men,ett,om,hade,de,av,icke,mig,du,henne,då,sin,nu,har,inte,hans,honom,skulle,hennes,där,min,man,ej,vid,kunde,något,från,ut,när,efter,upp,vi,dem,vara,vad,över,än,dig,kan,sina,här,ha,mot,alla,under,någon,allt,mycket,sedan,ju,denna,själv,detta,åt,utan,varit,hur,ingen,mitt,ni,bli,blev,oss,din,dessa,några,deras,blir,mina,samma,vilken,er,sådan,vår,blivit,dess,inom,mellan,sådant,varför,varje,vilka,ditt,vem,vilket,sitta,sådana,vart,dina,vars,vårt,våra,ert,era,vilkas" +-------------------------------------------------- diff --git a/docs/reference/analysis/analyzers/standard-analyzer.asciidoc b/docs/reference/analysis/analyzers/standard-analyzer.asciidoc new file mode 100644 index 00000000000..bcd24a1265b --- /dev/null +++ b/docs/reference/analysis/analyzers/standard-analyzer.asciidoc @@ -0,0 +1,26 @@ +[[analysis-standard-analyzer]] +=== Standard Analyzer + +An analyzer of type `standard` that is built of using +<>, with +<>, +<>, and +<>. + +The following are settings that can be set for a `standard` analyzer +type: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`stopwords` |A list of stopword to initialize the stop filter with. +Defaults to the english stop words. + +|`max_token_length` |The maximum token length. If a token is seen that +exceeds this length then it is discarded. Defaults to `255`. +|======================================================================= + diff --git a/docs/reference/analysis/analyzers/stop-analyzer.asciidoc b/docs/reference/analysis/analyzers/stop-analyzer.asciidoc new file mode 100644 index 00000000000..18599af0f90 --- /dev/null +++ b/docs/reference/analysis/analyzers/stop-analyzer.asciidoc @@ -0,0 +1,21 @@ +[[analysis-stop-analyzer]] +=== Stop Analyzer + +An analyzer of type `stop` that is built using a +<>, with +<>. + +The following are settings that can be set for a `stop` analyzer type: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`stopwords` |A list of stopword to initialize the stop filter with. +Defaults to the english stop words. 
+ +|`stopwords_path` |A path (either relative to `config` location, or +absolute) to a stopwords file configuration. +|======================================================================= + diff --git a/docs/reference/analysis/analyzers/whitespace-analyzer.asciidoc b/docs/reference/analysis/analyzers/whitespace-analyzer.asciidoc new file mode 100644 index 00000000000..20956867544 --- /dev/null +++ b/docs/reference/analysis/analyzers/whitespace-analyzer.asciidoc @@ -0,0 +1,6 @@ +[[analysis-whitespace-analyzer]] +=== Whitespace Analyzer + +An analyzer of type `whitespace` that is built using a +<>. diff --git a/docs/reference/analysis/charfilters.asciidoc b/docs/reference/analysis/charfilters.asciidoc new file mode 100644 index 00000000000..a40cfffc054 --- /dev/null +++ b/docs/reference/analysis/charfilters.asciidoc @@ -0,0 +1,16 @@ +[[analysis-charfilters]] +== Character Filters + +Character filters are used to preprocess the string of +characters before it is passed to the <>. +A character filter may be used to strip out HTML markup, , or to convert +`"&"` characters to the word `"and"`. + +Elasticsearch has built in characters filters which can be +used to build <>. + +include::charfilters/mapping-charfilter.asciidoc[] + +include::charfilters/htmlstrip-charfilter.asciidoc[] + +include::charfilters/pattern-replace-charfilter.asciidoc[] diff --git a/docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc b/docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc new file mode 100644 index 00000000000..f12238a36ad --- /dev/null +++ b/docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc @@ -0,0 +1,5 @@ +[[analysis-htmlstrip-charfilter]] +=== HTML Strip Char Filter + +A char filter of type `html_strip` stripping out HTML elements from an +analyzed text. diff --git a/docs/reference/analysis/charfilters/mapping-charfilter.asciidoc b/docs/reference/analysis/charfilters/mapping-charfilter.asciidoc new file mode 100644 index 00000000000..ef4df815506 --- /dev/null +++ b/docs/reference/analysis/charfilters/mapping-charfilter.asciidoc @@ -0,0 +1,38 @@ +[[analysis-mapping-charfilter]] +=== Mapping Char Filter + +A char filter of type `mapping` replacing characters of an analyzed text +with given mapping. + +Here is a sample configuration: + +[source,js] +-------------------------------------------------- +{ + "index" : { + "analysis" : { + "char_filter" : { + "my_mapping" : { + "type" : "mapping", + "mappings" : ["ph=>f", "qu=>q"] + } + }, + "analyzer" : { + "custom_with_char_filter" : { + "tokenizer" : "standard", + "char_filter" : ["my_mapping"] + }, + } + } + } +} +-------------------------------------------------- + +Otherwise the setting `mappings_path` can specify a file where you can +put the list of char mapping : + +[source,js] +-------------------------------------------------- +ph => f +qu => k +-------------------------------------------------- diff --git a/docs/reference/analysis/charfilters/pattern-replace-charfilter.asciidoc b/docs/reference/analysis/charfilters/pattern-replace-charfilter.asciidoc new file mode 100644 index 00000000000..5a0cf28aa12 --- /dev/null +++ b/docs/reference/analysis/charfilters/pattern-replace-charfilter.asciidoc @@ -0,0 +1,37 @@ +[[analysis-pattern-replace-charfilter]] +=== Pattern Replace Char Filter + +The `pattern_replace` char filter allows the use of a regex to +manipulate the characters in a string before analysis. 
The regular +expression is defined using the `pattern` parameter, and the replacement +string can be provided using the `replacement` parameter (supporting +referencing the original text, as explained +http://docs.oracle.com/javase/6/docs/api/java/util/regex/Matcher.html#appendReplacement(java.lang.StringBuffer,%20java.lang.String)[here]). +For more information check the +http://lucene.apache.org/core/4_3_1/analyzers-common/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.html[lucene +documentation] + +Here is a sample configuration: + +[source,js] +-------------------------------------------------- +{ + "index" : { + "analysis" : { + "char_filter" : { + "my_pattern":{ + "type":"pattern_replace", + "pattern":"sample(.*)", + "replacement":"replacedSample $1" + } + }, + "analyzer" : { + "custom_with_char_filter" : { + "tokenizer" : "standard", + "char_filter" : ["my_pattern"] + }, + } + } + } +} +-------------------------------------------------- diff --git a/docs/reference/analysis/icu-plugin.asciidoc b/docs/reference/analysis/icu-plugin.asciidoc new file mode 100644 index 00000000000..50cf8c77274 --- /dev/null +++ b/docs/reference/analysis/icu-plugin.asciidoc @@ -0,0 +1,148 @@ +[[analysis-icu-plugin]] +== ICU Analysis Plugin + +The http://icu-project.org/[ICU] analysis plugin allows for unicode +normalization, collation and folding. The plugin is called +https://github.com/elasticsearch/elasticsearch-analysis-icu[elasticsearch-analysis-icu]. + +The plugin includes the following analysis components: + +[float] +=== ICU Normalization + +Normalizes characters as explained +http://userguide.icu-project.org/transforms/normalization[here]. It +registers itself by default under `icu_normalizer` or `icuNormalizer` +using the default settings. Allows for the name parameter to be provided +which can include the following values: `nfc`, `nfkc`, and `nfkc_cf`. +Here is a sample settings: + +[source,js] +-------------------------------------------------- +{ + "index" : { + "analysis" : { + "analyzer" : { + "normalization" : { + "tokenizer" : "keyword", + "filter" : ["icu_normalizer"] + } + } + } + } +} +-------------------------------------------------- + +[float] +=== ICU Folding + +Folding of unicode characters based on `UTR#30`. It registers itself +under `icu_folding` and `icuFolding` names. +The filter also does lowercasing, which means the lowercase filter can +normally be left out. Sample setting: + +[source,js] +-------------------------------------------------- +{ + "index" : { + "analysis" : { + "analyzer" : { + "folding" : { + "tokenizer" : "keyword", + "filter" : ["icu_folding"] + } + } + } + } +} +-------------------------------------------------- + +[float] +==== Filtering + +The folding can be filtered by a set of unicode characters with the +parameter `unicodeSetFilter`. This is useful for a non-internationalized +search engine where retaining a set of national characters which are +primary letters in a specific language is wanted. See syntax for the +UnicodeSet +http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[here]. + +The Following example excempt Swedish characters from the folding. Note +that the filtered characters are NOT lowercased which is why we add that +filter below. 
+ +[source,js] +-------------------------------------------------- +{ + "index" : { + "analysis" : { + "analyzer" : { + "folding" : { + "tokenizer" : "standard", + "filter" : ["my_icu_folding", "lowercase"] + } + } + "filter" : { + "my_icu_folding" : { + "type" : "icu_folding" + "unicodeSetFilter" : "[^åäöÅÄÖ]" + } + } + } + } +} +-------------------------------------------------- + +[float] +=== ICU Collation + +Uses collation token filter. Allows to either specify the rules for +collation (defined +http://www.icu-project.org/userguide/Collate_Customization.html[here]) +using the `rules` parameter (can point to a location or expressed in the +settings, location can be relative to config location), or using the +`language` parameter (further specialized by country and variant). By +default registers under `icu_collation` or `icuCollation` and uses the +default locale. + +Here is a sample settings: + +[source,js] +-------------------------------------------------- +{ + "index" : { + "analysis" : { + "analyzer" : { + "collation" : { + "tokenizer" : "keyword", + "filter" : ["icu_collation"] + } + } + } + } +} +-------------------------------------------------- + +And here is a sample of custom collation: + +[source,js] +-------------------------------------------------- +{ + "index" : { + "analysis" : { + "analyzer" : { + "collation" : { + "tokenizer" : "keyword", + "filter" : ["myCollator"] + } + }, + "filter" : { + "myCollator" : { + "type" : "icu_collation", + "language" : "en" + } + } + } + } +} +-------------------------------------------------- diff --git a/docs/reference/analysis/tokenfilters.asciidoc b/docs/reference/analysis/tokenfilters.asciidoc new file mode 100644 index 00000000000..57c4341f28a --- /dev/null +++ b/docs/reference/analysis/tokenfilters.asciidoc @@ -0,0 +1,71 @@ +[[analysis-tokenfilters]] +== Token Filters + +Token filters accept a stream of tokens from a +<> and can modify tokens +(eg lowercasing), delete tokens (eg remove stopwords) +or add tokens (eg synonyms). + +Elasticsearch has a number of built in token filters which can be +used to build <>. 
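+
+For example, token filters are typically chained after a tokenizer inside
+a custom analyzer definition. The sketch below is illustrative only (the
+analyzer and filter names are made up); it combines the built in
+`lowercase` and `stop` filters with a custom `length` filter:
+
+[source,js]
+--------------------------------------------------
+index :
+    analysis :
+        analyzer :
+            my_filtered_analyzer :
+                type : custom
+                tokenizer : standard
+                filter : [lowercase, stop, my_length_filter]
+        filter :
+            my_length_filter :
+                type : length
+                min : 2
+                max : 50
+--------------------------------------------------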
+ +include::tokenfilters/standard-tokenfilter.asciidoc[] + +include::tokenfilters/asciifolding-tokenfilter.asciidoc[] + +include::tokenfilters/length-tokenfilter.asciidoc[] + +include::tokenfilters/lowercase-tokenfilter.asciidoc[] + +include::tokenfilters/ngram-tokenfilter.asciidoc[] + +include::tokenfilters/edgengram-tokenfilter.asciidoc[] + +include::tokenfilters/porterstem-tokenfilter.asciidoc[] + +include::tokenfilters/shingle-tokenfilter.asciidoc[] + +include::tokenfilters/stop-tokenfilter.asciidoc[] + +include::tokenfilters/word-delimiter-tokenfilter.asciidoc[] + +include::tokenfilters/stemmer-tokenfilter.asciidoc[] + +include::tokenfilters/stemmer-override-tokenfilter.asciidoc[] + +include::tokenfilters/keyword-marker-tokenfilter.asciidoc[] + +include::tokenfilters/keyword-repeat-tokenfilter.asciidoc[] + +include::tokenfilters/kstem-tokenfilter.asciidoc[] + +include::tokenfilters/snowball-tokenfilter.asciidoc[] + +include::tokenfilters/phonetic-tokenfilter.asciidoc[] + +include::tokenfilters/synonym-tokenfilter.asciidoc[] + +include::tokenfilters/compound-word-tokenfilter.asciidoc[] + +include::tokenfilters/reverse-tokenfilter.asciidoc[] + +include::tokenfilters/elision-tokenfilter.asciidoc[] + +include::tokenfilters/truncate-tokenfilter.asciidoc[] + +include::tokenfilters/unique-tokenfilter.asciidoc[] + +include::tokenfilters/pattern-capture-tokenfilter.asciidoc[] + +include::tokenfilters/pattern_replace-tokenfilter.asciidoc[] + +include::tokenfilters/trim-tokenfilter.asciidoc[] + +include::tokenfilters/limit-token-count-tokenfilter.asciidoc[] + +include::tokenfilters/hunspell-tokenfilter.asciidoc[] + +include::tokenfilters/common-grams-tokenfilter.asciidoc[] + +include::tokenfilters/normalization-tokenfilter.asciidoc[] + diff --git a/docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc new file mode 100644 index 00000000000..aaca0eb3daf --- /dev/null +++ b/docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc @@ -0,0 +1,7 @@ +[[analysis-asciifolding-tokenfilter]] +=== ASCII Folding Token Filter + +A token filter of type `asciifolding` that converts alphabetic, numeric, +and symbolic Unicode characters which are not in the first 127 ASCII +characters (the "Basic Latin" Unicode block) into their ASCII +equivalents, if one exists. diff --git a/docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc new file mode 100644 index 00000000000..f0659e00868 --- /dev/null +++ b/docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc @@ -0,0 +1,61 @@ +[[analysis-common-grams-tokenfilter]] +=== Common Grams Token Filter + +Token filter that generates bigrams for frequently occuring terms. +Single terms are still indexed. It can be used as an alternative to the +<> when we don't want to completely ignore common terms. + +For example, the text "the quick brown is a fox" will be tokenized as +"the", "the_quick", "quick", "brown", "brown_is", "is_a", "a_fox", +"fox". Assuming "the", "is" and "a" are common words. + +When `query_mode` is enabled, the token filter removes common words and +single terms followed by a common word. This parameter should be enabled +in the search analyzer. + +For example, the query "the quick brown is a fox" will be tokenized as +"the_quick", "quick", "brown_is", "is_a", "a_fox", "fox". 
+ +The following are settings that can be set: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`common_words` |A list of common words to use. + +|`common_words_path` |A path (either relative to `config` location, or +absolute) to a list of common words. Each word should be in its own +"line" (separated by a line break). The file must be UTF-8 encoded. + +|`ignore_case` |If true, common words matching will be case insensitive +(defaults to `false`). + +|`query_mode` |Generates bigrams then removes common words and single +terms followed by a common word (defaults to `false`). +|======================================================================= + +Note, `common_words` or `common_words_path` field is required. + +Here is an example: + +[source,js] +-------------------------------------------------- +index : + analysis : + analyzer : + index_grams : + tokenizer : whitespace + filter : [common_grams] + search_grams : + tokenizer : whitespace + filter : [common_grams_query] + filter : + common_grams : + type : common_grams + common_words: [a, an, the] + common_grams_query : + type : common_grams + query_mode: true + common_words: [a, an, the] +-------------------------------------------------- diff --git a/docs/reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc new file mode 100644 index 00000000000..6719a9cd519 --- /dev/null +++ b/docs/reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc @@ -0,0 +1,48 @@ +[[analysis-compound-word-tokenfilter]] +=== Compound Word Token Filter + +Token filters that allow to decompose compound words. There are two +types available: `dictionary_decompounder` and +`hyphenation_decompounder`. + +The following are settings that can be set for a compound word token +filter type: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`word_list` |A list of words to use. + +|`word_list_path` |A path (either relative to `config` location, or +absolute) to a list of words. + +|`min_word_size` |Minimum word size(Integer). Defaults to 5. + +|`min_subword_size` |Minimum subword size(Integer). Defaults to 2. + +|`max_subword_size` |Maximum subword size(Integer). Defaults to 15. + +|`only_longest_match` |Only matching the longest(Boolean). Defaults to +`false` +|======================================================================= + +Here is an example: + +[source,js] +-------------------------------------------------- +index : + analysis : + analyzer : + myAnalyzer2 : + type : custom + tokenizer : standard + filter : [myTokenFilter1, myTokenFilter2] + filter : + myTokenFilter1 : + type : dictionary_decompounder + word_list: [one, two, three] + myTokenFilter2 : + type : hyphenation_decompounder + word_list_path: path/to/words.txt + max_subword_size : 22 +-------------------------------------------------- diff --git a/docs/reference/analysis/tokenfilters/edgengram-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/edgengram-tokenfilter.asciidoc new file mode 100644 index 00000000000..3ba0edeb8ef --- /dev/null +++ b/docs/reference/analysis/tokenfilters/edgengram-tokenfilter.asciidoc @@ -0,0 +1,16 @@ +[[analysis-edgengram-tokenfilter]] +=== Edge NGram Token Filter + +A token filter of type `edgeNGram`. 
+
+The following are settings that can be set for an `edgeNGram` token
+filter type:
+
+[cols="<,<",options="header",]
+|======================================================
+|Setting |Description
+|`min_gram` |Defaults to `1`.
+|`max_gram` |Defaults to `2`.
+|`side` |Either `front` or `back`. Defaults to `front`.
+|======================================================
+
diff --git a/docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc
new file mode 100644
index 00000000000..c44ccffd51e
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc
@@ -0,0 +1,28 @@
+[[analysis-elision-tokenfilter]]
+=== Elision Token Filter
+
+A token filter which removes elisions. For example, "l'avion" (the
+plane) will be tokenized as "avion" (plane).
+
+Accepts an `articles` setting which is a set of stop word articles. For
+example:
+
+[source,js]
+--------------------------------------------------
+"index" : {
+    "analysis" : {
+        "analyzer" : {
+            "default" : {
+                "tokenizer" : "standard",
+                "filter" : ["standard", "elision"]
+            }
+        },
+        "filter" : {
+            "elision" : {
+                "type" : "elision",
+                "articles" : ["l", "m", "t", "qu", "n", "s", "j"]
+            }
+        }
+    }
+}
+--------------------------------------------------
diff --git a/docs/reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc
new file mode 100644
index 00000000000..da16874b815
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc
@@ -0,0 +1,116 @@
+[[analysis-hunspell-tokenfilter]]
+=== Hunspell Token Filter
+
+Basic support for hunspell stemming. Hunspell dictionaries will be
+picked up from a dedicated hunspell directory on the filesystem
+(defaults to `<path.conf>/hunspell`). Each dictionary is expected to
+have its own directory named after its associated locale (language).
+This dictionary directory is expected to hold both the \*.aff and \*.dic
+files (all of which will automatically be picked up). For example,
+assuming the default hunspell location is used, the following directory
+layout will define the `en_US` dictionary:
+
+[source,js]
+--------------------------------------------------
+- conf
+    |-- hunspell
+    |    |-- en_US
+    |    |    |-- en_US.dic
+    |    |    |-- en_US.aff
+--------------------------------------------------
+
+The location of the hunspell directory can be configured using the
+`indices.analysis.hunspell.dictionary.location` setting in
+_elasticsearch.yml_.
+
+Each dictionary can be configured with two settings:
+
+`ignore_case`::
+    If true, dictionary matching will be case insensitive
+    (defaults to `false`)
+
+`strict_affix_parsing`::
+    Determines whether errors while reading an affix rules file will
+    cause an exception or simply be ignored (defaults to `true`)
+
+These settings can be configured globally in `elasticsearch.yml` using
+
+* `indices.analysis.hunspell.dictionary.ignore_case` and
+* `indices.analysis.hunspell.dictionary.strict_affix_parsing`
+
+or for specific dictionaries:
+
+* `indices.analysis.hunspell.dictionary.en_US.ignore_case` and
+* `indices.analysis.hunspell.dictionary.en_US.strict_affix_parsing`.
+
+It is also possible to add a `settings.yml` file under the dictionary
+directory which holds these settings (this will override any other
+settings defined in `elasticsearch.yml`).
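For example, a sketch of such a `settings.yml` for the `en_US` dictionary
above, assuming it accepts the same two keys:

[source,yaml]
--------------------------------------------------
# hunspell/en_US/settings.yml (illustrative sketch)
ignore_case: true
strict_affix_parsing: false
--------------------------------------------------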
+ +One can use the hunspell stem filter by configuring it the analysis +settings: + +[source,js] +-------------------------------------------------- +{ + "analysis" : { + "analyzer" : { + "en" : { + "tokenizer" : "standard", + "filter" : [ "lowercase", "en_US" ] + } + }, + "filter" : { + "en_US" : { + "type" : "hunspell", + "locale" : "en_US", + "dedup" : true + } + } + } +} +-------------------------------------------------- + +The hunspell token filter accepts four options: + +`locale`:: + A locale for this filter. If this is unset, the `lang` or + `language` are used instead - so one of these has to be set. + +`dictionary`:: + The name of a dictionary. The path to your hunspell + dictionaries should be configured via + `indices.analysis.hunspell.dictionary.location` before. + +`dedup`:: + If only unique terms should be returned, this needs to be + set to `true`. Defaults to `true`. + +`recursion_level`:: + Configures the recursion level a + stemmer can go into. Defaults to `2`. Some languages (for example czech) + give better results when set to `1` or `0`, so you should test it out. + (since 0.90.3) + +NOTE: As opposed to the snowball stemmers (which are algorithm based) +this is a dictionary lookup based stemmer and therefore the quality of +the stemming is determined by the quality of the dictionary. + +[float] +==== References + +Hunspell is a spell checker and morphological analyzer designed for +languages with rich morphology and complex word compounding and +character encoding. + +1. Wikipedia, http://en.wikipedia.org/wiki/Hunspell + +2. Source code, http://hunspell.sourceforge.net/ + +3. Open Office Hunspell dictionaries, http://wiki.openoffice.org/wiki/Dictionaries + +4. Mozilla Hunspell dictionaries, https://addons.mozilla.org/en-US/firefox/language-tools/ + +5. Chromium Hunspell dictionaries, + http://src.chromium.org/viewvc/chrome/trunk/deps/third_party/hunspell_dictionaries/ diff --git a/docs/reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc new file mode 100644 index 00000000000..34f61713af5 --- /dev/null +++ b/docs/reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc @@ -0,0 +1,34 @@ +[[analysis-keyword-marker-tokenfilter]] +=== Keyword Marker Token Filter + +Protects words from being modified by stemmers. Must be placed before +any stemming filters. + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`keywords` |A list of words to use. + +|`keywords_path` |A path (either relative to `config` location, or +absolute) to a list of words. + +|`ignore_case` |Set to `true` to lower case all words first. Defaults to +`false`. 
+|=======================================================================
+
+Here is an example:
+
+[source,js]
+--------------------------------------------------
+index :
+    analysis :
+        analyzer :
+            myAnalyzer :
+                type : custom
+                tokenizer : standard
+                filter : [lowercase, protwords, porterStem]
+        filter :
+            protwords :
+                type : keyword_marker
+                keywords_path : analysis/protwords.txt
+--------------------------------------------------
diff --git a/docs/reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc
new file mode 100644
index 00000000000..e9f20118937
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc
@@ -0,0 +1,28 @@
+[[analysis-keyword-repeat-tokenfilter]]
+=== Keyword Repeat Token Filter
+
+The `keyword_repeat` token filter emits each incoming token twice: once
+as a keyword and once as a non-keyword, so that an unstemmed version of
+a term can be indexed side by side with the stemmed version of the term.
+Given the nature of this filter, each token that isn't transformed by a
+subsequent stemmer will be indexed twice. Therefore, consider adding a
+`unique` filter with `only_on_same_position` set to `true` to drop
+unnecessary duplicates.
+
+Note: this is available from `0.90.0.Beta2` on.
+
+Here is an example:
+
+[source,js]
+--------------------------------------------------
+index :
+    analysis :
+        analyzer :
+            myAnalyzer :
+                type : custom
+                tokenizer : standard
+                filter : [lowercase, keyword_repeat, porterStem, unique_stem]
+        filter :
+            unique_stem :
+                type : unique
+                only_on_same_position : true
+--------------------------------------------------
diff --git a/docs/reference/analysis/tokenfilters/kstem-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/kstem-tokenfilter.asciidoc
new file mode 100644
index 00000000000..ff0695e6496
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/kstem-tokenfilter.asciidoc
@@ -0,0 +1,6 @@
+[[analysis-kstem-tokenfilter]]
+=== KStem Token Filter
+
+The `kstem` token filter is a high performance filter for English. All
+terms must already be lowercased (use the `lowercase` filter) for this
+filter to work correctly.
diff --git a/docs/reference/analysis/tokenfilters/length-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/length-tokenfilter.asciidoc
new file mode 100644
index 00000000000..2651980966e
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/length-tokenfilter.asciidoc
@@ -0,0 +1,16 @@
+[[analysis-length-tokenfilter]]
+=== Length Token Filter
+
+A token filter of type `length` that removes words that are too long or
+too short for the stream.
+
+The following are settings that can be set for a `length` token filter
+type:
+
+[cols="<,<",options="header",]
+|===========================================================
+|Setting |Description
+|`min` |The minimum token length. Defaults to `0`.
+|`max` |The maximum token length. Defaults to `Integer.MAX_VALUE`.
+|===========================================================
+
diff --git a/docs/reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc
new file mode 100644
index 00000000000..a6598be6095
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc
@@ -0,0 +1,32 @@
+[[analysis-limit-token-count-tokenfilter]]
+=== Limit Token Count Token Filter
+
+Limits the number of tokens that are indexed per document and field.
+ +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`max_token_count` |The maximum number of tokens that should be indexed +per document and field. The default is `1` + +|`consume_all_tokens` |If set to `true` the filter exhaust the stream +even if `max_token_count` tokens have been consumed already. The default +is `false`. +|======================================================================= + +Here is an example: + +[source,js] +-------------------------------------------------- +index : + analysis : + analyzer : + myAnalyzer : + type : custom + tokenizer : standard + filter : [lowercase, five_token_limit] + filter : + five_token_limit : + type : limit + max_token_count : 5 +-------------------------------------------------- diff --git a/docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc new file mode 100644 index 00000000000..857c0d7916a --- /dev/null +++ b/docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc @@ -0,0 +1,37 @@ +[[analysis-lowercase-tokenfilter]] +=== Lowercase Token Filter + +A token filter of type `lowercase` that normalizes token text to lower +case. + +Lowercase token filter supports Greek and Turkish lowercase token +filters through the `language` parameter. Below is a usage example in a +custom analyzer + +[source,js] +-------------------------------------------------- +index : + analysis : + analyzer : + myAnalyzer2 : + type : custom + tokenizer : myTokenizer1 + filter : [myTokenFilter1, myGreekLowerCaseFilter] + char_filter : [my_html] + tokenizer : + myTokenizer1 : + type : standard + max_token_length : 900 + filter : + myTokenFilter1 : + type : stop + stopwords : [stop1, stop2, stop3, stop4] + myGreekLowerCaseFilter : + type : lowercase + language : greek + char_filter : + my_html : + type : html_strip + escaped_tags : [xxx, yyy] + read_ahead : 1024 +-------------------------------------------------- diff --git a/docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc new file mode 100644 index 00000000000..5f911360085 --- /dev/null +++ b/docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc @@ -0,0 +1,15 @@ +[[analysis-ngram-tokenfilter]] +=== NGram Token Filter + +A token filter of type `nGram`. + +The following are settings that can be set for a `nGram` token filter +type: + +[cols="<,<",options="header",] +|============================ +|Setting |Description +|`min_gram` |Defaults to `1`. +|`max_gram` |Defaults to `2`. +|============================ + diff --git a/docs/reference/analysis/tokenfilters/normalization-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/normalization-tokenfilter.asciidoc new file mode 100644 index 00000000000..a1fcb4def5e --- /dev/null +++ b/docs/reference/analysis/tokenfilters/normalization-tokenfilter.asciidoc @@ -0,0 +1,15 @@ +[[analysis-normalization-tokenfilter]] +=== Normalization Token Filter + +There are several token filters available which try to normalize special +characters of a certain language. + +You can currently choose between `arabic_normalization` and +`persian_normalization` normalization in your token filter +configuration. 
For more information check the +http://lucene.apache.org/core/4_3_1/analyzers-common/org/apache/lucene/analysis/ar/ArabicNormalizer.html[ArabicNormalizer] +or the +http://lucene.apache.org/core/4_3_1/analyzers-common/org/apache/lucene/analysis/fa/PersianNormalizer.html[PersianNormalizer] +documentation. + +*Note:* This filters are available since `0.90.2` diff --git a/docs/reference/analysis/tokenfilters/pattern-capture-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/pattern-capture-tokenfilter.asciidoc new file mode 100644 index 00000000000..4091296a76e --- /dev/null +++ b/docs/reference/analysis/tokenfilters/pattern-capture-tokenfilter.asciidoc @@ -0,0 +1,134 @@ +[[analysis-pattern-capture-tokenfilter]] +=== Pattern Capture Token Filter + +The `pattern_capture` token filter, unlike the `pattern` tokenizer, +emits a token for every capture group in the regular expression. +Patterns are not anchored to the beginning and end of the string, so +each pattern can match multiple times, and matches are allowed to +overlap. + +For instance a pattern like : + +[source,js] +-------------------------------------------------- +"(([a-z]+)(\d*))" +-------------------------------------------------- + +when matched against: + +[source,js] +-------------------------------------------------- +"abc123def456" +-------------------------------------------------- + +would produce the tokens: [ `abc123`, `abc`, `123`, `def456`, `def`, +`456` ] + +If `preserve_original` is set to `true` (the default) then it would also +emit the original token: `abc123def456`. + +This is particularly useful for indexing text like camel-case code, eg +`stripHTML` where a user may search for `"strip html"` or `"striphtml"`: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/test/ -d ' +{ + "settings" : { + "analysis" : { + "filter" : { + "code" : { + "type" : "pattern_capture", + "preserve_original" : 1, + "patterns" : [ + "(\\p{Ll}+|\\p{Lu}\\p{Ll}+|\\p{Lu}+)", + "(\\d+)" + ] + } + }, + "analyzer" : { + "code" : { + "tokenizer" : "pattern", + "filter" : [ "code", "lowercase" ] + } + } + } + } +} +' +-------------------------------------------------- + +When used to analyze the text + +[source,js] +-------------------------------------------------- +import static org.apache.commons.lang.StringEscapeUtils.escapeHtml +-------------------------------------------------- + +this emits the tokens: [ `import`, `static`, `org`, `apache`, `commons`, +`lang`, `stringescapeutils`, `string`, `escape`, `utils`, `escapehtml`, +`escape`, `html` ] + +Another example is analyzing email addresses: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/test/ -d ' +{ + "settings" : { + "analysis" : { + "filter" : { + "email" : { + "type" : "pattern_capture", + "preserve_original" : 1, + "patterns" : [ + "(\\w+)", + "(\\p{L}+)", + "(\\d+)", + "@(.+)" + ] + } + }, + "analyzer" : { + "email" : { + "tokenizer" : "uax_url_email", + "filter" : [ "email", "lowercase", "unique" ] + } + } + } + } +} +' +-------------------------------------------------- + +When the above analyzer is used on an email address like: + +[source,js] +-------------------------------------------------- +john-smith_123@foo-bar.com +-------------------------------------------------- + +it would produce the following tokens: [ `john-smith_123`, +`foo-bar.com`, `john`, `smith_123`, `smith`, `123`, `foo`, +`foo-bar.com`, `bar`, `com` ] + +Multiple patterns are required to allow overlapping 
captures, but also +means that patterns are less dense and easier to understand. + +*Note:* All tokens are emitted in the same position, and with the same +character offsets, so when combined with highlighting, the whole +original token will be highlighted, not just the matching subset. For +instance, querying the above email address for `"smith"` would +highlight: + +[source,js] +-------------------------------------------------- + john-smith_123@foo-bar.com +-------------------------------------------------- + +not: + +[source,js] +-------------------------------------------------- + john-smith_123@foo-bar.com +-------------------------------------------------- diff --git a/docs/reference/analysis/tokenfilters/pattern_replace-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/pattern_replace-tokenfilter.asciidoc new file mode 100644 index 00000000000..54e08426e8b --- /dev/null +++ b/docs/reference/analysis/tokenfilters/pattern_replace-tokenfilter.asciidoc @@ -0,0 +1,9 @@ +[[analysis-pattern_replace-tokenfilter]] +=== Pattern Replace Token Filter + +The `pattern_replace` token filter allows to easily handle string +replacements based on a regular expression. The regular expression is +defined using the `pattern` parameter, and the replacement string can be +provided using the `replacement` parameter (supporting referencing the +original text, as explained +http://docs.oracle.com/javase/6/docs/api/java/util/regex/Matcher.html#appendReplacement(java.lang.StringBuffer,%20java.lang.String)[here]). diff --git a/docs/reference/analysis/tokenfilters/phonetic-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/phonetic-tokenfilter.asciidoc new file mode 100644 index 00000000000..b7e9334db9d --- /dev/null +++ b/docs/reference/analysis/tokenfilters/phonetic-tokenfilter.asciidoc @@ -0,0 +1,5 @@ +[[analysis-phonetic-tokenfilter]] +=== Phonetic Token Filter + +The `phonetic` token filter is provided as a plugin and located +https://github.com/elasticsearch/elasticsearch-analysis-phonetic[here]. diff --git a/docs/reference/analysis/tokenfilters/porterstem-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/porterstem-tokenfilter.asciidoc new file mode 100644 index 00000000000..188db1e5e9c --- /dev/null +++ b/docs/reference/analysis/tokenfilters/porterstem-tokenfilter.asciidoc @@ -0,0 +1,15 @@ +[[analysis-porterstem-tokenfilter]] +=== Porter Stem Token Filter + +A token filter of type `porterStem` that transforms the token stream as +per the Porter stemming algorithm. + +Note, the input to the stemming filter must already be in lower case, so +you will need to use +<> or +<> farther down the Tokenizer chain in order for this to +work properly!. For example, when using custom analyzer, make sure the +`lowercase` filter comes before the `porterStem` filter in the list of +filters. diff --git a/docs/reference/analysis/tokenfilters/reverse-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/reverse-tokenfilter.asciidoc new file mode 100644 index 00000000000..b0049981555 --- /dev/null +++ b/docs/reference/analysis/tokenfilters/reverse-tokenfilter.asciidoc @@ -0,0 +1,4 @@ +[[analysis-reverse-tokenfilter]] +=== Reverse Token Filter + +A token filter of type `reverse` that simply reverses each token. 
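As an illustrative sketch (the `reversed` analyzer name is not part of the
reference), it can be used in a custom analyzer like any other token filter:

[source,js]
--------------------------------------------------
{
    "index" : {
        "analysis" : {
            "analyzer" : {
                "reversed" : {
                    "tokenizer" : "standard",
                    "filter" : ["lowercase", "reverse"]
                }
            }
        }
    }
}
--------------------------------------------------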
diff --git a/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc new file mode 100644 index 00000000000..d47a01829bd --- /dev/null +++ b/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc @@ -0,0 +1,36 @@ +[[analysis-shingle-tokenfilter]] +=== Shingle Token Filter + +A token filter of type `shingle` that constructs shingles (token +n-grams) from a token stream. In other words, it creates combinations of +tokens as a single token. For example, the sentence "please divide this +sentence into shingles" might be tokenized into shingles "please +divide", "divide this", "this sentence", "sentence into", and "into +shingles". + +This filter handles position increments > 1 by inserting filler tokens +(tokens with termtext "_"). It does not handle a position increment of +0. + +The following are settings that can be set for a `shingle` token filter +type: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`max_shingle_size` |The maximum shingle size. Defaults to `2`. + +|`min_shingle_sizes` |The minimum shingle size. Defaults to `2`. + +|`output_unigrams` |If `true` the output will contain the input tokens +(unigrams) as well as the shingles. Defaults to `true`. + +|`output_unigrams_if_no_shingles` |If `output_unigrams` is `false` the +output will contain the input tokens (unigrams) if no shingles are +available. Note if `output_unigrams` is set to `true` this setting has +no effect. Defaults to `false`. + +|`token_separator` |The string to use when joining adjacent tokens to +form a shingle. Defaults to `" "`. +|======================================================================= + diff --git a/docs/reference/analysis/tokenfilters/snowball-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/snowball-tokenfilter.asciidoc new file mode 100644 index 00000000000..58d88988745 --- /dev/null +++ b/docs/reference/analysis/tokenfilters/snowball-tokenfilter.asciidoc @@ -0,0 +1,33 @@ +[[analysis-snowball-tokenfilter]] +=== Snowball Token Filter + +A filter that stems words using a Snowball-generated stemmer. The +`language` parameter controls the stemmer with the following available +values: `Armenian`, `Basque`, `Catalan`, `Danish`, `Dutch`, `English`, +`Finnish`, `French`, `German`, `German2`, `Hungarian`, `Italian`, `Kp`, +`Lovins`, `Norwegian`, `Porter`, `Portuguese`, `Romanian`, `Russian`, +`Spanish`, `Swedish`, `Turkish`. + +For example: + +[source,js] +-------------------------------------------------- +{ + "index" : { + "analysis" : { + "analyzer" : { + "my_analyzer" : { + "tokenizer" : "standard", + "filter" : ["standard", "lowercase", "my_snow"] + } + }, + "filter" : { + "my_snow" : { + "type" : "snowball", + "language" : "Lovins" + } + } + } + } +} +-------------------------------------------------- diff --git a/docs/reference/analysis/tokenfilters/standard-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/standard-tokenfilter.asciidoc new file mode 100644 index 00000000000..3dd4fbf5005 --- /dev/null +++ b/docs/reference/analysis/tokenfilters/standard-tokenfilter.asciidoc @@ -0,0 +1,7 @@ +[[analysis-standard-tokenfilter]] +=== Standard Token Filter + +A token filter of type `standard` that normalizes tokens extracted with +the +<>. 
diff --git a/docs/reference/analysis/tokenfilters/stemmer-override-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/stemmer-override-tokenfilter.asciidoc new file mode 100644 index 00000000000..012fab8bd8a --- /dev/null +++ b/docs/reference/analysis/tokenfilters/stemmer-override-tokenfilter.asciidoc @@ -0,0 +1,34 @@ +[[analysis-stemmer-override-tokenfilter]] +=== Stemmer Override Token Filter + +Overrides stemming algorithms, by applying a custom mapping, then +protecting these terms from being modified by stemmers. Must be placed +before any stemming filters. + +Rules are separated by "=>" + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`rules` |A list of mapping rules to use. + +|`rules_path` |A path (either relative to `config` location, or +absolute) to a list of mappings. +|======================================================================= + +Here is an example: + +[source,js] +-------------------------------------------------- +index : + analysis : + analyzer : + myAnalyzer : + type : custom + tokenizer : standard + filter : [lowercase, custom_stems, porterStem] + filter: + custom_stems: + type: stemmer_override + rules_path : analysis/custom_stems.txt +-------------------------------------------------- diff --git a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc new file mode 100644 index 00000000000..676c2f3894e --- /dev/null +++ b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc @@ -0,0 +1,78 @@ +[[analysis-stemmer-tokenfilter]] +=== Stemmer Token Filter + +A filter that stems words (similar to `snowball`, but with more +options). The `language`/@name@ parameter controls the stemmer with the +following available values: + +http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Far%2FArabicStemmer.html[arabic], +http://snowball.tartarus.org/algorithms/armenian/stemmer.html[armenian], +http://snowball.tartarus.org/algorithms/basque/stemmer.html[basque], +http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fbr%2FBrazilianStemmer.html[brazilian], +http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf[bulgarian], +http://snowball.tartarus.org/algorithms/catalan/stemmer.html[catalan], +http://portal.acm.org/citation.cfm?id=1598600[czech], +http://snowball.tartarus.org/algorithms/danish/stemmer.html[danish], +http://snowball.tartarus.org/algorithms/dutch/stemmer.html[dutch], +http://snowball.tartarus.org/algorithms/english/stemmer.html[english], +http://snowball.tartarus.org/algorithms/finnish/stemmer.html[finnish], +http://snowball.tartarus.org/algorithms/french/stemmer.html[french], +http://snowball.tartarus.org/algorithms/german/stemmer.html[german], +http://snowball.tartarus.org/algorithms/german2/stemmer.html[german2], +http://sais.se/mthprize/2007/ntais2007.pdf[greek], +http://snowball.tartarus.org/algorithms/hungarian/stemmer.html[hungarian], +http://snowball.tartarus.org/algorithms/italian/stemmer.html[italian], +http://snowball.tartarus.org/algorithms/kraaij_pohlmann/stemmer.html[kp], +http://ciir.cs.umass.edu/pubfiles/ir-35.pdf[kstem], +http://snowball.tartarus.org/algorithms/lovins/stemmer.html[lovins], +http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Flv%2FLatvianStemmer.html[latvian], 
+http://snowball.tartarus.org/algorithms/norwegian/stemmer.html[norwegian], +http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fno%2FNorwegianMinimalStemFilter.html[minimal_norwegian], +http://snowball.tartarus.org/algorithms/porter/stemmer.html[porter], +http://snowball.tartarus.org/algorithms/portuguese/stemmer.html[portuguese], +http://snowball.tartarus.org/algorithms/romanian/stemmer.html[romanian], +http://snowball.tartarus.org/algorithms/russian/stemmer.html[russian], +http://snowball.tartarus.org/algorithms/spanish/stemmer.html[spanish], +http://snowball.tartarus.org/algorithms/swedish/stemmer.html[swedish], +http://snowball.tartarus.org/algorithms/turkish/stemmer.html[turkish], +http://www.medialab.tfe.umu.se/courses/mdm0506a/material/fulltext_ID%3D10049387%26PLACEBO%3DIE.pdf[minimal_english], +http://lucene.apache.org/core/4_3_0/analyzers-common/index.html?org%2Fapache%2Flucene%2Fanalysis%2Fen%2FEnglishPossessiveFilter.html[possessive_english], +http://clef.isti.cnr.it/2003/WN_web/22.pdf[light_finish], +http://dl.acm.org/citation.cfm?id=1141523[light_french], +http://dl.acm.org/citation.cfm?id=318984[minimal_french], +http://dl.acm.org/citation.cfm?id=1141523[light_german], +http://members.unine.ch/jacques.savoy/clef/morpho.pdf[minimal_german], +http://computing.open.ac.uk/Sites/EACLSouthAsia/Papers/p6-Ramanathan.pdf[hindi], +http://dl.acm.org/citation.cfm?id=1141523&dl=ACM&coll=DL&CFID=179095584&CFTOKEN=80067181[light_hungarian], +http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf[indonesian], +http://www.ercim.eu/publication/ws-proceedings/CLEF2/savoy.pdf[light_italian], +http://dl.acm.org/citation.cfm?id=1141523&dl=ACM&coll=DL&CFID=179095584&CFTOKEN=80067181[light_portuguese], +http://www.inf.ufrgs.br/\~buriol/papers/Orengo_CLEF07.pdf[minimal_portuguese], +http://www.inf.ufrgs.br/\~viviane/rslp/index.htm[portuguese], +http://doc.rero.ch/lm.php?url=1000%2C43%2C4%2C20091209094227-CA%2FDolamic_Ljiljana_-_Indexing_and_Searching_Strategies_for_the_Russian_20091209.pdf[light_russian], +http://www.ercim.eu/publication/ws-proceedings/CLEF2/savoy.pdf[light_spanish], +http://clef.isti.cnr.it/2003/WN_web/22.pdf[light_swedish]. + +For example: + +[source,js] +-------------------------------------------------- +{ + "index" : { + "analysis" : { + "analyzer" : { + "my_analyzer" : { + "tokenizer" : "standard", + "filter" : ["standard", "lowercase", "my_stemmer"] + } + }, + "filter" : { + "my_stemmer" : { + "type" : "stemmer", + "name" : "light_german" + } + } + } + } +} +-------------------------------------------------- diff --git a/docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc new file mode 100644 index 00000000000..8701c171aa6 --- /dev/null +++ b/docs/reference/analysis/tokenfilters/stop-tokenfilter.asciidoc @@ -0,0 +1,33 @@ +[[analysis-stop-tokenfilter]] +=== Stop Token Filter + +A token filter of type `stop` that removes stop words from token +streams. + +The following are settings that can be set for a `stop` token filter +type: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`stopwords` |A list of stop words to use. Defaults to english stop +words. + +|`stopwords_path` |A path (either relative to `config` location, or +absolute) to a stopwords file configuration. Each stop word should be in +its own "line" (separated by a line break). 
The file must be UTF-8 +encoded. + +|`enable_position_increments` |Set to `true` if token positions should +record the removed stop words, `false` otherwise. Defaults to `true`. + +|`ignore_case` |Set to `true` to lower case all words first. Defaults to +`false`. +|======================================================================= + +stopwords allow for custom language specific expansion of default +stopwords. It follows the `_lang_` notation and supports: arabic, +armenian, basque, brazilian, bulgarian, catalan, czech, danish, dutch, +english, finnish, french, galician, german, greek, hindi, hungarian, +indonesian, italian, norwegian, persian, portuguese, romanian, russian, +spanish, swedish, turkish. diff --git a/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc new file mode 100644 index 00000000000..0fe0c460701 --- /dev/null +++ b/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc @@ -0,0 +1,124 @@ +[[analysis-synonym-tokenfilter]] +=== Synonym Token Filter + +The `synonym` token filter allows to easily handle synonyms during the +analysis process. Synonyms are configured using a configuration file. +Here is an example: + +[source,js] +-------------------------------------------------- +{ + "index" : { + "analysis" : { + "analyzer" : { + "synonym" : { + "tokenizer" : "whitespace", + "filter" : ["synonym"] + } + }, + "filter" : { + "synonym" : { + "type" : "synonym", + "synonyms_path" : "analysis/synonym.txt" + } + } + } + } +} +-------------------------------------------------- + +The above configures a `synonym` filter, with a path of +`analysis/synonym.txt` (relative to the `config` location). The +`synonym` analyzer is then configured with the filter. Additional +settings are: `ignore_case` (defaults to `false`), and `expand` +(defaults to `true`). + +The `tokenizer` parameter controls the tokenizers that will be used to +tokenize the synonym, and defaults to the `whitespace` tokenizer. + +As of elasticsearch 0.17.9 two synonym formats are supported: Solr, +WordNet. + +[float] +==== Solr synonyms + +The following is a sample format of the file: + +[source,js] +-------------------------------------------------- +# blank lines and lines starting with pound are comments. + +#Explicit mappings match any token sequence on the LHS of "=>" +#and replace with all alternatives on the RHS. These types of mappings +#ignore the expand parameter in the schema. +#Examples: +i-pod, i pod => ipod, +sea biscuit, sea biscit => seabiscuit + +#Equivalent synonyms may be separated with commas and give +#no explicit mapping. In this case the mapping behavior will +#be taken from the expand parameter in the schema. This allows +#the same synonym file to be used in different synonym handling strategies. +#Examples: +ipod, i-pod, i pod +foozball , foosball +universe , cosmos + +# If expand==true, "ipod, i-pod, i pod" is equivalent to the explicit mapping: +ipod, i-pod, i pod => ipod, i-pod, i pod +# If expand==false, "ipod, i-pod, i pod" is equivalent to the explicit mapping: +ipod, i-pod, i pod => ipod + +#multiple synonym mapping entries are merged. 
+foo => foo bar +foo => baz +#is equivalent to +foo => foo bar, baz +-------------------------------------------------- + +You can also define synonyms for the filter directly in the +configuration file (note use of `synonyms` instead of `synonyms_path`): + +[source,js] +-------------------------------------------------- +{ + "filter" : { + "synonym" : { + "type" : "synonym", + "synonyms" : [ + "i-pod, i pod => ipod", + "universe, cosmos" + ] + } + } +} +-------------------------------------------------- + +However, it is recommended to define large synonyms set in a file using +`synonyms_path`. + +[float] +==== WordNet synonyms + +Synonyms based on http://wordnet.princeton.edu/[WordNet] format can be +declared using `format`: + +[source,js] +-------------------------------------------------- +{ + "filter" : { + "synonym" : { + "type" : "synonym", + "format" : "wordnet", + "synonyms" : [ + "s(100000001,1,'abstain',v,1,0).", + "s(100000001,2,'refrain',v,1,0).", + "s(100000001,3,'desist',v,1,0)." + ] + } + } +} +-------------------------------------------------- + +Using `synonyms_path` to define WordNet synonyms in a file is supported +as well. diff --git a/docs/reference/analysis/tokenfilters/trim-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/trim-tokenfilter.asciidoc new file mode 100644 index 00000000000..34a0e93a3af --- /dev/null +++ b/docs/reference/analysis/tokenfilters/trim-tokenfilter.asciidoc @@ -0,0 +1,4 @@ +[[analysis-trim-tokenfilter]] +=== Trim Token Filter + +The `trim` token filter trims the whitespace surrounding a token. diff --git a/docs/reference/analysis/tokenfilters/truncate-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/truncate-tokenfilter.asciidoc new file mode 100644 index 00000000000..14652f46342 --- /dev/null +++ b/docs/reference/analysis/tokenfilters/truncate-tokenfilter.asciidoc @@ -0,0 +1,10 @@ +[[analysis-truncate-tokenfilter]] +=== Truncate Token Filter + +The `truncate` token filter can be used to truncate tokens into a +specific length. This can come in handy with keyword (single token) +based mapped fields that are used for sorting in order to reduce memory +usage. + +It accepts a `length` parameter which control the number of characters +to truncate to, defaults to `10`. diff --git a/docs/reference/analysis/tokenfilters/unique-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/unique-tokenfilter.asciidoc new file mode 100644 index 00000000000..8b42f6b73b9 --- /dev/null +++ b/docs/reference/analysis/tokenfilters/unique-tokenfilter.asciidoc @@ -0,0 +1,7 @@ +[[analysis-unique-tokenfilter]] +=== Unique Token Filter + +The `unique` token filter can be used to only index unique tokens during +analysis. By default it is applied on all the token stream. If +`only_on_same_position` is set to `true`, it will only remove duplicate +tokens on the same position. diff --git a/docs/reference/analysis/tokenfilters/word-delimiter-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/word-delimiter-tokenfilter.asciidoc new file mode 100644 index 00000000000..9ce81e1ac9f --- /dev/null +++ b/docs/reference/analysis/tokenfilters/word-delimiter-tokenfilter.asciidoc @@ -0,0 +1,80 @@ +[[analysis-word-delimiter-tokenfilter]] +=== Word Delimiter Token Filter + +Named `word_delimiter`, it Splits words into subwords and performs +optional transformations on subword groups. Words are split into +subwords with the following rules: + +* split on intra-word delimiters (by default, all non alpha-numeric +characters). 
+* "Wi-Fi" -> "Wi", "Fi" +* split on case transitions: "PowerShot" -> "Power", "Shot" +* split on letter-number transitions: "SD500" -> "SD", "500" +* leading and trailing intra-word delimiters on each subword are +ignored: "//hello---there, 'dude'" -> "hello", "there", "dude" +* trailing "'s" are removed for each subword: "O'Neil's" -> "O", "Neil" + +Parameters include: + +`generate_word_parts`:: + If `true` causes parts of words to be + generated: "PowerShot" => "Power" "Shot". Defaults to `true`. + +`generate_number_parts`:: + If `true` causes number subwords to be + generated: "500-42" => "500" "42". Defaults to `true`. + +`catenate_words`:: + If `true` causes maximum runs of word parts to be + catenated: "wi-fi" => "wifi". Defaults to `false`. + +`catenate_numbers`:: + If `true` causes maximum runs of number parts to + be catenated: "500-42" => "50042". Defaults to `false`. + +`catenate_all`:: + If `true` causes all subword parts to be catenated: + "wi-fi-4000" => "wifi4000". Defaults to `false`. + +`split_on_case_change`:: + If `true` causes "PowerShot" to be two tokens; + ("Power-Shot" remains two parts regards). Defaults to `true`. + +`preserve_original`:: + If `true` includes original words in subwords: + "500-42" => "500-42" "500" "42". Defaults to `false`. + +`split_on_numerics`:: + If `true` causes "j2se" to be three tokens; "j" + "2" "se". Defaults to `true`. + +`stem_english_possessive`:: + If `true` causes trailing "'s" to be + removed for each subword: "O'Neil's" => "O", "Neil". Defaults to `true`. + +Advance settings include: + +`protected_words`:: + A list of protected words from being delimiter. + Either an array, or also can set `protected_words_path` which resolved + to a file configured with protected words (one on each line). + Automatically resolves to `config/` based location if exists. + +`type_table`:: + A custom type mapping table, for example (when configured + using `type_table_path`): + +[source,js] +-------------------------------------------------- + # Map the $, %, '.', and ',' characters to DIGIT + # This might be useful for financial data. + $ => DIGIT + % => DIGIT + . => DIGIT + \\u002C => DIGIT + + # in some cases you might not want to split on ZWJ + # this also tests the case where we need a bigger byte[] + # see http://en.wikipedia.org/wiki/Zero-width_joiner + \\u200D => ALPHANUM +-------------------------------------------------- diff --git a/docs/reference/analysis/tokenizers.asciidoc b/docs/reference/analysis/tokenizers.asciidoc new file mode 100644 index 00000000000..3118b0dfb2b --- /dev/null +++ b/docs/reference/analysis/tokenizers.asciidoc @@ -0,0 +1,30 @@ +[[analysis-tokenizers]] +== Tokenizers + +Tokenizers are used to break a string down into a stream of terms +or tokens. A simple tokenizer might split the string up into terms +wherever it encounters whitespace or punctuation. + +Elasticsearch has a number of built in tokenizers which can be +used to build <>. 
+ +include::tokenizers/standard-tokenizer.asciidoc[] + +include::tokenizers/edgengram-tokenizer.asciidoc[] + +include::tokenizers/keyword-tokenizer.asciidoc[] + +include::tokenizers/letter-tokenizer.asciidoc[] + +include::tokenizers/lowercase-tokenizer.asciidoc[] + +include::tokenizers/ngram-tokenizer.asciidoc[] + +include::tokenizers/whitespace-tokenizer.asciidoc[] + +include::tokenizers/pattern-tokenizer.asciidoc[] + +include::tokenizers/uaxurlemail-tokenizer.asciidoc[] + +include::tokenizers/pathhierarchy-tokenizer.asciidoc[] + diff --git a/docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc new file mode 100644 index 00000000000..d2d0294f700 --- /dev/null +++ b/docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc @@ -0,0 +1,80 @@ +[[analysis-edgengram-tokenizer]] +=== Edge NGram Tokenizer + +A tokenizer of type `edgeNGram`. + +This tokenizer is very similar to `nGram` but only keeps n-grams which +start at the beginning of a token. + +The following are settings that can be set for a `edgeNGram` tokenizer +type: + +[cols="<,<,<",options="header",] +|======================================================================= +|Setting |Description |Default value +|`min_gram` |Minimum size in codepoints of a single n-gram |`1`. + +|`max_gram` |Maximum size in codepoints of a single n-gram |`2`. + +|`token_chars` |(Since `0.90.2`) Characters classes to keep in the +tokens, Elasticsearch will split on characters that don't belong to any +of these classes. |`[]` (Keep all characters) +|======================================================================= + + +`token_chars` accepts the following character classes: + +[horizontal] +`letter`:: for example `a`, `b`, `ï` or `京` +`digit`:: for example `3` or `7` +`whitespace`:: for example `" "` or `"\n"` +`punctuation`:: for example `!` or `"` +`symbol`:: for example `$` or `√` + +[float] +==== Example + +[source,js] +-------------------------------------------------- + curl -XPUT 'localhost:9200/test' -d ' + { + "settings" : { + "analysis" : { + "analyzer" : { + "my_edge_ngram_analyzer" : { + "tokenizer" : "my_edge_ngram_tokenizer" + } + }, + "tokenizer" : { + "my_edge_ngram_tokenizer" : { + "type" : "edgeNGram", + "min_gram" : "2", + "max_gram" : "5", + "token_chars": [ "letter", "digit" ] + } + } + } + } + }' + + curl 'localhost:9200/test/_analyze?pretty=1&analyzer=my_edge_ngram_analyzer' -d 'FC Schalke 04' + # FC, Sc, Sch, Scha, Schal, 04 +-------------------------------------------------- + +[float] +==== `side` deprecated + +There used to be a @side@ parameter up to `0.90.1` but it is now deprecated. In +order to emulate the behavior of `"side" : "BACK"` a +<> should be used together +with the <>. The +`edgeNGram` filter must be enclosed in `reverse` filters like this: + +[source,js] +-------------------------------------------------- + "filter" : ["reverse", "edgeNGram", "reverse"] +-------------------------------------------------- + +which essentially reverses the token, builds front `EdgeNGrams` and reverses +the ngram again. This has the same effect as the previous `"side" : "BACK"` setting. 
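Spelled out as index settings, this emulation might look like the
following sketch (the `back_edge_ngrams` filter and
`back_edge_ngram_analyzer` analyzer names, as well as the gram sizes, are
only illustrative):

[source,js]
--------------------------------------------------
{
    "settings" : {
        "analysis" : {
            "filter" : {
                "back_edge_ngrams" : {
                    "type" : "edgeNGram",
                    "min_gram" : 2,
                    "max_gram" : 5
                }
            },
            "analyzer" : {
                "back_edge_ngram_analyzer" : {
                    "tokenizer" : "standard",
                    "filter" : ["reverse", "back_edge_ngrams", "reverse"]
                }
            }
        }
    }
}
--------------------------------------------------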
+ diff --git a/docs/reference/analysis/tokenizers/keyword-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/keyword-tokenizer.asciidoc new file mode 100644 index 00000000000..be75f3dbfd1 --- /dev/null +++ b/docs/reference/analysis/tokenizers/keyword-tokenizer.asciidoc @@ -0,0 +1,15 @@ +[[analysis-keyword-tokenizer]] +=== Keyword Tokenizer + +A tokenizer of type `keyword` that emits the entire input as a single +input. + +The following are settings that can be set for a `keyword` tokenizer +type: + +[cols="<,<",options="header",] +|======================================================= +|Setting |Description +|`buffer_size` |The term buffer size. Defaults to `256`. +|======================================================= + diff --git a/docs/reference/analysis/tokenizers/letter-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/letter-tokenizer.asciidoc new file mode 100644 index 00000000000..03025ccd303 --- /dev/null +++ b/docs/reference/analysis/tokenizers/letter-tokenizer.asciidoc @@ -0,0 +1,7 @@ +[[analysis-letter-tokenizer]] +=== Letter Tokenizer + +A tokenizer of type `letter` that divides text at non-letters. That's to +say, it defines tokens as maximal strings of adjacent letters. Note, +this does a decent job for most European languages, but does a terrible +job for some Asian languages, where words are not separated by spaces. diff --git a/docs/reference/analysis/tokenizers/lowercase-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/lowercase-tokenizer.asciidoc new file mode 100644 index 00000000000..655ef813304 --- /dev/null +++ b/docs/reference/analysis/tokenizers/lowercase-tokenizer.asciidoc @@ -0,0 +1,15 @@ +[[analysis-lowercase-tokenizer]] +=== Lowercase Tokenizer + +A tokenizer of type `lowercase` that performs the function of +<> and +<> together. It divides text at non-letters and converts +them to lower case. While it is functionally equivalent to the +combination of +<> and +<>, there is a performance advantage to doing the two +tasks at once, hence this (redundant) implementation. diff --git a/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc new file mode 100644 index 00000000000..c2e6f75a554 --- /dev/null +++ b/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc @@ -0,0 +1,57 @@ +[[analysis-ngram-tokenizer]] +=== NGram Tokenizer + +A tokenizer of type `nGram`. + +The following are settings that can be set for a `nGram` tokenizer type: + +[cols="<,<,<",options="header",] +|======================================================================= +|Setting |Description |Default value +|`min_gram` |Minimum size in codepoints of a single n-gram |`1`. + +|`max_gram` |Maximum size in codepoints of a single n-gram |`2`. + +|`token_chars` |(Since `0.90.2`) Characters classes to keep in the +tokens, Elasticsearch will split on characters that don't belong to any +of these classes. 
|`[]` (Keep all characters) +|======================================================================= + +`token_chars` accepts the following character classes: + +[horizontal] +`letter`:: for example `a`, `b`, `ï` or `京` +`digit`:: for example `3` or `7` +`whitespace`:: for example `" "` or `"\n"` +`punctuation`:: for example `!` or `"` +`symbol`:: for example `$` or `√` + +[float] +==== Example + +[source,js] +-------------------------------------------------- + curl -XPUT 'localhost:9200/test' -d ' + { + "settings" : { + "analysis" : { + "analyzer" : { + "my_ngram_analyzer" : { + "tokenizer" : "my_ngram_tokenizer" + } + }, + "tokenizer" : { + "my_ngram_tokenizer" : { + "type" : "nGram", + "min_gram" : "2", + "max_gram" : "3", + "token_chars": [ "letter", "digit" ] + } + } + } + } + }' + + curl 'localhost:9200/test/_analyze?pretty=1&analyzer=my_ngram_analyzer' -d 'FC Schalke 04' + # FC, Sc, Sch, ch, cha, ha, hal, al, alk, lk, lke, ke, 04 +-------------------------------------------------- diff --git a/docs/reference/analysis/tokenizers/pathhierarchy-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/pathhierarchy-tokenizer.asciidoc new file mode 100644 index 00000000000..e6876f55bc6 --- /dev/null +++ b/docs/reference/analysis/tokenizers/pathhierarchy-tokenizer.asciidoc @@ -0,0 +1,32 @@ +[[analysis-pathhierarchy-tokenizer]] +=== Path Hierarchy Tokenizer + +The `path_hierarchy` tokenizer takes something like this: + +------------------------- +/something/something/else +------------------------- + +And produces tokens: + +------------------------- +/something +/something/something +/something/something/else +------------------------- + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`delimiter` |The character delimiter to use, defaults to `/`. + +|`replacement` |An optional replacement character to use. Defaults to +the `delimiter`. + +|`buffer_size` |The buffer size to use, defaults to `1024`. + +|`reverse` |Generates tokens in reverse order, defaults to `false`. + +|`skip` |Controls initial tokens to skip, defaults to `0`. +|======================================================================= + diff --git a/docs/reference/analysis/tokenizers/pattern-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/pattern-tokenizer.asciidoc new file mode 100644 index 00000000000..72ca6041020 --- /dev/null +++ b/docs/reference/analysis/tokenizers/pattern-tokenizer.asciidoc @@ -0,0 +1,29 @@ +[[analysis-pattern-tokenizer]] +=== Pattern Tokenizer + +A tokenizer of type `pattern` that can flexibly separate text into terms +via a regular expression. Accepts the following settings: + +[cols="<,<",options="header",] +|====================================================================== +|Setting |Description +|`pattern` |The regular expression pattern, defaults to `\\W+`. +|`flags` |The regular expression flags. +|`group` |Which group to extract into tokens. Defaults to `-1` (split). +|====================================================================== + +*IMPORTANT*: The regular expression should match the *token separators*, +not the tokens themselves. + +`group` set to `-1` (the default) is equivalent to "split". Using group +>= 0 selects the matching group as the token. For example, if you have: + +------------------------ +pattern = \\'([^\']+)\\' +group = 0 +input = aaa 'bbb' 'ccc' +------------------------ + +the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). 
+With the same input but using group=1, the output would be: bbb and ccc +(no ' marks). diff --git a/docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc new file mode 100644 index 00000000000..c8b405bf820 --- /dev/null +++ b/docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc @@ -0,0 +1,18 @@ +[[analysis-standard-tokenizer]] +=== Standard Tokenizer + +A tokenizer of type `standard` providing grammar based tokenizer that is +a good tokenizer for most European language documents. The tokenizer +implements the Unicode Text Segmentation algorithm, as specified in +http://unicode.org/reports/tr29/[Unicode Standard Annex #29]. + +The following are settings that can be set for a `standard` tokenizer +type: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`max_token_length` |The maximum token length. If a token is seen that +exceeds this length then it is discarded. Defaults to `255`. +|======================================================================= + diff --git a/docs/reference/analysis/tokenizers/uaxurlemail-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/uaxurlemail-tokenizer.asciidoc new file mode 100644 index 00000000000..9ed28e60b91 --- /dev/null +++ b/docs/reference/analysis/tokenizers/uaxurlemail-tokenizer.asciidoc @@ -0,0 +1,16 @@ +[[analysis-uaxurlemail-tokenizer]] +=== UAX Email URL Tokenizer + +A tokenizer of type `uax_url_email` which works exactly like the +`standard` tokenizer, but tokenizes emails and urls as single tokens. + +The following are settings that can be set for a `uax_url_email` +tokenizer type: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`max_token_length` |The maximum token length. If a token is seen that +exceeds this length then it is discarded. Defaults to `255`. +|======================================================================= + diff --git a/docs/reference/analysis/tokenizers/whitespace-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/whitespace-tokenizer.asciidoc new file mode 100644 index 00000000000..f0e1ce28a12 --- /dev/null +++ b/docs/reference/analysis/tokenizers/whitespace-tokenizer.asciidoc @@ -0,0 +1,4 @@ +[[analysis-whitespace-tokenizer]] +=== Whitespace Tokenizer + +A tokenizer of type `whitespace` that divides text at whitespace. diff --git a/docs/reference/cluster.asciidoc b/docs/reference/cluster.asciidoc new file mode 100644 index 00000000000..b14fb654a7a --- /dev/null +++ b/docs/reference/cluster.asciidoc @@ -0,0 +1,46 @@ +[[cluster]] += Cluster APIs + +[partintro] +-- +["float",id="cluster-nodes"] +== Nodes + +Most cluster level APIs allow to specify which nodes to execute on (for +example, getting the node stats for a node). Nodes can be identified in +the APIs either using their internal node id, the node name, address, +custom attributes, or just the `_local` node receiving the request. 
For +example, here are some sample executions of nodes info: + +[source,js] +-------------------------------------------------- +# Local +curl localhost:9200/_cluster/nodes/_local +# Address +curl localhost:9200/_cluster/nodes/10.0.0.3,10.0.0.4 +curl localhost:9200/_cluster/nodes/10.0.0.* +# Names +curl localhost:9200/_cluster/nodes/node_name_goes_here +curl localhost:9200/_cluster/nodes/node_name_goes_* +# Attributes (set something like node.rack: 2 in the config) +curl localhost:9200/_cluster/nodes/rack:2 +curl localhost:9200/_cluster/nodes/ra*:2 +curl localhost:9200/_cluster/nodes/ra*:2* +-------------------------------------------------- +-- + +include::cluster/health.asciidoc[] + +include::cluster/state.asciidoc[] + +include::cluster/reroute.asciidoc[] + +include::cluster/update-settings.asciidoc[] + +include::cluster/nodes-stats.asciidoc[] + +include::cluster/nodes-info.asciidoc[] + +include::cluster/nodes-hot-threads.asciidoc[] + +include::cluster/nodes-shutdown.asciidoc[] diff --git a/docs/reference/cluster/health.asciidoc b/docs/reference/cluster/health.asciidoc new file mode 100644 index 00000000000..d4b555f1b86 --- /dev/null +++ b/docs/reference/cluster/health.asciidoc @@ -0,0 +1,86 @@ +[[cluster-health]] +== Cluster Health + +The cluster health API allows to get a very simple status on the health +of the cluster. + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/_cluster/health?pretty=true' +{ + "cluster_name" : "testcluster", + "status" : "green", + "timed_out" : false, + "number_of_nodes" : 2, + "number_of_data_nodes" : 2, + "active_primary_shards" : 5, + "active_shards" : 10, + "relocating_shards" : 0, + "initializing_shards" : 0, + "unassigned_shards" : 0 +} +-------------------------------------------------- + +The API can also be executed against one or more indices to get just the +specified indices health: + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/_cluster/health/test1,test2' +-------------------------------------------------- + +The cluster health status is: `green`, `yellow` or `red`. On the shard +level, a `red` status indicates that the specific shard is not allocated +in the cluster, `yellow` means that the primary shard is allocated but +replicas are not, and `green` means that all shards are allocated. The +index level status is controlled by the worst shard status. The cluster +status is controlled by the worst index status. + +One of the main benefits of the API is the ability to wait until the +cluster reaches a certain high water-mark health level. For example, the +following will wait till the cluster reaches the `yellow` level for 50 +seconds (if it reaches the `green` or `yellow` status beforehand, it +will return): + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=50s' +-------------------------------------------------- + +[float] +=== Request Parameters + +The cluster health API accepts the following request parameters: + +`level`:: + Can be one of `cluster`, `indices` or `shards`. Controls the + details level of the health information returned. Defaults to `cluster`. + +`wait_for_status`:: + One of `green`, `yellow` or `red`. Will wait (until + the timeout provided) until the status of the cluster changes to the one + provided. By default, will not wait for any status. 
+ +`wait_for_relocating_shards`:: + A number controlling to how many relocating + shards to wait for. Usually will be `0` to indicate to wait till all + relocation have happened. Defaults to not to wait. + +`wait_for_nodes`:: + The request waits until the specified number `N` of + nodes is available. It also accepts `>=N`, `<=N`, `>N` and `>. + +By default, it just returns the attributes and core settings for a node. +It also allows to get information on `settings`, `os`, `process`, `jvm`, +`thread_pool`, `network`, `transport`, `http` and `plugin`: + +[source,js] +-------------------------------------------------- +curl -XGET 'http://localhost:9200/_nodes?os=true&process=true' +curl -XGET 'http://localhost:9200/_nodes/10.0.0.1/?os=true&process=true' + +# Or, specific type endpoint: + +curl -XGET 'http://localhost:9200/_nodes/process' +curl -XGET 'http://localhost:9200/_nodes/10.0.0.1/process' +-------------------------------------------------- + +The `all` flag can be set to return all the information. + +`plugin` - if set, the result will contain details about the loaded +plugins per node: + +* `name`: plugin name +* `description`: plugin description if any +* `site`: `true` if the plugin is a site plugin +* `jvm`: `true` if the plugin is a plugin running in the JVM +* `url`: URL if the plugin is a site plugin + +The result will look similar to: + +[source,js] +-------------------------------------------------- +{ + "ok" : true, + "cluster_name" : "test-cluster-MacBook-Air-de-David.local", + "nodes" : { + "hJLXmY_NTrCytiIMbX4_1g" : { + "name" : "node4", + "transport_address" : "inet[/172.18.58.139:9303]", + "hostname" : "MacBook-Air-de-David.local", + "version" : "0.90.0.Beta2-SNAPSHOT", + "http_address" : "inet[/172.18.58.139:9203]", + "plugins" : [ { + "name" : "test-plugin", + "description" : "test-plugin description", + "site" : true, + "jvm" : false + }, { + "name" : "test-no-version-plugin", + "description" : "test-no-version-plugin description", + "site" : true, + "jvm" : false + }, { + "name" : "dummy", + "description" : "No description found for dummy.", + "url" : "/_plugin/dummy/", + "site" : false, + "jvm" : true + } ] + } + } +} +-------------------------------------------------- + +if your `plugin` data is subject to change use +`plugins.info_refresh_interval` to change or disable the caching +interval: + +[source,js] +-------------------------------------------------- +# Change cache to 20 seconds +plugins.info_refresh_interval: 20s + +# Infinite cache +plugins.info_refresh_interval: -1 + +# Disable cache +plugins.info_refresh_interval: 0 +-------------------------------------------------- diff --git a/docs/reference/cluster/nodes-shutdown.asciidoc b/docs/reference/cluster/nodes-shutdown.asciidoc new file mode 100644 index 00000000000..2fb98ecd4d1 --- /dev/null +++ b/docs/reference/cluster/nodes-shutdown.asciidoc @@ -0,0 +1,56 @@ +[[cluster-nodes-shutdown]] +== Nodes Shutdown + +The nodes shutdown API allows to shutdown one or more (or all) nodes in +the cluster. 
Here is an example of shutting the `_local` node the +request is directed to: + +[source,js] +-------------------------------------------------- +$ curl -XPOST 'http://localhost:9200/_cluster/nodes/_local/_shutdown' +-------------------------------------------------- + +Specific node(s) can be shutdown as well using their respective node ids +(or other selective options as explained +<> .): + +[source,js] +-------------------------------------------------- +$ curl -XPOST 'http://localhost:9200/_cluster/nodes/nodeId1,nodeId2/_shutdown' +-------------------------------------------------- + +The master (of the cluster) can also be shutdown using: + +[source,js] +-------------------------------------------------- +$ curl -XPOST 'http://localhost:9200/_cluster/nodes/_master/_shutdown' +-------------------------------------------------- + +Finally, all nodes can be shutdown using one of the options below: + +[source,js] +-------------------------------------------------- +$ curl -XPOST 'http://localhost:9200/_shutdown' + +$ curl -XPOST 'http://localhost:9200/_cluster/nodes/_shutdown' + +$ curl -XPOST 'http://localhost:9200/_cluster/nodes/_all/_shutdown' +-------------------------------------------------- + +[float] +=== Delay + +By default, the shutdown will be executed after a 1 second delay (`1s`). +The delay can be customized by setting the `delay` parameter in a time +value format. For example: + +[source,js] +-------------------------------------------------- +$ curl -XPOST 'http://localhost:9200/_cluster/nodes/_local/_shutdown?delay=10s' +-------------------------------------------------- + +[float] +=== Disable Shutdown + +The shutdown API can be disabled by setting `action.disable_shutdown` in +the node configuration. diff --git a/docs/reference/cluster/nodes-stats.asciidoc b/docs/reference/cluster/nodes-stats.asciidoc new file mode 100644 index 00000000000..df06c57b8d3 --- /dev/null +++ b/docs/reference/cluster/nodes-stats.asciidoc @@ -0,0 +1,100 @@ +[[cluster-nodes-stats]] +== Nodes Stats + +[float] +=== Nodes statistics + +The cluster nodes stats API allows to retrieve one or more (or all) of +the cluster nodes statistics. + +[source,js] +-------------------------------------------------- +curl -XGET 'http://localhost:9200/_cluster/nodes/stats' +curl -XGET 'http://localhost:9200/_cluster/nodes/nodeId1,nodeId2/stats' + +# simplified +curl -XGET 'http://localhost:9200/_nodes/stats' +curl -XGET 'http://localhost:9200/_nodes/nodeId1,nodeId2/stats' +-------------------------------------------------- + +The first command retrieves stats of all the nodes in the cluster. The +second command selectively retrieves nodes stats of only `nodeId1` and +`nodeId2`. All the nodes selective options are explained +<>. + +By default, `indices` stats are returned. With options for `indices`, +`os`, `process`, `jvm`, `network`, `transport`, `http`, `fs`, and +`thread_pool`. 
For example: + +[horizontal] +`indices`:: + Indices stats about size, document count, indexing and + deletion times, search times, field cache size , merges and flushes + +`fs`:: + File system information, data path, free disk space, read/write + stats + +`http`:: + HTTP connection information + +`jvm`:: + JVM stats, memory pool information, garbage collection, buffer + pools + +`network`:: + TCP information + +`os`:: + Operating system stats, load average, cpu, mem, swap + +`process`:: + Process statistics, memory consumption, cpu usage, open + file descriptors + +`thread_pool`:: + Statistics about each thread pool, including current + size, queue and rejected tasks + +`transport`:: + Transport statistics about sent and received bytes in + cluster communication + +`clear`:: + Clears all the flags (first). Useful, if you only want to + retrieve specific stats. + +[source,js] +-------------------------------------------------- +# return indices and os +curl -XGET 'http://localhost:9200/_nodes/stats?os=true' +# return just os and process +curl -XGET 'http://localhost:9200/_nodes/stats?clear=true&os=true&process=true' +# specific type endpoint +curl -XGET 'http://localhost:9200/_nodes/process/stats' +curl -XGET 'http://localhost:9200/_nodes/10.0.0.1/process/stats' +# or, if you like the other way +curl -XGET 'http://localhost:9200/_nodes/stats/process' +curl -XGET 'http://localhost:9200/_nodes/10.0.0.1/stats/process' +-------------------------------------------------- + +The `all` flag can be set to return all the stats. + +[float] +=== Field data statistics + +From 0.90, you can get information about field data memory usage on node +level or on index level. + +[source,js] +-------------------------------------------------- +# Node Stats +curl localhost:9200/_nodes/stats/indices/fielddata/field1,field2?pretty + +# Indices Stat +curl localhost:9200/_stats/fielddata/field1,field2?pretty + +# You can use wildcards for field names +curl localhost:9200/_stats/fielddata/field*?pretty +curl localhost:9200/_nodes/stats/indices/fielddata/field*?pretty +-------------------------------------------------- diff --git a/docs/reference/cluster/reroute.asciidoc b/docs/reference/cluster/reroute.asciidoc new file mode 100644 index 00000000000..8348edb1fd4 --- /dev/null +++ b/docs/reference/cluster/reroute.asciidoc @@ -0,0 +1,68 @@ +[[cluster-reroute]] +== Cluster Reroute + +The reroute command allows to explicitly execute a cluster reroute +allocation command including specific commands. For example, a shard can +be moved from one node to another explicitly, an allocation can be +canceled, or an unassigned shard can be explicitly allocated on a +specific node. + +Here is a short example of how a simple reroute API call: + +[source,js] +-------------------------------------------------- + +curl -XPOST 'localhost:9200/_cluster/reroute' -d '{ + "commands" : [ { + "move" : + { + "index" : "test", "shard" : 0, + "from_node" : "node1", "to_node" : "node2" + } + }, + { + "allocate" : { + "index" : "test", "shard" : 1, "node" : "node3" + } + } + ] +}' +-------------------------------------------------- + +An important aspect to remember is the fact that once when an allocation +occurs, the cluster will aim at re-balancing its state back to an even +state. For example, if the allocation includes moving a shard from +`node1` to `node2`, in an `even` state, then another shard will be moved +from `node2` to `node1` to even things out. 
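+
+To preview the effect of such explicit moves, including the re-balancing that
+follows them, the same request can be issued with the `dry_run` flag described
+below. A minimal sketch (the exact flag syntax is assumed here):
+
+[source,js]
+--------------------------------------------------
+curl -XPOST 'localhost:9200/_cluster/reroute?dry_run=true' -d '{
+    "commands" : [ {
+        "move" : {
+            "index" : "test", "shard" : 0,
+            "from_node" : "node1", "to_node" : "node2"
+        }
+    } ]
+}'
+--------------------------------------------------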
+ +The cluster can be set to disable allocations, which means that only the +explicitly allocations will be performed. Obviously, only once all +commands has been applied, the cluster will aim to be re-balance its +state. + +Another option is to run the commands in `dry_run` (as a URI flag, or in +the request body). This will cause the commands to apply to the current +cluster state, and return the resulting cluster after the commands (and +re-balancing) has been applied. + +The commands supported are: + +`move`:: + Move a started shard from one node to another node. Accepts + `index` and `shard` for index name and shard number, `from_node` for the + node to move the shard `from`, and `to_node` for the node to move the + shard to. + +`cancel`:: + Cancel allocation of a shard (or recovery). Accepts `index` + and `shard` for index name and shard number, and `node` for the node to + cancel the shard allocation on. It also accepts `allow_primary` flag to + explicitly specify that it is allowed to cancel allocation for a primary + shard. + +`allocate`:: + Allocate an unassigned shard to a node. Accepts the + `index` and `shard` for index name and shard number, and `node` to + allocate the shard to. It also accepts `allow_primary` flag to + explicitly specify that it is allowed to explicitly allocate a primary + shard (might result in data loss). diff --git a/docs/reference/cluster/state.asciidoc b/docs/reference/cluster/state.asciidoc new file mode 100644 index 00000000000..e7b9360e9cf --- /dev/null +++ b/docs/reference/cluster/state.asciidoc @@ -0,0 +1,48 @@ +[[cluster-state]] +== Cluster State + +The cluster state API allows to get a comprehensive state information of +the whole cluster. + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/_cluster/state' +-------------------------------------------------- + +By default, the cluster state request is routed to the master node, to +ensure that the latest cluster state is returned. +For debugging purposes, you can retrieve the cluster state local to a +particular node by adding `local=true` to the query string. + +[float] +=== Response Filters + +It is possible to filter the cluster state response using the following +REST parameters: + +`filter_nodes`:: + Set to `true` to filter out the `nodes` part of the + response. + +`filter_routing_table`:: + Set to `true` to filter out the `routing_table` + part of the response. + +`filter_metadata`:: + Set to `true` to filter out the `metadata` part of the + response. + +`filter_blocks`:: + Set to `true` to filter out the `blocks` part of the + response. + +`filter_indices`:: + When not filtering metadata, a comma separated list of + indices to include in the response. + +Example follows: + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/_cluster/state?filter_nodes=true' +-------------------------------------------------- diff --git a/docs/reference/cluster/update-settings.asciidoc b/docs/reference/cluster/update-settings.asciidoc new file mode 100644 index 00000000000..a2a82124893 --- /dev/null +++ b/docs/reference/cluster/update-settings.asciidoc @@ -0,0 +1,198 @@ +[[cluster-update-settings]] +== Cluster Update Settings + +Allows to update cluster wide specific settings. Settings updated can +either be persistent (applied cross restarts) or transient (will not +survive a full cluster restart). 
Here is an example: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/_cluster/settings -d '{ + "persistent" : { + "discovery.zen.minimum_master_nodes" : 2 + } +}' +-------------------------------------------------- + +Or: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/_cluster/settings -d '{ + "transient" : { + "discovery.zen.minimum_master_nodes" : 2 + } +}' +-------------------------------------------------- + +The cluster responds with the settings updated. So the response for the +last example will be: + +[source,js] +-------------------------------------------------- +{ + "persistent" : {}, + "transient" : { + "discovery.zen.minimum_master_nodes" : "2" + } +}' +-------------------------------------------------- + +Cluster wide settings can be returned using: + +[source,js] +-------------------------------------------------- +curl -XGET localhost:9200/_cluster/settings +-------------------------------------------------- + +There is a specific list of settings that can be updated, those include: + +[float] +=== Cluster settings + +[float] +==== Routing allocation + +[float] +===== Awareness + +`cluster.routing.allocation.awareness.attributes`:: + See <>. + +`cluster.routing.allocation.awareness.force.*`:: + See <>. + +[float] +===== Balanced Shards + +`cluster.routing.allocation.balance.shard`:: + Defines the weight factor for shards allocated on a node + (float). Defaults to `0.45f`. + +`cluster.routing.allocation.balance.index`:: + Defines a factor to the number of shards per index allocated + on a specific node (float). Defaults to `0.5f`. + +`cluster.routing.allocation.balance.primary`:: + defines a weight factor for the number of primaries of a specific index + allocated on a node (float). `0.05f`. + +`cluster.routing.allocation.balance.threshold`:: + minimal optimization value of operations that should be performed (non + negative float). Defaults to `1.0f`. + +[float] +===== Concurrent Rebalance + +`cluster.routing.allocation.cluster_concurrent_rebalance`:: + Allow to control how many concurrent rebalancing of shards are + allowed cluster wide, and default it to `2` (integer). `-1` for + unlimited. See also <>. + +[float] +===== Disable allocation + +`cluster.routing.allocation.disable_allocation`:: + See <>. + +`cluster.routing.allocation.disable_replica_allocation`:: + See <>. + +`cluster.routing.allocation.disable_new_allocation`:: + See <>. + +[float] +===== Throttling allocation + +`cluster.routing.allocation.node_initial_primaries_recoveries`:: + See <>. + +`cluster.routing.allocation.node_concurrent_recoveries`:: + See <>. + +[float] +===== Filter allocation + +`cluster.routing.allocation.include.*`:: + See <>. + +`cluster.routing.allocation.exclude.*`:: + See <>. + +`cluster.routing.allocation.require.*` (from 0.90):: + See <>. + +[float] +==== Metadata + +`cluster.blocks.read_only`:: + Have the whole cluster read only (indices do not accept write operations), metadata is not allowed to be modified (create or delete indices). 
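+
+For example, the read only block can be applied, and later removed, through
+the same settings endpoint shown at the top of this page; a short sketch:
+
+[source,js]
+--------------------------------------------------
+curl -XPUT localhost:9200/_cluster/settings -d '{
+    "transient" : {
+        "cluster.blocks.read_only" : true
+    }
+}'
+--------------------------------------------------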
+ +[float] +==== Discovery + +`discovery.zen.minimum_master_nodes`:: + See <> + +[float] +==== Threadpools + +`threadpool.*`:: + See <> + +[float] +=== Index settings + +[float] +==== Index filter cache + +`indices.cache.filter.size`:: + See <> + +`indices.cache.filter.expire` (time):: + See <> + +[float] +==== TTL interval + +`indices.ttl.interval` (time):: + See <> + +[float] +==== Recovery + +`indices.recovery.concurrent_streams`:: + See <> + +`indices.recovery.file_chunk_size`:: + See <> + +`indices.recovery.translog_ops`:: + See <> + +`indices.recovery.translog_size`:: + See <> + +`indices.recovery.compress`:: + See <> + +`indices.recovery.max_bytes_per_sec`:: + Since 0.90.1. See <> + +`indices.recovery.max_size_per_sec`:: + Deprecated since 0.90.1. See `max_bytes_per_sec` instead. + +[float] +==== Store level throttling + +`indices.store.throttle.type`:: + See <> + +`indices.store.throttle.max_bytes_per_sec`:: + See <> + +[float] +=== Logger + +Logger values can also be updated by setting `logger.` prefix. More +settings will be allowed to be updated. diff --git a/docs/reference/common-options.asciidoc b/docs/reference/common-options.asciidoc new file mode 100644 index 00000000000..9156b9a0791 --- /dev/null +++ b/docs/reference/common-options.asciidoc @@ -0,0 +1,45 @@ +[[search-common-options]] +== Common Options + +=== Pretty Results + +When appending `?pretty=true` to any request made, the JSON returned +will be pretty formatted (use it for debugging only!). Another option is +to set `format=yaml` which will cause the result to be returned in the +(sometimes) more readable yaml format. + +=== Parameters + +Rest parameters (when using HTTP, map to HTTP URL parameters) follow the +convention of using underscore casing. + +=== Boolean Values + +All REST APIs parameters (both request parameters and JSON body) support +providing boolean "false" as the values: `false`, `0`, `no` and `off`. +All other values are considered "true". Note, this is not related to +fields within a document indexed treated as boolean fields. + +=== Number Values + +All REST APIs support providing numbered parameters as `string` on top +of supporting the native JSON number types. + +=== Result Casing + +All REST APIs accept the `case` parameter. When set to `camelCase`, all +field names in the result will be returned in camel casing, otherwise, +underscore casing will be used. Note, this does not apply to the source +document indexed. + +=== JSONP + +All REST APIs accept a `callback` parameter resulting in a +http://en.wikipedia.org/wiki/JSONP[JSONP] result. + +=== Request body in query string + +For libraries that don't accept a request body for non-POST requests, +you can pass the request body as the `source` query string parameter +instead. + diff --git a/docs/reference/docs.asciidoc b/docs/reference/docs.asciidoc new file mode 100644 index 00000000000..754019ddeb6 --- /dev/null +++ b/docs/reference/docs.asciidoc @@ -0,0 +1,31 @@ +[[docs]] += Document APIs + +[partintro] +-- + +This section describes the REST APIs *elasticsearch* provides (mainly) +using JSON. The API is exposed using +<>, +<>, +<>. 
+ +-- + +include::docs/index_.asciidoc[] + +include::docs/get.asciidoc[] + +include::docs/delete.asciidoc[] + +include::docs/update.asciidoc[] + +include::docs/multi-get.asciidoc[] + +include::docs/bulk.asciidoc[] + +include::docs/delete-by-query.asciidoc[] + +include::docs/bulk-udp.asciidoc[] + + diff --git a/docs/reference/docs/bulk-udp.asciidoc b/docs/reference/docs/bulk-udp.asciidoc new file mode 100644 index 00000000000..74565a396a0 --- /dev/null +++ b/docs/reference/docs/bulk-udp.asciidoc @@ -0,0 +1,57 @@ +[[docs-bulk-udp]] +== Bulk UDP API + +A Bulk UDP service is a service listening over UDP for bulk format +requests. The idea is to provide a low latency UDP service that allows +to easily index data that is not of critical nature. + +The Bulk UDP service is disabled by default, but can be enabled by +setting `bulk.udp.enabled` to `true`. + +The bulk UDP service performs internal bulk aggregation of the data and +then flushes it based on several parameters: + +`bulk.udp.bulk_actions`:: + The number of actions to flush a bulk after, + defaults to `1000`. + +`bulk.udp.bulk_size`:: + The size of the current bulk request to flush + the request once exceeded, defaults to `5mb`. + +`bulk.udp.flush_interval`:: + An interval after which the current + request is flushed, regardless of the above limits. Defaults to `5s`. +`bulk.udp.concurrent_requests`:: + The number on max in flight bulk + requests allowed. Defaults to `4`. + +The allowed network settings are: + +`bulk.udp.host`:: + The host to bind to, defaults to `network.host` + which defaults to any. + +`bulk.udp.port`:: + The port to use, defaults to `9700-9800`. + +`bulk.udp.receive_buffer_size`:: + The receive buffer size, defaults to `10mb`. + +Here is an example of how it can be used: + +[source,js] +-------------------------------------------------- +> cat bulk.txt +{ "index" : { "_index" : "test", "_type" : "type1" } } +{ "field1" : "value1" } +{ "index" : { "_index" : "test", "_type" : "type1" } } +{ "field1" : "value1" } +-------------------------------------------------- + +[source,js] +-------------------------------------------------- +> cat bulk.txt | nc -w 0 -u localhost 9700 +-------------------------------------------------- + + diff --git a/docs/reference/docs/bulk.asciidoc b/docs/reference/docs/bulk.asciidoc new file mode 100644 index 00000000000..0ba4230459c --- /dev/null +++ b/docs/reference/docs/bulk.asciidoc @@ -0,0 +1,174 @@ +[[docs-bulk]] +== Bulk API + +The bulk API makes it possible to perform many index/delete operations +in a single API call. This can greatly increase the indexing speed. The +REST API endpoint is `/_bulk`, and it expects the following JSON +structure: + +[source,js] +-------------------------------------------------- +action_and_meta_data\n +optional_source\n +action_and_meta_data\n +optional_source\n +.... +action_and_meta_data\n +optional_source\n +-------------------------------------------------- + +*NOTE*: the final line of data must end with a newline character `\n`. + +The possible actions are `index`, `create`, `delete` and since version +`0.90.1` also `update`. `index` and `create` expect a source on the next +line, and have the same semantics as the `op_type` parameter to the +standard index API (i.e. create will fail if a document with the same +index and type exists already, whereas index will add or replace a +document as necessary). `delete` does not expect a source on the +following line, and has the same semantics as the standard delete API. 
+`update` expects that the partial doc, upsert and script and its options +are specified on the next line. + +If you're providing text file input to `curl`, you *must* use the +`--data-binary` flag instead of plain `-d`. The latter doesn't preserve +newlines. Example: + +[source,js] +-------------------------------------------------- +$ cat requests +{ "index" : { "_index" : "test", "_type" : "type1", "_id" : "1" } } +{ "field1" : "value1" } +$ curl -s -XPOST localhost:9200/_bulk --data-binary @requests; echo +{"took":7,"items":[{"create":{"_index":"test","_type":"type1","_id":"1","_version":1,"ok":true}}]} +-------------------------------------------------- + +Because this format uses literal `\n`'s as delimiters, please be sure +that the JSON actions and sources are not pretty printed. Here is an +example of a correct sequence of bulk commands: + +[source,js] +-------------------------------------------------- +{ "index" : { "_index" : "test", "_type" : "type1", "_id" : "1" } } +{ "field1" : "value1" } +{ "delete" : { "_index" : "test", "_type" : "type1", "_id" : "2" } } +{ "create" : { "_index" : "test", "_type" : "type1", "_id" : "3" } } +{ "field1" : "value3" } +{ "update" : {"_id" : "1", "_type" : "type1", "_index" : "index1"} } +{ "doc" : {"field2" : "value2"} } +-------------------------------------------------- + +In the above example `doc` for the `update` action is a partial +document, that will be merged with the already stored document. + +The endpoints are `/_bulk`, `/{index}/_bulk`, and `{index}/type/_bulk`. +When the index or the index/type are provided, they will be used by +default on bulk items that don't provide them explicitly. + +A note on the format. The idea here is to make processing of this as +fast as possible. As some of the actions will be redirected to other +shards on other nodes, only `action_meta_data` is parsed on the +receiving node side. + +Client libraries using this protocol should try and strive to do +something similar on the client side, and reduce buffering as much as +possible. + +The response to a bulk action is a large JSON structure with the +individual results of each action that was performed. The failure of a +single action does not affect the remaining actions. + +There is no "correct" number of actions to perform in a single bulk +call. You should experiment with different settings to find the optimum +size for your particular workload. + +If using the HTTP API, make sure that the client does not send HTTP +chunks, as this will slow things down. + +[float] +=== Versioning + +Each bulk item can include the version value using the +`_version`/`version` field. It automatically follows the behavior of the +index / delete operation based on the `_version` mapping. It also +support the `version_type`/`_version_type` when using `external` +versioning. + +[float] +=== Routing + +Each bulk item can include the routing value using the +`_routing`/`routing` field. It automatically follows the behavior of the +index / delete operation based on the `_routing` mapping. + +[float] +=== Percolator + +Each bulk index action can include a percolate value using the +`_percolate`/`percolate` field. + +[float] +=== Parent + +Each bulk item can include the parent value using the `_parent`/`parent` +field. It automatically follows the behavior of the index / delete +operation based on the `_parent` / `_routing` mapping. + +[float] +=== Timestamp + +Each bulk item can include the timestamp value using the +`_timestamp`/`timestamp` field. 
It automatically follows the behavior of +the index operation based on the `_timestamp` mapping. + +[float] +=== TTL + +Each bulk item can include the ttl value using the `_ttl`/`ttl` field. +It automatically follows the behavior of the index operation based on +the `_ttl` mapping. + +[float] +=== Write Consistency + +When making bulk calls, you can require a minimum number of active +shards in the partition through the `consistency` parameter. The values +allowed are `one`, `quorum`, and `all`. It defaults to the node level +setting of `action.write_consistency`, which in turn defaults to +`quorum`. + +For example, in a N shards with 2 replicas index, there will have to be +at least 2 active shards within the relevant partition (`quorum`) for +the operation to succeed. In a N shards with 1 replica scenario, there +will need to be a single shard active (in this case, `one` and `quorum` +is the same). + +[float] +=== Refresh + +The `refresh` parameter can be set to `true` in order to refresh the +relevant shards immediately after the bulk operation has occurred and +make it searchable, instead of waiting for the normal refresh interval +to expire. Setting it to `true` can trigger additional load, and may +slow down indexing. + +[float] +=== Update + +When using `update` action `_retry_on_conflict` can be used as field in +the action itself (not in the extra payload line), to specify how many +times an update should be retried in the case of a version conflict. + +The `update` action payload, supports the following options: `doc` +(partial document), `upsert`, `doc_as_upsert`, `script`, `params` (for +script), `lang` (for script). See update documentation for details on +the options. Curl example with update actions: + +[source,js] +-------------------------------------------------- +{ "update" : {"_id" : "1", "_type" : "type1", "_index" : "index1", "_retry_on_conflict" : 3} } +{ "doc" : {"field" : "value"} } +{ "update" : { "_id" : "0", "_type" : "type1", "_index" : "index1", "_retry_on_conflict" : 3} } +{ "script" : "ctx._source.counter += param1", "lang" : "js", "params" : {"param1" : 1}, "upsert" : {"counter" : 1}} +{ "update" : {"_id" : "2", "_type" : "type1", "_index" : "index1", "_retry_on_conflict" : 3} } +{ "doc" : {"field" : "value"}, "doc_as_upsert" : true } +-------------------------------------------------- diff --git a/docs/reference/docs/delete-by-query.asciidoc b/docs/reference/docs/delete-by-query.asciidoc new file mode 100644 index 00000000000..1df88f53e81 --- /dev/null +++ b/docs/reference/docs/delete-by-query.asciidoc @@ -0,0 +1,139 @@ +[[docs-delete-by-query]] +== Delete By Query API + +The delete by query API allows to delete documents from one or more +indices and one or more types based on a query. The query can either be +provided using a simple query string as a parameter, or using the +<> defined within the request +body. Here is an example: + +[source,js] +-------------------------------------------------- +$ curl -XDELETE 'http://localhost:9200/twitter/tweet/_query?q=user:kimchy' + +$ curl -XDELETE 'http://localhost:9200/twitter/tweet/_query' -d '{ + "term" : { "user" : "kimchy" } +} +' +-------------------------------------------------- + +Both above examples end up doing the same thing, which is delete all +tweets from the twitter index for a certain user. 
The result of the +commands is: + +[source,js] +-------------------------------------------------- +{ + "ok" : true, + "_indices" : { + "twitter" : { + "_shards" : { + "total" : 5, + "successful" : 5, + "failed" : 0 + } + } + } +} +-------------------------------------------------- + +Note, delete by query bypasses versioning support. Also, it is not +recommended to delete "large chunks of the data in an index", many +times, it's better to simply reindex into a new index. + +[float] +=== Multiple Indices and Types + +The delete by query API can be applied to multiple types within an +index, and across multiple indices. For example, we can delete all +documents across all types within the twitter index: + +[source,js] +-------------------------------------------------- +$ curl -XDELETE 'http://localhost:9200/twitter/_query?q=user:kimchy' +-------------------------------------------------- + +We can also delete within specific types: + +[source,js] +-------------------------------------------------- +$ curl -XDELETE 'http://localhost:9200/twitter/tweet,user/_query?q=user:kimchy' +-------------------------------------------------- + +We can also delete all tweets with a certain tag across several indices +(for example, when each user has his own index): + +[source,js] +-------------------------------------------------- +$ curl -XDELETE 'http://localhost:9200/kimchy,elasticsearch/_query?q=tag:wow' +-------------------------------------------------- + +Or even delete across all indices: + +[source,js] +-------------------------------------------------- +$ curl -XDELETE 'http://localhost:9200/_all/_query?q=tag:wow' +-------------------------------------------------- + +[float] +=== Request Parameters + +When executing a delete by query using the query parameter `q`, the +query passed is a query string using Lucene query parser. There are +additional parameters that can be passed: + +[cols="<,<",options="header",] +|======================================================================= +|Name |Description +|df |The default field to use when no field prefix is defined within the +query. + +|analyzer |The analyzer name to be used when analyzing the query string. + +|default_operator |The default operator to be used, can be `AND` or +`OR`. Defaults to `OR`. +|======================================================================= + +[float] +=== Request Body + +The delete by query can use the <> within its body in order to express the query that should be +executed and delete all documents. The body content can also be passed +as a REST parameter named `source`. + +[float] +=== Distributed + +The delete by query API is broadcast across all primary shards, and from +there, replicated across all shards replicas. + +[float] +=== Routing + +The routing value (a comma separated list of the routing values) can be +specified to control which shards the delete by query request will be +executed on. + +[float] +=== Replication Type + +The replication of the operation can be done in an asynchronous manner +to the replicas (the operation will return once it has be executed on +the primary shard). The `replication` parameter can be set to `async` +(defaults to `sync`) in order to enable it. + +[float] +=== Write Consistency + +Control if the operation will be allowed to execute based on the number +of active shards within that partition (replication group). The values +allowed are `one`, `quorum`, and `all`. 
The parameter to set it is +`consistency`, and it defaults to the node level setting of +`action.write_consistency` which in turn defaults to `quorum`. + +For example, in a N shards with 2 replicas index, there will have to be +at least 2 active shards within the relevant partition (`quorum`) for +the operation to succeed. In a N shards with 1 replica scenario, there +will need to be a single shard active (in this case, `one` and `quorum` +is the same). diff --git a/docs/reference/docs/delete.asciidoc b/docs/reference/docs/delete.asciidoc new file mode 100644 index 00000000000..86ed4844f13 --- /dev/null +++ b/docs/reference/docs/delete.asciidoc @@ -0,0 +1,115 @@ +[[docs-delete]] +== Delete API + +The delete API allows to delete a typed JSON document from a specific +index based on its id. The following example deletes the JSON document +from an index called twitter, under a type called tweet, with id valued +1: + +[source,js] +-------------------------------------------------- +$ curl -XDELETE 'http://localhost:9200/twitter/tweet/1' +-------------------------------------------------- + +The result of the above delete operation is: + +[source,js] +-------------------------------------------------- +{ + "ok" : true, + "_index" : "twitter", + "_type" : "tweet", + "_id" : "1", + "found" : true +} +-------------------------------------------------- + +[float] +=== Versioning + +Each document indexed is versioned. When deleting a document, the +`version` can be specified to make sure the relevant document we are +trying to delete is actually being deleted and it has not changed in the +meantime. + +[float] +=== Routing + +When indexing using the ability to control the routing, in order to +delete a document, the routing value should also be provided. For +example: + +[source,js] +-------------------------------------------------- +$ curl -XDELETE 'http://localhost:9200/twitter/tweet/1?routing=kimchy' +-------------------------------------------------- + +The above will delete a tweet with id 1, but will be routed based on the +user. Note, issuing a delete without the correct routing, will cause the +document to not be deleted. + +Many times, the routing value is not known when deleting a document. For +those cases, when specifying the `_routing` mapping as `required`, and +no routing value is specified, the delete will be broadcasted +automatically to all shards. + +[float] +=== Parent + +The `parent` parameter can be set, which will basically be the same as +setting the routing parameter. + +Note that deleting a parent document does not automatically delete its +children. One way of deleting all child documents given a parent's id is +to perform a <> on the child +index with the automatically generated (and indexed) +field _parent, which is in the format parent_type#parent_id. + +[float] +=== Automatic index creation + +The delete operation automatically creates an index if it has not been +created before (check out the <> +for manually creating an index), and also automatically creates a +dynamic type mapping for the specific type if it has not been created +before (check out the <> +API for manually creating type mapping). + +[float] +=== Distributed + +The delete operation gets hashed into a specific shard id. It then gets +redirected into the primary shard within that id group, and replicated +(if needed) to shard replicas within that id group. 
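+
+As a rough illustration of how that shard id is derived (the concrete hash
+function is an implementation detail and is not specified here), the routing
+value, which defaults to the document id, is reduced to a shard number:
+
+-------------------------
+shard = hash(routing) % number_of_primary_shards
+-------------------------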
+ +[float] +=== Replication Type + +The replication of the operation can be done in an asynchronous manner +to the replicas (the operation will return once it has be executed on +the primary shard). The `replication` parameter can be set to `async` +(defaults to `sync`) in order to enable it. + +[float] +=== Write Consistency + +Control if the operation will be allowed to execute based on the number +of active shards within that partition (replication group). The values +allowed are `one`, `quorum`, and `all`. The parameter to set it is +`consistency`, and it defaults to the node level setting of +`action.write_consistency` which in turn defaults to `quorum`. + +For example, in a N shards with 2 replicas index, there will have to be +at least 2 active shards within the relevant partition (`quorum`) for +the operation to succeed. In a N shards with 1 replica scenario, there +will need to be a single shard active (in this case, `one` and `quorum` +is the same). + +[float] +=== Refresh + +The `refresh` parameter can be set to `true` in order to refresh the +relevant shard after the delete operation has occurred and make it +searchable. Setting it to `true` should be done after careful thought +and verification that this does not cause a heavy load on the system +(and slows down indexing). diff --git a/docs/reference/docs/get.asciidoc b/docs/reference/docs/get.asciidoc new file mode 100644 index 00000000000..7b7adb5f0c4 --- /dev/null +++ b/docs/reference/docs/get.asciidoc @@ -0,0 +1,158 @@ +[[docs-get]] +== Get API + +The get API allows to get a typed JSON document from the index based on +its id. The following example gets a JSON document from an index called +twitter, under a type called tweet, with id valued 1: + +[source,js] +-------------------------------------------------- +curl -XGET 'http://localhost:9200/twitter/tweet/1' +-------------------------------------------------- + +The result of the above get operation is: + +[source,js] +-------------------------------------------------- +{ + "_index" : "twitter", + "_type" : "tweet", + "_id" : "1", + "_source" : { + "user" : "kimchy", + "postDate" : "2009-11-15T14:12:12", + "message" : "trying out Elastic Search" + } +} +-------------------------------------------------- + +The above result includes the `_index`, `_type`, and `_id` of the +document we wish to retrieve, including the actual source of the +document that was indexed. + +The API also allows to check for the existance of a document using +`HEAD`, for example: + +[source,js] +-------------------------------------------------- +curl -XHEAD 'http://localhost:9200/twitter/tweet/1' +-------------------------------------------------- + +[float] +=== Realtime + +By default, the get API is realtime, and is not affected by the refresh +rate of the index (when data will become visible for search). + +In order to disable realtime GET, one can either set `realtime` +parameter to `false`, or globally default it to by setting the +`action.get.realtime` to `false` in the node configuration. + +When getting a document, one can specify `fields` to fetch from it. They +will, when possible, be fetched as stored fields (fields mapped as +stored in the mapping). When using realtime GET, there is no notion of +stored fields (at least for a period of time, basically, until the next +flush), so they will be extracted from the source itself (note, even if +source is not enabled). 
It is a good practice to assume that the fields +will be loaded from source when using realtime GET, even if the fields +are stored. + +[float] +=== Optional Type + +The get API allows for `_type` to be optional. Set it to `_all` in order +to fetch the first document matching the id across all types. + +[float] +=== Fields + +The get operation allows specifying a set of fields that will be +returned (by default, the `_source` field) by passing the `fields` +parameter. For example: + +[source,js] +-------------------------------------------------- +curl -XGET 'http://localhost:9200/twitter/tweet/1?fields=title,content' +-------------------------------------------------- + +The returned fields will either be loaded if they are stored, or fetched +from the `_source` (parsed and extracted). It also supports sub objects +extraction from _source, like `obj1.obj2`. + +[float] +=== Getting the _source directly + +Since version `0.90.1` there is a new rest end point that allows the +source to be returned directly without any additional content around it. +The get endpoint has the following structure: +`{index}/{type}/{id}/_source`. Curl example: + +[source,js] +-------------------------------------------------- +curl -XGET 'http://localhost:9200/twitter/tweet/1/_source' +-------------------------------------------------- + +Note, there is also a HEAD variant for the new _source endpoint. Curl +example: + +[source,js] +-------------------------------------------------- +curl -XHEAD 'http://localhost:9200/twitter/tweet/1/_source' +-------------------------------------------------- + +[float] +=== Routing + +When indexing using the ability to control the routing, in order to get +a document, the routing value should also be provided. For example: + +[source,js] +-------------------------------------------------- +curl -XGET 'http://localhost:9200/twitter/tweet/1?routing=kimchy' +-------------------------------------------------- + +The above will get a tweet with id 1, but will be routed based on the +user. Note, issuing a get without the correct routing, will cause the +document not to be fetched. + +[float] +=== Preference + +Controls a `preference` of which shard replicas to execute the get +request on. By default, the operation is randomized between the shard +replicas. + +The `preference` can be set to: + +`_primary`:: + The operation will go and be executed only on the primary + shards. + +`_local`:: + The operation will prefer to be executed on a local + allocated shard if possible. + +Custom (string) value:: + A custom value will be used to guarantee that + the same shards will be used for the same custom value. This can help + with "jumping values" when hitting different shards in different refresh + states. A sample value can be something like the web session id, or the + user name. + +[float] +=== Refresh + +The `refresh` parameter can be set to `true` in order to refresh the +relevant shard before the get operation and make it searchable. Setting +it to `true` should be done after careful thought and verification that +this does not cause a heavy load on the system (and slows down +indexing). + +[float] +=== Distributed + +The get operation gets hashed into a specific shard id. It then gets +redirected to one of the replicas within that shard id and returns the +result. The replicas are the primary shard and its replicas within that +shard id group. This means that the more replicas we will have, the +better GET scaling we will have. 
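+
+For instance, combining this with the `preference` parameter described above
+keeps repeated reads of the same document on the same set of replicas. The
+session id used as the preference value below is hypothetical:
+
+[source,js]
+--------------------------------------------------
+curl -XGET 'http://localhost:9200/twitter/tweet/1?preference=xyzabc123'
+--------------------------------------------------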
diff --git a/docs/reference/docs/index_.asciidoc b/docs/reference/docs/index_.asciidoc new file mode 100644 index 00000000000..19b21429681 --- /dev/null +++ b/docs/reference/docs/index_.asciidoc @@ -0,0 +1,375 @@ +[[docs-index_]] +== Index API + +The index API adds or updates a typed JSON document in a specific index, +making it searchable. The following example inserts the JSON document +into the "twitter" index, under a type called "tweet" with an id of 1: + +[source,js] +-------------------------------------------------- +$ curl -XPUT 'http://localhost:9200/twitter/tweet/1' -d '{ + "user" : "kimchy", + "post_date" : "2009-11-15T14:12:12", + "message" : "trying out Elastic Search" +}' +-------------------------------------------------- + +The result of the above index operation is: + +[source,js] +-------------------------------------------------- +{ + "ok" : true, + "_index" : "twitter", + "_type" : "tweet", + "_id" : "1", + "_version" : 1 +} +-------------------------------------------------- + +[float] +=== Automatic Index Creation + +The index operation automatically creates an index if it has not been +created before (check out the +<> for manually +creating an index), and also automatically creates a +dynamic type mapping for the specific type if one has not yet been +created (check out the <> +API for manually creating a type mapping). + +The mapping itself is very flexible and is schema-free. New fields and +objects will automatically be added to the mapping definition of the +type specified. Check out the <> +section for more information on mapping definitions. + +Though explained on the <> section, +it's important to note that the format of the JSON document can also +include the type (very handy when using JSON mappers), for example: + +[source,js] +-------------------------------------------------- +$ curl -XPUT 'http://localhost:9200/twitter/tweet/1' -d '{ + "tweet" : { + "user" : "kimchy", + "post_date" : "2009-11-15T14:12:12", + "message" : "trying out Elastic Search" + } +}' +-------------------------------------------------- + +Automatic index creation can be disabled by setting +`action.auto_create_index` to `false` in the config file of all nodes. +Automatic mapping creation can be disabled by setting +`index.mapper.dynamic` to `false` in the config files of all nodes (or +on the specific index settings). + +Automatic index creation can include a pattern based white/black list, +for example, set `action.auto_create_index` to `+aaa*,-bbb*,+ccc*,-*` (+ +meaning allowed, and - meaning disallowed). Note, this feature is +available since 0.20. + +[float] +=== Versioning + +Each indexed document is given a version number. The associated +`version` number is returned as part of the response to the index API +request. The index API optionally allows for +http://en.wikipedia.org/wiki/Optimistic_concurrency_control[optimistic +concurrency control] when the `version` parameter is specified. This +will control the version of the document the operation is intended to be +executed against. A good example of a use case for versioning is +performing a transactional read-then-update. Specifying a `version` from +the document initially read ensures no changes have happened in the +meantime (when reading in order to update, it is recommended to set +`preference` to `_primary`). For example: + +[source,js] +-------------------------------------------------- +curl -XPUT 'localhost:9200/twitter/tweet/1?version=2' -d '{ + "message" : "elasticsearch now has versioning support, double cool!" 
+}' +-------------------------------------------------- + +*NOTE:* versioning is completely real time, and is not affected by the +near real time aspects of search operations. If no version is provided, +then the operation is executed without any version checks. + +By default, internal versioning is used that starts at 1 and increments +with each update. Optionally, the version number can be supplemented +with an external value (for example, if maintained in a database). To +enable this functionality, `version_type` should be set to `external`. +The value provided must be a numeric, long value greater than 0, and +less than around 9.2e+18. When using the external version type, instead +of checking for a matching version number, the system checks to see if +the version number passed to the index request is greater than the +version of the currently stored document. If true, the document will be +indexed and the new version number used. If the value provided is less +than or equal to the stored document's version number, a version +conflict will occur and the index operation will fail. + +A nice side effect is that there is no need to maintain strict ordering +of async indexing operations executed as a result of changes to a source +database, as long as version numbers from the source database are used. +Even the simple case of updating the elasticsearch index using data from +a database is simplified if external versioning is used, as only the +latest version will be used if the index operations are out of order for +whatever reason. + +[float] +=== Operation Type + +The index operation also accepts an `op_type` that can be used to force +a `create` operation, allowing for "put-if-absent" behavior. When +`create` is used, the index operation will fail if a document by that id +already exists in the index. + +Here is an example of using the `op_type` parameter: + +[source,js] +-------------------------------------------------- +$ curl -XPUT 'http://localhost:9200/twitter/tweet/1?op_type=create' -d '{ + "user" : "kimchy", + "post_date" : "2009-11-15T14:12:12", + "message" : "trying out Elastic Search" +}' +-------------------------------------------------- + +Another option to specify `create` is to use the following uri: + +[source,js] +-------------------------------------------------- +$ curl -XPUT 'http://localhost:9200/twitter/tweet/1/_create' -d '{ + "user" : "kimchy", + "post_date" : "2009-11-15T14:12:12", + "message" : "trying out Elastic Search" +}' +-------------------------------------------------- + +[float] +=== Automatic ID Generation + +The index operation can be executed without specifying the id. In such a +case, an id will be generated automatically. In addition, the `op_type` +will automatically be set to `create`. 
Here is an example (note the +*POST* used instead of *PUT*): + +[source,js] +-------------------------------------------------- +$ curl -XPOST 'http://localhost:9200/twitter/tweet/' -d '{ + "user" : "kimchy", + "post_date" : "2009-11-15T14:12:12", + "message" : "trying out Elastic Search" +}' +-------------------------------------------------- + +The result of the above index operation is: + +[source,js] +-------------------------------------------------- +{ + "ok" : true, + "_index" : "twitter", + "_type" : "tweet", + "_id" : "6a8ca01c-7896-48e9-81cc-9f70661fcb32", + "_version" : 1 +} +-------------------------------------------------- + +[float] +=== Routing + +By default, shard placement — or `routing` — is controlled by using a +hash of the document's id value. For more explicit control, the value +fed into the hash function used by the router can be directly specified +on a per-operation basis using the `routing` parameter. For example: + +[source,js] +-------------------------------------------------- +$ curl -XPOST 'http://localhost:9200/twitter/tweet?routing=kimchy' -d '{ + "user" : "kimchy", + "post_date" : "2009-11-15T14:12:12", + "message" : "trying out Elastic Search" +}' +-------------------------------------------------- + +In the example above, the "tweet" document is routed to a shard based on +the `routing` parameter provided: "kimchy". + +When setting up explicit mapping, the `_routing` field can be optionally +used to direct the index operation to extract the routing value from the +document itself. This does come at the (very minimal) cost of an +additional document parsing pass. If the `_routing` mapping is defined, +and set to be `required`, the index operation will fail if no routing +value is provided or extracted. + +[float] +=== Parents & Children + +A child document can be indexed by specifying it's parent when indexing. +For example: + +[source,js] +-------------------------------------------------- +$ curl -XPUT localhost:9200/blogs/blog_tag/1122?parent=1111 -d '{ + "tag" : "something" +}' +-------------------------------------------------- + +When indexing a child document, the routing value is automatically set +to be the same as it's parent, unless the routing value is explicitly +specified using the `routing` parameter. + +[float] +=== Timestamp + +A document can be indexed with a `timestamp` associated with it. The +`timestamp` value of a document can be set using the `timestamp` +parameter. For example: + +[source,js] +-------------------------------------------------- +$ curl -XPUT localhost:9200/twitter/tweet/1?timestamp=2009-11-15T14%3A12%3A12 -d '{ + "user" : "kimchy", + "message" : "trying out Elastic Search", +}' +-------------------------------------------------- + +If the `timestamp` value is not provided externally or in the `_source`, +the `timestamp` will be automatically set to the date the document was +processed by the indexing chain. More information can be found on the +<>. + +[float] +=== TTL + +A document can be indexed with a `ttl` (time to live) associated with +it. Expired documents will be expunged automatically. The expiration +date that will be set for a document with a provided `ttl` is relative +to the `timestamp` of the document, meaning it can be based on the time +of indexing or on any time provided. 
The provided `ttl` must be strictly +positive and can be a number (in milliseconds) or any valid time value +as shown in the following examples: + +[source,js] +-------------------------------------------------- +curl -XPUT 'http://localhost:9200/twitter/tweet/1?ttl=86400000' -d '{ + "user": "kimchy", + "message": "Trying out elasticsearch, so far so good?" +}' +-------------------------------------------------- + +[source,js] +-------------------------------------------------- +curl -XPUT 'http://localhost:9200/twitter/tweet/1?ttl=1d' -d '{ + "user": "kimchy", + "message": "Trying out elasticsearch, so far so good?" +}' +-------------------------------------------------- + +[source,js] +-------------------------------------------------- +curl -XPUT 'http://localhost:9200/twitter/tweet/1' -d '{ + "_ttl": "1d", + "user": "kimchy", + "message": "Trying out elasticsearch, so far so good?" +}' +-------------------------------------------------- + +More information can be found on the +<>. + +[float] +=== Percolate + +<> can be performed +at index time by passing the `percolate` parameter. Setting it to `*` +will cause all percolation queries registered against the index to be +checked against the provided document, for example: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/test/type1/1?percolate=* -d '{ + "field1" : "value1" +}' +-------------------------------------------------- + +To filter out which percolator queries will be executed, pass the query +string syntax to the `percolate` parameter: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/test/type1/1?percolate=color:green -d '{ + "field1" : "value1", + "field2" : "value2" +}' +-------------------------------------------------- + +*NOTE:* In a distributed cluster, percolation during the index operation +is performed on the primary shard, as soon as the index operation +completes. The operation executes on the primary while the replicas are +updating, concurrently. Percolation during the index operation somewhat +cuts down on parsing overhead, as the parse tree for the document is +simply re-used for percolation. + +[float] +=== Distributed + +The index operation is directed to the primary shard based on its route +(see the Routing section above) and performed on the actual node +containing this shard. After the primary shard completes the operation, +if needed, the update is distributed to applicable replicas. + +[float] +=== Write Consistency + +To prevent writes from taking place on the "wrong" side of a network +partition, by default, index operations only succeed if a quorum +(>replicas/2+1) of active shards are available. This default can be +overridden on a node-by-node basis using the `action.write_consistency` +setting. To alter this behavior per-operation, the `consistency` request +parameter can be used. + +Valid write consistency values are `one`, `quorum`, and `all`. + +[float] +=== Asynchronous Replication + +By default, the index operation only returns after all shards within the +replication group have indexed the document (sync replication). To +enable asynchronous replication, causing the replication process to take +place in the background, set the `replication` parameter to `async`. +When asynchronous replication is used, the index operation will return +as soon as the operation succeeds on the primary shard. 
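+
+For example, a sketch reusing the document from earlier on this page:
+
+[source,js]
+--------------------------------------------------
+$ curl -XPUT 'http://localhost:9200/twitter/tweet/1?replication=async' -d '{
+    "user" : "kimchy",
+    "post_date" : "2009-11-15T14:12:12",
+    "message" : "trying out Elastic Search"
+}'
+--------------------------------------------------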
+ +[float] +=== Refresh + +To refresh the index immediately after the operation occurs, so that the +document appears in search results immediately, the `refresh` parameter +can be set to `true`. Setting this option to `true` should *ONLY* be +done after careful thought and verification that it does not lead to +poor performance, both from an indexing and a search standpoint. Note, +getting a document using the get API is completely realtime. + +[float] +=== Timeout + +The primary shard assigned to perform the index operation might not be +available when the index operation is executed. Some reasons for this +might be that the primary shard is currently recovering from a gateway +or undergoing relocation. By default, the index operation will wait on +the primary shard to become available for up to 1 minute before failing +and responding with an error. The `timeout` parameter can be used to +explicitly specify how long it waits. Here is an example of setting it +to 5 minutes: + +[source,js] +-------------------------------------------------- +$ curl -XPUT 'http://localhost:9200/twitter/tweet/1?timeout=5m' -d '{ + "user" : "kimchy", + "post_date" : "2009-11-15T14:12:12", + "message" : "trying out Elastic Search" +}' +-------------------------------------------------- diff --git a/docs/reference/docs/multi-get.asciidoc b/docs/reference/docs/multi-get.asciidoc new file mode 100644 index 00000000000..d9995f039ba --- /dev/null +++ b/docs/reference/docs/multi-get.asciidoc @@ -0,0 +1,97 @@ +[[docs-multi-get]] +== Multi Get API + +Multi GET API allows to get multiple documents based on an index, type +(optional) and id (and possibly routing). The response includes a `docs` +array with all the fetched documents, each element similar in structure +to a document provided by the <> +API. Here is an example: + +[source,js] +-------------------------------------------------- +curl 'localhost:9200/_mget' -d '{ + "docs" : [ + { + "_index" : "test", + "_type" : "type", + "_id" : "1" + }, + { + "_index" : "test", + "_type" : "type", + "_id" : "2" + } + ] +}' +-------------------------------------------------- + +The `mget` endpoint can also be used against an index (in which case it +is not required in the body): + +[source,js] +-------------------------------------------------- +curl 'localhost:9200/test/_mget' -d '{ + "docs" : [ + { + "_type" : "type", + "_id" : "1" + }, + { + "_type" : "type", + "_id" : "2" + } + ] +}' +-------------------------------------------------- + +And type: + +[source,js] +-------------------------------------------------- +curl 'localhost:9200/test/type/_mget' -d '{ + "docs" : [ + { + "_id" : "1" + }, + { + "_id" : "2" + } + ] +}' +-------------------------------------------------- + +In which case, the `ids` element can directly be used to simplify the +request: + +[source,js] +-------------------------------------------------- +curl 'localhost:9200/test/type/_mget' -d '{ + "ids" : ["1", "2"] +}' +-------------------------------------------------- + +[float] +=== Fields + +Specific fields can be specified to be retrieved per document to get. 
+For example: + +[source,js] +-------------------------------------------------- +curl 'localhost:9200/_mget' -d '{ + "docs" : [ + { + "_index" : "test", + "_type" : "type", + "_id" : "1", + "fields" : ["field1", "field2"] + }, + { + "_index" : "test", + "_type" : "type", + "_id" : "2", + "fields" : ["field3", "field4"] + } + ] +}' +-------------------------------------------------- diff --git a/docs/reference/docs/update.asciidoc b/docs/reference/docs/update.asciidoc new file mode 100644 index 00000000000..67bc6be7e05 --- /dev/null +++ b/docs/reference/docs/update.asciidoc @@ -0,0 +1,179 @@ +[[docs-update]] +== Update API + +The update API allows to update a document based on a script provided. +The operation gets the document (collocated with the shard) from the +index, runs the script (with optional script language and parameters), +and index back the result (also allows to delete, or ignore the +operation). It uses versioning to make sure no updates have happened +during the "get" and "reindex". (available from `0.19` onwards). + +Note, this operation still means full reindex of the document, it just +removes some network roundtrips and reduces chances of version conflicts +between the get and the index. The `_source` field need to be enabled +for this feature to work. + +For example, lets index a simple doc: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/test/type1/1 -d '{ + "counter" : 1, + "tags" : ["red"] +}' +-------------------------------------------------- + +Now, we can execute a script that would increment the counter: + +[source,js] +-------------------------------------------------- +curl -XPOST 'localhost:9200/test/type1/1/_update' -d '{ + "script" : "ctx._source.counter += count", + "params" : { + "count" : 4 + } +}' +-------------------------------------------------- + +We can also add a tag to the list of tags (note, if the tag exists, it +will still add it, since its a list): + +[source,js] +-------------------------------------------------- +curl -XPOST 'localhost:9200/test/type1/1/_update' -d '{ + "script" : "ctx._source.tags += tag", + "params" : { + "tag" : "blue" + } +}' +-------------------------------------------------- + +We can also add a new field to the document: + +[source,js] +-------------------------------------------------- +curl -XPOST 'localhost:9200/test/type1/1/_update' -d '{ + "script" : "ctx._source.text = \"some text\"" +}' +-------------------------------------------------- + +We can also remove a field from the document: + +[source,js] +-------------------------------------------------- +curl -XPOST 'localhost:9200/test/type1/1/_update' -d '{ + "script" : "ctx._source.remove(\"text\")" +}' +-------------------------------------------------- + +And, we can delete the doc if the tags contain blue, or ignore (noop): + +[source,js] +-------------------------------------------------- +curl -XPOST 'localhost:9200/test/type1/1/_update' -d '{ + "script" : "ctx._source.tags.contains(tag) ? ctx.op = \"delete\" : ctx.op = \"none\"", + "params" : { + "tag" : "blue" + } +}' +-------------------------------------------------- + +*Note*: Be aware of MVEL and handling of ternary operators and +assignments. Assignment operations have lower precedence than the +ternary operator. Compare the following statements: + +[source,js] +-------------------------------------------------- +// Will NOT update the tags array +ctx._source.tags.contains(tag) ? 
ctx.op = \"none\" : ctx._source.tags += tag +// Will update +ctx._source.tags.contains(tag) ? (ctx.op = \"none\") : ctx._source.tags += tag +// Also works +if (ctx._source.tags.contains(tag)) { ctx.op = \"none\" } else { ctx._source.tags += tag } +-------------------------------------------------- + +The update API also support passing a partial document (since 0.20), +which will be merged into the existing document (simple recursive merge, +inner merging of objects, replacing core "keys/values" and arrays). For +example: + +[source,js] +-------------------------------------------------- +curl -XPOST 'localhost:9200/test/type1/1/_update' -d '{ + "doc" : { + "name" : "new_name" + } +}' +-------------------------------------------------- + +If both `doc` and `script` is specified, then `doc` is ignored. Best is +to put your field pairs of the partial document in the script itself. + +There is also support for `upsert` (since 0.20). If the document does +not already exists, the content of the `upsert` element will be used to +index the fresh doc: + +[source,js] +-------------------------------------------------- +curl -XPOST 'localhost:9200/test/type1/1/_update' -d '{ + "script" : "ctx._source.counter += count", + "params" : { + "count" : 4 + }, + "upsert" : { + "counter" : 1 + } +}' +-------------------------------------------------- + +Last it also supports `doc_as_upsert` (since 0.90.2). So that the +provided document will be inserted if the document does not already +exist. This will reduce the amount of data that needs to be sent to +elasticsearch. + +[source,js] +-------------------------------------------------- +curl -XPOST 'localhost:9200/test/type1/1/_update' -d '{ + "doc" : { + "name" : "new_name" + }, + "doc_as_upsert" : true +}' +-------------------------------------------------- + +The update operation supports similar parameters as the index API, +including: + +[horizontal] +`routing`:: Sets the routing that will be used to route the + document to the relevant shard. + +`parent`:: Simply sets the routing. + +`timeout`:: Timeout waiting for a shard to become available. + +`replication`:: The replication type for the delete/index operation + (sync or async). + +`consistency`:: The write consistency of the index/delete operation. + +`percolate`:: Enables percolation and filters out which percolator + queries will be executed. + +`refresh`:: Refresh the index immediately after the operation occurs, + so that the updated document appears in search results + immediately. + +`fields`:: return the relevant fields from the document updated + (since 0.20). Support `_source` to return the full updated + source. + + +And also support `retry_on_conflict` which controls how many times to +retry if there is a version conflict between getting the document and +indexing / deleting it. Defaults to `0`. + +It also allows to update the `ttl` of a document using `ctx._ttl` and +timestamp using `ctx._timestamp`. Note that if the timestamp is not +updated and not extracted from the `_source` it will be set to the +update date. diff --git a/docs/reference/glossary.asciidoc b/docs/reference/glossary.asciidoc new file mode 100644 index 00000000000..6f7061f839a --- /dev/null +++ b/docs/reference/glossary.asciidoc @@ -0,0 +1,190 @@ +[glossary] +[[glossary]] += Glossary of terms + +[glossary] +[[glossary-analysis]] analysis :: + + Analysis is the process of converting <> to + <>. 
Depending on which analyzer is used, these phrases: + `FOO BAR`, `Foo-Bar`, `foo,bar` will probably all result in the + terms `foo` and `bar`. These terms are what is actually stored in + the index. + + + A full text query (not a <> query) for `FoO:bAR` will + also be analyzed to the terms `foo`,`bar` and will thus match the + terms stored in the index. + + + It is this process of analysis (both at index time and at search time) + that allows elasticsearch to perform full text queries. + + + Also see <> and <>. + +[[glossary-cluster]] cluster :: + + A cluster consists of one or more <> which share the + same cluster name. Each cluster has a single master node which is + chosen automatically by the cluster and which can be replaced if the + current master node fails. + +[[glossary-document]] document :: + + A document is a JSON document which is stored in elasticsearch. It is + like a row in a table in a relational database. Each document is + stored in an <> and has a <> and an + <>. + + + A document is a JSON object (also known in other languages as a hash / + hashmap / associative array) which contains zero or more + <>, or key-value pairs. + + + The original JSON document that is indexed will be stored in the + <>, which is returned by default when + getting or searching for a document. + +[[glossary-id]] id :: + + The ID of a <> identifies a document. The + `index/type/id` of a document must be unique. If no ID is provided, + then it will be auto-generated. (also see <>) + +[[glossary-field]] field :: + + A <> contains a list of fields, or key-value + pairs. The value can be a simple (scalar) value (eg a string, integer, + date), or a nested structure like an array or an object. A field is + similar to a column in a table in a relational database. + + + The <> for each field has a field _type_ (not to + be confused with document <>) which indicates the type + of data that can be stored in that field, eg `integer`, `string`, + `object`. The mapping also allows you to define (amongst other things) + how the value for a field should be analyzed. + +[[glossary-index]] index :: + + An index is like a _database_ in a relational database. It has a + <> which defines multiple <>. + + + An index is a logical namespace which maps to one or more + <> and can have zero or more + <>. + +[[glossary-mapping]] mapping :: + + A mapping is like a _schema definition_ in a relational database. Each + <> has a mapping, which defines each <> + within the index, plus a number of index-wide settings. + + + A mapping can either be defined explicitly, or it will be generated + automatically when a document is indexed. + +[[glossary-node]] node :: + + A node is a running instance of elasticsearch which belongs to a + <>. Multiple nodes can be started on a single + server for testing purposes, but usually you should have one node per + server. + + + At startup, a node will use unicast (or multicast, if specified) to + discover an existing cluster with the same cluster name and will try + to join that cluster. + + [[glossary-primary-shard]] primary shard :: + + Each document is stored in a single primary <>. When + you index a document, it is indexed first on the primary shard, then + on all <> of the primary shard. + + + By default, an <> has 5 primary shards. You can + specify fewer or more primary shards to scale the number of + <> that your index can handle. + + + You cannot change the number of primary shards in an index, once the + index is created. 
+ + + See also <> + + [[glossary-replica-shard]] replica shard :: + + Each <> can have zero or more + replicas. A replica is a copy of the primary shard, and has two + purposes: + + + 1. increase failover: a replica shard can be promoted to a primary + shard if the primary fails + 2. increase performance: get and search requests can be handled by + primary or replica shards. + + + By default, each primary shard has one replica, but the number of + replicas can be changed dynamically on an existing index. A replica + shard will never be started on the same node as its primary shard. + +[[glossary-routing]] routing :: + + When you index a document, it is stored on a single + <>. That shard is chosen by hashing + the `routing` value. By default, the `routing` value is derived from + the ID of the document or, if the document has a specified parent + document, from the ID of the parent document (to ensure that child and + parent documents are stored on the same shard). + + + This value can be overridden by specifying a `routing` value at index + time, or a <> in the <>. + +[[glossary-shard]] shard :: + + A shard is a single Lucene instance. It is a low-level “worker” unit + which is managed automatically by elasticsearch. An index is a logical + namespace which points to <> and + <> shards. + + + Other than defining the number of primary and replica shards that an + index should have, you never need to refer to shards directly. + Instead, your code should deal only with an index. + + + Elasticsearch distributes shards amongst all <> in the + <>, and can move shards automatically from one + node to another in the case of node failure, or the addition of new + nodes. + + [[glossary-source_field]] source field :: + + By default, the JSON document that you index will be stored in the + `_source` field and will be returned by all get and search requests. + This allows you access to the original object directly from search + results, rather than requiring a second step to retrieve the object + from an ID. + + + Note: the exact JSON string that you indexed will be returned to you, + even if it contains invalid JSON. The contents of this field do not + indicate anything about how the data in the object has been indexed. + +[[glossary-term]] term :: + + A term is an exact value that is indexed in elasticsearch. The terms + `foo`, `Foo`, `FOO` are NOT equivalent. Terms (i.e. exact values) can + be searched for using _term_ queries. + + See also <> and <>. + +[[glossary-text]] text :: + + Text (or full text) is ordinary unstructured text, such as this + paragraph. By default, text will be <> into + <>, which is what is actually stored in the index. + + + Text <> need to be analyzed at index time in order to + be searchable as full text, and keywords in full text queries must be + analyzed at search time to produce (and search for) the same terms + that were generated at index time. + + + See also <> and <>. + +[[glossary-type]] type :: + + A type is like a _table_ in a relational database. Each type has a + list of <> that can be specified for + <> of that type. The <> + defines how each field in the document is analyzed. + diff --git a/docs/reference/index-modules.asciidoc b/docs/reference/index-modules.asciidoc new file mode 100644 index 00000000000..037512c51b0 --- /dev/null +++ b/docs/reference/index-modules.asciidoc @@ -0,0 +1,73 @@ +[[index-modules]] += Index Modules + +[partintro] +-- +Index Modules are modules created per index and control all aspects +related to an index. 
Since those modules lifecycle are tied to an index, +all the relevant modules settings can be provided when creating an index +(and it is actually the recommended way to configure an index). + +[float] +== Index Settings + +There are specific index level settings that are not associated with any +specific module. These include: + +`index.compound_format`:: + + Should the compound file format be used (boolean + setting). If not set, controlled by the actually store used, this is + because the compound format was created to reduce the number of open + file handles when using file based storage. By default, it is set to + `false` for better performance (really applicable for file system based + index storage), but, requires adapting the max open file handles. + +`index.term_index_interval`:: + + Set the interval between indexed terms. + Large values cause less memory to be used by a reader / searcher, but + slow random-access to terms. Small values cause more memory to be used + by a reader / searcher, and speed random-access to terms. Defaults to + `128`. + +`index.term_index_divisor`:: + Subsamples which indexed terms are loaded + into RAM. This has the same effect as `index.term_index_interval` except + that setting must be done at indexing time while this setting can be set + per reader / searcher. When set to N, then one in every + N*termIndexInterval terms in the index is loaded into memory. By setting + this to a value > 1 you can reduce memory usage, at the expense of + higher latency when loading a TermInfo. The default value is 1. Set this + to -1 to skip loading the terms index entirely. + +`index.refresh_interval`:: + A time setting controlling how often the + refresh operation will be executed. Defaults to `1s`. Can be set to `-1` + in order to disable it. + +-- + +include::index-modules/analysis.asciidoc[] + +include::index-modules/allocation.asciidoc[] + +include::index-modules/slowlog.asciidoc[] + +include::index-modules/merge.asciidoc[] + +include::index-modules/store.asciidoc[] + +include::index-modules/mapper.asciidoc[] + +include::index-modules/translog.asciidoc[] + +include::index-modules/cache.asciidoc[] + +include::index-modules/fielddata.asciidoc[] + +include::index-modules/codec.asciidoc[] + +include::index-modules/similarity.asciidoc[] + + diff --git a/docs/reference/index-modules/allocation.asciidoc b/docs/reference/index-modules/allocation.asciidoc new file mode 100644 index 00000000000..52d7a483c51 --- /dev/null +++ b/docs/reference/index-modules/allocation.asciidoc @@ -0,0 +1,95 @@ +[[index-modules-allocation]] +== Index Shard Allocation + +[float] +=== Shard Allocation Filtering + +Allow to control allocation if indices on nodes based on include/exclude +filters. The filters can be set both on the index level and on the +cluster level. Lets start with an example of setting it on the cluster +level: + +Lets say we have 4 nodes, each has specific attribute called `tag` +associated with it (the name of the attribute can be any name). Each +node has a specific value associated with `tag`. Node 1 has a setting +`node.tag: value1`, Node 2 a setting of `node.tag: value2`, and so on. + +We can create an index that will only deploy on nodes that have `tag` +set to `value1` and `value2` by setting +`index.routing.allocation.include.tag` to `value1,value2`. 
For example:
+
+[source,js]
+--------------------------------------------------
+curl -XPUT localhost:9200/test/_settings -d '{
+    "index.routing.allocation.include.tag" : "value1,value2"
+}'
+--------------------------------------------------
+
+On the other hand, we can create an index that will be deployed on all
+nodes except for nodes with a `tag` of value `value3` by setting
+`index.routing.allocation.exclude.tag` to `value3`. For example:
+
+[source,js]
+--------------------------------------------------
+curl -XPUT localhost:9200/test/_settings -d '{
+    "index.routing.allocation.exclude.tag" : "value3"
+}'
+--------------------------------------------------
+
+From version 0.90, `index.routing.allocation.require.*` can be used to
+specify a number of rules, all of which MUST match in order for a shard
+to be allocated to a node. This is in contrast to `include`, which will
+include a node if ANY rule matches.
+
+The `include`, `exclude` and `require` values can have generic simple
+matching wildcards, for example, `value1*`. A special attribute name
+called `_ip` can be used to match on node ip values.
+
+Obviously a node can have several attributes associated with it, and
+both the attribute name and value are controlled in the setting. For
+example, here is a sample of several node configurations:
+
+[source,js]
+--------------------------------------------------
+node.group1: group1_value1
+node.group2: group2_value4
+--------------------------------------------------
+
+In the same manner, `include`, `exclude` and `require` can work against
+several attributes, for example:
+
+[source,js]
+--------------------------------------------------
+curl -XPUT localhost:9200/test/_settings -d '{
+    "index.routing.allocation.include.group1" : "xxx",
+    "index.routing.allocation.include.group2" : "yyy",
+    "index.routing.allocation.exclude.group3" : "zzz",
+    "index.routing.allocation.require.group4" : "aaa"
+}'
+--------------------------------------------------
+
+The provided settings can also be updated in real time using the update
+settings API, allowing you to "move" indices (shards) around in real time.
+
+Cluster-wide filtering can also be defined, and updated in real time
+using the cluster update settings API. This setting can come in handy
+for things like decommissioning nodes (even if the replica count is set
+to 0). Here is a sample of how to decommission a node based on its `_ip`
+address:
+
+[source,js]
+--------------------------------------------------
+curl -XPUT localhost:9200/_cluster/settings -d '{
+    "transient" : {
+        "cluster.routing.allocation.exclude._ip" : "10.0.0.1"
+    }
+}'
+--------------------------------------------------
+
+[float]
+=== Total Shards Per Node
+
+The `index.routing.allocation.total_shards_per_node` setting allows you
+to control how many shards of an index, in total, will be allocated per
+node. It can be dynamically set on a live index using the update index
+settings API.
diff --git a/docs/reference/index-modules/analysis.asciidoc b/docs/reference/index-modules/analysis.asciidoc
new file mode 100644
index 00000000000..1cf33e84db9
--- /dev/null
+++ b/docs/reference/index-modules/analysis.asciidoc
@@ -0,0 +1,18 @@
+[[index-modules-analysis]]
+== Analysis
+
+The index analysis module acts as a configurable registry of Analyzers
+that can be used both to break down indexed (analyzed) fields when a
+document is indexed and to process query strings. It maps to the Lucene
+`Analyzer`.
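+
+For example, a custom analyzer can be registered under a logical name in
+the index settings when an index is created (a minimal sketch; the name
+`my_analyzer` and its choice of tokenizer and filter are only
+illustrative):
+
+[source,js]
+--------------------------------------------------
+curl -XPUT 'http://localhost:9200/test/' -d '{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "my_analyzer" : {
+                    "type" : "custom",
+                    "tokenizer" : "standard",
+                    "filter" : ["lowercase"]
+                }
+            }
+        }
+    }
+}'
+--------------------------------------------------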
+ +Analyzers are (generally) composed of a single `Tokenizer` and zero or +more `TokenFilters`. A set of `CharFilters` can be associated with an +analyzer to process the characters prior to other analysis steps. The +analysis module allows one to register `TokenFilters`, `Tokenizers` and +`Analyzers` under logical names that can then be referenced either in +mapping definitions or in certain APIs. The Analysis module +automatically registers (*if not explicitly defined*) built in +analyzers, token filters, and tokenizers. + +See <> for configuration details. \ No newline at end of file diff --git a/docs/reference/index-modules/cache.asciidoc b/docs/reference/index-modules/cache.asciidoc new file mode 100644 index 00000000000..36f53d2b91b --- /dev/null +++ b/docs/reference/index-modules/cache.asciidoc @@ -0,0 +1,53 @@ +[[index-modules-cache]] +== Cache + +There are different caching inner modules associated with an index. They +include `filter` and others. + +[float] +=== Filter Cache + +The filter cache is responsible for caching the results of filters (used +in the query). The default implementation of a filter cache (and the one +recommended to use in almost all cases) is the `node` filter cache type. + +[float] +==== Node Filter Cache + +The `node` filter cache may be configured to use either a percentage of +the total memory allocated to the process or an specific amount of +memory. All shards present on a node share a single node cache (thats +why its called `node``). The cache implements an LRU eviction policy: +when a cache becomes full, the least recently used data is evicted to +make way for new data. + +The setting that allows one to control the memory size for the filter +cache is `indices.cache.filter.size`, which defaults to `20%`. *Note*, +this is *not* an index level setting but a node level setting (can be +configured in the node configuration). + +`indices.cache.filter.size` can accept either a percentage value, like +`30%`, or an exact value, like `512mb`. + +[float] +==== Index Filter Cache + +A filter cache that exists on the index level (on each node). Generally, +not recommended for use since its memory usage depends on which shards +are allocated on each node and its hard to predict it. The types are: +`resident`, `soft` and `weak`. + +All types support the following settings: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`index.cache.filter.max_size` |The max size (count, not byte size) of +the cache (per search segment in a shard). Defaults to not set (`-1`), +which is usually fine with `soft` cache and proper cacheable filters. + +|`index.cache.filter.expire` |A time based setting that expires filters +after a certain time of inactivity. Defaults to `-1`. For example, can +be set to `5m` for a 5 minute expiry. +|======================================================================= + diff --git a/docs/reference/index-modules/codec.asciidoc b/docs/reference/index-modules/codec.asciidoc new file mode 100644 index 00000000000..7673ee21418 --- /dev/null +++ b/docs/reference/index-modules/codec.asciidoc @@ -0,0 +1,168 @@ +[[index-modules-codec]] +== Codec module + +Codecs define how documents are written to disk and read from disk. The +postings format is the part of the codec that responsible for reading +and writing the term dictionary, postings lists and positions, payloads +and offsets stored in the postings list. 
+
+Configuring custom postings formats is an expert feature, and most likely
+the builtin postings formats will suit your needs, as described in
+the <>
+
+Codecs are available in Elasticsearch from version `0.90.0.beta1`.
+
+[float]
+=== Configuring a custom postings format
+
+Custom postings formats can be defined in the index settings in the
+`codec` part. The `codec` part can be configured when creating an index
+or updating index settings. An example of how to define a custom
+postings format:
+
+[source,js]
+--------------------------------------------------
+curl -XPUT 'http://localhost:9200/twitter/' -d '{
+    "settings" : {
+        "index" : {
+            "codec" : {
+                "postings_format" : {
+                    "my_format" : {
+                        "type" : "pulsing",
+                        "freq_cut_off" : "5"
+                    }
+                }
+            }
+        }
+    }
+}'
+--------------------------------------------------
+
+Then, when defining your mapping, you can use the `my_format` name in
+the `postings_format` option, as the example below illustrates:
+
+[source,js]
+--------------------------------------------------
+{
+    "person" : {
+        "properties" : {
+            "second_person_id" : {"type" : "string", "postings_format" : "my_format"}
+        }
+    }
+}
+--------------------------------------------------
+
+[float]
+=== Available postings formats
+
+[float]
+==== Direct postings format
+
+Wraps the default postings format for on-disk storage, but then at read
+time loads and stores all terms & postings directly in RAM. This
+postings format makes no effort to compress the terms and posting lists
+and is therefore memory intensive, but because of this it gives a
+substantial increase in search performance. Because this holds all term
+bytes as a single byte[], you cannot have more than 2.1GB worth of terms
+in a single segment.
+
+This postings format offers the following parameters:
+
+`min_skip_count`::
+    The minimum number of terms with a shared prefix to
+    allow a skip pointer to be written. The default is *8*.
+
+`low_freq_cutoff`::
+    Terms with a lower document frequency use a
+    single array object representation for postings and positions. The
+    default is *32*.
+
+Type name: `direct`
+
+[float]
+==== Memory postings format
+
+A postings format that stores terms & postings (docs, positions,
+payloads) in RAM, using an FST. This postings format does write to disk,
+but loads everything into memory. The memory postings format has the
+following options:
+
+`pack_fst`::
+    A boolean option that defines whether the in-memory structure
+    should be packed once it is built. Packing reduces the size of the
+    data structure in memory but requires more memory during building.
+    Default is *false*.
+
+`acceptable_overhead_ratio`::
+    The compression ratio, specified as a
+    float, that is used to compress internal structures. Example ratios: `0`
+    (Compact, no memory overhead at all, but the returned implementation may
+    be slow), `0.5` (Fast, at most 50% memory overhead, always select a
+    reasonably fast implementation), `7` (Fastest, at most 700% memory
+    overhead, no compression). Default is `0.2`.
+
+Type name: `memory`
+
+[float]
+==== Bloom filter postings format
+
+The bloom filter postings format wraps a delegate postings format and on
+top of this creates a bloom filter that is written to disk. During
+opening this bloom filter is loaded into memory and used to offer
+"fast-fail" reads. This postings format is useful for low doc-frequency
+fields such as primary keys.
The bloom filter postings format has the +following options: + +`delegate`:: + The name of the configured postings format that the + bloom filter postings format will wrap. + +`fpp`:: + The desired false positive probability specified as a + floating point number between 0 and 1.0. The `fpp` can be configured for + multiple expected insertions. Example expression: *10k=0.01,1m=0.03*. If + number docs per index segment is larger than *1m* then use *0.03* as fpp + and if number of docs per segment is larger than *10k* use *0.01* as + fpp. The last fallback value is always *0.03*. This example expression + is also the default. + +Type name: `bloom` + +[float] +==== Pulsing postings format + +The pulsing implementation in-lines the posting lists for very low +frequent terms in the term dictionary. This is useful to improve lookup +performance for low-frequent terms. This postings format offers the +following parameters: + +`min_block_size`:: + The minimum block size the default Lucene term + dictionary uses to encode on-disk blocks. Defaults to *25*. + +`max_block_size`:: + The maximum block size the default Lucene term + dictionary uses to encode on-disk blocks. Defaults to *48*. + +`freq_cut_off`:: + The document frequency cut off where pulsing + in-lines posting lists into the term dictionary. Terms with a document + frequency less or equal to the cutoff will be in-lined. The default is + *1*. + +Type name: `pulsing` + +[float] +==== Default postings format + +The default postings format has the following options: + +`min_block_size`:: + The minimum block size the default Lucene term + dictionary uses to encode on-disk blocks. Defaults to *25*. + +`max_block_size`:: + The maximum block size the default Lucene term + dictionary uses to encode on-disk blocks. Defaults to *48*. + +Type name: `default` diff --git a/docs/reference/index-modules/fielddata.asciidoc b/docs/reference/index-modules/fielddata.asciidoc new file mode 100644 index 00000000000..7cc98180de3 --- /dev/null +++ b/docs/reference/index-modules/fielddata.asciidoc @@ -0,0 +1,142 @@ +[[index-modules-fielddata]] +== Field data + +The field data cache is used mainly when sorting on or faceting on a +field. It loads all the field values to memory in order to provide fast +document based access to those values. The field data cache can be +expensive to build for a field, so its recommended to have enough memory +to allocate it, and to keep it loaded. + +From version 0.90 onwards, the amount of memory used for the field +data cache can be controlled using `indices.fielddata.cache.size`. Note: +reloading the field data which does not fit into your cache will be expensive +and perform poorly. + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`indices.fielddata.cache.size` |The max size of the field data cache, +eg `30%` of node heap space, or an absolute value, eg `12GB`. Defaults +to unbounded. + +|`indices.fielddata.cache.expire` |A time based setting that expires +field data after a certain time of inactivity. Defaults to `-1`. For +example, can be set to `5m` for a 5 minute expiry. +|======================================================================= + +[float] +=== Filtering fielddata + +It is possible to control which field values are loaded into memory, +which is particularly useful for string fields. When specifying the +<> for a field, you +can also specify a fielddata filter. + +Fielddata filters can be changed using the +<> +API. 
After changing the filters, use the +<> API +to reload the fielddata using the new filters. + +[float] +==== Filtering by frequency: + +The frequency filter allows you to only load terms whose frequency falls +between a `min` and `max` value, which can be expressed an absolute +number or as a percentage (eg `0.01` is `1%`). Frequency is calculated +*per segment*. Percentages are based on the number of docs which have a +value for the field, as opposed to all docs in the segment. + +Small segments can be excluded completely by specifying the minimum +number of docs that the segment should contain with `min_segment_size`: + +[source,js] +-------------------------------------------------- +{ + tag: { + type: "string", + fielddata: { + filter: { + frequency: { + min: 0.001, + max: 0.1, + min_segment_size: 500 + } + } + } + } +} +-------------------------------------------------- + +[float] +==== Filtering by regex + +Terms can also be filtered by regular expression - only values which +match the regular expression are loaded. Note: the regular expression is +applied to each term in the field, not to the whole field value. For +instance, to only load hashtags from a tweet, we can use a regular +expression which matches terms beginning with `#`: + +[source,js] +-------------------------------------------------- +{ + tweet: { + type: "string", + analyzer: "whitespace" + fielddata: { + filter: { + regex: "^#.*" + } + } + } +} +-------------------------------------------------- + +[float] +==== Combining filters + +The `frequency` and `regex` filters can be combined: + +[source,js] +-------------------------------------------------- +{ + tweet: { + type: "string", + analyzer: "whitespace" + fielddata: { + filter: { + regex: "^#.*", + frequency: { + min: 0.001, + max: 0.1, + min_segment_size: 500 + } + } + } + } +} +-------------------------------------------------- + +[float] +=== Settings before v0.90 + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`index.cache.field.type` |The default type for the field data cache is +`resident` (because of the cost of rebuilding it). Other types include +`soft` + +|`index.cache.field.max_size` |The max size (count, not byte size) of +the cache (per search segment in a shard). Defaults to not set (`-1`). + +|`index.cache.field.expire` |A time based setting that expires filters +after a certain time of inactivity. Defaults to `-1`. For example, can +be set to `5m` for a 5 minute expiry. +|======================================================================= + +[float] +=== Monitoring field data + +You can monitor memory usage for field data using +<> diff --git a/docs/reference/index-modules/mapper.asciidoc b/docs/reference/index-modules/mapper.asciidoc new file mode 100644 index 00000000000..c1e1da052b5 --- /dev/null +++ b/docs/reference/index-modules/mapper.asciidoc @@ -0,0 +1,39 @@ +[[index-modules-mapper]] +== Mapper + +The mapper module acts as a registry for the type mapping definitions +added to an index either when creating it or by using the put mapping +api. It also handles the dynamic mapping support for types that have no +explicit mappings pre defined. For more information about mapping +definitions, check out the <>. + +[float] +=== Dynamic / Default Mappings + +Dynamic mappings allow to automatically apply generic mapping definition +to types that do not have mapping pre defined or applied to new mapping +definitions (overridden). 
This is mainly done thanks to the fact that +the `object` type and namely the root `object` type allow for schema +less dynamic addition of unmapped fields. + +The default mapping definition is plain mapping definition that is +embedded within ElasticSearch: + +[source,js] +-------------------------------------------------- +{ + _default_ : { + } +} +-------------------------------------------------- + +Pretty short, no? Basically, everything is defaulted, especially the +dynamic nature of the root object mapping. The default mapping +definition can be overridden in several manners. The simplest manner is +to simply define a file called `default-mapping.json` and placed it +under the `config` directory (which can be configured to exist in a +different location). It can also be explicitly set using the +`index.mapper.default_mapping_location` setting. + +Dynamic creation of mappings for unmapped types can be completely +disabled by setting `index.mapper.dynamic` to `false`. diff --git a/docs/reference/index-modules/merge.asciidoc b/docs/reference/index-modules/merge.asciidoc new file mode 100644 index 00000000000..fd987055fe5 --- /dev/null +++ b/docs/reference/index-modules/merge.asciidoc @@ -0,0 +1,203 @@ +[[index-modules-merge]] +== Merge + +A shard in elasticsearch is a Lucene index, and a Lucene index is broken +down into segments. Segments are internal storage elements in the index +where the index data is stored, and are immutable up to delete markers. +Segments are, periodically, merged into larger segments to keep the +index size at bay and expunge deletes. + +The more segments one has in the Lucene index mean slower searches and +more memory used, but, low number of segments means more merging that +has to go on. + +Since merges can be expensive to perform, especially on low IO +environments, they can be throttled using store level throttling. Read +the store module documentation on how to set it. + +[float] +=== Policy + +The index merge policy module allows one to control which segments of a +shard index are to be merged. There are several types of policies with +the default set to `tiered`. + +[float] +==== tiered + +Merges segments of approximately equal size, subject to an allowed +number of segments per tier. This is similar to `log_bytes_size` merge +policy, except this merge policy is able to merge non-adjacent segment, +and separates how many segments are merged at once from how many +segments are allowed per tier. This merge policy also does not +over-merge (i.e., cascade merges). + +This policy has the following settings: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`index.merge.policy.expunge_deletes_allowed` |When expungeDeletes is +called, we only merge away a segment if its delete percentage is over +this threshold. Default is `10`. + +|`index.merge.policy.floor_segment` |Segments smaller than this are +"rounded up" to this size, i.e. treated as equal (floor) size for merge +selection. This is to prevent frequent flushing of tiny segments from +allowing a long tail in the index. Default is `2mb`. + +|`index.merge.policy.max_merge_at_once` |Maximum number of segments to +be merged at a time during "normal" merging. Default is `10`. + +|`index.merge.policy.max_merge_at_once_explicit` |Maximum number of +segments to be merged at a time, during optimize or expungeDeletes. +Default is `30`. 
+ +|`index.merge.policy.max_merged_segment` |Maximum sized segment to +produce during normal merging (not explicit optimize). This setting is +approximate: the estimate of the merged segment size is made by summing +sizes of to-be-merged segments (compensating for percent deleted docs). +Default is `5gb`. + +|`index.merge.policy.segments_per_tier` |Sets the allowed number of +segments per tier. Smaller values mean more merging but fewer segments. +Default is `10`. Note, this value needs to be >= then the +`max_merge_at_once_` otherwise you'll force too many merges to occur. + +|`index.reclaim_deletes_weight` |Controls how aggressively merges that +reclaim more deletions are favored. Higher values favor selecting merges +that reclaim deletions. A value of `0.0` means deletions don't impact +merge selection. Defaults to `2.0`. + +|`index.compound_format` |Should the index be stored in compound format +or not. Defaults to `false`. +|======================================================================= + +For normal merging, this policy first computes a "budget" of how many +segments are allowed by be in the index. If the index is over-budget, +then the policy sorts segments by decreasing size (pro-rating by percent +deletes), and then finds the least-cost merge. Merge cost is measured by +a combination of the "skew" of the merge (size of largest seg divided by +smallest seg), total merge size and pct deletes reclaimed, so that +merges with lower skew, smaller size and those reclaiming more deletes, +are favored. + +If a merge will produce a segment that's larger than +`max_merged_segment` then the policy will merge fewer segments (down to +1 at once, if that one has deletions) to keep the segment size under +budget. + +Note, this can mean that for large shards that holds many gigabytes of +data, the default of `max_merged_segment` (`5gb`) can cause for many +segments to be in an index, and causing searches to be slower. Use the +indices segments API to see the segments that an index have, and +possibly either increase the `max_merged_segment` or issue an optimize +call for the index (try and aim to issue it on a low traffic time). + +[float] +==== log_byte_size + +A merge policy that merges segments into levels of exponentially +increasing *byte size*, where each level has fewer segments than the +value of the merge factor. Whenever extra segments (beyond the merge +factor upper bound) are encountered, all segments within the level are +merged. + +This policy has the following settings: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|index.merge.policy.merge_factor |Determines how often segment indices +are merged by index operation. With smaller values, less RAM is used +while indexing, and searches on unoptimized indices are faster, but +indexing speed is slower. With larger values, more RAM is used during +indexing, and while searches on unoptimized indices are slower, indexing +is faster. Thus larger values (greater than 10) are best for batch index +creation, and smaller values (lower than 10) for indices that are +interactively maintained. Defaults to `10`. + +|index.merge.policy.min_merge_size |A size setting type which sets the +minimum size for the lowest level segments. Any segments below this size +are considered to be on the same level (even if they vary drastically in +size) and will be merged whenever there are mergeFactor of them. 
This +effectively truncates the "long tail" of small segments that would +otherwise be created into a single level. If you set this too large, it +could greatly increase the merging cost during indexing (if you flush +many small segments). Defaults to `1.6mb` + +|index.merge.policy.max_merge_size |A size setting type which sets the +largest segment (measured by total byte size of the segment's files) +that may be merged with other segments. Defaults to unbounded. + +|index.merge.policy.max_merge_docs |Determines the largest segment +(measured by document count) that may be merged with other segments. +Defaults to unbounded. +|======================================================================= + +[float] +==== log_doc + +A merge policy that tries to merge segments into levels of exponentially +increasing *document count*, where each level has fewer segments than +the value of the merge factor. Whenever extra segments (beyond the merge +factor upper bound) are encountered, all segments within the level are +merged. + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|index.merge.policy.merge_factor |Determines how often segment indices +are merged by index operation. With smaller values, less RAM is used +while indexing, and searches on unoptimized indices are faster, but +indexing speed is slower. With larger values, more RAM is used during +indexing, and while searches on unoptimized indices are slower, indexing +is faster. Thus larger values (greater than 10) are best for batch index +creation, and smaller values (lower than 10) for indices that are +interactively maintained. Defaults to `10`. + +|index.merge.policy.min_merge_docs |Sets the minimum size for the lowest +level segments. Any segments below this size are considered to be on the +same level (even if they vary drastically in size) and will be merged +whenever there are mergeFactor of them. This effectively truncates the +"long tail" of small segments that would otherwise be created into a +single level. If you set this too large, it could greatly increase the +merging cost during indexing (if you flush many small segments). +Defaults to `1000`. + +|index.merge.policy.max_merge_docs |Determines the largest segment +(measured by document count) that may be merged with other segments. +Defaults to unbounded. +|======================================================================= + +[float] +=== Scheduling + +The merge schedule controls the execution of merge operations once they +are needed (according to the merge policy). The following types are +supported, with the default being the `ConcurrentMergeScheduler`. + +[float] +==== ConcurrentMergeScheduler + +A merge scheduler that runs merges using a separated thread, until the +maximum number of threads at which when a merge is needed, the thread(s) +that are updating the index will pause until one or more merges +completes. + +The scheduler supports the following settings: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|index.merge.scheduler.max_thread_count |The maximum number of threads +to perform the merge operation. Defaults to +`Math.max(1, Math.min(3, Runtime.getRuntime().availableProcessors() / 2))`. 
+|=======================================================================
+
+[float]
+==== SerialMergeScheduler
+
+A merge scheduler that simply does each merge sequentially using the
+calling thread (blocking the operations that triggered the merge, the
+index operation).
diff --git a/docs/reference/index-modules/similarity.asciidoc b/docs/reference/index-modules/similarity.asciidoc
new file mode 100644
index 00000000000..1143e6e418b
--- /dev/null
+++ b/docs/reference/index-modules/similarity.asciidoc
@@ -0,0 +1,136 @@
+[[index-modules-similarity]]
+== Similarity module
+
+A similarity (scoring / ranking model) defines how matching documents
+are scored. Similarity is per field, meaning that via the mapping one
+can define a different similarity per field.
+
+Configuring a custom similarity is considered an expert feature and the
+builtin similarities are most likely sufficient, as described in the
+<>
+
+Configuring similarities is a `0.90.0.Beta1` feature.
+
+[float]
+=== Configuring a similarity
+
+Most existing or custom Similarities have configuration options which
+can be configured via the index settings as shown below. The index
+options can be provided when creating an index or updating index
+settings.
+
+[source,js]
+--------------------------------------------------
+"similarity" : {
+    "my_similarity" : {
+        "type" : "DFR",
+        "basic_model" : "g",
+        "after_effect" : "l",
+        "normalization" : "h2",
+        "normalization.h2.c" : "3.0"
+    }
+}
+--------------------------------------------------
+
+Here we configure the DFRSimilarity so it can be referenced as
+`my_similarity` in mappings, as illustrated in the example below:
+
+[source,js]
+--------------------------------------------------
+{
+    "book" : {
+        "properties" : {
+            "title" : { "type" : "string", "similarity" : "my_similarity" }
+        }
+    }
+}
+--------------------------------------------------
+
+[float]
+=== Available similarities
+
+[float]
+==== Default similarity
+
+The default similarity, which is based on the TF/IDF model. This
+similarity has the following option:
+
+`discount_overlaps`::
+    Determines whether overlap tokens (tokens with
+    0 position increment) are ignored when computing norms. By default this
+    is true, meaning overlap tokens do not count when computing norms.
+
+Type name: `default`
+
+[float]
+==== BM25 similarity
+
+Another TF/IDF based similarity that has built-in tf normalization and
+is supposed to work better for short fields (like names). See
+http://en.wikipedia.org/wiki/Okapi_BM25[Okapi_BM25] for more details.
+This similarity has the following options:
+
+[horizontal]
+`k1`::
+    Controls non-linear term frequency normalization
+    (saturation).
+
+`b`::
+    Controls to what degree document length normalizes tf values.
+
+`discount_overlaps`::
+    Determines whether overlap tokens (tokens with
+    0 position increment) are ignored when computing norms. By default this
+    is true, meaning overlap tokens do not count when computing norms.
+
+Type name: `BM25`
+
+[float]
+==== DFR similarity
+
+Similarity that implements the
+http://lucene.apache.org/core/4_1_0/core/org/apache/lucene/search/similarities/DFRSimilarity.html[divergence
+from randomness] framework. This similarity has the following options:
+
+[horizontal]
+`basic_model`::
+    Possible values: `be`, `d`, `g`, `if`, `in`, `ine` and `p`.
+
+`after_effect`::
+    Possible values: `no`, `b` and `l`.
+
+`normalization`::
+    Possible values: `no`, `h1`, `h2`, `h3` and `z`.
+
+All options but the first need a normalization value.
+ +Type name: `DFR` + +[float] +==== IB similarity. + +http://lucene.apache.org/core/4_1_0/core/org/apache/lucene/search/similarities/IBSimilarity.html[Information +based model] . This similarity has the following options: + +[horizontal] +`distribution`:: Possible values: `ll` and `spl`. +`lambda`:: Possible values: `df` and `ttf`. +`normalization`:: Same as in `DFR` similarity. + +Type name: `IB` + +[float] +==== Default and Base Similarities + +By default, Elasticsearch will use whatever similarity is configured as +`default`. However, the similarity functions `queryNorm()` and `coord()` +are not per-field. Consequently, for expert users wanting to change the +implementation used for these two methods, while not changing the +`default`, it is possible to configure a similarity with the name +`base`. This similarity will then be used for the two methods. + +You can change the default similarity for all fields like this: + +[source,js] +-------------------------------------------------- +index.similarity.default.type: BM25 +-------------------------------------------------- diff --git a/docs/reference/index-modules/slowlog.asciidoc b/docs/reference/index-modules/slowlog.asciidoc new file mode 100644 index 00000000000..ec4beecc41e --- /dev/null +++ b/docs/reference/index-modules/slowlog.asciidoc @@ -0,0 +1,85 @@ +[[index-modules-slowlog]] +== Index Slow Log + +[float] +=== Search Slow Log + +Shard level slow search log allows to log slow search (query and fetch +executions) into a dedicated log file. + +Thresholds can be set for both the query phase of the execution, and +fetch phase, here is a sample: + +[source,js] +-------------------------------------------------- +#index.search.slowlog.threshold.query.warn: 10s +#index.search.slowlog.threshold.query.info: 5s +#index.search.slowlog.threshold.query.debug: 2s +#index.search.slowlog.threshold.query.trace: 500ms + +#index.search.slowlog.threshold.fetch.warn: 1s +#index.search.slowlog.threshold.fetch.info: 800ms +#index.search.slowlog.threshold.fetch.debug: 500ms +#index.search.slowlog.threshold.fetch.trace: 200ms +-------------------------------------------------- + +By default, none are enabled (set to `-1`). Levels (`warn`, `info`, +`debug`, `trace`) allow to control under which logging level the log +will be logged. Not all are required to be configured (for example, only +`warn` threshold can be set). The benefit of several levels is the +ability to quickly "grep" for specific thresholds breached. + +The logging is done on the shard level scope, meaning the execution of a +search request within a specific shard. It does not encompass the whole +search request, which can be broadcast to several shards in order to +execute. Some of the benefits of shard level logging is the association +of the actual execution on the specific machine, compared with request +level. + +All settings are index level settings (and each index can have different +values for it), and can be changed in runtime using the index update +settings API. 
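+
+For example, a threshold can be raised on a live index like this (a
+minimal sketch; the index name `test` is arbitrary):
+
+[source,js]
+--------------------------------------------------
+curl -XPUT 'localhost:9200/test/_settings' -d '{
+    "index.search.slowlog.threshold.query.warn" : "10s",
+    "index.search.slowlog.threshold.fetch.debug" : "500ms"
+}'
+--------------------------------------------------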
+ +The logging file is configured by default using the following +configuration (found in `logging.yml`): + +[source,js] +-------------------------------------------------- +index_search_slow_log_file: + type: dailyRollingFile + file: ${path.logs}/${cluster.name}_index_search_slowlog.log + datePattern: "'.'yyyy-MM-dd" + layout: + type: pattern + conversionPattern: "[%d{ISO8601}][%-5p][%-25c] %m%n" +-------------------------------------------------- + +[float] +=== Index Slow log + +p.The indexing slow log, similar in functionality to the search slow +log. The log file is ends with `_index_indexing_slowlog.log`. Log and +the thresholds are configured in the elasticsearch.yml file in the same +way as the search slowlog. Index slowlog sample: + +[source,js] +-------------------------------------------------- +#index.indexing.slowlog.threshold.index.warn: 10s +#index.indexing.slowlog.threshold.index.info: 5s +#index.indexing.slowlog.threshold.index.debug: 2s +#index.indexing.slowlog.threshold.index.trace: 500ms +-------------------------------------------------- + +The index slow log file is configured by default in the `logging.yml` +file: + +[source,js] +-------------------------------------------------- +index_indexing_slow_log_file: + type: dailyRollingFile + file: ${path.logs}/${cluster.name}_index_indexing_slowlog.log + datePattern: "'.'yyyy-MM-dd" + layout: + type: pattern + conversionPattern: "[%d{ISO8601}][%-5p][%-25c] %m%n" +-------------------------------------------------- diff --git a/docs/reference/index-modules/store.asciidoc b/docs/reference/index-modules/store.asciidoc new file mode 100644 index 00000000000..5bf9b47ad56 --- /dev/null +++ b/docs/reference/index-modules/store.asciidoc @@ -0,0 +1,146 @@ +[[index-modules-store]] +== Store + +The store module allows you to control how index data is stored. + +The index can either be stored in-memory (no persistence) or on-disk +(the default). In-memory indices provide better performance at the cost +of limiting the index size to the amount of available physical memory. + +When using a local gateway (the default), file system storage with *no* +in memory storage is required to maintain index consistency. This is +required since the local gateway constructs its state from the local +index state of each node. + +Another important aspect of memory based storage is the fact that +ElasticSearch supports storing the index in memory *outside of the JVM +heap space* using the "Memory" (see below) storage type. It translates +to the fact that there is no need for extra large JVM heaps (with their +own consequences) for storing the index in memory. + +[float] +=== Store Level Compression + +*From version 0.90 onwards, store compression is always enabled.* + +For versions 0.19.5 to 0.20: + +In the mapping, one can configure the `_source` field to be compressed. +The problem with it is the fact that small documents don't end up +compressing well, as several documents compressed in a single +compression "block" will provide a considerable better compression +ratio. This version introduces the ability to compress stored fields +using the `index.store.compress.stored` setting, as well as term vector +using the `index.store.compress.tv` setting. + +The settings can be set on the index level, and are dynamic, allowing to +change them using the index update settings API. elasticsearch can +handle mixed stored / non stored cases. 
+This allows, for example, enabling compression at a later stage in the
+index lifecycle, and then optimizing the index to make use of it
+(generating new segments that use compression).
+
+Compared to `_source` level compression, the biggest gains will be seen
+when indexing smaller documents (less than 64k). The price, on the other
+hand, is that for each doc returned, a block needs to be decompressed
+(it's fast, though) in order to extract the document data.
+
+[float]
+=== Store Level Throttling
+
+(0.19.5 and above).
+
+Lucene, the IR library elasticsearch uses under the covers, works by
+creating immutable segments (up to deletes) and constantly merging them
+(the merge policy settings control how those merges happen). The merge
+process happens asynchronously, without affecting indexing / search
+speed. The problem, though, especially on systems with low IO, is that
+the merge process can be expensive and affect search / index operations
+simply because the box is now taxed with more IO.
+
+The store module allows throttling to be configured for merges (or for
+all operations), either on the node level or on the index level. Node
+level throttling makes sure that, across all the shards allocated on
+that node, the merge process won't exceed the configured bytes per
+second. It can be set by setting `indices.store.throttle.type` to
+`merge`, and setting `indices.store.throttle.max_bytes_per_sec` to
+something like `5mb`. The node level settings can be changed dynamically
+using the cluster update settings API. Since 0.90.1 the default is set
+to `20mb` with type `merge`.
+
+If specific index level configuration is needed, regardless of the node
+level settings, it can be set as well using
+`index.store.throttle.type` and
+`index.store.throttle.max_bytes_per_sec`. The default value for the type
+is `node`, meaning it will throttle based on the node level settings and
+participate in the global throttling. Both settings can be changed
+dynamically using the index update settings API.
+
+The following sections list the different storage types supported.
+
+[float]
+=== File System
+
+File system based storage is the default. There are several file system
+implementations, or storage types, and the best one for the operating
+environment will be chosen automatically: `mmapfs` on Solaris/Windows
+64bit, `simplefs` on Windows 32bit, and `niofs` for the rest.
+
+The following are the different file system based storage types:
+
+[float]
+==== Simple FS
+
+The `simplefs` type is a straightforward implementation of file system
+storage (maps to Lucene `SimpleFsDirectory`) using a random access file.
+This implementation has poor concurrent performance (multiple threads
+will bottleneck). It is usually better to use `niofs` when you need
+index persistence.
+
+[float]
+==== NIO FS
+
+The `niofs` type stores the shard index on the file system (maps to
+Lucene `NIOFSDirectory`) using NIO. It allows multiple threads to read
+from the same file concurrently. It is not recommended on Windows
+because of a bug in the SUN Java implementation.
+
+[float]
+==== MMap FS
+
+The `mmapfs` type stores the shard index on the file system (maps to
+Lucene `MMapDirectory`) by mapping a file into memory (mmap). Memory
+mapping uses up a portion of the virtual memory address space in your
+process equal to the size of the file being mapped. Before using this
+store type, be sure you have plenty of virtual address space.
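+
+As a minimal sketch, assuming the storage type can be selected per index
+via an `index.store.type` setting (the setting name and index name here
+are assumptions, not taken from this section), an index could be created
+with the `niofs` store like this:
+
+[source,js]
+--------------------------------------------------
+# assumes index.store.type selects the store implementation for my_index
+$ curl -XPUT 'http://localhost:9200/my_index/' -d '{
+    "settings" : {
+        "index.store.type" : "niofs"
+    }
+}'
+--------------------------------------------------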
+ +[float] +=== Memory + +The `memory` type stores the index in main memory with the following +configuration options: + +There are also *node* level settings that control the caching of buffers +(important when using direct buffers): + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`cache.memory.direct` |Should the memory be allocated outside of the +JVM heap. Defaults to `true`. + +|`cache.memory.small_buffer_size` |The small buffer size, defaults to +`1kb`. + +|`cache.memory.large_buffer_size` |The large buffer size, defaults to +`1mb`. + +|`cache.memory.small_cache_size` |The small buffer cache size, defaults +to `10mb`. + +|`cache.memory.large_cache_size` |The large buffer cache size, defaults +to `500mb`. +|======================================================================= + diff --git a/docs/reference/index-modules/translog.asciidoc b/docs/reference/index-modules/translog.asciidoc new file mode 100644 index 00000000000..d887cb149f6 --- /dev/null +++ b/docs/reference/index-modules/translog.asciidoc @@ -0,0 +1,25 @@ +[[index-modules-translog]] +== Translog + +Each shard has a transaction log or write ahead log associated with it. +It allows to guarantee that when an index/delete operation occurs, it is +applied atomically, while not "committing" the internal Lucene index for +each request. A flush ("commit") still happens based on several +parameters: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|index.translog.flush_threshold_ops |After how many operations to flush. +Defaults to `5000`. + +|index.translog.flush_threshold_size |Once the translog hits this size, +a flush will happen. Defaults to `200mb`. + +|index.translog.flush_threshold_period |The period with no flush +happening to force a flush. Defaults to `30m`. +|======================================================================= + +Note: these parameters can be updated at runtime using the Index +Settings Update API (for example, these number can be increased when +executing bulk updates to support higher TPS) diff --git a/docs/reference/index.asciidoc b/docs/reference/index.asciidoc new file mode 100644 index 00000000000..f2552ee87dd --- /dev/null +++ b/docs/reference/index.asciidoc @@ -0,0 +1,25 @@ +[[elasticsearch-reference]] += Reference + +include::setup.asciidoc[] + +include::docs.asciidoc[] + +include::search.asciidoc[] + +include::indices.asciidoc[] + +include::cluster.asciidoc[] + +include::query-dsl.asciidoc[] + +include::mapping.asciidoc[] + +include::analysis.asciidoc[] + +include::modules.asciidoc[] + +include::index-modules.asciidoc[] + +include::glossary.asciidoc[] + diff --git a/docs/reference/indices.asciidoc b/docs/reference/indices.asciidoc new file mode 100644 index 00000000000..493a53f5a33 --- /dev/null +++ b/docs/reference/indices.asciidoc @@ -0,0 +1,96 @@ +[[indices]] += Indices APIs + +[partintro] +-- +The indices APIs are used to manage individual indices, +index settings, aliases, mappings, index templates +and warmers. 
+ +[float] +== Index management: + +* <> +* <> +* <> +* <> + +[float] +== Mapping management: + +* <> +* <> +* <> + +[float] +== Alias management: +* <> + +[float] +== Index settings: +* <> +* <> +* <> +* <> +* <> + +[float] +== Monitoring: +* <> +* <> +* <> + +[float] +== Status management: +* <> +* <> +* <> +* <> +* <> + +-- + +include::indices/create-index.asciidoc[] + +include::indices/delete-index.asciidoc[] + +include::indices/indices-exists.asciidoc[] + +include::indices/open-close.asciidoc[] + +include::indices/put-mapping.asciidoc[] + +include::indices/get-mapping.asciidoc[] + +include::indices/types-exists.asciidoc[] + +include::indices/delete-mapping.asciidoc[] + +include::indices/aliases.asciidoc[] + +include::indices/update-settings.asciidoc[] + +include::indices/get-settings.asciidoc[] + +include::indices/analyze.asciidoc[] + +include::indices/templates.asciidoc[] + +include::indices/warmers.asciidoc[] + +include::indices/status.asciidoc[] + +include::indices/stats.asciidoc[] + +include::indices/segments.asciidoc[] + +include::indices/clearcache.asciidoc[] + +include::indices/flush.asciidoc[] + +include::indices/refresh.asciidoc[] + +include::indices/optimize.asciidoc[] + +include::indices/gateway-snapshot.asciidoc[] + diff --git a/docs/reference/indices/aliases.asciidoc b/docs/reference/indices/aliases.asciidoc new file mode 100644 index 00000000000..14bc2c9d24b --- /dev/null +++ b/docs/reference/indices/aliases.asciidoc @@ -0,0 +1,351 @@ +[[indices-aliases]] +== Index Aliases + +APIs in elasticsearch accept an index name when working against a +specific index, and several indices when applicable. The index aliases +API allow to alias an index with a name, with all APIs automatically +converting the alias name to the actual index name. An alias can also be +mapped to more than one index, and when specifying it, the alias will +automatically expand to the aliases indices. An alias can also be +associated with a filter that will automatically be applied when +searching, and routing values. + +Here is a sample of associating the alias `alias1` with index `test1`: + +[source,js] +-------------------------------------------------- +curl -XPOST 'http://localhost:9200/_aliases' -d ' +{ + "actions" : [ + { "add" : { "index" : "test1", "alias" : "alias1" } } + ] +}' +-------------------------------------------------- + +An alias can also be removed, for example: + +[source,js] +-------------------------------------------------- +curl -XPOST 'http://localhost:9200/_aliases' -d ' +{ + "actions" : [ + { "remove" : { "index" : "test1", "alias" : "alias1" } } + ] +}' +-------------------------------------------------- + +Renaming an alias is a simple `remove` then `add` operation within the +same API. 
This operation is atomic, no need to worry about a short +period of time where the alias does not point to an index: + +[source,js] +-------------------------------------------------- +curl -XPOST 'http://localhost:9200/_aliases' -d ' +{ + "actions" : [ + { "remove" : { "index" : "test1", "alias" : "alias1" } }, + { "add" : { "index" : "test1", "alias" : "alias2" } } + ] +}' +-------------------------------------------------- + +Associating an alias with more than one index are simply several `add` +actions: + +[source,js] +-------------------------------------------------- +curl -XPOST 'http://localhost:9200/_aliases' -d ' +{ + "actions" : [ + { "add" : { "index" : "test1", "alias" : "alias1" } }, + { "add" : { "index" : "test2", "alias" : "alias1" } } + ] +}' +-------------------------------------------------- + +It is an error to index to an alias which points to more than one index. + +[float] +=== Filtered Aliases + +Aliases with filters provide an easy way to create different "views" of +the same index. The filter can be defined using Query DSL and is applied +to all Search, Count, Delete By Query and More Like This operations with +this alias. Here is an example: + +[source,js] +-------------------------------------------------- +curl -XPOST 'http://localhost:9200/_aliases' -d ' +{ + "actions" : [ + { + "add" : { + "index" : "test1", + "alias" : "alias2", + "filter" : { "term" : { "user" : "kimchy" } } + } + } + ] +}' +-------------------------------------------------- + +[float] +==== Routing + +It is possible to associate routing values with aliases. This feature +can be used together with filtering aliases in order to avoid +unnecessary shard operations. + +The following command creates a new alias `alias1` that points to index +`test`. After `alias1` is created, all operations with this alias are +automatically modified to use value `1` for routing: + +[source,js] +-------------------------------------------------- +curl -XPOST 'http://localhost:9200/_aliases' -d ' +{ + "actions" : [ + { + "add" : { + "index" : "test", + "alias" : "alias1", + "routing" : "1" + } + } + ] +}' +-------------------------------------------------- + +It's also possible to specify different routing values for searching +and indexing operations: + +[source,js] +-------------------------------------------------- +curl -XPOST 'http://localhost:9200/_aliases' -d ' +{ + "actions" : [ + { + "add" : { + "index" : "test", + "alias" : "alias2", + "search_routing" : "1,2", + "index_routing" : "2" + } + } + ] +}' +-------------------------------------------------- + +As shown in the example above, search routing may contain several values +separated by comma. Index routing can contain only a single value. + +If an operation that uses routing alias also has a routing parameter, an +intersection of both alias routing and routing specified in the +parameter is used. For example the following command will use "2" as a +routing value: + +[source,js] +-------------------------------------------------- +curl -XGET 'http://localhost:9200/alias2/_search?q=user:kimchy&routing=2,3' +-------------------------------------------------- + +[float] +=== Add a single index alias + +From version `0.90.1` there is an api to add a single index alias, +options: + +[horizontal] +`index`:: The index to alias refers to. This is a required option. +`alias`:: The name of the alias. This is a required option. +`routing`:: An optional routing that can be associated with an alias. 
+
+`filter`:: An optional filter that can be associated with an alias.
+
+The rest endpoint is: `/{index}/_alias/{alias}`.
+
+[float]
+==== Examples:
+
+Adding a time based alias:
+
+[source,js]
+--------------------------------------------------
+curl -XPUT 'localhost:9200/logs_201305/_alias/2013'
+--------------------------------------------------
+
+Adding a user alias:
+
+[source,js]
+--------------------------------------------------
+curl -XPUT 'localhost:9200/users/_alias/user_12' -d '{
+    "routing" : "12",
+    "filter" : {
+        "term" : {
+            "user_id" : 12
+        }
+    }
+}'
+--------------------------------------------------
+
+[float]
+=== Delete a single index alias
+
+From version `0.90.1` there is an api to delete a single index alias.
+Options:
+
+[horizontal]
+`index`:: The index the alias to be deleted is in. This is a required
+    option.
+`alias`:: The name of the alias to delete. This is a required option.
+
+The rest endpoint is: `/{index}/_alias/{alias}`. Example:
+
+[source,js]
+--------------------------------------------------
+curl -XDELETE 'localhost:9200/users/_alias/user_12'
+--------------------------------------------------
+
+[float]
+=== Retrieving existing aliases
+
+The get index alias api (available since `0.90.1`) allows filtering by
+alias name and index name. This api redirects to the master and fetches
+the requested index aliases, if available. Only the index aliases found
+are serialised in the response.
+
+Possible options:
+[horizontal]
+`index`::
+    The index name to get aliases for. Partial names are supported via
+    wildcards, and multiple index names can be specified, separated by
+    commas. An alias name can also be used instead of an index name.
+
+`alias`::
+    The name of the alias to return in the response. Like the `index`
+    option, this option supports wildcards and multiple alias names
+    separated by commas. This is a required option.
+
+`ignore_indices`::
+    What to do if a specified index name doesn't exist. If set to
+    `missing` then those indices are ignored.
+
+The rest endpoint is: `/{index}/_alias/{alias}`.
+ +[float] +==== Examples: + +All aliases for the index users: + +[source,js] +-------------------------------------------------- +curl -XGET 'localhost:9200/users/_alias/*' +-------------------------------------------------- + +Response: + +[source,js] +-------------------------------------------------- + { + "users" : { + "aliases" : { + "user_13" : { + "filter" : { + "term" : { + "user_id" : 13 + } + }, + "index_routing" : "13", + "search_routing" : "13" + }, + "user_14" : { + "filter" : { + "term" : { + "user_id" : 14 + } + }, + "index_routing" : "14", + "search_routing" : "14" + }, + "user_12" : { + "filter" : { + "term" : { + "user_id" : 12 + } + }, + "index_routing" : "12", + "search_routing" : "12" + } + } + } +} +-------------------------------------------------- + +All aliases with the name 2013 in any index: + +[source,js] +-------------------------------------------------- +curl -XGET 'localhost:9200/_alias/2013' +-------------------------------------------------- + +Response: + +[source,js] +-------------------------------------------------- +{ + "logs_201304" : { + "aliases" : { + "2013" : { } + } + }, + "logs_201305" : { + "aliases" : { + "2013" : { } + } + } +} +-------------------------------------------------- + +All aliases that start with 2013_01 in any index: + +[source,js] +-------------------------------------------------- +curl -XGET 'localhost:9200/_alias/2013_01*' +-------------------------------------------------- + +Response: + +[source,js] +-------------------------------------------------- +{ + "logs_20130101" : { + "aliases" : { + "2013_01" : { } + } + } +} +-------------------------------------------------- + +There is also a HEAD variant of the get indices aliases api to check if +index aliases exist. The indices aliases exists api supports the same +option as the get indices aliases api. Examples: + +[source,js] +-------------------------------------------------- +curl -XHEAD 'localhost:9200/_alias/2013' +curl -XHEAD 'localhost:9200/_alias/2013_01*' +curl -XHEAD 'localhost:9200/users/_alias/*' +-------------------------------------------------- + +[float] +=== Pre 0.90.1 way of getting index aliases + +Aliases can be retrieved using the get aliases API, which can either +return all indices with all aliases, or just for specific indices: + +[source,js] +-------------------------------------------------- +curl -XGET 'localhost:9200/test/_aliases' +curl -XGET 'localhost:9200/test1,test2/_aliases' +curl -XGET 'localhost:9200/_aliases' +-------------------------------------------------- diff --git a/docs/reference/indices/analyze.asciidoc b/docs/reference/indices/analyze.asciidoc new file mode 100644 index 00000000000..98ed9934ce6 --- /dev/null +++ b/docs/reference/indices/analyze.asciidoc @@ -0,0 +1,57 @@ +[[indices-analyze]] +== Analyze + +Performs the analysis process on a text and return the tokens breakdown +of the text. 
+
+It can be used without specifying an index, against one of the many
+built in analyzers:
+
+[source,js]
+--------------------------------------------------
+curl -XGET 'localhost:9200/_analyze?analyzer=standard' -d 'this is a test'
+--------------------------------------------------
+
+Or by building a custom transient analyzer out of tokenizers and
+filters:
+
+[source,js]
+--------------------------------------------------
+curl -XGET 'localhost:9200/_analyze?tokenizer=keyword&filters=lowercase' -d 'this is a test'
+--------------------------------------------------
+
+It can also run against a specific index:
+
+[source,js]
+--------------------------------------------------
+curl -XGET 'localhost:9200/test/_analyze?text=this+is+a+test'
+--------------------------------------------------
+
+The above will run an analysis on the "this is a test" text, using the
+default index analyzer associated with the `test` index. An `analyzer`
+can also be provided to use a different analyzer:
+
+[source,js]
+--------------------------------------------------
+curl -XGET 'localhost:9200/test/_analyze?analyzer=whitespace' -d 'this is a test'
+--------------------------------------------------
+
+The analyzer can also be derived based on a field mapping, for example:
+
+[source,js]
+--------------------------------------------------
+curl -XGET 'localhost:9200/test/_analyze?field=obj1.field1' -d 'this is a test'
+--------------------------------------------------
+
+This will cause the analysis to happen based on the analyzer configured
+in the mapping for `obj1.field1` (falling back to the default index
+analyzer if none is configured).
+
+The text can also be provided as part of the request body instead of as
+a parameter.
+
+[float]
+=== Format
+
+By default, the tokens are returned as JSON using the `detailed` format.
+The `text` format value provides the analyzed data in a text stream that
+is a bit more readable.
diff --git a/docs/reference/indices/clearcache.asciidoc b/docs/reference/indices/clearcache.asciidoc
new file mode 100644
index 00000000000..4d2d391d87f
--- /dev/null
+++ b/docs/reference/indices/clearcache.asciidoc
@@ -0,0 +1,31 @@
+[[indices-clearcache]]
+== Clear Cache
+
+The clear cache API allows clearing either all caches or specific caches
+associated with one or more indices.
+
+[source,js]
+--------------------------------------------------
+$ curl -XPOST 'http://localhost:9200/twitter/_cache/clear'
+--------------------------------------------------
+
+The API, by default, will clear all caches. Specific caches can be
+cleared explicitly by setting `filter`, `field_data` or `bloom` to
+`true`.
+
+All caches relating to specific field(s) can also be cleared by
+specifying the `fields` parameter with a comma delimited list of the
+relevant fields.
+
+[float]
+=== Multi Index
+
+The clear cache API can be applied to more than one index with a single
+call, or even on `_all` the indices.
+
+[source,js]
+--------------------------------------------------
+$ curl -XPOST 'http://localhost:9200/kimchy,elasticsearch/_cache/clear'
+
+$ curl -XPOST 'http://localhost:9200/_cache/clear'
+--------------------------------------------------
diff --git a/docs/reference/indices/create-index.asciidoc b/docs/reference/indices/create-index.asciidoc
new file mode 100644
index 00000000000..b1b1d20682c
--- /dev/null
+++ b/docs/reference/indices/create-index.asciidoc
@@ -0,0 +1,79 @@
+[[indices-create-index]]
+== Create Index
+
+The create index API allows you to instantiate an index.
+ElasticSearch provides support for multiple indices, including executing
+operations across several indices. Each index created can have specific
+settings associated with it.
+
+[source,js]
+--------------------------------------------------
+$ curl -XPUT 'http://localhost:9200/twitter/'
+
+$ curl -XPUT 'http://localhost:9200/twitter/' -d '
+index :
+    number_of_shards : 3
+    number_of_replicas : 2
+'
+--------------------------------------------------
+
+The second curl example above shows how an index called `twitter` can be
+created with specific settings using http://www.yaml.org[YAML]. In this
+case it creates an index with 3 shards, each with 2 replicas. The index
+settings can also be defined with http://www.json.org[JSON]:
+
+[source,js]
+--------------------------------------------------
+$ curl -XPUT 'http://localhost:9200/twitter/' -d '{
+    "settings" : {
+        "index" : {
+            "number_of_shards" : 3,
+            "number_of_replicas" : 2
+        }
+    }
+}'
+--------------------------------------------------
+
+or in a more simplified form:
+
+[source,js]
+--------------------------------------------------
+$ curl -XPUT 'http://localhost:9200/twitter/' -d '{
+    "settings" : {
+        "number_of_shards" : 3,
+        "number_of_replicas" : 2
+    }
+}'
+--------------------------------------------------
+
+_Note that you do not have to explicitly specify the `index` section
+inside the `settings` section._
+
+[float]
+=== Mappings
+
+The create index API allows you to provide a set of one or more mappings:
+
+[source,js]
+--------------------------------------------------
+curl -XPOST localhost:9200/test -d '{
+    "settings" : {
+        "number_of_shards" : 1
+    },
+    "mappings" : {
+        "type1" : {
+            "_source" : { "enabled" : false },
+            "properties" : {
+                "field1" : { "type" : "string", "index" : "not_analyzed" }
+            }
+        }
+    }
+}'
+--------------------------------------------------
+
+[float]
+=== Index Settings
+
+For more information regarding all the different index level settings
+that can be set when creating an index, please check the
+<> section.
diff --git a/docs/reference/indices/delete-index.asciidoc b/docs/reference/indices/delete-index.asciidoc
new file mode 100644
index 00000000000..3dbfaee12b4
--- /dev/null
+++ b/docs/reference/indices/delete-index.asciidoc
@@ -0,0 +1,17 @@
+[[indices-delete-index]]
+== Delete Index
+
+The delete index API allows you to delete an existing index.
+
+[source,js]
+--------------------------------------------------
+$ curl -XDELETE 'http://localhost:9200/twitter/'
+--------------------------------------------------
+
+The above example deletes an index called `twitter`.
+
+The delete index API can also be applied to more than one index, or on
+`_all` indices (be careful!). All indices will also be deleted when no
+specific index is provided. To disable the ability to delete all
+indices, set the `action.disable_delete_all_indices` setting in the
+config to `true`.
diff --git a/docs/reference/indices/delete-mapping.asciidoc b/docs/reference/indices/delete-mapping.asciidoc
new file mode 100644
index 00000000000..6f066dbfdce
--- /dev/null
+++ b/docs/reference/indices/delete-mapping.asciidoc
@@ -0,0 +1,8 @@
+[[indices-delete-mapping]]
+== Delete Mapping
+
+Allows you to delete a mapping (type) along with its data. The REST
+endpoint is `/{index}/{type}`, using the `DELETE` method.
+
+Note, most of the time it makes more sense to reindex the data into a
+fresh index rather than deleting large chunks of it.
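+
+As a sketch, deleting a hypothetical `tweet` mapping from a `twitter`
+index would look like this:
+
+[source,js]
+--------------------------------------------------
+$ curl -XDELETE 'http://localhost:9200/twitter/tweet'
+--------------------------------------------------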
diff --git a/docs/reference/indices/flush.asciidoc b/docs/reference/indices/flush.asciidoc new file mode 100644 index 00000000000..02509ffd769 --- /dev/null +++ b/docs/reference/indices/flush.asciidoc @@ -0,0 +1,39 @@ +[[indices-flush]] +== Flush + +The flush API allows to flush one or more indices through an API. The +flush process of an index basically frees memory from the index by +flushing data to the index storage and clearing the internal +<>. By +default, ElasticSearch uses memory heuristics in order to automatically +trigger flush operations as required in order to clear memory. + +[source,js] +-------------------------------------------------- +$ curl -XPOST 'http://localhost:9200/twitter/_flush' +-------------------------------------------------- + +[float] +=== Request Parameters + +The flush API accepts the following request parameters: + +[cols="<,<",options="header",] +|======================================================================= +|Name |Description +|`refresh` |Should a refresh be performed after the flush. Defaults to +`false`. +|======================================================================= + +[float] +=== Multi Index + +The flush API can be applied to more than one index with a single call, +or even on `_all` the indices. + +[source,js] +-------------------------------------------------- +$ curl -XPOST 'http://localhost:9200/kimchy,elasticsearch/_flush' + +$ curl -XPOST 'http://localhost:9200/_flush' +-------------------------------------------------- diff --git a/docs/reference/indices/gateway-snapshot.asciidoc b/docs/reference/indices/gateway-snapshot.asciidoc new file mode 100644 index 00000000000..188d1aa5c8e --- /dev/null +++ b/docs/reference/indices/gateway-snapshot.asciidoc @@ -0,0 +1,29 @@ +[[indices-gateway-snapshot]] +== Gateway Snapshot + +The gateway snapshot API allows to explicitly perform a snapshot through +the gateway of one or more indices (backup them). By default, each index +gateway periodically snapshot changes, though it can be disabled and be +controlled completely through this API. + +Note, this API only applies when using shared storage gateway +implementation, and does not apply when using the (default) local +gateway. + +[source,js] +-------------------------------------------------- +$ curl -XPOST 'http://localhost:9200/twitter/_gateway/snapshot' +-------------------------------------------------- + +[float] +=== Multi Index + +The gateway snapshot API can be applied to more than one index with a +single call, or even on `_all` the indices. + +[source,js] +-------------------------------------------------- +$ curl -XPOST 'http://localhost:9200/kimchy,elasticsearch/_gateway/snapshot' + +$ curl -XPOST 'http://localhost:9200/_gateway/snapshot' +-------------------------------------------------- diff --git a/docs/reference/indices/get-mapping.asciidoc b/docs/reference/indices/get-mapping.asciidoc new file mode 100644 index 00000000000..d084ffcfed0 --- /dev/null +++ b/docs/reference/indices/get-mapping.asciidoc @@ -0,0 +1,37 @@ +[[indices-get-mapping]] +== Get Mapping + +The get mapping API allows to retrieve mapping definition of index or +index/type. + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/twitter/tweet/_mapping' +-------------------------------------------------- + +[float] +=== Multiple Indices and Types + +The get mapping API can be used to get more than one index or type +mapping with a single call. 
General usage of the API follows the +following syntax: `host:port/{index}/{type}/_mapping` where both +`{index}` and `{type}` can stand for comma-separated list of names. To +get mappings for all indices you can use `_all` for `{index}`. The +following are some examples: + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/twitter,kimchy/_mapping' + +$ curl -XGET 'http://localhost:9200/_all/tweet,book/_mapping' +-------------------------------------------------- + +If you want to get mappings of all indices and types then the following +two examples are equivalent: + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/_all/_mapping' + +$ curl -XGET 'http://localhost:9200/_mapping' +-------------------------------------------------- diff --git a/docs/reference/indices/get-settings.asciidoc b/docs/reference/indices/get-settings.asciidoc new file mode 100644 index 00000000000..fd4bae04186 --- /dev/null +++ b/docs/reference/indices/get-settings.asciidoc @@ -0,0 +1,9 @@ +[[indices-get-settings]] +== Get Settings + +The get settings API allows to retrieve settings of index/indices: + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/twitter/_settings' +-------------------------------------------------- diff --git a/docs/reference/indices/indices-exists.asciidoc b/docs/reference/indices/indices-exists.asciidoc new file mode 100644 index 00000000000..422dcba39e9 --- /dev/null +++ b/docs/reference/indices/indices-exists.asciidoc @@ -0,0 +1,12 @@ +[[indices-exists]] +== Indices Exists + +Used to check if the index (indices) exists or not. For example: + +[source,js] +-------------------------------------------------- +curl -XHEAD 'http://localhost:9200/twitter' +-------------------------------------------------- + +The HTTP status code indicates if the index exists or not. A `404` means +it does not exist, and `200` means it does. diff --git a/docs/reference/indices/open-close.asciidoc b/docs/reference/indices/open-close.asciidoc new file mode 100644 index 00000000000..44fca52e559 --- /dev/null +++ b/docs/reference/indices/open-close.asciidoc @@ -0,0 +1,18 @@ +[[indices-open-close]] +== Open / Close Index API + +The open and close index APIs allow to close an index, and later on +opening it. A closed index has almost no overhead on the cluster (except +for maintaining its metadata), and is blocked for read/write operations. +A closed index can be opened which will then go through the normal +recovery process. + +The REST endpoint is `/{index}/_close` and `/{index}/_open`. For +example: + +[source,js] +-------------------------------------------------- +curl -XPOST 'localhost:9200/my_index/_close' + +curl -XPOST 'localhost:9200/my_index/_open' +-------------------------------------------------- diff --git a/docs/reference/indices/optimize.asciidoc b/docs/reference/indices/optimize.asciidoc new file mode 100644 index 00000000000..39f94b3e51b --- /dev/null +++ b/docs/reference/indices/optimize.asciidoc @@ -0,0 +1,55 @@ +[[indices-optimize]] +== Optimize + +The optimize API allows to optimize one or more indices through an API. +The optimize process basically optimizes the index for faster search +operations (and relates to the number of segments a Lucene index holds +within each shard). The optimize operation allows to reduce the number +of segments by merging them. 
+ +[source,js] +-------------------------------------------------- +$ curl -XPOST 'http://localhost:9200/twitter/_optimize' +-------------------------------------------------- + +[float] +=== Request Parameters + +The optimize API accepts the following request parameters: + +[cols="<,<",options="header",] +|======================================================================= +|Name |Description +|max_num_segments |The number of segments to optimize to. To fully +optimize the index, set it to `1`. Defaults to simply checking if a +merge needs to execute, and if so, executes it. + +|only_expunge_deletes |Should the optimize process only expunge segments +with deletes in it. In Lucene, a document is not deleted from a segment, +just marked as deleted. During a merge process of segments, a new +segment is created that does not have those deletes. This flag allow to +only merge segments that have deletes. Defaults to `false`. + +|refresh |Should a refresh be performed after the optimize. Defaults to +`true`. + +|flush |Should a flush be performed after the optimize. Defaults to +`true`. + +|wait_for_merge |Should the request wait for the merge to end. Defaults +to `true`. Note, a merge can potentially be a very heavy operation, so +it might make sense to run it set to `false`. +|======================================================================= + +[float] +=== Multi Index + +The optimize API can be applied to more than one index with a single +call, or even on `_all` the indices. + +[source,js] +-------------------------------------------------- +$ curl -XPOST 'http://localhost:9200/kimchy,elasticsearch/_optimize' + +$ curl -XPOST 'http://localhost:9200/_optimize' +-------------------------------------------------- diff --git a/docs/reference/indices/put-mapping.asciidoc b/docs/reference/indices/put-mapping.asciidoc new file mode 100644 index 00000000000..029514f9344 --- /dev/null +++ b/docs/reference/indices/put-mapping.asciidoc @@ -0,0 +1,59 @@ +[[indices-put-mapping]] +== Put Mapping + +The put mapping API allows to register specific mapping definition for a +specific type. + +[source,js] +-------------------------------------------------- +$ curl -XPUT 'http://localhost:9200/twitter/tweet/_mapping' -d ' +{ + "tweet" : { + "properties" : { + "message" : {"type" : "string", "store" : "yes"} + } + } +} +' +-------------------------------------------------- + +The above example creates a mapping called `tweet` within the `twitter` +index. The mapping simply defines that the `message` field should be +stored (by default, fields are not stored, just indexed) so we can +retrieve it later on using selective loading. + +More information on how to define type mappings can be found in the +<> section. + +[float] +=== Merging & Conflicts + +When an existing mapping already exists under the given type, the two +mapping definitions, the one already defined, and the new ones are +merged. The `ignore_conflicts` parameters can be used to control if +conflicts should be ignored or not, by default, it is set to `false` +which means conflicts are *not* ignored. + +The definition of conflict is really dependent on the type merged, but +in general, if a different core type is defined, it is considered as a +conflict. New mapping definitions can be added to object types, and core +type mapping can be upgraded to `multi_field` type. + +[float] +=== Multi Index + +The put mapping API can be applied to more than one index with a single +call, or even on `_all` the indices. 
+ +[source,js] +-------------------------------------------------- +$ curl -XPUT 'http://localhost:9200/kimchy,elasticsearch/tweet/_mapping' -d ' +{ + "tweet" : { + "properties" : { + "message" : {"type" : "string", "store" : "yes"} + } + } +} +' +-------------------------------------------------- diff --git a/docs/reference/indices/refresh.asciidoc b/docs/reference/indices/refresh.asciidoc new file mode 100644 index 00000000000..1b0fb5daf48 --- /dev/null +++ b/docs/reference/indices/refresh.asciidoc @@ -0,0 +1,26 @@ +[[indices-refresh]] +== Refresh + +The refresh API allows to explicitly refresh one or more index, making +all operations performed since the last refresh available for search. +The (near) real-time capabilities depend on the index engine used. For +example, the robin one requires refresh to be called, but by default a +refresh is scheduled periodically. + +[source,js] +-------------------------------------------------- +$ curl -XPOST 'http://localhost:9200/twitter/_refresh' +-------------------------------------------------- + +[float] +=== Multi Index + +The refresh API can be applied to more than one index with a single +call, or even on `_all` the indices. + +[source,js] +-------------------------------------------------- +$ curl -XPOST 'http://localhost:9200/kimchy,elasticsearch/_refresh' + +$ curl -XPOST 'http://localhost:9200/_refresh' +-------------------------------------------------- diff --git a/docs/reference/indices/segments.asciidoc b/docs/reference/indices/segments.asciidoc new file mode 100644 index 00000000000..13192652581 --- /dev/null +++ b/docs/reference/indices/segments.asciidoc @@ -0,0 +1,17 @@ +[[indices-segments]] +== Indices Segments + +Provide low level segments information that a Lucene index (shard level) +is built with. Allows to be used to provide more information on the +state of a shard and an index, possibly optimization information, data +"wasted" on deletes, and so on. + +Endpoints include segments for a specific index, several indices, or +all: + +[source,js] +-------------------------------------------------- +curl -XGET 'http://localhost:9200/test/_segments' +curl -XGET 'http://localhost:9200/test1,test2/_segments' +curl -XGET 'http://localhost:9200/_segments' +-------------------------------------------------- diff --git a/docs/reference/indices/stats.asciidoc b/docs/reference/indices/stats.asciidoc new file mode 100644 index 00000000000..0468a8cb202 --- /dev/null +++ b/docs/reference/indices/stats.asciidoc @@ -0,0 +1,84 @@ +[[indices-stats]] +== Indices Stats + +Indices level stats provide statistics on different operations happening +on an index. The API provides statistics on the index level scope +(though most stats can also be retrieved using node level scope). + +The following returns high level aggregation and index level stats for +all indices: + +[source,js] +-------------------------------------------------- +curl localhost:9200/_stats +-------------------------------------------------- + +Specific index stats can be retrieved using: + +[source,js] +-------------------------------------------------- +curl localhost:9200/index1,index2/_stats +-------------------------------------------------- + +By default, `docs`, `store`, and `indexing`, `get`, and `search` stats +are returned, other stats can be enabled as well: + +[horizontal] +`docs`:: The number of docs / deleted docs (docs not yet merged out). + Note, affected by refreshing the index. + +`store`:: The size of the index. 
+
+`indexing`:: Indexing statistics; can be combined with a comma
+    separated list of `types` to provide document type level stats.
+
+`get`:: Get statistics, including missing stats.
+
+`search`:: Search statistics, including custom grouping using the
+    `groups` parameter (search operations can be associated with one or
+    more groups).
+
+`warmer`:: Warmer statistics.
+`merge`:: Merge statistics.
+`flush`:: Flush statistics.
+`refresh`:: Refresh statistics.
+`clear`:: Clears all the flags (first).
+
+Here are some samples:
+
+[source,js]
+--------------------------------------------------
+# Get back stats for merge and refresh on top of the defaults
+curl 'localhost:9200/_stats?merge=true&refresh=true'
+# Get back stats just for flush
+curl 'localhost:9200/_stats?clear=true&flush=true'
+# Get back stats for type1 and type2 documents for the my_index index
+curl 'localhost:9200/my_index/_stats?clear=true&indexing=true&types=type1,type2'
+--------------------------------------------------
+
+The stats returned are aggregated on the index level, with
+`primaries` and `total` aggregations. In order to get back shard level
+stats, set the `level` parameter to `shards`.
+
+Note, as shards move around the cluster, their stats will be cleared as
+they are created on other nodes. On the other hand, even though a shard
+has "left" a node, that node will still retain the stats that the shard
+contributed to it.
+
+[float]
+=== Specific stats endpoints
+
+Instead of using flags to indicate which stats to return, specific REST
+endpoints can be used, for example:
+
+[source,js]
+--------------------------------------------------
+# Merge stats across all indices
+curl localhost:9200/_stats/merge
+# Merge stats for the my_index index
+curl localhost:9200/my_index/_stats/merge
+# Indexing stats for my_index
+curl localhost:9200/my_index/_stats/indexing
+# Indexing stats for my_index for my_type1 and my_type2
+curl localhost:9200/my_index/_stats/indexing/my_type1,my_type2
+--------------------------------------------------
diff --git a/docs/reference/indices/status.asciidoc b/docs/reference/indices/status.asciidoc
new file mode 100644
index 00000000000..c454b0fc43e
--- /dev/null
+++ b/docs/reference/indices/status.asciidoc
@@ -0,0 +1,27 @@
+[[indices-status]]
+== Status
+
+The indices status API allows getting comprehensive status information
+for one or more indices.
+
+[source,js]
+--------------------------------------------------
+curl -XGET 'http://localhost:9200/twitter/_status'
+--------------------------------------------------
+
+In order to see the recovery status of shards, pass the `recovery` flag
+and set it to `true`. For snapshot status, pass the `snapshot` flag and
+set it to `true`.
+
+[float]
+=== Multi Index
+
+The status API can be applied to more than one index with a single call,
+or even on `_all` the indices.
+
+[source,js]
+--------------------------------------------------
+curl -XGET 'http://localhost:9200/kimchy,elasticsearch/_status'
+
+curl -XGET 'http://localhost:9200/_status'
+--------------------------------------------------
diff --git a/docs/reference/indices/templates.asciidoc b/docs/reference/indices/templates.asciidoc
new file mode 100644
index 00000000000..e565b61716d
--- /dev/null
+++ b/docs/reference/indices/templates.asciidoc
@@ -0,0 +1,135 @@
+[[indices-templates]]
+== Index Templates
+
+Index templates allow you to define templates that will automatically be
+applied to newly created indices.
The templates include both settings and +mappings, and a simple pattern template that controls if the template +will be applied to the index created. For example: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/_template/template_1 -d ' +{ + "template" : "te*", + "settings" : { + "number_of_shards" : 1 + }, + "mappings" : { + "type1" : { + "_source" : { "enabled" : false } + } + } +} +' +-------------------------------------------------- + +Defines a template named template_1, with a template pattern of `te*`. +The settings and mappings will be applied to any index name that matches +the `te*` template. + +[float] +=== Deleting a Template + +Index templates are identified by a name (in the above case +`template_1`) and can be delete as well: + +[source,js] +-------------------------------------------------- +curl -XDELETE localhost:9200/_template/template_1 +-------------------------------------------------- + +[float] +=== GETting a Template + +Index templates are identified by a name (in the above case +`template_1`) and can be retrieved using the following: + +[source,js] +-------------------------------------------------- +curl -XGET localhost:9200/_template/template_1 +-------------------------------------------------- + +To get list of all index templates you can use +<> API +and check for the metadata/templates section of the response. + +[float] +=== Multiple Template Matching + +Multiple index templates can potentially match an index, in this case, +both the settings and mappings are merged into the final configuration +of the index. The order of the merging can be controlled using the +`order` parameter, with lower order being applied first, and higher +orders overriding them. For example: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/_template/template_1 -d ' +{ + "template" : "*", + "order" : 0, + "settings" : { + "number_of_shards" : 1 + }, + "mappings" : { + "type1" : { + "_source" : { "enabled" : false } + } + } +} +' + +curl -XPUT localhost:9200/_template/template_2 -d ' +{ + "template" : "te*", + "order" : 1, + "settings" : { + "number_of_shards" : 1 + }, + "mappings" : { + "type1" : { + "_source" : { "enabled" : true } + } + } +} +' +-------------------------------------------------- + +The above will disable storing the `_source` on all `type1` types, but +for indices of that start with `te*`, source will still be enabled. +Note, for mappings, the merging is "deep", meaning that specific +object/property based mappings can easily be added/overridden on higher +order templates, with lower order templates providing the basis. + +[float] +=== Config + +Index templates can also be placed within the config location +(`path.conf`) under the `templates` directory (note, make sure to place +them on all master eligible nodes). For example, a file called +`template_1.json` can be placed under `config/templates` and it will be +added if it matches an index. 
Here is a sample of the mentioned file: + +[source,js] +-------------------------------------------------- +{ + "template_1" : { + "template" : "*", + "settings" : { + "index.number_of_shards" : 2 + }, + "mappings" : { + "_default_" : { + "_source" : { + "enabled" : false + } + }, + "type1" : { + "_all" : { + "enabled" : false + } + } + } + } +} +-------------------------------------------------- diff --git a/docs/reference/indices/types-exists.asciidoc b/docs/reference/indices/types-exists.asciidoc new file mode 100644 index 00000000000..59ff3282417 --- /dev/null +++ b/docs/reference/indices/types-exists.asciidoc @@ -0,0 +1,13 @@ +[[indices-types-exists]] +== Types Exists + +Used to check if a type/types exists in an index/indices (available +since 0.20). + +[source,js] +-------------------------------------------------- +curl -XHEAD 'http://localhost:9200/twitter/tweet' +-------------------------------------------------- + +The HTTP status code indicates if the type exists or not. A `404` means +it does not exist, and `200` means it does. diff --git a/docs/reference/indices/update-settings.asciidoc b/docs/reference/indices/update-settings.asciidoc new file mode 100644 index 00000000000..7403f4b6b31 --- /dev/null +++ b/docs/reference/indices/update-settings.asciidoc @@ -0,0 +1,214 @@ +[[indices-update-settings]] +== Update Indices Settings + +Change specific index level settings in real time. + +The REST endpoint is `/_settings` (to update all indices) or +`{index}/_settings` to update one (or more) indices settings. The body +of the request includes the updated settings, for example: + +[source,js] +-------------------------------------------------- +{ + "index" : { + "number_of_replicas" : 4 + } } +-------------------------------------------------- + +The above will change the number of replicas to 4 from the current +number of replicas. Here is a curl example: + +[source,js] +-------------------------------------------------- +curl -XPUT 'localhost:9200/my_index/_settings' -d ' +{ + "index" : { + "number_of_replicas" : 4 + } } +' +-------------------------------------------------- + +Below is the list of settings that can be changed using the update +settings API: + +[horizontal] + +`index.number_of_replicas`:: + The number of replicas each shard has. + +`index.auto_expand_replicas`:: + Set to an actual value (like `0-all`) or `false` to disable it. + +`index.blocks.read_only`:: + Set to `true` to have the index read only, `false` to allow writes + and metadata changes. + +`index.blocks.read`:: + Set to `true` to disable read operations againstthe index. + +`index.blocks.write`:: + Set to `true` to disable write operations against the index. + +`index.blocks.metadata`:: + Set to `true` to disable metadata operations against the index. + +`index.refresh_interval`:: + The async refresh interval of a shard. + +`index.term_index_interval`:: + The Lucene index term interval. Only applies to newly created docs. + +`index.term_index_divisor`:: + The Lucene reader term index divisor. + +`index.index_concurrency`:: + Defaults to `8`. + +`index.codec`:: + Codec. Default to `default`. + +`index.fail_on_merge_failure`:: + Default to `true`. + +`index.translog.flush_threshold_ops`:: + When to flush based on operations. + +`index.translog.flush_threshold_size`:: + When to flush based on translog (bytes) size. + +`index.translog.flush_threshold_period`:: + When to flush based on a period of not flushing. + +`index.translog.disable_flush`:: + Disables flushing. 
Note, should be set for a short + interval and then enabled. + +`index.cache.filter.max_size`:: + The maximum size of filter cache (per segment in shard). + Set to `-1` to disable. + +`index.cache.filter.expire`:: + The expire after access time for filter cache. + Set to `-1` to disable. + +`index.gateway.snapshot_interval`:: + The gateway snapshot interval (only applies to shared gateways). + Defaults to 10s. + +<>:: + All the settings for the merge policy currently configured. + A different merge policy can't be set. + +`index.routing.allocation.include.*`:: + A node matching any rule will be allowed to host shards from the index. + +`index.routing.allocation.exclude.*`:: + A node matching any rule will NOT be allowed to host shards from the index. + +`index.routing.allocation.require.*`:: + Only nodes matching all rules will be allowed to host shards from the index. + +`index.routing.allocation.disable_allocation`:: + Disable allocation. Defaults to `false`. + +`index.routing.allocation.disable_new_allocation`:: + Disable new allocation. Defaults to `false`. + +`index.routing.allocation.disable_replica_allocation`:: + Disable replica allocation. Defaults to `false`. + +`index.routing.allocation.total_shards_per_node`:: + Controls the total number of shards allowed to be allocated on a single node. Defaults to unbounded (`-1`). + +`index.recovery.initial_shards`:: + When using local gateway a particular shard is recovered only if there can be allocated quorum shards in the cluster. It can be set to: + * `quorum` (default) + * `quorum-1` (or `half`) + * `full` + * full-1`. + * Number values are also supported, e.g. `1`. + +`index.gc_deletes`:: + +`index.ttl.disable_purge`:: + Disables temporarily the purge of expired docs. + +<>:: + All the settings for the store level throttling policy currently configured. + +`index.translog.fs.type`:: + Either `simple` or `buffered` (default). + +`index.compound_format`:: + See <> + +|<>:: + All the settings for slow log. + +`index.warmer.enabled`:: + See <>. Defaults to `true`. + +[float] +=== Bulk Indexing Usage + +For example, the update settings API can be used to dynamically change +the index from being more performant for bulk indexing, and then move it +to more real time indexing state. Before the bulk indexing is started, +use: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/test/_settings -d '{ + "index" : { + "refresh_interval" : "-1" + } }' +-------------------------------------------------- + +(Another optimization option is to start the index without any replicas, +and only later adding them, but that really depends on the use case). + +Then, once bulk indexing is done, the settings can be updated (back to +the defaults for example): + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/test/_settings -d '{ + "index" : { + "refresh_interval" : "1s" + } }' +-------------------------------------------------- + +And, an optimize should be called: + +[source,js] +-------------------------------------------------- +curl -XPOST 'http://localhost:9200/test/_optimize?max_num_segments=5' +-------------------------------------------------- + +[float] +=== Updating Index Analysis + +It is also possible to define new <> for the index. +But it is required to <> the index +first and <> it after the changes are made. 
+ +For example if `content` analyzer hasn't been defined on `myindex` yet +you can use the following commands to add it: + +[source,js] +-------------------------------------------------- +curl -XPOST 'localhost:9200/myindex/_close' + +curl -XPUT 'localhost:9200/myindex/_settings' -d '{ + "analysis" : { + "analyzer":{ + "content":{ + "type":"custom", + "tokenizer":"whitespace" + } + } + } +}' + +curl -XPOST 'localhost:9200/myindex/_open' +-------------------------------------------------- diff --git a/docs/reference/indices/warmers.asciidoc b/docs/reference/indices/warmers.asciidoc new file mode 100644 index 00000000000..14ae28caca0 --- /dev/null +++ b/docs/reference/indices/warmers.asciidoc @@ -0,0 +1,150 @@ +[[indices-warmers]] +== Warmers + +Index warming allows to run registered search requests to warm up the +index before it is available for search. With the near real time aspect +of search, cold data (segments) will be warmed up before they become +available for search. This feature is available from version 0.20 +onwards. + +Warmup searches typically include requests that require heavy loading of +data, such as faceting or sorting on specific fields. The warmup APIs +allows to register warmup (search) under specific names, remove them, +and get them. + +Index warmup can be disabled by setting `index.warmer.enabled` to +`false`. It is supported as a realtime setting using update settings +API. This can be handy when doing initial bulk indexing, disabling pre +registered warmers to make indexing faster and less expensive and then +enable it. + +[float] +=== Index Creation / Templates + +Warmers can be registered when an index gets created, for example: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/test -d '{ + "warmers" : { + "warmer_1" : { + "types" : [], + "source" : { + "query" : { + ... + }, + "facets" : { + ... + } + } + } + } +}' +-------------------------------------------------- + +Or, in an index template: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/_template/template_1 -d ' +{ + "template" : "te*", + "warmers" : { + "warmer_1" : { + "types" : [], + "source" : { + "query" : { + ... + }, + "facets" : { + ... + } + } + } + } +}' +-------------------------------------------------- + +[float] +=== Put Warmer + +Allows to put a warmup search request on a specific index (or indices), +with the body composing of a regular search request. Types can be +provided as part of the URI if the search request is designed to be run +only against the specific types. 
+ +Here is an example that registers a warmup called `warmer_1` against +index `test` (can be alias or several indices), for a search request +that runs against all types: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/test/_warmer/warmer_1 -d '{ + "query" : { + "match_all" : {} + }, + "facets" : { + "facet_1" : { + "terms" : { + "field" : "field" + } + } + } +}' +-------------------------------------------------- + +And an example that registers a warmup against specific types: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/test/type1/_warmer/warmer_1 -d '{ + "query" : { + "match_all" : {} + }, + "facets" : { + "facet_1" : { + "terms" : { + "field" : "field" + } + } + } +}' +-------------------------------------------------- + +[float] +=== Delete Warmer + +Removing a warmer can be done against an index (or alias / indices) +based on its name. The provided name can be a simple wildcard expression +or omitted to remove all warmers. Some samples: + +[source,js] +-------------------------------------------------- +# delete warmer named warmer_1 on test index +curl -XDELETE localhost:9200/test/_warmer/warmer_1 + +# delete all warmers that start with warm on test index +curl -XDELETE localhost:9200/test/_warmer/warm* + +# delete all warmers for test index +curl -XDELETE localhost:9200/test/_warmer/ +-------------------------------------------------- + +[float] +=== GETting Warmer + +Getting a warmer for specific index (or alias, or several indices) based +on its name. The provided name can be a simple wildcard expression or +omitted to get all warmers. Some examples: + +[source,js] +-------------------------------------------------- +# get warmer named warmer_1 on test index +curl -XGET localhost:9200/test/_warmer/warmer_1 + +# get all warmers that start with warm on test index +curl -XGET localhost:9200/test/_warmer/warm* + +# get all warmers for test index +curl -XGET localhost:9200/test/_warmer/ +-------------------------------------------------- diff --git a/docs/reference/mapping.asciidoc b/docs/reference/mapping.asciidoc new file mode 100644 index 00000000000..70d9a931aa1 --- /dev/null +++ b/docs/reference/mapping.asciidoc @@ -0,0 +1,66 @@ +[[mapping]] += Mapping + +[partintro] +-- +Mapping is the process of defining how a document should be mapped to +the Search Engine, including its searchable characteristics such as +which fields are searchable and if/how they are tokenized. In +ElasticSearch, an index may store documents of different "mapping +types". ElasticSearch allows one to associate multiple mapping +definitions for each mapping type. + +Explicit mapping is defined on an index/type level. By default, there +isn't a need to define an explicit mapping, since one is automatically +created and registered when a new type or new field is introduced (with +no performance overhead) and have sensible defaults. Only when the +defaults need to be overridden must a mapping definition be provided. + +[float] +=== Mapping Types + +Mapping types are a way to divide the documents in an index into logical +groups. Think of it as tables in a database. Though there is separation +between types, it's not a full separation (all end up as a document +within the same Lucene index). + +Field names with the same name across types are highly recommended to +have the same type and same mapping characteristics (analysis settings +for example). 
There is an effort to allow to explicitly "choose" which +field to use by using type prefix (`my_type.my_field`), but it's not +complete, and there are places where it will never work (like faceting +on the field). + +In practice though, this restriction is almost never an issue. The field +name usually ends up being a good indication to its "typeness" (e.g. +"first_name" will always be a string). Note also, that this does not +apply to the cross index case. + +[float] +=== Mapping API + +To create a mapping, you will need the <>, or you can add multiple mappings when you <>. + +[float] +=== Global Settings + +The `index.mapping.ignore_malformed` global setting can be set on the +index level to allow to ignore malformed content globally across all +mapping types (malformed content example is trying to index a string +value as a numeric type). +-- + +include::mapping/fields.asciidoc[] + +include::mapping/types.asciidoc[] + +include::mapping/date-format.asciidoc[] + +include::mapping/dynamic-mapping.asciidoc[] + +include::mapping/conf-mappings.asciidoc[] + +include::mapping/meta.asciidoc[] + diff --git a/docs/reference/mapping/conf-mappings.asciidoc b/docs/reference/mapping/conf-mappings.asciidoc new file mode 100644 index 00000000000..e9bb3f91f93 --- /dev/null +++ b/docs/reference/mapping/conf-mappings.asciidoc @@ -0,0 +1,19 @@ +[[mapping-conf-mappings]] +== Config Mappings + +Creating new mappings can be done using the +<> +API. When a document is indexed with no mapping associated with it in +the specific index, the +<> feature will kick in and automatically create mapping +definition for it. + +Mappings can also be provided on the node level, meaning that each index +created will automatically be started with all the mappings defined +within a certain location. + +Mappings can be defined within files called `[mapping_name].json` and be +placed either under `config/mappings/_default` location, or under +`config/mappings/[index_name]` (for mappings that should be associated +only with a specific index). diff --git a/docs/reference/mapping/date-format.asciidoc b/docs/reference/mapping/date-format.asciidoc new file mode 100644 index 00000000000..2613ac66c7a --- /dev/null +++ b/docs/reference/mapping/date-format.asciidoc @@ -0,0 +1,197 @@ +[[mapping-date-format]] +== Date Format + +When defining a `date` type, or when defining `date_formats` in the +`object` mapping, the value of it is the actual date format that will be +used to parse the string representation of the date. There are built in +formats supported, as well as complete custom one. + +The parsing of dates uses http://joda-time.sourceforge.net/[Joda]. The +default date parsing used if no format is specified is +http://joda-time.sourceforge.net/api-release/org/joda/time/format/ISODateTimeFormat.html#dateOptionalTimeParser[ISODateTimeFormat.dateOptionalTimeParser](). + +An extension to the format allow to define several formats using `||` +separator. This allows to define less strict formats that can be used, +for example, the `yyyy/MM/dd HH:mm:ss||yyyy/MM/dd` format will parse +both `yyyy/MM/dd HH:mm:ss` and `yyyy/MM/dd`. The first format will also +act as the one that converts back from milliseconds to a string +representation. + +[float] +=== Date Math + +The `date` type supports using date math expression when using it in a +query/filter (mainly make sense in `range` query/filter). + +The expression starts with an "anchor" date, which can be either `now` +or a date string (in the applicable format) ending with `||`. 
It can +then follow by a math expression, supporting `+`, `-` and `/` +(rounding). The units supported are `M` (month), `w` (week), `h` (hour), +`m` (minute), and `s` (second). + +Here are some samples: `now+1h`, `now+1h+1m`, `now+1h/d`, +`2012-01-01||+1M/d`. + +Note, when doing `range` type searches, and the upper value is +inclusive, the rounding will properly be rounded to the ceiling instead +of flooring it. + +[float] +=== Built In Formats + +The following tables lists all the defaults ISO formats supported: + +[cols="<,<",options="header",] +|======================================================================= +|Name |Description +|`basic_date`|A basic formatter for a full date as four digit year, two +digit month of year, and two digit day of month (yyyyMMdd). + +|`basic_date_time`|A basic formatter that combines a basic date and time, +separated by a 'T' (yyyyMMdd'T'HHmmss.SSSZ). + +|`basic_date_time_no_millis`|A basic formatter that combines a basic date +and time without millis, separated by a 'T' (yyyyMMdd'T'HHmmssZ). + +|`basic_ordinal_date`|A formatter for a full ordinal date, using a four +digit year and three digit dayOfYear (yyyyDDD). + +|`basic_ordinal_date_time`|A formatter for a full ordinal date and time, +using a four digit year and three digit dayOfYear +(yyyyDDD'T'HHmmss.SSSZ). + +|`basic_ordinal_date_time_no_millis`|A formatter for a full ordinal date +and time without millis, using a four digit year and three digit +dayOfYear (yyyyDDD'T'HHmmssZ). + +|`basic_time`|A basic formatter for a two digit hour of day, two digit +minute of hour, two digit second of minute, three digit millis, and time +zone offset (HHmmss.SSSZ). + +|`basic_time_no_millis`|A basic formatter for a two digit hour of day, +two digit minute of hour, two digit second of minute, and time zone +offset (HHmmssZ). + +|`basic_t_time`|A basic formatter for a two digit hour of day, two digit +minute of hour, two digit second of minute, three digit millis, and time +zone off set prefixed by 'T' ('T'HHmmss.SSSZ). + +|`basic_t_time_no_millis`|A basic formatter for a two digit hour of day, +two digit minute of hour, two digit second of minute, and time zone +offset prefixed by 'T' ('T'HHmmssZ). + +|`basic_week_date`|A basic formatter for a full date as four digit +weekyear, two digit week of weekyear, and one digit day of week +(xxxx'W'wwe). + +|`basic_week_date_time`|A basic formatter that combines a basic weekyear +date and time, separated by a 'T' (xxxx'W'wwe'T'HHmmss.SSSZ). + +|`basic_week_date_time_no_millis`|A basic formatter that combines a basic +weekyear date and time without millis, separated by a 'T' +(xxxx'W'wwe'T'HHmmssZ). + +|`date`|A formatter for a full date as four digit year, two digit month +of year, and two digit day of month (yyyy-MM-dd). + +|`date_hour`|A formatter that combines a full date and two digit hour of +day. + +|`date_hour_minute`|A formatter that combines a full date, two digit hour +of day, and two digit minute of hour. + +|`date_hour_minute_second`|A formatter that combines a full date, two +digit hour of day, two digit minute of hour, and two digit second of +minute. + +|`date_hour_minute_second_fraction`|A formatter that combines a full +date, two digit hour of day, two digit minute of hour, two digit second +of minute, and three digit fraction of second +(yyyy-MM-dd'T'HH:mm:ss.SSS). 
+ +|`date_hour_minute_second_millis`|A formatter that combines a full date, +two digit hour of day, two digit minute of hour, two digit second of +minute, and three digit fraction of second (yyyy-MM-dd'T'HH:mm:ss.SSS). + +|`date_optional_time`|a generic ISO datetime parser where the date is +mandatory and the time is optional. + +|`date_time`|A formatter that combines a full date and time, separated by +a 'T' (yyyy-MM-dd'T'HH:mm:ss.SSSZZ). + +|`date_time_no_millis`|A formatter that combines a full date and time +without millis, separated by a 'T' (yyyy-MM-dd'T'HH:mm:ssZZ). + +|`hour`|A formatter for a two digit hour of day. + +|`hour_minute`|A formatter for a two digit hour of day and two digit +minute of hour. + +|`hour_minute_second`|A formatter for a two digit hour of day, two digit +minute of hour, and two digit second of minute. + +|`hour_minute_second_fraction`|A formatter for a two digit hour of day, +two digit minute of hour, two digit second of minute, and three digit +fraction of second (HH:mm:ss.SSS). + +|`hour_minute_second_millis`|A formatter for a two digit hour of day, two +digit minute of hour, two digit second of minute, and three digit +fraction of second (HH:mm:ss.SSS). + +|`ordinal_date`|A formatter for a full ordinal date, using a four digit +year and three digit dayOfYear (yyyy-DDD). + +|`ordinal_date_time`|A formatter for a full ordinal date and time, using +a four digit year and three digit dayOfYear (yyyy-DDD'T'HH:mm:ss.SSSZZ). + +|`ordinal_date_time_no_millis`|A formatter for a full ordinal date and +time without millis, using a four digit year and three digit dayOfYear +(yyyy-DDD'T'HH:mm:ssZZ). + +|`time`|A formatter for a two digit hour of day, two digit minute of +hour, two digit second of minute, three digit fraction of second, and +time zone offset (HH:mm:ss.SSSZZ). + +|`time_no_millis`|A formatter for a two digit hour of day, two digit +minute of hour, two digit second of minute, and time zone offset +(HH:mm:ssZZ). + +|`t_time`|A formatter for a two digit hour of day, two digit minute of +hour, two digit second of minute, three digit fraction of second, and +time zone offset prefixed by 'T' ('T'HH:mm:ss.SSSZZ). + +|`t_time_no_millis`|A formatter for a two digit hour of day, two digit +minute of hour, two digit second of minute, and time zone offset +prefixed by 'T' ('T'HH:mm:ssZZ). + +|`week_date`|A formatter for a full date as four digit weekyear, two +digit week of weekyear, and one digit day of week (xxxx-'W'ww-e). + +|`week_date_time`|A formatter that combines a full weekyear date and +time, separated by a 'T' (xxxx-'W'ww-e'T'HH:mm:ss.SSSZZ). + +|`weekDateTimeNoMillis`|A formatter that combines a full weekyear date +and time without millis, separated by a 'T' (xxxx-'W'ww-e'T'HH:mm:ssZZ). + +|`week_year`|A formatter for a four digit weekyear. + +|`weekyearWeek`|A formatter for a four digit weekyear and two digit week +of weekyear. + +|`weekyearWeekDay`|A formatter for a four digit weekyear, two digit week +of weekyear, and one digit day of week. + +|`year`|A formatter for a four digit year. + +|`year_month`|A formatter for a four digit year and two digit month of +year. + +|`year_month_day`|A formatter for a four digit year, two digit month of +year, and two digit day of month. +|======================================================================= + +[float] +=== Custom Format + +Allows for a completely customizable date format explained +http://joda-time.sourceforge.net/api-release/org/joda/time/format/DateTimeFormat.html[here]. 
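+
+As a minimal sketch (the `created_at` field name is only illustrative), a
+custom pattern can also be combined with the built-in formats via the
+`||` separator described above:
+
+[source,js]
+--------------------------------------------------
+{
+    "tweet" : {
+        "properties" : {
+            "created_at" : {
+                "type" : "date",
+                "format" : "yyyy/MM/dd HH:mm:ss||yyyy/MM/dd||date_optional_time"
+            }
+        }
+    }
+}
+--------------------------------------------------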
diff --git a/docs/reference/mapping/dynamic-mapping.asciidoc b/docs/reference/mapping/dynamic-mapping.asciidoc new file mode 100644 index 00000000000..8c317506105 --- /dev/null +++ b/docs/reference/mapping/dynamic-mapping.asciidoc @@ -0,0 +1,45 @@ +[[mapping-dynamic-mapping]] +== Dynamic Mapping + +Default mappings allow to automatically apply generic mapping definition +to types that do not have mapping pre defined. This is mainly done +thanks to the fact that the +<> and +namely the <> allow for schema-less dynamic addition of unmapped +fields. + +The default mapping definition is plain mapping definition that is +embedded within the distribution: + +[source,js] +-------------------------------------------------- +{ + "_default_" : { + } +} +-------------------------------------------------- + +Pretty short, no? Basically, everything is defaulted, especially the +dynamic nature of the root object mapping. The default mapping +definition can be overridden in several manners. The simplest manner is +to simply define a file called `default-mapping.json` and placed it +under the `config` directory (which can be configured to exist in a +different location). It can also be explicitly set using the +`index.mapper.default_mapping_location` setting. + +The dynamic creation of mappings for unmapped types can be completely +disabled by setting `index.mapper.dynamic` to `false`. + +As an example, here is how we can change the default +<> used in the +root and inner object types: + +[source,js] +-------------------------------------------------- +{ + "_default_" : { + "date_formats" : ["yyyy-MM-dd", "dd-MM-yyyy", "date_optional_time"], + } +} +-------------------------------------------------- diff --git a/docs/reference/mapping/fields.asciidoc b/docs/reference/mapping/fields.asciidoc new file mode 100644 index 00000000000..a1f7e9824ae --- /dev/null +++ b/docs/reference/mapping/fields.asciidoc @@ -0,0 +1,33 @@ +[[mapping-fields]] +== Fields + +Each mapping has a number of fields associated with it +which can be used to control how the document metadata +(eg <>) is indexed. + +include::fields/uid-field.asciidoc[] + +include::fields/id-field.asciidoc[] + +include::fields/type-field.asciidoc[] + +include::fields/source-field.asciidoc[] + +include::fields/all-field.asciidoc[] + +include::fields/analyzer-field.asciidoc[] + +include::fields/boost-field.asciidoc[] + +include::fields/parent-field.asciidoc[] + +include::fields/routing-field.asciidoc[] + +include::fields/index-field.asciidoc[] + +include::fields/size-field.asciidoc[] + +include::fields/timestamp-field.asciidoc[] + +include::fields/ttl-field.asciidoc[] + diff --git a/docs/reference/mapping/fields/all-field.asciidoc b/docs/reference/mapping/fields/all-field.asciidoc new file mode 100644 index 00000000000..f0eaa96d45f --- /dev/null +++ b/docs/reference/mapping/fields/all-field.asciidoc @@ -0,0 +1,77 @@ +[[mapping-all-field]] +=== `_all` + +The idea of the `_all` field is that it includes the text of one or more +other fields within the document indexed. It can come very handy +especially for search requests, where we want to execute a search query +against the content of a document, without knowing which fields to +search on. This comes at the expense of CPU cycles and index size. + +The `_all` fields can be completely disabled. Explicit field mapping and +object mapping can be excluded / included in the `_all` field. By +default, it is enabled and all fields are included in it for ease of +use. 
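+
+As a minimal sketch (the `tweet` type name is only illustrative),
+disabling it can look like this:
+
+[source,js]
+--------------------------------------------------
+{
+    "tweet" : {
+        "_all" : {"enabled" : false}
+    }
+}
+--------------------------------------------------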
+
+When disabling the `_all` field, it is a good practice to set
+`index.query.default_field` to a different value (for example, if you
+have a main "message" field in your data, set it to `message`).
+
+One of the nice features of the `_all` field is that it takes the boost
+levels of the individual fields into account. That means that if a title
+field is boosted more than the content field, the title (part) of the
+`_all` field will count for more than the content (part) of the `_all`
+field.
+
+Here is a sample mapping:
+
+[source,js]
+--------------------------------------------------
+{
+    "person" : {
+        "_all" : {"enabled" : true},
+        "properties" : {
+            "name" : {
+                "type" : "object",
+                "dynamic" : false,
+                "properties" : {
+                    "first" : {"type" : "string", "store" : "yes", "include_in_all" : false},
+                    "last" : {"type" : "string", "index" : "not_analyzed"}
+                }
+            },
+            "address" : {
+                "type" : "object",
+                "include_in_all" : false,
+                "properties" : {
+                    "first" : {
+                        "properties" : {
+                            "location" : {"type" : "string", "store" : "yes", "index_name" : "firstLocation"}
+                        }
+                    },
+                    "last" : {
+                        "properties" : {
+                            "location" : {"type" : "string"}
+                        }
+                    }
+                }
+            },
+            "simple1" : {"type" : "long", "include_in_all" : true},
+            "simple2" : {"type" : "long", "include_in_all" : false}
+        }
+    }
+}
+--------------------------------------------------
+
+The `_all` field allows `store`, `term_vector` and `analyzer` (with
+specific `index_analyzer` and `search_analyzer`) to be set.
+
+[float]
+==== Highlighting
+
+For any field to allow
+<> it has
+to be either stored or part of the `_source` field. By default the `_all`
+field does not qualify for either, so highlighting it does not yield
+any data.
+
+Although it is possible to `store` the `_all` field, it is basically an
+aggregation of all fields, which means more data will be stored, and
+highlighting it might produce strange results.
diff --git a/docs/reference/mapping/fields/analyzer-field.asciidoc b/docs/reference/mapping/fields/analyzer-field.asciidoc
new file mode 100644
index 00000000000..30bb0723f36
--- /dev/null
+++ b/docs/reference/mapping/fields/analyzer-field.asciidoc
@@ -0,0 +1,41 @@
+[[mapping-analyzer-field]]
+=== `_analyzer`
+
+The `_analyzer` mapping allows a document field to supply the name of the
+analyzer that will be used to index the document. The analyzer is applied
+to any field that does not explicitly define an `analyzer` or
+`index_analyzer` at indexing time.
+
+Here is a simple mapping:
+
+[source,js]
+--------------------------------------------------
+{
+    "type1" : {
+        "_analyzer" : {
+            "path" : "my_field"
+        }
+    }
+}
+--------------------------------------------------
+
+The above will use the value of the `my_field` field to look up an
+analyzer registered under that name. For example, indexing the following
+doc:
+
+[source,js]
+--------------------------------------------------
+{
+    "my_field" : "whitespace"
+}
+--------------------------------------------------
+
+will cause the `whitespace` analyzer to be used as the index analyzer
+for all fields without an explicit analyzer setting.
+
+The default path value is `_analyzer`, so the analyzer can be driven for
+a specific document by setting the `_analyzer` field in it. If a custom
+JSON field name is needed, an explicit mapping with a different path
+should be set.
+
+By default, the `_analyzer` field is indexed; it can be disabled by
+setting `index` to `no` in the mapping.
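+
+For example, the following sketch (with an illustrative custom path)
+combines a custom `path` with `index` set to `no`:
+
+[source,js]
+--------------------------------------------------
+{
+    "type1" : {
+        "_analyzer" : {
+            "path" : "my_analyzer_field",
+            "index" : "no"
+        }
+    }
+}
+--------------------------------------------------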
diff --git a/docs/reference/mapping/fields/boost-field.asciidoc b/docs/reference/mapping/fields/boost-field.asciidoc new file mode 100644 index 00000000000..0f07e0914cd --- /dev/null +++ b/docs/reference/mapping/fields/boost-field.asciidoc @@ -0,0 +1,32 @@ +[[mapping-boost-field]] +=== `_boost` + +Boosting is the process of enhancing the relevancy of a document or +field. Field level mapping allows to define explicit boost level on a +specific field. The boost field mapping (applied on the +<>) allows +to define a boost field mapping where *its content will control the +boost level of the document*. For example, consider the following +mapping: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "_boost" : {"name" : "my_boost", "null_value" : 1.0} + } +} +-------------------------------------------------- + +The above mapping defines mapping for a field named `my_boost`. If the +`my_boost` field exists within the JSON document indexed, its value will +control the boost level of the document indexed. For example, the +following JSON document will be indexed with a boost value of `2.2`: + +[source,js] +-------------------------------------------------- +{ + "my_boost" : 2.2, + "message" : "This is a tweet!" +} +-------------------------------------------------- diff --git a/docs/reference/mapping/fields/id-field.asciidoc b/docs/reference/mapping/fields/id-field.asciidoc new file mode 100644 index 00000000000..2defafcc79f --- /dev/null +++ b/docs/reference/mapping/fields/id-field.asciidoc @@ -0,0 +1,57 @@ +[[mapping-id-field]] +=== `_id` + +Each document indexed is associated with an id and a type. The `_id` +field can be used to index just the id, and possible also store it. By +default it is not indexed and not stored (thus, not created). + +Note, even though the `_id` is not indexed, all the APIs still work +(since they work with the `_uid` field), as well as fetching by ids +using `term`, `terms` or `prefix` queries/filters (including the +specific `ids` query/filter). + +The `_id` field can be enabled to be indexed, and possibly stored, +using: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "_id" : {"index": "not_analyzed", "store" : "yes"} + } +} +-------------------------------------------------- + +In order to maintain backward compatibility, a node level setting +`index.mapping._id.indexed` can be set to `true` to make sure that the +id is indexed when upgrading to `0.16`, though it's recommended to not +index the id. + +The `_id` mapping can also be associated with a `path` that will be used +to extract the id from a different location in the source document. For +example, having the following mapping: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "_id" : { + "path" : "post_id" + } + } +} +-------------------------------------------------- + +Will cause `1` to be used as the id for: + +[source,js] +-------------------------------------------------- +{ + "message" : "You know, for Search", + "post_id" : "1" +} +-------------------------------------------------- + +This does require an additional lightweight parsing step while indexing, +in order to extract the id to decide which shard the index operation +will be executed on. 
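+
+Both aspects can be combined; as a sketch (reusing the illustrative
+`post_id` field from above):
+
+[source,js]
+--------------------------------------------------
+{
+    "tweet" : {
+        "_id" : {
+            "path" : "post_id",
+            "index" : "not_analyzed",
+            "store" : "yes"
+        }
+    }
+}
+--------------------------------------------------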
diff --git a/docs/reference/mapping/fields/index-field.asciidoc b/docs/reference/mapping/fields/index-field.asciidoc new file mode 100644 index 00000000000..96a320b9fa5 --- /dev/null +++ b/docs/reference/mapping/fields/index-field.asciidoc @@ -0,0 +1,15 @@ +[[mapping-index-field]] +=== `_index` + +The ability to store in a document the index it belongs to. By default +it is disabled, in order to enable it, the following mapping should be +defined: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "_index" : { "enabled" : true } + } +} +-------------------------------------------------- diff --git a/docs/reference/mapping/fields/parent-field.asciidoc b/docs/reference/mapping/fields/parent-field.asciidoc new file mode 100644 index 00000000000..3225b53c399 --- /dev/null +++ b/docs/reference/mapping/fields/parent-field.asciidoc @@ -0,0 +1,21 @@ +[[mapping-parent-field]] +=== `_parent` + +The parent field mapping is defined on a child mapping, and points to +the parent type this child relates to. For example, in case of a `blog` +type and a `blog_tag` type child document, the mapping for `blog_tag` +should be: + +[source,js] +-------------------------------------------------- +{ + "blog_tag" : { + "_parent" : { + "type" : "blog" + } + } +} +-------------------------------------------------- + +The mapping is automatically stored and indexed (meaning it can be +searched on using the `_parent` field notation). diff --git a/docs/reference/mapping/fields/routing-field.asciidoc b/docs/reference/mapping/fields/routing-field.asciidoc new file mode 100644 index 00000000000..bf94bdc6a92 --- /dev/null +++ b/docs/reference/mapping/fields/routing-field.asciidoc @@ -0,0 +1,69 @@ +[[mapping-routing-field]] +=== `_routing` + +The routing field allows to control the `_routing` aspect when indexing +data and explicit routing control is required. + +[float] +==== store / index + +The first thing the `_routing` mapping does is to store the routing +value provided (`store` set to `yes`) and index it (`index` set to +`not_analyzed`). The reason why the routing is stored by default is so +reindexing data will be possible if the routing value is completely +external and not part of the docs. + +[float] +==== required + +Another aspect of the `_routing` mapping is the ability to define it as +required by setting `required` to `true`. This is very important to set +when using routing features, as it allows different APIs to make use of +it. For example, an index operation will be rejected if no routing value +has been provided (or derived from the doc). A delete operation will be +broadcasted to all shards if no routing value is provided and `_routing` +is required. + +[float] +==== path + +The routing value can be provided as an external value when indexing +(and still stored as part of the document, in much the same way +`_source` is stored). But, it can also be automatically extracted from +the index doc based on a `path`. 
For example, having the following +mapping: + +[source,js] +-------------------------------------------------- +{ + "comment" : { + "_routing" : { + "required" : true, + "path" : "blog.post_id" + } + } +} +-------------------------------------------------- + +Will cause the following doc to be routed based on the `111222` value: + +[source,js] +-------------------------------------------------- +{ + "text" : "the comment text" + "blog" : { + "post_id" : "111222" + } +} +-------------------------------------------------- + +Note, using `path` without explicit routing value provided required an +additional (though quite fast) parsing phase. + +[float] +==== id uniqueness + +When indexing documents specifying a custom `_routing`, the uniqueness +of the `_id` is not guaranteed throughout all the shards that the index +is composed of. In fact, documents with the same `_id` might end up in +different shards if indexed with different `_routing` values. diff --git a/docs/reference/mapping/fields/size-field.asciidoc b/docs/reference/mapping/fields/size-field.asciidoc new file mode 100644 index 00000000000..475da19d7ea --- /dev/null +++ b/docs/reference/mapping/fields/size-field.asciidoc @@ -0,0 +1,26 @@ +[[mapping-size-field]] +=== `_size` + +The `_size` field allows to automatically index the size of the original +`_source` indexed. By default, it's disabled. In order to enable it, set +the mapping to: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "_size" : {"enabled" : true} + } +} +-------------------------------------------------- + +In order to also store it, use: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "_size" : {"enabled" : true, "store" : "yes"} + } +} +-------------------------------------------------- diff --git a/docs/reference/mapping/fields/source-field.asciidoc b/docs/reference/mapping/fields/source-field.asciidoc new file mode 100644 index 00000000000..9e0c5d6d3ee --- /dev/null +++ b/docs/reference/mapping/fields/source-field.asciidoc @@ -0,0 +1,64 @@ +[[mapping-source-field]] +=== `_source` + +The `_source` field is an automatically generated field that stores the +actual JSON that was used as the indexed document. It is not indexed +(searchable), just stored. When executing "fetch" requests, like +<> or +<>, the `_source` field is +returned by default. + +Though very handy to have around, the source field does incur storage +overhead within the index. For this reason, it can be disabled. For +example: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "_source" : {"enabled" : false} + } +} +-------------------------------------------------- + +[float] +==== Compression + +*From version 0.90 onwards, all stored fields (including `_source`) are +always compressed.* + +For versions before 0.90: + +The source field can be compressed (LZF) when stored in the index. This +can greatly reduce the index size, as well as possibly improving +performance (when decompression overhead is better than loading a bigger +source from disk). The code takes special care to decompress the source +only when needed, for example decompressing it directly into the REST +stream of a result. + +In order to enable compression, the `compress` option should be set to +`true`. By default it is set to `false`. Note, this can be changed on an +existing index, as a mix of compressed and uncompressed sources is +supported. 
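+
+For example, a minimal sketch of such a mapping (for pre-0.90 versions,
+per the note above):
+
+[source,js]
+--------------------------------------------------
+{
+    "tweet" : {
+        "_source" : {"compress" : true}
+    }
+}
+--------------------------------------------------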
+ +Moreover, a `compress_threshold` can be set to control when the source +will be compressed. It accepts a byte size value (for example `100b`, +`10kb`). Note, `compress` should be set to `true`. + +[float] +==== Includes / Excludes + +Allow to specify paths in the source that would be included / excluded +when it's stored, supporting `*` as wildcard annotation. For example: + +[source,js] +-------------------------------------------------- +{ + "my_type" : { + "_source" : { + "includes" : ["path1.*", "path2.*"], + "excludes" : ["pat3.*"] + } + } +} +-------------------------------------------------- diff --git a/docs/reference/mapping/fields/timestamp-field.asciidoc b/docs/reference/mapping/fields/timestamp-field.asciidoc new file mode 100644 index 00000000000..a68fac31251 --- /dev/null +++ b/docs/reference/mapping/fields/timestamp-field.asciidoc @@ -0,0 +1,82 @@ +[[mapping-timestamp-field]] +=== `_timestamp` + +The `_timestamp` field allows to automatically index the timestamp of a +document. It can be provided externally via the index request or in the +`_source`. If it is not provided externally it will be automatically set +to the date the document was processed by the indexing chain. + +[float] +==== enabled + +By default it is disabled, in order to enable it, the following mapping +should be defined: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "_timestamp" : { "enabled" : true } + } +} +-------------------------------------------------- + +[float] +==== store / index + +By default the `_timestamp` field has `store` set to `no` and `index` +set to `not_analyzed`. It can be queried as a standard date field. + +[float] +==== path + +The `_timestamp` value can be provided as an external value when +indexing. But, it can also be automatically extracted from the document +to index based on a `path`. For example, having the following mapping: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "_timestamp" : { + "enabled" : true, + "path" : "post_date" + } + } +} +-------------------------------------------------- + +Will cause `2009-11-15T14:12:12` to be used as the timestamp value for: + +[source,js] +-------------------------------------------------- +{ + "message" : "You know, for Search", + "post_date" : "2009-11-15T14:12:12" +} +-------------------------------------------------- + +Note, using `path` without explicit timestamp value provided require an +additional (though quite fast) parsing phase. + +[float] +==== format + +You can define the <> used to parse the provided timestamp value. For example: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "_timestamp" : { + "enabled" : true, + "path" : "post_date", + "format" : "YYYY-MM-dd" + } + } +} +-------------------------------------------------- + +Note, the default format is `dateOptionalTime`. The timestamp value will +first be parsed as a number and if it fails the format will be tried. diff --git a/docs/reference/mapping/fields/ttl-field.asciidoc b/docs/reference/mapping/fields/ttl-field.asciidoc new file mode 100644 index 00000000000..a2cf8c37e90 --- /dev/null +++ b/docs/reference/mapping/fields/ttl-field.asciidoc @@ -0,0 +1,70 @@ +[[mapping-ttl-field]] +=== `_ttl` + +A lot of documents naturally come with an expiration date. Documents can +therefore have a `_ttl` (time to live), which will cause the expired +documents to be deleted automatically. 
+ +[float] +==== enabled + +By default it is disabled, in order to enable it, the following mapping +should be defined: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "_ttl" : { "enabled" : true } + } +} +-------------------------------------------------- + +[float] +==== store / index + +By default the `_ttl` field has `store` set to `yes` and `index` set to +`not_analyzed`. Note that `index` property has to be set to +`not_analyzed` in order for the purge process to work. + +[float] +==== default + +You can provide a per index/type default `_ttl` value as follows: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "_ttl" : { "enabled" : true, "default" : "1d" } + } +} +-------------------------------------------------- + +In this case, if you don't provide a `_ttl` value in your query or in +the `_source` all tweets will have a `_ttl` of one day. + +In case you do not specify a time unit like `d` (days), `m` (minutes), +`h` (hours), `ms` (milliseconds) or `w` (weeks), milliseconds is used as +default unit. + +If no `default` is set and no `_ttl` value is given then the document +has an infinite `_ttl` and will not expire. + +You can dynamically update the `default` value using the put mapping +API. It won't change the `_ttl` of already indexed documents but will be +used for future documents. + +[float] +==== Note on documents expiration + +Expired documents will be automatically deleted regularly. You can +dynamically set the `indices.ttl.interval` to fit your needs. The +default value is `60s`. + +The deletion orders are processed by bulk. You can set +`indices.ttl.bulk_size` to fit your needs. The default value is `10000`. + +Note that the expiration procedure handle versioning properly so if a +document is updated between the collection of documents to expire and +the delete order, the document won't be deleted. diff --git a/docs/reference/mapping/fields/type-field.asciidoc b/docs/reference/mapping/fields/type-field.asciidoc new file mode 100644 index 00000000000..81141ccc885 --- /dev/null +++ b/docs/reference/mapping/fields/type-field.asciidoc @@ -0,0 +1,31 @@ +[[mapping-type-field]] +=== Type Field + +Each document indexed is associated with an id and a type. The type, +when indexing, is automatically indexed into a `_type` field. By +default, the `_type` field is indexed (but *not* analyzed) and not +stored. This means that the `_type` field can be queried. + +The `_type` field can be stored as well, for example: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "_type" : {"store" : "yes"} + } +} +-------------------------------------------------- + +The `_type` field can also not be indexed, and all the APIs will still +work except for specific queries (term queries / filters) or faceting +done on the `_type` field. 
+ +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "_type" : {"index" : "no"} + } +} +-------------------------------------------------- diff --git a/docs/reference/mapping/fields/uid-field.asciidoc b/docs/reference/mapping/fields/uid-field.asciidoc new file mode 100644 index 00000000000..f9ce245adc8 --- /dev/null +++ b/docs/reference/mapping/fields/uid-field.asciidoc @@ -0,0 +1,11 @@ +[[mapping-uid-field]] +=== `_uid` + +Each document indexed is associated with an id and a type, the internal +`_uid` field is the unique identifier of a document within an index and +is composed of the type and the id (meaning that different types can +have the same id and still maintain uniqueness). + +The `_uid` field is automatically used when `_type` is not indexed to +perform type based filtering, and does not require the `_id` to be +indexed. diff --git a/docs/reference/mapping/meta.asciidoc b/docs/reference/mapping/meta.asciidoc new file mode 100644 index 00000000000..5cb0c14eaad --- /dev/null +++ b/docs/reference/mapping/meta.asciidoc @@ -0,0 +1,25 @@ +[[mapping-meta]] +== Meta + +Each mapping can have custom meta data associated with it. These are +simple storage elements that are simply persisted along with the mapping +and can be retrieved when fetching the mapping definition. The meta is +defined under the `_meta` element, for example: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "_meta" : { + "attr1" : "value1", + "attr2" : { + "attr3" : "value3" + } + } + } +} +-------------------------------------------------- + +Meta can be handy for example for client libraries that perform +serialization and deserialization to store its meta model (for example, +the class the document maps to). diff --git a/docs/reference/mapping/misc.asciidoc b/docs/reference/mapping/misc.asciidoc new file mode 100644 index 00000000000..5b4a443aaae --- /dev/null +++ b/docs/reference/mapping/misc.asciidoc @@ -0,0 +1,8 @@ +[[mapping-misc]] +== Miscellaneous + +include::mapping/date-format.asciidoc[] + +include::mapping/conf-mappings.asciidoc[] + +include::mapping/meta.asciidoc[] diff --git a/docs/reference/mapping/types.asciidoc b/docs/reference/mapping/types.asciidoc new file mode 100644 index 00000000000..33a663f1215 --- /dev/null +++ b/docs/reference/mapping/types.asciidoc @@ -0,0 +1,26 @@ +[[mapping-types]] +== Types + +The datatype for each field in a document (eg strings, numbers, +objects etc) can be controlled via the type mapping. + +include::types/core-types.asciidoc[] + +include::types/array-type.asciidoc[] + +include::types/object-type.asciidoc[] + +include::types/root-object-type.asciidoc[] + +include::types/nested-type.asciidoc[] + +include::types/multi-field-type.asciidoc[] + +include::types/ip-type.asciidoc[] + +include::types/geo-point-type.asciidoc[] + +include::types/geo-shape-type.asciidoc[] + +include::types/attachment-type.asciidoc[] + diff --git a/docs/reference/mapping/types/array-type.asciidoc b/docs/reference/mapping/types/array-type.asciidoc new file mode 100644 index 00000000000..3f887b1c6f7 --- /dev/null +++ b/docs/reference/mapping/types/array-type.asciidoc @@ -0,0 +1,74 @@ +[[mapping-array-type]] +=== Array Type + +JSON documents allow to define an array (list) of fields or objects. +Mapping array types could not be simpler since arrays gets automatically +detected and mapping them can be done either with +<> or +<> mappings. 
+For example, the following JSON defines several arrays: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "message" : "some arrays in this tweet...", + "tags" : ["elasticsearch", "wow"], + "lists" : [ + { + "name" : "prog_list", + "description" : "programming list" + }, + { + "name" : "cool_list", + "description" : "cool stuff list" + } + ] + } +} +-------------------------------------------------- + +The above JSON has the `tags` property defining a list of a simple +`string` type, and the `lists` property is an `object` type array. Here +is a sample explicit mapping: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "properties" : { + "message" : {"type" : "string"}, + "tags" : {"type" : "string", "index_name" : "tag"}, + "lists" : { + "properties" : { + "name" : {"type" : "string"}, + "description" : {"type" : "string"} + } + } + } + } +} +-------------------------------------------------- + +The fact that array types are automatically supported can be shown by +the fact that the following JSON document is perfectly fine: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "message" : "some arrays in this tweet...", + "tags" : "elasticsearch", + "lists" : { + "name" : "prog_list", + "description" : "programming list" + } + } +} +-------------------------------------------------- + +Note also, that thanks to the fact that we used the `index_name` to use +the non plural form (`tag` instead of `tags`), we can actually refer to +the field using the `index_name` as well. For example, we can execute a +query using `tweet.tags:wow` or `tweet.tag:wow`. We could, of course, +name the field as `tag` and skip the `index_name` all together). diff --git a/docs/reference/mapping/types/attachment-type.asciidoc b/docs/reference/mapping/types/attachment-type.asciidoc new file mode 100644 index 00000000000..e2bfce67c9d --- /dev/null +++ b/docs/reference/mapping/types/attachment-type.asciidoc @@ -0,0 +1,90 @@ +[[mapping-attachment-type]] +=== Attachment Type + +The `attachment` type allows to index different "attachment" type field +(encoded as `base64`), for example, Microsoft Office formats, open +document formats, ePub, HTML, and so on (full list can be found +http://lucene.apache.org/tika/0.10/formats.html[here]). + +The `attachment` type is provided as a +https://github.com/elasticsearch/elasticsearch-mapper-attachments[plugin +extension]. The plugin is a simple zip file that can be downloaded and +placed under `$ES_HOME/plugins` location. It will be automatically +detected and the `attachment` type will be added. + +Note, the `attachment` type is experimental. + +Using the attachment type is simple, in your mapping JSON, simply set a +certain JSON element as attachment, for example: + +[source,js] +-------------------------------------------------- +{ + "person" : { + "properties" : { + "my_attachment" : { "type" : "attachment" } + } + } +} +-------------------------------------------------- + +In this case, the JSON to index can be: + +[source,js] +-------------------------------------------------- +{ + "my_attachment" : "... base64 encoded attachment ..." 
+} +-------------------------------------------------- + +Or it is possible to use more elaborated JSON if content type or +resource name need to be set explicitly: + +[source,js] +-------------------------------------------------- +{ + "my_attachment" : { + "_content_type" : "application/pdf", + "_name" : "resource/name/of/my.pdf", + "content" : "... base64 encoded attachment ..." + } +} +-------------------------------------------------- + +The `attachment` type not only indexes the content of the doc, but also +automatically adds meta data on the attachment as well (when available). +The metadata supported are: `date`, `title`, `author`, and `keywords`. +They can be queried using the "dot notation", for example: +`my_attachment.author`. + +Both the meta data and the actual content are simple core type mappers +(string, date, ...), thus, they can be controlled in the mappings. For +example: + +[source,js] +-------------------------------------------------- +{ + "person" : { + "properties" : { + "file" : { + "type" : "attachment", + "fields" : { + "file" : {"index" : "no"}, + "date" : {"store" : "yes"}, + "author" : {"analyzer" : "myAnalyzer"} + } + } + } + } +} +-------------------------------------------------- + +In the above example, the actual content indexed is mapped under +`fields` name `file`, and we decide not to index it, so it will only be +available in the `_all` field. The other fields map to their respective +metadata names, but there is no need to specify the `type` (like +`string` or `date`) since it is already known. + +The plugin uses http://lucene.apache.org/tika/[Apache Tika] to parse +attachments, so many formats are supported, listed +http://lucene.apache.org/tika/0.10/formats.html[here]. diff --git a/docs/reference/mapping/types/core-types.asciidoc b/docs/reference/mapping/types/core-types.asciidoc new file mode 100644 index 00000000000..7afc709cb1e --- /dev/null +++ b/docs/reference/mapping/types/core-types.asciidoc @@ -0,0 +1,501 @@ +[[mapping-core-types]] +=== Core Types + +Each JSON field can be mapped to a specific core type. JSON itself +already provides us with some typing, with its support for `string`, +`integer`/@long@, `float`/@double@, `boolean`, and `null`. + +The following sample tweet JSON document will be used to explain the +core types: + +[source,js] +-------------------------------------------------- +{ + "tweet" { + "user" : "kimchy" + "message" : "This is a tweet!", + "postDate" : "2009-11-15T14:12:12", + "priority" : 4, + "rank" : 12.3 + } +} +-------------------------------------------------- + +Explicit mapping for the above JSON tweet can be: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "properties" : { + "user" : {"type" : "string", "index" : "not_analyzed"}, + "message" : {"type" : "string", "null_value" : "na"}, + "postDate" : {"type" : "date"}, + "priority" : {"type" : "integer"}, + "rank" : {"type" : "float"} + } + } +} +-------------------------------------------------- + +[float] +==== String + +The text based string type is the most basic type, and contains one or +more characters. An example mapping can be: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "properties" : { + "message" : { + "type" : "string", + "store" : "yes", + "index" : "analyzed", + "null_value" : "na" + } + } + } +} +-------------------------------------------------- + +The above mapping defines a `string` `message` property/field within the +`tweet` type. 
The field is stored in the index (so it can later be +retrieved using selective loading when searching), and it gets analyzed +(broken down into searchable terms). If the message has a `null` value, +then the value that will be stored is `na`. + +The following table lists all the attributes that can be used with the +`string` type: + +[cols="<,<",options="header",] +|======================================================================= +|Attribute |Description +|`index_name` |The name of the field that will be stored in the index. +Defaults to the property/field name. + +|`store` |Set to `yes` to store actual field in the index, `no` to not +store it. Defaults to `no` (note, the JSON document itself is stored, +and it can be retrieved from it). + +|`index` |Set to `analyzed` for the field to be indexed and searchable +after being broken down into token using an analyzer. `not_analyzed` +means that its still searchable, but does not go through any analysis +process or broken down into tokens. `no` means that it won't be +searchable at all (as an individual field; it may still be included in +`_all`). Setting to `no` disables `include_in_all`. Defaults to +`analyzed`. + +|`term_vector` |Possible values are `no`, `yes`, `with_offsets`, +`with_positions`, `with_positions_offsets`. Defaults to `no`. + +|`boost` |The boost value. Defaults to `1.0`. + +|`null_value` |When there is a (JSON) null value for the field, use the +`null_value` as the field value. Defaults to not adding the field at +all. + +|`omit_norms` |Boolean value if norms should be omitted or not. Defaults +to `false` for `analyzed` fields, and to `true` for `not_analyzed` +fields. + +|`omit_term_freq_and_positions` |Boolean value if term freq and +positions should be omitted. Defaults to `false`. Deprecated since 0.20, +see `index_options`. + +|`index_options` |Available since 0.20. Allows to set the indexing +options, possible values are `docs` (only doc numbers are indexed), +`freqs` (doc numbers and term frequencies), and `positions` (doc +numbers, term frequencies and positions). Defaults to `positions` for +`analyzed` fields, and to `docs` for `not_analyzed` fields. Since 0.90 +it is also possible to set it to `offsets` (doc numbers, term +frequencies, positions and offsets). + +|`analyzer` |The analyzer used to analyze the text contents when +`analyzed` during indexing and when searching using a query string. +Defaults to the globally configured analyzer. + +|`index_analyzer` |The analyzer used to analyze the text contents when +`analyzed` during indexing. + +|`search_analyzer` |The analyzer used to analyze the field when part of +a query string. Can be updated on an existing field. + +|`include_in_all` |Should the field be included in the `_all` field (if +enabled). If `index` is set to `no` this defaults to `false`, otherwise, +defaults to `true` or to the parent `object` type setting. + +|`ignore_above` |The analyzer will ignore strings larger than this size. +Useful for generic `not_analyzed` fields that should ignore long text. +(since @0.19.9). + +|`position_offset_gap` |Position increment gap between field instances +with the same field name. Defaults to 0. +|======================================================================= + +The `string` type also support custom indexing parameters associated +with the indexed value. 
For example: + +[source,js] +-------------------------------------------------- +{ + "message" : { + "_value": "boosted value", + "_boost": 2.0 + } +} +-------------------------------------------------- + +The mapping is required to disambiguate the meaning of the document. +Otherwise, the structure would interpret "message" as a value of type +"object". The key `_value` (or `value`) in the inner document specifies +the real string content that should eventually be indexed. The `_boost` +(or `boost`) key specifies the per field document boost (here 2.0). + +[float] +==== Number + +A number based type supporting `float`, `double`, `byte`, `short`, +`integer`, and `long`. It uses specific constructs within Lucene in +order to support numeric values. The number types have the same ranges +as corresponding +http://docs.oracle.com/javase/tutorial/java/nutsandbolts/datatypes.html[Java +types]. An example mapping can be: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "properties" : { + "rank" : { + "type" : "float", + "null_value" : 1.0 + } + } + } +} +-------------------------------------------------- + +The following table lists all the attributes that can be used with a +numbered type: + +[cols="<,<",options="header",] +|======================================================================= +|Attribute |Description +|`type` |The type of the number. Can be `float`, `double`, `integer`, +`long`, `short`, `byte`. Required. + +|`index_name` |The name of the field that will be stored in the index. +Defaults to the property/field name. + +|`store` |Set to `yes` to store actual field in the index, `no` to not +store it. Defaults to `no` (note, the JSON document itself is stored, +and it can be retrieved from it). + +|`index` |Set to `no` if the value should not be indexed. Setting to +`no` disables `include_in_all`. If set to `no` the field can be stored +in `_source`, have `include_in_all` enabled, or `store` should be set to +`yes` for this to be useful. + +|`precision_step` |The precision step (number of terms generated for +each number value). Defaults to `4`. + +|`boost` |The boost value. Defaults to `1.0`. + +|`null_value` |When there is a (JSON) null value for the field, use the +`null_value` as the field value. Defaults to not adding the field at +all. + +|`include_in_all` |Should the field be included in the `_all` field (if +enabled). If `index` is set to `no` this defaults to `false`, otherwise, +defaults to `true` or to the parent `object` type setting. + +|`ignore_malformed` |Ignored a malformed number. Defaults to `false`. +(Since @0.19.9). +|======================================================================= + +[float] +==== Date + +The date type is a special type which maps to JSON string type. It +follows a specific format that can be explicitly set. All dates are +`UTC`. Internally, a date maps to a number type `long`, with the added +parsing stage from string to long and from long to string. An example +mapping: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "properties" : { + "postDate" : { + "type" : "date", + "format" : "YYYY-MM-dd" + } + } + } +} +-------------------------------------------------- + +The date type will also accept a long number representing UTC +milliseconds since the epoch, regardless of the format it can handle. 
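+
+For example, with the `postDate` mapping above, both of the following
+documents would be accepted (the millisecond value is only illustrative):
+
+[source,js]
+--------------------------------------------------
+{
+    "postDate" : "2009-11-15"
+}
+
+{
+    "postDate" : 1258294332000
+}
+--------------------------------------------------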
+ +The following table lists all the attributes that can be used with a +date type: + +[cols="<,<",options="header",] +|======================================================================= +|Attribute |Description +|`index_name` |The name of the field that will be stored in the index. +Defaults to the property/field name. + +|`format` |The <>. Defaults to `dateOptionalTime`. + +|`store` |Set to `yes` to store actual field in the index, `no` to not +store it. Defaults to `no` (note, the JSON document itself is stored, +and it can be retrieved from it). + +|`index` |Set to `no` if the value should not be indexed. Setting to +`no` disables `include_in_all`. If set to `no` the field can be stored +in `_source`, have `include_in_all` enabled, or `store` should be set to +`yes` for this to be useful. + +|`precision_step` |The precision step (number of terms generated for +each number value). Defaults to `4`. + +|`boost` |The boost value. Defaults to `1.0`. + +|`null_value` |When there is a (JSON) null value for the field, use the +`null_value` as the field value. Defaults to not adding the field at +all. + +|`include_in_all` |Should the field be included in the `_all` field (if +enabled). If `index` is set to `no` this defaults to `false`, otherwise, +defaults to `true` or to the parent `object` type setting. + +|`ignore_malformed` |Ignored a malformed number. Defaults to `false`. +(Since @0.19.9). +|======================================================================= + +[float] +==== Boolean + +The boolean type Maps to the JSON boolean type. It ends up storing +within the index either `T` or `F`, with automatic translation to `true` +and `false` respectively. + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "properties" : { + "hes_my_special_tweet" : { + "type" : "boolean", + } + } + } +} +-------------------------------------------------- + +The boolean type also supports passing the value as a number (in this +case `0` is `false`, all other values are `true`). + +The following table lists all the attributes that can be used with the +boolean type: + +[cols="<,<",options="header",] +|======================================================================= +|Attribute |Description +|`index_name` |The name of the field that will be stored in the index. +Defaults to the property/field name. + +|`store` |Set to `yes` to store actual field in the index, `no` to not +store it. Defaults to `no` (note, the JSON document itself is stored, +and it can be retrieved from it). + +|`index` |Set to `no` if the value should not be indexed. Setting to +`no` disables `include_in_all`. If set to `no` the field can be stored +in `_source`, have `include_in_all` enabled, or `store` should be set to +`yes` for this to be useful. + +|`boost` |The boost value. Defaults to `1.0`. + +|`null_value` |When there is a (JSON) null value for the field, use the +`null_value` as the field value. Defaults to not adding the field at +all. + +|`include_in_all` |Should the field be included in the `_all` field (if +enabled). If `index` is set to `no` this defaults to `false`, otherwise, +defaults to `true` or to the parent `object` type setting. +|======================================================================= + +[float] +==== Binary + +The binary type is a base64 representation of binary data that can be +stored in the index. The field is stored by default and not indexed at +all. 
+ +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "properties" : { + "image" : { + "type" : "binary", + } + } + } +} +-------------------------------------------------- + +The following table lists all the attributes that can be used with the +binary type: + +[cols="<,<",options="header",] +|======================================================================= +|Attribute |Description +|`index_name` |The name of the field that will be stored in the index. +Defaults to the property/field name. +|======================================================================= + +[float] +==== Fielddata filters + +It is possible to control which field values are loaded into memory, +which is particularly useful for faceting on string fields, using +fielddata filters, which are explained in detail in the +<> section. + +Fielddata filters can exclude terms which do not match a regex, or which +don't fall between a `min` and `max` frequency range: + +[source,js] +-------------------------------------------------- +{ + tweet: { + type: "string", + analyzer: "whitespace" + fielddata: { + filter: { + regex: "^#.*", + frequency: { + min: 0.001, + max: 0.1, + min_segment_size: 500 + } + } + } + } +} +-------------------------------------------------- + +These filters can be updated on an existing field mapping and will take +effect the next time the fielddata for a segment is loaded. Use the +<> API +to reload the fielddata using the new filters. + +[float] +==== Postings format + +Posting formats define how fields are written into the index and how +fields are represented into memory. Posting formats can be defined per +field via the `postings_format` option. Postings format are configurable +since version `0.90.0.Beta1`. Elasticsearch has several builtin +formats: + +`direct`:: + A postings format that uses disk-based storage but loads + its terms and postings directly into memory. Note this postings format + is very memory intensive and has certain limitation that don't allow + segments to grow beyond 2.1GB see \{@link DirectPostingsFormat} for + details. + +`memory`:: + A postings format that stores its entire terms, postings, + positions and payloads in a finite state transducer. This format should + only be used for primary keys or with fields where each term is + contained in a very low number of documents. + +`pulsing`:: + A postings format in-lines the posting lists for very low + frequent terms in the term dictionary. This is useful to improve lookup + performance for low-frequent terms. + +`bloom_default`:: + A postings format that uses a bloom filter to + improve term lookup performance. This is useful for primarily keys or + fields that are used as a delete key. + +`bloom_pulsing`:: + A postings format that combines the advantages of + *bloom* and *pulsing* to further improve lookup performance. + +`default`:: + The default Elasticsearch postings format offering best + general purpose performance. This format is used if no postings format + is specified in the field mapping. + +[float] +===== Postings format example + +On all field types it possible to configure a `postings_format` +attribute: + +[source,js] +-------------------------------------------------- +{ + "person" : { + "properties" : { + "second_person_id" : {"type" : "string", "postings_format" : "pulsing"} + } + } +} +-------------------------------------------------- + +On top of using the built-in posting formats it is possible define +custom postings format. See +<> for more +information. 
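+
+As another minimal sketch using one of the built-in formats listed above
+(the `external_id` field name is only illustrative), a bloom filter based
+format can be applied to an id-like field:
+
+[source,js]
+--------------------------------------------------
+{
+    "person" : {
+        "properties" : {
+            "external_id" : {"type" : "string", "index" : "not_analyzed", "postings_format" : "bloom_default"}
+        }
+    }
+}
+--------------------------------------------------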
+ +[float] +==== Similarity + +From version `0.90.Beta1` Elasticsearch includes changes from Lucene 4 +that allows you to configure a similarity (scoring algorithm) per field. +Allowing users a simpler extension beyond the usual TF/IDF algorithm. As +part of this, new algorithms have been added including BM25. Also as +part of the changes, it is now possible to define a Similarity per +field, giving even greater control over scoring. + +You can configure similarities via the +<> + +[float] +===== Configuring Similarity per Field + +Defining the Similarity for a field is done via the `similarity` mapping +property, as this example shows: + +[source,js] +-------------------------------------------------- +{ + "book" : { + "properties" : { + "title" : { "type" : "string", "similarity" : "BM25" } + } +} +-------------------------------------------------- + +The following Similarities are configured out-of-box: + +`default`:: + The Default TF/IDF algorithm used by Elasticsearch and + Lucene in previous versions. + +`BM25`:: + The BM25 algorithm. + http://en.wikipedia.org/wiki/Okapi_BM25[See Okapi_BM25] for more + details. diff --git a/docs/reference/mapping/types/geo-point-type.asciidoc b/docs/reference/mapping/types/geo-point-type.asciidoc new file mode 100644 index 00000000000..66f91bcad50 --- /dev/null +++ b/docs/reference/mapping/types/geo-point-type.asciidoc @@ -0,0 +1,136 @@ +[[mapping-geo-point-type]] +=== Geo Point Type + +Mapper type called `geo_point` to support geo based points. The +declaration looks as follows: + +[source,js] +-------------------------------------------------- +{ + "pin" : { + "properties" : { + "location" : { + "type" : "geo_point" + } + } + } +} +-------------------------------------------------- + +[float] +==== Indexed Fields + +The `geo_point` mapping will index a single field with the format of +`lat,lon`. The `lat_lon` option can be set to also index the `.lat` and +`.lon` as numeric fields, and `geohash` can be set to `true` to also +index `.geohash` value. + +A good practice is to enable indexing `lat_lon` as well, since both the +geo distance and bounding box filters can either be executed using in +memory checks, or using the indexed lat lon values, and it really +depends on the data set which one performs better. Note though, that +indexed lat lon only make sense when there is a single geo point value +for the field, and not multi values. + +[float] +==== Input Structure + +The above mapping defines a `geo_point`, which accepts different +formats. The following formats are supported: + +[float] +===== Lat Lon as Properties + +[source,js] +-------------------------------------------------- +{ + "pin" : { + "location" : { + "lat" : 41.12, + "lon" : -71.34 + } + } +} +-------------------------------------------------- + +[float] +===== Lat Lon as String + +Format in `lat,lon`. + +[source,js] +-------------------------------------------------- +{ + "pin" : { + "location" : "41.12,-71.34" + } +} +-------------------------------------------------- + +[float] +===== Geohash + +[source,js] +-------------------------------------------------- +{ + "pin" : { + "location" : "drm3btev3e86" + } +} +-------------------------------------------------- + +[float] +===== Lat Lon as Array + +Format in `[lon, lat]`, note, the order of lon/lat here in order to +conform with http://geojson.org/[GeoJSON]. 
+ +[source,js] +-------------------------------------------------- +{ + "pin" : { + "location" : [-71.34, 41.12] + } +} +-------------------------------------------------- + +[float] +==== Mapping Options + +[cols="<,<",options="header",] +|======================================================================= +|Option |Description +|`lat_lon` |Set to `true` to also index the `.lat` and `.lon` as fields. +Defaults to `false`. + +|`geohash` |Set to `true` to also index the `.geohash` as a field. +Defaults to `false`. + +|`geohash_precision` |Sets the geohash precision, defaults to 12. + +|`validate` |Set to `true` to reject geo points with invalid latitude or +longitude (default is `false`) *Note*: Validation only works when +normalization has been disabled. + +|`validate_lat` |Set to `true` to reject geo points with an invalid +latitude + +|`validate_lon` |Set to `true` to reject geo points with an invalid +longitude + +|`normalize` |Set to `true` to normalize latitude and longitude (default +is `true`) + +|`normalize_lat` |Set to `true` to normalize latitude + +|`normalize_lon` |Set to `true` to normalize longitude +|======================================================================= + +[float] +==== Usage in Scripts + +When using `doc[geo_field_name]` (in the above mapping, +`doc['location']`), the `doc[...].value` returns a `GeoPoint`, which +then allows access to `lat` and `lon` (for example, +`doc[...].value.lat`). For performance, it is better to access the `lat` +and `lon` directly using `doc[...].lat` and `doc[...].lon`. diff --git a/docs/reference/mapping/types/geo-shape-type.asciidoc b/docs/reference/mapping/types/geo-shape-type.asciidoc new file mode 100644 index 00000000000..142637c9030 --- /dev/null +++ b/docs/reference/mapping/types/geo-shape-type.asciidoc @@ -0,0 +1,237 @@ +[[mapping-geo-shape-type]] +=== Geo Shape Type + +The `geo_shape` mapping type facilitates the indexing of and searching +with arbitrary geo shapes such as rectangles and polygons. It should be +used when either the data being indexed or the queries being executed +contain shapes other than just points. + +You can query documents using this type using +<> +or <>. + +Note, the `geo_shape` type uses +https://github.com/spatial4j/spatial4j[Spatial4J] and +http://www.vividsolutions.com/jts/jtshome.htm[JTS], both of which are +optional dependencies. Consequently you must add Spatial4J v0.3 and JTS +v1.12 to your classpath in order to use this type. + +Note, the implementation of geo_shape was modified in an API breaking +way in 0.90. Implementations prior to this version had significant +issues and users are recommended to update to the latest version of +Elasticsearch if they wish to use the geo_shape functionality. + +[float] +==== Mapping Options + +The geo_shape mapping maps geo_json geometry objects to the geo_shape +type. To enable it, users must explicitly map fields to the geo_shape +type. + +[cols="<,<",options="header",] +|======================================================================= +|Option |Description + +|`tree` |Name of the PrefixTree implementation to be used: `geohash` for +GeohashPrefixTree and `quadtree` for QuadPrefixTree. Defaults to +`geohash`. + +|`precision` |This parameter may be used instead of `tree_levels` to set +an appropriate value for the `tree_levels` parameter. The value +specifies the desired precision and Elasticsearch will calculate the +best tree_levels value to honor this precision. The value should be a +number followed by an optional distance unit. 
Valid distance units +include: `in`, `inch`, `yd`, `yard`, `mi`, `miles`, `km`, `kilometers`, +`m`,@meters@ (default), `cm`,@centimeters@, `mm`, `millimeters`. + +|`tree_levels` |Maximum number of layers to be used by the PrefixTree. +This can be used to control the precision of shape representations and +therefore how many terms are indexed. Defaults to the default value of +the chosen PrefixTree implementation. Since this parameter requires a +certain level of understanding of the underlying implementation, users +may use the `precision` parameter instead. However, Elasticsearch only +uses the tree_levels parameter internally and this is what is returned +via the mapping API even if you use the precision parameter. + +|`distance_error_pct` |Used as a hint to the PrefixTree about how +precise it should be. Defaults to 0.025 (2.5%) with 0.5 as the maximum +supported value. +|======================================================================= + +[float] +==== Prefix trees + +To efficiently represent shapes in the index, Shapes are converted into +a series of hashes representing grid squares using implementations of a +PrefixTree. The tree notion comes from the fact that the PrefixTree uses +multiple grid layers, each with an increasing level of precision to +represent the Earth. + +Multiple PrefixTree implementations are provided: + +* GeohashPrefixTree - Uses +http://en.wikipedia.org/wiki/Geohash[geohashes] for grid squares. +Geohashes are base32 encoded strings of the bits of the latitude and +longitude interleaved. So the longer the hash, the more precise it is. +Each character added to the geohash represents another tree level and +adds 5 bits of precision to the geohash. A geohash represents a +rectangular area and has 32 sub rectangles. The maximum amount of levels +in Elasticsearch is 24. +* QuadPrefixTree - Uses a +http://en.wikipedia.org/wiki/Quadtree[quadtree] for grid squares. +Similar to geohash, quad trees interleave the bits of the latitude and +longitude the resulting hash is a bit set. A tree level in a quad tree +represents 2 bits in this bit set, one for each coordinate. The maximum +amount of levels for the quad trees in elastic search is 50. + +[float] +===== Accuracy + +Geo_shape does not provide 100% accuracy and depending on how it is +configured it may return some false positives or false negatives for +certain queries. To mitigate this, it is important to select an +appropriate value for the tree_levels parameter and to adjust +expectations accordingly. For example, a point may be near the border of +a particular grid cell. And may not match a query that only matches the +cell right next to it even though the shape is very close to the point. + +[float] +===== Example + +[source,js] +-------------------------------------------------- +{ + "properties": { + "location": { + "type": "geo_shape", + "tree": "quadtree", + "precision": "1m" + } + } +} +-------------------------------------------------- + +This mapping maps the location field to the geo_shape type using the +quad_tree implementation and a precision of 1m. Elasticsearch translates +this into a tree_levels setting of 26. + +[float] +===== Performance considerations + +Elasticsearch uses the paths in the prefix tree as terms in the index +and in queries. The higher the levels is (and thus the precision), the +more terms are generated. Both calculating the terms, keeping them in +memory, and storing them has a price of course. 
Especially with higher +tree levels, indices can become extremely large even with a modest +amount of data. Additionally, the size of the features also matters. +Big, complex polygons can take up a lot of space at higher tree levels. +Which setting is right depends on the use case. Generally one trades off +accuracy against index size and query performance. + +The defaults in elastic search for both implementations are a compromise +between index size and a reasonable level of precision of 50m at the +equator. This allows for indexing tens of millions of shapes without +overly bloating the resulting index too much relative to the input size. + +[float] +==== Input Structure + +The http://www.geojson.org[GeoJSON] format is used to represent Shapes +as input as follows: + +[source,js] +-------------------------------------------------- +{ + "location" : { + "type" : "point", + "coordinates" : [45.0, -45.0] + } +} +-------------------------------------------------- + +Note, both the `type` and `coordinates` fields are required. + +The supported `types` are `point`, `linestring`, `polygon`, `multipoint` +and `multipolygon`. + +Note, in geojson the correct order is longitude, latitude coordinate +arrays. This differs from some APIs such as e.g. Google Maps that +generally use latitude, longitude. + +[float] +===== Envelope + +Elasticsearch supports an `envelope` type which consists of coordinates +for upper left and lower right points of the shape: + +[source,js] +-------------------------------------------------- +{ + "location" : { + "type" : "envelope", + "coordinates" : [[-45.0, 45.0], [45.0, -45.0]] + } +} +-------------------------------------------------- + +[float] +===== http://www.geojson.org/geojson-spec.html#id4[Polygon] + +A polygon is defined by a list of a list of points. The first and last +points in each list must be the same (the polygon must be closed). + +[source,js] +-------------------------------------------------- +{ + "location" : { + "type" : "polygon", + "coordinates" : [ + [ [100.0, 0.0], [101.0, 0.0], [101.0, 1.0], [100.0, 1.0], [100.0, 0.0] ] + ] + } +} +-------------------------------------------------- + +The first array represents the outer boundary of the polygon, the other +arrays represent the interior shapes ("holes"): + +[source,js] +-------------------------------------------------- +{ + "location" : { + "type" : "polygon", + "coordinates" : [ + [ [100.0, 0.0], [101.0, 0.0], [101.0, 1.0], [100.0, 1.0], [100.0, 0.0] ], + [ [100.2, 0.2], [100.8, 0.2], [100.8, 0.8], [100.2, 0.8], [100.2, 0.2] ] + ] + } +} +-------------------------------------------------- + +[float] +===== http://www.geojson.org/geojson-spec.html#id7[MultiPolygon] + +A list of geojson polygons. + +[source,js] +-------------------------------------------------- +{ + "location" : { + "type" : "multipolygon", + "coordinates" : [ + [[[102.0, 2.0], [103.0, 2.0], [103.0, 3.0], [102.0, 3.0], [102.0, 2.0]]], + [[[100.0, 0.0], [101.0, 0.0], [101.0, 1.0], [100.0, 1.0], [100.0, 0.0]], + [[100.2, 0.2], [100.8, 0.2], [100.8, 0.8], [100.2, 0.8], [100.2, 0.2]]] + ] + } +} +-------------------------------------------------- + +[float] +==== Sorting and Retrieving index Shapes + +Due to the complex input structure and index representation of shapes, +it is not currently possible to sort shapes or retrieve their fields +directly. The geo_shape value is only retrievable through the `_source` +field. 
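
As a usage sketch (not part of the original examples), documents indexed
with the mapping above could be filtered by a shape. The index name is
hypothetical and the exact request layout may vary slightly between
versions:

[source,js]
--------------------------------------------------
curl -XGET 'http://localhost:9200/my_index/_search' -d '{
    "query" : {
        "filtered" : {
            "query" : {
                "match_all" : {}
            },
            "filter" : {
                "geo_shape" : {
                    "location" : {
                        "shape" : {
                            "type" : "envelope",
                            "coordinates" : [[-45.0, 45.0], [45.0, -45.0]]
                        }
                    }
                }
            }
        }
    }
}'
--------------------------------------------------

Documents whose `location` shape intersects the given envelope are
returned.
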
diff --git a/docs/reference/mapping/types/ip-type.asciidoc b/docs/reference/mapping/types/ip-type.asciidoc new file mode 100644 index 00000000000..d7dfabf50c2 --- /dev/null +++ b/docs/reference/mapping/types/ip-type.asciidoc @@ -0,0 +1,36 @@ +[[mapping-ip-type]] +=== IP Type + +An `ip` mapping type allows to store _ipv4_ addresses in a numeric form +allowing to easily sort, and range query it (using ip values). + +The following table lists all the attributes that can be used with an ip +type: + +[cols="<,<",options="header",] +|======================================================================= +|Attribute |Description +|`index_name` |The name of the field that will be stored in the index. +Defaults to the property/field name. + +|`store` |Set to `yes` to store actual field in the index, `no` to not +store it. Defaults to `no` (note, the JSON document itself is stored, +and it can be retrieved from it). + +|`index` |Set to `no` if the value should not be indexed. In this case, +`store` should be set to `yes`, since if it's not indexed and not +stored, there is nothing to do with it. + +|`precision_step` |The precision step (number of terms generated for +each number value). Defaults to `4`. + +|`boost` |The boost value. Defaults to `1.0`. + +|`null_value` |When there is a (JSON) null value for the field, use the +`null_value` as the field value. Defaults to not adding the field at +all. + +|`include_in_all` |Should the field be included in the `_all` field (if +enabled). Defaults to `true` or to the parent `object` type setting. +|======================================================================= + diff --git a/docs/reference/mapping/types/multi-field-type.asciidoc b/docs/reference/mapping/types/multi-field-type.asciidoc new file mode 100644 index 00000000000..02937a71c78 --- /dev/null +++ b/docs/reference/mapping/types/multi-field-type.asciidoc @@ -0,0 +1,93 @@ +[[mapping-multi-field-type]] +=== Multi Field Type + +The `multi_field` type allows to map several +<> of the same +value. This can come very handy, for example, when wanting to map a +`string` type, once when it's `analyzed` and once when it's +`not_analyzed`. For example: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "properties" : { + "name" : { + "type" : "multi_field", + "fields" : { + "name" : {"type" : "string", "index" : "analyzed"}, + "untouched" : {"type" : "string", "index" : "not_analyzed"} + } + } + } + } +} +-------------------------------------------------- + +The above example shows how the `name` field, which is of simple +`string` type, gets mapped twice, once with it being `analyzed` under +`name`, and once with it being `not_analyzed` under `untouched`. + +[float] +==== Accessing Fields + +With `multi_field` mapping, the field that has the same name as the +property is treated as if it was mapped without a multi field. That's +the "default" field. It can be accessed regularly, for example using +`name` or using typed navigation `tweet.name`. + +The `path` attribute allows to control how non-default fields can be +accessed. If the `path` attribute is set to `full`, which is the default +setting, all non-default fields are prefixed with the name of the +property and can be accessed by their full path using the navigation +notation: `name.untouched`, or using the typed navigation notation +`tweet.name.untouched`. If the `path` attribute is set to `just_name` +the actual field name without a prefix is used. 
The `just_name` setting, +among other things, allows indexing content of multiple fields under the +same name. In the example below the content of both fields `first_name` +and `last_name` can be accessed by using `any_name` or `tweet.any_name` + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "properties": { + "first_name": { + "type": "multi_field", + "path": "just_name", + "fields": { + "first_name": {"type": "string", "index": "analyzed"}, + "any_name": {"type": "string","index": "analyzed"} + } + }, + "last_name": { + "type": "multi_field", + "path": "just_name", + "fields": { + "last_name": {"type": "string", "index": "analyzed"}, + "any_name": {"type": "string","index": "analyzed"} + } + } + } + } +} +-------------------------------------------------- + +[float] +==== Include in All + +The `include_in_all` setting on the "default" field allows to control if +the value of the field should be included in the `_all` field. Note, the +value of the field is copied to `_all`, not the tokens. So, it only +makes sense to copy the field value once. Because of this, the +`include_in_all` setting on all non-default fields is automatically set +to `false` and can't be changed. + +[float] +==== Merging + +When updating mapping definition using the `put_mapping` API, a core +type mapping can be "upgraded" to a `multi_field` mapping. This means +that if the old mapping has a plain core type mapping, the updated +mapping for the same property can be a `multi_field` type, with the +default field being the one being replaced. diff --git a/docs/reference/mapping/types/nested-type.asciidoc b/docs/reference/mapping/types/nested-type.asciidoc new file mode 100644 index 00000000000..d1445bfc95f --- /dev/null +++ b/docs/reference/mapping/types/nested-type.asciidoc @@ -0,0 +1,72 @@ +[[mapping-nested-type]] +=== Nested Type + +Nested objects/documents allow to map certain sections in the document +indexed as nested allowing to query them as if they are separate docs +joining with the parent owning doc. + +One of the problems when indexing inner objects that occur several times +in a doc is that "cross object" search match will occur, for example: + +[source,js] +-------------------------------------------------- +{ + "obj1" : [ + { + "name" : "blue", + "count" : 4 + }, + { + "name" : "green", + "count" : 6 + } + ] +} +-------------------------------------------------- + +Searching for name set to blue and count higher than 5 will match the +doc, because in the first element the name matches blue, and in the +second element, count matches "higher than 5". + +Nested mapping allows mapping certain inner objects (usually multi +instance ones), for example: + +[source,js] +-------------------------------------------------- +{ + "type1" : { + "properties" : { + "obj1" : { + "type" : "nested" + } + } + } +} +-------------------------------------------------- + +The above will cause all `obj1` to be indexed as a nested doc. The +mapping is similar in nature to setting `type` to `object`, except that +it's `nested`. + +Note: changing an object type to nested type requires reindexing. + +The `nested` object fields can also be automatically added to the +immediate parent by setting `include_in_parent` to true, and also +included in the root object by setting `include_in_root` to true. + +Nested docs will also automatically use the root doc `_all` field. + +Searching on nested docs can be done using either the +<> or +<>. 
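
To make this concrete, here is a minimal sketch of a `nested` query that
looks for a single `obj1` entry whose `name` is `blue` *and* whose
`count` is greater than `5` (field names are taken from the example
above):

[source,js]
--------------------------------------------------
{
    "query" : {
        "nested" : {
            "path" : "obj1",
            "query" : {
                "bool" : {
                    "must" : [
                        {"match" : {"obj1.name" : "blue"}},
                        {"range" : {"obj1.count" : {"gt" : 5}}}
                    ]
                }
            }
        }
    }
}
--------------------------------------------------

With the `nested` mapping in place, this query does not match the
example document above, because no single `obj1` object has both `name`
set to `blue` and `count` greater than `5`.
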
+ +[float] +==== Internal Implementation + +Internally, nested objects are indexed as additional documents, but, +since they can be guaranteed to be indexed within the same "block", it +allows for extremely fast joining with parent docs. + +Those internal nested documents are automatically masked away when doing +operations against the index (like searching with a match_all query), +and they bubble out when using the nested query. diff --git a/docs/reference/mapping/types/object-type.asciidoc b/docs/reference/mapping/types/object-type.asciidoc new file mode 100644 index 00000000000..0181338e359 --- /dev/null +++ b/docs/reference/mapping/types/object-type.asciidoc @@ -0,0 +1,239 @@ +[[mapping-object-type]] +=== Object Type + +JSON documents are hierarchical in nature, allowing them to define inner +"objects" within the actual JSON. ElasticSearch completely understands +the nature of these inner objects and can map them easily, providing +query support for their inner fields. Because each document can have +objects with different fields each time, objects mapped this way are +known as "dynamic". Dynamic mapping is enabled by default. Let's take +the following JSON as an example: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "person" : { + "name" : { + "first_name" : "Shay", + "last_name" : "Banon" + }, + "sid" : "12345" + }, + "message" : "This is a tweet!" + } +} +-------------------------------------------------- + +The above shows an example where a tweet includes the actual `person` +details. A `person` is an object, with a `sid`, and a `name` object +which has `first_name` and `last_name`. It's important to note that +`tweet` is also an object, although it is a special +<> +which allows for additional mapping definitions. + +The following is an example of explicit mapping for the above JSON: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "properties" : { + "person" : { + "type" : "object", + "properties" : { + "name" : { + "properties" : { + "first_name" : {"type" : "string"}, + "last_name" : {"type" : "string"} + } + }, + "sid" : {"type" : "string", "index" : "not_analyzed"} + } + }, + "message" : {"type" : "string"} + } + } +} +-------------------------------------------------- + +In order to mark a mapping of type `object`, set the `type` to object. +This is an optional step, since if there are `properties` defined for +it, it will automatically be identified as an `object` mapping. + +[float] +==== properties + +An object mapping can optionally define one or more properties using the +`properties` tag for a field. Each property can be either another +`object`, or one of the +<>. + +[float] +==== dynamic + +One of the most important features of ElasticSearch is its ability to be +schema-less. This means that, in our example above, the `person` object +can be indexed later with a new property -- `age`, for example -- and it +will automatically be added to the mapping definitions. Same goes for +the `tweet` root object. + +This feature is by default turned on, and it's the `dynamic` nature of +each object mapped. 
Each object mapped is automatically dynamic, though +it can be explicitly turned off: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "properties" : { + "person" : { + "type" : "object", + "properties" : { + "name" : { + "dynamic" : false, + "properties" : { + "first_name" : {"type" : "string"}, + "last_name" : {"type" : "string"} + } + }, + "sid" : {"type" : "string", "index" : "not_analyzed"} + } + }, + "message" : {"type" : "string"} + } + } +} +-------------------------------------------------- + +In the above example, the `name` object mapped is not dynamic, meaning +that if, in the future, we try to index JSON with a `middle_name` within +the `name` object, it will get discarded and not added. + +There is no performance overhead if an `object` is dynamic, the ability +to turn it off is provided as a safety mechanism so "malformed" objects +won't, by mistake, index data that we do not wish to be indexed. + +If a dynamic object contains yet another inner `object`, it will be +automatically added to the index and mapped as well. + +When processing dynamic new fields, their type is automatically derived. +For example, if it is a `number`, it will automatically be treated as +number <>. Dynamic +fields default to their default attributes, for example, they are not +stored and they are always indexed. + +Date fields are special since they are represented as a `string`. Date +fields are detected if they can be parsed as a date when they are first +introduced into the system. The set of date formats that are tested +against can be configured using the `date_formats` and explained later. + +Note, once a field has been added, *its type can not change*. For +example, if we added age and its value is a number, then it can't be +treated as a string. + +The `dynamic` parameter can also be set to `strict`, meaning that not +only new fields will not be introduced into the mapping, parsing +(indexing) docs with such new fields will fail. + +[float] +==== enabled + +The `enabled` flag allows to disable parsing and adding a named object +completely. This is handy when a portion of the JSON document passed +should not be indexed. For example: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "properties" : { + "person" : { + "type" : "object", + "properties" : { + "name" : { + "type" : "object", + "enabled" : false + }, + "sid" : {"type" : "string", "index" : "not_analyzed"} + } + }, + "message" : {"type" : "string"} + } + } +} +-------------------------------------------------- + +In the above, `name` and its content will not be indexed at all. + +[float] +==== path + +In the <> +section, a field can have a `index_name` associated with it in order to +control the name of the field that will be stored within the index. When +that field exists within an object(s) that are not the root object, the +name of the field of the index can either include the full "path" to the +field with its `index_name`, or just the `index_name`. 
For example +(under mapping of _type_ `person`, removed the tweet type for clarity): + +[source,js] +-------------------------------------------------- +{ + "person" : { + "properties" : { + "name1" : { + "type" : "object", + "path" : "just_name", + "properties" : { + "first1" : {"type" : "string"}, + "last1" : {"type" : "string", "index_name" : "i_last_1"} + } + }, + "name2" : { + "type" : "object", + "path" : "full", + "properties" : { + "first2" : {"type" : "string"}, + "last2" : {"type" : "string", "index_name" : "i_last_2"} + } + } + } + } +} +-------------------------------------------------- + +In the above example, the `name1` and `name2` objects within the +`person` object have different combination of `path` and `index_name`. +The document fields that will be stored in the index as a result of that +are: + +[cols="<,<",options="header",] +|================================= +|JSON Name |Document Field Name +|`name1`/`first1` |`first1` +|`name1`/`last1` |`i_last_1` +|`name2`/`first2` |`name2.first2` +|`name2`/`last2` |`name2.i_last_2` +|================================= + +Note, when querying or using a field name in any of the APIs provided +(search, query, selective loading, ...), there is an automatic detection +from logical full path and into the `index_name` and vice versa. For +example, even though `name1`/`last1` defines that it is stored with +`just_name` and a different `index_name`, it can either be referred to +using `name1.last1` (logical name), or its actual indexed name of +`i_last_1`. + +More over, where applicable, for example, in queries, the full path +including the type can be used such as `person.name.last1`, in this +case, both the actual indexed name will be resolved to match against the +index, and an automatic query filter will be added to only match +`person` types. + +[float] +==== include_in_all + +`include_in_all` can be set on the `object` type level. When set, it +propagates down to all the inner mapping defined within the `object` +that do no explicitly set it. diff --git a/docs/reference/mapping/types/root-object-type.asciidoc b/docs/reference/mapping/types/root-object-type.asciidoc new file mode 100644 index 00000000000..aead1a9d412 --- /dev/null +++ b/docs/reference/mapping/types/root-object-type.asciidoc @@ -0,0 +1,224 @@ +[[mapping-root-object-type]] +=== Root Object Type + +The root object mapping is an +<> that +maps the root object (the type itself). On top of all the different +mappings that can be set using the +<>, it +allows for additional, type level mapping definitions. + +The root object mapping allows to index a JSON document that either +starts with the actual mapping type, or only contains its fields. For +example, the following `tweet` JSON can be indexed: + +[source,js] +-------------------------------------------------- +{ + "message" : "This is a tweet!" +} +-------------------------------------------------- + +But, also the following JSON can be indexed: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "message" : "This is a tweet!" + } +} +-------------------------------------------------- + +Out of the two, it is preferable to use the document *without* the type +explicitly set. + +[float] +==== Index / Search Analyzers + +The root object allows to define type mapping level analyzers for index +and search that will be used with all different fields that do not +explicitly set analyzers on their own. 
Here is an example: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "index_analyzer" : "standard", + "search_analyzer" : "standard" + } +} +-------------------------------------------------- + +The above simply explicitly defines both the `index_analyzer` and +`search_analyzer` that will be used. There is also an option to use the +`analyzer` attribute to set both the `search_analyzer` and +`index_analyzer`. + +[float] +==== dynamic_date_formats + +`dynamic_date_formats` (old setting called `date_formats` still works) +is the ability to set one or more date formats that will be used to +detect `date` fields. For example: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "dynamic_date_formats" : ["yyyy-MM-dd", "dd-MM-yyyy"], + "properties" : { + "message" : {"type" : "string"} + } + } +} +-------------------------------------------------- + +In the above mapping, if a new JSON field of type string is detected, +the date formats specified will be used in order to check if its a date. +If it passes parsing, then the field will be declared with `date` type, +and will use the matching format as its format attribute. The date +format itself is explained +<>. + +The default formats are: `dateOptionalTime` (ISO) and +`yyyy/MM/dd HH:mm:ss Z||yyyy/MM/dd Z`. + +*Note:* `dynamic_date_formats` are used *only* for dynamically added +date fields, not for `date` fields that you specify in your mapping. + +[float] +==== date_detection + +Allows to disable automatic date type detection (a new field introduced +and matches the provided format), for example: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "date_detection" : false, + "properties" : { + "message" : {"type" : "string"} + } + } +} +-------------------------------------------------- + +[float] +==== numeric_detection + +Sometimes, even though json has support for native numeric types, +numeric values are still provided as strings. In order to try and +automatically detect numeric values from string, the `numeric_detection` +can be set to `true`. For example: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "numeric_detection" : true, + "properties" : { + "message" : {"type" : "string"} + } + } +} +-------------------------------------------------- + +[float] +==== dynamic_templates + +Dynamic templates allow to define mapping templates that will be applied +when dynamic introduction of fields / objects happens. + +For example, we might want to have all fields to be stored by default, +or all `string` fields to be stored, or have `string` fields to always +be indexed as `multi_field`, once analyzed and once not_analyzed. Here +is a simple example: + +[source,js] +-------------------------------------------------- +{ + "person" : { + "dynamic_templates" : [ + { + "template_1" : { + "match" : "multi*", + "mapping" : { + "type" : "multi_field", + "fields" : { + "{name}" : {"type": "{dynamic_type}", "index" : "analyzed"}, + "org" : {"type": "{dynamic_type}", "index" : "not_analyzed"} + } + } + } + }, + { + "template_2" : { + "match" : "*", + "match_mapping_type" : "string", + "mapping" : { + "type" : "string", + "index" : "not_analyzed" + } + } + } + ] + } +} +-------------------------------------------------- + +The above mapping will create a `multi_field` mapping for all field +names starting with multi, and will map all `string` types to be +`not_analyzed`. 
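
To make the substitutions concrete, indexing a document that introduces
a hypothetical string field called `multi_tag` would, under `template_1`
above, derive a mapping roughly equivalent to:

[source,js]
--------------------------------------------------
{
    "multi_tag" : {
        "type" : "multi_field",
        "fields" : {
            "multi_tag" : {"type" : "string", "index" : "analyzed"},
            "org" : {"type" : "string", "index" : "not_analyzed"}
        }
    }
}
--------------------------------------------------
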
+ +Dynamic templates are named to allow for simple merge behavior. A new +mapping, just with a new template can be "put" and that template will be +added, or if it has the same name, the template will be replaced. + +The `match` allow to define matching on the field name. An `unmatch` +option is also available to exclude fields if they do match on `match`. +The `match_mapping_type` controls if this template will be applied only +for dynamic fields of the specified type (as guessed by the json +format). + +Another option is to use `path_match`, which allows to match the dynamic +template against the "full" dot notation name of the field (for example +`obj1.*.value` or `obj1.obj2.*`), with the respective `path_unmatch`. + +The format of all the matching is simple format, allowing to use * as a +matching element supporting simple patterns such as xxx*, *xxx, xxx*yyy +(with arbitrary number of pattern types), as well as direct equality. +The `match_pattern` can be set to `regex` to allow for regular +expression based matching. + +The `mapping` element provides the actual mapping definition. The +`{name}` keyword can be used and will be replaced with the actual +dynamic field name being introduced. The `{dynamic_type}` (or +`{dynamicType}`) can be used and will be replaced with the mapping +derived based on the field type (or the derived type, like `date`). + +Complete generic settings can also be applied, for example, to have all +mappings be stored, just set: + +[source,js] +-------------------------------------------------- +{ + "person" : { + "dynamic_templates" : [ + { + "store_generic" : { + "match" : "*", + "mapping" : { + "store" : "yes" + } + } + } + ] + } +} +-------------------------------------------------- + +Such generic templates should be placed at the end of the +`dynamic_templates` list because when two or more dynamic templates +match a field, only the first matching one from the list is used. diff --git a/docs/reference/modules.asciidoc b/docs/reference/modules.asciidoc new file mode 100644 index 00000000000..3b912444145 --- /dev/null +++ b/docs/reference/modules.asciidoc @@ -0,0 +1,33 @@ +[[modules]] += Modules + +include::modules/cluster.asciidoc[] + +include::modules/discovery.asciidoc[] + +include::modules/gateway.asciidoc[] + +include::modules/http.asciidoc[] + +include::modules/indices.asciidoc[] + +include::modules/jmx.asciidoc[] + +include::modules/memcached.asciidoc[] + +include::modules/network.asciidoc[] + +include::modules/node.asciidoc[] + +include::modules/plugins.asciidoc[] + +include::modules/scripting.asciidoc[] + +include::modules/threadpool.asciidoc[] + +include::modules/thrift.asciidoc[] + +include::modules/transport.asciidoc[] + + + diff --git a/docs/reference/modules/cluster.asciidoc b/docs/reference/modules/cluster.asciidoc new file mode 100644 index 00000000000..65a24e70dbd --- /dev/null +++ b/docs/reference/modules/cluster.asciidoc @@ -0,0 +1,230 @@ +[[modules-cluster]] +== Cluster + +[float] +=== Shards Allocation + +Shards allocation is the process of allocating shards to nodes. This can +happen during initial recovery, replica allocation, rebalancing, or +handling nodes being added or removed. + +The following settings may be used: + +`cluster.routing.allocation.allow_rebalance`:: + Allow to control when rebalancing will happen based on the total + state of all the indices shards in the cluster. 
`always`, + `indices_primaries_active`, and `indices_all_active` are allowed, + defaulting to `indices_all_active` to reduce chatter during + initial recovery. + + +`cluster.routing.allocation.cluster_concurrent_rebalance`:: + Allow to control how many concurrent rebalancing of shards are + allowed cluster wide, and default it to `2`. + + +`cluster.routing.allocation.node_initial_primaries_recoveries`:: + Allow to control specifically the number of initial recoveries + of primaries that are allowed per node. Since most times local + gateway is used, those should be fast and we can handle more of + those per node without creating load. + + +`cluster.routing.allocation.node_concurrent_recoveries`:: + How many concurrent recoveries are allowed to happen on a node. + Defaults to `2`. + + +`cluster.routing.allocation.disable_new_allocation`:: + Allows to disable new primary allocations. Note, this will prevent + allocations for newly created indices. This setting really make + sense when dynamically updating it using the cluster update + settings API. + + +`cluster.routing.allocation.disable_allocation`:: + Allows to disable either primary or replica allocation (does not + apply to newly created primaries, see `disable_new_allocation` + above). Note, a replica will still be promoted to primary if + one does not exist. This setting really make sense when + dynamically updating it using the cluster update settings API. + + +`cluster.routing.allocation.disable_replica_allocation`:: + Allows to disable only replica allocation. Similar to the previous + setting, mainly make sense when using it dynamically using the + cluster update settings API. + + +`indices.recovery.concurrent_streams`:: + The number of streams to open (on a *node* level) to recover a + shard from a peer shard. Defaults to `3`. + +[float] +=== Shard Allocation Awareness + +Cluster allocation awareness allows to configure shard and replicas +allocation across generic attributes associated the nodes. Lets explain +it through an example: + +Assume we have several racks. When we start a node, we can configure an +attribute called `rack_id` (any attribute name works), for example, here +is a sample config: + +---------------------- +node.rack_id: rack_one +---------------------- + +The above sets an attribute called `rack_id` for the relevant node with +a value of `rack_one`. Now, we need to configure the `rack_id` attribute +as one of the awareness allocation attributes (set it on *all* (master +eligible) nodes config): + +-------------------------------------------------------- +cluster.routing.allocation.awareness.attributes: rack_id +-------------------------------------------------------- + +The above will mean that the `rack_id` attribute will be used to do +awareness based allocation of shard and its replicas. For example, lets +say we start 2 nodes with `node.rack_id` set to `rack_one`, and deploy a +single index with 5 shards and 1 replica. The index will be fully +deployed on the current nodes (5 shards and 1 replica each, total of 10 +shards). + +Now, if we start two more nodes, with `node.rack_id` set to `rack_two`, +shards will relocate to even the number of shards across the nodes, but, +a shard and its replica will not be allocated in the same `rack_id` +value. 
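
For reference, the index used in this walkthrough could be created with
a request along these lines (the index name `test` is arbitrary):

[source,js]
--------------------------------------------------
curl -XPUT 'http://localhost:9200/test' -d '{
    "settings" : {
        "index.number_of_shards" : 5,
        "index.number_of_replicas" : 1
    }
}'
--------------------------------------------------
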
+ +The awareness attributes can hold several values, for example: + +------------------------------------------------------------- +cluster.routing.allocation.awareness.attributes: rack_id,zone +------------------------------------------------------------- + +*NOTE*: When using awareness attributes, shards will not be allocated to +nodes that don't have values set for those attributes. + +[float] +=== Forced Awareness + +Sometimes, we know in advance the number of values an awareness +attribute can have, and more over, we would like never to have more +replicas then needed allocated on a specific group of nodes with the +same awareness attribute value. For that, we can force awareness on +specific attributes. + +For example, lets say we have an awareness attribute called `zone`, and +we know we are going to have two zones, `zone1` and `zone2`. Here is how +we can force awareness one a node: + +[source,js] +------------------------------------------------------------------- +cluster.routing.allocation.awareness.force.zone.values: zone1,zone2 +cluster.routing.allocation.awareness.attributes: zone +------------------------------------------------------------------- + +Now, lets say we start 2 nodes with `node.zone` set to `zone1` and +create an index with 5 shards and 1 replica. The index will be created, +but only 5 shards will be allocated (with no replicas). Only when we +start more shards with `node.zone` set to `zone2` will the replicas be +allocated. + +[float] +==== Automatic Preference When Searching / GETing + +When executing a search, or doing a get, the node receiving the request +will prefer to execute the request on shards that exists on nodes that +have the same attribute values as the executing node. + +[float] +==== Realtime Settings Update + +The settings can be updated using the <> on a live cluster. + +[float] +=== Shard Allocation Filtering + +Allow to control allocation if indices on nodes based on include/exclude +filters. The filters can be set both on the index level and on the +cluster level. Lets start with an example of setting it on the cluster +level: + +Lets say we have 4 nodes, each has specific attribute called `tag` +associated with it (the name of the attribute can be any name). Each +node has a specific value associated with `tag`. Node 1 has a setting +`node.tag: value1`, Node 2 a setting of `node.tag: value2`, and so on. + +We can create an index that will only deploy on nodes that have `tag` +set to `value1` and `value2` by setting +`index.routing.allocation.include.tag` to `value1,value2`. For example: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/test/_settings -d '{ + "index.routing.allocation.include.tag" : "value1,value2" +}' +-------------------------------------------------- + +On the other hand, we can create an index that will be deployed on all +nodes except for nodes with a `tag` of value `value3` by setting +`index.routing.allocation.exclude.tag` to `value3`. For example: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/test/_settings -d '{ + "index.routing.allocation.exclude.tag" : "value3" +}' +-------------------------------------------------- + +From version 0.90, `index.routing.allocation.require.*` can be used to +specify a number of rules, all of which MUST match in order for a shard +to be allocated to a node. This is in contrast to `include` which will +include a node if ANY rule matches. 
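
Continuing the earlier `tag` example, a `require` rule could be applied
like this (the attribute value shown is illustrative):

[source,js]
--------------------------------------------------
curl -XPUT localhost:9200/test/_settings -d '{
    "index.routing.allocation.require.tag" : "value1"
}'
--------------------------------------------------

With this setting, shards of the `test` index are only allocated to
nodes whose `tag` attribute is set to `value1`.
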
+ +The `include`, `exclude` and `require` values can have generic simple +matching wildcards, for example, `value1*`. A special attribute name +called `_ip` can be used to match on node ip values. In addition `_host` +attribute can be used to match on either the node's hostname or its ip +address. + +Obviously a node can have several attributes associated with it, and +both the attribute name and value are controlled in the setting. For +example, here is a sample of several node configurations: + +[source,js] +-------------------------------------------------- +node.group1: group1_value1 +node.group2: group2_value4 +-------------------------------------------------- + +In the same manner, `include`, `exclude` and `require` can work against +several attributes, for example: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/test/_settings -d '{ + "index.routing.allocation.include.group1" : "xxx" + "index.routing.allocation.include.group2" : "yyy", + "index.routing.allocation.exclude.group3" : "zzz", + "index.routing.allocation.require.group4" : "aaa" +}' +-------------------------------------------------- + +The provided settings can also be updated in real time using the update +settings API, allowing to "move" indices (shards) around in realtime. + +Cluster wide filtering can also be defined, and be updated in real time +using the cluster update settings API. This setting can come in handy +for things like decommissioning nodes (even if the replica count is set +to 0). Here is a sample of how to decommission a node based on `_ip` +address: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/_cluster/settings -d '{ + "transient" : { + "cluster.routing.allocation.exclude._ip" : "10.0.0.1" + } +}' +-------------------------------------------------- diff --git a/docs/reference/modules/discovery.asciidoc b/docs/reference/modules/discovery.asciidoc new file mode 100644 index 00000000000..05eb4f5bfd3 --- /dev/null +++ b/docs/reference/modules/discovery.asciidoc @@ -0,0 +1,26 @@ +[[modules-discovery]] +== Discovery + +The discovery module is responsible for discovering nodes within a +cluster, as well as electing a master node. + +Note, ElasticSearch is a peer to peer based system, nodes communicate +with one another directly if operations are delegated / broadcast. All +the main APIs (index, delete, search) do not communicate with the master +node. The responsibility of the master node is to maintain the global +cluster state, and act if nodes join or leave the cluster by reassigning +shards. Each time a cluster state is changed, the state is made known to +the other nodes in the cluster (the manner depends on the actual +discovery implementation). + +[float] +=== Settings + +The `cluster.name` allows to create separated clusters from one another. +The default value for the cluster name is `elasticsearch`, though it is +recommended to change this to reflect the logical group name of the +cluster running. + +include::discovery/ec2.asciidoc[] + +include::discovery/zen.asciidoc[] diff --git a/docs/reference/modules/discovery/ec2.asciidoc b/docs/reference/modules/discovery/ec2.asciidoc new file mode 100644 index 00000000000..c8b9abe964b --- /dev/null +++ b/docs/reference/modules/discovery/ec2.asciidoc @@ -0,0 +1,82 @@ +[[modules-discovery-ec2]] +=== EC2 Discovery + +EC2 discovery allows to use the EC2 APIs to perform automatic discovery +(similar to multicast in non hostile multicast environments). 
Here is a +simple sample configuration: + +[source,js] +-------------------------------------------------- +cloud: + aws: + access_key: AKVAIQBF2RECL7FJWGJQ + secret_key: vExyMThREXeRMm/b/LRzEB8jWwvzQeXgjqMX+6br + +discovery: + type: ec2 +-------------------------------------------------- + +You'll need to install the `cloud-aws` plugin. Please check the +https://github.com/elasticsearch/elasticsearch-cloud-aws[plugin website] +to find the most up-to-date version to install before (re)starting +elasticsearch. + +The following are a list of settings (prefixed with `discovery.ec2`) +that can further control the discovery: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`groups` |Either a comma separated list or array based list of +(security) groups. Only instances with the provided security groups will +be used in the cluster discovery. + +|`host_type` |The type of host type to use to communicate with other +instances. Can be one of `private_ip`, `public_ip`, `private_dns`, +`public_dns`. Defaults to `private_ip`. + +|`availability_zones` |Either a comma separated list or array based list +of availability zones. Only instances within the provided availability +zones will be used in the cluster discovery. + +|`any_group` |If set to `false`, will require all security groups to be +present for the instance to be used for the discovery. Defaults to +`true`. + +|`ping_timeout` |How long to wait for existing EC2 nodes to reply during +discovery. Defaults to 3s. +|======================================================================= + +[float] +==== Filtering by Tags + +EC2 discovery can also filter machines to include in the cluster based +on tags (and not just groups). The settings to use include the +`discovery.ec2.tag.` prefix. For example, setting +`discovery.ec2.tag.stage` to `dev` will only filter instances with a tag +key set to `stage`, and a value of `dev`. Several tags set will require +all of those tags to be set for the instance to be included. + +One practical use for tag filtering is when an EC2 cluster contains many +nodes that are not running elasticsearch. In this case (particularly +with high `ping_timeout` values) there is a risk that a new node's +discovery phase will end before it has found the cluster (which will +result in it declaring itself master of a new cluster with the same name +- highly undesirable). Tagging elasticsearch EC2 nodes and then +filtering by that tag will resolve this issue. + +[float] +==== Region + +The `cloud.aws.region` can be set to a region and will automatically use +the relevant settings for both `ec2` and `s3`. The available values are: +`us-east-1`, `us-west-1`, `ap-southeast-1`, `eu-west-1`. + +[float] +==== Automatic Node Attributes + +Though not dependent on actually using `ec2` as discovery (but still +requires the cloud aws plugin installed), the plugin can automatically +add node attributes relating to EC2 (for example, availability zone, +that can be used with the awareness allocation feature). In order to +enable it, set `cloud.node.auto_attributes` to `true` in the settings. diff --git a/docs/reference/modules/discovery/zen.asciidoc b/docs/reference/modules/discovery/zen.asciidoc new file mode 100644 index 00000000000..fff4d3c4712 --- /dev/null +++ b/docs/reference/modules/discovery/zen.asciidoc @@ -0,0 +1,145 @@ +[[modules-discovery-zen]] +=== Zen Discovery + +The zen discovery is the built in discovery module for elasticsearch and +the default. 
It provides both multicast and unicast discovery as well +being easily extended to support cloud environments. + +The zen discovery is integrated with other modules, for example, all +communication between nodes is done using the +<> module. + +It is separated into several sub modules, which are explained below: + +[float] +==== Ping + +This is the process where a node uses the discovery mechanisms to find +other nodes. There is support for both multicast and unicast based +discovery (can be used in conjunction as well). + +[float] +===== Multicast + +Multicast ping discovery of other nodes is done by sending one or more +multicast requests where existing nodes that exists will receive and +respond to. It provides the following settings with the +`discovery.zen.ping.multicast` prefix: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`group` |The group address to use. Defaults to `224.2.2.4`. + +|`port` |The port to use. Defaults to `54328`. + +|`ttl` |The ttl of the multicast message. Defaults to `3`. + +|`address` |The address to bind to, defaults to `null` which means it +will bind to all available network interfaces. +|======================================================================= + +Multicast can be disabled by setting `multicast.enabled` to `false`. + +[float] +===== Unicast + +The unicast discovery allows to perform the discovery when multicast is +not enabled. It basically requires a list of hosts to use that will act +as gossip routers. It provides the following settings with the +`discovery.zen.ping.unicast` prefix: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`hosts` |Either an array setting or a comma delimited setting. Each +value is either in the form of `host:port`, or in the form of +`host[port1-port2]`. +|======================================================================= + +The unicast discovery uses the +<> module to +perform the discovery. + +[float] +==== Master Election + +As part of the initial ping process a master of the cluster is either +elected or joined to. This is done automatically. The +`discovery.zen.ping_timeout` (which defaults to `3s`) allows to +configure the election to handle cases of slow or congested networks +(higher values assure less chance of failure). Note, this setting was +changed from 0.15.1 onwards, prior it was called +`discovery.zen.initial_ping_timeout`. + +Nodes can be excluded from becoming a master by setting `node.master` to +`false`. Note, once a node is a client node (`node.client` set to +`true`), it will not be allowed to become a master (`node.master` is +automatically set to `false`). + +The `discovery.zen.minimum_master_nodes` allows to control the minimum +number of master eligible nodes a node should "see" in order to operate +within the cluster. Its recommended to set it to a higher value than 1 +when running more than 2 nodes in the cluster. + +[float] +==== Fault Detection + +There are two fault detection processes running. The first is by the +master, to ping all the other nodes in the cluster and verify that they +are alive. And on the other end, each node pings to master to verify if +its still alive or an election process needs to be initiated. 
+ +The following settings control the fault detection process using the +`discovery.zen.fd` prefix: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`ping_interval` |How often a node gets pinged. Defaults to `1s`. + +|`ping_timeout` |How long to wait for a ping response, defaults to +`30s`. + +|`ping_retries` |How many ping failures / timeouts cause a node to be +considered failed. Defaults to `3`. +|======================================================================= + +[float] +==== External Multicast + +The multicast discovery also supports external multicast requests to +discover nodes. The external client can send a request to the multicast +IP/group and port, in the form of: + +[source,js] +-------------------------------------------------- +{ + "request" : { + "cluster_name": "test_cluster" + } +} +-------------------------------------------------- + +And the response will be similar to node info response (with node level +information only, including transport/http addresses, and node +attributes): + +[source,js] +-------------------------------------------------- +{ + "response" : { + "cluster_name" : "test_cluster", + "transport_address" : "...", + "http_address" : "...", + "attributes" : { + "..." + } + } +} +-------------------------------------------------- + +Note, it can still be enabled, with disabled internal multicast +discovery, but still have external discovery working by keeping +`discovery.zen.ping.multicast.enabled` set to `true` (the default), but, +setting `discovery.zen.ping.multicast.ping.enabled` to `false`. diff --git a/docs/reference/modules/gateway.asciidoc b/docs/reference/modules/gateway.asciidoc new file mode 100644 index 00000000000..98802632397 --- /dev/null +++ b/docs/reference/modules/gateway.asciidoc @@ -0,0 +1,74 @@ +[[modules-gateway]] +== Gateway + +The gateway module allows one to store the state of the cluster meta +data across full cluster restarts. The cluster meta data mainly holds +all the indices created with their respective (index level) settings and +explicit type mappings. + +Each time the cluster meta data changes (for example, when an index is +added or deleted), those changes will be persisted using the gateway. +When the cluster first starts up, the state will be read from the +gateway and applied. + +The gateway set on the node level will automatically control the index +gateway that will be used. For example, if the `fs` gateway is used, +then automatically, each index created on the node will also use its own +respective index level `fs` gateway. In this case, if an index should +not persist its state, it should be explicitly set to `none` (which is +the only other value it can be set to). + +The default gateway used is the +<> gateway. + +[float] +=== Recovery After Nodes / Time + +In many cases, the actual cluster meta data should only be recovered +after specific nodes have started in the cluster, or a timeout has +passed. This is handy when restarting the cluster, and each node local +index storage still exists to be reused and not recovered from the +gateway (which reduces the time it takes to recover from the gateway). + +The `gateway.recover_after_nodes` setting (which accepts a number) +controls after how many data and master eligible nodes within the +cluster recovery will start. 
The `gateway.recover_after_data_nodes` and +`gateway.recover_after_master_nodes` setting work in a similar fashion, +except they consider only the number of data nodes and only the number +of master nodes respectively. The `gateway.recover_after_time` setting +(which accepts a time value) sets the time to wait till recovery happens +once all `gateway.recover_after...nodes` conditions are met. + +The `gateway.expected_nodes` allows to set how many data and master +eligible nodes are expected to be in the cluster, and once met, the +`recover_after_time` is ignored and recovery starts. The +`gateway.expected_data_nodes` and `gateway.expected_master_nodes` +settings are also supported. For example setting: + +[source,js] +-------------------------------------------------- +gateway: + recover_after_nodes: 1 + recover_after_time: 5m + expected_nodes: 2 +-------------------------------------------------- + +In an expected 2 nodes cluster will cause recovery to start 5 minutes +after the first node is up, but once there are 2 nodes in the cluster, +recovery will begin immediately (without waiting). + +Note, once the meta data has been recovered from the gateway (which +indices to create, mappings and so on), then this setting is no longer +effective until the next full restart of the cluster. + +Operations are blocked while the cluster meta data has not been +recovered in order not to mix with the actual cluster meta data that +will be recovered once the settings has been reached. + +include::gateway/local.asciidoc[] + +include::gateway/fs.asciidoc[] + +include::gateway/hadoop.asciidoc[] + +include::gateway/s3.asciidoc[] diff --git a/docs/reference/modules/gateway/fs.asciidoc b/docs/reference/modules/gateway/fs.asciidoc new file mode 100644 index 00000000000..7c6747a1809 --- /dev/null +++ b/docs/reference/modules/gateway/fs.asciidoc @@ -0,0 +1,39 @@ +[[modules-gateway-fs]] +=== Shared FS Gateway + +*The shared FS gateway is deprecated and will be removed in a future +version. Please use the +<> +instead.* + +The file system based gateway stores the cluster meta data and indices +in a *shared* file system. Note, since it is a distributed system, the +file system should be shared between all different nodes. Here is an +example config to enable it: + +[source,js] +-------------------------------------------------- +gateway: + type: fs +-------------------------------------------------- + +[float] +==== location + +The location where the gateway stores the cluster state can be set using +the `gateway.fs.location` setting. By default, it will be stored under +the `work` directory. Note, the `work` directory is considered a +temporal directory with ElasticSearch (meaning it is safe to `rm -rf` +it), the default location of the persistent gateway in work intentional, +*it should be changed*. + +When explicitly specifying the `gateway.fs.location`, each node will +append its `cluster.name` to the provided location. It means that the +location provided can safely support several clusters. + +[float] +==== concurrent_streams + +The `gateway.fs.concurrent_streams` allow to throttle the number of +streams (per node) opened against the shared gateway performing the +snapshot operation. It defaults to `5`. 
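A fuller example combining the settings above might look like this sketch (the shared mount point is a placeholder and must be visible to every node):

[source,js]
--------------------------------------------------
gateway:
    type: fs
    fs:
        location: /mnt/shared/es-gateway    # shared file system location, placeholder path
        concurrent_streams: 5               # per node snapshot streams (the default)
--------------------------------------------------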
diff --git a/docs/reference/modules/gateway/hadoop.asciidoc b/docs/reference/modules/gateway/hadoop.asciidoc new file mode 100644 index 00000000000..b55a4be5332 --- /dev/null +++ b/docs/reference/modules/gateway/hadoop.asciidoc @@ -0,0 +1,36 @@ +[[modules-gateway-hadoop]] +=== Hadoop Gateway + +*The hadoop gateway is deprecated and will be removed in a future +version. Please use the +<> +instead.* + +The hadoop (HDFS) based gateway stores the cluster meta and indices data +in hadoop. Hadoop support is provided as a plugin and installing is +explained https://github.com/elasticsearch/elasticsearch-hadoop[here] or +downloading the hadoop plugin and placing it under the `plugins` +directory. Here is an example config to enable it: + +[source,js] +-------------------------------------------------- +gateway: + type: hdfs + hdfs: + uri: hdfs://myhost:8022 +-------------------------------------------------- + +[float] +==== Settings + +The hadoop gateway requires two simple settings. The `gateway.hdfs.uri` +controls the URI to connect to the hadoop cluster, for example: +`hdfs://myhost:8022`. The `gateway.hdfs.path` controls the path under +which the gateway will store the data. + +[float] +==== concurrent_streams + +The `gateway.hdfs.concurrent_streams` allow to throttle the number of +streams (per node) opened against the shared gateway performing the +snapshot operation. It defaults to `5`. diff --git a/docs/reference/modules/gateway/local.asciidoc b/docs/reference/modules/gateway/local.asciidoc new file mode 100644 index 00000000000..f5fa542b964 --- /dev/null +++ b/docs/reference/modules/gateway/local.asciidoc @@ -0,0 +1,31 @@ +[[modules-gateway-local]] +=== Local Gateway + +The local gateway allows for recovery of the full cluster state and +indices from the local storage of each node, and does not require a +common node level shared storage. + +Note, different from shared gateway types, the persistency to the local +gateway is *not* done in an async manner. Once an operation is +performed, the data is there for the local gateway to recover it in case +of full cluster failure. + +It is important to configure the `gateway.recover_after_nodes` setting +to include most of the expected nodes to be started after a full cluster +restart. This will insure that the latest cluster state is recovered. +For example: + +[source,js] +-------------------------------------------------- +gateway: + recover_after_nodes: 1 + recover_after_time: 5m + expected_nodes: 2 +-------------------------------------------------- + +Note, to backup/snapshot the full cluster state it is recommended that +the local storage for all nodes be copied (in theory not all are +required, just enough to guarantee a copy of each shard has been copied, +i.e. depending on the replication settings) while disabling flush. +Shared storage such as S3 can be used to keep the different nodes' +copies in one place, though it does comes at a price of more IO. diff --git a/docs/reference/modules/gateway/s3.asciidoc b/docs/reference/modules/gateway/s3.asciidoc new file mode 100644 index 00000000000..8f2f5d9fa0a --- /dev/null +++ b/docs/reference/modules/gateway/s3.asciidoc @@ -0,0 +1,51 @@ +[[modules-gateway-s3]] +=== S3 Gateway + +*The S3 gateway is deprecated and will be removed in a future version. +Please use the <> instead.* + +S3 based gateway allows to do long term reliable async persistency of +the cluster state and indices directly to Amazon S3. 
Here is how it can +be configured: + +[source,js] +-------------------------------------------------- +cloud: + aws: + access_key: AKVAIQBF2RECL7FJWGJQ + secret_key: vExyMThREXeRMm/b/LRzEB8jWwvzQeXgjqMX+6br + + +gateway: + type: s3 + s3: + bucket: bucket_name +-------------------------------------------------- + +You’ll need to install the `cloud-aws` plugin, by running +`bin/plugin install cloud-aws` before (re)starting elasticsearch. + +The following are a list of settings (prefixed with `gateway.s3`) that +can further control the S3 gateway: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`chunk_size` |Big files are broken down into chunks (to overcome AWS 5g +limit and use concurrent snapshotting). Default set to `100m`. +|======================================================================= + +[float] +==== concurrent_streams + +The `gateway.s3.concurrent_streams` allow to throttle the number of +streams (per node) opened against the shared gateway performing the +snapshot operation. It defaults to `5`. + +[float] +==== Region + +The `cloud.aws.region` can be set to a region and will automatically use +the relevant settings for both `ec2` and `s3`. The available values are: +`us-east-1`, `us-west-1`, `ap-southeast-1`, `eu-west-1`. diff --git a/docs/reference/modules/http.asciidoc b/docs/reference/modules/http.asciidoc new file mode 100644 index 00000000000..fbc153be60e --- /dev/null +++ b/docs/reference/modules/http.asciidoc @@ -0,0 +1,51 @@ +[[modules-http]] +== HTTP + +The http module allows to expose *elasticsearch* APIs +over HTTP. + +The http mechanism is completely asynchronous in nature, meaning that +there is no blocking thread waiting for a response. The benefit of using +asynchronous communication for HTTP is solving the +http://en.wikipedia.org/wiki/C10k_problem[C10k problem]. + +When possible, consider using +http://en.wikipedia.org/wiki/Keepalive#HTTP_Keepalive[HTTP keep alive] +when connecting for better performance and try to get your favorite +client not to do +http://en.wikipedia.org/wiki/Chunked_transfer_encoding[HTTP chunking]. + +[float] +=== Settings + +The following are the settings the can be configured for HTTP: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`http.port` |A bind port range. Defaults to `9200-9300`. + +|`http.max_content_length` |The max content of an HTTP request. Defaults +to `100mb` + +|`http.max_initial_line_length` |The max length of an HTTP URL. Defaults +to `4kb` + +|`http.compression` |Support for compression when possible (with +Accept-Encoding). Defaults to `false`. + +|`http.compression_level` |Defines the compression level to use. +Defaults to `6`. +|======================================================================= + +It also shares the uses the common +<>. + +[float] +=== Disable HTTP + +The http module can be completely disabled and not started by setting +`http.enabled` to `false`. This make sense when creating non +<> which accept HTTP +requests, and communicate with data nodes using the internal +<>. diff --git a/docs/reference/modules/indices.asciidoc b/docs/reference/modules/indices.asciidoc new file mode 100644 index 00000000000..b81aa4e064f --- /dev/null +++ b/docs/reference/modules/indices.asciidoc @@ -0,0 +1,75 @@ +[[modules-indices]] +== Indices + +The indices module allow to control settings that are globally managed +for all indices. 
+ +[float] +=== Indexing Buffer + +The indexing buffer setting allows to control how much memory will be +allocated for the indexing process. It is a global setting that bubbles +down to all the different shards allocated on a specific node. + +The `indices.memory.index_buffer_size` accepts either a percentage or a +byte size value. It defaults to `10%`, meaning that `10%` of the total +memory allocated to a node will be used as the indexing buffer size. +This amount is then divided between all the different shards. Also, if +percentage is used, allow to set `min_index_buffer_size` (defaults to +`48mb`) and `max_index_buffer_size` which by default is unbounded. + +The `indices.memory.min_shard_index_buffer_size` allows to set a hard +lower limit for the memory allocated per shard for its own indexing +buffer. It defaults to `4mb`. + +[float] +=== TTL interval + +You can dynamically set the `indices.ttl.interval` allows to set how +often expired documents will be automatically deleted. The default value +is 60s. + +The deletion orders are processed by bulk. You can set +`indices.ttl.bulk_size` to fit your needs. The default value is 10000. + +See also <>. + +[float] +=== Recovery + +The following settings can be set to manage recovery policy: + +[horizontal] +`indices.recovery.concurrent_streams`:: + defaults to `3`. + +`indices.recovery.file_chunk_size`:: + defaults to `512kb`. + +`indices.recovery.translog_ops`:: + defaults to `1000`. + +`indices.recovery.translog_size`:: + defaults to `512kb`. + +`indices.recovery.compress`:: + defaults to `true`. + +`indices.recovery.max_bytes_per_sec`:: + since 0.90.1, defaults to `20mb`. + +`indices.recovery.max_size_per_sec`:: + deprecated from 0.90.1. Replaced by `indices.recovery.max_bytes_per_sec`. + +[float] +=== Store level throttling + +The following settings can be set to control store throttling: + +[horizontal] +`indices.store.throttle.type`:: + could be `merge` (default), `not` or `all`. See <>. + +`indices.store.throttle.max_bytes_per_sec`:: + defaults to `20mb`. + diff --git a/docs/reference/modules/jmx.asciidoc b/docs/reference/modules/jmx.asciidoc new file mode 100644 index 00000000000..97f108eacbe --- /dev/null +++ b/docs/reference/modules/jmx.asciidoc @@ -0,0 +1,34 @@ +[[modules-jmx]] +== JMX + +[float] +=== REMOVED AS OF v0.90 + +Use the stats APIs instead. + +The JMX module exposes node information through +http://java.sun.com/javase/technologies/core/mntr-mgmt/javamanagement/[JMX]. +JMX can be used by either +http://en.wikipedia.org/wiki/JConsole[jconsole] or +http://en.wikipedia.org/wiki/VisualVM[VisualVM]. + +Exposed JMX data include both node level information, as well as +instantiated index and shard on specific node. This is a work in +progress with each version exposing more information. + +[float] +=== jmx.domain + +The domain under which the JMX will register under can be set using +`jmx.domain` setting. It defaults to `{elasticsearch}`. + +[float] +=== jmx.create_connector + +An RMI connector can be started to accept JMX requests. This can be +enabled by setting `jmx.create_connector` to `true`. An RMI connector +does come with its own overhead, make sure you really need it. + +When an RMI connector is created, the `jmx.port` setting provides a port +range setting for the ports the rmi connector can open on. By default, +it is set to `9400-9500`. 
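For nodes older than 0.90, where this module still exists, enabling the RMI connector could look like the following sketch:

[source,js]
--------------------------------------------------
jmx:
    create_connector: true    # start an RMI connector (comes with its own overhead)
    port: 9400-9500           # port range the connector may bind to (the default)
    domain: elasticsearch     # JMX domain to register under
--------------------------------------------------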
diff --git a/docs/reference/modules/memcached.asciidoc b/docs/reference/modules/memcached.asciidoc new file mode 100644 index 00000000000..20276d0fde8 --- /dev/null +++ b/docs/reference/modules/memcached.asciidoc @@ -0,0 +1,69 @@ +[[modules-memcached]] +== memcached + +The memcached module allows to expose *elasticsearch* +APIs over the memcached protocol (as closely +as possible). + +It is provided as a plugin called `transport-memcached` and installing +is explained +https://github.com/elasticsearch/elasticsearch-transport-memcached[here] +. Another option is to download the memcached plugin and placing it +under the `plugins` directory. + +The memcached protocol supports both the binary and the text protocol, +automatically detecting the correct one to use. + +[float] +=== Mapping REST to Memcached Protocol + +Memcached commands are mapped to REST and handled by the same generic +REST layer in elasticsearch. Here is a list of the memcached commands +supported: + +[float] +==== GET + +The memcached `GET` command maps to a REST `GET`. The key used is the +URI (with parameters). The main downside is the fact that the memcached +`GET` does not allow body in the request (and `SET` does not allow to +return a result...). For this reason, most REST APIs (like search) allow +to accept the "source" as a URI parameter as well. + +[float] +==== SET + +The memcached `SET` command maps to a REST `POST`. The key used is the +URI (with parameters), and the body maps to the REST body. + +[float] +==== DELETE + +The memcached `DELETE` command maps to a REST `DELETE`. The key used is +the URI (with parameters). + +[float] +==== QUIT + +The memcached `QUIT` command is supported and disconnects the client. + +[float] +=== Settings + +The following are the settings the can be configured for memcached: + +[cols="<,<",options="header",] +|=============================================================== +|Setting |Description +|`memcached.port` |A bind port range. Defaults to `11211-11311`. +|=============================================================== + +It also shares the uses the common +<>. + +[float] +=== Disable memcached + +The memcached module can be completely disabled and not started using by +setting `memcached.enabled` to `false`. By default it is enabled once it +is detected as a plugin. diff --git a/docs/reference/modules/network.asciidoc b/docs/reference/modules/network.asciidoc new file mode 100644 index 00000000000..23371080840 --- /dev/null +++ b/docs/reference/modules/network.asciidoc @@ -0,0 +1,88 @@ +[[modules-network]] +== Network Settings + +There are several modules within a Node that use network based +configuration, for example, the +<> and +<> modules. Node level +network settings allows to set common settings that will be shared among +all network based modules (unless explicitly overridden in each module). + +The `network.bind_host` setting allows to control the host different +network components will bind on. By default, the bind host will be +`anyLocalAddress` (typically `0.0.0.0` or `::0`). + +The `network.publish_host` setting allows to control the host the node +will publish itself within the cluster so other nodes will be able to +connect to it. Of course, this can't be the `anyLocalAddress`, and by +default, it will be the first non loopback address (if possible), or the +local address. + +The `network.host` setting is a simple setting to automatically set both +`network.bind_host` and `network.publish_host` to the same host value. 
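For example, to bind and publish on a single interface address (the address is a placeholder):

[source,js]
--------------------------------------------------
network:
    host: 192.168.0.1    # sets both network.bind_host and network.publish_host
--------------------------------------------------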
+ +Both settings allows to be configured with either explicit host address +or host name. The settings also accept logical setting values explained +in the following table: + +[cols="<,<",options="header",] +|======================================================================= +|Logical Host Setting Value |Description +|`_local_` |Will be resolved to the local ip address. + +|`_non_loopback_` |The first non loopback address. + +|`_non_loopback:ipv4_` |The first non loopback IPv4 address. + +|`_non_loopback:ipv6_` |The first non loopback IPv6 address. + +|`_[networkInterface]_` |Resolves to the ip address of the provided +network interface. For example `_en0_`. + +|`_[networkInterface]:ipv4_` |Resolves to the ipv4 address of the +provided network interface. For example `_en0:ipv4_`. + +|`_[networkInterface]:ipv6_` |Resolves to the ipv6 address of the +provided network interface. For example `_en0:ipv6_`. +|======================================================================= + +When the `cloud-aws` plugin is installed, the following are also allowed +as valid network host settings: + +[cols="<,<",options="header",] +|================================================================== +|EC2 Host Value |Description +|`_ec2:privateIpv4_` |The private IP address (ipv4) of the machine. +|`_ec2:privateDns_` |The private host of the machine. +|`_ec2:publicIpv4_` |The public IP address (ipv4) of the machine. +|`_ec2:publicDns_` |The public host of the machine. +|`_ec2_` |Less verbose option for the private ip address. +|`_ec2:privateIp_` |Less verbose option for the private ip address. +|`_ec2:publicIp_` |Less verbose option for the public ip address. +|================================================================== + +[float] +=== TCP Settings + +Any component that uses TCP (like the HTTP, Transport and Memcached) +share the following allowed settings: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`network.tcp.no_delay` |Enable or disable tcp no delay setting. +Defaults to `true`. + +|`network.tcp.keep_alive` |Enable or disable tcp keep alive. By default +not explicitly set. + +|`network.tcp.reuse_address` |Should an address be reused or not. +Defaults to `true` on none windows machines. + +|`network.tcp.send_buffer_size` |The size of the tcp send buffer size +(in size setting format). By default not explicitly set. + +|`network.tcp.receive_buffer_size` |The size of the tcp receive buffer +size (in size setting format). By default not explicitly set. +|======================================================================= + diff --git a/docs/reference/modules/node.asciidoc b/docs/reference/modules/node.asciidoc new file mode 100644 index 00000000000..5e40522264a --- /dev/null +++ b/docs/reference/modules/node.asciidoc @@ -0,0 +1,32 @@ +[[modules-node]] +== Node + +*elasticsearch* allows to configure a node to either be allowed to store +data locally or not. Storing data locally basically means that shards of +different indices are allowed to be allocated on that node. By default, +each node is considered to be a data node, and it can be turned off by +setting `node.data` to `false`. + +This is a powerful setting allowing to simply create smart load +balancers that take part in some of different API processing. Lets take +an example: + +We can start a whole cluster of data nodes which do not even start an +HTTP transport by setting `http.enabled` to `false`. 
Such nodes will +communicate with one another using the +<> module. In front +of the cluster we can start one or more "non data" nodes which will +start with HTTP enabled. All HTTP communication will be performed +through these "non data" nodes. + +The benefit of using that is first the ability to create smart load +balancers. These "non data" nodes are still part of the cluster, and +they redirect operations exactly to the node that holds the relevant +data. The other benefit is the fact that for scatter / gather based +operations (such as search), these nodes will take part of the +processing since they will start the scatter process, and perform the +actual gather processing. + +This relieves the data nodes to do the heavy duty of indexing and +searching, without needing to process HTTP requests (parsing), overload +the network, or perform the gather processing. diff --git a/docs/reference/modules/plugins.asciidoc b/docs/reference/modules/plugins.asciidoc new file mode 100644 index 00000000000..f25c6629b57 --- /dev/null +++ b/docs/reference/modules/plugins.asciidoc @@ -0,0 +1,245 @@ +[[modules-plugins]] +== Plugins + +[float] +=== Plugins + +Plugins are a way to enhance the basic elasticsearch functionality in a +custom manner. They range from adding custom mapping types, custom +analyzers (in a more built in fashion), native scripts, custom discovery +and more. + +[float] +==== Installing plugins + +Installing plugins can either be done manually by placing them under the +`plugins` directory, or using the `plugin` script. Several plugins can +be found under the https://github.com/elasticsearch[elasticsearch] +organization in GitHub, starting with `elasticsearch-`. + +Starting from 0.90.2, installing plugins typically take the form of +`plugin --install //`. The plugins will be +automatically downloaded in this case from `download.elasticsearch.org`, +and in case they don't exist there, from maven (central and sonatype). + +Note that when the plugin is located in maven central or sonatype +repository, `` is the artifact `groupId` and `` is +the `artifactId`. + +For prior version, the older form is +`plugin -install //` + +A plugin can also be installed directly by specifying the URL for it, +for example: +`bin/plugin --url file://path/to/plugin --install plugin-name` or +`bin/plugin -url file://path/to/plugin -install plugin-name` for older +version. + +Starting from 0.90.2, for more information about plugins, you can run +`bin/plugin -h`. + +[float] +==== Site Plugins + +Plugins can have "sites" in them, any plugin that exists under the +`plugins` directory with a `_site` directory, its content will be +statically served when hitting `/_plugin/[plugin_name]/` url. Those can +be added even after the process has started. + +Installed plugins that do not contain any java related content, will +automatically be detected as site plugins, and their content will be +moved under `_site`. 
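For illustration, a site plugin is just static content under a `_site` directory; a hypothetical layout (plugin and file names invented) could look like this, after which the content is served at `http://localhost:9200/_plugin/my-dashboard/`:

[source,js]
--------------------------------------------------
plugins/
    my-dashboard/
        _site/
            index.html
            css/main.css
            js/app.js
--------------------------------------------------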
+ +The ability to install plugins from Github allows to easily install site +plugins hosted there by downloading the actual repo, for example, +running: + +[source,js] +-------------------------------------------------- +# From 0.90.2 +bin/plugin --install mobz/elasticsearch-head +bin/plugin --install lukas-vlcek/bigdesk + +# From a prior version +bin/plugin -install mobz/elasticsearch-head +bin/plugin -install lukas-vlcek/bigdesk +-------------------------------------------------- + +Will install both of those site plugins, with `elasticsearch-head` +available under `http://localhost:9200/_plugin/head/` and `bigdesk` +available under `http://localhost:9200/_plugin/bigdesk/`. + +[float] +==== Mandatory Plugins + +If you rely on some plugins, you can define mandatory plugins using the +`plugin.mandatory` attribute, for example, here is a sample config: + +[source,js] +-------------------------------------------------- +plugin.mandatory: mapper-attachments,lang-groovy +-------------------------------------------------- + +For safety reasons, if a mandatory plugin is not installed, the node +will not start. + +[float] +==== Installed Plugins + +A list of the currently loaded plugins can be retrieved using the +<>. + +[float] +=== Known Plugins + +[float] +==== Analysis Plugins + +* https://github.com/yakaz/elasticsearch-analysis-combo/[Combo Analysis +Plugin] (by Olivier Favre, Yakaz) +* https://github.com/elasticsearch/elasticsearch-analysis-smartcn[Smart +Chinese Analysis Plugin] (by elasticsearch team) +* https://github.com/elasticsearch/elasticsearch-analysis-icu[ICU +Analysis plugin] (by elasticsearch team) +* https://github.com/elasticsearch/elasticsearch-analysis-stempel[Stempel +(Polish) Analysis plugin] (by elasticsearch team) +* https://github.com/chytreg/elasticsearch-analysis-morfologik[Morfologik +(Polish) Analysis plugin] (by chytreg) +* https://github.com/medcl/elasticsearch-analysis-ik[IK Analysis Plugin] +(by Medcl) +* https://github.com/medcl/elasticsearch-analysis-mmseg[Mmseg Analysis +Plugin] (by Medcl) +* https://github.com/jprante/elasticsearch-analysis-hunspell[Hunspell +Analysis Plugin] (by Jörg Prante) +* https://github.com/elasticsearch/elasticsearch-analysis-kuromoji[Japanese +(Kuromoji) Analysis plugin] (by elasticsearch team). +* https://github.com/suguru/elasticsearch-analysis-japanese[Japanese +Analysis plugin] (by suguru). 
+* https://github.com/imotov/elasticsearch-analysis-morphology[Russian +and English Morphological Analysis Plugin] (by Igor Motov) +* https://github.com/medcl/elasticsearch-analysis-pinyin[Pinyin Analysis +Plugin] (by Medcl) +* https://github.com/medcl/elasticsearch-analysis-string2int[String2Integer +Analysis Plugin] (by Medcl) +* https://github.com/barminator/elasticsearch-analysis-annotation[Annotation +Analysis Plugin] (by Michal Samek) + +[float] +==== River Plugins + +* https://github.com/elasticsearch/elasticsearch-river-couchdb[CouchDB +River Plugin] (by elasticsearch team) +* https://github.com/elasticsearch/elasticsearch-river-wikipedia[Wikipedia +River Plugin] (by elasticsearch team) +* https://github.com/elasticsearch/elasticsearch-river-twitter[Twitter +River Plugin] (by elasticsearch team) +* https://github.com/elasticsearch/elasticsearch-river-rabbitmq[RabbitMQ +River Plugin] (by elasticsearch team) +* https://github.com/domdorn/elasticsearch-river-activemq/[ActiveMQ +River Plugin] (by Dominik Dorn) +* https://github.com/albogdano/elasticsearch-river-amazonsqs[Amazon SQS +River Plugin] (by Alex Bogdanovski) +* https://github.com/xxBedy/elasticsearch-river-csv[CSV River Plugin] +(by Martin Bednar) +* http://www.pilato.fr/dropbox/[Dropbox River Plugin] (by David Pilato) +* http://www.pilato.fr/fsriver/[FileSystem River Plugin] (by David +Pilato) +* https://github.com/sksamuel/elasticsearch-river-hazelcast[Hazelcast +River Plugin] (by Steve Samuel) +* https://github.com/jprante/elasticsearch-river-jdbc[JDBC River Plugin] +(by Jörg Prante) +* https://github.com/qotho/elasticsearch-river-jms[JMS River Plugin] (by +Steve Sarandos) +* https://github.com/tlrx/elasticsearch-river-ldap[LDAP River Plugin] +(by Tanguy Leroux) +* https://github.com/richardwilly98/elasticsearch-river-mongodb/[MongoDB +River Plugin] (by Richard Louapre) +* https://github.com/sksamuel/elasticsearch-river-neo4j[Neo4j River +Plugin] (by Steve Samuel) +* https://github.com/jprante/elasticsearch-river-oai/[Open Archives +Initiative (OAI) River Plugin] (by Jörg Prante) +* https://github.com/sksamuel/elasticsearch-river-redis[Redis River +Plugin] (by Steve Samuel) +* http://dadoonet.github.com/rssriver/[RSS River Plugin] (by David +Pilato) +* https://github.com/adamlofts/elasticsearch-river-sofa[Sofa River +Plugin] (by adamlofts) +* https://github.com/javanna/elasticsearch-river-solr/[Solr River +Plugin] (by Luca Cavanna) +* https://github.com/sunnygleason/elasticsearch-river-st9[St9 River +Plugin] (by Sunny Gleason) +* https://github.com/endgameinc/elasticsearch-river-kafka[Kafka River +Plugin] (by Endgame Inc.) +* https://github.com/obazoud/elasticsearch-river-git[Git River Plugin] (by Olivier Bazoud) + +[float] +==== Transport Plugins + +* https://github.com/elasticsearch/elasticsearch-transport-wares[Servlet +transport] (by elasticsearch team) +* https://github.com/elasticsearch/elasticsearch-transport-memcached[Memcached +transport plugin] (by elasticsearch team) +* https://github.com/elasticsearch/elasticsearch-transport-thrift[Thrift +Transport] (by elasticsearch team) +* https://github.com/tlrx/transport-zeromq[ZeroMQ transport layer +plugin] (by Tanguy Leroux) +* https://github.com/sonian/elasticsearch-jetty[Jetty HTTP transport +plugin] (by Sonian Inc.) 
+ +[float] +==== Scripting Plugins + +* https://github.com/elasticsearch/elasticsearch-lang-python[Python +language Plugin] (by elasticsearch team) +* https://github.com/elasticsearch/elasticsearch-lang-javascript[JavaScript +language Plugin] (by elasticsearch team) +* https://github.com/elasticsearch/elasticsearch-lang-groovy[Groovy lang +Plugin] (by elasticsearch team) +* https://github.com/hiredman/elasticsearch-lang-clojure[Clojure +Language Plugin] (by Kevin Downey) + +[float] +==== Site Plugins + +* https://github.com/lukas-vlcek/bigdesk[BigDesk Plugin] (by Lukáš Vlček) +* https://github.com/mobz/elasticsearch-head[Elasticsearch Head Plugin] +(by Ben Birch) +* https://github.com/royrusso/elasticsearch-HQ[ElasticSearch HQ] (by Roy +Russo) +* https://github.com/karmi/elasticsearch-paramedic[Paramedic Plugin] (by +Karel Minařík) +* https://github.com/polyfractal/elasticsearch-segmentspy[SegmentSpy +Plugin] (by Zachary Tong) +* https://github.com/polyfractal/elasticsearch-inquisitor[Inquisitor +Plugin] (by Zachary Tong) +* https://github.com/andrewvc/elastic-hammer[Hammer Plugin] (by Andrew +Cholakian) + +[float] +==== Misc Plugins + +* https://github.com/elasticsearch/elasticsearch-mapper-attachments[Mapper +Attachments Type plugin] (by elasticsearch team) +* https://github.com/elasticsearch/elasticsearch-hadoop[Hadoop Plugin] +(by elasticsearch team) +* https://github.com/elasticsearch/elasticsearch-cloud-aws[AWS Cloud +Plugin] (by elasticsearch team) +* https://github.com/mattweber/elasticsearch-mocksolrplugin[ElasticSearch +Mock Solr Plugin] (by Matt Weber) +* https://github.com/spinscale/elasticsearch-suggest-plugin[Suggester +Plugin] (by Alexander Reelsen) +* https://github.com/medcl/elasticsearch-partialupdate[ElasticSearch +PartialUpdate Plugin] (by Medcl) +* https://github.com/sonian/elasticsearch-zookeeper[ZooKeeper Discovery +Plugin] (by Sonian Inc.) +* https://github.com/derryx/elasticsearch-changes-plugin[ElasticSearch +Changes Plugin] (by Thomas Peuss) +* http://tlrx.github.com/elasticsearch-view-plugin[ElasticSearch View +Plugin] (by Tanguy Leroux) +* https://github.com/viniciusccarvalho/elasticsearch-newrelic[ElasticSearch +New Relic Plugin] (by Vinicius Carvalho) +* https://github.com/endgameinc/elasticsearch-term-plugin[Terms +Component Plugin] (by Endgame Inc.) +* https://github.com/carrot2/elasticsearch-carrot2[carrot2 Plugin]: +Results clustering with carrot2 (by Dawid Weiss) + diff --git a/docs/reference/modules/scripting.asciidoc b/docs/reference/modules/scripting.asciidoc new file mode 100644 index 00000000000..77cc3463670 --- /dev/null +++ b/docs/reference/modules/scripting.asciidoc @@ -0,0 +1,242 @@ +[[modules-scripting]] +== Scripting + +The scripting module allows to use scripts in order to evaluate custom +expressions. For example, scripts can be used to return "script fields" +as part of a search request, or can be used to evaluate a custom score +for a query and so on. + +The scripting module uses by default http://mvel.codehaus.org/[mvel] as +the scripting language with some extensions. mvel is used since it is +extremely fast and very simple to use, and in most cases, simple +expressions are needed (for example, mathematical equations). + +Additional `lang` plugins are provided to allow to execute scripts in +different languages. Currently supported plugins are `lang-javascript` +for JavaScript, `lang-groovy` for Groovy, and `lang-python` for Python. 
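As a quick illustration, a script can compute a value per document as part of a search request; a minimal sketch (the `price` field and the tax factor are invented):

[source,js]
--------------------------------------------------
{
    "query" : {
        "match_all" : {}
    },
    "script_fields" : {
        "price_with_tax" : {
            "script" : "doc['price'].value * 1.19",
            "lang" : "mvel"
        }
    }
}
--------------------------------------------------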
+All places where a `script` parameter can be used, a `lang` parameter +(on the same level) can be provided to define the language of the +script. The `lang` options are `mvel`, `js`, `groovy`, `python`, and +`native`. + +[float] +=== Default Scripting Language + +The default scripting language (assuming no `lang` parameter is +provided) is `mvel`. In order to change it set the `script.default_lang` +to the appropriate language. + +[float] +=== Preloaded Scripts + +Scripts can always be provided as part of the relevant API, but they can +also be preloaded by placing them under `config/scripts` and then +referencing them by the script name (instead of providing the full +script). This helps reduce the amount of data passed between the client +and the nodes. + +The name of the script is derived from the hierarchy of directories it +exists under, and the file name without the lang extension. For example, +a script placed under `config/scripts/group1/group2/test.py` will be +named `group1_group2_test`. + +[float] +=== Native (Java) Scripts + +Even though `mvel` is pretty fast, allow to register native Java based +scripts for faster execution. + +In order to allow for scripts, the `NativeScriptFactory` needs to be +implemented that constructs the script that will be executed. There are +two main types, one that extends `AbstractExecutableScript` and one that +extends `AbstractSearchScript` (probably the one most users will extend, +with additional helper classes in `AbstractLongSearchScript`, +`AbstractDoubleSearchScript`, and `AbstractFloatSearchScript`). + +Registering them can either be done by settings, for example: +`script.native.my.type` set to `sample.MyNativeScriptFactory` will +register a script named `my`. Another option is in a plugin, access +`ScriptModule` and call `registerScript` on it. + +Executing the script is done by specifying the `lang` as `native`, and +the name of the script as the `script`. + +Note, the scripts need to be in the classpath of elasticsearch. One +simple way to do it is to create a directory under plugins (choose a +descriptive name), and place the jar / classes files there, they will be +automatically loaded. + +[float] +=== Score + +In all scripts that can be used in facets, allow to access the current +doc score using `doc.score`. + +[float] +=== Document Fields + +Most scripting revolve around the use of specific document fields data. +The `doc['field_name']` can be used to access specific field data within +a document (the document in question is usually derived by the context +the script is used). Document fields are very fast to access since they +end up being loaded into memory (all the relevant field values/tokens +are loaded to memory). + +The following data can be extracted from a field: + +[cols="<,<",options="header",] +|======================================================================= +|Expression |Description +|`doc['field_name'].value` |The native value of the field. For example, +if its a short type, it will be short. + +|`doc['field_name'].values` |The native array values of the field. For +example, if its a short type, it will be short[]. Remember, a field can +have several values within a single doc. Returns an empty array if the +field has no values. + +|`doc['field_name'].empty` |A boolean indicating if the field has no +values within the doc. + +|`doc['field_name'].multiValued` |A boolean indicating that the field +has several values within the corpus. + +|`doc['field_name'].lat` |The latitude of a geo point type. 
+ +|`doc['field_name'].lon` |The longitude of a geo point type. + +|`doc['field_name'].lats` |The latitudes of a geo point type. + +|`doc['field_name'].lons` |The longitudes of a geo point type. + +|`doc['field_name'].distance(lat, lon)` |The `plane` distance (in miles) +of this geo point field from the provided lat/lon. + +|`doc['field_name'].arcDistance(lat, lon)` |The `arc` distance (in +miles) of this geo point field from the provided lat/lon. + +|`doc['field_name'].distanceInKm(lat, lon)` |The `plane` distance (in +km) of this geo point field from the provided lat/lon. + +|`doc['field_name'].arcDistanceInKm(lat, lon)` |The `arc` distance (in +km) of this geo point field from the provided lat/lon. + +|`doc['field_name'].geohashDistance(geohash)` |The distance (in miles) +of this geo point field from the provided geohash. + +|`doc['field_name'].geohashDistanceInKm(geohash)` |The distance (in km) +of this geo point field from the provided geohash. +|======================================================================= + +[float] +=== Stored Fields + +Stored fields can also be accessed when executed a script. Note, they +are much slower to access compared with document fields, but are not +loaded into memory. They can be simply accessed using +`_fields['my_field_name'].value` or `_fields['my_field_name'].values`. + +[float] +=== Source Field + +The source field can also be accessed when executing a script. The +source field is loaded per doc, parsed, and then provided to the script +for evaluation. The `_source` forms the context under which the source +field can be accessed, for example `_source.obj2.obj1.field3`. + +[float] +=== mvel Built In Functions + +There are several built in functions that can be used within scripts. +They include: + +[cols="<,<",options="header",] +|======================================================================= +|Function |Description +|`time()` |The current time in milliseconds. + +|`sin(a)` |Returns the trigonometric sine of an angle. + +|`cos(a)` |Returns the trigonometric cosine of an angle. + +|`tan(a)` |Returns the trigonometric tangent of an angle. + +|`asin(a)` |Returns the arc sine of a value. + +|`acos(a)` |Returns the arc cosine of a value. + +|`atan(a)` |Returns the arc tangent of a value. + +|`toRadians(angdeg)` |Converts an angle measured in degrees to an +approximately equivalent angle measured in radians + +|`toDegrees(angrad)` |Converts an angle measured in radians to an +approximately equivalent angle measured in degrees. + +|`exp(a)` |Returns Euler's number _e_ raised to the power of value. + +|`log(a)` |Returns the natural logarithm (base _e_) of a value. + +|`log10(a)` |Returns the base 10 logarithm of a value. + +|`sqrt(a)` |Returns the correctly rounded positive square root of a +value. + +|`cbrt(a)` |Returns the cube root of a double value. + +|`IEEEremainder(f1, f2)` |Computes the remainder operation on two +arguments as prescribed by the IEEE 754 standard. + +|`ceil(a)` |Returns the smallest (closest to negative infinity) value +that is greater than or equal to the argument and is equal to a +mathematical integer. + +|`floor(a)` |Returns the largest (closest to positive infinity) value +that is less than or equal to the argument and is equal to a +mathematical integer. + +|`rint(a)` |Returns the value that is closest in value to the argument +and is equal to a mathematical integer. + +|`atan2(y, x)` |Returns the angle _theta_ from the conversion of +rectangular coordinates (_x_, _y_) to polar coordinates (r,_theta_). 
+ +|`pow(a, b)` |Returns the value of the first argument raised to the +power of the second argument. + +|`round(a)` |Returns the closest _int_ to the argument. + +|`random()` |Returns a random _double_ value. + +|`abs(a)` |Returns the absolute value of a value. + +|`max(a, b)` |Returns the greater of two values. + +|`min(a, b)` |Returns the smaller of two values. + +|`ulp(d)` |Returns the size of an ulp of the argument. + +|`signum(d)` |Returns the signum function of the argument. + +|`sinh(x)` |Returns the hyperbolic sine of a value. + +|`cosh(x)` |Returns the hyperbolic cosine of a value. + +|`tanh(x)` |Returns the hyperbolic tangent of a value. + +|`hypot(x, y)` |Returns sqrt(_x2_ + _y2_) without intermediate overflow +or underflow. +|======================================================================= + +[float] +=== Arithmetic precision in MVEL + +When dividing two numbers using MVEL based scripts, the engine tries to +be smart and adheres to the default behaviour of java. This means if you +divide two integers (you might have configured the fields as integer in +the mapping), the result will also be an integer. This means, if a +calculation like `1/num` is happening in your scripts and `num` is an +integer with the value of `8`, the result is `0` even though you were +expecting it to be `0.125`. You may need to enforce precision by +explicitly using a double like `1.0/num` in order to get the expected +result. diff --git a/docs/reference/modules/threadpool.asciidoc b/docs/reference/modules/threadpool.asciidoc new file mode 100644 index 00000000000..835c6267ec3 --- /dev/null +++ b/docs/reference/modules/threadpool.asciidoc @@ -0,0 +1,120 @@ +[[modules-threadpool]] +== Thread Pool + +A node holds several thread pools in order to improve how threads are +managed and memory consumption within a node. There are several thread +pools, but the important ones include: + +[horizontal] +`index`:: + For index/delete operations, defaults to `fixed` type since + `0.90.0`, size `# of available processors`. (previously type `cached`) + +`search`:: + For count/search operations, defaults to `fixed` type since + `0.90.0`, size `3x # of available processors`. (previously type + `cached`) + +`get`:: + For get operations, defaults to `fixed` type since `0.90.0`, + size `# of available processors`. (previously type `cached`) + +`bulk`:: + For bulk operations, defaults to `fixed` type since `0.90.0`, + size `# of available processors`. (previously type `cached`) + +`warmer`:: + For segment warm-up operations, defaults to `scaling` since + `0.90.0` with a `5m` keep-alive. (previously type `cached`) + +`refresh`:: + For refresh operations, defaults to `scaling` since + `0.90.0` with a `5m` keep-alive. (previously type `cached`) + +Changing a specific thread pool can be done by setting its type and +specific type parameters, for example, changing the `index` thread pool +to `blocking` type: + +[source,js] +-------------------------------------------------- +threadpool: + index: + type: blocking + min: 1 + size: 30 + wait_time: 30s +-------------------------------------------------- + +NOTE: you can update threadpool settings live using + <>. + + +[float] +=== Thread pool types + +The following are the types of thread pools that can be used and their +respective parameters: + +[float] +==== `cache` + +The `cache` thread pool is an unbounded thread pool that will spawn a +thread if there are pending requests. 
Here is an example of how to set +it: + +[source,js] +-------------------------------------------------- +threadpool: + index: + type: cached +-------------------------------------------------- + +[float] +==== `fixed` + +The `fixed` thread pool holds a fixed size of threads to handle the +requests with a queue (optionally bounded) for pending requests that +have no threads to service them. + +The `size` parameter controls the number of threads, and defaults to the +number of cores times 5. + +The `queue_size` allows to control the size of the queue of pending +requests that have no threads to execute them. By default, it is set to +`-1` which means its unbounded. When a request comes in and the queue is +full, the `reject_policy` parameter can control how it will behave. The +default, `abort`, will simply fail the request. Setting it to `caller` +will cause the request to execute on an IO thread allowing to throttle +the execution on the networking layer. + +[source,js] +-------------------------------------------------- +threadpool: + index: + type: fixed + size: 30 + queue_size: 1000 + reject_policy: caller +-------------------------------------------------- + +[float] +==== `blocking` + +The `blocking` pool allows to configure a `min` (defaults to `1`) and +`size` (defaults to the number of cores times 5) parameters for the +number of threads. + +It also has a backlog queue with a default `queue_size` of `1000`. Once +the queue is full, it will wait for the provided `wait_time` (defaults +to `60s`) on the calling IO thread, and fail if it has not been +executed. + +[source,js] +-------------------------------------------------- +threadpool: + index: + type: blocking + min: 1 + size: 30 + wait_time: 30s +-------------------------------------------------- diff --git a/docs/reference/modules/thrift.asciidoc b/docs/reference/modules/thrift.asciidoc new file mode 100644 index 00000000000..85e229fbba4 --- /dev/null +++ b/docs/reference/modules/thrift.asciidoc @@ -0,0 +1,25 @@ +[[modules-thrift]] +== Thrift + +The thrift transport module allows to expose the REST interface of +elasticsearch using thrift. Thrift should provide better performance +over http. Since thrift provides both the wire protocol and the +transport, it should make using it simpler (thought its lacking on +docs...). + +Using thrift requires installing the `transport-thrift` plugin, located +https://github.com/elasticsearch/elasticsearch-transport-thrift[here]. + +The thrift +https://github.com/elasticsearch/elasticsearch-transport-thrift/blob/master/elasticsearch.thrift[schema] +can be used to generate thrift clients. + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`thrift.port` |The port to bind to. Defaults to 9500-9600 + +|`thrift.frame` |Defaults to `-1`, which means no framing. Set to a +higher value to specify the frame size (like `15mb`). +|======================================================================= + diff --git a/docs/reference/modules/transport.asciidoc b/docs/reference/modules/transport.asciidoc new file mode 100644 index 00000000000..99579332599 --- /dev/null +++ b/docs/reference/modules/transport.asciidoc @@ -0,0 +1,43 @@ +[[modules-transport]] +== Transport + +The transport module is used for internal communication between nodes +within the cluster. 
Each call that goes from one node to the other uses +the transport module (for example, when an HTTP GET request is processed +by one node, and should actually be processed by another node that holds +the data). + +The transport mechanism is completely asynchronous in nature, meaning +that there is no blocking thread waiting for a response. The benefit of +using asynchronous communication is first solving the +http://en.wikipedia.org/wiki/C10k_problem[C10k problem], as well as +being the idle solution for scatter (broadcast) / gather operations such +as search in ElasticSearch. + +[float] +=== TCP Transport + +The TCP transport is an implementation of the transport module using +TCP. It allows for the following settings: + +[cols="<,<",options="header",] +|======================================================================= +|Setting |Description +|`transport.tcp.port` |A bind port range. Defaults to `9300-9400`. + +|`transport.tcp.connect_timeout` |The socket connect timeout setting (in +time setting format). Defaults to `2s`. + +|`transport.tcp.compress` |Set to `true` to enable compression (LZF) +between all nodes. Defaults to `false`. +|======================================================================= + +It also shares the uses the common +<>. + +[float] +=== Local Transport + +This is a handy transport to use when running integration tests within +the JVM. It is automatically enabled when using +`NodeBuilder#local(true)`. diff --git a/docs/reference/query-dsl.asciidoc b/docs/reference/query-dsl.asciidoc new file mode 100644 index 00000000000..32c1fb5cfd4 --- /dev/null +++ b/docs/reference/query-dsl.asciidoc @@ -0,0 +1,42 @@ +[[query-dsl]] += Query DSL + +[partintro] +-- +*elasticsearch* provides a full Query DSL based on JSON to define +queries. In general, there are basic queries such as +<> or +<>. There are +also compound queries like the +<> query. Queries can +also have filters associated with them such as the +<> or +<> +queries, with specific filter queries. + +Think of the Query DSL as an AST of queries. Certain queries can contain +other queries (like the +<> query), others can +contain filters (like the +<>), +and some can contain both a query and a filter (like the +<>). Each of +those can contain *any* query of the list of queries or *any* filter +from the list of filters, resulting in the ability to build quite +complex (and interesting) queries. + +Both queries and filters can be used in different APIs. For example, +within a <>, or +as a <>. This +section explains the components (queries and filters) that can form the +AST one can use. + +Filters are very handy since they perform an order of magnitude better +than plain queries since no scoring is performed and they are +automatically cached. + +-- + +include::query-dsl/queries.asciidoc[] + +include::query-dsl/filters.asciidoc[] diff --git a/docs/reference/query-dsl/filters.asciidoc b/docs/reference/query-dsl/filters.asciidoc new file mode 100644 index 00000000000..22878c3ff38 --- /dev/null +++ b/docs/reference/query-dsl/filters.asciidoc @@ -0,0 +1,104 @@ +[[query-dsl-filters]] +== Filters + +As a general rule, filters should be used instead of queries: + +* for binary yes/no searches +* for queries on exact values + +[float] +=== Filters and Caching + +Filters can be a great candidate for caching. Caching the result of a +filter does not require a lot of memory, and will cause other queries +executing against the same filter (same parameters) to be blazingly +fast. 
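For example, the following search wraps a range filter in a `filtered` query; since range filters are cached by default (see below), repeated searches reusing the same filter are answered from the cache (the `age` field is a placeholder):

[source,js]
--------------------------------------------------
{
    "filtered" : {
        "query" : {
            "match_all" : {}
        },
        "filter" : {
            "range" : {
                "age" : { "gt" : 10 }
            }
        }
    }
}
--------------------------------------------------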
+ +Some filters already produce a result that is easily cacheable, and the +difference between caching and not caching them is the act of placing +the result in the cache or not. These filters, which include the +<>, +<>, +<>, and +<> filters, are by +default cached and are recommended to use (compared to the equivalent +query version) when the same filter (same parameters) will be used +across multiple different queries (for example, a range filter with age +higher than 10). + +Other filters, usually already working with the field data loaded into +memory, are not cached by default. Those filters are already very fast, +and the process of caching them requires extra processing in order to +allow the filter result to be used with different queries than the one +executed. These filters, including the geo, +<>, +and <> filters +are not cached by default. + +The last type of filters are those working with other filters. The +<>, +<> and +<> filters are not +cached as they basically just manipulate the internal filters. + +All filters allow to set `_cache` element on them to explicitly control +caching. They also allow to set `_cache_key` which will be used as the +caching key for that filter. This can be handy when using very large +filters (like a terms filter with many elements in it). + +include::filters/and-filter.asciidoc[] + +include::filters/bool-filter.asciidoc[] + +include::filters/exists-filter.asciidoc[] + +include::filters/geo-bounding-box-filter.asciidoc[] + +include::filters/geo-distance-filter.asciidoc[] + +include::filters/geo-distance-range-filter.asciidoc[] + +include::filters/geo-polygon-filter.asciidoc[] + +include::filters/geo-shape-filter.asciidoc[] + +include::filters/geohash-cell-filter.asciidoc[] + +include::filters/has-child-filter.asciidoc[] + +include::filters/has-parent-filter.asciidoc[] + +include::filters/ids-filter.asciidoc[] + +include::filters/limit-filter.asciidoc[] + +include::filters/match-all-filter.asciidoc[] + +include::filters/missing-filter.asciidoc[] + +include::filters/nested-filter.asciidoc[] + +include::filters/not-filter.asciidoc[] + +include::filters/numeric-range-filter.asciidoc[] + +include::filters/or-filter.asciidoc[] + +include::filters/prefix-filter.asciidoc[] + +include::filters/query-filter.asciidoc[] + +include::filters/range-filter.asciidoc[] + +include::filters/regexp-filter.asciidoc[] + +include::filters/script-filter.asciidoc[] + +include::filters/term-filter.asciidoc[] + +include::filters/terms-filter.asciidoc[] + +include::filters/type-filter.asciidoc[] + + + diff --git a/docs/reference/query-dsl/filters/and-filter.asciidoc b/docs/reference/query-dsl/filters/and-filter.asciidoc new file mode 100644 index 00000000000..6f171cf9189 --- /dev/null +++ b/docs/reference/query-dsl/filters/and-filter.asciidoc @@ -0,0 +1,69 @@ +[[query-dsl-and-filter]] +=== And Filter + +A filter that matches documents using `AND` boolean operator on other +filters. Can be placed within queries that accept a filter. + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "term" : { "name.first" : "shay" } + }, + "filter" : { + "and" : [ + { + "range" : { + "postDate" : { + "from" : "2010-03-01", + "to" : "2010-04-01" + } + } + }, + { + "prefix" : { "name.second" : "ba" } + } + ] + } + } +} +-------------------------------------------------- + +[float] +==== Caching + +The result of the filter is not cached by default. The `_cache` can be +set to `true` in order to cache it (though usually not needed). 
Since +the `_cache` element requires to be set on the `and` filter itself, the +structure then changes a bit to have the filters provided within a +`filters` element: + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "term" : { "name.first" : "shay" } + }, + "filter" : { + "and" : { + "filters": [ + { + "range" : { + "postDate" : { + "from" : "2010-03-01", + "to" : "2010-04-01" + } + } + }, + { + "prefix" : { "name.second" : "ba" } + } + ], + "_cache" : true + } + } + } +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/filters/bool-filter.asciidoc b/docs/reference/query-dsl/filters/bool-filter.asciidoc new file mode 100644 index 00000000000..bf36d264f9b --- /dev/null +++ b/docs/reference/query-dsl/filters/bool-filter.asciidoc @@ -0,0 +1,49 @@ +[[query-dsl-bool-filter]] +=== Bool Filter + +A filter that matches documents matching boolean combinations of other +queries. Similar in concept to +<>, except +that the clauses are other filters. Can be placed within queries that +accept a filter. + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "queryString" : { + "default_field" : "message", + "query" : "elasticsearch" + } + }, + "filter" : { + "bool" : { + "must" : { + "term" : { "tag" : "wow" } + }, + "must_not" : { + "range" : { + "age" : { "from" : 10, "to" : 20 } + } + }, + "should" : [ + { + "term" : { "tag" : "sometag" } + }, + { + "term" : { "tag" : "sometagtag" } + } + ] + } + } + } +} +-------------------------------------------------- + +[float] +==== Caching + +The result of the `bool` filter is not cached by default (though +internal filters might be). The `_cache` can be set to `true` in order +to enable caching. diff --git a/docs/reference/query-dsl/filters/exists-filter.asciidoc b/docs/reference/query-dsl/filters/exists-filter.asciidoc new file mode 100644 index 00000000000..80e9495c7d3 --- /dev/null +++ b/docs/reference/query-dsl/filters/exists-filter.asciidoc @@ -0,0 +1,20 @@ +[[query-dsl-exists-filter]] +=== Exists Filter + +Filters documents where a specific field has a value in them. + +[source,js] +-------------------------------------------------- +{ + "constant_score" : { + "filter" : { + "exists" : { "field" : "user" } + } + } +} +-------------------------------------------------- + +[float] +==== Caching + +The result of the filter is always cached. diff --git a/docs/reference/query-dsl/filters/geo-bounding-box-filter.asciidoc b/docs/reference/query-dsl/filters/geo-bounding-box-filter.asciidoc new file mode 100644 index 00000000000..6782ad6c688 --- /dev/null +++ b/docs/reference/query-dsl/filters/geo-bounding-box-filter.asciidoc @@ -0,0 +1,208 @@ +[[query-dsl-geo-bounding-box-filter]] +=== Geo Bounding Box Filter + +A filter allowing to filter hits based on a point location using a +bounding box. 
Assuming the following indexed document: + +[source,js] +-------------------------------------------------- +{ + "pin" : { + "location" : { + "lat" : 40.12, + "lon" : -71.34 + } + } +} +-------------------------------------------------- + +Then the following simple query can be executed with a +`geo_bounding_box` filter: + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "match_all" : {} + }, + "filter" : { + "geo_bounding_box" : { + "pin.location" : { + "top_left" : { + "lat" : 40.73, + "lon" : -74.1 + }, + "bottom_right" : { + "lat" : 40.01, + "lon" : -71.12 + } + } + } + } + } +} +-------------------------------------------------- + +[float] +==== Accepted Formats + +In much the same way the geo_point type can accept different +representation of the geo point, the filter can accept it as well: + +[float] +===== Lat Lon As Properties + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "match_all" : {} + }, + "filter" : { + "geo_bounding_box" : { + "pin.location" : { + "top_left" : { + "lat" : 40.73, + "lon" : -74.1 + }, + "bottom_right" : { + "lat" : 40.01, + "lon" : -71.12 + } + } + } + } + } +} +-------------------------------------------------- + +[float] +===== Lat Lon As Array + +Format in `[lon, lat]`, note, the order of lon/lat here in order to +conform with http://geojson.org/[GeoJSON]. + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "match_all" : {} + }, + "filter" : { + "geo_bounding_box" : { + "pin.location" : { + "top_left" : [-74.1, 40.73], + "bottom_right" : [-71.12, 40.01] + } + } + } + } +} +-------------------------------------------------- + +[float] +===== Lat Lon As String + +Format in `lat,lon`. + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "match_all" : {} + }, + "filter" : { + "geo_bounding_box" : { + "pin.location" : { + "top_left" : "40.73, -74.1", + "bottom_right" : "40.01, -71.12" + } + } + } + } +} +-------------------------------------------------- + +[float] +===== Geohash + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "match_all" : {} + }, + "filter" : { + "geo_bounding_box" : { + "pin.location" : { + "top_left" : "dr5r9ydj2y73", + "bottom_right" : "drj7teegpus6" + } + } + } + } +} +-------------------------------------------------- + +[float] +==== geo_point Type + +The filter *requires* the `geo_point` type to be set on the relevant +field. + +[float] +==== Multi Location Per Document + +The filter can work with multiple locations / points per document. Once +a single location / point matches the filter, the document will be +included in the filter + +[float] +==== Type + +The type of the bounding box execution by default is set to `memory`, +which means in memory checks if the doc falls within the bounding box +range. In some cases, an `indexed` option will perform faster (but note +that the `geo_point` type must have lat and lon indexed in this case). +Note, when using the indexed option, multi locations per document field +are not supported. 
Here is an example: + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "match_all" : {} + }, + "filter" : { + "geo_bounding_box" : { + "pin.location" : { + "top_left" : { + "lat" : 40.73, + "lon" : -74.1 + }, + "bottom_right" : { + "lat" : 40.10, + "lon" : -71.12 + } + }, + "type" : "indexed" + } + } + } +} +-------------------------------------------------- + +[float] +==== Caching + +The result of the filter is not cached by default. The `_cache` can be +set to `true` to cache the *result* of the filter. This is handy when +the same bounding box parameters are used on several (many) other +queries. Note, the process of caching the first execution is higher when +caching (since it needs to satisfy different queries). diff --git a/docs/reference/query-dsl/filters/geo-distance-filter.asciidoc b/docs/reference/query-dsl/filters/geo-distance-filter.asciidoc new file mode 100644 index 00000000000..7b8370ba950 --- /dev/null +++ b/docs/reference/query-dsl/filters/geo-distance-filter.asciidoc @@ -0,0 +1,179 @@ +[[query-dsl-geo-distance-filter]] +=== Geo Distance Filter + +Filters documents that include only hits that exists within a specific +distance from a geo point. Assuming the following indexed json: + +[source,js] +-------------------------------------------------- +{ + "pin" : { + "location" : { + "lat" : 40.12, + "lon" : -71.34 + } + } +} +-------------------------------------------------- + +Then the following simple query can be executed with a `geo_distance` +filter: + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "match_all" : {} + }, + "filter" : { + "geo_distance" : { + "distance" : "200km", + "pin.location" : { + "lat" : 40, + "lon" : -70 + } + } + } + } +} +-------------------------------------------------- + +[float] +==== Accepted Formats + +In much the same way the `geo_point` type can accept different +representation of the geo point, the filter can accept it as well: + +[float] +===== Lat Lon As Properties + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "match_all" : {} + }, + "filter" : { + "geo_distance" : { + "distance" : "12km", + "pin.location" : { + "lat" : 40, + "lon" : -70 + } + } + } + } +} +-------------------------------------------------- + +[float] +===== Lat Lon As Array + +Format in `[lon, lat]`, note, the order of lon/lat here in order to +conform with http://geojson.org/[GeoJSON]. + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "match_all" : {} + }, + "filter" : { + "geo_distance" : { + "distance" : "12km", + "pin.location" : [40, -70] + } + } + } +} +-------------------------------------------------- + +[float] +===== Lat Lon As String + +Format in `lat,lon`. 
+
+[source,js]
+--------------------------------------------------
+{
+    "filtered" : {
+        "query" : {
+            "match_all" : {}
+        },
+        "filter" : {
+            "geo_distance" : {
+                "distance" : "12km",
+                "pin.location" : "40,-70"
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+[float]
+===== Geohash
+
+[source,js]
+--------------------------------------------------
+{
+    "filtered" : {
+        "query" : {
+            "match_all" : {}
+        },
+        "filter" : {
+            "geo_distance" : {
+                "distance" : "12km",
+                "pin.location" : "drm3btev3e86"
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+[float]
+==== Options
+
+The following options are allowed on the filter:
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Option |Description
+|`distance` |The distance to include hits in the filter. The distance
+can be a numeric value combined with a `unit` setting (either
+`mi`/`miles` or `km`) that controls the unit, or a single string that
+includes the unit.
+
+|`distance_type` |How to compute the distance. Can either be `arc`
+(better precision) or `plane` (faster). Defaults to `arc`.
+
+|`optimize_bbox` |Whether a bounding box check is used first as an
+optimization. Defaults to `memory`, which performs in-memory checks.
+Can also be set to `indexed` to use an indexed value check (make sure
+the `geo_point` type indexes lat and lon in this case), or `none`,
+which disables the bounding box optimization.
+|=======================================================================
+
+[float]
+==== geo_point Type
+
+The filter *requires* the `geo_point` type to be set on the relevant
+field.
+
+[float]
+==== Multi Location Per Document
+
+The `geo_distance` filter can work with multiple locations / points per
+document. Once a single location / point matches the filter, the
+document will be included in the filter.
+
+[float]
+==== Caching
+
+The result of the filter is not cached by default. The `_cache` can be
+set to `true` to cache the *result* of the filter. This is handy when
+the same point and distance parameters are used on several (many) other
+queries. Note, the process of caching the first execution is higher when
+caching (since it needs to satisfy different queries).
diff --git a/docs/reference/query-dsl/filters/geo-distance-range-filter.asciidoc b/docs/reference/query-dsl/filters/geo-distance-range-filter.asciidoc
new file mode 100644
index 00000000000..1bc4197e5b3
--- /dev/null
+++ b/docs/reference/query-dsl/filters/geo-distance-range-filter.asciidoc
@@ -0,0 +1,30 @@
+[[query-dsl-geo-distance-range-filter]]
+=== Geo Distance Range Filter
+
+Filters documents that exist within a given range of distances from a
+specific point:
+
+[source,js]
+--------------------------------------------------
+{
+    "filtered" : {
+        "query" : {
+            "match_all" : {}
+        },
+        "filter" : {
+            "geo_distance_range" : {
+                "from" : "200km",
+                "to" : "400km",
+                "pin.location" : {
+                    "lat" : 40,
+                    "lon" : -70
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+Supports the same point location parameter as the
+<>
+filter. It also supports the common range parameters (`lt`, `lte`, `gt`,
+`gte`, `from`, `to`, `include_upper` and `include_lower`).
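+
+For instance, a minimal sketch using the shorthand range parameters
+listed above (the values here are hypothetical; any of `gt`, `gte`,
+`lt`, `lte` may be combined) would match points at least 200km but less
+than 400km away from the given point:
+
+[source,js]
+--------------------------------------------------
+{
+    "filtered" : {
+        "query" : {
+            "match_all" : {}
+        },
+        "filter" : {
+            "geo_distance_range" : {
+                "gte" : "200km",
+                "lt" : "400km",
+                "pin.location" : {
+                    "lat" : 40,
+                    "lon" : -70
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------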
diff --git a/docs/reference/query-dsl/filters/geo-polygon-filter.asciidoc b/docs/reference/query-dsl/filters/geo-polygon-filter.asciidoc new file mode 100644 index 00000000000..a4212343eff --- /dev/null +++ b/docs/reference/query-dsl/filters/geo-polygon-filter.asciidoc @@ -0,0 +1,126 @@ +[[query-dsl-geo-polygon-filter]] +=== Geo Polygon Filter + +A filter allowing to include hits that only fall within a polygon of +points. Here is an example: + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "match_all" : {} + }, + "filter" : { + "geo_polygon" : { + "person.location" : { + "points" : [ + {"lat" : 40, "lon" : -70}, + {"lat" : 30, "lon" : -80}, + {"lat" : 20, "lon" : -90} + ] + } + } + } + } +} +-------------------------------------------------- + +[float] +==== Allowed Formats + +[float] +===== Lat Long as Array + +Format in `[lon, lat]`, note, the order of lon/lat here in order to +conform with http://geojson.org/[GeoJSON]. + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "match_all" : {} + }, + "filter" : { + "geo_polygon" : { + "person.location" : { + "points" : [ + [-70, 40], + [-80, 30], + [-90, 20] + ] + } + } + } + } +} +-------------------------------------------------- + +[float] +===== Lat Lon as String + +Format in `lat,lon`. + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "match_all" : {} + }, + "filter" : { + "geo_polygon" : { + "person.location" : { + "points" : [ + "40, -70", + "30, -80", + "20, -90" + ] + } + } + } + } +} +-------------------------------------------------- + +[float] +===== Geohash + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "match_all" : {} + }, + "filter" : { + "geo_polygon" : { + "person.location" : { + "points" : [ + "drn5x1g8cu2y", + "30, -80", + "20, -90" + ] + } + } + } + } +} +-------------------------------------------------- + +[float] +==== geo_point Type + +The filter *requires* the +<> type to be +set on the relevant field. + +[float] +==== Caching + +The result of the filter is not cached by default. The `_cache` can be +set to `true` to cache the *result* of the filter. This is handy when +the same points parameters are used on several (many) other queries. +Note, the process of caching the first execution is higher when caching +(since it needs to satisfy different queries). diff --git a/docs/reference/query-dsl/filters/geo-shape-filter.asciidoc b/docs/reference/query-dsl/filters/geo-shape-filter.asciidoc new file mode 100644 index 00000000000..34d773c4876 --- /dev/null +++ b/docs/reference/query-dsl/filters/geo-shape-filter.asciidoc @@ -0,0 +1,137 @@ +[[query-dsl-geo-shape-filter]] +=== GeoShape Filter + +Filter documents indexed using the `geo_shape` type. + +Requires the <>. + +You may also use the +<>. + +The `geo_shape` Filter uses the same grid square representation as the +geo_shape mapping to find documents that have a shape that intersects +with the query shape. It will also use the same PrefixTree configuration +as defined for the field mapping. + +[float] +==== Filter Format + +The Filter supports two ways of defining the Filter shape, either by +providing a whole shape defintion, or by referencing the name of a shape +pre-indexed in another index. Both formats are defined below with +examples. 
+ +[float] +===== Provided Shape Definition + +Similar to the `geo_shape` type, the `geo_shape` Filter uses +http://www.geojson.org[GeoJSON] to represent shapes. + +Given a document that looks like this: + +[source,js] +-------------------------------------------------- +{ + "name": "Wind & Wetter, Berlin, Germany", + "location": { + "type": "Point", + "coordinates": [13.400544, 52.530286] + } +} +-------------------------------------------------- + +The following query will find the point using the Elasticsearch's +`envelope` GeoJSON extension: + +[source,js] +-------------------------------------------------- +{ + "query":{ + "filtered": { + "query": { + "match_all": {} + }, + "filter": { + "geo_shape": { + "location": { + "shape": { + "type": "envelope", + "coordinates" : [[13.0, 53.0], [14.0, 52.0]] + } + } + } + } + } + } +} +-------------------------------------------------- + +[float] +===== Pre-Indexed Shape + +The Filter also supports using a shape which has already been indexed in +another index and/or index type. This is particularly useful for when +you have a pre-defined list of shapes which are useful to your +application and you want to reference this using a logical name (for +example 'New Zealand') rather than having to provide their coordinates +each time. In this situation it is only necessary to provide: + +* `id` - The ID of the document that containing the pre-indexed shape. +* `index` - Name of the index where the pre-indexed shape is. Defaults +to 'shapes'. +* `type` - Index type where the pre-indexed shape is. +* `shape_field_name` - Name of the field in the document containing the +pre-indexed shape. Defaults to 'shape'. + +The following is an example of using the Filter with a pre-indexed +shape: + +[source,js] +-------------------------------------------------- +{ + "filtered": { + "query": { + "match_all": {} + }, + "filter": { + "geo_shape": { + "location": { + "indexed_shape": { + "id": "DEU", + "type": "countries", + "index": "shapes", + "shape_field_name": "location" + } + } + } + } + } +} +-------------------------------------------------- + +[float] +==== Caching + +The result of the Filter is not cached by default. Setting `_cache` to +`true` will mean the results of the Filter will be cached. Since shapes +can contain 10s-100s of coordinates and any one differing means a new +shape, it may make sense to only using caching when you are sure that +the shapes will remain reasonably static. + +[float] +==== Compatibility with older versions + +Elasticsearch 0.90 changed the geo_shape implementation in a way that is +not compatible. Prior to this version, there was a required `relation` +field on queries and filter queries that indicated the relation of the +query shape to the indexed shapes. Support for this was implemented in +Elasticsearch and was poorly aligned with the underlying Lucene +implementation, which has no notion of a relation. From 0.90, this field +defaults to its only supported value: `intersects`. The other values of +`contains`, `within`, `disjoint` are no longer supported. By using e.g. +a bool filter, one can easily emulate `disjoint`. Given the imprecise +accuracy (see +<>), +`within` and `contains` were always somewhat problematic and +`intersects` is generally good enough. 
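+
+As a rough sketch of the `disjoint` emulation mentioned above (reusing
+the `location` field and `envelope` shape from the earlier example; not
+an official recipe), documents whose shape does not intersect the query
+shape can be selected by negating the `geo_shape` filter inside a bool
+filter:
+
+[source,js]
+--------------------------------------------------
+{
+    "filtered" : {
+        "query" : {
+            "match_all" : {}
+        },
+        "filter" : {
+            "bool" : {
+                "must_not" : {
+                    "geo_shape" : {
+                        "location" : {
+                            "shape" : {
+                                "type" : "envelope",
+                                "coordinates" : [[13.0, 53.0], [14.0, 52.0]]
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------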
diff --git a/docs/reference/query-dsl/filters/geohash-cell-filter.asciidoc b/docs/reference/query-dsl/filters/geohash-cell-filter.asciidoc
new file mode 100644
index 00000000000..2d9c256f17b
--- /dev/null
+++ b/docs/reference/query-dsl/filters/geohash-cell-filter.asciidoc
@@ -0,0 +1,62 @@
+[[query-dsl-geohash-cell-filter]]
+=== Geohash Cell Filter
+
+A geohash is a hierarchical data structure which subdivides a spatial
+geometry. Each geohash defines a cell on the earth's surface, and the
+longer the geohash, the smaller the cell. The size of a cell can
+therefore be seen as its precision. The geohash cell filter provides
+access to this data structure by defining a cell and matching only
+points that lie within that cell.
+
+For this filter to work, all prefixes of a geohash need to be indexed.
+For example, the geohash `u30` needs to be decomposed into three terms:
+`u30`, `u3` and `u`. This decomposition must be enabled in the mapping
+of the `geo_point` field that's going to be filtered, by setting the
+`geohash_prefix` option:
+
+[source,js]
+--------------------------------------------------
+{
+    "mappings" : {
+        "location": {
+            "properties": {
+                "pin": {
+                    "type": "geo_point",
+                    "geohash": true,
+                    "geohash_prefix": true
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+The geohash cell can be defined by all formats of `geo_points`. If such
+a cell is defined by a latitude and longitude pair, the size of the
+cell needs to be set up. This can be done with the `precision`
+parameter of the filter, which can be set to an integer value defining
+the length of the geohash prefix. Instead of setting a geohash length
+directly, it is also possible to define the precision as a distance,
+for example `"precision": "50m"`. The `neighbors` option of the filter
+offers the possibility to also filter cells next to the given cell.
+
+[source,js]
+--------------------------------------------------
+{
+    "filtered" : {
+        "query" : {
+            "match_all" : {}
+        },
+        "filter" : {
+            "geohash_cell": {
+                "pin": {
+                    "lat": 13.4080,
+                    "lon": 52.5186
+                },
+                "precision": 3,
+                "neighbors": true
+            }
+        }
+    }
+}
+--------------------------------------------------
diff --git a/docs/reference/query-dsl/filters/has-child-filter.asciidoc b/docs/reference/query-dsl/filters/has-child-filter.asciidoc
new file mode 100644
index 00000000000..0bfb21b0b40
--- /dev/null
+++ b/docs/reference/query-dsl/filters/has-child-filter.asciidoc
@@ -0,0 +1,50 @@
+[[query-dsl-has-child-filter]]
+=== Has Child Filter
+
+The `has_child` filter accepts a query and the child type to run
+against, and results in parent documents that have child docs matching
+the query. Here is an example:
+
+[source,js]
+--------------------------------------------------
+{
+    "has_child" : {
+        "type" : "blog_tag",
+        "query" : {
+            "term" : {
+                "tag" : "something"
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+The `type` is the child type to query against. The parent type to return
+is automatically detected based on the mappings.
+
+The filter is implemented by first running the child query, then
+mapping each matching child document up to its parent document.
+ +The `has_child` filter also accepts a filter instead of a query: + +[source,js] +-------------------------------------------------- +{ + "has_child" : { + "type" : "comment", + "filter" : { + "term" : { + "user" : "john" + } + } + } +} +-------------------------------------------------- + +[float] +==== Memory Considerations + +With the current implementation, all `_id` values are loaded to memory +(heap) in order to support fast lookups, so make sure there is enough +memory for it. diff --git a/docs/reference/query-dsl/filters/has-parent-filter.asciidoc b/docs/reference/query-dsl/filters/has-parent-filter.asciidoc new file mode 100644 index 00000000000..c307035f89b --- /dev/null +++ b/docs/reference/query-dsl/filters/has-parent-filter.asciidoc @@ -0,0 +1,56 @@ +[[query-dsl-has-parent-filter]] +=== Has Parent Filter + +The `has_parent` filter accepts a query and a parent type. The query is +executed in the parent document space, which is specified by the parent +type. This filter return child documents which associated parents have +matched. For the rest `has_parent` filter has the same options and works +in the same manner as the `has_child` filter. + +The `has_parent` filter is available from version `0.19.10`. This is an +experimental filter. + +[float] +==== Filter example + +[source,js] +-------------------------------------------------- +{ + "has_parent" : { + "parent_type" : "blog", + "query" : { + "term" : { + "tag" : "something" + } + } + } +} +-------------------------------------------------- + +The `parent_type` field name can also be abbreviated to `type`. + +The way that the filter is implemented is by first running the parent +query, doing the matching up to the child doc for each document matched. + +The `has_parent` filter also accepts a filter instead of a query: + +[source,js] +-------------------------------------------------- +{ + "has_parent" : { + "type" : "blog", + "filter" : { + "term" : { + "text" : "bonsai three" + } + } + } +} +-------------------------------------------------- + +[float] +==== Memory considerations + +With the current implementation, all `_id` values are loaded to memory +(heap) in order to support fast lookups, so make sure there is enough +memory for it. diff --git a/docs/reference/query-dsl/filters/ids-filter.asciidoc b/docs/reference/query-dsl/filters/ids-filter.asciidoc new file mode 100644 index 00000000000..303fffb9236 --- /dev/null +++ b/docs/reference/query-dsl/filters/ids-filter.asciidoc @@ -0,0 +1,20 @@ +[[query-dsl-ids-filter]] +=== Ids Filter + +Filters documents that only have the provided ids. Note, this filter +does not require the <> +field to be indexed since it works using the +<> field. + +[source,js] +-------------------------------------------------- +{ + "ids" : { + "type" : "my_type", + "values" : ["1", "4", "100"] + } +} +-------------------------------------------------- + +The `type` is optional and can be omitted, and can also accept an array +of values. diff --git a/docs/reference/query-dsl/filters/limit-filter.asciidoc b/docs/reference/query-dsl/filters/limit-filter.asciidoc new file mode 100644 index 00000000000..a590c2567f7 --- /dev/null +++ b/docs/reference/query-dsl/filters/limit-filter.asciidoc @@ -0,0 +1,19 @@ +[[query-dsl-limit-filter]] +=== Limit Filter + +A limit filter limits the number of documents (per shard) to execute on. 
+For example: + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "filter" : { + "limit" : {"value" : 100} + }, + "query" : { + "term" : { "name.first" : "shay" } + } + } +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/filters/match-all-filter.asciidoc b/docs/reference/query-dsl/filters/match-all-filter.asciidoc new file mode 100644 index 00000000000..97adbd1fb2a --- /dev/null +++ b/docs/reference/query-dsl/filters/match-all-filter.asciidoc @@ -0,0 +1,15 @@ +[[query-dsl-match-all-filter]] +=== Match All Filter + +A filter that matches on all documents: + +[source,js] +-------------------------------------------------- +{ + "constant_score" : { + "filter" : { + "match_all" : { } + } + } +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/filters/missing-filter.asciidoc b/docs/reference/query-dsl/filters/missing-filter.asciidoc new file mode 100644 index 00000000000..70685bd128c --- /dev/null +++ b/docs/reference/query-dsl/filters/missing-filter.asciidoc @@ -0,0 +1,41 @@ +[[query-dsl-missing-filter]] +=== Missing Filter + +Filters documents where a specific field has no value in them. + +[source,js] +-------------------------------------------------- +{ + "constant_score" : { + "filter" : { + "missing" : { "field" : "user" } + } + } +} +-------------------------------------------------- + +By default, the filter will only find "missing" fields, i.e., fields +that have no values. It can be configured also to find fields with an +explicit `null_value` mapped for them. Here is an example that will both +find missing field that don't exists (`existence` set to `true`), or +have null values (`null_value` set to `true`). + +[source,js] +-------------------------------------------------- +{ + "constant_score" : { + "filter" : { + "missing" : { + "field" : "user", + "existence" : true, + "null_value" : true + } + } + } +} +-------------------------------------------------- + +[float] +==== Caching + +The result of the filter is always cached. diff --git a/docs/reference/query-dsl/filters/nested-filter.asciidoc b/docs/reference/query-dsl/filters/nested-filter.asciidoc new file mode 100644 index 00000000000..83c6928337b --- /dev/null +++ b/docs/reference/query-dsl/filters/nested-filter.asciidoc @@ -0,0 +1,35 @@ +[[query-dsl-nested-filter]] +=== Nested Filter + +A `nested` filter, works in a similar fashion to the +<> query, except +used as a filter. It follows exactly the same structure, but also allows +to cache the results (set `_cache` to `true`), and have it named (set +the `_name` value). For example: + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { "match_all" : {} }, + "filter" : { + "nested" : { + "path" : "obj1", + "query" : { + "bool" : { + "must" : [ + { + "match" : {"obj1.name" : "blue"} + }, + { + "range" : {"obj1.count" : {"gt" : 5}} + } + ] + } + }, + "_cache" : true + } + } + } +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/filters/not-filter.asciidoc b/docs/reference/query-dsl/filters/not-filter.asciidoc new file mode 100644 index 00000000000..629cb178499 --- /dev/null +++ b/docs/reference/query-dsl/filters/not-filter.asciidoc @@ -0,0 +1,82 @@ +[[query-dsl-not-filter]] +=== Not Filter + +A filter that filters out matched documents using a query. Can be placed +within queries that accept a filter. 
+ +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "term" : { "name.first" : "shay" } + }, + "filter" : { + "not" : { + "range" : { + "postDate" : { + "from" : "2010-03-01", + "to" : "2010-04-01" + } + } + } + } + } +} +-------------------------------------------------- + +Or, in a longer form with a `filter` element: + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "term" : { "name.first" : "shay" } + }, + "filter" : { + "not" : { + "filter" : { + "range" : { + "postDate" : { + "from" : "2010-03-01", + "to" : "2010-04-01" + } + } + } + } + } + } +} +-------------------------------------------------- + +[float] +==== Caching + +The result of the filter is not cached by default. The `_cache` can be +set to `true` in order to cache it (though usually not needed). Here is +an example: + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "term" : { "name.first" : "shay" } + }, + "filter" : { + "not" : { + "filter" : { + "range" : { + "postDate" : { + "from" : "2010-03-01", + "to" : "2010-04-01" + } + } + }, + "_cache" : true + } + } + } +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/filters/numeric-range-filter.asciidoc b/docs/reference/query-dsl/filters/numeric-range-filter.asciidoc new file mode 100644 index 00000000000..41705f8ca75 --- /dev/null +++ b/docs/reference/query-dsl/filters/numeric-range-filter.asciidoc @@ -0,0 +1,72 @@ +[[query-dsl-numeric-range-filter]] +=== Numeric Range Filter + +Filters documents with fields that have values within a certain numeric +range. Similar to +<>, except +that it works only with numeric values, and the filter execution works +differently. + +[source,js] +-------------------------------------------------- +{ + "constant_score" : { + "filter" : { + "numeric_range" : { + "age" : { + "from" : "10", + "to" : "20", + "include_lower" : true, + "include_upper" : false + } + } + } + } +} +-------------------------------------------------- + +The numeric range filter works by loading all the relevant field values +into memory, and checking for the relevant docs if they satisfy the +range requirements. This requires more memory since the numeric range +data are loaded to memory, but can provide a significant increase in +performance. Note, if the relevant field values have already been loaded +to memory, for example because it was used in facets or was sorted on, +then this filter should be used. + +The `numeric_range` filter top level parameters include: + +[cols="<,<",options="header",] +|======================================================================= +|Name |Description +|`from` |The lower bound. Defaults to start from the first. + +|`to` |The upper bound. Defaults to unbounded. + +|`include_lower` |Should the first from (if set) be inclusive or not. +Defaults to `true` + +|`include_upper` |Should the last to (if set) be inclusive or not. +Defaults to `true`. + +|`gt` |Same as setting `from` and `include_lower` to `false`. + +|`gte` |Same as setting `from` and `include_lower` to `true`. + +|`lt` |Same as setting `to` and `include_upper` to `false`. + +|`lte` |Same as setting `to` and `include_upper` to `true`. +|======================================================================= + +[float] +==== Caching + +The result of the filter is not cached by default. The `_cache` can be +set to `true` to cache the *result* of the filter. 
This is handy when +the same points parameters are used on several (many) other queries. +Note, the process of caching the first execution is higher when caching +(since it needs to satisfy different queries). + +If caching the *result* of the filter is desired (for example, using the +same "teen" filter with ages between 10 and 20), then it is advisable to +simply use the <> +filter. diff --git a/docs/reference/query-dsl/filters/or-filter.asciidoc b/docs/reference/query-dsl/filters/or-filter.asciidoc new file mode 100644 index 00000000000..9c68cb97dca --- /dev/null +++ b/docs/reference/query-dsl/filters/or-filter.asciidoc @@ -0,0 +1,59 @@ +[[query-dsl-or-filter]] +=== Or Filter + +A filter that matches documents using `OR` boolean operator on other +queries. Can be placed within queries that accept a filter. + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "term" : { "name.first" : "shay" } + }, + "filter" : { + "or" : [ + { + "term" : { "name.second" : "banon" } + }, + { + "term" : { "name.nick" : "kimchy" } + } + ] + } + } +} +-------------------------------------------------- + +[float] +==== Caching + +The result of the filter is not cached by default. The `_cache` can be +set to `true` in order to cache it (though usually not needed). Since +the `_cache` element requires to be set on the `or` filter itself, the +structure then changes a bit to have the filters provided within a +`filters` element: + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "term" : { "name.first" : "shay" } + }, + "filter" : { + "or" : { + "filters" : [ + { + "term" : { "name.second" : "banon" } + }, + { + "term" : { "name.nick" : "kimchy" } + } + ], + "_cache" : true + } + } + } +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/filters/prefix-filter.asciidoc b/docs/reference/query-dsl/filters/prefix-filter.asciidoc new file mode 100644 index 00000000000..d29f5705f11 --- /dev/null +++ b/docs/reference/query-dsl/filters/prefix-filter.asciidoc @@ -0,0 +1,37 @@ +[[query-dsl-prefix-filter]] +=== Prefix Filter + +Filters documents that have fields containing terms with a specified +prefix (*not analyzed*). Similar to phrase query, except that it acts as +a filter. Can be placed within queries that accept a filter. + +[source,js] +-------------------------------------------------- +{ + "constant_score" : { + "filter" : { + "prefix" : { "user" : "ki" } + } + } +} +-------------------------------------------------- + +[float] +==== Caching + +The result of the filter is cached by default. The `_cache` can be set +to `false` in order not to cache it. Here is an example: + +[source,js] +-------------------------------------------------- +{ + "constant_score" : { + "filter" : { + "prefix" : { + "user" : "ki", + "_cache" : false + } + } + } +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/filters/query-filter.asciidoc b/docs/reference/query-dsl/filters/query-filter.asciidoc new file mode 100644 index 00000000000..8cd3858fdf7 --- /dev/null +++ b/docs/reference/query-dsl/filters/query-filter.asciidoc @@ -0,0 +1,50 @@ +[[query-dsl-query-filter]] +=== Query Filter + +Wraps any query to be used as a filter. Can be placed within queries +that accept a filter. 
+ +[source,js] +-------------------------------------------------- +{ + "constantScore" : { + "filter" : { + "query" : { + "query_string" : { + "query" : "this AND that OR thus" + } + } + } + } +} +-------------------------------------------------- + +[float] +==== Caching + +The result of the filter is not cached by default. The `_cache` can be +set to `true` to cache the *result* of the filter. This is handy when +the same query is used on several (many) other queries. Note, the +process of caching the first execution is higher when not caching (since +it needs to satisfy different queries). + +Setting the `_cache` element requires a different format for the +`query`: + +[source,js] +-------------------------------------------------- +{ + "constantScore" : { + "filter" : { + "fquery" : { + "query" : { + "query_string" : { + "query" : "this AND that OR thus" + } + }, + "_cache" : true + } + } + } +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/filters/range-filter.asciidoc b/docs/reference/query-dsl/filters/range-filter.asciidoc new file mode 100644 index 00000000000..a96462d1ea5 --- /dev/null +++ b/docs/reference/query-dsl/filters/range-filter.asciidoc @@ -0,0 +1,59 @@ +[[query-dsl-range-filter]] +=== Range Filter + +Filters documents with fields that have terms within a certain range. +Similar to <>, except that it acts as a filter. Can be placed within queries +that accept a filter. + +[source,js] +-------------------------------------------------- +{ + "constant_score" : { + "filter" : { + "range" : { + "age" : { + "from" : "10", + "to" : "20", + "include_lower" : true, + "include_upper" : false + } + } + } + } +} +-------------------------------------------------- + +The `range` filter top level parameters include: + +[cols="<,<",options="header",] +|======================================================================= +|Name |Description +|`from` |The lower bound. Defaults to start from the first. + +|`to` |The upper bound. Defaults to unbounded. + +|`include_lower` |Should the first from (if set) be inclusive or not. +Defaults to `true` + +|`include_upper` |Should the last to (if set) be inclusive or not. +Defaults to `true`. + +|`gt` |Same as setting `from` to the value, and `include_lower` to +`false`. + +|`gte` |Same as setting `from` to the value, and `include_lower` to +`true`. + +|`lt` |Same as setting `to` to the value, and `include_upper` to +`false`. + +|`lte` |Same as setting `to` to the value, and `include_upper` to +`true`. +|======================================================================= + +[float] +==== Caching + +The result of the filter is automatically cached by default. The +`_cache` can be set to `false` to turn it off. diff --git a/docs/reference/query-dsl/filters/regexp-filter.asciidoc b/docs/reference/query-dsl/filters/regexp-filter.asciidoc new file mode 100644 index 00000000000..e0578984d34 --- /dev/null +++ b/docs/reference/query-dsl/filters/regexp-filter.asciidoc @@ -0,0 +1,51 @@ +[[query-dsl-regexp-filter]] +=== Regexp Filter + +The `regexp` filter is similar to the +<> query, except +that it is cacheable and can speedup performance in case you are reusing +this filter in your queries. 
+
+[source,js]
+--------------------------------------------------
+{
+    "filtered": {
+        "query": {
+            "match_all": {}
+        },
+        "filter": {
+            "regexp":{
+                "name.first" : "s.*y"
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+You can also select the cache name and use the same regexp flags in the
+filter as in the query.
+
+*Note*: You have to enable caching explicitly in order to have the
+`regexp` filter cached.
+
+[source,js]
+--------------------------------------------------
+{
+    "filtered": {
+        "query": {
+            "match_all": {}
+        },
+        "filter": {
+            "regexp":{
+                "name.first" : {
+                    "value" : "s.*y",
+                    "flags" : "INTERSECTION|COMPLEMENT|EMPTY"
+                },
+                "_name":"test",
+                "_cache" : true,
+                "_cache_key" : "key"
+            }
+        }
+    }
+}
+--------------------------------------------------
diff --git a/docs/reference/query-dsl/filters/script-filter.asciidoc b/docs/reference/query-dsl/filters/script-filter.asciidoc
new file mode 100644
index 00000000000..faebca02f18
--- /dev/null
+++ b/docs/reference/query-dsl/filters/script-filter.asciidoc
@@ -0,0 +1,53 @@
+[[query-dsl-script-filter]]
+=== Script Filter
+
+A filter allowing to define
+<> as filters. For
+example:
+
+[source,js]
+----------------------------------------------
+"filtered" : {
+    "query" : {
+        ...
+    },
+    "filter" : {
+        "script" : {
+            "script" : "doc['num1'].value > 1"
+        }
+    }
+}
+----------------------------------------------
+
+[float]
+==== Custom Parameters
+
+Scripts are compiled and cached for faster execution. If the same script
+can be used, just with different parameters provided, it is preferable
+to use the ability to pass parameters to the script itself, for example:
+
+[source,js]
+----------------------------------------------
+"filtered" : {
+    "query" : {
+        ...
+    },
+    "filter" : {
+        "script" : {
+            "script" : "doc['num1'].value > param1",
+            "params" : {
+                "param1" : 5
+            }
+        }
+    }
+}
+----------------------------------------------
+
+[float]
+==== Caching
+
+The result of the filter is not cached by default. The `_cache` can be
+set to `true` to cache the *result* of the filter. This is handy when
+the same script and parameters are used on several (many) other queries.
+Note, the process of caching the first execution is higher when caching
+(since it needs to satisfy different queries).
diff --git a/docs/reference/query-dsl/filters/term-filter.asciidoc b/docs/reference/query-dsl/filters/term-filter.asciidoc
new file mode 100644
index 00000000000..09cd32d4ef1
--- /dev/null
+++ b/docs/reference/query-dsl/filters/term-filter.asciidoc
@@ -0,0 +1,38 @@
+[[query-dsl-term-filter]]
+=== Term Filter
+
+Filters documents that have fields that contain a term (*not analyzed*).
+Similar to <>,
+except that it acts as a filter. Can be placed within queries that
+accept a filter, for example:
+
+[source,js]
+--------------------------------------------------
+{
+    "constant_score" : {
+        "filter" : {
+            "term" : { "user" : "kimchy"}
+        }
+    }
+}
+--------------------------------------------------
+
+[float]
+==== Caching
+
+The result of the filter is automatically cached by default. The
+`_cache` can be set to `false` to turn it off.
Here is an example: + +[source,js] +-------------------------------------------------- +{ + "constant_score" : { + "filter" : { + "term" : { + "user" : "kimchy", + "_cache" : false + } + } + } +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/filters/terms-filter.asciidoc b/docs/reference/query-dsl/filters/terms-filter.asciidoc new file mode 100644 index 00000000000..56a6f3d49ca --- /dev/null +++ b/docs/reference/query-dsl/filters/terms-filter.asciidoc @@ -0,0 +1,220 @@ +[[query-dsl-terms-filter]] +=== Terms Filter + +Filters documents that have fields that match any of the provided terms +(*not analyzed*). For example: + +[source,js] +-------------------------------------------------- +{ + "constant_score" : { + "filter" : { + "terms" : { "user" : ["kimchy", "elasticsearch"]} + } + } +} +-------------------------------------------------- + +The `terms` filter is also aliased with `in` as the filter name for +simpler usage. + +[float] +==== Execution Mode + +The way terms filter executes is by iterating over the terms provided +and finding matches docs (loading into a bitset) and caching it. +Sometimes, we want a different execution model that can still be +achieved by building more complex queries in the DSL, but we can support +them in the more compact model that terms filter provides. + +The `execution` option now has the following options : + +[horizontal] +`plain`:: + The default. Works as today. Iterates over all the terms, + building a bit set matching it, and filtering. The total filter is + cached. + +`bool`:: + Generates a term filter (which is cached) for each term, and + wraps those in a bool filter. The bool filter itself is not cached as it + can operate very quickly on the cached term filters. + +`and`:: + Generates a term filter (which is cached) for each term, and + wraps those in an and filter. The and filter itself is not cached. + +`or`:: + Generates a term filter (which is cached) for each term, and + wraps those in an or filter. The or filter itself is not cached. + Generally, the `bool` execution mode should be preferred. + +If you don't want the generated individual term queries to be cached, +you can use: `bool_nocache`, `and_nocache` or `or_nocache` instead, but +be aware that this will affect performance. + +The "total" terms filter caching can still be explicitly controlled +using the `_cache` option. Note the default value for it depends on the +execution value. + +For example: + +[source,js] +-------------------------------------------------- +{ + "constant_score" : { + "filter" : { + "terms" : { + "user" : ["kimchy", "elasticsearch"], + "execution" : "bool", + "_cache": true + } + } + } +} +-------------------------------------------------- + +[float] +==== Caching + +The result of the filter is automatically cached by default. The +`_cache` can be set to `false` to turn it off. + +[float] +==== Terms lookup mechanism + +When it's needed to specify a `terms` filter with a lot of terms it can +be beneficial to fetch those term values from a document in an index. A +concrete example would be to filter tweets tweeted by your followers. +Potentially the amount of user ids specified in the terms filter can be +a lot. In this scenario it makes sense to use the terms filter's terms +lookup mechanism. + +The terms lookup mechanism is supported from version `0.90.0.Beta1`. + +The terms lookup mechanism supports the following options: + +[horizontal] +`index`:: + The index to fetch the term values from. 
Defaults to the + current index. + +`type`:: + The type to fetch the term values from. + +`id`:: + The id of the document to fetch the term values from. + +`path`:: + The field specified as path to fetch the actual values for the + `terms` filter. + +`routing`:: + A custom routing value to be used when retrieving the + external terms doc. + +`cache`:: + Whether to cache the filter built from the retrieved document + (`true` - default) or whether to fetch and rebuild the filter on every + request (`false`). See "<>" below + +The values for the `terms` filter will be fetched from a field in a +document with the specified id in the specified type and index. +Internally a get request is executed to fetch the values from the +specified path. At the moment for this feature to work the `_source` +needs to be stored. + +Also, consider using an index with a single shard and fully replicated +across all nodes if the "reference" terms data is not large. The lookup +terms filter will prefer to execute the get request on a local node if +possible, reducing the need for networking. + +["float",id="query-dsl-terms-filter-lookup-caching"] +==== Terms lookup caching + +There is an additional cache involved, which caches the lookup of the +lookup document to the actual terms. This lookup cache is a LRU cache. +This cache has the following options: + +`indices.cache.filter.terms.size`:: + The size of the lookup cache. The default is `10mb`. + +`indices.cache.filter.terms.expire_after_access`:: + The time after the last read an entry should expire. Disabled by default. + +`indices.cache.filter.terms.expire_after_write`: + The time after the last write an entry should expire. Disabled by default. + +All options for the lookup of the documents cache can only be configured +via the `elasticsearch.yml` file. + +When using the terms lookup the `execution` option isn't taken into +account and behaves as if the execution mode was set to `plain`. + +[float] +==== Terms lookup twitter example + +[source,js] +-------------------------------------------------- +# index the information for user with id 2, specifically, its followers +curl -XPUT localhost:9200/users/user/2 -d '{ + "followers" : ["1", "3"] +}' + +# index a tweet, from user with id 2 +curl -XPUT localhost:9200/tweets/tweet/1 -d '{ + "user" : "2" +}' + +# search on all the tweets that match the followers of user 2 +curl -XGET localhost:9200/tweets/_search -d '{ + "query" : { + "filtered" : { + "filter" : { + "terms" : { + "user" : { + "index" : "users", + "type" : "user", + "id" : "2", + "path" : "followers" + }, + "_cache_key" : "user_2_friends" + } + } + } + } +}' +-------------------------------------------------- + +The above is highly optimized, both in a sense that the list of +followers will not be fetched if the filter is already cached in the +filter cache, and with internal LRU cache for fetching external values +for the terms filter. Also, the entry in the filter cache will not hold +`all` the terms reducing the memory required for it. + +`_cache_key` is recommended to be set, so its simple to clear the cache +associated with it using the clear cache API. 
For example: + +[source,js] +-------------------------------------------------- +curl -XPOST 'localhost:9200/tweets/_cache/clear?filter_keys=user_2_friends' +-------------------------------------------------- + +The structure of the external terms document can also include array of +inner objects, for example: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/users/user/2 -d '{ + "followers" : [ + { + "id" : "1" + }, + { + "id" : "2" + } + ] +}' +-------------------------------------------------- + +In which case, the lookup path will be `followers.id`. diff --git a/docs/reference/query-dsl/filters/type-filter.asciidoc b/docs/reference/query-dsl/filters/type-filter.asciidoc new file mode 100644 index 00000000000..07bde382622 --- /dev/null +++ b/docs/reference/query-dsl/filters/type-filter.asciidoc @@ -0,0 +1,15 @@ +[[query-dsl-type-filter]] +=== Type Filter + +Filters documents matching the provided document / mapping type. Note, +this filter can work even when the `_type` field is not indexed (using +the <> field). + +[source,js] +-------------------------------------------------- +{ + "type" : { + "value" : "my_type" + } +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/queries.asciidoc b/docs/reference/query-dsl/queries.asciidoc new file mode 100644 index 00000000000..3fb508376b1 --- /dev/null +++ b/docs/reference/query-dsl/queries.asciidoc @@ -0,0 +1,90 @@ +[[query-dsl-queries]] +== Queries + +As a general rule, queries should be used instead of filters: + +* for full text search +* where the result depends on a relevance score + +include::queries/match-query.asciidoc[] + +include::queries/multi-match-query.asciidoc[] + +include::queries/bool-query.asciidoc[] + +include::queries/boosting-query.asciidoc[] + +include::queries/common-terms-query.asciidoc[] + +include::queries/custom-filters-score-query.asciidoc[] + +include::queries/ids-query.asciidoc[] + +include::queries/custom-score-query.asciidoc[] + +include::queries/custom-boost-factor-query.asciidoc[] + +include::queries/constant-score-query.asciidoc[] + +include::queries/dis-max-query.asciidoc[] + +include::queries/field-query.asciidoc[] + +include::queries/filtered-query.asciidoc[] + +include::queries/flt-query.asciidoc[] + +include::queries/flt-field-query.asciidoc[] + +include::queries/fuzzy-query.asciidoc[] + +include::queries/geo-shape-query.asciidoc[] + +include::queries/has-child-query.asciidoc[] + +include::queries/has-parent-query.asciidoc[] + +include::queries/indices-query.asciidoc[] + +include::queries/match-all-query.asciidoc[] + +include::queries/mlt-query.asciidoc[] + +include::queries/mlt-field-query.asciidoc[] + +include::queries/nested-query.asciidoc[] + +include::queries/prefix-query.asciidoc[] + +include::queries/query-string-query.asciidoc[] + +include::queries/range-query.asciidoc[] + +include::queries/regexp-query.asciidoc[] + +include::queries/span-first-query.asciidoc[] + +include::queries/span-multi-term-query.asciidoc[] + +include::queries/span-near-query.asciidoc[] + +include::queries/span-not-query.asciidoc[] + +include::queries/span-or-query.asciidoc[] + +include::queries/span-term-query.asciidoc[] + +include::queries/term-query.asciidoc[] + +include::queries/terms-query.asciidoc[] + +include::queries/top-children-query.asciidoc[] + +include::queries/wildcard-query.asciidoc[] + +include::queries/text-query.asciidoc[] + +include::queries/minimum-should-match.asciidoc[] + 
+include::queries/multi-term-rewrite.asciidoc[] + diff --git a/docs/reference/query-dsl/queries/bool-query.asciidoc b/docs/reference/query-dsl/queries/bool-query.asciidoc new file mode 100644 index 00000000000..a9b565c80ab --- /dev/null +++ b/docs/reference/query-dsl/queries/bool-query.asciidoc @@ -0,0 +1,54 @@ +[[query-dsl-bool-query]] +=== Bool Query + +A query that matches documents matching boolean combinations of other +queries. The bool query maps to Lucene `BooleanQuery`. It is built using +one or more boolean clauses, each clause with a typed occurrence. The +occurrence types are: + +[cols="<,<",options="header",] +|======================================================================= +|Occur |Description +|`must` |The clause (query) must appear in matching documents. + +|`should` |The clause (query) should appear in the matching document. In +a boolean query with no `must` clauses, one or more `should` clauses +must match a document. The minimum number of should clauses to match can +be set using the +<> +parameter. + +|`must_not` |The clause (query) must not appear in the matching +documents. +|======================================================================= + +The bool query also supports `disable_coord` parameter (defaults to +`false`). Basically the coord similarity computes a score factor based +on the fraction of all query terms that a document contains. See Lucene +`BooleanQuery` for more details. + +[source,js] +-------------------------------------------------- +{ + "bool" : { + "must" : { + "term" : { "user" : "kimchy" } + }, + "must_not" : { + "range" : { + "age" : { "from" : 10, "to" : 20 } + } + }, + "should" : [ + { + "term" : { "tag" : "wow" } + }, + { + "term" : { "tag" : "elasticsearch" } + } + ], + "minimum_should_match" : 1, + "boost" : 1.0 + } +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/queries/boosting-query.asciidoc b/docs/reference/query-dsl/queries/boosting-query.asciidoc new file mode 100644 index 00000000000..969b3bbedfe --- /dev/null +++ b/docs/reference/query-dsl/queries/boosting-query.asciidoc @@ -0,0 +1,26 @@ +[[query-dsl-boosting-query]] +=== Boosting Query + +The `boosting` query can be used to effectively demote results that +match a given query. Unlike the "NOT" clause in bool query, this still +selects documents that contain undesirable terms, but reduces their +overall score. + +[source,js] +-------------------------------------------------- +{ + "boosting" : { + "positive" : { + "term" : { + "field1" : "value1" + } + }, + "negative" : { + "term" : { + "field2" : "value2" + } + }, + "negative_boost" : 0.2 + } +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/queries/common-terms-query.asciidoc b/docs/reference/query-dsl/queries/common-terms-query.asciidoc new file mode 100644 index 00000000000..256d9bb74af --- /dev/null +++ b/docs/reference/query-dsl/queries/common-terms-query.asciidoc @@ -0,0 +1,263 @@ +[[query-dsl-common-terms-query]] +=== Common Terms Query + +The `common` terms query is a modern alternative to stopwords which +improves the precision and recall of search results (by taking stopwords +into account), without sacrificing performance. + +[float] +==== The problem + +Every term in a query has a cost. A search for `"The brown fox"` +requires three term queries, one for each of `"the"`, `"brown"` and +`"fox"`, all of which are executed against all documents in the index. 
+The query for `"the"` is likely to match many documents and thus has a +much smaller impact on relevance than the other two terms. + +Previously, the solution to this problem was to ignore terms with high +frequency. By treating `"the"` as a _stopword_, we reduce the index size +and reduce the number of term queries that need to be executed. + +The problem with this approach is that, while stopwords have a small +impact on relevance, they are still important. If we remove stopwords, +we lose precision, (eg we are unable to distinguish between `"happy"` +and `"not happy"`) and we lose recall (eg text like `"The The"` or +`"To be or not to be"` would simply not exist in the index). + +[float] +==== The solution + +The `common` terms query divides the query terms into two groups: more +important (ie _low frequency_ terms) and less important (ie _high +frequency_ terms which would previously have been stopwords). + +First it searches for documents which match the more important terms. +These are the terms which appear in fewer documents and have a greater +impact on relevance. + +Then, it executes a second query for the less important terms -- terms +which appear frequently and have a low impact on relevance. But instead +of calculating the relevance score for *all* matching documents, it only +calculates the `_score` for documents already matched by the first +query. In this way the high frequency terms can improve the relevance +calculation without paying the cost of poor performance. + +If a query consists only of high frequency terms, then a single query is +executed as an `AND` (conjunction) query, in other words all terms are +required. Even though each individual term will match many documents, +the combination of terms narrows down the resultset to only the most +relevant. The single query can also be executed as an `OR` with a +specific +<>, +in this case a high enough value should probably be used. + +Terms are allocated to the high or low frequency groups based on the +`cutoff_frequency`, which can be specified as an absolute frequency +(`>=1`) or as a relative frequency (`0.0 .. 1.0`). + +Perhaps the most interesting property of this query is that it adapts to +domain specific stopwords automatically. For example, on a video hosting +site, common terms like `"clip"` or `"video"` will automatically behave +as stopwords without the need to maintain a manual list. + +[float] +==== Examples + +In this example, words that have a document frequency greater than 0.1% +(eg `"this"` and `"is"`) will be treated as _common terms_. + +[source,js] +-------------------------------------------------- +{ + "common": { + "body": { + "query": "this is bonsai cool", + "cutoff_frequency": 0.001 + } + } +} +-------------------------------------------------- + +The number of terms which should match can be controlled with the +<> +(`high_freq`, `low_freq`), `low_freq_operator` (default `"or"`) and +`high_freq_operator` (default `"or"`) parameters. 
+
+For low frequency terms, set the `low_freq_operator` to `"and"` to make
+all terms required:
+
+[source,js]
+--------------------------------------------------
+{
+  "common": {
+    "body": {
+      "query": "nelly the elephant as a cartoon",
+      "cutoff_frequency": 0.001,
+      "low_freq_operator": "and"
+    }
+  }
+}
+--------------------------------------------------
+
+which is roughly equivalent to:
+
+[source,js]
+--------------------------------------------------
+{
+  "bool": {
+    "must": [
+      { "term": { "body": "nelly"}},
+      { "term": { "body": "elephant"}},
+      { "term": { "body": "cartoon"}}
+    ],
+    "should": [
+      { "term": { "body": "the"}},
+      { "term": { "body": "as"}},
+      { "term": { "body": "a"}}
+    ]
+  }
+}
+--------------------------------------------------
+
+Alternatively use
+<>
+to specify a minimum number or percentage of low frequency terms which
+must be present, for instance:
+
+[source,js]
+--------------------------------------------------
+{
+  "common": {
+    "body": {
+      "query": "nelly the elephant as a cartoon",
+      "cutoff_frequency": 0.001,
+      "minimum_should_match": 2
+    }
+  }
+}
+--------------------------------------------------
+
+which is roughly equivalent to:
+
+[source,js]
+--------------------------------------------------
+{
+  "bool": {
+    "must": {
+      "bool": {
+        "should": [
+          { "term": { "body": "nelly"}},
+          { "term": { "body": "elephant"}},
+          { "term": { "body": "cartoon"}}
+        ],
+        "minimum_should_match": 2
+      }
+    },
+    "should": [
+      { "term": { "body": "the"}},
+      { "term": { "body": "as"}},
+      { "term": { "body": "a"}}
+    ]
+  }
+}
+--------------------------------------------------
+
+A different
+<>
+can be applied for low and high frequency terms with the additional
+`low_freq` and `high_freq` parameters. Here is an example when providing
+additional parameters (note the change in structure):
+
+[source,js]
+--------------------------------------------------
+{
+  "common": {
+    "body": {
+      "query": "nelly the elephant not as a cartoon",
+      "cutoff_frequency": 0.001,
+      "minimum_should_match": {
+        "low_freq" : 2,
+        "high_freq" : 3
+      }
+    }
+  }
+}
+--------------------------------------------------
+
+which is roughly equivalent to:
+
+[source,js]
+--------------------------------------------------
+{
+  "bool": {
+    "must": {
+      "bool": {
+        "should": [
+          { "term": { "body": "nelly"}},
+          { "term": { "body": "elephant"}},
+          { "term": { "body": "cartoon"}}
+        ],
+        "minimum_should_match": 2
+      }
+    },
+    "should": {
+      "bool": {
+        "should": [
+          { "term": { "body": "the"}},
+          { "term": { "body": "not"}},
+          { "term": { "body": "as"}},
+          { "term": { "body": "a"}}
+        ],
+        "minimum_should_match": 3
+      }
+    }
+  }
+}
+--------------------------------------------------
+
+In this case it means the high frequency terms have only an impact on
+relevance when there are at least three of them.
But the most +interesting use of the +<> +for high frequency terms is when there are only high frequency terms: + +[source,js] +-------------------------------------------------- +{ + "common": { + "body": { + "query": "how not to be", + "cutoff_frequency": 0.001, + "minimum_should_match": { + "low_freq" : 2, + "high_freq" : 3 + } + } + } +} +-------------------------------------------------- + +which is roughly equivalent to: + +[source,js] +-------------------------------------------------- +{ + "bool": { + "should": [ + { "term": { "body": "how"}}, + { "term": { "body": "not"}}, + { "term": { "body": "to"}}, + { "term": { "body": "be"}} + ], + "minimum_should_match": "3<50%" + } +} +-------------------------------------------------- + +The high frequency generated query is then slightly less restrictive +than with an `AND`. + +The `common` terms query also supports `boost`, `analyzer` and +`disable_coord` as parameters. diff --git a/docs/reference/query-dsl/queries/constant-score-query.asciidoc b/docs/reference/query-dsl/queries/constant-score-query.asciidoc new file mode 100644 index 00000000000..06ed6f767cb --- /dev/null +++ b/docs/reference/query-dsl/queries/constant-score-query.asciidoc @@ -0,0 +1,36 @@ +[[query-dsl-constant-score-query]] +=== Constant Score Query + +A query that wraps a filter or another query and simply returns a +constant score equal to the query boost for every document in the +filter. Maps to Lucene `ConstantScoreQuery`. + +[source,js] +-------------------------------------------------- +{ + "constant_score" : { + "filter" : { + "term" : { "user" : "kimchy"} + }, + "boost" : 1.2 + } +} +-------------------------------------------------- + +The filter object can hold only filter elements, not queries. Filters +can be much faster compared to queries since they don't perform any +scoring, especially when they are cached. + +A query can also be wrapped in a `constant_score` query: + +[source,js] +-------------------------------------------------- +{ + "constant_score" : { + "query" : { + "term" : { "user" : "kimchy"} + }, + "boost" : 1.2 + } +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/queries/custom-boost-factor-query.asciidoc b/docs/reference/query-dsl/queries/custom-boost-factor-query.asciidoc new file mode 100644 index 00000000000..9d5cf9c428a --- /dev/null +++ b/docs/reference/query-dsl/queries/custom-boost-factor-query.asciidoc @@ -0,0 +1,17 @@ +[[query-dsl-custom-boost-factor-query]] +=== Custom Boost Factor Query + +`custom_boost_factor` query allows to wrap another query and multiply +its score by the provided `boost_factor`. This can sometimes be desired +since `boost` value set on specific queries gets normalized, while this +query boost factor does not. + +[source,js] +-------------------------------------------------- +"custom_boost_factor" : { + "query" : { + .... 
+ }, + "boost_factor" : 5.2 +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/queries/custom-filters-score-query.asciidoc b/docs/reference/query-dsl/queries/custom-filters-score-query.asciidoc new file mode 100644 index 00000000000..9cd2b0fc30d --- /dev/null +++ b/docs/reference/query-dsl/queries/custom-filters-score-query.asciidoc @@ -0,0 +1,53 @@ +[[query-dsl-custom-filters-score-query]] +=== Custom Filters Score Query + +A `custom_filters_score` query allows to execute a query, and if the hit +matches a provided filter (ordered), use either a boost or a script +associated with it to compute the score. Here is an example: + +[source,js] +-------------------------------------------------- +{ + "custom_filters_score" : { + "query" : { + "match_all" : {} + }, + "filters" : [ + { + "filter" : { "range" : { "age" : {"from" : 0, "to" : 10} } }, + "boost" : "3" + }, + { + "filter" : { "range" : { "age" : {"from" : 10, "to" : 20} } }, + "boost" : "2" + } + ], + "score_mode" : "first" + } +} +-------------------------------------------------- + +This can considerably simplify and increase performance for +parameterized based scoring since filters are easily cached for faster +performance, and boosting / script is considerably simpler. + +[float] +==== Score Mode + +A `score_mode` can be defined to control how multiple matching filters +control the score. By default, it is set to `first` which means the +first matching filter will control the score of the result. It can also +be set to `min`/@max@/@total@/@avg@/@multiply@ which will aggregate the +result from all matching filters based on the aggregation type. + +[float] +==== max_boost + +An option to cap the boost value computed. + +[float] +==== Script + +A `script` can be used instead of `boost` for more complex score +calculations. With optional `params` and `lang` (on the same level as +`query` and `filters`). diff --git a/docs/reference/query-dsl/queries/custom-score-query.asciidoc b/docs/reference/query-dsl/queries/custom-score-query.asciidoc new file mode 100644 index 00000000000..5ea0e15202f --- /dev/null +++ b/docs/reference/query-dsl/queries/custom-score-query.asciidoc @@ -0,0 +1,43 @@ +[[query-dsl-custom-score-query]] +=== Custom Score Query + +`custom_score` query allows to wrap another query and customize the +scoring of it optionally with a computation derived from other field +values in the doc (numeric ones) using +<>. Here is +a simple sample: + +[source,js] +-------------------------------------------------- +"custom_score" : { + "query" : { + .... + }, + "script" : "_score * doc['my_numeric_field'].value" +} +-------------------------------------------------- + +On top of the different scripting field values and expression, the +`_score` script parameter can be used to retrieve the score based on the +wrapped query. + +[float] +==== Script Parameters + +Scripts are cached for faster execution. If the script has parameters +that it needs to take into account, it is preferable to use the same +script, and provide parameters to it: + +[source,js] +-------------------------------------------------- +"custom_score" : { + "query" : { + .... 
+ }, + "params" : { + "param1" : 2, + "param2" : 3.1 + }, + "script" : "_score * doc['my_numeric_field'].value / pow(param1, param2)" +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/queries/dis-max-query.asciidoc b/docs/reference/query-dsl/queries/dis-max-query.asciidoc new file mode 100644 index 00000000000..2938c8db8ea --- /dev/null +++ b/docs/reference/query-dsl/queries/dis-max-query.asciidoc @@ -0,0 +1,44 @@ +[[query-dsl-dis-max-query]] +=== Dis Max Query + +A query that generates the union of documents produced by its +subqueries, and that scores each document with the maximum score for +that document as produced by any subquery, plus a tie breaking increment +for any additional matching subqueries. + +This is useful when searching for a word in multiple fields with +different boost factors (so that the fields cannot be combined +equivalently into a single search field). We want the primary score to +be the one associated with the highest boost, not the sum of the field +scores (as Boolean Query would give). If the query is "albino elephant" +this ensures that "albino" matching one field and "elephant" matching +another gets a higher score than "albino" matching both fields. To get +this result, use both Boolean Query and DisjunctionMax Query: for each +term a DisjunctionMaxQuery searches for it in each field, while the set +of these DisjunctionMaxQuery's is combined into a BooleanQuery. + +The tie breaker capability allows results that include the same term in +multiple fields to be judged better than results that include this term +in only the best of those multiple fields, without confusing this with +the better case of two different terms in the multiple fields.The +default `tie_breaker` is `0.0`. + +This query maps to Lucene `DisjunctionMaxQuery`. + +[source,js] +-------------------------------------------------- +{ + "dis_max" : { + "tie_breaker" : 0.7, + "boost" : 1.2, + "queries" : [ + { + "term" : { "age" : 34 } + }, + { + "term" : { "age" : 35 } + } + ] + } +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/queries/field-query.asciidoc b/docs/reference/query-dsl/queries/field-query.asciidoc new file mode 100644 index 00000000000..7affdb21b33 --- /dev/null +++ b/docs/reference/query-dsl/queries/field-query.asciidoc @@ -0,0 +1,33 @@ +[[query-dsl-field-query]] +=== Field Query + +A query that executes a query string against a specific field. It is a +simplified version of +<> +query (by setting the `default_field` to the field this query executed +against). 
In its simplest form: + +[source,js] +-------------------------------------------------- +{ + "field" : { + "name.first" : "+something -else" + } +} +-------------------------------------------------- + +Most of the `query_string` parameters are allowed with the `field` query +as well, in such a case, the query should be formatted as follows: + +[source,js] +-------------------------------------------------- +{ + "field" : { + "name.first" : { + "query" : "+something -else", + "boost" : 2.0, + "enable_position_increments": false + } + } +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/queries/filtered-query.asciidoc b/docs/reference/query-dsl/queries/filtered-query.asciidoc new file mode 100644 index 00000000000..bf51e9c0710 --- /dev/null +++ b/docs/reference/query-dsl/queries/filtered-query.asciidoc @@ -0,0 +1,25 @@ +[[query-dsl-filtered-query]] +=== Filtered Query + +A query that applies a filter to the results of another query. This +query maps to Lucene `FilteredQuery`. + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "term" : { "tag" : "wow" } + }, + "filter" : { + "range" : { + "age" : { "from" : 10, "to" : 20 } + } + } + } +} +-------------------------------------------------- + +The filter object can hold only filter elements, not queries. Filters +can be much faster compared to queries since they don't perform any +scoring, especially when they are cached. diff --git a/docs/reference/query-dsl/queries/flt-field-query.asciidoc b/docs/reference/query-dsl/queries/flt-field-query.asciidoc new file mode 100644 index 00000000000..734983c3389 --- /dev/null +++ b/docs/reference/query-dsl/queries/flt-field-query.asciidoc @@ -0,0 +1,47 @@ +[[query-dsl-flt-field-query]] +=== Fuzzy Like This Field Query + +The `fuzzy_like_this_field` query is the same as the `fuzzy_like_this` +query, except that it runs against a single field. It provides nicer +query DSL over the generic `fuzzy_like_this` query, and support typed +fields query (automatically wraps typed fields with type filter to match +only on the specific type). + +[source,js] +-------------------------------------------------- +{ + "fuzzy_like_this_field" : { + "name.first" : { + "like_text" : "text like this one", + "max_query_terms" : 12 + } + } +} +-------------------------------------------------- + +`fuzzy_like_this_field` can be shortened to `flt_field`. + +The `fuzzy_like_this_field` top level parameters include: + +[cols="<,<",options="header",] +|======================================================================= +|Parameter |Description +|`like_text` |The text to find documents like it, *required*. + +|`ignore_tf` |Should term frequency be ignored. Defaults to `false`. + +|`max_query_terms` |The maximum number of query terms that will be +included in any generated query. Defaults to `25`. + +|`min_similarity` |The minimum similarity of the term variants. Defaults +to `0.5`. + +|`prefix_length` |Length of required common prefix on variant terms. +Defaults to `0`. + +|`boost` |Sets the boost value of the query. Defaults to `1.0`. + +|`analyzer` |The analyzer that will be used to analyze the text. +Defaults to the analyzer associated with the field. 
+|======================================================================= + diff --git a/docs/reference/query-dsl/queries/flt-query.asciidoc b/docs/reference/query-dsl/queries/flt-query.asciidoc new file mode 100644 index 00000000000..beb49ea9367 --- /dev/null +++ b/docs/reference/query-dsl/queries/flt-query.asciidoc @@ -0,0 +1,65 @@ +[[query-dsl-flt-query]] +=== Fuzzy Like This Query + +Fuzzy like this query find documents that are "like" provided text by +running it against one or more fields. + +[source,js] +-------------------------------------------------- +{ + "fuzzy_like_this" : { + "fields" : ["name.first", "name.last"], + "like_text" : "text like this one", + "max_query_terms" : 12 + } +} +-------------------------------------------------- + +`fuzzy_like_this` can be shortened to `flt`. + +The `fuzzy_like_this` top level parameters include: + +[cols="<,<",options="header",] +|======================================================================= +|Parameter |Description +|`fields` |A list of the fields to run the more like this query against. +Defaults to the `_all` field. + +|`like_text` |The text to find documents like it, *required*. + +|`ignore_tf` |Should term frequency be ignored. Defaults to `false`. + +|`max_query_terms` |The maximum number of query terms that will be +included in any generated query. Defaults to `25`. + +|`min_similarity` |The minimum similarity of the term variants. Defaults +to `0.5`. + +|`prefix_length` |Length of required common prefix on variant terms. +Defaults to `0`. + +|`boost` |Sets the boost value of the query. Defaults to `1.0`. + +|`analyzer` |The analyzer that will be used to analyze the text. +Defaults to the analyzer associated with the field. +|======================================================================= + +[float] +==== How it Works + +Fuzzifies ALL terms provided as strings and then picks the best n +differentiating terms. In effect this mixes the behaviour of FuzzyQuery +and MoreLikeThis but with special consideration of fuzzy scoring +factors. This generally produces good results for queries where users +may provide details in a number of fields and have no knowledge of +boolean query syntax and also want a degree of fuzzy matching and a fast +query. + +For each source term the fuzzy variants are held in a BooleanQuery with +no coord factor (because we are not looking for matches on multiple +variants in any one doc). Additionally, a specialized TermQuery is used +for variants and does not use that variant term's IDF because this would +favor rarer terms, such as misspellings. Instead, all variants use the +same IDF ranking (the one for the source query term) and this is +factored into the variant's boost. If the source query term does not +exist in the index the average IDF of the variants is used. diff --git a/docs/reference/query-dsl/queries/fuzzy-query.asciidoc b/docs/reference/query-dsl/queries/fuzzy-query.asciidoc new file mode 100644 index 00000000000..e8100610763 --- /dev/null +++ b/docs/reference/query-dsl/queries/fuzzy-query.asciidoc @@ -0,0 +1,79 @@ +[[query-dsl-fuzzy-query]] +=== Fuzzy Query + +A fuzzy based query that uses similarity based on Levenshtein (edit +distance) algorithm. + +Warning: this query is not very scalable with its default prefix length +of 0 - in this case, *every* term will be enumerated and cause an edit +score calculation or `max_expansions` is not set. 
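+
+For instance, one way to bound the expansion the warning refers to is to
+combine `prefix_length` and `max_expansions`, both described in this
+section (the field, value and numbers below are purely illustrative):
+
+[source,js]
+--------------------------------------------------
+{
+    "fuzzy" : {
+        "user" : {
+            "value" : "ki",
+            "prefix_length" : 1,
+            "max_expansions" : 50
+        }
+    }
+}
+--------------------------------------------------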
+ +Here is a simple example: + +[source,js] +-------------------------------------------------- +{ + "fuzzy" : { "user" : "ki" } +} +-------------------------------------------------- + +More complex settings can be set (the values here are the default +values): + +[source,js] +-------------------------------------------------- + { + "fuzzy" : { + "user" : { + "value" : "ki", + "boost" : 1.0, + "min_similarity" : 0.5, + "prefix_length" : 0 + } + } + } +-------------------------------------------------- + +The `max_expansions` parameter (unbounded by default) controls the +number of terms the fuzzy query will expand to. + +[float] +==== Numeric / Date Fuzzy + +`fuzzy` query on a numeric field will result in a range query "around" +the value using the `min_similarity` value. For example: + +[source,js] +-------------------------------------------------- +{ + "fuzzy" : { + "price" : { + "value" : 12, + "min_similarity" : 2 + } + } +} +-------------------------------------------------- + +Will result in a range query between 10 and 14. Same applies to dates, +with support for time format for the `min_similarity` field: + +[source,js] +-------------------------------------------------- +{ + "fuzzy" : { + "created" : { + "value" : "2010-02-05T12:05:07", + "min_similarity" : "1d" + } + } +} +-------------------------------------------------- + +In the mapping, numeric and date types now allow to configure a +`fuzzy_factor` mapping value (defaults to 1), which will be used to +multiply the fuzzy value by it when used in a `query_string` type query. +For example, for dates, a fuzzy factor of "1d" will result in +multiplying whatever fuzzy value provided in the min_similarity by it. +Note, this is explicitly supported since query_string query only allowed +for similarity valued between 0.0 and 1.0. diff --git a/docs/reference/query-dsl/queries/geo-shape-query.asciidoc b/docs/reference/query-dsl/queries/geo-shape-query.asciidoc new file mode 100644 index 00000000000..ea5f2c3fce8 --- /dev/null +++ b/docs/reference/query-dsl/queries/geo-shape-query.asciidoc @@ -0,0 +1,66 @@ +[[query-dsl-geo-shape-query]] +=== GeoShape Query + +Query version of the +<>. + +Requires the <>. + +Given a document that looks like this: + +[source,js] +-------------------------------------------------- +{ + "name": "Wind & Wetter, Berlin, Germany", + "location": { + "type": "Point", + "coordinates": [13.400544, 52.530286] + } +} +-------------------------------------------------- + +The following query will find the point: + +[source,js] +-------------------------------------------------- +{ + "query": { + "geo_shape": { + "location": { + "shape": { + "type": "envelope", + "coordinates": [[13, 53],[14, 52]] + } + } + } + } +} +-------------------------------------------------- + +See the Filter's documentation for more information. + +[float] +==== Relevancy and Score + +Currently Elasticsearch does not have any notion of geo shape relevancy, +consequently the Query internally uses a `constant_score` Query which +wraps a <>. + +[float] +==== Compatibility with older versions + +Elasticsearch 0.90 changed the geo_shape implementation in a way that is +not compatible. Prior to this version, there was a required `relation` +field on queries and filter queries that indicated the relation of the +query shape to the indexed shapes. Support for this was implemented in +Elasticsearch and was poorly aligned with the underlying Lucene +implementation, which has no notion of a relation. 
From 0.90, this field +defaults to its only supported value: `intersects`. The other values of +`contains`, `within`, `disjoint` are no longer supported. By using e.g. +a bool filter, one can easily emulate `disjoint`. Given the imprecise +accuracy (see +<>), +`within` and `contains` were always somewhat problematic and +`intersects` is generally good enough. diff --git a/docs/reference/query-dsl/queries/has-child-query.asciidoc b/docs/reference/query-dsl/queries/has-child-query.asciidoc new file mode 100644 index 00000000000..1327d855700 --- /dev/null +++ b/docs/reference/query-dsl/queries/has-child-query.asciidoc @@ -0,0 +1,85 @@ +[[query-dsl-has-child-query]] +=== Has Child Query + +The `has_child` query works the same as the +<> filter, +by automatically wrapping the filter with a +<> +(when using the default score type). It has the same syntax as the +<> filter: + +[source,js] +-------------------------------------------------- +{ + "has_child" : { + "type" : "blog_tag", + "query" : { + "term" : { + "tag" : "something" + } + } + } +} +-------------------------------------------------- + +An important difference with the `top_children` query is that this query +is always executed in two iterations whereas the `top_children` query +can be executed in one or more iteration. When using the `has_child` +query the `total_hits` is always correct. + +[float] +==== Scoring capabilities + +The `has_child` also has scoring support from version `0.20.2`. The +supported score types are `max`, `sum`, `avg` or `none`. The default is +`none` and yields the same behaviour as in previous versions. If the +score type is set to another value than `none`, the scores of all the +matching child documents are aggregated into the associated parent +documents. The score type can be specified with the `score_type` field +inside the `has_child` query: + +[source,js] +-------------------------------------------------- +{ + "has_child" : { + "type" : "blog_tag", + "score_type" : "sum", + "query" : { + "term" : { + "tag" : "something" + } + } + } +} +-------------------------------------------------- + +[float] +==== Scope + +The `_scope` support has been removed from version `0.90.beta1`. See: +https://github.com/elasticsearch/elasticsearch/issues/2606 + +A `_scope` can be defined on the filter allowing to run facets on the +same scope name that will work against the child documents. For example: + +[source,js] +-------------------------------------------------- +{ + "has_child" : { + "_scope" : "my_scope", + "type" : "blog_tag", + "query" : { + "term" : { + "tag" : "something" + } + } + } +} +-------------------------------------------------- + +[float] +==== Memory Considerations + +With the current implementation, all `_id` values are loaded to memory +(heap) in order to support fast lookups, so make sure there is enough +memory for it. diff --git a/docs/reference/query-dsl/queries/has-parent-query.asciidoc b/docs/reference/query-dsl/queries/has-parent-query.asciidoc new file mode 100644 index 00000000000..3d92049bc9f --- /dev/null +++ b/docs/reference/query-dsl/queries/has-parent-query.asciidoc @@ -0,0 +1,83 @@ +[[query-dsl-has-parent-query]] +=== Has Parent Query + +The `has_parent` query works the same as the +<> +filter, by automatically wrapping the filter with a constant_score (when +using the default score type). It has the same syntax as the +<> +filter. This query is experimental and is available from version +`0.19.10`. 
+ +[source,js] +-------------------------------------------------- +{ + "has_parent" : { + "parent_type" : "blog", + "query" : { + "term" : { + "tag" : "something" + } + } + } +} +-------------------------------------------------- + +[float] +==== Scoring capabilities + +The `has_parent` also has scoring support from version `0.20.2`. The +supported score types are `score` or `none`. The default is `none` and +this ignores the score from the parent document. The score is in this +case equal to the boost on the `has_parent` query (Defaults to 1). If +the score type is set to `score`, then the score of the matching parent +document is aggregated into the child documents belonging to the +matching parent document. The score type can be specified with the +`score_type` field inside the `has_parent` query: + +[source,js] +-------------------------------------------------- +{ + "has_parent" : { + "parent_type" : "blog", + "score_type" : "score", + "query" : { + "term" : { + "tag" : "something" + } + } + } +} +-------------------------------------------------- + +[float] +==== Scope + +The `_scope` support has been removed from version `0.90.beta1`. See: +https://github.com/elasticsearch/elasticsearch/issues/2606 + +A `_scope` can be defined on the filter allowing to run facets on the +same scope name that will work against the parent documents. For +example: + +[source,js] +-------------------------------------------------- +{ + "has_parent" : { + "_scope" : "my_scope", + "parent_type" : "blog", + "query" : { + "term" : { + "tag" : "something" + } + } + } +} +-------------------------------------------------- + +[float] +==== Memory Considerations + +With the current implementation, all `_id` values are loaded to memory +(heap) in order to support fast lookups, so make sure there is enough +memory for it. diff --git a/docs/reference/query-dsl/queries/ids-query.asciidoc b/docs/reference/query-dsl/queries/ids-query.asciidoc new file mode 100644 index 00000000000..8de62d7a3e9 --- /dev/null +++ b/docs/reference/query-dsl/queries/ids-query.asciidoc @@ -0,0 +1,20 @@ +[[query-dsl-ids-query]] +=== Ids Query + +Filters documents that only have the provided ids. Note, this filter +does not require the <> +field to be indexed since it works using the +<> field. + +[source,js] +-------------------------------------------------- +{ + "ids" : { + "type" : "my_type", + "values" : ["1", "4", "100"] + } +} +-------------------------------------------------- + +The `type` is optional and can be omitted, and can also accept an array +of values. diff --git a/docs/reference/query-dsl/queries/indices-query.asciidoc b/docs/reference/query-dsl/queries/indices-query.asciidoc new file mode 100644 index 00000000000..8ef16a6c6b7 --- /dev/null +++ b/docs/reference/query-dsl/queries/indices-query.asciidoc @@ -0,0 +1,25 @@ +[[query-dsl-indices-query]] +=== Indices Query + +The `indices` query can be used when executed across multiple indices, +allowing to have a query that executes only when executed on an index +that matches a specific list of indices, and another query that executes +when it is executed on an index that does not match the listed indices. 
+
+[source,js]
+--------------------------------------------------
+{
+    "indices" : {
+        "indices" : ["index1", "index2"],
+        "query" : {
+            "term" : { "tag" : "wow" }
+        },
+        "no_match_query" : {
+            "term" : { "tag" : "kow" }
+        }
+    }
+}
+--------------------------------------------------
+
+`no_match_query` can also have a "string" value of `none` (to match no
+documents), and `all` (to match all).
diff --git a/docs/reference/query-dsl/queries/match-all-query.asciidoc b/docs/reference/query-dsl/queries/match-all-query.asciidoc
new file mode 100644
index 00000000000..2ea3d410bfd
--- /dev/null
+++ b/docs/reference/query-dsl/queries/match-all-query.asciidoc
@@ -0,0 +1,20 @@
+[[query-dsl-match-all-query]]
+=== Match All Query
+
+A query that matches all documents. Maps to Lucene `MatchAllDocsQuery`.
+
+[source,js]
+--------------------------------------------------
+{
+    "match_all" : { }
+}
+--------------------------------------------------
+
+A boost can also be associated with it:
+
+[source,js]
+--------------------------------------------------
+{
+    "match_all" : { "boost" : 1.2 }
+}
+--------------------------------------------------
diff --git a/docs/reference/query-dsl/queries/match-query.asciidoc b/docs/reference/query-dsl/queries/match-query.asciidoc
new file mode 100644
index 00000000000..2a47aa8a858
--- /dev/null
+++ b/docs/reference/query-dsl/queries/match-query.asciidoc
@@ -0,0 +1,222 @@
+[[query-dsl-match-query]]
+=== Match Query
+
+A family of `match` queries that accepts text/numerics/dates, analyzes
+it, and constructs a query out of it. For example:
+
+[source,js]
+--------------------------------------------------
+{
+    "match" : {
+        "message" : "this is a test"
+    }
+}
+--------------------------------------------------
+
+Note, `message` is the name of a field; you can substitute the name of
+any field (including `_all`) instead.
+
+[float]
+==== Types of Match Queries
+
+[float]
+===== boolean
+
+The default `match` query is of type `boolean`. It means that the text
+provided is analyzed and the analysis process constructs a boolean query
+from the provided text. The `operator` flag can be set to `or` or `and`
+to control the boolean clauses (defaults to `or`). The minimum number of
+should clauses to match can be set using the
+<<query-dsl-minimum-should-match,minimum_should_match>>
+parameter.
+
+The `analyzer` can be set to control which analyzer will perform the
+analysis process on the text. It defaults to the field explicit mapping
+definition, or the default search analyzer.
+
+`fuzziness` can be set to a value (depending on the relevant type, for
+string types it should be a value between `0.0` and `1.0`) to construct
+fuzzy queries for each term analyzed. The `prefix_length` and
+`max_expansions` can be set in this case to control the fuzzy process.
+If the fuzzy option is set the query will use `constant_score_rewrite`
+as its <<query-dsl-multi-term-rewrite,rewrite method>>; the `rewrite`
+parameter allows to control how the query will get rewritten.
+
+Here is an example when providing additional parameters (note the slight
+change in structure, `message` is the field name):
+
+[source,js]
+--------------------------------------------------
+{
+    "match" : {
+        "message" : {
+            "query" : "this is a test",
+            "operator" : "and"
+        }
+    }
+}
+--------------------------------------------------
+
+[float]
+===== zero_terms_query
+
+If the analyzer used removes all tokens in a query like a `stop` filter
+does, the default behavior is to match no documents at all. In order to
+change that, the `zero_terms_query` option can be used. It accepts
+`none` (default) and `all`, which corresponds to a `match_all` query.
+
+[source,js]
+--------------------------------------------------
+{
+    "match" : {
+        "message" : {
+            "query" : "to be or not to be",
+            "operator" : "and",
+            "zero_terms_query": "all"
+        }
+    }
+}
+--------------------------------------------------
+
+[float]
+===== cutoff_frequency
+
+Since `0.90.0`, the match query supports a `cutoff_frequency` that
+allows specifying an absolute or relative document frequency where high
+frequency terms are moved into an optional subquery and are only scored
+if one of the low frequency (below the cutoff) terms matches, in the
+case of an `or` operator, or if all of the low frequency terms match, in
+the case of an `and` operator.
+
+This query allows handling `stopwords` dynamically at runtime, is domain
+independent and doesn't rely on a stopword file. It prevents scoring /
+iterating over high frequency terms and only takes those terms into
+account if a more significant / less frequent term matches a document.
+Yet, if all of the query terms are above the given `cutoff_frequency`
+the query is automatically transformed into a pure conjunction (`and`)
+query to ensure fast execution.
+
+The `cutoff_frequency` can either be relative to the number of documents
+in the index if in the range `[0..1)` or absolute if greater or equal to
+`1.0`.
+
+Here is an example showing a query composed exclusively of stopwords:
+
+[source,js]
+--------------------------------------------------
+{
+    "match" : {
+        "message" : {
+            "query" : "to be or not to be",
+            "cutoff_frequency" : 0.001
+        }
+    }
+}
+--------------------------------------------------
+
+[float]
+===== phrase
+
+The `match_phrase` query analyzes the text and creates a `phrase` query
+out of the analyzed text. For example:
+
+[source,js]
+--------------------------------------------------
+{
+    "match_phrase" : {
+        "message" : "this is a test"
+    }
+}
+--------------------------------------------------
+
+Since `match_phrase` is only a `type` of a `match` query, it can also be
+used in the following manner:
+
+[source,js]
+--------------------------------------------------
+{
+    "match" : {
+        "message" : {
+            "query" : "this is a test",
+            "type" : "phrase"
+        }
+    }
+}
+--------------------------------------------------
+
+A phrase query maintains the order of the terms up to a configurable
+`slop` (which defaults to 0).
+
+The `analyzer` can be set to control which analyzer will perform the
+analysis process on the text. It defaults to the field explicit mapping
+definition, or the default search analyzer, for example:
+
+[source,js]
+--------------------------------------------------
+{
+    "match_phrase" : {
+        "message" : {
+            "query" : "this is a test",
+            "analyzer" : "my_analyzer"
+        }
+    }
+}
+--------------------------------------------------
+
+[float]
+===== match_phrase_prefix
+
+The `match_phrase_prefix` is the same as `match_phrase`, except that it
+allows for prefix matches on the last term in the text. For example:
+
+[source,js]
+--------------------------------------------------
+{
+    "match_phrase_prefix" : {
+        "message" : "this is a test"
+    }
+}
+--------------------------------------------------
+
+Or:
+
+[source,js]
+--------------------------------------------------
+{
+    "match" : {
+        "message" : {
+            "query" : "this is a test",
+            "type" : "phrase_prefix"
+        }
+    }
+}
+--------------------------------------------------
+
+It accepts the same parameters as the phrase type.
In addition, it also +accepts a `max_expansions` parameter that can control to how many +prefixes the last term will be expanded. It is highly recommended to set +it to an acceptable value to control the execution time of the query. +For example: + +[source,js] +-------------------------------------------------- +{ + "match_phrase_prefix" : { + "message" : { + "query" : "this is a test", + "max_expansions" : 10 + } + } +} +-------------------------------------------------- + +[float] +==== Comparison to query_string / field + +The match family of queries does not go through a "query parsing" +process. It does not support field name prefixes, wildcard characters, +or other "advance" features. For this reason, chances of it failing are +very small / non existent, and it provides an excellent behavior when it +comes to just analyze and run that text as a query behavior (which is +usually what a text search box does). Also, the `phrase_prefix` type can +provide a great "as you type" behavior to automatically load search +results. diff --git a/docs/reference/query-dsl/queries/minimum-should-match.asciidoc b/docs/reference/query-dsl/queries/minimum-should-match.asciidoc new file mode 100644 index 00000000000..bedd4964540 --- /dev/null +++ b/docs/reference/query-dsl/queries/minimum-should-match.asciidoc @@ -0,0 +1,56 @@ +[[query-dsl-minimum-should-match]] +=== Minimum Should Match + +The `minimum_should_match` parameter possible values: + +[cols="<,<,<",options="header",] +|======================================================================= +|Type |Example |Description +|Integer |`3` |Indicates a fixed value regardless of the number of +optional clauses. + +|Negative integer |`-2` |Indicates that the total number of optional +clauses, minus this number should be mandatory. + +|Percentage |`75%` |Indicates that this percent of the total number of +optional clauses are necessary. The number computed from the percentage +is rounded down and used as the minimum. + +|Negative percentage |`-25%` |Indicates that this percent of the total +number of optional clauses can be missing. The number computed from the +percentage is rounded down, before being subtracted from the total to +determine the minimum. + +|Combination |`3<90%` |A positive integer, followed by the less-than +symbol, followed by any of the previously mentioned specifiers is a +conditional specification. It indicates that if the number of optional +clauses is equal to (or less than) the integer, they are all required, +but if it's greater than the integer, the specification applies. In this +example: if there are 1 to 3 clauses they are all required, but for 4 or +more clauses only 90% are required. + +|Multiple combinations |`2<-25 9<-3` |Multiple conditional +specifications can be separated by spaces, each one only being valid for +numbers greater than the one before it. In this example: if there are 1 +or 2 clauses both are required, if there are 3-9 clauses all but 25% are +required, and if there are more than 9 clauses, all but three are +required. +|======================================================================= + +*NOTE:* + +When dealing with percentages, negative values can be used to get +different behavior in edge cases. 75% and -25% mean the same thing when +dealing with 4 clauses, but when dealing with 5 clauses 75% means 3 are +required, but -25% means 4 are required. 
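+
+For instance, a sketch of a `bool` query with five optional clauses
+(the field and terms are purely illustrative), where `"75%"` requires 3
+of them to match while `"-25%"` would require 4:
+
+[source,js]
+--------------------------------------------------
+{
+    "bool" : {
+        "should" : [
+            { "term" : { "body" : "one" }},
+            { "term" : { "body" : "two" }},
+            { "term" : { "body" : "three" }},
+            { "term" : { "body" : "four" }},
+            { "term" : { "body" : "five" }}
+        ],
+        "minimum_should_match" : "75%"
+    }
+}
+--------------------------------------------------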
+ +If the calculations based on the specification determine that no +optional clauses are needed, the usual rules about BooleanQueries still +apply at search time (a BooleanQuery containing no required clauses must +still match at least one optional clause) + +No matter what number the calculation arrives at, a value greater than +the number of optional clauses, or a value less than 1 will never be +used. (ie: no matter how low or how high the result of the calculation +result is, the minimum number of required matches will never be lower +than 1 or greater than the number of clauses. diff --git a/docs/reference/query-dsl/queries/mlt-field-query.asciidoc b/docs/reference/query-dsl/queries/mlt-field-query.asciidoc new file mode 100644 index 00000000000..58d4070b4d3 --- /dev/null +++ b/docs/reference/query-dsl/queries/mlt-field-query.asciidoc @@ -0,0 +1,68 @@ +[[query-dsl-mlt-field-query]] +=== More Like This Field Query + +The `more_like_this_field` query is the same as the `more_like_this` +query, except it runs against a single field. It provides nicer query +DSL over the generic `more_like_this` query, and support typed fields +query (automatically wraps typed fields with type filter to match only +on the specific type). + +[source,js] +-------------------------------------------------- +{ + "more_like_this_field" : { + "name.first" : { + "like_text" : "text like this one", + "min_term_freq" : 1, + "max_query_terms" : 12 + } + } +} +-------------------------------------------------- + +`more_like_this_field` can be shortened to `mlt_field`. + +The `more_like_this_field` top level parameters include: + +[cols="<,<",options="header",] +|======================================================================= +|Parameter |Description +|`like_text` |The text to find documents like it, *required*. + +|`percent_terms_to_match` |The percentage of terms to match on (float +value). Defaults to `0.3` (30 percent). + +|`min_term_freq` |The frequency below which terms will be ignored in the +source doc. The default frequency is `2`. + +|`max_query_terms` |The maximum number of query terms that will be +included in any generated query. Defaults to `25`. + +|`stop_words` |An array of stop words. Any word in this set is +considered "uninteresting" and ignored. Even if your Analyzer allows +stopwords, you might want to tell the MoreLikeThis code to ignore them, +as for the purposes of document similarity it seems reasonable to assume +that "a stop word is never interesting". + +|`min_doc_freq` |The frequency at which words will be ignored which do +not occur in at least this many docs. Defaults to `5`. + +|`max_doc_freq` |The maximum frequency in which words may still appear. +Words that appear in more than this many docs will be ignored. Defaults +to unbounded. + +|`min_word_len` |The minimum word length below which words will be +ignored. Defaults to `0`. + +|`max_word_len` |The maximum word length above which words will be +ignored. Defaults to unbounded (`0`). + +|`boost_terms` |Sets the boost factor to use when boosting terms. +Defaults to `1`. + +|`boost` |Sets the boost value of the query. Defaults to `1.0`. + +|`analyzer` |The analyzer that will be used to analyze the text. +Defaults to the analyzer associated with the field. 
+|======================================================================= + diff --git a/docs/reference/query-dsl/queries/mlt-query.asciidoc b/docs/reference/query-dsl/queries/mlt-query.asciidoc new file mode 100644 index 00000000000..979854632e8 --- /dev/null +++ b/docs/reference/query-dsl/queries/mlt-query.asciidoc @@ -0,0 +1,67 @@ +[[query-dsl-mlt-query]] +=== More Like This Query + +More like this query find documents that are "like" provided text by +running it against one or more fields. + +[source,js] +-------------------------------------------------- +{ + "more_like_this" : { + "fields" : ["name.first", "name.last"], + "like_text" : "text like this one", + "min_term_freq" : 1, + "max_query_terms" : 12 + } +} +-------------------------------------------------- + +`more_like_this` can be shortened to `mlt`. + +The `more_like_this` top level parameters include: + +[cols="<,<",options="header",] +|======================================================================= +|Parameter |Description +|`fields` |A list of the fields to run the more like this query against. +Defaults to the `_all` field. + +|`like_text` |The text to find documents like it, *required*. + +|`percent_terms_to_match` |The percentage of terms to match on (float +value). Defaults to `0.3` (30 percent). + +|`min_term_freq` |The frequency below which terms will be ignored in the +source doc. The default frequency is `2`. + +|`max_query_terms` |The maximum number of query terms that will be +included in any generated query. Defaults to `25`. + +|`stop_words` |An array of stop words. Any word in this set is +considered "uninteresting" and ignored. Even if your Analyzer allows +stopwords, you might want to tell the MoreLikeThis code to ignore them, +as for the purposes of document similarity it seems reasonable to assume +that "a stop word is never interesting". + +|`min_doc_freq` |The frequency at which words will be ignored which do +not occur in at least this many docs. Defaults to `5`. + +|`max_doc_freq` |The maximum frequency in which words may still appear. +Words that appear in more than this many docs will be ignored. Defaults +to unbounded. + +|`min_word_len` |The minimum word length below which words will be +ignored. Defaults to `0`. + +|`max_word_len` |The maximum word length above which words will be +ignored. Defaults to unbounded (`0`). + +|`boost_terms` |Sets the boost factor to use when boosting terms. +Defaults to `1`. + +|`boost` |Sets the boost value of the query. Defaults to `1.0`. + +|`analyzer` |The analyzer that will be used to analyze the text. +Defaults to the analyzer associated with the field. +|======================================================================= + diff --git a/docs/reference/query-dsl/queries/multi-match-query.asciidoc b/docs/reference/query-dsl/queries/multi-match-query.asciidoc new file mode 100644 index 00000000000..cb098cd48b7 --- /dev/null +++ b/docs/reference/query-dsl/queries/multi-match-query.asciidoc @@ -0,0 +1,64 @@ +[[query-dsl-multi-match-query]] +=== Multi Match Query + +The `multi_match` query builds further on top of the `match` query by +allowing multiple fields to be specified. The idea here is to allow to +more easily build a concise match type query over multiple fields +instead of using a relatively more expressive query by using multiple +match queries within a `bool` query. + +The structure of the query is a bit different. Instead of a nested json +object defining the query field, there is a top json level field for +defining the query fields. 
Example: + +[source,js] +-------------------------------------------------- +{ + "multi_match" : { + "query" : "this is a test", + "fields" : [ "subject", "message" ] + } +} +-------------------------------------------------- + +The `multi_match` query creates either a `bool` or a `dis_max` top level +query. Each field is a query clause in this top level query. The query +clause contains the actual query (the specified 'type' defines what +query this will be). Each query clause is basically a `should` clause. + +[float] +[float] +==== Options + +All options that apply on the `match` query also apply on the +`multi_match` query. The `match` query options apply only on the +individual clauses inside the top level query. + +* `fields` - Fields to be used in the query. +* `use_dis_max` - Boolean indicating to either create a `dis_max` query +or a `bool` query. Defaults to `true`. +* `tie_breaker` - Multiplier value to balance the scores between lower +and higher scoring fields. Only applicable when `use_dis_max` is set to +true. Defaults to `0.0`. + +The query accepts all the options that a regular `match` query accepts. + +[float] +[float] +==== Boosting + +The `multi_match` query supports field boosting via `^` notation in the +fields json field. + +[source,js] +-------------------------------------------------- +{ + "multi_match" : { + "query" : "this is a test", + "fields" : [ "subject^2", "message" ] + } +} +-------------------------------------------------- + +In the above example hits in the `subject` field are 2 times more +important than in the `message` field. diff --git a/docs/reference/query-dsl/queries/multi-term-rewrite.asciidoc b/docs/reference/query-dsl/queries/multi-term-rewrite.asciidoc new file mode 100644 index 00000000000..135be67a044 --- /dev/null +++ b/docs/reference/query-dsl/queries/multi-term-rewrite.asciidoc @@ -0,0 +1,42 @@ +[[query-dsl-multi-term-rewrite]] +=== Multi Term Query Rewrite + +Multi term queries, like +<> and +<> are called +multi term queries and end up going through a process of rewrite. This +also happens on the +<>. +All of those queries allow to control how they will get rewritten using +the `rewrite` parameter: + +* When not set, or set to `constant_score_auto`, defaults to +automatically choosing either `constant_score_boolean` or +`constant_score_filter` based on query characteristics. +* `scoring_boolean`: A rewrite method that first translates each term +into a should clause in a boolean query, and keeps the scores as +computed by the query. Note that typically such scores are meaningless +to the user, and require non-trivial CPU to compute, so it's almost +always better to use `constant_score_auto`. This rewrite method will hit +too many clauses failure if it exceeds the boolean query limit (defaults +to `1024`). +* `constant_score_boolean`: Similar to `scoring_boolean` except scores +are not computed. Instead, each matching document receives a constant +score equal to the query's boost. This rewrite method will hit too many +clauses failure if it exceeds the boolean query limit (defaults to +`1024`). +* `constant_score_filter`: A rewrite method that first creates a private +Filter by visiting each term in sequence and marking all docs for that +term. Matching documents are assigned a constant score equal to the +query's boost. +* `top_terms_N`: A rewrite method that first translates each term into +should clause in boolean query, and keeps the scores as computed by the +query. 
This rewrite method only uses the top scoring terms so it will +not overflow boolean max clause count. The `N` controls the size of the +top scoring terms to use. +* `top_terms_boost_N`: A rewrite method that first translates each term +into should clause in boolean query, but the scores are only computed as +the boost. This rewrite method only uses the top scoring terms so it +will not overflow the boolean max clause count. The `N` controls the +size of the top scoring terms to use. + diff --git a/docs/reference/query-dsl/queries/nested-query.asciidoc b/docs/reference/query-dsl/queries/nested-query.asciidoc new file mode 100644 index 00000000000..bc7e07cb47b --- /dev/null +++ b/docs/reference/query-dsl/queries/nested-query.asciidoc @@ -0,0 +1,58 @@ +[[query-dsl-nested-query]] +=== Nested Query + +Nested query allows to query nested objects / docs (see +<>). The +query is executed against the nested objects / docs as if they were +indexed as separate docs (they are, internally) and resulting in the +root parent doc (or parent nested mapping). Here is a sample mapping we +will work with: + +[source,js] +-------------------------------------------------- +{ + "type1" : { + "properties" : { + "obj1" : { + "type" : "nested" + } + } + } +} +-------------------------------------------------- + +And here is a sample nested query usage: + +[source,js] +-------------------------------------------------- +{ + "nested" : { + "path" : "obj1", + "score_mode" : "avg", + "query" : { + "bool" : { + "must" : [ + { + "match" : {"obj1.name" : "blue"} + }, + { + "range" : {"obj1.count" : {"gt" : 5}} + } + ] + } + } + } +} +-------------------------------------------------- + +The query `path` points to the nested object path, and the `query` (or +`filter`) includes the query that will run on the nested docs matching +the direct path, and joining with the root parent docs. + +The `score_mode` allows to set how inner children matching affects +scoring of parent. It defaults to `avg`, but can be `total`, `max` and +`none`. + +Multi level nesting is automatically supported, and detected, resulting +in an inner nested query to automatically match the relevant nesting +level (and not root) if it exists within another nested query. diff --git a/docs/reference/query-dsl/queries/prefix-query.asciidoc b/docs/reference/query-dsl/queries/prefix-query.asciidoc new file mode 100644 index 00000000000..1bcf75a0226 --- /dev/null +++ b/docs/reference/query-dsl/queries/prefix-query.asciidoc @@ -0,0 +1,36 @@ +[[query-dsl-prefix-query]] +=== Prefix Query + +Matches documents that have fields containing terms with a specified +prefix (*not analyzed*). The prefix query maps to Lucene `PrefixQuery`. +The following matches documents where the user field contains a term +that starts with `ki`: + +[source,js] +-------------------------------------------------- +{ + "prefix" : { "user" : "ki" } +} +-------------------------------------------------- + +A boost can also be associated with the query: + +[source,js] +-------------------------------------------------- +{ + "prefix" : { "user" : { "value" : "ki", "boost" : 2.0 } } +} +-------------------------------------------------- + +Or : + +[source,js] +-------------------------------------------------- +{ + "prefix" : { "user" : { "prefix" : "ki", "boost" : 2.0 } } +} +-------------------------------------------------- + +This multi term query allows to control how it gets rewritten using the +<> +parameter. 
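+
+For example, a minimal sketch (the field, value, and the particular
+rewrite method chosen are illustrative) using one of the rewrite methods
+listed in the multi term query rewrite section:
+
+[source,js]
+--------------------------------------------------
+{
+    "prefix" : {
+        "user" : {
+            "value" : "ki",
+            "rewrite" : "constant_score_boolean"
+        }
+    }
+}
+--------------------------------------------------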
diff --git a/docs/reference/query-dsl/queries/query-string-query.asciidoc b/docs/reference/query-dsl/queries/query-string-query.asciidoc new file mode 100644 index 00000000000..62aa6105bd1 --- /dev/null +++ b/docs/reference/query-dsl/queries/query-string-query.asciidoc @@ -0,0 +1,173 @@ +[[query-dsl-query-string-query]] +=== Query String Query + +A query that uses a query parser in order to parse its content. Here is +an example: + +[source,js] +-------------------------------------------------- +{ + "query_string" : { + "default_field" : "content", + "query" : "this AND that OR thus" + } +} +-------------------------------------------------- + +The `query_string` top level parameters include: + +[cols="<,<",options="header",] +|======================================================================= +|Parameter |Description +|`query` |The actual query to be parsed. + +|`default_field` |The default field for query terms if no prefix field +is specified. Defaults to the `index.query.default_field` index +settings, which in turn defaults to `_all`. + +|`default_operator` |The default operator used if no explicit operator +is specified. For example, with a default operator of `OR`, the query +`capital of Hungary` is translated to `capital OR of OR Hungary`, and +with default operator of `AND`, the same query is translated to +`capital AND of AND Hungary`. The default value is `OR`. + +|`analyzer` |The analyzer name used to analyze the query string. + +|`allow_leading_wildcard` |When set, `*` or `?` are allowed as the first +character. Defaults to `true`. + +|`lowercase_expanded_terms` |Whether terms of wildcard, prefix, fuzzy, +and range queries are to be automatically lower-cased or not (since they +are not analyzed). Default it `true`. + +|`enable_position_increments` |Set to `true` to enable position +increments in result queries. Defaults to `true`. + +|`fuzzy_max_expansions` |Controls the number of terms fuzzy queries will +expand to. Defaults to `50` + +|`fuzzy_min_sim` |Set the minimum similarity for fuzzy queries. Defaults +to `0.5` + +|`fuzzy_prefix_length` |Set the prefix length for fuzzy queries. Default +is `0`. + +|`phrase_slop` |Sets the default slop for phrases. If zero, then exact +phrase matches are required. Default value is `0`. + +|`boost` |Sets the boost value of the query. Defaults to `1.0`. + +|`analyze_wildcard` |By default, wildcards terms in a query string are +not analyzed. By setting this value to `true`, a best effort will be +made to analyze those as well. + +|`auto_generate_phrase_queries` |Default to `false`. + +|`minimum_should_match` |A value controlling how many "should" clauses +in the resulting boolean query should match. It can be an absolute value +(`2`), a percentage (`30%`) or a +<>. + +|`lenient` |If set to `true` will cause format based failures (like +providing text to a numeric field) to be ignored. (since 0.19.4). +|======================================================================= + +When a multi term query is being generated, one can control how it gets +rewritten using the +<> +parameter. + +[float] +==== Default Field + +When not explicitly specifying the field to search on in the query +string syntax, the `index.query.default_field` will be used to derive +which field to search on. It defaults to `_all` field. + +So, if `_all` field is disabled, it might make sense to change it to set +a different default field. + +[float] +==== Multi Field + +The `query_string` query can also run against multiple fields. 
The idea +of running the `query_string` query against multiple fields is by +internally creating several queries for the same query string, each with +`default_field` that match the fields provided. Since several queries +are generated, combining them can be automatically done either using a +`dis_max` query or a simple `bool` query. For example (the `name` is +boosted by 5 using `^5` notation): + +[source,js] +-------------------------------------------------- +{ + "query_string" : { + "fields" : ["content", "name^5"], + "query" : "this AND that OR thus", + "use_dis_max" : true + } +} +-------------------------------------------------- + +Simple wildcard can also be used to search "within" specific inner +elements of the document. For example, if we have a `city` object with +several fields (or inner object with fields) in it, we can automatically +search on all "city" fields: + +[source,js] +-------------------------------------------------- +{ + "query_string" : { + "fields" : ["city.*"], + "query" : "this AND that OR thus", + "use_dis_max" : true + } +} +-------------------------------------------------- + +Another option is to provide the wildcard fields search in the query +string itself (properly escaping the `*` sign), for example: +`city.\*:something`. (since 0.19.4). + +When running the `query_string` query against multiple fields, the +following additional parameters are allowed: + +[cols="<,<",options="header",] +|======================================================================= +|Parameter |Description +|`use_dis_max` |Should the queries be combined using `dis_max` (set it +to `true`), or a `bool` query (set it to `false`). Defaults to `true`. + +|`tie_breaker` |When using `dis_max`, the disjunction max tie breaker. +Defaults to `0`. +|======================================================================= + +The fields parameter can also include pattern based field names, +allowing to automatically expand to the relevant fields (dynamically +introduced fields included). For example: + +[source,js] +-------------------------------------------------- +{ + "query_string" : { + "fields" : ["content", "name.*^5"], + "query" : "this AND that OR thus", + "use_dis_max" : true + } +} +-------------------------------------------------- + +[[Syntax_Extension]] +[float] +==== Syntax Extension + +There are several syntax extensions to the Lucene query language. + +[float] +===== missing / exists + +The `_exists_` and `_missing_` syntax allows to control docs that have +fields that exists within them (have a value) and missing. The syntax +is: `_exists_:field1`, `_missing_:field` and can be used anywhere a +query string is used. diff --git a/docs/reference/query-dsl/queries/range-query.asciidoc b/docs/reference/query-dsl/queries/range-query.asciidoc new file mode 100644 index 00000000000..e8b43f563dd --- /dev/null +++ b/docs/reference/query-dsl/queries/range-query.asciidoc @@ -0,0 +1,54 @@ +[[query-dsl-range-query]] +=== Range Query + +Matches documents with fields that have terms within a certain range. +The type of the Lucene query depends on the field type, for `string` +fields, the `TermRangeQuery`, while for number/date fields, the query is +a `NumericRangeQuery`. 
The following example returns all documents where +`age` is between `10` and `20`: + +[source,js] +-------------------------------------------------- +{ + "range" : { + "age" : { + "from" : 10, + "to" : 20, + "include_lower" : true, + "include_upper": false, + "boost" : 2.0 + } + } +} +-------------------------------------------------- + +The `range` query top level parameters include: + +[cols="<,<",options="header",] +|======================================================================= +|Name |Description +|`from` |The lower bound. Defaults to start from the first. + +|`to` |The upper bound. Defaults to unbounded. + +|`include_lower` |Should the first from (if set) be inclusive or not. +Defaults to `true` + +|`include_upper` |Should the last to (if set) be inclusive or not. +Defaults to `true`. + +|`gt` |Same as setting `from` to the value, and `include_lower` to +`false`. + +|`gte` |Same as setting `from` to the value,and `include_lower` to +`true`. + +|`lt` |Same as setting `to` to the value, and `include_upper` to +`false`. + +|`lte` |Same as setting `to` to the value, and `include_upper` to +`true`. + +|`boost` |Sets the boost value of the query. Defaults to `1.0`. +|======================================================================= + diff --git a/docs/reference/query-dsl/queries/regexp-query.asciidoc b/docs/reference/query-dsl/queries/regexp-query.asciidoc new file mode 100644 index 00000000000..5f06c232d51 --- /dev/null +++ b/docs/reference/query-dsl/queries/regexp-query.asciidoc @@ -0,0 +1,54 @@ +[[query-dsl-regexp-query]] +=== Regexp Query + +The `regexp` query allows you to use regular expression term queries. + +*Note*: The performance of a `regexp` query heavily depends on the +regular expression chosen. Matching everything like `.*` is very slow as +well as using lookaround regular expressions. If possible, you should +try to use a long prefix before your regular expression starts. Wildcard +matchers like `.*?+` will mostly lower performance. + +[source,js] +-------------------------------------------------- +{ + "regexp":{ + "name.first": "s.*y" + } +} +-------------------------------------------------- + +Boosting is also supported + +[source,js] +-------------------------------------------------- +{ + "regexp":{ + "name.first":{ + "value":"s.*y", + "boost":1.2 + } + } +} +-------------------------------------------------- + +You can also use special flags + +[source,js] +-------------------------------------------------- +{ + "regexp":{ + "name.first": "s.*y", + "flags" : "INTERSECTION|COMPLEMENT|EMPTY" + } +} +-------------------------------------------------- + +Possible flags are `ALL`, `ANYSTRING`, `AUTOMATON`, `COMPLEMENT`, +`EMPTY`, `INTERSECTION`, `INTERVAL`, or `NONE`. Please check the +http://lucene.apache.org/core/4_3_0/core/index.html?org%2Fapache%2Flucene%2Futil%2Fautomaton%2FRegExp.html[Lucene +documentation] for their meaning + +For more information see the +http://lucene.apache.org/core/4_3_0/core/index.html?org%2Fapache%2Flucene%2Fsearch%2FRegexpQuery.html[Lucene +RegexpQuery documentation]. diff --git a/docs/reference/query-dsl/queries/span-first-query.asciidoc b/docs/reference/query-dsl/queries/span-first-query.asciidoc new file mode 100644 index 00000000000..74fe7ff88ba --- /dev/null +++ b/docs/reference/query-dsl/queries/span-first-query.asciidoc @@ -0,0 +1,20 @@ +[[query-dsl-span-first-query]] +=== Span First Query + +Matches spans near the beginning of a field. The span first query maps +to Lucene `SpanFirstQuery`. 
Here is an example: + +[source,js] +-------------------------------------------------- +{ + "span_first" : { + "match" : { + "span_term" : { "user" : "kimchy" } + }, + "end" : 3 + } +} +-------------------------------------------------- + +The `match` clause can be any other span type query. The `end` controls +the maximum end position permitted in a match. diff --git a/docs/reference/query-dsl/queries/span-multi-term-query.asciidoc b/docs/reference/query-dsl/queries/span-multi-term-query.asciidoc new file mode 100644 index 00000000000..3ea0f8422de --- /dev/null +++ b/docs/reference/query-dsl/queries/span-multi-term-query.asciidoc @@ -0,0 +1,32 @@ +[[query-dsl-span-multi-term-query]] +=== Span Multi Term Query + +The `span_multi` query allows you to wrap a `multi term query` (one of +fuzzy, prefix, term range or numeric range query) as a `span query`, so +it can be nested. Example: + +[source,js] +-------------------------------------------------- +{ + "span_multi":{ + "match":{ + "prefix" : { "user" : { "value" : "ki" } } + } + } +} +-------------------------------------------------- + +A boost can also be associated with the query: + +[source,js] +-------------------------------------------------- +{ + "span_multi":{ + "match":{ + "prefix" : { "user" : { "value" : "ki", "boost" : 1.08 } } + } + } +} +-------------------------------------------------- + +The `span_multi` query is supported from version `0.90.1` diff --git a/docs/reference/query-dsl/queries/span-near-query.asciidoc b/docs/reference/query-dsl/queries/span-near-query.asciidoc new file mode 100644 index 00000000000..39982e2ba22 --- /dev/null +++ b/docs/reference/query-dsl/queries/span-near-query.asciidoc @@ -0,0 +1,27 @@ +[[query-dsl-span-near-query]] +=== Span Near Query + +Matches spans which are near one another. One can specify _slop_, the +maximum number of intervening unmatched positions, as well as whether +matches are required to be in-order. The span near query maps to Lucene +`SpanNearQuery`. Here is an example: + +[source,js] +-------------------------------------------------- +{ + "span_near" : { + "clauses" : [ + { "span_term" : { "field" : "value1" } }, + { "span_term" : { "field" : "value2" } }, + { "span_term" : { "field" : "value3" } } + ], + "slop" : 12, + "in_order" : false, + "collect_payloads" : false + } +} +-------------------------------------------------- + +The `clauses` element is a list of one or more other span type queries +and the `slop` controls the maximum number of intervening unmatched +positions permitted. diff --git a/docs/reference/query-dsl/queries/span-not-query.asciidoc b/docs/reference/query-dsl/queries/span-not-query.asciidoc new file mode 100644 index 00000000000..b03572068b8 --- /dev/null +++ b/docs/reference/query-dsl/queries/span-not-query.asciidoc @@ -0,0 +1,24 @@ +[[query-dsl-span-not-query]] +=== Span Not Query + +Removes matches which overlap with another span query. The span not +query maps to Lucene `SpanNotQuery`. Here is an example: + +[source,js] +-------------------------------------------------- +{ + "span_not" : { + "include" : { + "span_term" : { "field1" : "value1" } + }, + "exclude" : { + "span_term" : { "field2" : "value2" } + } + } +} +-------------------------------------------------- + +The `include` and `exclude` clauses can be any span type query. The +`include` clause is the span query whose matches are filtered, and the +`exclude` clause is the span query whose matches must not overlap those +returned. 
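Since the clauses accept any span type query, the `include` clause can itself be a compound span query. The following sketch (fields and values are purely illustrative) keeps `span_near` matches that do not overlap a given term:

[source,js]
--------------------------------------------------
{
    "span_not" : {
        "include" : {
            "span_near" : {
                "clauses" : [
                    { "span_term" : { "field1" : "value1" } },
                    { "span_term" : { "field1" : "value2" } }
                ],
                "slop" : 0,
                "in_order" : true
            }
        },
        "exclude" : {
            "span_term" : { "field1" : "value3" }
        }
    }
}
--------------------------------------------------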
diff --git a/docs/reference/query-dsl/queries/span-or-query.asciidoc b/docs/reference/query-dsl/queries/span-or-query.asciidoc new file mode 100644 index 00000000000..72a4ce8724b --- /dev/null +++ b/docs/reference/query-dsl/queries/span-or-query.asciidoc @@ -0,0 +1,20 @@ +[[query-dsl-span-or-query]] +=== Span Or Query + +Matches the union of its span clauses. The span or query maps to Lucene +`SpanOrQuery`. Here is an example: + +[source,js] +-------------------------------------------------- +{ + "span_or" : { + "clauses" : [ + { "span_term" : { "field" : "value1" } }, + { "span_term" : { "field" : "value2" } }, + { "span_term" : { "field" : "value3" } } + ] + } +} +-------------------------------------------------- + +The `clauses` element is a list of one or more other span type queries. diff --git a/docs/reference/query-dsl/queries/span-term-query.asciidoc b/docs/reference/query-dsl/queries/span-term-query.asciidoc new file mode 100644 index 00000000000..9de86d48684 --- /dev/null +++ b/docs/reference/query-dsl/queries/span-term-query.asciidoc @@ -0,0 +1,30 @@ +[[query-dsl-span-term-query]] +=== Span Term Query + +Matches spans containing a term. The span term query maps to Lucene +`SpanTermQuery`. Here is an example: + +[source,js] +-------------------------------------------------- +{ + "span_term" : { "user" : "kimchy" } +} +-------------------------------------------------- + +A boost can also be associated with the query: + +[source,js] +-------------------------------------------------- +{ + "span_term" : { "user" : { "value" : "kimchy", "boost" : 2.0 } } +} +-------------------------------------------------- + +Or : + +[source,js] +-------------------------------------------------- +{ + "span_term" : { "user" : { "term" : "kimchy", "boost" : 2.0 } } +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/queries/term-query.asciidoc b/docs/reference/query-dsl/queries/term-query.asciidoc new file mode 100644 index 00000000000..cd9537d9383 --- /dev/null +++ b/docs/reference/query-dsl/queries/term-query.asciidoc @@ -0,0 +1,31 @@ +[[query-dsl-term-query]] +=== Term Query + +Matches documents that have fields that contain a term (*not analyzed*). +The term query maps to Lucene `TermQuery`. The following matches +documents where the user field contains the term `kimchy`: + +[source,js] +-------------------------------------------------- +{ + "term" : { "user" : "kimchy" } +} +-------------------------------------------------- + +A boost can also be associated with the query: + +[source,js] +-------------------------------------------------- +{ + "term" : { "user" : { "value" : "kimchy", "boost" : 2.0 } } +} +-------------------------------------------------- + +Or : + +[source,js] +-------------------------------------------------- +{ + "term" : { "user" : { "term" : "kimchy", "boost" : 2.0 } } +} +-------------------------------------------------- diff --git a/docs/reference/query-dsl/queries/terms-query.asciidoc b/docs/reference/query-dsl/queries/terms-query.asciidoc new file mode 100644 index 00000000000..a1f62a3d2cc --- /dev/null +++ b/docs/reference/query-dsl/queries/terms-query.asciidoc @@ -0,0 +1,19 @@ +[[query-dsl-terms-query]] +=== Terms Query + +A query that match on any (configurable) of the provided terms. This is +a simpler syntax query for using a `bool` query with several `term` +queries in the `should` clauses. 
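For comparison, a roughly equivalent `bool` query for the `terms` example that follows might look like this sketch (the `tags` field and its values are illustrative):

[source,js]
--------------------------------------------------
{
    "bool" : {
        "should" : [
            { "term" : { "tags" : "blue" } },
            { "term" : { "tags" : "pill" } }
        ],
        "minimum_number_should_match" : 1
    }
}
--------------------------------------------------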
For example: + +[source,js] +-------------------------------------------------- +{ + "terms" : { + "tags" : [ "blue", "pill" ], + "minimum_should_match" : 1 + } +} +-------------------------------------------------- + +The `terms` query is also aliased with `in` as the query name for +simpler usage. diff --git a/docs/reference/query-dsl/queries/text-query.asciidoc b/docs/reference/query-dsl/queries/text-query.asciidoc new file mode 100644 index 00000000000..94fd96906e0 --- /dev/null +++ b/docs/reference/query-dsl/queries/text-query.asciidoc @@ -0,0 +1,171 @@ +[[query-dsl-text-query]] +=== Text Query + +`text` query has been deprecated (effectively renamed) to `match` query +since `0.19.9`, please use it. `text` is still supported. + +A family of `text` queries that accept text, analyzes it, and constructs +a query out of it. For example: + +[source,js] +-------------------------------------------------- +{ + "text" : { + "message" : "this is a test" + } +} +-------------------------------------------------- + +Note, even though the name is text, it also supports exact matching +(`term` like) on numeric values and dates. + +Note, `message` is the name of a field, you can substitute the name of +any field (including `_all`) instead. + +[float] +[float] +==== Types of Text Queries + +[float] +[float] +===== boolean + +The default `text` query is of type `boolean`. It means that the text +provided is analyzed and the analysis process constructs a boolean query +from the provided text. The `operator` flag can be set to `or` or `and` +to control the boolean clauses (defaults to `or`). + +The `analyzer` can be set to control which analyzer will perform the +analysis process on the text. It default to the field explicit mapping +definition, or the default search analyzer. + +`fuzziness` can be set to a value (depending on the relevant type, for +string types it should be a value between `0.0` and `1.0`) to constructs +fuzzy queries for each term analyzed. The `prefix_length` and +`max_expansions` can be set in this case to control the fuzzy process. + +Here is an example when providing additional parameters (note the slight +change in structure, `message` is the field name): + +[source,js] +-------------------------------------------------- +{ + "text" : { + "message" : { + "query" : "this is a test", + "operator" : "and" + } + } +} +-------------------------------------------------- + +[float] +[float] +===== phrase + +The `text_phrase` query analyzes the text and creates a `phrase` query +out of the analyzed text. For example: + +[source,js] +-------------------------------------------------- +{ + "text_phrase" : { + "message" : "this is a test" + } +} +-------------------------------------------------- + +Since `text_phrase` is only a `type` of a `text` query, it can also be +used in the following manner: + +[source,js] +-------------------------------------------------- +{ + "text" : { + "message" : { + "query" : "this is a test", + "type" : "phrase" + } + } +} +-------------------------------------------------- + +A phrase query maintains order of the terms up to a configurable `slop` +(which defaults to 0). + +The `analyzer` can be set to control which analyzer will perform the +analysis process on the text. 
It default to the field explicit mapping +definition, or the default search analyzer, for example: + +[source,js] +-------------------------------------------------- +{ + "text_phrase" : { + "message" : { + "query" : "this is a test", + "analyzer" : "my_analyzer" + } + } +} +-------------------------------------------------- + +[float] +[float] +===== text_phrase_prefix + +The `text_phrase_prefix` is the same as `text_phrase`, expect it allows +for prefix matches on the last term in the text. For example: + +[source,js] +-------------------------------------------------- +{ + "text_phrase_prefix" : { + "message" : "this is a test" + } +} +-------------------------------------------------- + +Or: + +[source,js] +-------------------------------------------------- +{ + "text" : { + "message" : { + "query" : "this is a test", + "type" : "phrase_prefix" + } + } +} +-------------------------------------------------- + +It accepts the same parameters as the phrase type. In addition, it also +accepts a `max_expansions` parameter that can control to how many +prefixes the last term will be expanded. It is highly recommended to set +it to an acceptable value to control the execution time of the query. +For example: + +[source,js] +-------------------------------------------------- +{ + "text_phrase_prefix" : { + "message" : { + "query" : "this is a test", + "max_expansions" : 10 + } + } +} +-------------------------------------------------- + +[float] +[float] +==== Comparison to query_string / field + +The text family of queries does not go through a "query parsing" +process. It does not support field name prefixes, wildcard characters, +or other "advance" features. For this reason, chances of it failing are +very small / non existent, and it provides an excellent behavior when it +comes to just analyze and run that text as a query behavior (which is +usually what a text search box does). Also, the `phrase_prefix` can +provide a great "as you type" behavior to automatically load search +results. diff --git a/docs/reference/query-dsl/queries/top-children-query.asciidoc b/docs/reference/query-dsl/queries/top-children-query.asciidoc new file mode 100644 index 00000000000..00c32bf3358 --- /dev/null +++ b/docs/reference/query-dsl/queries/top-children-query.asciidoc @@ -0,0 +1,71 @@ +[[query-dsl-top-children-query]] +=== Top Children Query + +The `top_children` query runs the child query with an estimated hits +size, and out of the hit docs, aggregates it into parent docs. If there +aren't enough parent docs matching the requested from/size search +request, then it is run again with a wider (more hits) search. + +The `top_children` also provide scoring capabilities, with the ability +to specify `max`, `sum` or `avg` as the score type. + +One downside of using the `top_children` is that if there are more child +docs matching the required hits when executing the child query, then the +`total_hits` result of the search response will be incorrect. + +How many hits are asked for in the first child query run is controlled +using the `factor` parameter (defaults to `5`). For example, when asking +for 10 parent docs (with `from` set to 0), then the child query will +execute with 50 hits expected. If not enough parents are found (in our +example 10), and there are still more child docs to query, then the +child search hits are expanded by multiplying by the +`incremental_factor` (defaults to `2`). + +The required parameters are the `query` and `type` (the child type to +execute the query on). 
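A minimal request relying on the defaults would therefore specify only these two parameters, as in this sketch (it reuses the `blog_tag` type and `tag` field from the full example that follows):

[source,js]
--------------------------------------------------
{
    "top_children" : {
        "type" : "blog_tag",
        "query" : {
            "term" : {
                "tag" : "something"
            }
        }
    }
}
--------------------------------------------------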
Here is an example with all different parameters, +including the default values: + +[source,js] +-------------------------------------------------- +{ + "top_children" : { + "type": "blog_tag", + "query" : { + "term" : { + "tag" : "something" + } + }, + "score" : "max", + "factor" : 5, + "incremental_factor" : 2 + } +} +-------------------------------------------------- + +[float] +==== Scope + +A `_scope` can be defined on the query allowing to run facets on the +same scope name that will work against the child documents. For example: + +[source,js] +-------------------------------------------------- +{ + "top_children" : { + "_scope" : "my_scope", + "type": "blog_tag", + "query" : { + "term" : { + "tag" : "something" + } + } + } +} +-------------------------------------------------- + +[float] +==== Memory Considerations + +With the current implementation, all `_id` values are loaded to memory +(heap) in order to support fast lookups, so make sure there is enough +memory for it. diff --git a/docs/reference/query-dsl/queries/wildcard-query.asciidoc b/docs/reference/query-dsl/queries/wildcard-query.asciidoc new file mode 100644 index 00000000000..d72dbec2481 --- /dev/null +++ b/docs/reference/query-dsl/queries/wildcard-query.asciidoc @@ -0,0 +1,39 @@ +[[query-dsl-wildcard-query]] +=== Wildcard Query + +Matches documents that have fields matching a wildcard expression (*not +analyzed*). Supported wildcards are `*`, which matches any character +sequence (including the empty one), and `?`, which matches any single +character. Note this query can be slow, as it needs to iterate over many +terms. In order to prevent extremely slow wildcard queries, a wildcard +term should not start with one of the wildcards `*` or `?`. The wildcard +query maps to Lucene `WildcardQuery`. + +[source,js] +-------------------------------------------------- +{ + "wildcard" : { "user" : "ki*y" } +} +-------------------------------------------------- + +A boost can also be associated with the query: + +[source,js] +-------------------------------------------------- +{ + "wildcard" : { "user" : { "value" : "ki*y", "boost" : 2.0 } } +} +-------------------------------------------------- + +Or : + +[source,js] +-------------------------------------------------- +{ + "wildcard" : { "user" : { "wildcard" : "ki*y", "boost" : 2.0 } } +} +-------------------------------------------------- + +This multi term query allows to control how it gets rewritten using the +<> +parameter. diff --git a/docs/reference/search.asciidoc b/docs/reference/search.asciidoc new file mode 100644 index 00000000000..d712bf860e5 --- /dev/null +++ b/docs/reference/search.asciidoc @@ -0,0 +1,105 @@ +[[search]] += Search APIs + +[partintro] +-- +["float",id="search-multi-index"] +== Multiple Indices + +All search APIs support execution across multiple indices, using simple +`test1,test2,test3` notation (or `_all` for all indices). It also +support wildcards, for example: `test*`, and the ability to "add" (`+`) +and "remove" (`-`), for example: `+test*,-test3`. + +All multi indices API support the `ignore_indices` option. Setting it to +`missing` will cause indices that do not exists to be ignored from the +execution. By default, when its not set, the request will fail. Note, +this feature is available since 0.20 version. + +[float] +== Routing + +When executing a search, it will be broadcasted to all the index/indices +shards (round robin between replicas). Which shards will be searched on +can be controlled by providing the `routing` parameter. 
For example, +when indexing tweets, the routing value can be the user name: + +[source,js] +-------------------------------------------------- +$ curl -XPOST 'http://localhost:9200/twitter/tweet?routing=kimchy' -d '{ + "user" : "kimchy", + "postDate" : "2009-11-15T14:12:12", + "message" : "trying out Elastic Search" +} +' +-------------------------------------------------- + +In such a case, if we want to search only on the tweets for a specific +user, we can specify it as the routing, resulting in the search hitting +only the relevant shard: + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/twitter/tweet/_search?routing=kimchy' -d '{ + "query": { + "filtered" : { + "query" : { + "query_string" : { + "query" : "some query string here" + } + }, + "filter" : { + "term" : { "user" : "kimchy" } + } + } + } +} +' +-------------------------------------------------- + +The routing parameter can be multi valued represented as a comma +separated string. This will result in hitting the relevant shards where +the routing values match to. + +[float] +== Stats Groups + +A search can be associated with stats groups, which maintains a +statistics aggregation per group. It can later be retrieved using the +<> API +specifically. For example, here is a search body request that associate +the request with two different groups: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "stats" : ["group1", "group2"] +} +-------------------------------------------------- + +-- + +include::search/search.asciidoc[] + +include::search/uri-request.asciidoc[] + +include::search/request-body.asciidoc[] + +include::search/facets.asciidoc[] + +include::search/suggesters.asciidoc[] + +include::search/multi-search.asciidoc[] + +include::search/count.asciidoc[] + +include::search/validate.asciidoc[] + +include::search/explain.asciidoc[] + +include::search/percolate.asciidoc[] + +include::search/more-like-this.asciidoc[] diff --git a/docs/reference/search/count.asciidoc b/docs/reference/search/count.asciidoc new file mode 100644 index 00000000000..4b238c532ea --- /dev/null +++ b/docs/reference/search/count.asciidoc @@ -0,0 +1,85 @@ +[[search-count]] +== Count API + +The count API allows to easily execute a query and get the number of +matches for that query. It can be executed across one or more indices +and across one or more types. The query can either be provided using a +simple query string as a parameter, or using the +<> defined within the request +body. Here is an example: + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/twitter/tweet/_count?q=user:kimchy' + +$ curl -XGET 'http://localhost:9200/twitter/tweet/_count' -d ' +{ + "term" : { "user" : "kimchy" } +}' +-------------------------------------------------- + +Both examples above do the same thing, which is count the number of +tweets from the twitter index for a certain user. The result is: + +[source,js] +-------------------------------------------------- +{ + "count" : 1, + "_shards" : { + "total" : 5, + "successful" : 5, + "failed" : 0 + } +} +-------------------------------------------------- + +The query is optional, and when not provided, it will use `match_all` to +count all the docs. + + +[float] +=== Multi index, Multi type + +The count API can be applied to <>. 
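For instance, counting across two indices and two types, or across all indices, could look like the following sketch (the index and type names are only illustrative):

[source,js]
--------------------------------------------------
$ curl -XGET 'http://localhost:9200/twitter,blog/tweet,post/_count?q=user:kimchy'

$ curl -XGET 'http://localhost:9200/_all/_count?q=user:kimchy'
--------------------------------------------------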
+ +[float] +=== Request Parameters + +When executing count using the query parameter `q`, the query passed is +a query string using Lucene query parser. There are additional +parameters that can be passed: + +[cols="<,<",options="header",] +|======================================================================= +|Name |Description +|df |The default field to use when no field prefix is defined within the +query. + +|analyzer |The analyzer name to be used when analyzing the query string. + +|default_operator |The default operator to be used, can be `AND` or +`OR`. Defaults to `OR`. +|======================================================================= + +[float] +=== Request Body + +The count can use the <> within +its body in order to express the query that should be executed. The body +content can also be passed as a REST parameter named `source`. + +Both HTTP GET and HTTP POST can be used to execute count with body. +Since not all clients support GET with body, POST is allowed as well. + +[float] +=== Distributed + +The count operation is broadcast across all shards. For each shard id +group, a replica is chosen and executed against it. This means that +replicas increase the scalability of count. + +[float] +=== Routing + +The routing value (a comma separated list of the routing values) can be +specified to control which shards the count request will be executed on. diff --git a/docs/reference/search/explain.asciidoc b/docs/reference/search/explain.asciidoc new file mode 100644 index 00000000000..2429efb1741 --- /dev/null +++ b/docs/reference/search/explain.asciidoc @@ -0,0 +1,107 @@ +[[search-explain]] +== Explain API + +The explain api computes a score explanation for a query and a specific +document. This can give useful feedback whether a document matches or +didn't match a specific query. This feature is available from version +`0.19.9` and up. + +[float] +=== Usage + +Full query example: + +[source,js] +-------------------------------------------------- +curl -XGET 'localhost:9200/twitter/tweet/1/_explain' -d '{ + "query" : { + "term" : { "message" : "search" } + } +}' +-------------------------------------------------- + +This will yield the following result: + +[source,js] +-------------------------------------------------- +{ + "ok" : true, + "matches" : true, + "explanation" : { + "value" : 0.15342641, + "description" : "fieldWeight(message:search in 0), product of:", + "details" : [ { + "value" : 1.0, + "description" : "tf(termFreq(message:search)=1)" + }, { + "value" : 0.30685282, + "description" : "idf(docFreq=1, maxDocs=1)" + }, { + "value" : 0.5, + "description" : "fieldNorm(field=message, doc=0)" + } ] + } +} +-------------------------------------------------- + +There is also a simpler way of specifying the query via the `q` +parameter. The specified `q` parameter value is then parsed as if the +`query_string` query was used. Example usage of the `q` parameter in the +explain api: + +[source,js] +-------------------------------------------------- +curl -XGET 'localhost:9200/twitter/tweet/1/_explain?q=message:search' +-------------------------------------------------- + +This will yield the same result as the previous request. + +[float] +=== All parameters: + +[horizontal] +`fields`:: + Allows to control which fields to return as part of the + document explained (support `_source` for the full document). Note, this + feature is available since 0.20. + +`routing`:: + Controls the routing in the case the routing was used + during indexing. 
+ +`parent`:: + Same effect as setting the routing parameter. + +`preference`:: + Controls on which shard the explain is executed. + +`source`:: + Allows the data of the request to be put in the query + string of the url. + +`q`:: + The query string (maps to the query_string query). + +`df`:: + The default field to use when no field prefix is defined within + the query. Defaults to _all field. + +`analyzer`:: + The analyzer name to be used when analyzing the query + string. Defaults to the analyzer of the _all field. + +`analyze_wildcard`:: + Should wildcard and prefix queries be analyzed or + not. Defaults to false. + +`lowercase_expanded_terms`:: + Should terms be automatically lowercased + or not. Defaults to true. + +`lenient`:: + If set to true will cause format based failures (like + providing text to a numeric field) to be ignored. Defaults to false. + +`default_operator`:: + The default operator to be used, can be AND or + OR. Defaults to OR. diff --git a/docs/reference/search/facets.asciidoc b/docs/reference/search/facets.asciidoc new file mode 100644 index 00000000000..7e0c0556871 --- /dev/null +++ b/docs/reference/search/facets.asciidoc @@ -0,0 +1,291 @@ +[[search-facets]] +== Facets + +The usual purpose of a full-text search engine is to return a small +number of documents matching your query. + +_Facets_ provide aggregated data based on a search query. In the +simplest case, a +<> +can return _facet counts_ for various _facet values_ for a specific +_field_. ElasticSearch supports more facet implementations, such as +<> +or +<> facets. + +The field used for facet calculations _must_ be of type numeric, +date/time or be analyzed as a single token — see the +<> guide for details on the +analysis process. + +You can give the facet a custom _name_ and return multiple facets in one +request. + +Let's try it out with a simple example. Suppose we have a number of +articles with a field called `tags`, preferably analyzed with the +<> +analyzer. The facet aggregation will return counts for the most popular +tags across the documents matching your query — or across all documents +in the index. + +We will store some example data first: + +[source,js] +-------------------------------------------------- +curl -X DELETE "http://localhost:9200/articles" +curl -X POST "http://localhost:9200/articles/article" -d '{"title" : "One", "tags" : ["foo"]}' +curl -X POST "http://localhost:9200/articles/article" -d '{"title" : "Two", "tags" : ["foo", "bar"]}' +curl -X POST "http://localhost:9200/articles/article" -d '{"title" : "Three", "tags" : ["foo", "bar", "baz"]}' +-------------------------------------------------- + +Now, let's query the index for articles beginning with letter `T` +and retrieve a +<> +for the `tags` field. We will name the facet simply: _tags_. 
+ +[source,js] +-------------------------------------------------- +curl -X POST "http://localhost:9200/articles/_search?pretty=true" -d ' + { + "query" : { "query_string" : {"query" : "T*"} }, + "facets" : { + "tags" : { "terms" : {"field" : "tags"} } + } + } +' +-------------------------------------------------- + +This request will return articles `Two` and `Three (because +they match our query), as well as the `tags` facet: + +[source,js] +-------------------------------------------------- +"facets" : { + "tags" : { + "_type" : "terms", + "missing" : 0, + "total": 5, + "other": 0, + "terms" : [ { + "term" : "foo", + "count" : 2 + }, { + "term" : "bar", + "count" : 2 + }, { + "term" : "baz", + "count" : 1 + } ] + } +} +-------------------------------------------------- + +In the `terms` array, relevant _terms_ and _counts_ are returned. You'll +probably want to display these to your users. The facet returns several +important counts: + +* `missing` : The number of documents which have no value for the +faceted field + + * `total` : The total number of terms in the facet + + * `other` : The number of terms not included in the returned facet +(effectively `other` = `total` - `terms` ) + +Notice, that the counts are scoped to the current query: _foo_ is +counted only twice (not three times), _bar_ is counted twice and _baz_ +once. Also note that terms are counted once per document, even if the +occur more frequently in that document. + +That's because the primary purpose of facets is to enable +http://en.wikipedia.org/wiki/Faceted_search[_faceted navigation_], +allowing the user to refine her query based on the insight from the +facet, i.e. restrict the search to a specific category, price or date +range. Facets can be used, however, for other purposes: computing +histograms, statistical aggregations, and more. See the blog about +link:/blog/data-visualization-with-elasticsearch-and-protovis/[data visualization].for inspiration. + + + +[float] +=== Scope + +As we have already mentioned, facet computation is restricted to the +scope of the current query, called `main`, by default. Facets can be +computed within the `global` scope as well, in which case it will return +values computed across all documents in the index: + +[source,js] +-------------------------------------------------- +{ + "facets" : { + "" : { + "" : { ... }, + "global" : true + } + } +} +-------------------------------------------------- + +There's one *important distinction* to keep in mind. While search +_queries_ restrict both the returned documents and facet counts, search +_filters_ restrict only returned documents — but _not_ facet counts. + +If you need to restrict both the documents and facets, and you're not +willing or able to use a query, you may use a _facet filter_. + +[float] +=== Facet Filter + +All facets can be configured with an additional filter (explained in the +<> section), which _will_ reduce +the documents they use for computing results. An example with a _term_ +filter: + +[source,js] +-------------------------------------------------- +{ + "facets" : { + "" : { + "" : { + ... + }, + "facet_filter" : { + "term" : { "user" : "kimchy"} + } + } + } +} +-------------------------------------------------- + +Note that this is different from a facet of the +<> type. + +[float] +=== Facets with the _nested_ types + +<> mapping allows +for better support for "inner" documents faceting, especially when it +comes to multi valued key and value facets (like histograms, or term +stats). + +What is it good for? 
First of all, this is the only way to use facets on +nested documents once they are used (possibly for other reasons). But, +there is also facet specific reason why nested documents can be used, +and that's the fact that facets working on different key and value field +(like term_stats, or histogram) can now support cases where both are +multi valued properly. + +For example, let's use the following mapping: + +[source,js] +-------------------------------------------------- +{ + "type1" : { + "properties" : { + "obj1" : { + "type" : "nested" + } + } + } +} +-------------------------------------------------- + +And, here is a sample data: + +[source,js] +-------------------------------------------------- +{ + "obj1" : [ + { + "name" : "blue", + "count" : 4 + }, + { + "name" : "green", + "count" : 6 + } + ] +} +-------------------------------------------------- + + +.Nested Query Facets +[NOTE] +-- +Scoped filters and queries have been removed from version `0.90.0.Beta1` +instead the facet / queries need be repeated as `facet_filter`. More +information about this can be found in +https://github.com/elasticsearch/elasticsearch/issues/2606[issue 2606] +-- + +[float] +==== All Nested Matching Root Documents + +Another option is to run the facet on all the nested documents matching +the root objects that the main query will end up producing. For example: + +[source,js] +-------------------------------------------------- +{ + "query": { + "match_all": {} + }, + "facets": { + "facet1": { + "terms_stats": { + "key_field" : "name", + "value_field": "count" + }, + "nested": "obj1" + } + } +} +-------------------------------------------------- + +The `nested` element provides the path to the nested document (can be a +multi level nested docs) that will be used. + +Facet filter allows you to filter your facet on the nested object level. +It is important that these filters match on the nested object level and +not on the root document level. In the following example the +`terms_stats` only applies on nested objects with the name 'blue'. + +[source,js] +-------------------------------------------------- +{ + "query": { + "match_all": {} + }, + "facets": { + "facet1": { + "terms_stats": { + "key_field" : "name", + "value_field": "count" + }, + "nested": "obj1", + "facet_filter" : { + "term" : {"name" : "blue"} + } + } + } +} +-------------------------------------------------- + +include::facets/terms-facet.asciidoc[] + +include::facets/range-facet.asciidoc[] + +include::facets/histogram-facet.asciidoc[] + +include::facets/date-histogram-facet.asciidoc[] + +include::facets/filter-facet.asciidoc[] + +include::facets/query-facet.asciidoc[] + +include::facets/statistical-facet.asciidoc[] + +include::facets/terms-stats-facet.asciidoc[] + +include::facets/geo-distance-facet.asciidoc[] + diff --git a/docs/reference/search/facets/date-histogram-facet.asciidoc b/docs/reference/search/facets/date-histogram-facet.asciidoc new file mode 100644 index 00000000000..d92c0b2a032 --- /dev/null +++ b/docs/reference/search/facets/date-histogram-facet.asciidoc @@ -0,0 +1,134 @@ +[[search-facets-date-histogram-facet]] +=== Date Histogram Facet + +A specific histogram facet that can work with `date` field types +enhancing it over the regular +<>. 
Here is a quick example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "histo1" : { + "date_histogram" : { + "field" : "field_name", + "interval" : "day" + } + } + } +} +-------------------------------------------------- + +==== Interval + +The `interval` allows to set the interval at which buckets will be +created for each hit. It allows for the constant values of `year`, +`quarter`, `month`, `week`, `day`, `hour`, `minute`. + +It also support time setting like `1.5h` (up to `w` for weeks). + +==== Time Zone + +By default, times are stored as UTC milliseconds since the epoch. Thus, +all computation and "bucketing" / "rounding" is done on UTC. It is +possible to provide a time zone (both pre rounding, and post rounding) +value, which will cause all computations to take the relevant zone into +account. The time returned for each bucket/entry is milliseconds since +the epoch of the provided time zone. + +The parameters are `pre_zone` (pre rounding based on interval) and +`post_zone` (post rounding based on interval). The `time_zone` parameter +simply sets the `pre_zone` parameter. By default, those are set to +`UTC`. + +The zone value accepts either a numeric value for the hours offset, for +example: `"time_zone" : -2`. It also accepts a format of hours and +minutes, like `"time_zone" : "-02:30"`. Another option is to provide a +time zone accepted as one of the values listed +http://joda-time.sourceforge.net/timezones.html[here]. + +Lets take an example. For `2012-04-01T04:15:30Z`, with a `pre_zone` of +`-08:00`. For `day` interval, the actual time by applying the time zone +and rounding falls under `2012-03-31`, so the returned value will be (in +millis) of `2012-03-31T00:00:00Z` (UTC). For `hour` interval, applying +the time zone results in `2012-03-31T20:15:30`, rounding it results in +`2012-03-31T20:00:00`, but, we want to return it in UTC (`post_zone` is +not set), so we convert it back to UTC: `2012-04-01T04:00:00Z`. Note, we +are consistent in the results, returning the rounded value in UTC. + +`post_zone` simply takes the result, and adds the relevant offset. + +Sometimes, we want to apply the same conversion to UTC we did above for +`hour` also for `day` (and up) intervals. We can set +`pre_zone_adjust_large_interval` to `true`, which will apply the same +conversion done for `hour` interval in the example, to `day` and above +intervals (it can be set regardless of the interval, but only kick in +when using `day` and higher intervals). + +==== Factor + +The date histogram works on numeric values (since time is stored in +milliseconds since the epoch in UTC). But, sometimes, systems will store +a different resolution (like seconds since UTC) in a numeric field. The +`factor` parameter can be used to change the value in the field to +milliseconds to actual do the relevant rounding, and then be applied +again to get to the original unit. For example, when storing in a +numeric field seconds resolution, the `factor` can be set to `1000`. + +==== Pre / Post Offset + +Specific offsets can be provided for pre rounding and post rounding. The +`pre_offset` for pre rounding, and `post_offset` for post rounding. The +format is the date time format (`1h`, `1d`, ...). 
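Putting a few of these options together, the following sketch (the `timestamp` field name is made up) buckets by day using a `pre_zone` of `-08:00` and applies a one hour `pre_offset` before the rounding is done:

[source,js]
--------------------------------------------------
{
    "query" : {
        "match_all" : {}
    },
    "facets" : {
        "histo1" : {
            "date_histogram" : {
                "field" : "timestamp",
                "interval" : "day",
                "pre_zone" : "-08:00",
                "pre_offset" : "1h"
            }
        }
    }
}
--------------------------------------------------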
+ +==== Value Field + +The date_histogram facet allows to use a different key (of type date) +which controls the bucketing, with a different value field which will +then return the total and mean for that field values of the hits within +the relevant bucket. For example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "histo1" : { + "date_histogram" : { + "key_field" : "timestamp", + "value_field" : "price", + "interval" : "day" + } + } + } +} +-------------------------------------------------- + +==== Script Value Field + +A script can be used to compute the value that will then be used to +compute the total and mean for a bucket. For example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "histo1" : { + "date_histogram" : { + "key_field" : "timestamp", + "value_script" : "doc['price'].value * 2", + "interval" : "day" + } + } + } +} +-------------------------------------------------- diff --git a/docs/reference/search/facets/filter-facet.asciidoc b/docs/reference/search/facets/filter-facet.asciidoc new file mode 100644 index 00000000000..74dece4bb3a --- /dev/null +++ b/docs/reference/search/facets/filter-facet.asciidoc @@ -0,0 +1,24 @@ +[[search-facets-filter-facet]] +=== Filter Facets + +A filter facet (not to be confused with a +<>) allows you to +return a count of the hits matching the filter. The filter itself can be +expressed using the <>. For +example: + +[source,js] +-------------------------------------------------- +{ + "facets" : { + "wow_facet" : { + "filter" : { + "term" : { "tag" : "wow" } + } + } + } +} +-------------------------------------------------- + +Note, filter facet filters are faster than query facet when using native +filters (non query wrapper ones). diff --git a/docs/reference/search/facets/geo-distance-facet.asciidoc b/docs/reference/search/facets/geo-distance-facet.asciidoc new file mode 100644 index 00000000000..4e3d366f450 --- /dev/null +++ b/docs/reference/search/facets/geo-distance-facet.asciidoc @@ -0,0 +1,252 @@ +[[search-facets-geo-distance-facet]] +=== Geo Distance Facets + +The geo_distance facet is a facet providing information for ranges of +distances from a provided geo_point including count of the number of +hits that fall within each range, and aggregation information (like +total). 
+ +Assuming the following sample doc: + +[source,js] +-------------------------------------------------- +{ + "pin" : { + "location" : { + "lat" : 40.12, + "lon" : -71.34 + } + } +} +-------------------------------------------------- + +Here is an example that create a `geo_distance` facet from a +`pin.location` of 40,-70, and a set of ranges: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "geo1" : { + "geo_distance" : { + "pin.location" : { + "lat" : 40, + "lon" : -70 + }, + "ranges" : [ + { "to" : 10 }, + { "from" : 10, "to" : 20 }, + { "from" : 20, "to" : 100 }, + { "from" : 100 } + ] + } + } + } +} +-------------------------------------------------- + +==== Accepted Formats + +In much the same way the geo_point type can accept different +representation of the geo point, the filter can accept it as well: + +===== Lat Lon As Properties + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "geo1" : { + "geo_distance" : { + "pin.location" : { + "lat" : 40, + "lon" : -70 + }, + "ranges" : [ + { "to" : 10 }, + { "from" : 10, "to" : 20 }, + { "from" : 20, "to" : 100 }, + { "from" : 100 } + ] + } + } + } +} +-------------------------------------------------- + +===== Lat Lon As Array + +Format in `[lon, lat]`, note, the order of lon/lat here in order to +conform with http://geojson.org/[GeoJSON]. + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "geo1" : { + "geo_distance" : { + "pin.location" : [40, -70], + "ranges" : [ + { "to" : 10 }, + { "from" : 10, "to" : 20 }, + { "from" : 20, "to" : 100 }, + { "from" : 100 } + ] + } + } + } +} +-------------------------------------------------- + +===== Lat Lon As String + +Format in `lat,lon`. + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "geo1" : { + "geo_distance" : { + "pin.location" : "40, -70", + "ranges" : [ + { "to" : 10 }, + { "from" : 10, "to" : 20 }, + { "from" : 20, "to" : 100 }, + { "from" : 100 } + ] + } + } + } +} +-------------------------------------------------- + +===== Geohash + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "geo1" : { + "geo_distance" : { + "pin.location" : "drm3btev3e86", + "ranges" : [ + { "to" : 10 }, + { "from" : 10, "to" : 20 }, + { "from" : 20, "to" : 100 }, + { "from" : 100 } + ] + } + } + } +} +-------------------------------------------------- + +==== Ranges + +When a `to` or `from` are not set, they are assumed to be unbounded. +Ranges are allowed to overlap, basically, each range is treated by +itself. + +==== Options + +[cols="<,<",options="header",] +|======================================================================= +|Option |Description +|`unit` |The unit the ranges are provided in. Defaults to `km`. Can also +be `mi` or `miles`. + +|`distance_type` |How to compute the distance. Can either be `arc` +(better precision) or `plane` (faster). Defaults to `arc`. +|======================================================================= + +==== Value Options + +On top of the count of hits falling within each range, aggregated data +can be provided (total) as well. By default, the aggregated data will +simply use the distance calculated, but the value can be extracted +either using a different numeric field, or a script. 
Here is an example +of using a different numeric field: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "geo1" : { + "geo_distance" : { + "pin.location" : "drm3btev3e86", + "value_field" : "num1", + "ranges" : [ + { "to" : 10 }, + { "from" : 10, "to" : 20 }, + { "from" : 20, "to" : 100 }, + { "from" : 100 } + ] + } + } + } +} +-------------------------------------------------- + +And here is an example of using a script: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "geo1" : { + "geo_distance" : { + "pin.location" : "drm3btev3e86", + "value_script" : "doc['num1'].value * factor", + "params" : { + "factor" : 5 + } + "ranges" : [ + { "to" : 10 }, + { "from" : 10, "to" : 20 }, + { "from" : 20, "to" : 100 }, + { "from" : 100 } + ] + } + } + } +} +-------------------------------------------------- + +Note the params option, allowing to pass parameters to the script +(resulting in faster script execution instead of providing the values +within the script each time). + +.`geo_point` Type +[NOTE] +-- +The facet *requires* the `geo_point` type to be set on the relevant +field. +-- + +.Multi Location Per Document +[NOTE] +-- +The facet can work with multiple locations per document. +-- \ No newline at end of file diff --git a/docs/reference/search/facets/histogram-facet.asciidoc b/docs/reference/search/facets/histogram-facet.asciidoc new file mode 100644 index 00000000000..284a058584d --- /dev/null +++ b/docs/reference/search/facets/histogram-facet.asciidoc @@ -0,0 +1,138 @@ +[[search-facets-histogram-facet]] +=== Histogram Facets + +The histogram facet works with numeric data by building a histogram +across intervals of the field values. Each value is "rounded" into an +interval (or placed in a bucket), and statistics are provided per +interval/bucket (count and total). Here is a simple example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "histo1" : { + "histogram" : { + "field" : "field_name", + "interval" : 100 + } + } + } +} +-------------------------------------------------- + +The above example will run a histogram facet on the `field_name` filed, +with an `interval` of `100` (so, for example, a value of `1055` will be +placed within the `1000` bucket). + +The interval can also be provided as a time based interval (using the +time format). This mainly make sense when working on date fields or +field that represent absolute milliseconds, here is an example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "histo1" : { + "histogram" : { + "field" : "field_name", + "time_interval" : "1.5h" + } + } + } +} +-------------------------------------------------- + +==== Key and Value + +The histogram facet allows to use a different key and value. The key is +used to place the hit/document within the appropriate bucket, and the +value is used to compute statistical data (for example, total). 
Here is +an example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "histo1" : { + "histogram" : { + "key_field" : "key_field_name", + "value_field" : "value_field_name", + "interval" : 100 + } + } + } +} +-------------------------------------------------- + +==== Script Key and Value + +Sometimes, some munging of both the key and the value are needed. In the +key case, before it is rounded into a bucket, and for the value, when +the statistical data is computed per bucket +<> can be used. Here +is an example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "histo1" : { + "histogram" : { + "key_script" : "doc['date'].date.minuteOfHour", + "value_script" : "doc['num1'].value" + } + } + } +} +-------------------------------------------------- + +In the above sample, we can use a date type field called `date` to get +the minute of hour from it, and the total will be computed based on +another field `num1`. Note, in this case, no `interval` was provided, so +the bucket will be based directly on the `key_script` (no rounding). + +Parameters can also be provided to the different scripts (preferable if +the script is the same, with different values for a specific parameter, +like "factor"): + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "histo1" : { + "histogram" : { + "key_script" : "doc['date'].date.minuteOfHour * factor1", + "value_script" : "doc['num1'].value + factor2", + "params" : { + "factor1" : 2, + "factor2" : 3 + } + } + } + } +} +-------------------------------------------------- + +==== Memory Considerations + +In order to implement the histogram facet, the relevant field values are +loaded into memory from the index. This means that per shard, there +should be enough memory to contain them. Since by default, dynamic +introduced types are `long` and `double`, one option to reduce the +memory footprint is to explicitly set the types for the relevant fields +to either `short`, `integer`, or `float` when possible. diff --git a/docs/reference/search/facets/query-facet.asciidoc b/docs/reference/search/facets/query-facet.asciidoc new file mode 100644 index 00000000000..3f360da4bdf --- /dev/null +++ b/docs/reference/search/facets/query-facet.asciidoc @@ -0,0 +1,19 @@ +[[search-facets-query-facet]] +=== Query Facets + +A facet query allows to return a count of the hits matching the facet +query. The query itself can be expressed using the Query DSL. For +example: + +[source,js] +-------------------------------------------------- +{ + "facets" : { + "wow_facet" : { + "query" : { + "term" : { "tag" : "wow" } + } + } + } +} +-------------------------------------------------- diff --git a/docs/reference/search/facets/range-facet.asciidoc b/docs/reference/search/facets/range-facet.asciidoc new file mode 100644 index 00000000000..fa263ee5595 --- /dev/null +++ b/docs/reference/search/facets/range-facet.asciidoc @@ -0,0 +1,119 @@ +[[search-facets-range-facet]] +=== Range Facets + +`range` facet allows to specify a set of ranges and get both the number +of docs (count) that fall within each range, and aggregated data either +based on the field, or using another field. 
Here is a simple example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "range1" : { + "range" : { + "field" : "field_name", + "ranges" : [ + { "to" : 50 }, + { "from" : 20, "to" : 70 }, + { "from" : 70, "to" : 120 }, + { "from" : 150 } + ] + } + } + } +} +-------------------------------------------------- + +Another option which is a bit more DSL enabled is to provide the ranges +on the actual field name, for example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "range1" : { + "range" : { + "my_field" : [ + { "to" : 50 }, + { "from" : 20, "to" : 70 }, + { "from" : 70, "to" : 120 }, + { "from" : 150 } + ] + } + } + } +} +-------------------------------------------------- + +The `range` facet always includes the `from` parameter and excludes the +`to` parameter for each range. + +==== Key and Value + +The `range` facet allows to use a different field to check if its value +falls within a range, and another field to compute aggregated data per +range (like total). For example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "range1" : { + "range" : { + "key_field" : "field_name", + "value_field" : "another_field_name", + "ranges" : [ + { "to" : 50 }, + { "from" : 20, "to" : 70 }, + { "from" : 70, "to" : 120 }, + { "from" : 150 } + ] + } + } + } +} +-------------------------------------------------- + +==== Script Key and Value + +Sometimes, some munging of both the key and the value are needed. In the +key case, before it is checked if it falls within a range, and for the +value, when the statistical data is computed per range scripts can be +used. Here is an example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "range1" : { + "range" : { + "key_script" : "doc['date'].date.minuteOfHour", + "value_script" : "doc['num1'].value", + "ranges" : [ + { "to" : 50 }, + { "from" : 20, "to" : 70 }, + { "from" : 70, "to" : 120 }, + { "from" : 150 } + ] + } + } + } +} +-------------------------------------------------- + +==== Date Ranges + +The range facet support also providing the range as string formatted +dates. diff --git a/docs/reference/search/facets/statistical-facet.asciidoc b/docs/reference/search/facets/statistical-facet.asciidoc new file mode 100644 index 00000000000..dfa51dfe8d5 --- /dev/null +++ b/docs/reference/search/facets/statistical-facet.asciidoc @@ -0,0 +1,101 @@ +[[search-facets-statistical-facet]] +=== Statistical Facet + +Statistical facet allows to compute statistical data on a numeric +fields. The statistical data include count, total, sum of squares, mean +(average), minimum, maximum, variance, and standard deviation. Here is +an example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "stat1" : { + "statistical" : { + "field" : "num1" + } + } + } +} +-------------------------------------------------- + +==== Script field + +When using `field`, the numeric value of the field is used to compute +the statistical information. Sometimes, several fields values represent +the statistics we want to compute, or some sort of mathematical +evaluation. The script field allows to define a +<> to evaluate, with +its value used to compute the statistical information. 
For example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "stat1" : { + "statistical" : { + "script" : "doc['num1'].value + doc['num2'].value" + } + } + } +} +-------------------------------------------------- + +Parameters can also be provided to the different scripts (preferable if +the script is the same, with different values for a specific parameter, +like "factor"): + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "stat1" : { + "statistical" : { + "script" : "(doc['num1'].value + doc['num2'].value) * factor", + "params" : { + "factor" : 5 + } + } + } + } +} +-------------------------------------------------- + +==== Multi Field + +The statistical facet can be executed against more than one field, +returning the aggregation result across those fields. For example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "facets" : { + "stat1" : { + "statistical" : { + "fields" : ["num1", "num2"] + } + } + } +} +-------------------------------------------------- + +==== Memory Considerations + +In order to implement the statistical facet, the relevant field values +are loaded into memory from the index. This means that per shard, there +should be enough memory to contain them. Since by default, dynamic +introduced types are `long` and `double`, one option to reduce the +memory footprint is to explicitly set the types for the relevant fields +to either `short`, `integer`, or `float` when possible. diff --git a/docs/reference/search/facets/terms-facet.asciidoc b/docs/reference/search/facets/terms-facet.asciidoc new file mode 100644 index 00000000000..47d9d03c8d1 --- /dev/null +++ b/docs/reference/search/facets/terms-facet.asciidoc @@ -0,0 +1,260 @@ +[[search-facets-terms-facet]] +=== Terms Facet + +Allow to specify field facets that return the N most frequent terms. For +example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : { } + }, + "facets" : { + "tag" : { + "terms" : { + "field" : "tag", + "size" : 10 + } + } + } +} +-------------------------------------------------- + +It is preferred to have the terms facet executed on a non analyzed +field, or a field without a large number of terms it breaks to. + +==== Ordering + +Allow to control the ordering of the terms facets, to be ordered by +`count`, `term`, `reverse_count` or `reverse_term`. The default is +`count`. Here is an example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : { } + }, + "facets" : { + "tag" : { + "terms" : { + "field" : "tag", + "size" : 10, + "order" : "term" + } + } + } +} +-------------------------------------------------- + +==== All Terms + +Allow to get all the terms in the terms facet, ones that do not match a +hit, will have a count of 0. Note, this should not be used with fields +that have many terms. 
+ +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : { } + }, + "facets" : { + "tag" : { + "terms" : { + "field" : "tag", + "all_terms" : true + } + } + } +} +-------------------------------------------------- + +==== Excluding Terms + +It is possible to specify a set of terms that should be excluded from +the terms facet request result: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : { } + }, + "facets" : { + "tag" : { + "terms" : { + "field" : "tag", + "exclude" : ["term1", "term2"] + } + } + } +} +-------------------------------------------------- + +==== Regex Patterns + +The terms API allows to define regex expression that will control which +terms will be included in the faceted list, here is an example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : { } + }, + "facets" : { + "tag" : { + "terms" : { + "field" : "tag", + "regex" : "_regex expression here_", + "regex_flags" : "DOTALL" + } + } + } +} +-------------------------------------------------- + +Check +http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html#field_summary[Java +Pattern API] for more details about `regex_flags` options. + +==== Term Scripts + +Allow to define a script for terms facet to process the actual term that +will be used in the term facet collection, and also optionally control +its inclusion or not. + +The script can either return a boolean value, with `true` to include it +in the facet collection, and `false` to exclude it from the facet +collection. + +Another option is for the script to return a `string` controlling the +term that will be used to count against. The script execution will +include the term variable which is the current field term used. + +For example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : { } + }, + "facets" : { + "tag" : { + "terms" : { + "field" : "tag", + "size" : 10, + "script" : "term + 'aaa'" + } + } + } +} +-------------------------------------------------- + +And using the boolean feature: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : { } + }, + "facets" : { + "tag" : { + "terms" : { + "field" : "tag", + "size" : 10, + "script" : "term == 'aaa' ? true : false" + } + } + } +} +-------------------------------------------------- + +==== Multi Fields + +The term facet can be executed against more than one field, returning +the aggregation result across those fields. For example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : { } + }, + "facets" : { + "tag" : { + "terms" : { + "fields" : ["tag1", "tag2"], + "size" : 10 + } + } + } +} +-------------------------------------------------- + +==== Script Field + +A script that provides the actual terms that will be processed for a +given doc. A `script_field` (or `script` which will be used when no +`field` or `fields` are provided) can be set to provide it. 
+ +As an example, a search request (that is quite "heavy") can be executed +and use either `_source` itself or `_fields` (for stored fields) without +needing to load the terms to memory (at the expense of much slower +execution of the search, and causing more IO load): + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : { } + }, + "facets" : { + "my_facet" : { + "terms" : { + "script_field" : "_source.my_field", + "size" : 10 + } + } + } +} +-------------------------------------------------- + +Or: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : { } + }, + "facets" : { + "my_facet" : { + "terms" : { + "script_field" : "_fields['my_field']", + "size" : 10 + } + } + } +} +-------------------------------------------------- + +Note also, that the above will use the whole field value as a single +term. + +==== _index + +The term facet allows to specify a special field name called `_index`. +This will return a facet count of hits per `_index` the search was +executed on (relevant when a search request spans more than one index). + +==== Memory Considerations + +Term facet causes the relevant field values to be loaded into memory. +This means that per shard, there should be enough memory to contain +them. It is advisable to explicitly set the fields to be `not_analyzed` +or make sure the number of unique tokens a field can have is not large. diff --git a/docs/reference/search/facets/terms-stats-facet.asciidoc b/docs/reference/search/facets/terms-stats-facet.asciidoc new file mode 100644 index 00000000000..74f0cc23cd9 --- /dev/null +++ b/docs/reference/search/facets/terms-stats-facet.asciidoc @@ -0,0 +1,43 @@ +[[search-facets-terms-stats-facet]] +=== Terms Stats Facet + +The `terms_stats` facet combines both the +<> and +<> +allowing to compute stats computed on a field, per term value driven by +another field. For example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : { } + }, + "facets" : { + "tag_price_stats" : { + "terms_stats" : { + "key_field" : "tag", + "value_field" : "price" + } + } + } +} +-------------------------------------------------- + +The `size` parameter controls how many facet entries will be returned. +It defaults to `10`. Setting it to 0 will return all terms matching the +hits (be careful not to return too many results). + +Ordering is done by setting `order`, with possible values of `term`, +`reverse_term`, `count`, `reverse_count`, `total`, `reverse_total`, +`min`, `reverse_min`, `max`, `reverse_max`, `mean`, `reverse_mean`. +Defaults to `count`. + +The value computed can also be a script, using the `value_script` +instead of `value_field`, in which case the `lang` can control its +language, and `params` allow to provide custom parameters (as in other +scripted components). + +Note, the terms stats can work with multi valued key fields, or multi +valued value fields, but not when both are multi valued (as ordering is +not maintained). diff --git a/docs/reference/search/more-like-this.asciidoc b/docs/reference/search/more-like-this.asciidoc new file mode 100644 index 00000000000..28bd07871f9 --- /dev/null +++ b/docs/reference/search/more-like-this.asciidoc @@ -0,0 +1,27 @@ +[[search-more-like-this]] +== More Like This API + +The more like this (mlt) API allows to get documents that are "like" a +specified document. 
Here is an example: + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/twitter/tweet/1/_mlt?mlt_fields=tag,content&min_doc_freq=1' +-------------------------------------------------- + +The API simply results in executing a search request with +<> query (http +parameters match the parameters to the `more_like_this` query). This +means that the body of the request can optionally include all the +request body options in the <> (facets, from/to and so on). + +Rest parameters relating to search are also allowed, including +`search_type`, `search_indices`, `search_types`, `search_scroll`, +`search_size` and `search_from`. + +When no `mlt_fields` are specified, all the fields of the document will +be used in the `more_like_this` query generated. + +Note: In order to use the `mlt` feature a `mlt_field` needs to be either +be `stored`, store `term_vector` or `source` needs to be enabled. diff --git a/docs/reference/search/multi-search.asciidoc b/docs/reference/search/multi-search.asciidoc new file mode 100644 index 00000000000..ad0160647ee --- /dev/null +++ b/docs/reference/search/multi-search.asciidoc @@ -0,0 +1,73 @@ +[[search-multi-search]] +== Multi Search API + +The multi search API allows to execute several search requests within +the same API. The endpoint for it is `_msearch` (available from `0.19` +onwards). + +The format of the request is similar to the bulk API format, and the +structure is as follows (the structure is specifically optimized to +reduce parsing if a specific search ends up redirected to another node): + +[source,js] +-------------------------------------------------- +header\n +body\n +header\n +body\n +-------------------------------------------------- + +The header part includes which index / indices to search on, optional +(mapping) types to search on, the `search_type`, `preference`, and +`routing`. The body includes the typical search body request (including +the `query`, `facets`, `from`, `size`, and so on). Here is an example: + +[source,js] +-------------------------------------------------- +$ cat requests +{"index" : "test"} +{"query" : {"match_all" : {}}, "from" : 0, "size" : 10} +{"index" : "test", "search_type" : "count"} +{"query" : {"match_all" : {}}} +{} +{"query" : {"match_all" : {}}} + +{"query" : {"match_all" : {}}} +{"search_type" : "count"} +{"query" : {"match_all" : {}}} + +$ curl -XGET localhost:9200/_msearch --data-binary @requests; echo +-------------------------------------------------- + +Note, the above includes an example of an empty header (can also be just +without any content) which is supported as well. + +The response returns a `responses` array, which includes the search +response for each search request matching its order in the original +multi search request. If there was a complete failure for that specific +search request, an object with `error` message will be returned in place +of the actual search response. + +The endpoint allows to also search against an index/indices and +type/types in the URI itself, in which case it will be used as the +default unless explicitly defined otherwise in the header. 
For example: + +[source,js] +-------------------------------------------------- +$ cat requests +{} +{"query" : {"match_all" : {}}, "from" : 0, "size" : 10} +{} +{"query" : {"match_all" : {}}} +{"index" : "test2"} +{"query" : {"match_all" : {}}} + +$ curl -XGET localhost:9200/test/_msearch --data-binary @requests; echo +-------------------------------------------------- + +The above will execute the search against the `test` index for all the +requests that don't define an index, and the last one will be executed +against the `test2` index. + +The `search_type` can be set in a similar manner to globally apply to +all search requests. diff --git a/docs/reference/search/percolate.asciidoc b/docs/reference/search/percolate.asciidoc new file mode 100644 index 00000000000..f8566f398f7 --- /dev/null +++ b/docs/reference/search/percolate.asciidoc @@ -0,0 +1,135 @@ +[[search-percolate]] +== Percolate API + +The percolator allows to register queries against an index, and then +send `percolate` requests which include a doc, and getting back the +queries that match on that doc out of the set of registered queries. + +Think of it as the reverse operation of indexing and then searching. +Instead of sending docs, indexing them, and then running queries. One +sends queries, registers them, and then sends docs and finds out which +queries match that doc. + +As an example, a user can register an interest (a query) on all tweets +that contain the word "elasticsearch". For every tweet, one can +percolate the tweet against all registered user queries, and find out +which ones matched. + +Here is a quick sample, first, lets create a `test` index: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/test +-------------------------------------------------- + +Next, we will register a percolator query with a specific name called +`kuku` against the `test` index: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/_percolator/test/kuku -d '{ + "query" : { + "term" : { + "field1" : "value1" + } + } +}' +-------------------------------------------------- + +And now, we can percolate a document and see which queries match on it +(note, its not really indexed!): + +[source,js] +-------------------------------------------------- +curl -XGET localhost:9200/test/type1/_percolate -d '{ + "doc" : { + "field1" : "value1" + } +}' +-------------------------------------------------- + +And the matches are part of the response: + +[source,js] +-------------------------------------------------- +{"ok":true, "matches":["kuku"]} +-------------------------------------------------- + +You can unregister the previous percolator query with the same API you +use to delete any document in an index: + +[source,js] +-------------------------------------------------- +curl -XDELETE localhost:9200/_percolator/test/kuku +-------------------------------------------------- + +[float] +=== Filtering Executed Queries + +Since the registered percolator queries are just docs in an index, one +can filter the queries that will be used to percolate a doc. 
For +example, we can add a `color` field to the registered query: + +[source,js] +-------------------------------------------------- +curl -XPUT localhost:9200/_percolator/test/kuku -d '{ + "color" : "blue", + "query" : { + "term" : { + "field1" : "value1" + } + } +}' +-------------------------------------------------- + +And then, we can percolate a doc that only matches on blue colors: + +[source,js] +-------------------------------------------------- +curl -XGET localhost:9200/test/type1/_percolate -d '{ + "doc" : { + "field1" : "value1" + }, + "query" : { + "term" : { + "color" : "blue" + } + } +}' +-------------------------------------------------- + +[float] +=== How it Works + +The `_percolator` which holds the repository of registered queries is +just a another index. The query is registered under a concrete index +that exists (or will exist). That index name is represented as the type +in the `_percolator` index (a bit confusing, I know...). + +The fact that the queries are stored as docs in another index +(`_percolator`) gives us both the persistency nature of it, and the +ability to filter out queries to execute using another query. + +The `_percolator` index uses the `index.auto_expand_replica` setting to +make sure that each data node will have access locally to the registered +queries, allowing for fast query executing to filter out queries to run +against a percolated doc. + +The percolate API uses the whole number of shards as percolating +processing "engines", both primaries and replicas. In our above case, if +the `test` index has 2 shards with 1 replica, 4 shards will round-robin +in handling percolate requests. Increasing (dynamically) the number of +replicas will increase the number of percolating processing "engines" +and thus the percolation power. + +Note, percolate requests will prefer to be executed locally, and will +not try and round-robin across shards if a shard exists locally on a +node that received a request (for example, from HTTP). It's important to +do some round-robin in the client code among nodes (in any case its +recommended). If this behavior is not desired, the `prefer_local` +parameter can be set to `false` to disable it. + +Because the percolator API is processing one document at a time, it +doesn't support queries and filters that run against child and nested +documents such as `has_child`, `has_parent`, `top_children`, and +`nested`. diff --git a/docs/reference/search/request-body.asciidoc b/docs/reference/search/request-body.asciidoc new file mode 100644 index 00000000000..51c9c6879bb --- /dev/null +++ b/docs/reference/search/request-body.asciidoc @@ -0,0 +1,109 @@ +[[search-request-body]] +== Request Body Search + +The search request can be executed with a search DSL, which includes the +<>, within its body. 
Here is an +example: + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/twitter/tweet/_search' -d '{ + "query" : { + "term" : { "user" : "kimchy" } + } +} +' +-------------------------------------------------- + +And here is a sample response: + +[source,js] +-------------------------------------------------- +{ + "_shards":{ + "total" : 5, + "successful" : 5, + "failed" : 0 + }, + "hits":{ + "total" : 1, + "hits" : [ + { + "_index" : "twitter", + "_type" : "tweet", + "_id" : "1", + "_source" : { + "user" : "kimchy", + "postDate" : "2009-11-15T14:12:12", + "message" : "trying out Elastic Search" + } + } + ] + } +} +-------------------------------------------------- + +[float] +=== Parameters + +[cols="<,<",options="header",] +|======================================================================= +|Name |Description +|`timeout` |A search timeout, bounding the search request to be executed +within the specified time value and bail with the hits accumulated up to +that point when expired. Defaults to no timeout. + +|`from` |The starting from index of the hits to return. Defaults to `0`. + +|`size` |The number of hits to return. Defaults to `10`. + +|`search_type` |The type of the search operation to perform. Can be +`dfs_query_then_fetch`, `dfs_query_and_fetch`, `query_then_fetch`, +`query_and_fetch`. Defaults to `query_then_fetch`. See +<> for +more details on the different types of search that can be performed. +|======================================================================= + +Out of the above, the `search_type` is the one that can not be passed +within the search request body, and in order to set it, it must be +passed as a request REST parameter. + +The rest of the search request should be passed within the body itself. +The body content can also be passed as a REST parameter named `source`. + +Both HTTP GET and HTTP POST can be used to execute search with body. +Since not all clients support GET with body, POST is allowed as well. + + +include::request/query.asciidoc[] + +include::request/from-size.asciidoc[] + +include::request/sort.asciidoc[] + +include::request/fields.asciidoc[] + +include::request/script-fields.asciidoc[] + +include::request/filter.asciidoc[] + +include::request/highlighting.asciidoc[] + +include::request/rescore.asciidoc[] + +include::request/search-type.asciidoc[] + +include::request/scroll.asciidoc[] + +include::request/preference.asciidoc[] + +include::request/explain.asciidoc[] + +include::request/version.asciidoc[] + +include::request/index-boost.asciidoc[] + +include::request/min-score.asciidoc[] + +include::request/named-filters.asciidoc[] + diff --git a/docs/reference/search/request/explain.asciidoc b/docs/reference/search/request/explain.asciidoc new file mode 100644 index 00000000000..81dc110c263 --- /dev/null +++ b/docs/reference/search/request/explain.asciidoc @@ -0,0 +1,14 @@ +[[search-request-explain]] +=== Explain + +Enables explanation for each hit on how its score was computed. 
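+
+The request body below simply sets `explain` to `true`; each hit in the
+response then carries an `_explanation` element describing how its score
+was computed. As a rough sketch (reusing the `twitter` hit from the
+sample response shown above, with illustrative score values and a
+trimmed explanation):
+
+[source,js]
+--------------------------------------------------
+{
+    "_index" : "twitter",
+    "_type" : "tweet",
+    "_id" : "1",
+    "_score" : 0.30685282,
+    "_source" : { ... },
+    "_explanation" : {
+        "value" : 0.30685282,
+        "description" : "weight(user:kimchy in 0), product of:",
+        "details" : [ ... ]
+    }
+}
+--------------------------------------------------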
+ +[source,js] +-------------------------------------------------- +{ + "explain": true, + "query" : { + "term" : { "user" : "kimchy" } + } +} +-------------------------------------------------- diff --git a/docs/reference/search/request/fields.asciidoc b/docs/reference/search/request/fields.asciidoc new file mode 100644 index 00000000000..2868d9b8d8d --- /dev/null +++ b/docs/reference/search/request/fields.asciidoc @@ -0,0 +1,92 @@ +[[search-request-fields]] +=== Fields + +Allows to selectively load specific fields for each document represented +by a search hit. Defaults to load the internal `_source` field. + +[source,js] +-------------------------------------------------- +{ + "fields" : ["user", "postDate"], + "query" : { + "term" : { "user" : "kimchy" } + } +} +-------------------------------------------------- + +The fields will automatically load stored fields (`store` mapping set to +`yes`), or, if not stored, will load the `_source` and extract it from +it (allowing to return nested document object). + +`*` can be used to load all stored fields from the document. + +An empty array will cause only the `_id` and `_type` for each hit to be +returned, for example: + +[source,js] +-------------------------------------------------- +{ + "fields" : [], + "query" : { + "term" : { "user" : "kimchy" } + } +} +-------------------------------------------------- + +Script fields can also be automatically detected and used as fields, so +things like `_source.obj1.obj2` can be used, though not recommended, as +`obj1.obj2` will work as well. + +==== Partial + +When loading data from `_source`, partial fields can be used to use +wildcards to control what part of the `_source` will be loaded based on +`include` and `exclude` patterns. For example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "partial_fields" : { + "partial1" : { + "include" : "obj1.obj2.*", + } + } +} +-------------------------------------------------- + +And one that will also exclude `obj1.obj3`: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "partial_fields" : { + "partial1" : { + "include" : "obj1.obj2.*", + "exclude" : "obj1.obj3.*" + } + } +} +-------------------------------------------------- + +Both `include` and `exclude` support multiple patterns: + +[source,js] +-------------------------------------------------- +{ + "query" : { + "match_all" : {} + }, + "partial_fields" : { + "partial1" : { + "include" : ["obj1.obj2.*", "obj1.obj4.*"], + "exclude" : "obj1.obj3.*" + } + } +} +-------------------------------------------------- diff --git a/docs/reference/search/request/filter.asciidoc b/docs/reference/search/request/filter.asciidoc new file mode 100644 index 00000000000..34c329c7fac --- /dev/null +++ b/docs/reference/search/request/filter.asciidoc @@ -0,0 +1,78 @@ +[[search-request-filter]] +=== Filter + +When doing things like facet navigation, sometimes only the hits are +needed to be filtered by the chosen facet, and all the facets should +continue to be calculated based on the original query. The `filter` +element within the search request can be used to accomplish it. + +Note, this is different compared to creating a `filtered` query with the +filter, since this will cause the facets to only process the filtered +results. 
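+
+For contrast, here is a minimal sketch of that `filtered` query form
+(it reuses the `message` and `tag` fields from the example that
+follows); with this form the facet counts are computed only on the
+already filtered results:
+
+[source,js]
+--------------------------------------------------
+{
+    "query" : {
+        "filtered" : {
+            "query" : { "term" : { "message" : "something" } },
+            "filter" : { "term" : { "tag" : "green" } }
+        }
+    },
+    "facets" : {
+        "tag" : {
+            "terms" : { "field" : "tag" }
+        }
+    }
+}
+--------------------------------------------------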
+ +For example, lets create two tweets, with two different tags: + +[source,js] +-------------------------------------------------- +curl -XPUT 'localhost:9200/twitter/tweet/1' -d ' +{ + "message" : "something blue", + "tag" : "blue" +} +' + +curl -XPUT 'localhost:9200/twitter/tweet/2' -d ' +{ + "message" : "something green", + "tag" : "green" +} +' + +curl -XPOST 'localhost:9200/_refresh' +-------------------------------------------------- + +We can now search for something, and have a terms facet. + +[source,js] +-------------------------------------------------- +curl -XPOST 'localhost:9200/twitter/_search?pretty=true' -d ' +{ + "query" : { + "term" : { "message" : "something" } + }, + "facets" : { + "tag" : { + "terms" : { "field" : "tag" } + } + } +} +' +-------------------------------------------------- + +We get two hits, and the relevant facets with a count of 1 for both +`green` and `blue`. Now, lets say the `green` facet is chosen, we can +simply add a filter for it: + +[source,js] +-------------------------------------------------- +curl -XPOST 'localhost:9200/twitter/_search?pretty=true' -d ' +{ + "query" : { + "term" : { "message" : "something" } + }, + "filter" : { + "term" : { "tag" : "green" } + }, + "facets" : { + "tag" : { + "terms" : { "field" : "tag" } + } + } +} +' +-------------------------------------------------- + +And now, we get only 1 hit back, but the facets remain the same. + +Note, if additional filters is required on specific facets, they can be +added as a `facet_filter` to the relevant facets. diff --git a/docs/reference/search/request/from-size.asciidoc b/docs/reference/search/request/from-size.asciidoc new file mode 100644 index 00000000000..d8d80952554 --- /dev/null +++ b/docs/reference/search/request/from-size.asciidoc @@ -0,0 +1,21 @@ +[[search-request-from-size]] +=== From / Size + +Pagination of results can be done by using the `from` and `size` +parameters. The `from` parameter defines the offset from the first +result you want to fetch. The `size` parameter allows you to configure +the maximum amount of hits to be returned. + +Though `from` and `size` can be set as request parameters, they can also +be set within the search body. `from` defaults to `0`, and `size` +defaults to `10`. + +[source,js] +-------------------------------------------------- +{ + "from" : 0, "size" : 10, + "query" : { + "term" : { "user" : "kimchy" } + } +} +-------------------------------------------------- diff --git a/docs/reference/search/request/highlighting.asciidoc b/docs/reference/search/request/highlighting.asciidoc new file mode 100644 index 00000000000..2b09ac1f806 --- /dev/null +++ b/docs/reference/search/request/highlighting.asciidoc @@ -0,0 +1,208 @@ +[[search-request-highlighting]] +=== Highlighting + +Allows to highlight search results on one or more fields. The +implementation uses either the lucene `fast-vector-highlighter` or +`highlighter`. The search request body: + +[source,js] +-------------------------------------------------- +{ + "query" : {...}, + "highlight" : { + "fields" : { + "content" : {} + } + } +} +-------------------------------------------------- + +In the above case, the `content` field will be highlighted for each +search hit (there will be another element in each search hit, called +`highlight`, which includes the highlighted fields and the highlighted +fragments). + +In order to perform highlighting, the actual content of the field is +required. 
If the field in question is stored (has `store` set to `yes` +in the mapping), it will be used, otherwise, the actual `_source` will +be loaded and the relevant field will be extracted from it. + +If `term_vector` information is provided by setting `term_vector` to +`with_positions_offsets` in the mapping then the fast vector +highlighter will be used instead of the plain highlighter. The fast vector highlighter: + +* Is faster especially for large fields (> `1MB`) +* Can be customized with `boundary_chars`, `boundary_max_scan`, and + `fragment_offset` (see below) +* Requires setting `term_vector` to `with_positions_offsets` which + increases the size of the index + +Here is an example of setting the `content` field to allow for +highlighting using the fast vector highlighter on it (this will cause +the index to be bigger): + +[source,js] +-------------------------------------------------- +{ + "type_name" : { + "content" : {"term_vector" : "with_positions_offsets"} + } +} +-------------------------------------------------- + +Since `0.20.2` the field name support wildcard notation, for example, +using `comment_*` which will cause all fields that match the expression +to be highlighted. + +==== Highlighting Tags + +By default, the highlighting will wrap highlighted text in `` and +``. This can be controlled by setting `pre_tags` and `post_tags`, +for example: + +[source,js] +-------------------------------------------------- +{ + "query" : {...}, + "highlight" : { + "pre_tags" : ["", ""], + "post_tags" : ["", ""], + "fields" : { + "_all" : {} + } + } +} +-------------------------------------------------- + +There can be a single tag or more, and the "importance" is ordered. +There are also built in "tag" schemas, with currently a single schema +called `styled` with `pre_tags` of: + +[source,js] +-------------------------------------------------- +, , , +, , , +, , , + +-------------------------------------------------- + +And post tag of ``. If you think of more nice to have built in tag +schemas, just send an email to the mailing list or open an issue. Here +is an example of switching tag schemas: + +[source,js] +-------------------------------------------------- +{ + "query" : {...}, + "highlight" : { + "tags_schema" : "styled", + "fields" : { + "content" : {} + } + } +} +-------------------------------------------------- + +An `encoder` parameter can be used to define how highlighted text will +be encoded. It can be either `default` (no encoding) or `html` (will +escape html, if you use html highlighting tags). + +==== Highlighted Fragments + +Each field highlighted can control the size of the highlighted fragment +in characters (defaults to `100`), and the maximum number of fragments +to return (defaults to `5`). For example: + +[source,js] +-------------------------------------------------- +{ + "query" : {...}, + "highlight" : { + "fields" : { + "content" : {"fragment_size" : 150, "number_of_fragments" : 3} + } + } +} +-------------------------------------------------- + +On top of this it is possible to specify that highlighted fragments are +order by score: + +[source,js] +-------------------------------------------------- +{ + "query" : {...}, + "highlight" : { + "order" : "score", + "fields" : { + "content" : {"fragment_size" : 150, "number_of_fragments" : 3} + } + } +} +-------------------------------------------------- + +Note the score of text fragment in this case is calculated by Lucene +highlighting framework. 
For implementation details you can check +`ScoreOrderFragmentsBuilder.java` class. + +If the `number_of_fragments` value is set to 0 then no fragments are +produced, instead the whole content of the field is returned, and of +course it is highlighted. This can be very handy if short texts (like +document title or address) need to be highlighted but no fragmentation +is required. Note that `fragment_size` is ignored in this case. + +[source,js] +-------------------------------------------------- +{ + "query" : {...}, + "highlight" : { + "fields" : { + "_all" : {}, + "bio.title" : {"number_of_fragments" : 0} + } + } +} +-------------------------------------------------- + +When using `fast-vector-highlighter` one can use `fragment_offset` +parameter to control the margin to start highlighting from. + +==== Global Settings + +Highlighting settings can be set on a global level and then overridden +at the field level. + +[source,js] +-------------------------------------------------- +{ + "query" : {...}, + "highlight" : { + "number_of_fragments" : 3, + "fragment_size" : 150, + "tag_schema" : "styled", + "fields" : { + "_all" : { "pre_tags" : [""], "post_tags" : [""] }, + "bio.title" : { "number_of_fragments" : 0 }, + "bio.author" : { "number_of_fragments" : 0 }, + "bio.content" : { "number_of_fragments" : 5, "order" : "score" } + } + } +} +-------------------------------------------------- + +==== Require Field Match + +`require_field_match` can be set to `true` which will cause a field to +be highlighted only if a query matched that field. `false` means that +terms are highlighted on all requested fields regardless if the query +matches specifically on them. + +==== Boundary Characters + +When highlighting a field that is mapped with term vectors, +`boundary_chars` can be configured to define what constitutes a boundary +for highlighting. It's a single string with each boundary character +defined in it. It defaults to `.,!? \t\n`. + +The `boundary_max_scan` allows to control how far to look for boundary +characters, and defaults to `20`. diff --git a/docs/reference/search/request/index-boost.asciidoc b/docs/reference/search/request/index-boost.asciidoc new file mode 100644 index 00000000000..29d1da3885c --- /dev/null +++ b/docs/reference/search/request/index-boost.asciidoc @@ -0,0 +1,17 @@ +[[search-request-index-boost]] +=== Index Boost + +Allows to configure different boost level per index when searching +across more than one indices. This is very handy when hits coming from +one index matter more than hits coming from another index (think social +graph where each user has an index). + +[source,js] +-------------------------------------------------- +{ + "indices_boost" : { + "index1" : 1.4, + "index2" : 1.3 + } +} +-------------------------------------------------- diff --git a/docs/reference/search/request/min-score.asciidoc b/docs/reference/search/request/min-score.asciidoc new file mode 100644 index 00000000000..18ecf6237cd --- /dev/null +++ b/docs/reference/search/request/min-score.asciidoc @@ -0,0 +1,17 @@ +[[search-request-min-score]] +=== min_score + +Allows to filter out documents based on a minimum score: + +[source,js] +-------------------------------------------------- +{ + "min_score": 0.5, + "query" : { + "term" : { "user" : "kimchy" } + } +} +-------------------------------------------------- + +Note, most times, this does not make much sense, but is provided for +advance use cases. 
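+
+As a usage sketch, `min_score` composes with the other request body
+options described in this chapter; for example, combined with `from` /
+`size` pagination (reusing the `twitter` index and `user` field from
+the earlier examples):
+
+[source,js]
+--------------------------------------------------
+curl -XPOST 'localhost:9200/twitter/_search?pretty=true' -d '
+{
+    "from" : 0, "size" : 10,
+    "min_score" : 0.5,
+    "query" : {
+        "term" : { "user" : "kimchy" }
+    }
+}
+'
+--------------------------------------------------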
diff --git a/docs/reference/search/request/named-filters.asciidoc b/docs/reference/search/request/named-filters.asciidoc new file mode 100644 index 00000000000..8ad0fb60e6a --- /dev/null +++ b/docs/reference/search/request/named-filters.asciidoc @@ -0,0 +1,47 @@ +[[search-request-named-filters]] +=== Named Filters + +Each filter can accept a _name in its top level definition, for example: + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "term" : { "name.first" : "shay" } + }, + "filter" : { + "terms" : { + "name.last" : ["banon", "kimchy"], + "_name" : "test" + } + } + } +} +-------------------------------------------------- + +The search response will include for each hit the `matched_filters` it +matched on (note, this feature make sense for `or` / `bool` filters). + +Note, the query filter had to be enhanced in order to support this. In +order to set a name, the `fquery` filter should be used, which wraps a +query (just so there will be a place to set a name for it), for example: + +[source,js] +-------------------------------------------------- +{ + "filtered" : { + "query" : { + "term" : { "name.first" : "shay" } + }, + "filter" : { + "fquery" : { + "query" : { + "term" : { "name.last" : "banon" } + }, + "_name" : "test" + } + } + } +} +-------------------------------------------------- diff --git a/docs/reference/search/request/preference.asciidoc b/docs/reference/search/request/preference.asciidoc new file mode 100644 index 00000000000..0a71a363808 --- /dev/null +++ b/docs/reference/search/request/preference.asciidoc @@ -0,0 +1,42 @@ +[[search-request-preference]] +=== Preference + +Controls a `preference` of which shard replicas to execute the search +request on. By default, the operation is randomized between the shard +replicas. + +The `preference` can be set to: + +[horizontal] +`_primary`:: + The operation will go and be executed only on the primary + shards. + +`_primary_first`:: + The operation will go and be executed on the primary + shard, and if not available (failover), will execute on other shards. + +`_local`:: + The operation will prefer to be executed on a local + allocated shard if possible. + +`_only_node:xyz`:: + Restricts the search to execute only on a node with + the provided node id (`xyz` in this case). + +`_prefer_node:xyz`:: + Prefers execution on the node with the provided + node id (`xyz` in this case) if applicable. + +`_shards:2,3`:: + Restricts the operation to the specified shards. (`2` + and `3` in this case). This preference can be combined with other + preferences but it has to appear first: `_shards:2,3;_primary` + +Custom (string) value:: + A custom value will be used to guarantee that + the same shards will be used for the same custom value. This can help + with "jumping values" when hitting different shards in different refresh + states. A sample value can be something like the web session id, or the + user name. + diff --git a/docs/reference/search/request/query.asciidoc b/docs/reference/search/request/query.asciidoc new file mode 100644 index 00000000000..e496320bd97 --- /dev/null +++ b/docs/reference/search/request/query.asciidoc @@ -0,0 +1,14 @@ +[[search-request-query]] +=== Query + +The query element within the search request body allows to define a +query using the <>. 
+ +[source,js] +-------------------------------------------------- +{ + "query" : { + "term" : { "user" : "kimchy" } + } +} +-------------------------------------------------- diff --git a/docs/reference/search/request/rescore.asciidoc b/docs/reference/search/request/rescore.asciidoc new file mode 100644 index 00000000000..c8bc50f046d --- /dev/null +++ b/docs/reference/search/request/rescore.asciidoc @@ -0,0 +1,69 @@ +[[search-request-rescore]] +=== Rescoring + +Rescoring can help to improve precision by reordering just the top (eg +100 - 500) documents returned by the +<> and +<> phases, using a +secondary (usually more costly) algorithm, instead of applying the +costly algorithm to all documents in the index. + +A `rescore` request is executed on each shard before it returns its +results to be sorted by the node handling the overall search request. + +Currently the rescore API has only one implementation: the query +rescorer, which uses a query to tweak the scoring. In the future, +alternative rescorers may be made available, for example, a pair-wise rescorer. + +*Note:* the `rescore` phase is not executed when +<> is set +to `scan` or `count`. + +==== Query rescorer + +The query rescorer executes a second query only on the Top-K results +returned by the <> and +<> phases. The +number of docs which will be examined on each shard can be controlled by +the `window_size` parameter, which defaults to +<>. + +The scores from the original query and the rescore query are combined +linearly to produce the final `_score` for each document. The relative +importance of the original query and of the rescore query can be +controlled with the `query_weight` and `rescore_query_weight` +respectively. Both default to `1`. + +For example: + +[source,js] +-------------------------------------------------- +curl -s -XPOST 'localhost:9200/_search' -d '{ + "query" : { + "match" : { + "field1" : { + "operator" : "OR", + "query" : "the quick brown", + "type" : "boolean" + } + } + }, + "rescore" : { + "window_size" : 50, + "query" : { + "rescore_query" : { + "match" : { + "field1" : { + "query" : "the quick brown", + "type" : "phrase", + "slop" : 2 + } + } + }, + "query_weight" : 0.7, + "rescore_query_weight" : 1.2 + } + } +} +' +-------------------------------------------------- diff --git a/docs/reference/search/request/script-fields.asciidoc b/docs/reference/search/request/script-fields.asciidoc new file mode 100644 index 00000000000..75d8f82c949 --- /dev/null +++ b/docs/reference/search/request/script-fields.asciidoc @@ -0,0 +1,60 @@ +[[search-request-script-fields]] +=== Script Fields + +Allows to return a <> (based on different fields) for each hit, for example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + ... + }, + "script_fields" : { + "test1" : { + "script" : "doc['my_field_name'].value * 2" + }, + "test2" : { + "script" : "doc['my_field_name'].value * factor", + "params" : { + "factor" : 2.0 + } + } + } +} +-------------------------------------------------- + +Script fields can work on fields that are not store (`my_field_name` in +the above case), and allow to return custom values to be returned (the +evaluated value of the script). + +Script fields can also access the actual `_source` document indexed and +extract specific elements to be returned from it (can be an "object" +type). Here is an example: + +[source,js] +-------------------------------------------------- + { + "query" : { + ... 
+ }, + "script_fields" : { + "test1" : { + "script" : "_source.obj1.obj2" + } + } + } +-------------------------------------------------- + +Note the `_source` keyword here to navigate the json like model. + +Its important to understand the difference between +`doc['my_field'].value` and `_source.my_field`. The first, using the doc +keyword, will cause the terms for that field to be loaded to memory +(cached), which will result in faster execution, but more memory +consumption. Also, the `doc[...]` notation only allows for simple valued +fields (can't return a json object from it) and make sense only on non +analyzed or single term based fields. + +The `_source` on the other hand causes the source to be loaded, parsed, +and then only the relevant part of the json is returned. diff --git a/docs/reference/search/request/scroll.asciidoc b/docs/reference/search/request/scroll.asciidoc new file mode 100644 index 00000000000..069427069cd --- /dev/null +++ b/docs/reference/search/request/scroll.asciidoc @@ -0,0 +1,42 @@ +[[search-request-scroll]] +=== Scroll + +A search request can be scrolled by specifying the `scroll` parameter. +The `scroll` parameter is a time value parameter (for example: +`scroll=5m`), indicating for how long the nodes that participate in the +search will maintain relevant resources in order to continue and support +it. This is very similar in its idea to opening a cursor against a +database. + +A `scroll_id` is returned from the first search request (and from +continuous) scroll requests. The `scroll_id` should be used when +scrolling (along with the `scroll` parameter, to stop the scroll from +expiring). The scroll id can also be passed as part of the search +request body. + +*Note*: the `scroll_id` changes for each scroll request and only the +most recent one should be used. + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/twitter/tweet/_search?scroll=5m' -d '{ + "query": { + "query_string" : { + "query" : "some query string here" + } + } +} +' +-------------------------------------------------- + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/_search/scroll?scroll=5m&scroll_id=c2Nhbjs2OzM0NDg1ODpzRlBLc0FXNlNyNm5JWUc1' +-------------------------------------------------- + +Scrolling is not intended for real time user requests, it is intended +for cases like scrolling over large portions of data that exists within +elasticsearch to reindex it for example. + +For more information on scrolling, see the +<> search type. diff --git a/docs/reference/search/request/search-type.asciidoc b/docs/reference/search/request/search-type.asciidoc new file mode 100644 index 00000000000..bf3126fea81 --- /dev/null +++ b/docs/reference/search/request/search-type.asciidoc @@ -0,0 +1,129 @@ +[[search-request-search-type]] +=== Search Type + +There are different execution paths that can be done when executing a +distributed search. The distributed search operation needs to be +scattered to all the relevant shards and then all the results are +gathered back. When doing scatter/gather type execution, there are +several ways to do that, specifically with search engines. + +One of the questions when executing a distributed search is how much +results to retrieve from each shard. For example, if we have 10 shards, +the 1st shard might hold the most relevant results from 0 till 10, with +other shards results ranking below it. 
For this reason, when executing a +request, we will need to get results from 0 till 10 from all shards, +sort them, and then return the results if we want to insure correct +results. + +Another question, which relates to search engine, is the fact that each +shard stands on its own. When a query is executed on a specific shard, +it does not take into account term frequencies and other search engine +information from the other shards. If we want to support accurate +ranking, we would need to first execute the query against all shards and +gather the relevant term frequencies, and then, based on it, execute the +query. + +Also, because of the need to sort the results, getting back a large +document set, or even scrolling it, while maintaing the correct sorting +behavior can be a very expensive operation. For large result set +scrolling without sorting, the `scan` search type (explained below) is +also available. + +ElasticSearch is very flexible and allows to control the type of search +to execute on a *per search request* basis. The type can be configured +by setting the *search_type* parameter in the query string. The types +are: + +==== Query And Fetch + +Parameter value: *query_and_fetch*. + +The most naive (and possibly fastest) implementation is to simply +execute the query on all relevant shards and return the results. Each +shard returns `size` results. Since each shard already returns `size` +hits, this type actually returns `size` times `number of shards` results +back to the caller. + +==== Query Then Fetch + +Parameter value: *query_then_fetch*. + +The query is executed against all shards, but only enough information is +returned (*not the document content*). The results are then sorted and +ranked, and based on it, *only the relevant shards* are asked for the +actual document content. The return number of hits is exactly as +specified in `size`, since they are the only ones that are fetched. This +is very handy when the index has a lot of shards (not replicas, shard id +groups). + +NOTE: This is the default setting, if you do not specify a `search_type` + in your request. + +==== Dfs, Query And Fetch + +Parameter value: *dfs_query_and_fetch*. + +Same as "Query And Fetch", except for an initial scatter phase which +goes and computes the distributed term frequencies for more accurate +scoring. + +==== Dfs, Query Then Fetch + +Parameter value: *dfs_query_then_fetch*. + +Same as "Query Then Fetch", except for an initial scatter phase which +goes and computes the distributed term frequencies for more accurate +scoring. + +==== Count + +Parameter value: *count*. + +A special search type that returns the count that matched the search +request without any docs (represented in `total_hits`), and possibly, +including facets as well. In general, this is preferable to the `count` +API as it provides more options. + +==== Scan + +Parameter value: *scan*. + +The `scan` search type allows to efficiently scroll a large result set. +It's used first by executing a search request with scrolling and a +query: + +[source,js] +-------------------------------------------------- +curl -XGET 'localhost:9200/_search?search_type=scan&scroll=10m&size=50' -d ' +{ + "query" : { + "match_all" : {} + } +} +' +-------------------------------------------------- + +The `scroll` parameter control the keep alive time of the scrolling +request and initiates the scrolling process. The timeout applies per +round trip (i.e. between the previous scan scroll request, to the next). 
+ +The response will include no hits, with two important results, the +`total_hits` will include the total hits that match the query, and the +`scroll_id` that allows to start the scroll process. From this stage, +the `_search/scroll` endpoint should be used to scroll the hits, feeding +the next scroll request with the previous search result `scroll_id`. For +example: + +[source,js] +-------------------------------------------------- +curl -XGET 'localhost:9200/_search/scroll?scroll=10m' -d 'c2NhbjsxOjBLMzdpWEtqU2IyZHlmVURPeFJOZnc7MzowSzM3aVhLalNiMmR5ZlVET3hSTmZ3OzU6MEszN2lYS2pTYjJkeWZVRE94Uk5mdzsyOjBLMzdpWEtqU2IyZHlmVURPeFJOZnc7NDowSzM3aVhLalNiMmR5ZlVET3hSTmZ3Ow==' +-------------------------------------------------- + +Scroll requests will include a number of hits equal to the size +multiplied by the number of primary shards. + +The "breaking" condition out of a scroll is when no hits has been +returned. The total_hits will be maintained between scroll requests. + +Note, scan search type does not support sorting (either on score or a +field) or faceting. diff --git a/docs/reference/search/request/sort.asciidoc b/docs/reference/search/request/sort.asciidoc new file mode 100644 index 00000000000..586de4942ad --- /dev/null +++ b/docs/reference/search/request/sort.asciidoc @@ -0,0 +1,319 @@ +[[search-request-sort]] +=== Sort + +Allows to add one or more sort on specific fields. Each sort can be +reversed as well. The sort is defined on a per field level, with special +field name for `_score` to sort by score. + +[source,js] +-------------------------------------------------- +{ + "sort" : [ + { "post_date" : {"order" : "asc"} }, + "user", + { "name" : "desc" }, + { "age" : "desc" }, + "_score" + ], + "query" : { + "term" : { "user" : "kimchy" } + } +} +-------------------------------------------------- + +==== Sort Values + +The sort values for each document returned are also returned as part of +the response. + +==== Sort mode option + +From version `0.90.0.Beta1` Elasticsearch supports sorting by array +fields which is also known as multi-valued fields. The `mode` option +controls what array value is picked for sorting the document it belongs +to. The `mode` option can have the following values: + +[horizontal] +`min`:: Pick the lowest value. +`max`:: Pick the highest value. +`sum`:: Use the sum of all values as sort value. Only applicable for + number based array fields. +`avg`:: Use the average of all values as sort value. Only applicable + for number based array fields. + +===== Sort mode example usage + +In the example below the field price has multiple prices per document. +In this case the result hits will be sort by price ascending based on +the average price per document. + +[source,js] +-------------------------------------------------- +curl -XPOST 'localhost:9200/_search' -d '{ + "query" : { + ... + }, + "sort" : [ + {"price" : {"order" : "asc", "mode" : "avg"}} + ] +}' +-------------------------------------------------- + +==== Sorting within nested objects. + +Also from version `0.90.0.Beta1` Elasticsearch supports sorting by +fields that are inside one or more nested objects. The sorting by nested +field support has the following parameters on top of the already +existing sort options: + +`nested_path`:: + Defines the on what nested object to sort. The actual + sort field must be a direct field inside this nested object. The default + is to use the most immediate inherited nested object from the sort + field. 
+ +`nested_filter`:: + A filter the inner objects inside the nested path + should match with in order for its field values to be taken into account + by sorting. Common case is to repeat the query / filter inside the + nested filter or query. By default no `nested_filter` is active. + +===== Nested sorting example + +In the below example `offer` is a field of type `nested`. Because +`offer` is the closest inherited nested field, it is picked as +`nested_path`. Only the inner objects that have color blue will +participate in sorting. + +[source,js] +-------------------------------------------------- +curl -XPOST 'localhost:9200/_search' -d '{ + "query" : { + ... + }, + "sort" : [ + { + "offer.price" : { + "mode" : "avg", + "order" : "asc", + "nested_filter" : { + "term" : { "offer.color" : "blue" } + } + } + } + ] +}' +-------------------------------------------------- + +Since version `0.90.1` nested sorting is also support when sorting by +scripts and sorting by geo distance. + +==== Missing Values + +Numeric fields support specific handling for missing fields in a doc. +The `missing` value can be `_last`, `_first`, or a custom value (that +will be used for missing docs as the sort value). For example: + +[source,js] +-------------------------------------------------- +{ + "sort" : [ + { "price" : {"missing" : "_last"} }, + ], + "query" : { + "term" : { "user" : "kimchy" } + } +} +-------------------------------------------------- + +Note: from version `0.90.1` if a nested inner object doesn't match with +the `nested_filter` then a missing value is used. + +==== Ignoring Unmapped Fields + +By default, the search request will fail if there is no mapping +associated with a field. The `ignore_unmapped` option allows to ignore +fields that have no mapping and not sort by them. Here is an example of +how it can be used: + +[source,js] +-------------------------------------------------- +{ + "sort" : [ + { "price" : {"ignore_unmapped" : true} }, + ], + "query" : { + "term" : { "user" : "kimchy" } + } +} +-------------------------------------------------- + +==== Geo Distance Sorting + +Allow to sort by `_geo_distance`. Here is an example: + +[source,js] +-------------------------------------------------- +{ + "sort" : [ + { + "_geo_distance" : { + "pin.location" : [-70, 40], + "order" : "asc", + "unit" : "km" + } + } + ], + "query" : { + "term" : { "user" : "kimchy" } + } +} +-------------------------------------------------- + +Note: the geo distance sorting supports `sort_mode` options: `min`, +`max` and `avg`. + +The following formats are supported in providing the coordinates: + +===== Lat Lon as Properties + +[source,js] +-------------------------------------------------- +{ + "sort" : [ + { + "_geo_distance" : { + "pin.location" : { + "lat" : 40, + "lon" : -70 + }, + "order" : "asc", + "unit" : "km" + } + } + ], + "query" : { + "term" : { "user" : "kimchy" } + } +} +-------------------------------------------------- + +===== Lat Lon as String + +Format in `lat,lon`. 
+ +[source,js] +-------------------------------------------------- +{ + "sort" : [ + { + "_geo_distance" : { + "pin.location" : "-70,40", + "order" : "asc", + "unit" : "km" + } + } + ], + "query" : { + "term" : { "user" : "kimchy" } + } +} +-------------------------------------------------- + +===== Geohash + +[source,js] +-------------------------------------------------- +{ + "sort" : [ + { + "_geo_distance" : { + "pin.location" : "drm3btev3e86", + "order" : "asc", + "unit" : "km" + } + } + ], + "query" : { + "term" : { "user" : "kimchy" } + } +} +-------------------------------------------------- + +===== Lat Lon as Array + +Format in `[lon, lat]`, note, the order of lon/lat here in order to +conform with http://geojson.org/[GeoJSON]. + +[source,js] +-------------------------------------------------- +{ + "sort" : [ + { + "_geo_distance" : { + "pin.location" : [-70, 40], + "order" : "asc", + "unit" : "km" + } + } + ], + "query" : { + "term" : { "user" : "kimchy" } + } +} +-------------------------------------------------- + +==== Script Based Sorting + +Allow to sort based on custom scripts, here is an example: + +[source,js] +-------------------------------------------------- +{ + "query" : { + .... + }, + "sort" : { + "_script" : { + "script" : "doc['field_name'].value * factor", + "type" : "number", + "params" : { + "factor" : 1.1 + }, + "order" : "asc" + } + } +} +-------------------------------------------------- + +Note, it is recommended, for single custom based script based sorting, +to use `custom_score` query instead as sorting based on score is faster. + +==== Track Scores + +When sorting on a field, scores are not computed. By setting +`track_scores` to true, scores will still be computed and tracked. + +[source,js] +-------------------------------------------------- +{ + "track_scores": true, + "sort" : [ + { "post_date" : {"reverse" : true} }, + { "name" : "desc" }, + { "age" : "desc" } + ], + "query" : { + "term" : { "user" : "kimchy" } + } +} +-------------------------------------------------- + +==== Memory Considerations + +When sorting, the relevant sorted field values are loaded into memory. +This means that per shard, there should be enough memory to contain +them. For string based types, the field sorted on should not be analyzed +/ tokenized. For numeric types, if possible, it is recommended to +explicitly set the type to six_hun types (like `short`, `integer` and +`float`). diff --git a/docs/reference/search/request/version.asciidoc b/docs/reference/search/request/version.asciidoc new file mode 100644 index 00000000000..3b2329a828a --- /dev/null +++ b/docs/reference/search/request/version.asciidoc @@ -0,0 +1,14 @@ +[[search-request-version]] +=== Version + +Returns a version for each search hit. + +[source,js] +-------------------------------------------------- +{ + "version": true, + "query" : { + "term" : { "user" : "kimchy" } + } +} +-------------------------------------------------- diff --git a/docs/reference/search/search.asciidoc b/docs/reference/search/search.asciidoc new file mode 100644 index 00000000000..745b1cf554a --- /dev/null +++ b/docs/reference/search/search.asciidoc @@ -0,0 +1,52 @@ +[[search-search]] +== Search + +The search API allows to execute a search query and get back search hits +that match the query. The query can either be provided using a simple +<>, or using a +<>. 
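+
+For reference, the request body form of the `q=user:kimchy` query
+string used in the examples below is sketched here (see
+<<search-request-body,request body search>> for the full set of body
+options):
+
+[source,js]
+--------------------------------------------------
+$ curl -XGET 'http://localhost:9200/twitter/_search' -d '{
+    "query" : {
+        "term" : { "user" : "kimchy" }
+    }
+}
+'
+--------------------------------------------------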
+ +["float",id="search-multi-index-type"] +=== Multi-Index, Multi-Type + +All search APIs can be applied across multiple types within an index, and +across multiple indices with support for the +<>. For +example, we can search on all documents across all types within the +twitter index: + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/twitter/_search?q=user:kimchy' +-------------------------------------------------- + +We can also search within specific types: + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/twitter/tweet,user/_search?q=user:kimchy' +-------------------------------------------------- + +We can also search all tweets with a certain tag across several indices +(for example, when each user has his own index): + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/kimchy,elasticsearch/tweet/_search?q=tag:wow' +-------------------------------------------------- + +Or we can search all tweets across all available indices using `_all` +placeholder: + +[source,js] +-------------------------------------------------- +$ curl - XGET 'http://localhost:9200/_all/tweet/_search?q=tag:wow' +-------------------------------------------------- + +Or even search across all indices and all types: + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/_search?q=tag:wow' +-------------------------------------------------- + diff --git a/docs/reference/search/suggesters.asciidoc b/docs/reference/search/suggesters.asciidoc new file mode 100644 index 00000000000..2b261344aa6 --- /dev/null +++ b/docs/reference/search/suggesters.asciidoc @@ -0,0 +1,275 @@ +[[search-suggesters]] +== Suggesters + +The suggest feature suggests similar looking terms based on a provided +text by using a suggester. The suggest feature is available from version +`0.90.0.Beta1`. Parts of the suggest feature are still under +development. + +The suggest request part is either defined alongside the query part in a +`_search` request or via the REST `_suggest` endpoint. + +[source,js] +-------------------------------------------------- +curl -s -XPOST 'localhost:9200/_search' -d '{ + "query" : { + ... + }, + "suggest" : { + ... + } +}' +-------------------------------------------------- + +Suggest requests executed against the `_suggest` endpoint should omit +the surrounding `suggest` element which is only used if the suggest +request is part of a search. + +[source,js] +-------------------------------------------------- +curl -XPOST 'localhost:9200/_suggest' -d '{ + "my-suggestion" : { + "text" : "the amsterdma meetpu", + "term" : { + "field" : "body" + } + } +}' +-------------------------------------------------- + +Several suggestions can be specified per request. Each suggestion is +identified with an arbitrary name. In the example below two suggestions +are requested. Both `my-suggest-1` and `my-suggest-2` suggestions use +the `term` suggester, but have a different `text`. + +[source,js] +-------------------------------------------------- +"suggest" : { + "my-suggest-1" : { + "text" : "the amsterdma meetpu", + "term" : { + "field" : "body" + } + }, + "my-suggest-2" : { + "text" : "the rottredam meetpu", + "term" : { + "field" : "title", + } + } +} +-------------------------------------------------- + +The below suggest response example includes the suggestion response for +`my-suggest-1` and `my-suggest-2`. 
Each suggestion part contains +entries. Each entry is effectively a token from the suggest text and +contains the suggestion entry text, the original start offset and length +in the suggest text and if found an arbitrary number of options. + +[source,js] +-------------------------------------------------- +{ + ... + "suggest": { + "my-suggest-1": [ + { + "text" : "amsterdma", + "offset": 4, + "length": 9, + "options": [ + ... + ] + }, + ... + ], + "my-suggest-2" : [ + ... + ] + } + ... +} +-------------------------------------------------- + +Each options array contains an option object that includes the +suggested text, its document frequency and score compared to the suggest +entry text. The meaning of the score depends on the used suggester. The +term suggester's score is based on the edit distance. + +[source,js] +-------------------------------------------------- +"options": [ + { + "text": "amsterdam", + "freq": 77, + "score": 0.8888889 + }, + ... +] +-------------------------------------------------- + +[float] +=== Global suggest text + +To avoid repetition of the suggest text, it is possible to define a +global text. In the example below the suggest text is defined globally +and applies to the `my-suggest-1` and `my-suggest-2` suggestions. + +[source,js] +-------------------------------------------------- +"suggest" : { + "text" : "the amsterdma meetpu" + "my-suggest-1" : { + "term" : { + "field" : "title" + } + }, + "my-suggest-2" : { + "term" : { + "field" : "body" + } + } +} +-------------------------------------------------- + +The suggest text can in the above example also be specified as +suggestion specific option. The suggest text specified on suggestion +level override the suggest text on the global level. + +[float] +=== Other suggest example. + +In the below example we request suggestions for the following suggest +text: `devloping distibutd saerch engies` on the `title` field with a +maximum of 3 suggestions per term inside the suggest text. Note that in +this example we use the `count` search type. This isn't required, but a +nice optimization. The suggestions are gather in the `query` phase and +in the case that we only care about suggestions (so no hits) we don't +need to execute the `fetch` phase. + +[source,js] +-------------------------------------------------- +curl -s -XPOST 'localhost:9200/_search?search_type=count' -d '{ + "suggest" : { + "my-title-suggestions-1" : { + "text" : "devloping distibutd saerch engies", + "term" : { + "size" : 3, + "field" : "title" + } + } + } +}' +-------------------------------------------------- + +The above request could yield the response as stated in the code example +below. As you can see if we take the first suggested options of each +suggestion entry we get `developing distributed search engines` as +result. + +[source,js] +-------------------------------------------------- +{ + ... 
+ "suggest": { + "my-title-suggestions-1": [ + { + "text": "devloping", + "offset": 0, + "length": 9, + "options": [ + { + "text": "developing", + "freq": 77, + "score": 0.8888889 + }, + { + "text": "deloping", + "freq": 1, + "score": 0.875 + }, + { + "text": "deploying", + "freq": 2, + "score": 0.7777778 + } + ] + }, + { + "text": "distibutd", + "offset": 10, + "length": 9, + "options": [ + { + "text": "distributed", + "freq": 217, + "score": 0.7777778 + }, + { + "text": "disributed", + "freq": 1, + "score": 0.7777778 + }, + { + "text": "distribute", + "freq": 1, + "score": 0.7777778 + } + ] + }, + { + "text": "saerch", + "offset": 20, + "length": 6, + "options": [ + { + "text": "search", + "freq": 1038, + "score": 0.8333333 + }, + { + "text": "smerch", + "freq": 3, + "score": 0.8333333 + }, + { + "text": "serch", + "freq": 2, + "score": 0.8 + } + ] + }, + { + "text": "engies", + "offset": 27, + "length": 6, + "options": [ + { + "text": "engines", + "freq": 568, + "score": 0.8333333 + }, + { + "text": "engles", + "freq": 3, + "score": 0.8333333 + }, + { + "text": "eggies", + "freq": 1, + "score": 0.8333333 + } + ] + } + ] + } + ... +} +-------------------------------------------------- + +include::suggesters/term-suggest.asciidoc[] + +include::suggesters/phrase-suggest.asciidoc[] + +include::suggesters/completion-suggest.asciidoc[] + + diff --git a/docs/reference/search/suggesters/completion-suggest.asciidoc b/docs/reference/search/suggesters/completion-suggest.asciidoc new file mode 100644 index 00000000000..98dd1768732 --- /dev/null +++ b/docs/reference/search/suggesters/completion-suggest.asciidoc @@ -0,0 +1,167 @@ +[[search-suggesters-completion]] +=== Completion Suggester + +NOTE: In order to understand the format of suggestions, please +read the <> page first. + +The `completion` suggester is a so-called prefix suggester. It does not +do spell correction like the `term` or `phrase` suggesters but allows +basic `auto-complete` functionality. + +IMPORTANT: This feature is marked as experimental. This means, that +the API is not considered stable, and that you might need to reindex +your data after an upgrade in order to get suggestions up and running +again. Please keep this in mind. + +==== Why another suggester? Why not prefix queries? + +The first question which comes to mind when reading about a prefix +suggestion is, why you should use it all, if you have prefix queries +already. The answer is simple: Prefix suggestions are fast. + +The data structures are internally backed by Lucenes +`AnalyzingSuggester`, which uses FSTs to execute suggestions. Usually +these data structures are costly to create, stored in-memory and need to +be rebuilt every now and then to reflect changes in your indexed +documents. The `completion` suggester circumvents this by storing the +FST as part of your index during index time. This allows for really fast +loads and executions. + +==== Mapping + +In order to use this feature, you have to specify a special mapping for +this field, which enables the special storage of the field. 
+
+[source,js]
+--------------------------------------------------
+curl -X PUT localhost:9200/music
+curl -X PUT localhost:9200/music/song/_mapping -d '{
+  "song" : {
+    "properties" : {
+      "name" : { "type" : "string" },
+      "suggest" : { "type" : "completion",
+                    "index_analyzer" : "simple",
+                    "search_analyzer" : "simple",
+                    "payloads" : true
+      }
+    }
+  }
+}'
+--------------------------------------------------
+
+Mapping supports the following parameters:
+
+`index_analyzer`::
+    The index analyzer to use, defaults to `simple`.
+
+`search_analyzer`::
+    The search analyzer to use, defaults to `simple`.
+    In case you are wondering why we did not opt for the `standard`
+    analyzer: We try to have easy to understand behaviour here, and if you
+    index the field content `At the Drive-in`, you will not get any
+    suggestions for `a`, nor for `d` (the first non stopword).
+
+`payloads`::
+    Enables the storing of payloads, defaults to `false`.
+
+`preserve_separators`::
+    Preserves the separators, defaults to `true`.
+    If disabled, you could find a field starting with `Foo Fighters`, if you
+    suggest for `foof`.
+
+`preserve_position_increments`::
+    Enables position increments, defaults
+    to `true`. If disabled and using a stopwords analyzer, you could get a
+    field starting with `The Beatles`, if you suggest for `b`. *Note*: You
+    could also achieve this by indexing two inputs, `Beatles` and
+    `The Beatles`; there is no need to change a simple analyzer if you are
+    able to enrich your data.
+
+==== Indexing
+
+[source,js]
+--------------------------------------------------
+curl -X PUT 'localhost:9200/music/song/1?refresh=true' -d '{
+    "name" : "Nevermind",
+    "suggest" : {
+        "input": [ "Nevermind", "Nirvana" ],
+        "output": "Nirvana - Nevermind",
+        "payload" : { "artistId" : 2321 },
+        "weight" : 34
+    }
+}'
+--------------------------------------------------
+
+The following parameters are supported:
+
+`input`::
+    The input to store, this can be an array of strings or just
+    a string. This field is mandatory.
+
+`output`::
+    The string to return, if a suggestion matches. This is very
+    useful to normalize outputs (i.e. have them always in the format
+    `artist - songname`). This is optional.
+
+`payload`::
+    An arbitrary JSON object, which is simply returned in the
+    suggest option. You could store data like the id of a document, in order
+    to load it from elasticsearch without executing another search (which
+    might not yield any results, if `input` and `output` differ strongly).
+
+`weight`::
+    A positive integer, which defines a weight and allows you to
+    rank your suggestions. This field is optional.
+
+NOTE: Even though you are losing most of the features of the
+completion suggester, you can opt in for the shortest form, which even
+allows you to use it inside of `multi_field`. But keep in mind that you
+will not be able to use several inputs, an output, payloads or weights.
+
+[source,js]
+--------------------------------------------------
+{
+  "suggest" : "Nirvana"
+}
+--------------------------------------------------
+
+==== Querying
+
+Suggesting works as usual, except that you have to specify the suggest
+type as `completion`.
+
+[source,js]
+--------------------------------------------------
+curl -X POST 'localhost:9200/music/_suggest?pretty' -d '{
+    "song-suggest" : {
+        "text" : "n",
+        "completion" : {
+            "field" : "suggest"
+        }
+    }
+}'
+
+{
+  "_shards" : {
+    "total" : 5,
+    "successful" : 5,
+    "failed" : 0
+  },
+  "song-suggest" : [ {
+    "text" : "n",
+    "offset" : 0,
+    "length" : 1,
+    "options" : [ {
+      "text" : "Nirvana - Nevermind",
+      "score" : 34.0, "payload" : {"artistId":2321}
+    } ]
+  } ]
+}
+--------------------------------------------------
+
+As you can see, the payload is included in the response, if configured
+appropriately. If you configured a weight for a suggestion, this weight
+is used as the `score`. Also, the `text` field uses the `output` of your
+indexed suggestion, if configured, otherwise the matched part of the
+`input` field.
diff --git a/docs/reference/search/suggesters/phrase-suggest.asciidoc b/docs/reference/search/suggesters/phrase-suggest.asciidoc
new file mode 100644
index 00000000000..1651bc216ea
--- /dev/null
+++ b/docs/reference/search/suggesters/phrase-suggest.asciidoc
@@ -0,0 +1,302 @@
+[[search-suggesters-phrase]]
+=== Phrase Suggester
+
+NOTE: In order to understand the format of suggestions, please
+read the <> page first.
+
+The `term` suggester provides a very convenient API to access word
+alternatives on a token basis within a certain string distance. The API
+allows accessing each token in the stream individually while
+suggest-selection is left to the API consumer. Yet, often pre-selected
+suggestions are required in order to present them to the end-user. The
+`phrase` suggester adds additional logic on top of the `term` suggester
+to select entire corrected phrases instead of individual tokens, weighted
+based on `ngram-language` models. In practice this suggester will be
+able to make better decisions about which tokens to pick based on
+co-occurrence and frequencies.
+
+==== API Example
+
+The `phrase` request is defined alongside the query part in the JSON
+request:
+
+[source,js]
+--------------------------------------------------
+curl -XPOST 'localhost:9200/_search' -d '{
+  "suggest" : {
+    "text" : "Xor the Got-Jewel",
+    "simple_phrase" : {
+      "phrase" : {
+        "analyzer" : "body",
+        "field" : "bigram",
+        "size" : 1,
+        "real_word_error_likelihood" : 0.95,
+        "max_errors" : 0.5,
+        "gram_size" : 2,
+        "direct_generator" : [ {
+          "field" : "body",
+          "suggest_mode" : "always",
+          "min_word_len" : 1
+        } ]
+      }
+    }
+  }
+}'
+--------------------------------------------------
+
+The response contains suggestions scored by the most likely spell
+correction first. In this case we got the expected correction
+`xorr the god jewel` first, while the second correction is less
+conservative and only one of the errors is corrected. Note, the
+request is executed with `max_errors` set to `0.5`, so 50% of the terms
+can contain misspellings (see parameter descriptions below).
+
+[source,js]
+--------------------------------------------------
+{
+  "took" : 5,
+  "timed_out" : false,
+  "_shards" : {
+    "total" : 5,
+    "successful" : 5,
+    "failed" : 0
+  },
+  "hits" : {
+    "total" : 2938,
+    "max_score" : 0.0,
+    "hits" : [ ]
+  },
+  "suggest" : {
+    "simple_phrase" : [ {
+      "text" : "Xor the Got-Jewel",
+      "offset" : 0,
+      "length" : 17,
+      "options" : [ {
+        "text" : "xorr the god jewel",
+        "score" : 0.17877324
+      }, {
+        "text" : "xor the god jewel",
+        "score" : 0.14231323
+      } ]
+    } ]
+  }
+}
+--------------------------------------------------
+
+==== Basic Phrase suggest API parameters
+
+[horizontal]
+`field`::
+    The name of the field used to do n-gram lookups for the
+    language model; the suggester will use this field to gain statistics
+    to score corrections. This field is mandatory.
+
+`gram_size`::
+    Sets the maximum size of the n-grams (shingles) in the `field`.
+    If the field doesn't contain n-grams (shingles) this should be omitted
+    or set to `1`. Note that Elasticsearch tries to detect the gram size
+    based on the specified `field`. If the field uses a `shingle` filter the
+    `gram_size` is set to the `max_shingle_size` if not explicitly set.
+
+`real_word_error_likelihood`::
+    The likelihood of a term being misspelled even if the term exists in
+    the dictionary. The default is `0.95`, corresponding to 5% of the
+    real words being misspelled.
+
+`confidence`::
+    The confidence level defines a factor applied to the
+    input phrase's score which is used as a threshold for other suggest
+    candidates. Only candidates that score higher than the threshold will be
+    included in the result. For instance, a confidence level of `1.0` will
+    only return suggestions that score higher than the input phrase. If set
+    to `0.0` the top N candidates are returned. The default is `1.0`.
+
+`max_errors`::
+    The maximum percentage of the terms that can be considered
+    misspellings in order to form a correction. This method
+    accepts a float value in the range `[0..1)` as a fraction of the actual
+    query terms, or a number `>=1` as an absolute number of query terms. The
+    default is set to `1.0`, which means that only corrections with
+    at most one misspelled term are returned.
+
+`separator`::
+    The separator that is used to separate terms in the
+    bigram field. If not set, the whitespace character is used as a
+    separator.
+
+`size`::
+    The number of candidates that are generated for each
+    individual query term. Low numbers like `3` or `5` typically produce
+    good results. Raising this can bring up terms with higher edit
+    distances. The default is `5`.
+
+`analyzer`::
+    Sets the analyzer to analyse the suggest text with.
+    Defaults to the search analyzer of the suggest field passed via `field`.
+
+`shard_size`::
+    Sets the maximum number of suggested terms to be
+    retrieved from each individual shard. During the reduce phase, only the
+    top N suggestions are returned based on the `size` option. Defaults to
+    `5`.
+
+`text`::
+    Sets the text / query to provide suggestions for.
+
+==== Smoothing Models
+
+The `phrase` suggester supports multiple smoothing models to balance
+weight between infrequent grams (grams (shingles) that do not exist in
+the index) and frequent grams (that appear at least once in the index).
+An example of selecting a model follows the list below.
+
+[horizontal]
+`stupid_backoff`::
+    A simple backoff model that backs off to lower
+    order n-gram models if the higher order count is `0` and discounts the
+    lower order n-gram model by a constant factor. The default `discount` is
+    `0.4`. Stupid Backoff is the default model.
+
+`laplace`::
+    A smoothing model that uses additive smoothing, where a
+    constant (typically `1.0` or smaller) is added to all counts to balance
+    weights. The default `alpha` is `0.5`.
+
+`linear_interpolation`::
+    A smoothing model that takes the weighted
+    mean of the unigrams, bigrams and trigrams based on user supplied
+    weights (lambdas). Linear Interpolation doesn't have any default values.
+    All parameters (`trigram_lambda`, `bigram_lambda`, `unigram_lambda`)
+    must be supplied.
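+
+A smoothing model is picked via a `smoothing` object inside the `phrase`
+suggester options. As a sketch (the `alpha` value below is only
+illustrative), the Laplace model from the list above could be configured
+like this:
+
+[source,js]
+--------------------------------------------------
+"phrase" : {
+  "field" : "bigram",
+  "smoothing" : {
+    "laplace" : {
+      "alpha" : 0.7
+    }
+  }
+}
+--------------------------------------------------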
+
+==== Candidate Generators
+
+The `phrase` suggester uses candidate generators to produce a list of
+possible terms per term in the given text. A single candidate generator
+is similar to a `term` suggester called for each individual term in the
+text. The output of the generators is subsequently scored in combination
+with the candidates from the other terms to form suggestion candidates.
+
+Currently only one type of candidate generator is supported, the
+`direct_generator`. The phrase suggest API accepts a list of generators
+under the key `direct_generator`; each of the generators in the list is
+called per term in the original text.
+
+==== Direct Generators
+
+The direct generators support the following parameters:
+
+[horizontal]
+`field`::
+    The field to fetch the candidate suggestions from. This is
+    a required option that either needs to be set globally or per
+    suggestion.
+
+`size`::
+    The maximum corrections to be returned per suggest text token.
+
+`suggest_mode`::
+    The suggest mode controls which suggestions are included, or for
+    which suggest text terms suggestions should be suggested. Three
+    possible values can be specified:
+    ** `missing`: Only suggest terms in the suggest text that aren't in the
+       index. This is the default.
+    ** `popular`: Only suggest suggestions that occur in more docs than the
+       original suggest text term.
+    ** `always`: Suggest any matching suggestions based on terms in the
+       suggest text.
+
+`max_edits`::
+    The maximum edit distance candidate suggestions can have
+    in order to be considered as a suggestion. Can only be a value between 1
+    and 2. Any other value results in a bad request error being thrown.
+    Defaults to 2.
+
+`prefix_length`::
+    The minimum number of prefix characters that must
+    match in order to be a candidate suggestion. Defaults to 1. Increasing
+    this number improves spellcheck performance. Usually misspellings don't
+    occur in the beginning of terms.
+
+`min_word_len`::
+    The minimum length a suggest text term must have in
+    order to be included. Defaults to 4.
+
+`max_inspections`::
+    A factor that is used to multiply with the
+    `shards_size` in order to inspect more candidate spell corrections on
+    the shard level. Can improve accuracy at the cost of performance.
+    Defaults to 5.
+
+`min_doc_freq`::
+    The minimal threshold in number of documents a
+    suggestion should appear in. This can be specified as an absolute number
+    or as a relative percentage of the number of documents. This can improve
+    quality by only suggesting high frequency terms. Defaults to `0f` and is
+    not enabled. If a value higher than 1 is specified then the number
+    cannot be fractional. The shard level document frequencies are used for
+    this option.
+
+`max_term_freq`::
+    The maximum threshold in number of documents a
+    suggest text token can exist in order to be included. Can be a relative
+    percentage number (e.g. `0.4`) or an absolute number to represent
+    document frequencies. If a value higher than 1 is specified then the
+    number cannot be fractional. Defaults to `0.01f`. This can be used to
+    exclude high frequency terms from being spellchecked. High frequency
+    terms are usually spelled correctly; on top of that, excluding them
+    also improves the spellcheck performance. The shard level document
+    frequencies are used for this option.
+
+`pre_filter`::
+    A filter (analyzer) that is applied to each of the
+    tokens passed to this candidate generator. This filter is applied to the
+    original token before candidates are generated.
+
+`post_filter`::
+    A filter (analyzer) that is applied to each of the
+    generated tokens before they are passed to the actual phrase scorer.
+
+The following example shows a `phrase` suggest call with two generators:
+the first one uses a field containing ordinary indexed terms, and the
+second one uses a field containing terms indexed with a `reverse` filter
+(tokens are indexed in reverse order). This is used to overcome the limitation
+of the direct generators to require a constant prefix to provide
+high-performance suggestions. The `pre_filter` and `post_filter` options
+accept ordinary analyzer names.
+
+[source,js]
+--------------------------------------------------
+curl -s -XPOST 'localhost:9200/_search' -d '{
+  "suggest" : {
+    "text" : "Xor the Got-Jewel",
+    "simple_phrase" : {
+      "phrase" : {
+        "analyzer" : "body",
+        "field" : "bigram",
+        "size" : 4,
+        "real_word_error_likelihood" : 0.95,
+        "confidence" : 2.0,
+        "gram_size" : 2,
+        "direct_generator" : [ {
+          "field" : "body",
+          "suggest_mode" : "always",
+          "min_word_len" : 1
+        }, {
+          "field" : "reverse",
+          "suggest_mode" : "always",
+          "min_word_len" : 1,
+          "pre_filter" : "reverse",
+          "post_filter" : "reverse"
+        } ]
+      }
+    }
+  }
+}'
+--------------------------------------------------
+
+`pre_filter` and `post_filter` can also be used to inject synonyms after
+candidates are generated. For instance, for the query `captain usq` we
+might generate a candidate `usa` for the term `usq`, which is a synonym
+for `america`. This allows us to present `captain america` to the user if
+this phrase scores high enough.
diff --git a/docs/reference/search/suggesters/term-suggest.asciidoc b/docs/reference/search/suggesters/term-suggest.asciidoc
new file mode 100644
index 00000000000..e958e14a844
--- /dev/null
+++ b/docs/reference/search/suggesters/term-suggest.asciidoc
@@ -0,0 +1,110 @@
+[[search-suggesters-term]]
+=== Term suggester
+
+NOTE: In order to understand the format of suggestions, please
+read the <> page first.
+
+The `term` suggester suggests terms based on edit distance. The provided
+suggest text is analyzed before terms are suggested. The suggested terms
+are provided per analyzed suggest text token. The `term` suggester
+doesn't take the query into account that is part of the request.
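+
+As a quick sketch, reusing the `body` field and the misspelled suggest
+text from the examples above, a standalone `term` suggest request could
+look like this (the options used here are described below):
+
+[source,js]
+--------------------------------------------------
+curl -XPOST 'localhost:9200/_suggest' -d '{
+  "my-term-suggestion" : {
+    "text" : "the amsterdma meetpu",
+    "term" : {
+      "field" : "body",
+      "size" : 3,
+      "sort" : "score"
+    }
+  }
+}'
+--------------------------------------------------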
+
+==== Common suggest options:
+
+[horizontal]
+`text`::
+    The suggest text. The suggest text is a required option that
+    needs to be set globally or per suggestion.
+
+`field`::
+    The field to fetch the candidate suggestions from. This is
+    a required option that either needs to be set globally or per
+    suggestion.
+
+`analyzer`::
+    The analyzer to analyse the suggest text with. Defaults
+    to the search analyzer of the suggest field.
+
+`size`::
+    The maximum corrections to be returned per suggest text
+    token.
+
+`sort`::
+    Defines how suggestions should be sorted per suggest text
+    term. Two possible values:
++
+    ** `score`: Sort by score first, then document frequency and
+       then the term itself.
+    ** `frequency`: Sort by document frequency first, then similarity
+       score and then the term itself.
++
+`suggest_mode`::
+    The suggest mode controls which suggestions are included, or for
+    which suggest text terms suggestions should be suggested. Three
+    possible values can be specified:
++
+    ** `missing`: Only suggest terms in the suggest text that aren't in
+       the index. This is the default.
+    ** `popular`: Only suggest suggestions that occur in more docs than
+       the original suggest text term.
+    ** `always`: Suggest any matching suggestions based on terms in the
+       suggest text.
+
+==== Other term suggest options:
+
+[horizontal]
+`lowercase_terms`::
+    Lower cases the suggest text terms after text analysis.
+
+`max_edits`::
+    The maximum edit distance candidate suggestions can
+    have in order to be considered as a suggestion. Can only be a value
+    between 1 and 2. Any other value results in a bad request error being
+    thrown. Defaults to 2.
+
+`prefix_len`::
+    The minimum number of prefix characters that must
+    match in order to be a candidate suggestion. Defaults to 1. Increasing
+    this number improves spellcheck performance. Usually misspellings don't
+    occur in the beginning of terms.
+
+`min_word_len`::
+    The minimum length a suggest text term must have in
+    order to be included. Defaults to 4.
+
+`shard_size`::
+    Sets the maximum number of suggestions to be retrieved
+    from each individual shard. During the reduce phase only the top N
+    suggestions are returned based on the `size` option. Defaults to the
+    `size` option. Setting this to a value higher than the `size` can be
+    useful in order to get a more accurate document frequency for spelling
+    corrections at the cost of performance. Due to the fact that terms are
+    partitioned amongst shards, the shard level document frequencies of
+    spelling corrections may not be precise. Increasing this will make these
+    document frequencies more precise.
+
+`max_inspections`::
+    A factor that is used to multiply with the
+    `shards_size` in order to inspect more candidate spell corrections on
+    the shard level. Can improve accuracy at the cost of performance.
+    Defaults to 5.
+
+`min_doc_freq`::
+    The minimal threshold in number of documents a
+    suggestion should appear in. This can be specified as an absolute number
+    or as a relative percentage of the number of documents. This can improve
+    quality by only suggesting high frequency terms. Defaults to `0f` and is
+    not enabled. If a value higher than 1 is specified then the number
+    cannot be fractional. The shard level document frequencies are used for
+    this option.
+
+`max_term_freq`::
+    The maximum threshold in number of documents a
+    suggest text token can exist in order to be included. Can be a relative
+    percentage number (e.g. `0.4`) or an absolute number to represent
+    document frequencies. If a value higher than 1 is specified then the
+    number cannot be fractional. Defaults to `0.01f`. This can be used to
+    exclude high frequency terms from being spellchecked. High frequency
+    terms are usually spelled correctly; on top of that, excluding them
+    also improves the spellcheck performance. The shard level document
+    frequencies are used for this option.
diff --git a/docs/reference/search/uri-request.asciidoc b/docs/reference/search/uri-request.asciidoc
new file mode 100644
index 00000000000..1ad2e911d2b
--- /dev/null
+++ b/docs/reference/search/uri-request.asciidoc
@@ -0,0 +1,99 @@
+[[search-uri-request]]
+== URI Search
+
+A search request can be executed purely using a URI by providing request
+parameters.
Not all search options are exposed when executing a search +using this mode, but it can be handy for quick "curl tests". Here is an +example: + +[source,js] +-------------------------------------------------- +$ curl -XGET 'http://localhost:9200/twitter/tweet/_search?q=user:kimchy' +-------------------------------------------------- + +And here is a sample response: + +[source,js] +-------------------------------------------------- +{ + "_shards":{ + "total" : 5, + "successful" : 5, + "failed" : 0 + }, + "hits":{ + "total" : 1, + "hits" : [ + { + "_index" : "twitter", + "_type" : "tweet", + "_id" : "1", + "_source" : { + "user" : "kimchy", + "postDate" : "2009-11-15T14:12:12", + "message" : "trying out Elastic Search" + } + } + ] + } +} +-------------------------------------------------- + +[float] +=== Parameters + +The parameters allowed in the URI are: + +[cols="<,<",options="header",] +|======================================================================= +|Name |Description +|`q` |The query string (maps to the `query_string` query, see +<> for more details). + +|`df` |The default field to use when no field prefix is defined within the +query. + +|`analyzer` |The analyzer name to be used when analyzing the query string. + +|`default_operator` |The default operator to be used, can be `AND` or +`OR`. Defaults to `OR`. + +|`explain` |For each hit, contain an explanation of how scoring of the +hits was computed. + +|`fields` |The selective fields of the document to return for each hit +(either retrieved from the index if stored, or from the `_source` if +not), comma delimited. Defaults to the internal `_source` field. Not +specifying any value will cause no fields to return. + +|`sort` |Sorting to perform. Can either be in the form of `fieldName`, or +`fieldName:asc`/@fieldName:desc@. The fieldName can either be an actual +field within the document, or the special `_score` name to indicate +sorting based on scores. There can be several `sort` parameters (order +is important). + +|`track_scores` |When sorting, set to `true` in order to still track +scores and return them as part of each hit. + +|`timeout` |A search timeout, bounding the search request to be executed +within the specified time value and bail with the hits accumulated up to +that point when expired. Defaults to no timeout. + +|`from` |The starting from index of the hits to return. Defaults to `0`. + +|`size` |The number of hits to return. Defaults to `10`. + +|`search_type` |The type of the search operation to perform. Can be +`dfs_query_then_fetch`, `dfs_query_and_fetch`, `query_then_fetch`, +`query_and_fetch`, `count`, `scan`. Defaults to `query_then_fetch`. See +<> for +more details on the different types of search that can be performed. + +|`lowercase_expanded_terms` |Should terms be automatically lowercased or +not. Defaults to `true`. + +|`analyze_wildcard` |Should wildcard and prefix queries be analyzed or +not. Defaults to `false`. +|======================================================================= + diff --git a/docs/reference/search/validate.asciidoc b/docs/reference/search/validate.asciidoc new file mode 100644 index 00000000000..371c99ecd87 --- /dev/null +++ b/docs/reference/search/validate.asciidoc @@ -0,0 +1,72 @@ +[[search-validate]] +== Validate API + +The validate API allows a user to validate a potentially expensive query +without executing it. 
The following example shows how it can be used: + +[source,js] +-------------------------------------------------- +curl -XPUT 'http://localhost:9200/twitter/tweet/1' -d '{ + "user" : "kimchy", + "post_date" : "2009-11-15T14:12:12", + "message" : "trying out Elastic Search" +}' +-------------------------------------------------- + +When the query is valid, the response contains `valid:true`: + +[source,js] +-------------------------------------------------- +curl -XGET 'http://localhost:9200/twitter/_validate/query?q=user:foo' +{"valid":true,"_shards":{"total":1,"successful":1,"failed":0}} +-------------------------------------------------- + +Or, with a request body: + +[source,js] +-------------------------------------------------- +curl -XGET 'http://localhost:9200/twitter/tweet/_validate/query' -d '{ + "filtered" : { + "query" : { + "query_string" : { + "query" : "*:*" + } + }, + "filter" : { + "term" : { "user" : "kimchy" } + } + } +}' +{"valid":true,"_shards":{"total":1,"successful":1,"failed":0}} +-------------------------------------------------- + +If the query is invalid, `valid` will be `false`. Here the query is +invalid because ElasticSearch knows the post_date field should be a date +due to dynamic mapping, and 'foo' does not correctly parse into a date: + +[source,js] +-------------------------------------------------- +curl -XGET 'http://localhost:9200/twitter/tweet/_validate/query?q=post_date:foo' +{"valid":false,"_shards":{"total":1,"successful":1,"failed":0}} +-------------------------------------------------- + +An `explain` parameter can be specified to get more detailed information +about why a query failed: + +[source,js] +-------------------------------------------------- +curl -XGET 'http://localhost:9200/twitter/tweet/_validate/query?q=post_date:foo&pretty=true&explain=true' +{ + "valid" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "failed" : 0 + }, + "explanations" : [ { + "index" : "twitter", + "valid" : false, + "error" : "org.elasticsearch.index.query.QueryParsingException: [twitter] Failed to parse; org.elasticsearch.ElasticSearchParseException: failed to parse date field [foo], tried both date format [dateOptionalTime], and timestamp number; java.lang.IllegalArgumentException: Invalid format: \"foo\"" + } ] +} +-------------------------------------------------- diff --git a/docs/reference/setup.asciidoc b/docs/reference/setup.asciidoc new file mode 100644 index 00000000000..50012dc5764 --- /dev/null +++ b/docs/reference/setup.asciidoc @@ -0,0 +1,57 @@ +[[setup]] += Setup + +[partintro] +-- +This section includes information on how to setup *elasticsearch* and +get it running. If you haven't already, http://www.elasticsearch.org/download[download] it, and +then check the <> docs. + +[[setup-installation]] +[float] +== Installation + +After link:/download[downloading] the latest release and extracting it, +*elasticsearch* can be started using: + +[source,sh] +-------------------------------------------------- +$ bin/elasticsearch +-------------------------------------------------- + +Under *nix system, the command will start the process in the background. +To run it in the foreground, add the `-f` switch to it: + +[source,sh] +-------------------------------------------------- +$ bin/elasticsearch -f +-------------------------------------------------- + +ElasticSearch is built using Java, and requires at least +http://java.sun.com/javase/downloads/index.jsp[Java 6] in order to run. 
+The version of Java that will be used can be set by setting the +`JAVA_HOME` environment variable. + +.*NIX +************************************************************************* +There are added features when using the `elasticsearch` shell script. +The first, which was explained earlier, is the ability to easily run the +process either in the foreground or the background. + +Another feature is the ability to pass `-X` and `-D` directly to the +script. When set, both override anything set using either `JAVA_OPTS` or +`ES_JAVA_OPTS`. For example: + +[source,sh] +-------------------------------------------------- +$ bin/elasticsearch -f -Xmx2g -Xms2g -Des.index.storage.type=memory +-------------------------------------------------- +************************************************************************* +-- + +include::setup/configuration.asciidoc[] + +include::setup/as-a-service.asciidoc[] + +include::setup/dir-layout.asciidoc[] + diff --git a/docs/reference/setup/as-a-service.asciidoc b/docs/reference/setup/as-a-service.asciidoc new file mode 100644 index 00000000000..d10c18d85a2 --- /dev/null +++ b/docs/reference/setup/as-a-service.asciidoc @@ -0,0 +1,32 @@ +[[setup-service]] +== Running As a Service + +It should be simple to wrap the `elasticsearch` script in an `init.d` or +the like. But, elasticsearch also supports running it using the +https://github.com/elasticsearch/elasticsearch-servicewrapper[Java Service Wrapper]. + +ElasticSearch can be run as a service using the `elasticsearch` script +located under `bin/service` location. The repo for it is located +http://github.com/elasticsearch/elasticsearch-servicewrapper[here]. The +script accepts a single parameter with the following values: + +[horizontal] +`console`:: Run the elasticsearch in the foreground. + +`start`:: Run elasticsearch in the background. + +`stop`:: Stops elasticsearch if its running. + +`install`:: Install elasticsearch to run on system startup (init.d / service). + +`remove`:: Removes elasticsearch from system startup (init.d / service). + +The service uses Java Service Wrapper which is a small native wrapper +around the Java virtual machine which also monitors it. + +Note, passing JVM level configuration (such as -X parameters) should be +set within the `elasticsearch.conf` file. + +The `ES_MIN_MEM` and `ES_MAX_MEM` environment variables to set the +minimum and maximum memory allocation for the JVM (set in mega bytes). +It defaults to `256` and `1024` respectively. diff --git a/docs/reference/setup/configuration.asciidoc b/docs/reference/setup/configuration.asciidoc new file mode 100644 index 00000000000..0547c629bdc --- /dev/null +++ b/docs/reference/setup/configuration.asciidoc @@ -0,0 +1,223 @@ +[[setup-configuration]] +== Configuration + +[float] +=== Environment Variables + +Within the scripts, ElasticSearch comes with built in `JAVA_OPTS` passed +to the JVM started. The most important setting for that is the `-Xmx` to +control the maximum allowed memory for the process, and `-Xms` to +control the minimum allocated memory for the process (_in general, the +more memory allocated to the process, the better_). + +Most times it is better to leave the default `JAVA_OPTS` as they are, +and use the `ES_JAVA_OPTS` environment variable in order to set / change +JVM settings or arguments. + +The `ES_HEAP_SIZE` environment variable allows to set the heap memory +that will be allocated to elasticsearch java process. 
It will allocate +the same value to both min and max values, though those can be set +explicitly (not recommended) by setting `ES_MIN_MEM` (defaults to +`256m`), and `ES_MAX_MEM` (defaults to `1gb`). + +It is recommended to set the min and max memory to the same value, and +enable <>. + +[float] +=== System Configuration + +[float] +==== File Descriptors + +Make sure to increase the number of open files descriptors on the +machine (or for the user running elasticsearch). Setting it to 32k or +even 64k is recommended. + +In order to test how many open files the process can open, start it with +`-Des.max-open-files` set to `true`. This will print the number of open +files the process can open on startup. + +["float",id="setup-configuration-memory"] +==== Memory Settings + +There is an option to use +http://opengroup.org/onlinepubs/007908799/xsh/mlockall.html[mlockall] to +try to lock the process address space so it won't be swapped. For this +to work, the `bootstrap.mlockall` should be set to `true` and it is +recommended to set both the min and max memory allocation to be the +same. Note: This option is only available on Linux/Unix operating +systems. + +In order to see if this works or not, set the `common.jna` logging to +DEBUG level. A solution to "Unknown mlockall error 0" can be to set +`ulimit -l unlimited`. + +Note, `mlockall` might cause the JVM or shell +session to exit if it fails to allocate the memory (because not enough +memory is available on the machine). + +[float] +=== Elasticsearch Settings + +*elasticsearch* configuration files can be found under `ES_HOME/config` +folder. The folder comes with two files, the `elasticsearch.yml` for +configuring ElasticSearch different +<>, and `logging.yml` for +configuring the ElasticSearch logging. + +The configuration format is http://www.yaml.org/[YAML]. Here is an +example of changing the address all network based modules will use to +bind and publish to: + +[source,js] +-------------------------------------------------- +network : + host : 10.0.0.4 +-------------------------------------------------- + +*elasticsearch* configuration files can be found under `ES_HOME/config` +folder. The folder comes with two files, the `elasticsearch.yml` for +configuring ElasticSearch different <>, and `logging.yml` +for configuring the ElasticSearch logging. + + +[float] +==== Paths + +In production use, you will almost certainly want to change paths for +data and log files: + +[source,js] +-------------------------------------------------- +path: + logs: /var/log/elasticsearch + data: /var/data/elasticsearch +-------------------------------------------------- + +[float] +==== Cluster name + +Also, don't forget to give your production cluster a name, which is used +to discover and auto-join other nodes: + +[source,js] +-------------------------------------------------- +cluster: + name: +-------------------------------------------------- + +[float] +==== Node name + +You may also want to change the default node name for each node to +something like the display hostname. By default ElasticSearch will +randomly pick a Marvel character name from a list of around 3000 names +when your node starts up. + +[source,js] +-------------------------------------------------- +node: + name: +-------------------------------------------------- + +Internally, all settings are collapsed into "namespaced" settings. For +example, the above gets collapsed into `network.host`. 
This means that +its easy to support other configuration formats, for example, +http://www.json.org[JSON]. If JSON is a preferred configuration format, +simply rename the `elasticsearch.yml` file to `elasticsearch.json` and +add: + +[float] +==== Configuration styles + +[source,js] +-------------------------------------------------- +{ + "network" : { + "host" : "10.0.0.4" + } +} +-------------------------------------------------- + +It also means that its easy to provide the settings externally either +using the `ES_JAVA_OPTS` or as parameters to the `elasticsearch` +command, for example: + +[source,js] +-------------------------------------------------- +$ elasticsearch -f -Des.network.host=10.0.0.4 +-------------------------------------------------- + +Another option is to set `es.default.` prefix instead of `es.` prefix, +which means the default setting will be used only if not explicitly set +in the configuration file. + +Another option is to use the `${...}` notation within the configuration +file which will resolve to an environment setting, for example: + +[source,js] +-------------------------------------------------- +{ + "network" : { + "host" : "${ES_NET_HOST}" + } +} +-------------------------------------------------- + +The location of the configuration file can be set externally using a +system property: + +[source,js] +-------------------------------------------------- +$ elasticsearch -f -Des.config=/path/to/config/file +-------------------------------------------------- + +[float] +=== Index Settings + +Indices created within the cluster can provide their own settings. For +example, the following creates an index with memory based storage +instead of the default file system based one (the format can be either +YAML or JSON): + +[source,js] +-------------------------------------------------- +$ curl -XPUT http://localhost:9200/kimchy/ -d \ +' +index : + store: + type: memory +' +-------------------------------------------------- + +Index level settings can be set on the node level as well, for example, +within the `elasticsearch.yml` file, the following can be set: + +[source,js] +-------------------------------------------------- +index : + store: + type: memory +-------------------------------------------------- + +This means that every index that gets created on the specific node +started with the mentioned configuration will store the index in memory +*unless the index explicitly sets it*. In other words, any index level +settings override what is set in the node configuration. Of course, the +above can also be set as a "collapsed" setting, for example: + +[source,js] +-------------------------------------------------- +$ elasticsearch -f -Des.index.store.type=memory +-------------------------------------------------- + +All of the index level configuration can be found within each +<>. + +[float] +=== Logging + +ElasticSearch uses an internal logging abstraction and comes, out of the +box, with http://logging.apache.org/log4j/[log4j]. It tries to simplify +log4j configuration by using http://www.yaml.org/[YAML] to configure it, +and the logging configuration file is `config/logging.yml` file. 
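+
+For example, log levels can be adjusted by editing entries along these
+lines in `config/logging.yml` (the entries below are only illustrative;
+the file shipped with the distribution contains the full layout):
+
+[source,js]
+--------------------------------------------------
+# root logger level and the appenders it writes to
+rootLogger: INFO, console, file
+
+logger:
+  # log action execution errors for easier debugging
+  action: DEBUG
+--------------------------------------------------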
diff --git a/docs/reference/setup/dir-layout.asciidoc b/docs/reference/setup/dir-layout.asciidoc new file mode 100644 index 00000000000..576de5f02e4 --- /dev/null +++ b/docs/reference/setup/dir-layout.asciidoc @@ -0,0 +1,45 @@ +[[setup-dir-layout]] +== Directory Layout + +The directory layout of an installation is as follows: + +[cols="<,<,<,<",options="header",] +|======================================================================= +|Type |Description |Default Location |Setting +|*home* |Home of elasticsearch installation | | `path.home` + +|*bin* |Binary scripts including `elasticsearch` to start a node | `{path.home}/bin` | + +|*conf* |Configuration files including `elasticsearch.yml` |`{path.home}/config` |`path.conf` + +|*data* |The location of the data files of each index / shard allocated +on the node. Can hold multiple locations. |`{path.home}/data`|`path.data` + +|*work* |Temporal files that are used by different nodes. |`{path.home}/work` |`path.work` + +|*logs* |Log files location |`{path.home}/logs` |`path.logs` +|======================================================================= + +The multiple data locations allows to stripe it. The striping is simple, +placing whole files in one of the locations, and deciding where to place +the file based on the value of the `index.store.distributor` setting: + +* `least_used` (default) always selects the directory with the most +available space + + * `random` selects directories at random. The probability of selecting +a particular directory is proportional to amount of available space in +this directory. + +Note, there are no multiple copies of the same data, in that, its +similar to RAID 0. Though simple, it should provide a good solution for +people that don't want to mess with RAID. Here is how it is configured: + +--------------------------------- +path.data: /mnt/first,/mnt/second +--------------------------------- + +Or the in an array format: + +---------------------------------------- +path.data: ["/mnt/first", "/mnt/second"] +--------------------------------- diff --git a/docs/reference/setup/installation.asciidoc b/docs/reference/setup/installation.asciidoc new file mode 100644 index 00000000000..e17db47bc56 --- /dev/null +++ b/docs/reference/setup/installation.asciidoc @@ -0,0 +1,39 @@ +[[setup-installation]] +== Installation + +After link:/download[downloading] the latest release and extracting it, +*elasticsearch* can be started using: + +[source,sh] +-------------------------------------------------- +$ bin/elasticsearch +-------------------------------------------------- + +Under *nix system, the command will start the process in the background. +To run it in the foreground, add the `-f` switch to it: + +[source,sh] +-------------------------------------------------- +$ bin/elasticsearch -f +-------------------------------------------------- + +ElasticSearch is built using Java, and requires at least +http://java.sun.com/javase/downloads/index.jsp[Java 6] in order to run. +The version of Java that will be used can be set by setting the +`JAVA_HOME` environment variable. + +.*NIX +************************************************************************* +There are added features when using the `elasticsearch` shell script. +The first, which was explained earlier, is the ability to easily run the +process either in the foreground or the background. + +Another feature is the ability to pass `-X` and `-D` directly to the +script. When set, both override anything set using either `JAVA_OPTS` or +`ES_JAVA_OPTS`. 
For example: + +[source,sh] +-------------------------------------------------- +$ bin/elasticsearch -f -Xmx2g -Xms2g -Des.index.storage.type=memory +-------------------------------------------------- +************************************************************************* diff --git a/docs/river/couchdb.asciidoc b/docs/river/couchdb.asciidoc new file mode 100644 index 00000000000..5a2555e3730 --- /dev/null +++ b/docs/river/couchdb.asciidoc @@ -0,0 +1,11 @@ +[[river-couchdb]] +== CouchDB River + +The CouchDB River allows to automatically index couchdb and make it +searchable using the excellent +http://guide.couchdb.org/draft/notifications.html[_changes] stream +couchdb provides. + +See +https://github.com/elasticsearch/elasticsearch-river-couchdb/blob/master/README.md[README +file] for details. diff --git a/docs/river/index.asciidoc b/docs/river/index.asciidoc new file mode 100644 index 00000000000..0c155ba9d85 --- /dev/null +++ b/docs/river/index.asciidoc @@ -0,0 +1,74 @@ +[[river]] += Rivers + +== Intro + +A river is a pluggable service running within elasticsearch cluster +pulling data (or being pushed with data) that is then indexed into the +cluster. + +A river is composed of a unique name and a type. The type is the type of +the river (out of the box, there is the `dummy` river that simply logs +that it is running). The name uniquely identifies the river within the +cluster. For example, one can run a river called `my_river` with type +`dummy`, and another river called `my_other_river` with type `dummy`. + +[float] +== How it Works + +A river instance (and its name) is a type within the `_river` index. All +different rivers implementations accept a document called `_meta` that +at the very least has the type of the river (twitter / couchdb / ...) +associated with it. Creating a river is a simple curl request to index +that `_meta` document (there is actually a `dummy` river used for +testing): + +[source,js] +-------------------------------------------------- +curl -XPUT 'localhost:9200/_river/my_river/_meta' -d '{ + "type" : "dummy" +}' +-------------------------------------------------- + +A river can also have more data associated with it in the form of more +documents indexed under the given index type (the river name). For +example, storing the last indexed state can be stored in a document that +holds it. + +Deleting a river is a call to delete the type (and all documents +associated with it): + +[source,js] +-------------------------------------------------- +curl -XDELETE 'localhost:9200/_river/my_river/' +-------------------------------------------------- + +[float] +== Cluster Allocation + +Rivers are singletons within the cluster. They get allocated +automatically to one of the nodes and run. If that node fails, an river +will be automatically allocated to another node. + +River allocation on nodes can be controlled on each node. The +`node.river` can be set to `_none_` disabling any river allocation to +it. The `node.river` can also include a comma separated list of either +river names or types controlling the rivers allowed to run on it. For +example: `my_river1,my_river2`, or `dummy,twitter`. + +[float] +== Status + +Each river (regardless of the implementation) exposes a high level +`_status` doc which includes the node the river is running on. Getting +the status is a simple curl GET request to +`/_river/{river name}/_status`. 
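+
+For example, a status request for the `my_river` river created above
+could look like this (the contents of the `_status` document vary per
+river implementation):
+
+[source,js]
+--------------------------------------------------
+curl -XGET 'localhost:9200/_river/my_river/_status'
+--------------------------------------------------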
+ +include::couchdb.asciidoc[] + +include::rabbitmq.asciidoc[] + +include::twitter.asciidoc[] + +include::wikipedia.asciidoc[] + diff --git a/docs/river/rabbitmq.asciidoc b/docs/river/rabbitmq.asciidoc new file mode 100644 index 00000000000..cdad9f83ea7 --- /dev/null +++ b/docs/river/rabbitmq.asciidoc @@ -0,0 +1,9 @@ +[[river-rabbitmq]] +== RabbitMQ River + +RabbitMQ River allows to automatically index a +http://www.rabbitmq.com/[RabbitMQ] queue. + +See +https://github.com/elasticsearch/elasticsearch-river-rabbitmq/blob/master/README.md[README +file] for details. diff --git a/docs/river/twitter.asciidoc b/docs/river/twitter.asciidoc new file mode 100644 index 00000000000..355c1877983 --- /dev/null +++ b/docs/river/twitter.asciidoc @@ -0,0 +1,10 @@ +[[river-twitter]] +== Twitter River + +The twitter river indexes the public +http://dev.twitter.com/pages/streaming_api[twitter stream], aka the +hose, and makes it searchable. + +See +https://github.com/elasticsearch/elasticsearch-river-twitter/blob/master/README.md[README +file] for details. diff --git a/docs/river/wikipedia.asciidoc b/docs/river/wikipedia.asciidoc new file mode 100644 index 00000000000..c65107a2d12 --- /dev/null +++ b/docs/river/wikipedia.asciidoc @@ -0,0 +1,8 @@ +[[river-wikipedia]] +== Wikipedia River + +A simple river to index http://en.wikipedia.org[Wikipedia]. + +See +https://github.com/elasticsearch/elasticsearch-river-wikipedia/blob/master/README.md[README +file] for details.