From 53ad7330fcbb7920fe5b1031aac1cfc81f750106 Mon Sep 17 00:00:00 2001 From: Clinton Gormley Date: Wed, 4 Sep 2013 17:42:58 +0200 Subject: [PATCH] [DOCS] Added docs for term vectors --- docs/reference/search.asciidoc | 2 + docs/reference/search/termvectors.asciidoc | 218 +++++++++++++++++++++ 2 files changed, 220 insertions(+) create mode 100644 docs/reference/search/termvectors.asciidoc diff --git a/docs/reference/search.asciidoc b/docs/reference/search.asciidoc index f0061fcaa34..57502e11aa9 100644 --- a/docs/reference/search.asciidoc +++ b/docs/reference/search.asciidoc @@ -102,3 +102,5 @@ include::search/explain.asciidoc[] include::search/percolate.asciidoc[] include::search/more-like-this.asciidoc[] + +include::search/termvectors.asciidoc[] diff --git a/docs/reference/search/termvectors.asciidoc b/docs/reference/search/termvectors.asciidoc new file mode 100644 index 00000000000..1022c980737 --- /dev/null +++ b/docs/reference/search/termvectors.asciidoc @@ -0,0 +1,218 @@ +[[search-termvectors]] +== Term Vectors + +added[1.00.Beta] + +Returns information and statistics on terms in the fields of a +particular document as stored in the index. + +[source,js] +-------------------------------------------------- +curl -XGET 'http://localhost:9200/twitter/tweet/1/_termvector?pretty=true' +-------------------------------------------------- + +Optionally, you can specify the fields for which the information is +retrieved either with a parameter in the URL + +[source,js] +-------------------------------------------------- +curl -XGET 'http://localhost:9200/twitter/tweet/1/_termvector?fields=text,...' +-------------------------------------------------- + +or by adding the requested fields in the request body (see +example below). + +[float] +=== Return values + +Three types of values can be requested: _term information_, _term statistics_ +and _field statistics_. 
By default, all term information and field +statistics are returned for all fields but no term statistics. + +[float] +==== Term information + + * term frequency in the field (always returned) + * term positions (`positions` : true) + * start and end offsets (`offsets` : true) + * term payloads (`payloads` : true), as base64 encoded bytes + +If the requested information wasn't stored in the index, it will be +omitted without further warning. See <<mapping-types,type mapping>> +for how to configure your index to store term vectors. + +[float] +==== Term statistics + +Setting `term_statistics` to `true` (default is `false`) will +return + + * total term frequency (how often a term occurs in all documents) + + * document frequency (the number of documents containing the current + term) + +By default these values are not returned since term statistics can +have a serious performance impact. + +[float] +==== Field statistics + +Setting `field_statistics` to `false` (default is `true`) will +omit: + + * document count (how many documents contain this field) + * sum of document frequencies (the sum of document frequencies for all + terms in this field) + * sum of total term frequencies (the sum of total term frequencies of + each term in this field) + +[float] +=== Behaviour + +The term and field statistics are not accurate. Deleted documents +are not taken into account. The information is only retrieved for the +shard the requested document resides in. The term and field statistics +are therefore only useful as relative measures whereas the absolute +numbers have no meaning in this context. + +[float] +=== Example + +First, we create an index that stores term vectors, payloads etc. 
: + +[source,js] +-------------------------------------------------- +curl -s -XPUT 'http://localhost:9200/twitter/' -d '{ + "mappings": { + "tweet": { + "properties": { + "text": { + "type": "string", + "term_vector": "with_positions_offsets_payloads", + "store" : "yes", + "index_analyzer" : "fulltext_analyzer" + }, + "fullname": { + "type": "string", + "term_vector": "with_positions_offsets_payloads", + "index_analyzer" : "fulltext_analyzer" + } + } + } + }, + "settings" : { + "index" : { + "number_of_shards" : 1, + "number_of_replicas" : 0 + }, + "analysis": { + "analyzer": { + "fulltext_analyzer": { + "type": "custom", + "tokenizer": "whitespace", + "filter": [ + "lowercase", + "type_as_payload" + ] + } + } + } + } +}' +-------------------------------------------------- + +Second, we add some documents: + +[source,js] +-------------------------------------------------- +curl -XPUT 'http://localhost:9200/twitter/tweet/1?pretty=true' -d '{ + "fullname" : "John Doe", + "text" : "twitter test test test " +}' + +curl -XPUT 'http://localhost:9200/twitter/tweet/2?pretty=true' -d '{ + "fullname" : "Jane Doe", + "text" : "Another twitter test ..." 
+}' +-------------------------------------------------- + +The following request returns all information and statistics for field +`text` in document `1` (John Doe): + +[source,js] +-------------------------------------------------- + +curl -XGET 'http://localhost:9200/twitter/tweet/1/_termvector?pretty=true' -d '{ + "fields" : ["text"], + "offsets" : true, + "payloads" : true, + "positions" : true, + "term_statistics" : true, + "field_statistics" : true +}' +-------------------------------------------------- + +Response: + +[source,js] +-------------------------------------------------- + +{ + "_id": "1", + "_index": "twitter", + "_type": "tweet", + "_version": 1, + "exists": true, + "term_vectors": { + "text": { + "field_statistics": { + "doc_count": 2, + "sum_doc_freq": 6, + "sum_ttf": 8 + }, + "terms": { + "test": { + "doc_freq": 2, + "term_freq": 3, + "tokens": [ + { + "end_offset": 12, + "payload": "d29yZA==", + "position": 1, + "start_offset": 8 + }, + { + "end_offset": 17, + "payload": "d29yZA==", + "position": 2, + "start_offset": 13 + }, + { + "end_offset": 22, + "payload": "d29yZA==", + "position": 3, + "start_offset": 18 + } + ], + "ttf": 4 + }, + "twitter": { + "doc_freq": 2, + "term_freq": 1, + "tokens": [ + { + "end_offset": 7, + "payload": "d29yZA==", + "position": 0, + "start_offset": 0 + } + ], + "ttf": 2 + } + } + } + } +} +-------------------------------------------------- + +