make term statistics accessible in scripts
term statistics can be accessed via the _shard variable.
Below is a minimal example. See documentation on details.
```
DELETE paytest
PUT paytest
{
"mappings": {
"test": {
"_all": {
"auto_boost": true,
"enabled": true
},
"properties": {
"text": {
"index_analyzer": "fulltext_analyzer",
"store": "yes",
"type": "string"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"fulltext_analyzer": {
"filter": [
"my_delimited_payload_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"my_delimited_payload_filter": {
"delimiter": "+",
"encoding": "float",
"type": "delimited_payload_filter"
}
}
},
"index": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}
POST paytest/test/1
{
"text": "the+1 quick+2 brown+3 fox+4 is quick+10"
}
POST paytest/test/2
{
"text": "the+1 quick+2 red+3 fox+4"
}
POST paytest/_refresh
POST paytest/_search
{
"script_fields": {
"ttf": {
"script": "_shard[\"text\"][\"quick\"].ttf()"
}
}
}
POST paytest/_search
{
"script_fields": {
"freq": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
}
POST paytest/test/2/_termvector
POST paytest/_search
{
"script_fields": {
"payloads": {
"script": "term = _shard[\"text\"].get(\"red\",_PAYLOADS);payloads = []; for(pos : term){payloads.add(pos.payloadAsFloat(-1));} return payloads;"
}
}
}
POST paytest/_search
{
"script_fields": {
"tv": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
},
"query": {
"function_score": {
"functions": [
{
"script_score": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
]
}
}
}
```
closes #3772
2014-01-02 05:17:33 -05:00
[[modules-advanced-scripting]]
== Text scoring in scripts
2014-01-02 07:54:40 -05:00
Text features, such as term or document frequency for a specific term can be accessed in scripts (see <<modules-scripting, scripting documentation>> ) with the `_index` variable. This can be useful if, for example, you want to implement your own scoring model using for example a script inside a <<query-dsl-function-score-query,function score query>>.
make term statistics accessible in scripts
term statistics can be accessed via the _shard variable.
Below is a minimal example. See documentation on details.
```
DELETE paytest
PUT paytest
{
"mappings": {
"test": {
"_all": {
"auto_boost": true,
"enabled": true
},
"properties": {
"text": {
"index_analyzer": "fulltext_analyzer",
"store": "yes",
"type": "string"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"fulltext_analyzer": {
"filter": [
"my_delimited_payload_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"my_delimited_payload_filter": {
"delimiter": "+",
"encoding": "float",
"type": "delimited_payload_filter"
}
}
},
"index": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}
POST paytest/test/1
{
"text": "the+1 quick+2 brown+3 fox+4 is quick+10"
}
POST paytest/test/2
{
"text": "the+1 quick+2 red+3 fox+4"
}
POST paytest/_refresh
POST paytest/_search
{
"script_fields": {
"ttf": {
"script": "_shard[\"text\"][\"quick\"].ttf()"
}
}
}
POST paytest/_search
{
"script_fields": {
"freq": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
}
POST paytest/test/2/_termvector
POST paytest/_search
{
"script_fields": {
"payloads": {
"script": "term = _shard[\"text\"].get(\"red\",_PAYLOADS);payloads = []; for(pos : term){payloads.add(pos.payloadAsFloat(-1));} return payloads;"
}
}
}
POST paytest/_search
{
"script_fields": {
"tv": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
},
"query": {
"function_score": {
"functions": [
{
"script_score": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
]
}
}
}
```
closes #3772
2014-01-02 05:17:33 -05:00
Statistics over the document collection are computed *per shard*, not per
index.
[float]
=== Nomenclature:
[horizontal]
`df`::
document frequency. The number of documents a term appears in. Computed
per field.
`tf`::
term frequency. The number times a term appears in a field in one specific
document.
`ttf`::
total term frequency. The number of times this term appears in all
documents, that is, the sum of `tf` over all documents. Computed per
field.
`df` and `ttf` are computed per shard and therefore these numbers can vary
depending on the shard the current document resides in.
[float]
=== Shard statistics:
2014-01-02 07:54:40 -05:00
`_index.numDocs()`::
make term statistics accessible in scripts
term statistics can be accessed via the _shard variable.
Below is a minimal example. See documentation on details.
```
DELETE paytest
PUT paytest
{
"mappings": {
"test": {
"_all": {
"auto_boost": true,
"enabled": true
},
"properties": {
"text": {
"index_analyzer": "fulltext_analyzer",
"store": "yes",
"type": "string"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"fulltext_analyzer": {
"filter": [
"my_delimited_payload_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"my_delimited_payload_filter": {
"delimiter": "+",
"encoding": "float",
"type": "delimited_payload_filter"
}
}
},
"index": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}
POST paytest/test/1
{
"text": "the+1 quick+2 brown+3 fox+4 is quick+10"
}
POST paytest/test/2
{
"text": "the+1 quick+2 red+3 fox+4"
}
POST paytest/_refresh
POST paytest/_search
{
"script_fields": {
"ttf": {
"script": "_shard[\"text\"][\"quick\"].ttf()"
}
}
}
POST paytest/_search
{
"script_fields": {
"freq": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
}
POST paytest/test/2/_termvector
POST paytest/_search
{
"script_fields": {
"payloads": {
"script": "term = _shard[\"text\"].get(\"red\",_PAYLOADS);payloads = []; for(pos : term){payloads.add(pos.payloadAsFloat(-1));} return payloads;"
}
}
}
POST paytest/_search
{
"script_fields": {
"tv": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
},
"query": {
"function_score": {
"functions": [
{
"script_score": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
]
}
}
}
```
closes #3772
2014-01-02 05:17:33 -05:00
Number of documents in shard.
2014-01-02 07:54:40 -05:00
`_index.maxDoc()`::
make term statistics accessible in scripts
term statistics can be accessed via the _shard variable.
Below is a minimal example. See documentation on details.
```
DELETE paytest
PUT paytest
{
"mappings": {
"test": {
"_all": {
"auto_boost": true,
"enabled": true
},
"properties": {
"text": {
"index_analyzer": "fulltext_analyzer",
"store": "yes",
"type": "string"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"fulltext_analyzer": {
"filter": [
"my_delimited_payload_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"my_delimited_payload_filter": {
"delimiter": "+",
"encoding": "float",
"type": "delimited_payload_filter"
}
}
},
"index": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}
POST paytest/test/1
{
"text": "the+1 quick+2 brown+3 fox+4 is quick+10"
}
POST paytest/test/2
{
"text": "the+1 quick+2 red+3 fox+4"
}
POST paytest/_refresh
POST paytest/_search
{
"script_fields": {
"ttf": {
"script": "_shard[\"text\"][\"quick\"].ttf()"
}
}
}
POST paytest/_search
{
"script_fields": {
"freq": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
}
POST paytest/test/2/_termvector
POST paytest/_search
{
"script_fields": {
"payloads": {
"script": "term = _shard[\"text\"].get(\"red\",_PAYLOADS);payloads = []; for(pos : term){payloads.add(pos.payloadAsFloat(-1));} return payloads;"
}
}
}
POST paytest/_search
{
"script_fields": {
"tv": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
},
"query": {
"function_score": {
"functions": [
{
"script_score": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
]
}
}
}
```
closes #3772
2014-01-02 05:17:33 -05:00
Maximal document number in shard.
2014-01-02 07:54:40 -05:00
`_index.numDeletedDocs()`::
make term statistics accessible in scripts
term statistics can be accessed via the _shard variable.
Below is a minimal example. See documentation on details.
```
DELETE paytest
PUT paytest
{
"mappings": {
"test": {
"_all": {
"auto_boost": true,
"enabled": true
},
"properties": {
"text": {
"index_analyzer": "fulltext_analyzer",
"store": "yes",
"type": "string"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"fulltext_analyzer": {
"filter": [
"my_delimited_payload_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"my_delimited_payload_filter": {
"delimiter": "+",
"encoding": "float",
"type": "delimited_payload_filter"
}
}
},
"index": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}
POST paytest/test/1
{
"text": "the+1 quick+2 brown+3 fox+4 is quick+10"
}
POST paytest/test/2
{
"text": "the+1 quick+2 red+3 fox+4"
}
POST paytest/_refresh
POST paytest/_search
{
"script_fields": {
"ttf": {
"script": "_shard[\"text\"][\"quick\"].ttf()"
}
}
}
POST paytest/_search
{
"script_fields": {
"freq": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
}
POST paytest/test/2/_termvector
POST paytest/_search
{
"script_fields": {
"payloads": {
"script": "term = _shard[\"text\"].get(\"red\",_PAYLOADS);payloads = []; for(pos : term){payloads.add(pos.payloadAsFloat(-1));} return payloads;"
}
}
}
POST paytest/_search
{
"script_fields": {
"tv": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
},
"query": {
"function_score": {
"functions": [
{
"script_score": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
]
}
}
}
```
closes #3772
2014-01-02 05:17:33 -05:00
Number of deleted documents in shard.
[float]
=== Field statistics:
Field statistics can be accessed with a subscript operator like this:
2014-01-02 07:54:40 -05:00
`_index['FIELD']`.
make term statistics accessible in scripts
term statistics can be accessed via the _shard variable.
Below is a minimal example. See documentation on details.
```
DELETE paytest
PUT paytest
{
"mappings": {
"test": {
"_all": {
"auto_boost": true,
"enabled": true
},
"properties": {
"text": {
"index_analyzer": "fulltext_analyzer",
"store": "yes",
"type": "string"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"fulltext_analyzer": {
"filter": [
"my_delimited_payload_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"my_delimited_payload_filter": {
"delimiter": "+",
"encoding": "float",
"type": "delimited_payload_filter"
}
}
},
"index": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}
POST paytest/test/1
{
"text": "the+1 quick+2 brown+3 fox+4 is quick+10"
}
POST paytest/test/2
{
"text": "the+1 quick+2 red+3 fox+4"
}
POST paytest/_refresh
POST paytest/_search
{
"script_fields": {
"ttf": {
"script": "_shard[\"text\"][\"quick\"].ttf()"
}
}
}
POST paytest/_search
{
"script_fields": {
"freq": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
}
POST paytest/test/2/_termvector
POST paytest/_search
{
"script_fields": {
"payloads": {
"script": "term = _shard[\"text\"].get(\"red\",_PAYLOADS);payloads = []; for(pos : term){payloads.add(pos.payloadAsFloat(-1));} return payloads;"
}
}
}
POST paytest/_search
{
"script_fields": {
"tv": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
},
"query": {
"function_score": {
"functions": [
{
"script_score": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
]
}
}
}
```
closes #3772
2014-01-02 05:17:33 -05:00
2014-01-02 07:54:40 -05:00
`_index['FIELD'].docCount()`::
make term statistics accessible in scripts
term statistics can be accessed via the _shard variable.
Below is a minimal example. See documentation on details.
```
DELETE paytest
PUT paytest
{
"mappings": {
"test": {
"_all": {
"auto_boost": true,
"enabled": true
},
"properties": {
"text": {
"index_analyzer": "fulltext_analyzer",
"store": "yes",
"type": "string"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"fulltext_analyzer": {
"filter": [
"my_delimited_payload_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"my_delimited_payload_filter": {
"delimiter": "+",
"encoding": "float",
"type": "delimited_payload_filter"
}
}
},
"index": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}
POST paytest/test/1
{
"text": "the+1 quick+2 brown+3 fox+4 is quick+10"
}
POST paytest/test/2
{
"text": "the+1 quick+2 red+3 fox+4"
}
POST paytest/_refresh
POST paytest/_search
{
"script_fields": {
"ttf": {
"script": "_shard[\"text\"][\"quick\"].ttf()"
}
}
}
POST paytest/_search
{
"script_fields": {
"freq": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
}
POST paytest/test/2/_termvector
POST paytest/_search
{
"script_fields": {
"payloads": {
"script": "term = _shard[\"text\"].get(\"red\",_PAYLOADS);payloads = []; for(pos : term){payloads.add(pos.payloadAsFloat(-1));} return payloads;"
}
}
}
POST paytest/_search
{
"script_fields": {
"tv": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
},
"query": {
"function_score": {
"functions": [
{
"script_score": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
]
}
}
}
```
closes #3772
2014-01-02 05:17:33 -05:00
Number of documents containing the field `FIELD`. Does not take deleted documents into account.
2014-01-02 07:54:40 -05:00
`_index['FIELD'].sumttf()`::
make term statistics accessible in scripts
term statistics can be accessed via the _shard variable.
Below is a minimal example. See documentation on details.
```
DELETE paytest
PUT paytest
{
"mappings": {
"test": {
"_all": {
"auto_boost": true,
"enabled": true
},
"properties": {
"text": {
"index_analyzer": "fulltext_analyzer",
"store": "yes",
"type": "string"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"fulltext_analyzer": {
"filter": [
"my_delimited_payload_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"my_delimited_payload_filter": {
"delimiter": "+",
"encoding": "float",
"type": "delimited_payload_filter"
}
}
},
"index": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}
POST paytest/test/1
{
"text": "the+1 quick+2 brown+3 fox+4 is quick+10"
}
POST paytest/test/2
{
"text": "the+1 quick+2 red+3 fox+4"
}
POST paytest/_refresh
POST paytest/_search
{
"script_fields": {
"ttf": {
"script": "_shard[\"text\"][\"quick\"].ttf()"
}
}
}
POST paytest/_search
{
"script_fields": {
"freq": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
}
POST paytest/test/2/_termvector
POST paytest/_search
{
"script_fields": {
"payloads": {
"script": "term = _shard[\"text\"].get(\"red\",_PAYLOADS);payloads = []; for(pos : term){payloads.add(pos.payloadAsFloat(-1));} return payloads;"
}
}
}
POST paytest/_search
{
"script_fields": {
"tv": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
},
"query": {
"function_score": {
"functions": [
{
"script_score": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
]
}
}
}
```
closes #3772
2014-01-02 05:17:33 -05:00
Sum of `ttf` over all terms that appear in field `FIELD` in all documents.
2014-01-02 07:54:40 -05:00
`_index['FIELD'].sumdf()`::
make term statistics accessible in scripts
term statistics can be accessed via the _shard variable.
Below is a minimal example. See documentation on details.
```
DELETE paytest
PUT paytest
{
"mappings": {
"test": {
"_all": {
"auto_boost": true,
"enabled": true
},
"properties": {
"text": {
"index_analyzer": "fulltext_analyzer",
"store": "yes",
"type": "string"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"fulltext_analyzer": {
"filter": [
"my_delimited_payload_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"my_delimited_payload_filter": {
"delimiter": "+",
"encoding": "float",
"type": "delimited_payload_filter"
}
}
},
"index": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}
POST paytest/test/1
{
"text": "the+1 quick+2 brown+3 fox+4 is quick+10"
}
POST paytest/test/2
{
"text": "the+1 quick+2 red+3 fox+4"
}
POST paytest/_refresh
POST paytest/_search
{
"script_fields": {
"ttf": {
"script": "_shard[\"text\"][\"quick\"].ttf()"
}
}
}
POST paytest/_search
{
"script_fields": {
"freq": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
}
POST paytest/test/2/_termvector
POST paytest/_search
{
"script_fields": {
"payloads": {
"script": "term = _shard[\"text\"].get(\"red\",_PAYLOADS);payloads = []; for(pos : term){payloads.add(pos.payloadAsFloat(-1));} return payloads;"
}
}
}
POST paytest/_search
{
"script_fields": {
"tv": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
},
"query": {
"function_score": {
"functions": [
{
"script_score": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
]
}
}
}
```
closes #3772
2014-01-02 05:17:33 -05:00
The sum of `df` s over all terms that appear in field `FIELD` in all
documents.
Field statistics are computed per shard and therfore these numbers can vary
depending on the shard the current document resides in.
2014-01-02 07:54:40 -05:00
The number of terms in a field cannot be accessed using the `_index` variable. See <<mapping-core-types, word count mapping type>> on how to do that.
make term statistics accessible in scripts
term statistics can be accessed via the _shard variable.
Below is a minimal example. See documentation on details.
```
DELETE paytest
PUT paytest
{
"mappings": {
"test": {
"_all": {
"auto_boost": true,
"enabled": true
},
"properties": {
"text": {
"index_analyzer": "fulltext_analyzer",
"store": "yes",
"type": "string"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"fulltext_analyzer": {
"filter": [
"my_delimited_payload_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"my_delimited_payload_filter": {
"delimiter": "+",
"encoding": "float",
"type": "delimited_payload_filter"
}
}
},
"index": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}
POST paytest/test/1
{
"text": "the+1 quick+2 brown+3 fox+4 is quick+10"
}
POST paytest/test/2
{
"text": "the+1 quick+2 red+3 fox+4"
}
POST paytest/_refresh
POST paytest/_search
{
"script_fields": {
"ttf": {
"script": "_shard[\"text\"][\"quick\"].ttf()"
}
}
}
POST paytest/_search
{
"script_fields": {
"freq": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
}
POST paytest/test/2/_termvector
POST paytest/_search
{
"script_fields": {
"payloads": {
"script": "term = _shard[\"text\"].get(\"red\",_PAYLOADS);payloads = []; for(pos : term){payloads.add(pos.payloadAsFloat(-1));} return payloads;"
}
}
}
POST paytest/_search
{
"script_fields": {
"tv": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
},
"query": {
"function_score": {
"functions": [
{
"script_score": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
]
}
}
}
```
closes #3772
2014-01-02 05:17:33 -05:00
[float]
=== Term statistics:
Term statistics for a field can be accessed with a subscript operator like
2014-01-02 07:54:40 -05:00
this: `_index['FIELD']['TERM']`. This will never return null, even if term or field does not exist.
If you do not need the term frequency, call `_index['FIELD'].get('TERM', 0)`
make term statistics accessible in scripts
term statistics can be accessed via the _shard variable.
Below is a minimal example. See documentation on details.
```
DELETE paytest
PUT paytest
{
"mappings": {
"test": {
"_all": {
"auto_boost": true,
"enabled": true
},
"properties": {
"text": {
"index_analyzer": "fulltext_analyzer",
"store": "yes",
"type": "string"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"fulltext_analyzer": {
"filter": [
"my_delimited_payload_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"my_delimited_payload_filter": {
"delimiter": "+",
"encoding": "float",
"type": "delimited_payload_filter"
}
}
},
"index": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}
POST paytest/test/1
{
"text": "the+1 quick+2 brown+3 fox+4 is quick+10"
}
POST paytest/test/2
{
"text": "the+1 quick+2 red+3 fox+4"
}
POST paytest/_refresh
POST paytest/_search
{
"script_fields": {
"ttf": {
"script": "_shard[\"text\"][\"quick\"].ttf()"
}
}
}
POST paytest/_search
{
"script_fields": {
"freq": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
}
POST paytest/test/2/_termvector
POST paytest/_search
{
"script_fields": {
"payloads": {
"script": "term = _shard[\"text\"].get(\"red\",_PAYLOADS);payloads = []; for(pos : term){payloads.add(pos.payloadAsFloat(-1));} return payloads;"
}
}
}
POST paytest/_search
{
"script_fields": {
"tv": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
},
"query": {
"function_score": {
"functions": [
{
"script_score": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
]
}
}
}
```
closes #3772
2014-01-02 05:17:33 -05:00
to avoid uneccesary initialization of the frequencies. The flag will have only
affect is your set the `index_options` to `docs` (see <<mapping-core-types, mapping documentation>>).
2014-01-02 07:54:40 -05:00
`_index['FIELD']['TERM'].df()`::
make term statistics accessible in scripts
term statistics can be accessed via the _shard variable.
Below is a minimal example. See documentation on details.
```
DELETE paytest
PUT paytest
{
"mappings": {
"test": {
"_all": {
"auto_boost": true,
"enabled": true
},
"properties": {
"text": {
"index_analyzer": "fulltext_analyzer",
"store": "yes",
"type": "string"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"fulltext_analyzer": {
"filter": [
"my_delimited_payload_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"my_delimited_payload_filter": {
"delimiter": "+",
"encoding": "float",
"type": "delimited_payload_filter"
}
}
},
"index": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}
POST paytest/test/1
{
"text": "the+1 quick+2 brown+3 fox+4 is quick+10"
}
POST paytest/test/2
{
"text": "the+1 quick+2 red+3 fox+4"
}
POST paytest/_refresh
POST paytest/_search
{
"script_fields": {
"ttf": {
"script": "_shard[\"text\"][\"quick\"].ttf()"
}
}
}
POST paytest/_search
{
"script_fields": {
"freq": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
}
POST paytest/test/2/_termvector
POST paytest/_search
{
"script_fields": {
"payloads": {
"script": "term = _shard[\"text\"].get(\"red\",_PAYLOADS);payloads = []; for(pos : term){payloads.add(pos.payloadAsFloat(-1));} return payloads;"
}
}
}
POST paytest/_search
{
"script_fields": {
"tv": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
},
"query": {
"function_score": {
"functions": [
{
"script_score": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
]
}
}
}
```
closes #3772
2014-01-02 05:17:33 -05:00
`df` of term `TERM` in field `FIELD`. Will be returned, even if the term
is not present in the current document.
2014-01-02 07:54:40 -05:00
`_index['FIELD']['TERM'].ttf()`::
make term statistics accessible in scripts
term statistics can be accessed via the _shard variable.
Below is a minimal example. See documentation on details.
```
DELETE paytest
PUT paytest
{
"mappings": {
"test": {
"_all": {
"auto_boost": true,
"enabled": true
},
"properties": {
"text": {
"index_analyzer": "fulltext_analyzer",
"store": "yes",
"type": "string"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"fulltext_analyzer": {
"filter": [
"my_delimited_payload_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"my_delimited_payload_filter": {
"delimiter": "+",
"encoding": "float",
"type": "delimited_payload_filter"
}
}
},
"index": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}
POST paytest/test/1
{
"text": "the+1 quick+2 brown+3 fox+4 is quick+10"
}
POST paytest/test/2
{
"text": "the+1 quick+2 red+3 fox+4"
}
POST paytest/_refresh
POST paytest/_search
{
"script_fields": {
"ttf": {
"script": "_shard[\"text\"][\"quick\"].ttf()"
}
}
}
POST paytest/_search
{
"script_fields": {
"freq": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
}
POST paytest/test/2/_termvector
POST paytest/_search
{
"script_fields": {
"payloads": {
"script": "term = _shard[\"text\"].get(\"red\",_PAYLOADS);payloads = []; for(pos : term){payloads.add(pos.payloadAsFloat(-1));} return payloads;"
}
}
}
POST paytest/_search
{
"script_fields": {
"tv": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
},
"query": {
"function_score": {
"functions": [
{
"script_score": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
]
}
}
}
```
closes #3772
2014-01-02 05:17:33 -05:00
The sum of term frequencys of term `TERM` in field `FIELD` over all
documents. Will be returned, even if the term is not present in the
current document.
2014-01-02 07:54:40 -05:00
`_index['FIELD']['TERM'].tf()`::
make term statistics accessible in scripts
term statistics can be accessed via the _shard variable.
Below is a minimal example. See documentation on details.
```
DELETE paytest
PUT paytest
{
"mappings": {
"test": {
"_all": {
"auto_boost": true,
"enabled": true
},
"properties": {
"text": {
"index_analyzer": "fulltext_analyzer",
"store": "yes",
"type": "string"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"fulltext_analyzer": {
"filter": [
"my_delimited_payload_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"my_delimited_payload_filter": {
"delimiter": "+",
"encoding": "float",
"type": "delimited_payload_filter"
}
}
},
"index": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}
POST paytest/test/1
{
"text": "the+1 quick+2 brown+3 fox+4 is quick+10"
}
POST paytest/test/2
{
"text": "the+1 quick+2 red+3 fox+4"
}
POST paytest/_refresh
POST paytest/_search
{
"script_fields": {
"ttf": {
"script": "_shard[\"text\"][\"quick\"].ttf()"
}
}
}
POST paytest/_search
{
"script_fields": {
"freq": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
}
POST paytest/test/2/_termvector
POST paytest/_search
{
"script_fields": {
"payloads": {
"script": "term = _shard[\"text\"].get(\"red\",_PAYLOADS);payloads = []; for(pos : term){payloads.add(pos.payloadAsFloat(-1));} return payloads;"
}
}
}
POST paytest/_search
{
"script_fields": {
"tv": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
},
"query": {
"function_score": {
"functions": [
{
"script_score": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
]
}
}
}
```
closes #3772
2014-01-02 05:17:33 -05:00
`tf` of term `TERM` in field `FIELD`. Will be 0 if the term is not present
in the current document.
[float]
=== Term positions, offsets and payloads:
If you need information on the positions of terms in a field, call
2014-01-02 07:54:40 -05:00
`_index['FIELD'].get('TERM', flag)` where flag can be
make term statistics accessible in scripts
term statistics can be accessed via the _shard variable.
Below is a minimal example. See documentation on details.
```
DELETE paytest
PUT paytest
{
"mappings": {
"test": {
"_all": {
"auto_boost": true,
"enabled": true
},
"properties": {
"text": {
"index_analyzer": "fulltext_analyzer",
"store": "yes",
"type": "string"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"fulltext_analyzer": {
"filter": [
"my_delimited_payload_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"my_delimited_payload_filter": {
"delimiter": "+",
"encoding": "float",
"type": "delimited_payload_filter"
}
}
},
"index": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}
POST paytest/test/1
{
"text": "the+1 quick+2 brown+3 fox+4 is quick+10"
}
POST paytest/test/2
{
"text": "the+1 quick+2 red+3 fox+4"
}
POST paytest/_refresh
POST paytest/_search
{
"script_fields": {
"ttf": {
"script": "_shard[\"text\"][\"quick\"].ttf()"
}
}
}
POST paytest/_search
{
"script_fields": {
"freq": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
}
POST paytest/test/2/_termvector
POST paytest/_search
{
"script_fields": {
"payloads": {
"script": "term = _shard[\"text\"].get(\"red\",_PAYLOADS);payloads = []; for(pos : term){payloads.add(pos.payloadAsFloat(-1));} return payloads;"
}
}
}
POST paytest/_search
{
"script_fields": {
"tv": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
},
"query": {
"function_score": {
"functions": [
{
"script_score": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
]
}
}
}
```
closes #3772
2014-01-02 05:17:33 -05:00
[horizontal]
`_POSITIONS`:: if you need the positions of the term
`_OFFSETS`:: if you need the offests of the term
`_PAYLOADS`:: if you need the payloads of the term
`_CACHE`:: if you need to iterate over all positions several times
The iterator uses the underlying lucene classes to iterate over positions. For efficiency reasons, you can only iterate over positions once. If you need to iterate over the positions several times, set the `_CACHE` flag.
You can combine the operators with a `|` if you need more than one info. For
example, the following will return an object holding the positions and payloads,
as well as all statistics:
2014-01-02 07:54:40 -05:00
`_index['FIELD'].get('TERM', _POSITIONS | _PAYLOADS)`
make term statistics accessible in scripts
term statistics can be accessed via the _shard variable.
Below is a minimal example. See documentation on details.
```
DELETE paytest
PUT paytest
{
"mappings": {
"test": {
"_all": {
"auto_boost": true,
"enabled": true
},
"properties": {
"text": {
"index_analyzer": "fulltext_analyzer",
"store": "yes",
"type": "string"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"fulltext_analyzer": {
"filter": [
"my_delimited_payload_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"my_delimited_payload_filter": {
"delimiter": "+",
"encoding": "float",
"type": "delimited_payload_filter"
}
}
},
"index": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}
POST paytest/test/1
{
"text": "the+1 quick+2 brown+3 fox+4 is quick+10"
}
POST paytest/test/2
{
"text": "the+1 quick+2 red+3 fox+4"
}
POST paytest/_refresh
POST paytest/_search
{
"script_fields": {
"ttf": {
"script": "_shard[\"text\"][\"quick\"].ttf()"
}
}
}
POST paytest/_search
{
"script_fields": {
"freq": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
}
POST paytest/test/2/_termvector
POST paytest/_search
{
"script_fields": {
"payloads": {
"script": "term = _shard[\"text\"].get(\"red\",_PAYLOADS);payloads = []; for(pos : term){payloads.add(pos.payloadAsFloat(-1));} return payloads;"
}
}
}
POST paytest/_search
{
"script_fields": {
"tv": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
},
"query": {
"function_score": {
"functions": [
{
"script_score": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
]
}
}
}
```
closes #3772
2014-01-02 05:17:33 -05:00
Positions can be accessed with an iterator that returns an object
(`POS_OBJECT`) holding position, offsets and payload for each term position.
`POS_OBJECT.position`::
The position of the term.
`POS_OBJECT.startOffset`::
The start offset of the term.
`POS_OBJECT.endOffset`::
The end offset of the term.
`POS_OBJECT.payload`::
The payload of the term.
`POS_OBJECT.payloadAsInt(missingValue)`::
The payload of the term converted to integer. If the current position has
no payload, the `missingValue` will be returned. Call this only if you
know that your payloads are integers.
`POS_OBJECT.payloadAsFloat(missingValue)`::
The payload of the term converted to float. If the current position has no
payload, the `missingValue` will be returned. Call this only if you know
that your payloads are floats.
`POS_OBJECT.payloadAsString()`::
The payload of the term converted to string. If the current position has
no payload, `null` will be returned. Call this only if you know that your
payloads are strings.
Example: sums up all payloads for the term `foo`.
[source,mvel]
---------------------------------------------------------
2014-01-02 07:54:40 -05:00
termInfo = _index['my_field'].get('foo',_PAYLOADS);
make term statistics accessible in scripts
term statistics can be accessed via the _shard variable.
Below is a minimal example. See documentation on details.
```
DELETE paytest
PUT paytest
{
"mappings": {
"test": {
"_all": {
"auto_boost": true,
"enabled": true
},
"properties": {
"text": {
"index_analyzer": "fulltext_analyzer",
"store": "yes",
"type": "string"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"fulltext_analyzer": {
"filter": [
"my_delimited_payload_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"my_delimited_payload_filter": {
"delimiter": "+",
"encoding": "float",
"type": "delimited_payload_filter"
}
}
},
"index": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}
POST paytest/test/1
{
"text": "the+1 quick+2 brown+3 fox+4 is quick+10"
}
POST paytest/test/2
{
"text": "the+1 quick+2 red+3 fox+4"
}
POST paytest/_refresh
POST paytest/_search
{
"script_fields": {
"ttf": {
"script": "_shard[\"text\"][\"quick\"].ttf()"
}
}
}
POST paytest/_search
{
"script_fields": {
"freq": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
}
POST paytest/test/2/_termvector
POST paytest/_search
{
"script_fields": {
"payloads": {
"script": "term = _shard[\"text\"].get(\"red\",_PAYLOADS);payloads = []; for(pos : term){payloads.add(pos.payloadAsFloat(-1));} return payloads;"
}
}
}
POST paytest/_search
{
"script_fields": {
"tv": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
},
"query": {
"function_score": {
"functions": [
{
"script_score": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
]
}
}
}
```
closes #3772
2014-01-02 05:17:33 -05:00
score = 0;
for (pos : termInfo) {
score = score + pos.payloadAsInt(0);
}
return score;
---------------------------------------------------------
[float]
=== Term vectors:
2014-01-02 07:54:40 -05:00
The `_index` variable can only be used to gather statistics for single terms. If you want to use information on all terms in a field, you must store the term vectors (set `term_vector` in the mapping as described in the <<mapping-core-types,mapping documentation>>). To access them, call
`_index.getTermVectors()` to get a
make term statistics accessible in scripts
term statistics can be accessed via the _shard variable.
Below is a minimal example. See documentation on details.
```
DELETE paytest
PUT paytest
{
"mappings": {
"test": {
"_all": {
"auto_boost": true,
"enabled": true
},
"properties": {
"text": {
"index_analyzer": "fulltext_analyzer",
"store": "yes",
"type": "string"
}
}
}
},
"settings": {
"analysis": {
"analyzer": {
"fulltext_analyzer": {
"filter": [
"my_delimited_payload_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"my_delimited_payload_filter": {
"delimiter": "+",
"encoding": "float",
"type": "delimited_payload_filter"
}
}
},
"index": {
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}
POST paytest/test/1
{
"text": "the+1 quick+2 brown+3 fox+4 is quick+10"
}
POST paytest/test/2
{
"text": "the+1 quick+2 red+3 fox+4"
}
POST paytest/_refresh
POST paytest/_search
{
"script_fields": {
"ttf": {
"script": "_shard[\"text\"][\"quick\"].ttf()"
}
}
}
POST paytest/_search
{
"script_fields": {
"freq": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
}
POST paytest/test/2/_termvector
POST paytest/_search
{
"script_fields": {
"payloads": {
"script": "term = _shard[\"text\"].get(\"red\",_PAYLOADS);payloads = []; for(pos : term){payloads.add(pos.payloadAsFloat(-1));} return payloads;"
}
}
}
POST paytest/_search
{
"script_fields": {
"tv": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
},
"query": {
"function_score": {
"functions": [
{
"script_score": {
"script": "_shard[\"text\"][\"quick\"].freq()"
}
}
]
}
}
}
```
closes #3772
2014-01-02 05:17:33 -05:00
https://lucene.apache.org/core/4_0_0/core/org/apache/lucene/index/Fields.html[Fields]
instance. This object can then be used as described in https://lucene.apache.org/core/4_0_0/core/org/apache/lucene/index/Fields.html[lucene doc] to iterate over fields and then for each field iterate over each term in the field.
The method will return null if the term vectors were not stored.