CONSOLEify analysis docs
Converts the analysis docs that were marked as `json` into `CONSOLE` format. A few of them were in YAML but marked as `json` for historical reasons. I added more complete examples for a few of the less obvious sounding ones. Relates to #18160
commit ad69503dce (parent 01b807f98e)
docs/build.gradle
@@ -53,18 +53,6 @@ buildRestTests.expectedUnconvertedCandidates = [
   'reference/aggregations/pipeline/serial-diff-aggregation.asciidoc',
   'reference/aggregations/pipeline/stats-bucket-aggregation.asciidoc',
   'reference/aggregations/pipeline/sum-bucket-aggregation.asciidoc',
-  'reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/elision-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc',
   'reference/cat/snapshots.asciidoc',
   'reference/cat/templates.asciidoc',
   'reference/cat/thread_pool.asciidoc',
@@ -124,10 +112,14 @@ integTestCluster {
   configFile 'scripts/my_map_script.painless'
   configFile 'scripts/my_combine_script.painless'
   configFile 'scripts/my_reduce_script.painless'
+  configFile 'analysis/example_word_list.txt'
+  configFile 'analysis/hyphenation_patterns.xml'
   configFile 'analysis/synonym.txt'
   configFile 'analysis/stemmer_override.txt'
   configFile 'userdict_ja.txt'
   configFile 'KeywordTokenizer.rbbi'
+  extraConfigFile 'hunspell/en_US/en_US.aff', '../core/src/test/resources/indices/analyze/conf_dir/hunspell/en_US/en_US.aff'
+  extraConfigFile 'hunspell/en_US/en_US.dic', '../core/src/test/resources/indices/analyze/conf_dir/hunspell/en_US/en_US.dic'
   // Whitelist reindexing from the local node so we can test it.
   setting 'reindex.remote.whitelist', '127.0.0.1:*'
 }
docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc
@@ -8,17 +8,21 @@ equivalents, if one exists. Example:
 
 [source,js]
 --------------------------------------------------
-"index" : {
-    "analysis" : {
-        "analyzer" : {
-            "default" : {
-                "tokenizer" : "standard",
-                "filter" : ["standard", "asciifolding"]
+PUT /asciifold_example
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "default" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["standard", "asciifolding"]
+                }
             }
         }
     }
 }
 --------------------------------------------------
+// CONSOLE
 
 Accepts `preserve_original` setting which defaults to false but if true
 will keep the original token as well as emit the folded token. For
@@ -26,20 +30,24 @@ example:
 
 [source,js]
 --------------------------------------------------
-"index" : {
-    "analysis" : {
-        "analyzer" : {
-            "default" : {
-                "tokenizer" : "standard",
-                "filter" : ["standard", "my_ascii_folding"]
-            }
-        },
-        "filter" : {
-            "my_ascii_folding" : {
-                "type" : "asciifolding",
-                "preserve_original" : true
+PUT /asciifold_example
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "default" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["standard", "my_ascii_folding"]
+                }
+            },
+            "filter" : {
+                "my_ascii_folding" : {
+                    "type" : "asciifolding",
+                    "preserve_original" : true
+                }
             }
         }
     }
 }
 --------------------------------------------------
+// CONSOLE
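A quick way to see `preserve_original` at work (a sketch, assuming the `asciifold_example` index above with `my_ascii_folding` wired into its `default` analyzer) is to run some accented text through `_analyze`:

[source,js]
--------------------------------------------------
POST /asciifold_example/_analyze
{
  "analyzer" : "default",
  "text" : "açaí"
}
--------------------------------------------------

With `preserve_original` set to `true`, both the folded token `acai` and the original `açaí` should come back at the same position.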
docs/reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc
@@ -16,8 +16,9 @@ Bigrams are generated for characters in `han`, `hiragana`, `katakana` and
 
 [source,js]
 --------------------------------------------------
+PUT /cjk_bigram_example
 {
-    "index" : {
+    "settings" : {
         "analysis" : {
             "analyzer" : {
                 "han_bigrams" : {
@@ -40,3 +41,4 @@ Bigrams are generated for characters in `han`, `hiragana`, `katakana` and
     }
 }
 --------------------------------------------------
+// CONSOLE
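To check the bigram output, a sketch using the `han_bigrams` analyzer defined in the hunk above:

[source,js]
--------------------------------------------------
POST /cjk_bigram_example/_analyze
{
  "analyzer" : "han_bigrams",
  "text" : "東京都"
}
--------------------------------------------------

For han characters the filter should emit overlapping bigrams, here `東京` and `京都`.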
docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc
@@ -41,21 +41,33 @@ Here is an example:
 
 [source,js]
 --------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            index_grams :
-                tokenizer : whitespace
-                filter : [common_grams]
-            search_grams :
-                tokenizer : whitespace
-                filter : [common_grams_query]
-        filter :
-            common_grams :
-                type : common_grams
-                common_words: [a, an, the]
-            common_grams_query :
-                type : common_grams
-                query_mode: true
-                common_words: [a, an, the]
+PUT /common_grams_example
+{
+    "settings": {
+        "analysis": {
+            "my_analyzer": {
+                "index_grams": {
+                    "tokenizer": "whitespace",
+                    "filter": ["common_grams"]
+                },
+                "search_grams": {
+                    "tokenizer": "whitespace",
+                    "filter": ["common_grams_query"]
+                }
+            },
+            "filter": {
+                "common_grams": {
+                    "type": "common_grams",
+                    "common_words": ["a", "an", "the"]
+                },
+                "common_grams_query": {
+                    "type": "common_grams",
+                    "query_mode": true,
+                    "common_words": ["a", "an", "the"]
+                }
+            }
+        }
+    }
+}
 --------------------------------------------------
+// CONSOLE
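At index time, `common_grams` keeps each term and adds a combined token for every common word and its neighbour. A sketch of exercising the `index_grams` analyzer above (assuming it is registered so that `_analyze` can find it by name):

[source,js]
--------------------------------------------------
POST /common_grams_example/_analyze
{
  "analyzer" : "index_grams",
  "text" : "the quick fox"
}
--------------------------------------------------

Expected tokens, roughly: `the`, `the_quick`, `quick`, `fox`, with `the` and `the_quick` sharing a position.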
docs/reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc
@@ -1,5 +1,5 @@
 [[analysis-compound-word-tokenfilter]]
-=== Compound Word Token Filter
+=== Compound Word Token Filters
 
 The `hyphenation_decompounder` and `dictionary_decompounder` token filters can
 decompose compound words found in many German languages into word parts.
@@ -84,20 +84,31 @@ Here is an example:
 
 [source,js]
 --------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer2 :
-                type : custom
-                tokenizer : standard
-                filter : [myTokenFilter1, myTokenFilter2]
-        filter :
-            myTokenFilter1 :
-                type : dictionary_decompounder
-                word_list: [one, two, three]
-            myTokenFilter2 :
-                type : hyphenation_decompounder
-                word_list_path: path/to/words.txt
-                hyphenation_patterns_path: path/to/fop.xml
-                max_subword_size : 22
+PUT /compound_word_example
+{
+    "index": {
+        "analysis": {
+            "analyzer": {
+                "my_analyzer": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["dictionary_decompounder", "hyphenation_decompounder"]
+                }
+            },
+            "filter": {
+                "dictionary_decompounder": {
+                    "type": "dictionary_decompounder",
+                    "word_list": ["one", "two", "three"]
+                },
+                "hyphenation_decompounder": {
+                    "type" : "hyphenation_decompounder",
+                    "word_list_path": "analysis/example_word_list.txt",
+                    "hyphenation_patterns_path": "analysis/hyphenation_patterns.xml",
+                    "max_subword_size": 22
+                }
+            }
+        }
+    }
+}
 --------------------------------------------------
+// CONSOLE
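The `dictionary_decompounder` keeps the original token and adds each dictionary word it finds inside it as a subword token. A sketch of testing the `my_analyzer` analyzer above against the example word list:

[source,js]
--------------------------------------------------
POST /compound_word_example/_analyze
{
  "analyzer" : "my_analyzer",
  "text" : "onetwothree"
}
--------------------------------------------------

This should yield the original `onetwothree` plus the subwords `one`, `two` and `three` at the same position.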
docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc
@@ -9,20 +9,24 @@ example:
 
 [source,js]
 --------------------------------------------------
-"index" : {
-    "analysis" : {
-        "analyzer" : {
-            "default" : {
-                "tokenizer" : "standard",
-                "filter" : ["standard", "elision"]
-            }
-        },
-        "filter" : {
-            "elision" : {
-                "type" : "elision",
-                "articles" : ["l", "m", "t", "qu", "n", "s", "j"]
+PUT /elision_example
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "default" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["standard", "elision"]
+                }
+            },
+            "filter" : {
+                "elision" : {
+                    "type" : "elision",
+                    "articles" : ["l", "m", "t", "qu", "n", "s", "j"]
+                }
             }
         }
     }
 }
 --------------------------------------------------
+// CONSOLE
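The filter strips the listed articles from the front of elided words. A sketch of checking it against the index above:

[source,js]
--------------------------------------------------
POST /elision_example/_analyze
{
  "analyzer" : "default",
  "text" : "l'avion"
}
--------------------------------------------------

`l'avion` should come back as the single token `avion`.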
docs/reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc
@@ -10,7 +10,7 @@ one or more `*.dic` files (all of which will automatically be picked up).
 For example, assuming the default hunspell location is used, the
 following directory layout will define the `en_US` dictionary:
 
-[source,js]
+[source,txt]
 --------------------------------------------------
 - conf
     |-- hunspell
@@ -42,24 +42,28 @@ settings:
 
 [source,js]
 --------------------------------------------------
+PUT /hunspell_example
 {
-    "analysis" : {
-        "analyzer" : {
-            "en" : {
-                "tokenizer" : "standard",
-                "filter" : [ "lowercase", "en_US" ]
-            }
-        },
-        "filter" : {
-            "en_US" : {
-                "type" : "hunspell",
-                "locale" : "en_US",
-                "dedup" : true
+    "settings": {
+        "analysis" : {
+            "analyzer" : {
+                "en" : {
+                    "tokenizer" : "standard",
+                    "filter" : [ "lowercase", "en_US" ]
+                }
+            },
+            "filter" : {
+                "en_US" : {
+                    "type" : "hunspell",
+                    "locale" : "en_US",
+                    "dedup" : true
+                }
             }
         }
     }
 }
 --------------------------------------------------
+// CONSOLE
 
 The hunspell token filter accepts four options:
 
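A sketch of exercising the `en` analyzer above; the exact tokens depend on the `en_US` dictionary files the node loads:

[source,js]
--------------------------------------------------
POST /hunspell_example/_analyze
{
  "analyzer" : "en",
  "text" : "the foxes jumping quickly"
}
--------------------------------------------------

With a typical `en_US` dictionary, `foxes` should be stemmed to `fox`, and with `dedup` enabled any duplicate stems are dropped.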
docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc
@@ -1,7 +1,7 @@
 [[analysis-keep-types-tokenfilter]]
 === Keep Types Token Filter
 
 A token filter of type `keep_types` that only keeps tokens with a token type
 contained in a predefined set.
 
 
@@ -14,24 +14,61 @@ types:: a list of types to keep
 [float]
 === Settings example
 
+You can set it up like:
+
 [source,js]
 --------------------------------------------------
+PUT /keep_types_example
 {
-    "index" : {
+    "settings" : {
         "analysis" : {
             "analyzer" : {
                 "my_analyzer" : {
                     "tokenizer" : "standard",
                     "filter" : ["standard", "lowercase", "extract_numbers"]
-                },
+                }
             },
             "filter" : {
                 "extract_numbers" : {
                     "type" : "keep_types",
                     "types" : [ "<NUM>" ]
-                },
+                }
             }
         }
     }
 }
 --------------------------------------------------
+// CONSOLE
+
+And test it like:
+
+[source,js]
+--------------------------------------------------
+POST /keep_types_example/_analyze
+{
+  "analyzer" : "my_analyzer",
+  "text" : "this is just 1 a test"
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+And it'd respond:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "1",
+      "start_offset": 13,
+      "end_offset": 14,
+      "type": "<NUM>",
+      "position": 3
+    }
+  ]
+}
+--------------------------------------------------
+// TESTRESPONSE
+
+Note how only the `<NUM>` token is in the output.
docs/reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc
@@ -20,17 +20,18 @@ keep_words_case:: a boolean indicating whether to lower case the words (defaults
 
 [source,js]
 --------------------------------------------------
+PUT /keep_words_example
 {
-    "index" : {
+    "settings" : {
         "analysis" : {
             "analyzer" : {
-                "my_analyzer" : {
+                "example_1" : {
                     "tokenizer" : "standard",
                     "filter" : ["standard", "lowercase", "words_till_three"]
                 },
-                "my_analyzer1" : {
+                "example_2" : {
                     "tokenizer" : "standard",
-                    "filter" : ["standard", "lowercase", "words_on_file"]
+                    "filter" : ["standard", "lowercase", "words_in_file"]
                 }
             },
             "filter" : {
@@ -38,12 +39,13 @@ keep_words_case:: a boolean indicating whether to lower case the words (defaults
                     "type" : "keep",
                     "keep_words" : [ "one", "two", "three"]
                 },
-                "words_on_file" : {
+                "words_in_file" : {
                     "type" : "keep",
-                    "keep_words_path" : "/path/to/word/file"
+                    "keep_words_path" : "analysis/example_word_list.txt"
                 }
             }
         }
     }
 }
 --------------------------------------------------
+// CONSOLE
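The `keep` filter drops every token that is not in the list. A sketch of testing the `example_1` analyzer above:

[source,js]
--------------------------------------------------
POST /keep_words_example/_analyze
{
  "analyzer" : "example_1",
  "text" : "one two four five three"
}
--------------------------------------------------

Only `one`, `two` and `three` should survive; `four` and `five` are filtered out.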
docs/reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc
@@ -19,19 +19,124 @@ in the text.
 `false`.
 |=======================================================================
 
-Here is an example:
+You can configure it like:
 
 [source,js]
 --------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer :
-                type : custom
-                tokenizer : standard
-                filter : [lowercase, protwords, porter_stem]
-        filter :
-            protwords :
-                type : keyword_marker
-                keywords_path : analysis/protwords.txt
+PUT /keyword_marker_example
+{
+    "settings": {
+        "analysis": {
+            "analyzer": {
+                "protect_cats": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["lowercase", "protect_cats", "porter_stem"]
+                },
+                "normal": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["lowercase", "porter_stem"]
+                }
+            },
+            "filter": {
+                "protect_cats": {
+                    "type": "keyword_marker",
+                    "keywords": ["cats"]
+                }
+            }
+        }
+    }
+}
 --------------------------------------------------
+// CONSOLE
+
+And test it with:
+
+[source,js]
+--------------------------------------------------
+POST /keyword_marker_example/_analyze
+{
+  "analyzer" : "protect_cats",
+  "text" : "I like cats"
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+And it'd respond:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "i",
+      "start_offset": 0,
+      "end_offset": 1,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "like",
+      "start_offset": 2,
+      "end_offset": 6,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "cats",
+      "start_offset": 7,
+      "end_offset": 11,
+      "type": "<ALPHANUM>",
+      "position": 2
+    }
+  ]
+}
+--------------------------------------------------
+// TESTRESPONSE
+
+As compared to the `normal` analyzer which has `cats` stemmed to `cat`:
+
+[source,js]
+--------------------------------------------------
+POST /keyword_marker_example/_analyze
+{
+  "analyzer" : "normal",
+  "text" : "I like cats"
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+Response:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "i",
+      "start_offset": 0,
+      "end_offset": 1,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "like",
+      "start_offset": 2,
+      "end_offset": 6,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "cat",
+      "start_offset": 7,
+      "end_offset": 11,
+      "type": "<ALPHANUM>",
+      "position": 2
+    }
+  ]
+}
+--------------------------------------------------
+// TESTRESPONSE
docs/reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc
@@ -9,18 +9,85 @@ subsequent stemmer will be indexed twice. Therefore, consider adding a
 `unique` filter with `only_on_same_position` set to `true` to drop
 unnecessary duplicates.
 
-Here is an example:
+Here is an example of using the `keyword_repeat` token filter to
+preserve both the stemmed and unstemmed version of tokens:
 
 [source,js]
 --------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer :
-                type : custom
-                tokenizer : standard
-                filter : [lowercase, keyword_repeat, porter_stem, unique_stem]
-            unique_stem:
-                type: unique
-                only_on_same_position : true
+PUT /keyword_repeat_example
+{
+    "settings": {
+        "analysis": {
+            "analyzer": {
+                "stemmed_and_unstemmed": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["lowercase", "keyword_repeat", "porter_stem", "unique_stem"]
+                }
+            },
+            "filter": {
+                "unique_stem": {
+                    "type": "unique",
+                    "only_on_same_position": true
+                }
+            }
+        }
+    }
+}
 --------------------------------------------------
+// CONSOLE
+
+And you can test it with:
+
+[source,js]
+--------------------------------------------------
+POST /keyword_repeat_example/_analyze
+{
+  "analyzer" : "stemmed_and_unstemmed",
+  "text" : "I like cats"
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+And it'd respond:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "i",
+      "start_offset": 0,
+      "end_offset": 1,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "like",
+      "start_offset": 2,
+      "end_offset": 6,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "cats",
+      "start_offset": 7,
+      "end_offset": 11,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "cat",
+      "start_offset": 7,
+      "end_offset": 11,
+      "type": "<ALPHANUM>",
+      "position": 2
+    }
+  ]
+}
+--------------------------------------------------
+// TESTRESPONSE
+
+Which preserves both the `cat` and `cats` tokens. Compare this to the example
+on the <<analysis-keyword-marker-tokenfilter>>.
docs/reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc
@@ -18,15 +18,25 @@ Here is an example:
 
 [source,js]
 --------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer :
-                type : custom
-                tokenizer : standard
-                filter : [lowercase, five_token_limit]
-        filter :
-            five_token_limit :
-                type : limit
-                max_token_count : 5
+PUT /limit_example
+{
+    "settings": {
+        "analysis": {
+            "analyzer": {
+                "limit_example": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["lowercase", "five_token_limit"]
+                }
+            },
+            "filter": {
+                "five_token_limit": {
+                    "type": "limit",
+                    "max_token_count": 5
+                }
+            }
+        }
+    }
+}
 --------------------------------------------------
+// CONSOLE
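A sketch of testing the `limit_example` analyzer above:

[source,js]
--------------------------------------------------
POST /limit_example/_analyze
{
  "analyzer" : "limit_example",
  "text" : "one two three four five six seven"
}
--------------------------------------------------

With `max_token_count` set to `5`, only the first five tokens (`one` through `five`) should be emitted; `six` and `seven` are cut off.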
docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc
@@ -10,28 +10,30 @@ custom analyzer
 
 [source,js]
 --------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer2 :
-                type : custom
-                tokenizer : myTokenizer1
-                filter : [myTokenFilter1, myGreekLowerCaseFilter]
-                char_filter : [my_html]
-        tokenizer :
-            myTokenizer1 :
-                type : standard
-                max_token_length : 900
-        filter :
-            myTokenFilter1 :
-                type : stop
-                stopwords : [stop1, stop2, stop3, stop4]
-            myGreekLowerCaseFilter :
-                type : lowercase
-                language : greek
-        char_filter :
-            my_html :
-                type : html_strip
-                escaped_tags : [xxx, yyy]
-                read_ahead : 1024
+PUT /lowercase_example
+{
+    "settings": {
+        "analysis": {
+            "analyzer": {
+                "standard_lowercase_example": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["lowercase"]
+                },
+                "greek_lowercase_example": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["greek_lowercase"]
+                }
+            },
+            "filter": {
+                "greek_lowercase": {
+                    "type": "lowercase",
+                    "language": "greek"
+                }
+            }
+        }
+    }
+}
 --------------------------------------------------
+// CONSOLE
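A sketch of testing the Greek variant above:

[source,js]
--------------------------------------------------
POST /lowercase_example/_analyze
{
  "analyzer" : "greek_lowercase_example",
  "text" : "ΟΔΟΣ"
}
--------------------------------------------------

The language-aware filter lowercases with Greek-specific rules, handling cases such as the final sigma that a plain `lowercase` filter treats differently.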
docs/src/test/cluster/config/analysis/example_word_list.txt (new file)
@@ -0,0 +1,4 @@
+test
+list
+of
+words
docs/src/test/cluster/config/analysis/hyphenation_patterns.xml (new file)
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE hyphenation-info SYSTEM "hyphenation.dtd">
+
+<!-- Example hyphenation patterns file. -->
+
+<hyphenation-info>
+
+<hyphen-char value="-"/>
+<hyphen-min before="2" after="2"/>
+
+<classes>
+aA
+</classes>
+
+<exceptions>
+</exceptions>
+
+<patterns>
+.a2
+</patterns>
+</hyphenation-info>