diff --git a/docs/reference/analysis/tokenizers.asciidoc b/docs/reference/analysis/tokenizers.asciidoc index 830ab3149ce..04f7b673940 100644 --- a/docs/reference/analysis/tokenizers.asciidoc +++ b/docs/reference/analysis/tokenizers.asciidoc @@ -140,8 +140,6 @@ include::tokenizers/ngram-tokenizer.asciidoc[] include::tokenizers/pathhierarchy-tokenizer.asciidoc[] -include::tokenizers/pathhierarchy-tokenizer-examples.asciidoc[] - include::tokenizers/pattern-tokenizer.asciidoc[] include::tokenizers/simplepattern-tokenizer.asciidoc[] diff --git a/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc index 92329fab543..f9668866f29 100644 --- a/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc @@ -1,5 +1,8 @@ [[analysis-chargroup-tokenizer]] -=== Char Group Tokenizer +=== Character group tokenizer +++++ +Character group +++++ The `char_group` tokenizer breaks text into terms whenever it encounters a character which is in a defined set. It is mostly useful for cases where a simple diff --git a/docs/reference/analysis/tokenizers/classic-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/classic-tokenizer.asciidoc index 6405e5601b6..dd083a8ab7a 100644 --- a/docs/reference/analysis/tokenizers/classic-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/classic-tokenizer.asciidoc @@ -1,5 +1,8 @@ [[analysis-classic-tokenizer]] -=== Classic Tokenizer +=== Classic tokenizer +++++ +Classic +++++ The `classic` tokenizer is a grammar based tokenizer that is good for English language documents. This tokenizer has heuristics for special treatment of diff --git a/docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc index 2f3f87eda93..ed51463a1d3 100644 --- a/docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc @@ -1,5 +1,8 @@ [[analysis-edgengram-tokenizer]] === Edge n-gram tokenizer +++++ +Edge n-gram +++++ The `edge_ngram` tokenizer first breaks text down into words whenever it encounters one of a list of specified characters, then it emits diff --git a/docs/reference/analysis/tokenizers/keyword-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/keyword-tokenizer.asciidoc index 57f3fbfd9f9..8b5605653bb 100644 --- a/docs/reference/analysis/tokenizers/keyword-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/keyword-tokenizer.asciidoc @@ -1,5 +1,8 @@ [[analysis-keyword-tokenizer]] -=== Keyword Tokenizer +=== Keyword tokenizer +++++ +Keyword +++++ The `keyword` tokenizer is a ``noop'' tokenizer that accepts whatever text it is given and outputs the exact same text as a single term. It can be combined diff --git a/docs/reference/analysis/tokenizers/letter-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/letter-tokenizer.asciidoc index 84bf757b25b..ebec0afd38d 100644 --- a/docs/reference/analysis/tokenizers/letter-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/letter-tokenizer.asciidoc @@ -1,5 +1,8 @@ [[analysis-letter-tokenizer]] -=== Letter Tokenizer +=== Letter tokenizer +++++ +Letter +++++ The `letter` tokenizer breaks text into terms whenever it encounters a character which is not a letter. 
It does a reasonable job for most European diff --git a/docs/reference/analysis/tokenizers/lowercase-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/lowercase-tokenizer.asciidoc index 927b1742ca6..88bbb77fcac 100644 --- a/docs/reference/analysis/tokenizers/lowercase-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/lowercase-tokenizer.asciidoc @@ -1,6 +1,8 @@ [[analysis-lowercase-tokenizer]] -=== Lowercase Tokenizer - +=== Lowercase tokenizer +++++ +Lowercase +++++ The `lowercase` tokenizer, like the <> breaks text into terms diff --git a/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc index 3bd69844591..1abc5ebc6a0 100644 --- a/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc @@ -1,5 +1,8 @@ [[analysis-ngram-tokenizer]] === N-gram tokenizer +++++ +N-gram +++++ The `ngram` tokenizer first breaks text down into words whenever it encounters one of a list of specified characters, then it emits diff --git a/docs/reference/analysis/tokenizers/pathhierarchy-tokenizer-examples.asciidoc b/docs/reference/analysis/tokenizers/pathhierarchy-tokenizer-examples.asciidoc deleted file mode 100644 index 646554b7a39..00000000000 --- a/docs/reference/analysis/tokenizers/pathhierarchy-tokenizer-examples.asciidoc +++ /dev/null @@ -1,183 +0,0 @@ -[[analysis-pathhierarchy-tokenizer-examples]] -=== Path Hierarchy Tokenizer Examples - -A common use-case for the `path_hierarchy` tokenizer is filtering results by -file paths. If indexing a file path along with the data, the use of the -`path_hierarchy` tokenizer to analyze the path allows filtering the results -by different parts of the file path string. - - -This example configures an index to have two custom analyzers and applies -those analyzers to multifields of the `file_path` text field that will -store filenames. One of the two analyzers uses reverse tokenization. -Some sample documents are then indexed to represent some file paths -for photos inside photo folders of two different users. 
- - -[source,console] --------------------------------------------------- -PUT file-path-test -{ - "settings": { - "analysis": { - "analyzer": { - "custom_path_tree": { - "tokenizer": "custom_hierarchy" - }, - "custom_path_tree_reversed": { - "tokenizer": "custom_hierarchy_reversed" - } - }, - "tokenizer": { - "custom_hierarchy": { - "type": "path_hierarchy", - "delimiter": "/" - }, - "custom_hierarchy_reversed": { - "type": "path_hierarchy", - "delimiter": "/", - "reverse": "true" - } - } - } - }, - "mappings": { - "properties": { - "file_path": { - "type": "text", - "fields": { - "tree": { - "type": "text", - "analyzer": "custom_path_tree" - }, - "tree_reversed": { - "type": "text", - "analyzer": "custom_path_tree_reversed" - } - } - } - } - } -} - -POST file-path-test/_doc/1 -{ - "file_path": "/User/alice/photos/2017/05/16/my_photo1.jpg" -} - -POST file-path-test/_doc/2 -{ - "file_path": "/User/alice/photos/2017/05/16/my_photo2.jpg" -} - -POST file-path-test/_doc/3 -{ - "file_path": "/User/alice/photos/2017/05/16/my_photo3.jpg" -} - -POST file-path-test/_doc/4 -{ - "file_path": "/User/alice/photos/2017/05/15/my_photo1.jpg" -} - -POST file-path-test/_doc/5 -{ - "file_path": "/User/bob/photos/2017/05/16/my_photo1.jpg" -} --------------------------------------------------- -// TESTSETUP - - -A search for a particular file path string against the text field matches all -the example documents, with Bob's documents ranking highest due to `bob` also -being one of the terms created by the standard analyzer boosting relevance for -Bob's documents. - -[source,console] --------------------------------------------------- -GET file-path-test/_search -{ - "query": { - "match": { - "file_path": "/User/bob/photos/2017/05" - } - } -} --------------------------------------------------- - -It's simple to match or filter documents with file paths that exist within a -particular directory using the `file_path.tree` field. - -[source,console] --------------------------------------------------- -GET file-path-test/_search -{ - "query": { - "term": { - "file_path.tree": "/User/alice/photos/2017/05/16" - } - } -} --------------------------------------------------- - -With the reverse parameter for this tokenizer, it's also possible to match -from the other end of the file path, such as individual file names or a deep -level subdirectory. The following example shows a search for all files named -`my_photo1.jpg` within any directory via the `file_path.tree_reversed` field -configured to use the reverse parameter in the mapping. - - -[source,console] --------------------------------------------------- -GET file-path-test/_search -{ - "query": { - "term": { - "file_path.tree_reversed": { - "value": "my_photo1.jpg" - } - } - } -} --------------------------------------------------- - -Viewing the tokens generated with both forward and reverse is instructive -in showing the tokens created for the same file path value. - - -[source,console] --------------------------------------------------- -POST file-path-test/_analyze -{ - "analyzer": "custom_path_tree", - "text": "/User/alice/photos/2017/05/16/my_photo1.jpg" -} - -POST file-path-test/_analyze -{ - "analyzer": "custom_path_tree_reversed", - "text": "/User/alice/photos/2017/05/16/my_photo1.jpg" -} --------------------------------------------------- - - -It's also useful to be able to filter with file paths when combined with other -types of searches, such as this example looking for any files paths with `16` -that also must be in Alice's photo directory. 
- -[source,console] --------------------------------------------------- -GET file-path-test/_search -{ - "query": { - "bool" : { - "must" : { - "match" : { "file_path" : "16" } - }, - "filter": { - "term" : { "file_path.tree" : "/User/alice" } - } - } - } -} --------------------------------------------------- diff --git a/docs/reference/analysis/tokenizers/pathhierarchy-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/pathhierarchy-tokenizer.asciidoc index e431f77c653..2081fdda400 100644 --- a/docs/reference/analysis/tokenizers/pathhierarchy-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/pathhierarchy-tokenizer.asciidoc @@ -1,5 +1,8 @@ [[analysis-pathhierarchy-tokenizer]] -=== Path Hierarchy Tokenizer +=== Path hierarchy tokenizer +++++ +Path hierarchy +++++ The `path_hierarchy` tokenizer takes a hierarchical value like a filesystem path, splits on the path separator, and emits a term for each component in the @@ -167,6 +170,191 @@ If we were to set `reverse` to `true`, it would produce the following: [ one/two/three/, two/three/, three/ ] --------------------------- -[float] -=== Detailed Examples -See <>. +[discrete] +[[analysis-pathhierarchy-tokenizer-detailed-examples]] +=== Detailed examples + +A common use-case for the `path_hierarchy` tokenizer is filtering results by +file paths. If indexing a file path along with the data, the use of the +`path_hierarchy` tokenizer to analyze the path allows filtering the results +by different parts of the file path string. + + +This example configures an index to have two custom analyzers and applies +those analyzers to multifields of the `file_path` text field that will +store filenames. One of the two analyzers uses reverse tokenization. +Some sample documents are then indexed to represent some file paths +for photos inside photo folders of two different users. + + +[source,console] +-------------------------------------------------- +PUT file-path-test +{ + "settings": { + "analysis": { + "analyzer": { + "custom_path_tree": { + "tokenizer": "custom_hierarchy" + }, + "custom_path_tree_reversed": { + "tokenizer": "custom_hierarchy_reversed" + } + }, + "tokenizer": { + "custom_hierarchy": { + "type": "path_hierarchy", + "delimiter": "/" + }, + "custom_hierarchy_reversed": { + "type": "path_hierarchy", + "delimiter": "/", + "reverse": "true" + } + } + } + }, + "mappings": { + "properties": { + "file_path": { + "type": "text", + "fields": { + "tree": { + "type": "text", + "analyzer": "custom_path_tree" + }, + "tree_reversed": { + "type": "text", + "analyzer": "custom_path_tree_reversed" + } + } + } + } + } +} + +POST file-path-test/_doc/1 +{ + "file_path": "/User/alice/photos/2017/05/16/my_photo1.jpg" +} + +POST file-path-test/_doc/2 +{ + "file_path": "/User/alice/photos/2017/05/16/my_photo2.jpg" +} + +POST file-path-test/_doc/3 +{ + "file_path": "/User/alice/photos/2017/05/16/my_photo3.jpg" +} + +POST file-path-test/_doc/4 +{ + "file_path": "/User/alice/photos/2017/05/15/my_photo1.jpg" +} + +POST file-path-test/_doc/5 +{ + "file_path": "/User/bob/photos/2017/05/16/my_photo1.jpg" +} +-------------------------------------------------- + + +A search for a particular file path string against the text field matches all +the example documents, with Bob's documents ranking highest due to `bob` also +being one of the terms created by the standard analyzer boosting relevance for +Bob's documents. 
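A quick way to verify this behaviour (a sketch that assumes the `file-path-test` index defined above; it is not itself part of this change) is to ask the `_analyze` API which terms the field's default `standard` analyzer produces for such a query string:

[source,console]
--------------------------------------------------
POST file-path-test/_analyze
{
  "field": "file_path",
  "text": "/User/bob/photos/2017/05"
}
--------------------------------------------------

The response lists the terms `user`, `bob`, `photos`, `2017` and `05`, which is why the plain `match` query below also hits documents from other users and dates that share some of those terms, while Bob's documents score highest.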
+ +[source,console] +-------------------------------------------------- +GET file-path-test/_search +{ + "query": { + "match": { + "file_path": "/User/bob/photos/2017/05" + } + } +} +-------------------------------------------------- +// TEST[continued] + +It's simple to match or filter documents with file paths that exist within a +particular directory using the `file_path.tree` field. + +[source,console] +-------------------------------------------------- +GET file-path-test/_search +{ + "query": { + "term": { + "file_path.tree": "/User/alice/photos/2017/05/16" + } + } +} +-------------------------------------------------- +// TEST[continued] + +With the reverse parameter for this tokenizer, it's also possible to match +from the other end of the file path, such as individual file names or a deep +level subdirectory. The following example shows a search for all files named +`my_photo1.jpg` within any directory via the `file_path.tree_reversed` field +configured to use the reverse parameter in the mapping. + + +[source,console] +-------------------------------------------------- +GET file-path-test/_search +{ + "query": { + "term": { + "file_path.tree_reversed": { + "value": "my_photo1.jpg" + } + } + } +} +-------------------------------------------------- +// TEST[continued] + +Viewing the tokens generated with both forward and reverse is instructive +in showing the tokens created for the same file path value. + + +[source,console] +-------------------------------------------------- +POST file-path-test/_analyze +{ + "analyzer": "custom_path_tree", + "text": "/User/alice/photos/2017/05/16/my_photo1.jpg" +} + +POST file-path-test/_analyze +{ + "analyzer": "custom_path_tree_reversed", + "text": "/User/alice/photos/2017/05/16/my_photo1.jpg" +} +-------------------------------------------------- +// TEST[continued] + + +It's also useful to be able to filter with file paths when combined with other +types of searches, such as this example looking for any files paths with `16` +that also must be in Alice's photo directory. + +[source,console] +-------------------------------------------------- +GET file-path-test/_search +{ + "query": { + "bool" : { + "must" : { + "match" : { "file_path" : "16" } + }, + "filter": { + "term" : { "file_path.tree" : "/User/alice" } + } + } + } +} +-------------------------------------------------- +// TEST[continued] diff --git a/docs/reference/analysis/tokenizers/pattern-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/pattern-tokenizer.asciidoc index c1f49e4da22..13eb38f8c4c 100644 --- a/docs/reference/analysis/tokenizers/pattern-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/pattern-tokenizer.asciidoc @@ -1,5 +1,8 @@ [[analysis-pattern-tokenizer]] -=== Pattern Tokenizer +=== Pattern tokenizer +++++ +Pattern +++++ The `pattern` tokenizer uses a regular expression to either split text into terms whenever it matches a word separator, or to capture matching text as diff --git a/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc index 67c0cefc989..d7048986870 100644 --- a/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc @@ -1,5 +1,8 @@ [[analysis-simplepattern-tokenizer]] -=== Simple Pattern Tokenizer +=== Simple pattern tokenizer +++++ +Simple pattern +++++ The `simple_pattern` tokenizer uses a regular expression to capture matching text as terms. 
The set of regular expression features it supports is more diff --git a/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc index 3f24233334e..9e48015d189 100644 --- a/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc @@ -1,5 +1,8 @@ [[analysis-simplepatternsplit-tokenizer]] -=== Simple Pattern Split Tokenizer +=== Simple pattern split tokenizer +++++ +Simple pattern split +++++ The `simple_pattern_split` tokenizer uses a regular expression to split the input into terms at pattern matches. The set of regular expression features it diff --git a/docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc index 0db5cc1186b..6776bfadc7c 100644 --- a/docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc @@ -1,5 +1,8 @@ [[analysis-standard-tokenizer]] -=== Standard Tokenizer +=== Standard tokenizer +++++ +Standard +++++ The `standard` tokenizer provides grammar based tokenization (based on the Unicode Text Segmentation algorithm, as specified in diff --git a/docs/reference/analysis/tokenizers/thai-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/thai-tokenizer.asciidoc index 4c6298cc67b..a946db24627 100644 --- a/docs/reference/analysis/tokenizers/thai-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/thai-tokenizer.asciidoc @@ -1,5 +1,8 @@ [[analysis-thai-tokenizer]] -=== Thai Tokenizer +=== Thai tokenizer +++++ +Thai +++++ The `thai` tokenizer segments Thai text into words, using the Thai segmentation algorithm included with Java. Text in other languages in general diff --git a/docs/reference/analysis/tokenizers/uaxurlemail-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/uaxurlemail-tokenizer.asciidoc index 7bb28e112e5..67bd4e952f6 100644 --- a/docs/reference/analysis/tokenizers/uaxurlemail-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/uaxurlemail-tokenizer.asciidoc @@ -1,5 +1,8 @@ [[analysis-uaxurlemail-tokenizer]] -=== UAX URL Email Tokenizer +=== UAX URL email tokenizer +++++ +UAX URL email +++++ The `uax_url_email` tokenizer is like the <> except that it recognises URLs and email addresses as single tokens. diff --git a/docs/reference/analysis/tokenizers/whitespace-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/whitespace-tokenizer.asciidoc index 8d69f6ecc92..c7e49ba16ea 100644 --- a/docs/reference/analysis/tokenizers/whitespace-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/whitespace-tokenizer.asciidoc @@ -1,5 +1,8 @@ [[analysis-whitespace-tokenizer]] -=== Whitespace Tokenizer +=== Whitespace tokenizer +++++ +Whitespace +++++ The `whitespace` tokenizer breaks text into terms whenever it encounters a whitespace character. diff --git a/docs/reference/redirects.asciidoc b/docs/reference/redirects.asciidoc index 23f5b39baca..6c1d771134a 100644 --- a/docs/reference/redirects.asciidoc +++ b/docs/reference/redirects.asciidoc @@ -886,6 +886,10 @@ See <>. See <>. +[role="exclude",id="analysis-pathhierarchy-tokenizer-examples"] +=== Path hierarchy tokenizer examples + +See <>. //// [role="exclude",id="search-request-body"]
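
As a minimal sketch of the `whitespace` tokenizer behaviour described in the `whitespace-tokenizer.asciidoc` hunk above (illustrative only, with a made-up sample sentence; not part of this change), a standalone `_analyze` request shows the splitting on whitespace:

[source,console]
--------------------------------------------------
POST _analyze
{
  "tokenizer": "whitespace",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
--------------------------------------------------

The terms come back exactly as they appear between whitespace characters, with case and punctuation preserved: [ The, 2, QUICK, Brown-Foxes, jumped, over, the, lazy, dog's, bone. ]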